summary refs log tree commit diff stats
path: root/daemons/controld
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- daemons/controld/Makefile.am 36
-rw-r--r-- daemons/controld/controld_callbacks.c 20
-rw-r--r-- daemons/controld/controld_cib.c 298
-rw-r--r-- daemons/controld/controld_cib.h 12
-rw-r--r-- daemons/controld/controld_control.c 37
-rw-r--r-- daemons/controld/controld_corosync.c 8
-rw-r--r-- daemons/controld/controld_election.c 7
-rw-r--r-- daemons/controld/controld_execd.c 92
-rw-r--r-- daemons/controld/controld_execd_state.c 15
-rw-r--r-- daemons/controld/controld_fencing.c 87
-rw-r--r-- daemons/controld/controld_fencing.h 2
-rw-r--r-- daemons/controld/controld_fsa.c 7
-rw-r--r-- daemons/controld/controld_globals.h 6
-rw-r--r-- daemons/controld/controld_join_client.c 36
-rw-r--r-- daemons/controld/controld_join_dc.c 133
-rw-r--r-- daemons/controld/controld_lrm.h 5
-rw-r--r-- daemons/controld/controld_membership.c 40
-rw-r--r-- daemons/controld/controld_messages.c 197
-rw-r--r-- daemons/controld/controld_metadata.c 6
-rw-r--r-- daemons/controld/controld_remote_ra.c 99
-rw-r--r-- daemons/controld/controld_schedulerd.c 23
-rw-r--r-- daemons/controld/controld_te_actions.c 25
-rw-r--r-- daemons/controld/controld_te_callbacks.c 6
-rw-r--r-- daemons/controld/controld_te_events.c 12
-rw-r--r-- daemons/controld/controld_te_utils.c 175
-rw-r--r-- daemons/controld/controld_throttle.c 6
-rw-r--r-- daemons/controld/controld_transition.c 13
-rw-r--r-- daemons/controld/controld_transition.h 2
-rw-r--r-- daemons/controld/controld_utils.c 2
-rw-r--r-- daemons/controld/pacemaker-controld.c 2
-rw-r--r-- daemons/controld/pacemaker-controld.h 3
31 files changed, 796 insertions, 616 deletions
diff --git a/daemons/controld/Makefile.am b/daemons/controld/Makefile.am
index 08be1ff..1312090 100644
--- a/daemons/controld/Makefile.am
+++ b/daemons/controld/Makefile.am
@@ -14,34 +14,20 @@ halibdir = $(CRM_DAEMON_DIR)
halib_PROGRAMS = pacemaker-controld
-noinst_HEADERS = controld_alerts.h \
- controld_callbacks.h \
- controld_cib.h \
- controld_fencing.h \
- controld_fsa.h \
- controld_globals.h \
- controld_lrm.h \
- controld_membership.h \
- controld_messages.h \
- controld_metadata.h \
- controld_throttle.h \
- controld_timers.h \
- controld_transition.h \
- controld_utils.h \
- pacemaker-controld.h
+noinst_HEADERS = $(wildcard *.h)
pacemaker_controld_CFLAGS = $(CFLAGS_HARDENED_EXE)
pacemaker_controld_LDFLAGS = $(LDFLAGS_HARDENED_EXE)
-pacemaker_controld_LDADD = $(top_builddir)/lib/fencing/libstonithd.la \
- $(top_builddir)/lib/pacemaker/libpacemaker.la \
- $(top_builddir)/lib/pengine/libpe_rules.la \
- $(top_builddir)/lib/cib/libcib.la \
- $(top_builddir)/lib/cluster/libcrmcluster.la \
- $(top_builddir)/lib/common/libcrmcommon.la \
- $(top_builddir)/lib/services/libcrmservice.la \
- $(top_builddir)/lib/lrmd/liblrmd.la \
- $(CLUSTERLIBS)
+pacemaker_controld_LDADD = $(top_builddir)/lib/pacemaker/libpacemaker.la
+pacemaker_controld_LDADD += $(top_builddir)/lib/cib/libcib.la
+pacemaker_controld_LDADD += $(top_builddir)/lib/pengine/libpe_rules.la
+pacemaker_controld_LDADD += $(top_builddir)/lib/fencing/libstonithd.la
+pacemaker_controld_LDADD += $(top_builddir)/lib/cluster/libcrmcluster.la
+pacemaker_controld_LDADD += $(top_builddir)/lib/lrmd/liblrmd.la
+pacemaker_controld_LDADD += $(top_builddir)/lib/services/libcrmservice.la
+pacemaker_controld_LDADD += $(top_builddir)/lib/common/libcrmcommon.la
+pacemaker_controld_LDADD += $(CLUSTERLIBS)
pacemaker_controld_SOURCES = pacemaker-controld.c \
controld_alerts.c \
@@ -79,9 +65,11 @@ endif
CLEANFILES = $(man7_MANS)
if BUILD_LEGACY_LINKS
+.PHONY: install-exec-hook
install-exec-hook:
cd $(DESTDIR)$(CRM_DAEMON_DIR) && rm -f crmd && $(LN_S) pacemaker-controld crmd
+.PHONY: uninstall-hook
uninstall-hook:
cd $(DESTDIR)$(CRM_DAEMON_DIR) && rm -f crmd
endif
diff --git a/daemons/controld/controld_callbacks.c b/daemons/controld/controld_callbacks.c
index d578adc..7078739 100644
--- a/daemons/controld/controld_callbacks.c
+++ b/daemons/controld/controld_callbacks.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2004-2022 the Pacemaker project contributors
+ * Copyright 2004-2023 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -107,6 +107,8 @@ peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *d
bool appeared = FALSE;
bool is_remote = pcmk_is_set(node->flags, crm_remote_node);
+ controld_node_pending_timer(node);
+
/* The controller waits to receive some information from the membership
* layer before declaring itself operational. If this is being called for a
* cluster node, indicate that we have it.
@@ -274,13 +276,14 @@ peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *d
if (down) {
const char *task = crm_element_value(down->xml, XML_LRM_ATTR_TASK);
- if (pcmk__str_eq(task, CRM_OP_FENCE, pcmk__str_casei)) {
+ if (pcmk__str_eq(task, PCMK_ACTION_STONITH, pcmk__str_casei)) {
/* tengine_stonith_callback() confirms fence actions */
crm_trace("Updating CIB %s fencer reported fencing of %s complete",
(pcmk_is_set(down->flags, pcmk__graph_action_confirmed)? "after" : "before"), node->uname);
- } else if (!appeared && pcmk__str_eq(task, CRM_OP_SHUTDOWN, pcmk__str_casei)) {
+ } else if (!appeared && pcmk__str_eq(task, PCMK_ACTION_DO_SHUTDOWN,
+ pcmk__str_casei)) {
// Shutdown actions are immediately confirmed (i.e. no_wait)
if (!is_remote) {
@@ -342,6 +345,17 @@ peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *d
}
}
+ if (!appeared && (type == crm_status_processes)
+ && (node->when_member > 1)) {
+ /* The node left CPG but is still a cluster member. Set its
+ * membership time to 1 to record it in the cluster state as a
+ * boolean, so we don't fence it due to node-pending-timeout.
+ */
+ node->when_member = 1;
+ flags |= node_update_cluster;
+ controld_node_pending_timer(node);
+ }
+
/* Update the CIB node state */
update = create_node_state_update(node, flags, NULL, __func__);
if (update == NULL) {
diff --git a/daemons/controld/controld_cib.c b/daemons/controld/controld_cib.c
index 94b99dd..865e41f 100644
--- a/daemons/controld/controld_cib.c
+++ b/daemons/controld/controld_cib.c
@@ -22,90 +22,6 @@
// Call ID of the most recent in-progress CIB resource update (or 0 if none)
static int pending_rsc_update = 0;
-// Call IDs of requested CIB replacements that won't trigger a new election
-// (used as a set of gint values)
-static GHashTable *cib_replacements = NULL;
-
-/*!
- * \internal
- * \brief Store the call ID of a CIB replacement that the controller requested
- *
- * The \p do_cib_replaced() callback function will avoid triggering a new
- * election when we're notified of one of these expected replacements.
- *
- * \param[in] call_id CIB call ID (or 0 for a synchronous call)
- *
- * \note This function should be called after making any asynchronous CIB
- * request (or before making any synchronous CIB request) that may replace
- * part of the nodes or status section. This may include CIB sync calls.
- */
-void
-controld_record_cib_replace_call(int call_id)
-{
- CRM_CHECK(call_id >= 0, return);
-
- if (cib_replacements == NULL) {
- cib_replacements = g_hash_table_new(NULL, NULL);
- }
-
- /* If the call ID is already present in the table, then it's old. We may not
- * be removing them properly, and we could improperly ignore replacement
- * notifications if cib_t:call_id wraps around.
- */
- CRM_LOG_ASSERT(g_hash_table_add(cib_replacements,
- GINT_TO_POINTER((gint) call_id)));
-}
-
-/*!
- * \internal
- * \brief Remove the call ID of a CIB replacement from the replacements table
- *
- * \param[in] call_id CIB call ID (or 0 for a synchronous call)
- *
- * \return \p true if \p call_id was found in the table, or \p false otherwise
- *
- * \note CIB notifications run before CIB callbacks. If this function is called
- * from within a callback, \p do_cib_replaced() will have removed
- * \p call_id from the table first if relevant changes triggered a
- * notification.
- */
-bool
-controld_forget_cib_replace_call(int call_id)
-{
- CRM_CHECK(call_id >= 0, return false);
-
- if (cib_replacements == NULL) {
- return false;
- }
- return g_hash_table_remove(cib_replacements,
- GINT_TO_POINTER((gint) call_id));
-}
-
-/*!
- * \internal
- * \brief Empty the hash table containing call IDs of CIB replacement requests
- */
-void
-controld_forget_all_cib_replace_calls(void)
-{
- if (cib_replacements != NULL) {
- g_hash_table_remove_all(cib_replacements);
- }
-}
-
-/*!
- * \internal
- * \brief Free the hash table containing call IDs of CIB replacement requests
- */
-void
-controld_destroy_cib_replacements_table(void)
-{
- if (cib_replacements != NULL) {
- g_hash_table_destroy(cib_replacements);
- cib_replacements = NULL;
- }
-}
-
/*!
* \internal
* \brief Respond to a dropped CIB connection
@@ -127,54 +43,54 @@ handle_cib_disconnect(gpointer user_data)
controld_clear_fsa_input_flags(R_CIB_CONNECTED);
} else { // Expected
- crm_info("Connection to the CIB manager terminated");
+ crm_info("Disconnected from the CIB manager");
}
}
static void
do_cib_updated(const char *event, xmlNode * msg)
{
- if (pcmk__alert_in_patchset(msg, TRUE)) {
- controld_trigger_config();
+ const xmlNode *patchset = NULL;
+ const char *client_name = NULL;
+
+ crm_debug("Received CIB diff notification: DC=%s", pcmk__btoa(AM_I_DC));
+
+ if (cib__get_notify_patchset(msg, &patchset) != pcmk_rc_ok) {
+ return;
}
-}
-static void
-do_cib_replaced(const char *event, xmlNode * msg)
-{
- int call_id = 0;
- const char *client_id = crm_element_value(msg, F_CIB_CLIENTID);
- uint32_t change_section = cib_change_section_nodes
- |cib_change_section_status;
- long long value = 0;
+ if (cib__element_in_patchset(patchset, XML_CIB_TAG_ALERTS)
+ || cib__element_in_patchset(patchset, XML_CIB_TAG_CRMCONFIG)) {
+
+ controld_trigger_config();
+ }
- crm_debug("Updating the CIB after a replace: DC=%s", pcmk__btoa(AM_I_DC));
if (!AM_I_DC) {
+ // We're not in control of the join sequence
return;
}
- if ((crm_element_value_int(msg, F_CIB_CALLID, &call_id) == 0)
- && pcmk__str_eq(client_id, controld_globals.cib_client_id,
- pcmk__str_none)
- && controld_forget_cib_replace_call(call_id)) {
- // We requested this replace op. No need to restart the join.
+ client_name = crm_element_value(msg, F_CIB_CLIENTNAME);
+ if (!cib__client_triggers_refresh(client_name)) {
+ // The CIB is still accurate
return;
}
- if ((crm_element_value_ll(msg, F_CIB_CHANGE_SECTION, &value) < 0)
- || (value < 0) || (value > UINT32_MAX)) {
+ if (cib__element_in_patchset(patchset, XML_CIB_TAG_NODES)
+ || cib__element_in_patchset(patchset, XML_CIB_TAG_STATUS)) {
- crm_trace("Couldn't parse '%s' from message", F_CIB_CHANGE_SECTION);
- } else {
- change_section = (uint32_t) value;
- }
-
- if (pcmk_any_flags_set(change_section, cib_change_section_nodes
- |cib_change_section_status)) {
+ /* An unsafe client modified the nodes or status section. Ensure the
+ * node list is up-to-date, and start the join process again so we get
+ * everyone's current resource history.
+ */
+ if (client_name == NULL) {
+ client_name = crm_element_value(msg, F_CIB_CLIENTID);
+ }
+ crm_notice("Populating nodes and starting an election after %s event "
+ "triggered by %s",
+ event, pcmk__s(client_name, "(unidentified client)"));
- /* start the join process again so we get everyone's LRM status */
populate_cib_nodes(node_update_quick|node_update_all, __func__);
-
register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL);
}
}
@@ -186,12 +102,10 @@ controld_disconnect_cib_manager(void)
CRM_ASSERT(cib_conn != NULL);
- crm_info("Disconnecting from the CIB manager");
+ crm_debug("Disconnecting from the CIB manager");
controld_clear_fsa_input_flags(R_CIB_CONNECTED);
- cib_conn->cmds->del_notify_callback(cib_conn, T_CIB_REPLACE_NOTIFY,
- do_cib_replaced);
cib_conn->cmds->del_notify_callback(cib_conn, T_CIB_DIFF_NOTIFY,
do_cib_updated);
cib_free_callbacks(cib_conn);
@@ -201,8 +115,6 @@ controld_disconnect_cib_manager(void)
cib_scope_local|cib_discard_reply);
cib_conn->cmds->signoff(cib_conn);
}
-
- crm_notice("Disconnected from the CIB manager");
}
/* A_CIB_STOP, A_CIB_START, O_CIB_RESTART */
@@ -217,7 +129,6 @@ do_cib_control(long long action,
cib_t *cib_conn = controld_globals.cib_conn;
void (*dnotify_fn) (gpointer user_data) = handle_cib_disconnect;
- void (*replace_cb) (const char *event, xmlNodePtr msg) = do_cib_replaced;
void (*update_cb) (const char *event, xmlNodePtr msg) = do_cib_updated;
int rc = pcmk_ok;
@@ -264,11 +175,6 @@ do_cib_control(long long action,
crm_err("Could not set dnotify callback");
} else if (cib_conn->cmds->add_notify_callback(cib_conn,
- T_CIB_REPLACE_NOTIFY,
- replace_cb) != pcmk_ok) {
- crm_err("Could not set CIB notification callback (replace)");
-
- } else if (cib_conn->cmds->add_notify_callback(cib_conn,
T_CIB_DIFF_NOTIFY,
update_cb) != pcmk_ok) {
crm_err("Could not set CIB notification callback (update)");
@@ -276,8 +182,6 @@ do_cib_control(long long action,
} else {
controld_set_fsa_input_flags(R_CIB_CONNECTED);
cib_retries = 0;
- cib_conn->cmds->client_id(cib_conn, &controld_globals.cib_client_id,
- NULL);
}
if (!pcmk_is_set(controld_globals.fsa_input_register, R_CIB_CONNECTED)) {
@@ -310,11 +214,12 @@ do_cib_control(long long action,
unsigned int
cib_op_timeout(void)
{
+ // @COMPAT: Drop env_timeout at 3.0.0
static int env_timeout = -1;
unsigned int calculated_timeout = 0;
if (env_timeout == -1) {
- const char *env = getenv("PCMK_cib_timeout");
+ const char *env = pcmk__env_option(PCMK__ENV_CIB_TIMEOUT);
pcmk__scan_min_int(env, &env_timeout, MIN_CIB_OP_TIMEOUT);
crm_trace("Minimum CIB op timeout: %ds (environment: %s)",
@@ -401,67 +306,87 @@ cib_delete_callback(xmlNode *msg, int call_id, int rc, xmlNode *output,
/*!
* \internal
- * \brief Delete subsection of a node's CIB node_state
+ * \brief Get the XPath and description of a node state section to be deleted
*
- * \param[in] uname Desired node
- * \param[in] section Subsection of node_state to delete
- * \param[in] options CIB call options to use
+ * \param[in] uname Desired node
+ * \param[in] section Subsection of node_state to be deleted
+ * \param[out] xpath Where to store allocated XPath of \p section (not \c NULL)
+ * \param[out] desc If not \c NULL, where to store allocated description of \p section
*/
void
-controld_delete_node_state(const char *uname, enum controld_section_e section,
- int options)
+controld_node_state_deletion_strings(const char *uname,
+ enum controld_section_e section,
+ char **xpath, char **desc)
{
- cib_t *cib_conn = controld_globals.cib_conn;
-
- char *xpath = NULL;
- char *desc = NULL;
+ const char *desc_pre = NULL;
// Shutdown locks that started before this time are expired
long long expire = (long long) time(NULL)
- controld_globals.shutdown_lock_limit;
- CRM_CHECK(uname != NULL, return);
switch (section) {
case controld_section_lrm:
- xpath = crm_strdup_printf(XPATH_NODE_LRM, uname);
- desc = crm_strdup_printf("resource history for node %s", uname);
+ *xpath = crm_strdup_printf(XPATH_NODE_LRM, uname);
+ desc_pre = "resource history";
break;
case controld_section_lrm_unlocked:
- xpath = crm_strdup_printf(XPATH_NODE_LRM_UNLOCKED,
- uname, uname, expire);
- desc = crm_strdup_printf("resource history (other than shutdown "
- "locks) for node %s", uname);
+ *xpath = crm_strdup_printf(XPATH_NODE_LRM_UNLOCKED,
+ uname, uname, expire);
+ desc_pre = "resource history (other than shutdown locks)";
break;
case controld_section_attrs:
- xpath = crm_strdup_printf(XPATH_NODE_ATTRS, uname);
- desc = crm_strdup_printf("transient attributes for node %s", uname);
+ *xpath = crm_strdup_printf(XPATH_NODE_ATTRS, uname);
+ desc_pre = "transient attributes";
break;
case controld_section_all:
- xpath = crm_strdup_printf(XPATH_NODE_ALL, uname);
- desc = crm_strdup_printf("all state for node %s", uname);
+ *xpath = crm_strdup_printf(XPATH_NODE_ALL, uname);
+ desc_pre = "all state";
break;
case controld_section_all_unlocked:
- xpath = crm_strdup_printf(XPATH_NODE_ALL_UNLOCKED,
- uname, uname, expire, uname);
- desc = crm_strdup_printf("all state (other than shutdown locks) "
- "for node %s", uname);
+ *xpath = crm_strdup_printf(XPATH_NODE_ALL_UNLOCKED,
+ uname, uname, expire, uname);
+ desc_pre = "all state (other than shutdown locks)";
+ break;
+ default:
+ // Unhandled section value: we called this function incorrectly
+ CRM_ASSERT(false);
break;
}
- if (cib_conn == NULL) {
- crm_warn("Unable to delete %s: no CIB connection", desc);
- free(desc);
- } else {
- int call_id;
-
- cib__set_call_options(options, "node state deletion",
- cib_xpath|cib_multiple);
- call_id = cib_conn->cmds->remove(cib_conn, xpath, NULL, options);
- crm_info("Deleting %s (via CIB call %d) " CRM_XS " xpath=%s",
- desc, call_id, xpath);
- fsa_register_cib_callback(call_id, desc, cib_delete_callback);
- // CIB library handles freeing desc
+ if (desc != NULL) {
+ *desc = crm_strdup_printf("%s for node %s", desc_pre, uname);
}
+}
+
+/*!
+ * \internal
+ * \brief Delete subsection of a node's CIB node_state (result goes to cib_delete_callback())
+ *
+ * \param[in] uname Desired node (asserted to be non-NULL)
+ * \param[in] section Subsection of node_state to delete
+ * \param[in] options CIB call options to use (cib_xpath|cib_multiple are set as well)
+ */
+void
+controld_delete_node_state(const char *uname, enum controld_section_e section,
+ int options)
+{
+ cib_t *cib = controld_globals.cib_conn;
+ char *xpath = NULL;
+ char *desc = NULL;
+ int cib_rc = pcmk_ok;
+
+ CRM_ASSERT((uname != NULL) && (cib != NULL));
+
+ controld_node_state_deletion_strings(uname, section, &xpath, &desc);
+
+ cib__set_call_options(options, "node state deletion",
+ cib_xpath|cib_multiple);
+ cib_rc = cib->cmds->remove(cib, xpath, NULL, options);
+ fsa_register_cib_callback(cib_rc, desc, cib_delete_callback);
+ crm_info("Deleting %s (via CIB call %d) " CRM_XS " xpath=%s",
+ desc, cib_rc, xpath);
+
+ // CIB library handles freeing desc; xpath is still ours to free
free(xpath);
}
@@ -491,11 +416,12 @@ controld_delete_resource_history(const char *rsc_id, const char *node,
char *desc = NULL;
char *xpath = NULL;
int rc = pcmk_rc_ok;
+ cib_t *cib = controld_globals.cib_conn;
CRM_CHECK((rsc_id != NULL) && (node != NULL), return EINVAL);
desc = crm_strdup_printf("resource history for %s on %s", rsc_id, node);
- if (controld_globals.cib_conn == NULL) {
+ if (cib == NULL) {
crm_err("Unable to clear %s: no CIB connection", desc);
free(desc);
return ENOTCONN;
@@ -503,9 +429,10 @@ controld_delete_resource_history(const char *rsc_id, const char *node,
// Ask CIB to delete the entry
xpath = crm_strdup_printf(XPATH_RESOURCE_HISTORY, node, rsc_id);
- rc = cib_internal_op(controld_globals.cib_conn, PCMK__CIB_REQUEST_DELETE,
- NULL, xpath, NULL, NULL, call_options|cib_xpath,
- user_name);
+
+ cib->cmds->set_user(cib, user_name);
+ rc = cib->cmds->remove(cib, xpath, NULL, call_options|cib_xpath);
+ cib->cmds->set_user(cib, NULL);
if (rc < 0) {
rc = pcmk_legacy2rc(rc);
@@ -841,10 +768,17 @@ cib_rsc_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *use
case pcmk_ok:
case -pcmk_err_diff_failed:
case -pcmk_err_diff_resync:
- crm_trace("Resource update %d complete: rc=%d", call_id, rc);
+ crm_trace("Resource history update completed (call=%d rc=%d)",
+ call_id, rc);
break;
default:
- crm_warn("Resource update %d failed: (rc=%d) %s", call_id, rc, pcmk_strerror(rc));
+ if (call_id > 0) {
+ crm_warn("Resource history update %d failed: %s "
+ CRM_XS " rc=%d", call_id, pcmk_strerror(rc), rc);
+ } else {
+ crm_warn("Resource history update failed: %s " CRM_XS " rc=%d",
+ pcmk_strerror(rc), rc);
+ }
}
if (call_id == pending_rsc_update) {
@@ -863,10 +797,11 @@ should_preserve_lock(lrmd_event_data_t *op)
if (!pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) {
return false;
}
- if (!strcmp(op->op_type, RSC_STOP) && (op->rc == PCMK_OCF_OK)) {
+ if (!strcmp(op->op_type, PCMK_ACTION_STOP) && (op->rc == PCMK_OCF_OK)) {
return true;
}
- if (!strcmp(op->op_type, RSC_STATUS) && (op->rc == PCMK_OCF_NOT_RUNNING)) {
+ if (!strcmp(op->op_type, PCMK_ACTION_MONITOR)
+ && (op->rc == PCMK_OCF_NOT_RUNNING)) {
return true;
}
return false;
@@ -876,10 +811,10 @@ should_preserve_lock(lrmd_event_data_t *op)
* \internal
* \brief Request a CIB update
*
- * \param[in] section Section of CIB to update
- * \param[in,out] data New XML of CIB section to update
- * \param[in] options CIB call options
- * \param[in] callback If not NULL, set this as the operation callback
+ * \param[in] section Section of CIB to update
+ * \param[in] data New XML of CIB section to update
+ * \param[in] options CIB call options
+ * \param[in] callback If not \c NULL, set this as the operation callback
*
* \return Standard Pacemaker return code
*
@@ -890,14 +825,13 @@ int
controld_update_cib(const char *section, xmlNode *data, int options,
void (*callback)(xmlNode *, int, int, xmlNode *, void *))
{
+ cib_t *cib = controld_globals.cib_conn;
int cib_rc = -ENOTCONN;
CRM_ASSERT(data != NULL);
- if (controld_globals.cib_conn != NULL) {
- cib_rc = cib_internal_op(controld_globals.cib_conn,
- PCMK__CIB_REQUEST_MODIFY, NULL, section,
- data, NULL, options, NULL);
+ if (cib != NULL) {
+ cib_rc = cib->cmds->modify(cib, section, data, options);
if (cib_rc >= 0) {
crm_debug("Submitted CIB update %d for %s section",
cib_rc, section);
@@ -1047,7 +981,6 @@ controld_delete_action_history(const lrmd_event_data_t *op)
controld_globals.cib_conn->cmds->remove(controld_globals.cib_conn,
XML_CIB_TAG_STATUS, xml_top,
cib_none);
-
crm_log_xml_trace(xml_top, "op:cancel");
free_xml(xml_top);
}
@@ -1087,7 +1020,6 @@ controld_cib_delete_last_failure(const char *rsc_id, const char *node,
{
char *xpath = NULL;
char *last_failure_key = NULL;
-
CRM_CHECK((rsc_id != NULL) && (node != NULL), return);
// Generate XPath to match desired entry
diff --git a/daemons/controld/controld_cib.h b/daemons/controld/controld_cib.h
index bd9492a..dcc5a48 100644
--- a/daemons/controld/controld_cib.h
+++ b/daemons/controld/controld_cib.h
@@ -43,11 +43,6 @@ fsa_cib_anon_update_discard_reply(const char *section, xmlNode *data) {
}
}
-void controld_record_cib_replace_call(int call_id);
-bool controld_forget_cib_replace_call(int call_id);
-void controld_forget_all_cib_replace_calls(void);
-void controld_destroy_cib_replacements_table(void);
-
int controld_update_cib(const char *section, xmlNode *data, int options,
void (*callback)(xmlNode *, int, int, xmlNode *,
void *));
@@ -62,6 +57,9 @@ enum controld_section_e {
controld_section_all_unlocked
};
+void controld_node_state_deletion_strings(const char *uname,
+ enum controld_section_e section,
+ char **xpath, char **desc);
void controld_delete_node_state(const char *uname,
enum controld_section_e section, int options);
int controld_delete_resource_history(const char *rsc_id, const char *node,
@@ -118,8 +116,8 @@ int crmd_cib_smart_opt(void);
static inline bool
controld_action_is_recordable(const char *action)
{
- return !pcmk__str_any_of(action, CRMD_ACTION_CANCEL, CRMD_ACTION_DELETE,
- CRMD_ACTION_NOTIFY, CRMD_ACTION_METADATA, NULL);
+ return !pcmk__str_any_of(action, PCMK_ACTION_CANCEL, PCMK_ACTION_DELETE,
+ PCMK_ACTION_NOTIFY, PCMK_ACTION_META_DATA, NULL);
}
#endif // PCMK__CONTROLD_CIB__H
diff --git a/daemons/controld/controld_control.c b/daemons/controld/controld_control.c
index ffc62a0..644d686 100644
--- a/daemons/controld/controld_control.c
+++ b/daemons/controld/controld_control.c
@@ -221,6 +221,7 @@ crmd_exit(crm_exit_t exit_code)
g_list_free(controld_globals.fsa_message_queue);
controld_globals.fsa_message_queue = NULL;
+ controld_free_node_pending_timers();
controld_election_fini();
/* Tear down the CIB manager connection, but don't free it yet -- it could
@@ -265,7 +266,6 @@ crmd_exit(crm_exit_t exit_code)
controld_globals.te_uuid = NULL;
free_max_generation();
- controld_destroy_cib_replacements_table();
controld_destroy_failed_sync_table();
controld_destroy_outside_events_table();
@@ -323,20 +323,12 @@ do_exit(long long action,
enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
crm_exit_t exit_code = CRM_EX_OK;
- int log_level = LOG_INFO;
- const char *exit_type = "gracefully";
- if (action & A_EXIT_1) {
- log_level = LOG_ERR;
- exit_type = "forcefully";
+ if (pcmk_is_set(action, A_EXIT_1)) {
exit_code = CRM_EX_ERROR;
+ crm_err("Exiting now due to errors");
}
-
verify_stopped(cur_state, LOG_ERR);
- do_crm_log(log_level, "Performing %s - %s exiting the controller",
- fsa_action2string(action), exit_type);
-
- crm_info("[%s] stopped (%d)", crm_system_name, exit_code);
crmd_exit(exit_code);
}
@@ -504,7 +496,8 @@ do_started(long long action,
} else {
crm_notice("Pacemaker controller successfully started and accepting connections");
}
- controld_trigger_fencer_connect();
+ controld_set_fsa_input_flags(R_ST_REQUIRED);
+ controld_timer_fencer_connect(GINT_TO_POINTER(TRUE));
controld_clear_fsa_input_flags(R_STARTING);
register_fsa_input(msg_data->fsa_cause, I_PENDING, NULL);
@@ -684,6 +677,17 @@ static pcmk__cluster_option_t controller_options[] = {
"passed since the shutdown was initiated, even if the node has not "
"rejoined.")
},
+ {
+ XML_CONFIG_ATTR_NODE_PENDING_TIMEOUT, NULL, "time", NULL,
+ "0", pcmk__valid_interval_spec,
+ N_("How long to wait for a node that has joined the cluster to join "
+ "the controller process group"),
+ N_("Fence nodes that do not join the controller process group within "
+ "this much time after joining the cluster, to allow the cluster "
+ "to continue managing resources. A value of 0 means never fence "
+ "pending nodes. Setting the value to 2h means fence nodes after "
+ "2 hours.")
+ },
};
void
@@ -722,9 +726,8 @@ config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void
}
crmconfig = output;
- if ((crmconfig) &&
- (crm_element_name(crmconfig)) &&
- (strcmp(crm_element_name(crmconfig), XML_CIB_TAG_CRMCONFIG) != 0)) {
+ if ((crmconfig != NULL)
+ && !pcmk__xe_is(crmconfig, XML_CIB_TAG_CRMCONFIG)) {
crmconfig = first_named_child(crmconfig, XML_CIB_TAG_CRMCONFIG);
}
if (!crmconfig) {
@@ -761,6 +764,10 @@ config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void
controld_globals.shutdown_lock_limit = crm_parse_interval_spec(value)
/ 1000;
+ value = g_hash_table_lookup(config_hash,
+ XML_CONFIG_ATTR_NODE_PENDING_TIMEOUT);
+ controld_globals.node_pending_timeout = crm_parse_interval_spec(value) / 1000;
+
value = g_hash_table_lookup(config_hash, "cluster-name");
pcmk__str_update(&(controld_globals.cluster_name), value);
diff --git a/daemons/controld/controld_corosync.c b/daemons/controld/controld_corosync.c
index 4378b30..b69e821 100644
--- a/daemons/controld/controld_corosync.c
+++ b/daemons/controld/controld_corosync.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2004-2022 the Pacemaker project contributors
+ * Copyright 2004-2023 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -81,9 +81,6 @@ crmd_cs_destroy(gpointer user_data)
if (!pcmk_is_set(controld_globals.fsa_input_register, R_HA_DISCONNECTED)) {
crm_crit("Lost connection to cluster layer, shutting down");
crmd_exit(CRM_EX_DISCONNECT);
-
- } else {
- crm_info("Corosync connection closed");
}
}
@@ -122,7 +119,8 @@ cpg_membership_callback(cpg_handle_t handle, const struct cpg_name *cpg_name,
if (controld_globals.dc_name != NULL) {
crm_node_t *peer = NULL;
- peer = pcmk__search_cluster_node_cache(0, controld_globals.dc_name);
+ peer = pcmk__search_cluster_node_cache(0, controld_globals.dc_name,
+ NULL);
if (peer != NULL) {
for (int i = 0; i < left_list_entries; ++i) {
if (left_list[i].nodeid == peer->id) {
diff --git a/daemons/controld/controld_election.c b/daemons/controld/controld_election.c
index 5f33d5b..70ffecc 100644
--- a/daemons/controld/controld_election.c
+++ b/daemons/controld/controld_election.c
@@ -263,13 +263,6 @@ do_dc_release(long long action,
} else if (action & A_DC_RELEASED) {
crm_info("DC role released");
-#if 0
- if (are there errors) {
- /* we can't stay up if not healthy */
- /* or perhaps I_ERROR and go to S_RECOVER? */
- result = I_SHUTDOWN;
- }
-#endif
if (pcmk_is_set(controld_globals.fsa_input_register, R_SHUTDOWN)) {
xmlNode *update = NULL;
crm_node_t *node = crm_get_peer(0, controld_globals.our_nodename);
diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index 0de399c..480d37d 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -52,14 +52,10 @@ static void
lrm_connection_destroy(void)
{
if (pcmk_is_set(controld_globals.fsa_input_register, R_LRM_CONNECTED)) {
- crm_crit("Connection to executor failed");
+ crm_crit("Lost connection to local executor");
register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL);
controld_clear_fsa_input_flags(R_LRM_CONNECTED);
-
- } else {
- crm_info("Disconnected from executor");
}
-
}
static char *
@@ -171,7 +167,7 @@ update_history_cache(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, lrmd_event_
return;
}
- if (pcmk__str_eq(op->op_type, RSC_NOTIFY, pcmk__str_casei)) {
+ if (pcmk__str_eq(op->op_type, PCMK_ACTION_NOTIFY, pcmk__str_casei)) {
return;
}
@@ -222,10 +218,10 @@ update_history_cache(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, lrmd_event_
}
entry->last = lrmd_copy_event(op);
- if (op->params && pcmk__strcase_any_of(op->op_type, CRMD_ACTION_START,
- CRMD_ACTION_RELOAD,
- CRMD_ACTION_RELOAD_AGENT,
- CRMD_ACTION_STATUS, NULL)) {
+ if (op->params && pcmk__strcase_any_of(op->op_type, PCMK_ACTION_START,
+ PCMK_ACTION_RELOAD,
+ PCMK_ACTION_RELOAD_AGENT,
+ PCMK_ACTION_MONITOR, NULL)) {
if (entry->stop_params) {
g_hash_table_destroy(entry->stop_params);
}
@@ -243,7 +239,9 @@ update_history_cache(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, lrmd_event_
op->rsc_id, op->op_type, op->interval_ms);
entry->recurring_op_list = g_list_prepend(entry->recurring_op_list, lrmd_copy_event(op));
- } else if (entry->recurring_op_list && !pcmk__str_eq(op->op_type, RSC_STATUS, pcmk__str_casei)) {
+ } else if ((entry->recurring_op_list != NULL)
+ && !pcmk__str_eq(op->op_type, PCMK_ACTION_MONITOR,
+ pcmk__str_casei)) {
crm_trace("Dropping %d recurring ops because of: " PCMK__OP_FMT,
g_list_length(entry->recurring_op_list), op->rsc_id,
op->op_type, op->interval_ms);
@@ -376,10 +374,8 @@ do_lrm_control(long long action,
}
controld_clear_fsa_input_flags(R_LRM_CONNECTED);
- crm_info("Disconnecting from the executor");
lrm_state_disconnect(lrm_state);
lrm_state_reset_tables(lrm_state, FALSE);
- crm_notice("Disconnected from the executor");
}
if (action & A_LRM_CONNECT) {
@@ -510,11 +506,14 @@ is_rsc_active(lrm_state_t * lrm_state, const char *rsc_id)
crm_trace("Processing %s: %s.%d=%d", rsc_id, entry->last->op_type,
entry->last->interval_ms, entry->last->rc);
- if (entry->last->rc == PCMK_OCF_OK && pcmk__str_eq(entry->last->op_type, CRMD_ACTION_STOP, pcmk__str_casei)) {
+ if ((entry->last->rc == PCMK_OCF_OK)
+ && pcmk__str_eq(entry->last->op_type, PCMK_ACTION_STOP,
+ pcmk__str_casei)) {
return FALSE;
} else if (entry->last->rc == PCMK_OCF_OK
- && pcmk__str_eq(entry->last->op_type, CRMD_ACTION_MIGRATE, pcmk__str_casei)) {
+ && pcmk__str_eq(entry->last->op_type, PCMK_ACTION_MIGRATE_TO,
+ pcmk__str_casei)) {
// A stricter check is too complex ... leave that to the scheduler
return FALSE;
@@ -668,7 +667,7 @@ notify_deleted(lrm_state_t * lrm_state, ha_msg_input_t * input, const char *rsc_
crm_info("Notifying %s on %s that %s was%s deleted",
from_sys, (from_host? from_host : "localhost"), rsc_id,
((rc == pcmk_ok)? "" : " not"));
- op = construct_op(lrm_state, input->xml, rsc_id, CRMD_ACTION_DELETE);
+ op = construct_op(lrm_state, input->xml, rsc_id, PCMK_ACTION_DELETE);
controld_rc2event(op, pcmk_legacy2rc(rc));
controld_ack_event_directly(from_host, from_sys, NULL, op, rsc_id);
lrmd_free_event(op);
@@ -1117,7 +1116,8 @@ synthesize_lrmd_failure(lrm_state_t *lrm_state, const xmlNode *action,
op = construct_op(lrm_state, action, ID(xml_rsc), operation);
- if (pcmk__str_eq(operation, RSC_NOTIFY, pcmk__str_casei)) { // Notifications can't fail
+ if (pcmk__str_eq(operation, PCMK_ACTION_NOTIFY, pcmk__str_casei)) {
+ // Notifications can't fail
fake_op_status(lrm_state, op, PCMK_EXEC_DONE, PCMK_OCF_OK, NULL);
} else {
fake_op_status(lrm_state, op, op_status, rc, exit_reason);
@@ -1329,7 +1329,7 @@ do_lrm_delete(ha_msg_input_t *input, lrm_state_t *lrm_state,
if (cib_rc != pcmk_rc_ok) {
lrmd_event_data_t *op = NULL;
- op = construct_op(lrm_state, input->xml, rsc->id, CRMD_ACTION_DELETE);
+ op = construct_op(lrm_state, input->xml, rsc->id, PCMK_ACTION_DELETE);
/* These are resource clean-ups, not actions, so no exit reason is
* needed.
@@ -1394,7 +1394,9 @@ metadata_complete(int pid, const pcmk__action_result_t *result, void *user_data)
md = controld_cache_metadata(lrm_state->metadata_cache, data->rsc,
result->action_stdout);
}
- do_lrm_rsc_op(lrm_state, data->rsc, data->input_xml, md);
+ if (!pcmk_is_set(controld_globals.fsa_input_register, R_HA_DISCONNECTED)) {
+ do_lrm_rsc_op(lrm_state, data->rsc, data->input_xml, md);
+ }
free_metadata_cb_data(data);
}
@@ -1438,11 +1440,11 @@ do_lrm_invoke(long long action,
from_host = crm_element_value(input->msg, F_CRM_HOST_FROM);
}
- if (pcmk__str_eq(crm_op, CRM_OP_LRM_DELETE, pcmk__str_none)) {
+ if (pcmk__str_eq(crm_op, PCMK_ACTION_LRM_DELETE, pcmk__str_none)) {
if (!pcmk__str_eq(from_sys, CRM_SYSTEM_TENGINE, pcmk__str_none)) {
crm_rsc_delete = TRUE; // from crm_resource
}
- operation = CRMD_ACTION_DELETE;
+ operation = PCMK_ACTION_DELETE;
} else if (input->xml != NULL) {
operation = crm_element_value(input->xml, XML_LRM_ATTR_TASK);
@@ -1486,7 +1488,7 @@ do_lrm_invoke(long long action,
} else if (operation != NULL) {
lrmd_rsc_info_t *rsc = NULL;
xmlNode *xml_rsc = find_xml_node(input->xml, XML_CIB_TAG_RESOURCE, TRUE);
- gboolean create_rsc = !pcmk__str_eq(operation, CRMD_ACTION_DELETE,
+ gboolean create_rsc = !pcmk__str_eq(operation, PCMK_ACTION_DELETE,
pcmk__str_none);
int rc;
@@ -1534,12 +1536,13 @@ do_lrm_invoke(long long action,
return;
}
- if (pcmk__str_eq(operation, CRMD_ACTION_CANCEL, pcmk__str_none)) {
+ if (pcmk__str_eq(operation, PCMK_ACTION_CANCEL, pcmk__str_none)) {
if (!do_lrm_cancel(input, lrm_state, rsc, from_host, from_sys)) {
crm_log_xml_warn(input->xml, "Bad command");
}
- } else if (pcmk__str_eq(operation, CRMD_ACTION_DELETE, pcmk__str_none)) {
+ } else if (pcmk__str_eq(operation, PCMK_ACTION_DELETE,
+ pcmk__str_none)) {
do_lrm_delete(input, lrm_state, rsc, from_sys, from_host,
crm_rsc_delete, user_name);
@@ -1554,7 +1557,7 @@ do_lrm_invoke(long long action,
* changed (using something like inotify, or a hash or modification
* time of the agent executable).
*/
- if (strcmp(operation, CRMD_ACTION_START) != 0) {
+ if (strcmp(operation, PCMK_ACTION_START) != 0) {
md = controld_get_rsc_metadata(lrm_state, rsc,
controld_metadata_from_cache);
}
@@ -1619,7 +1622,8 @@ construct_op(const lrm_state_t *lrm_state, const xmlNode *rsc_op,
lrmd__set_result(op, PCMK_OCF_UNKNOWN, PCMK_EXEC_PENDING, NULL);
if (rsc_op == NULL) {
- CRM_LOG_ASSERT(pcmk__str_eq(CRMD_ACTION_STOP, operation, pcmk__str_casei));
+ CRM_LOG_ASSERT(pcmk__str_eq(operation, PCMK_ACTION_STOP,
+ pcmk__str_casei));
op->user_data = NULL;
/* the stop_all_resources() case
* by definition there is no DC (or they'd be shutting
@@ -1654,7 +1658,7 @@ construct_op(const lrm_state_t *lrm_state, const xmlNode *rsc_op,
class = crm_element_value(primitive, XML_AGENT_ATTR_CLASS);
if (pcmk_is_set(pcmk_get_ra_caps(class), pcmk_ra_cap_fence_params)
- && pcmk__str_eq(operation, CRMD_ACTION_STATUS, pcmk__str_casei)
+ && pcmk__str_eq(operation, PCMK_ACTION_MONITOR, pcmk__str_casei)
&& (op->interval_ms > 0)) {
op_timeout = g_hash_table_lookup(params, "pcmk_monitor_timeout");
@@ -1663,7 +1667,7 @@ construct_op(const lrm_state_t *lrm_state, const xmlNode *rsc_op,
}
}
- if (!pcmk__str_eq(operation, RSC_STOP, pcmk__str_casei)) {
+ if (!pcmk__str_eq(operation, PCMK_ACTION_STOP, pcmk__str_casei)) {
op->params = params;
} else {
@@ -1703,7 +1707,8 @@ construct_op(const lrm_state_t *lrm_state, const xmlNode *rsc_op,
op->user_data = strdup(transition);
if (op->interval_ms != 0) {
- if (pcmk__strcase_any_of(operation, CRMD_ACTION_START, CRMD_ACTION_STOP, NULL)) {
+ if (pcmk__strcase_any_of(operation, PCMK_ACTION_START, PCMK_ACTION_STOP,
+ NULL)) {
crm_err("Start and Stop actions cannot have an interval: %u",
op->interval_ms);
op->interval_ms = 0;
@@ -1849,7 +1854,7 @@ static bool
should_cancel_recurring(const char *rsc_id, const char *action, guint interval_ms)
{
if (is_remote_lrmd_ra(NULL, NULL, rsc_id) && (interval_ms == 0)
- && (strcmp(action, CRMD_ACTION_MIGRATE) == 0)) {
+ && (strcmp(action, PCMK_ACTION_MIGRATE_TO) == 0)) {
/* Don't stop monitoring a migrating Pacemaker Remote connection
* resource until the entire migration has completed. We must detect if
* the connection is unexpectedly severed, even during a migration.
@@ -1859,8 +1864,8 @@ should_cancel_recurring(const char *rsc_id, const char *action, guint interval_m
// Cancel recurring actions before changing resource state
return (interval_ms == 0)
- && !pcmk__str_any_of(action, CRMD_ACTION_STATUS, CRMD_ACTION_NOTIFY,
- NULL);
+ && !pcmk__str_any_of(action, PCMK_ACTION_MONITOR,
+ PCMK_ACTION_NOTIFY, NULL);
}
/*!
@@ -1876,7 +1881,7 @@ static const char *
should_nack_action(const char *action)
{
if (pcmk_is_set(controld_globals.fsa_input_register, R_SHUTDOWN)
- && pcmk__str_eq(action, RSC_START, pcmk__str_none)) {
+ && pcmk__str_eq(action, PCMK_ACTION_START, pcmk__str_none)) {
register_fsa_input(C_SHUTDOWN, I_SHUTDOWN, NULL);
return "Not attempting start due to shutdown in progress";
@@ -1888,7 +1893,7 @@ should_nack_action(const char *action)
case S_TRANSITION_ENGINE:
break;
default:
- if (!pcmk__str_eq(action, CRMD_ACTION_STOP, pcmk__str_none)) {
+ if (!pcmk__str_eq(action, PCMK_ACTION_STOP, pcmk__str_none)) {
return "Controller cannot attempt actions at this time";
}
break;
@@ -1930,8 +1935,8 @@ do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc, xmlNode *msg,
return;
}
- if (pcmk__str_any_of(operation, CRMD_ACTION_RELOAD,
- CRMD_ACTION_RELOAD_AGENT, NULL)) {
+ if (pcmk__str_any_of(operation, PCMK_ACTION_RELOAD,
+ PCMK_ACTION_RELOAD_AGENT, NULL)) {
/* Pre-2.1.0 DCs will schedule reload actions only, and 2.1.0+ DCs
* will schedule reload-agent actions only. In either case, we need
* to map that to whatever the resource agent actually supports.
@@ -1939,9 +1944,9 @@ do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc, xmlNode *msg,
*/
if ((md != NULL)
&& pcmk_is_set(md->ra_flags, ra_supports_legacy_reload)) {
- operation = CRMD_ACTION_RELOAD;
+ operation = PCMK_ACTION_RELOAD;
} else {
- operation = CRMD_ACTION_RELOAD_AGENT;
+ operation = PCMK_ACTION_RELOAD_AGENT;
}
}
@@ -1968,8 +1973,9 @@ do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc, xmlNode *msg,
/* now do the op */
crm_notice("Requesting local execution of %s operation for %s on %s "
CRM_XS " transition_key=%s op_key=" PCMK__OP_FMT,
- crm_action_str(op->op_type, op->interval_ms), rsc->id, lrm_state->node_name,
- pcmk__s(transition, ""), rsc->id, operation, op->interval_ms);
+ pcmk__readable_action(op->op_type, op->interval_ms), rsc->id,
+ lrm_state->node_name, pcmk__s(transition, ""), rsc->id,
+ operation, op->interval_ms);
nack_reason = should_nack_action(operation);
if (nack_reason != NULL) {
@@ -2131,7 +2137,8 @@ log_executor_event(const lrmd_event_data_t *op, const char *op_key,
GString *str = g_string_sized_new(100); // reasonable starting size
pcmk__g_strcat(str,
- "Result of ", crm_action_str(op->op_type, op->interval_ms),
+ "Result of ",
+ pcmk__readable_action(op->op_type, op->interval_ms),
" operation for ", op->rsc_id, NULL);
if (node_name != NULL) {
@@ -2401,7 +2408,8 @@ process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op,
log_executor_event(op, op_key, node_name, removed);
if (lrm_state) {
- if (!pcmk__str_eq(op->op_type, RSC_METADATA, pcmk__str_casei)) {
+ if (!pcmk__str_eq(op->op_type, PCMK_ACTION_META_DATA,
+ pcmk__str_casei)) {
crmd_alert_resource_op(lrm_state->node_name, op);
} else if (rsc && (op->rc == PCMK_OCF_OK)) {
char *metadata = unescape_newlines(op->output);
diff --git a/daemons/controld/controld_execd_state.c b/daemons/controld/controld_execd_state.c
index 8c68bfc..b90cc5e 100644
--- a/daemons/controld/controld_execd_state.c
+++ b/daemons/controld/controld_execd_state.c
@@ -132,12 +132,6 @@ lrm_state_create(const char *node_name)
return state;
}
-void
-lrm_state_destroy(const char *node_name)
-{
- g_hash_table_remove(lrm_state_table, node_name);
-}
-
static gboolean
remote_proxy_remove_by_node(gpointer key, gpointer value, gpointer user_data)
{
@@ -307,7 +301,7 @@ lrm_state_destroy_all(void)
lrm_state_t *
lrm_state_find(const char *node_name)
{
- if (!node_name) {
+ if ((node_name == NULL) || (lrm_state_table == NULL)) {
return NULL;
}
return g_hash_table_lookup(lrm_state_table, node_name);
@@ -318,6 +312,8 @@ lrm_state_find_or_create(const char *node_name)
{
lrm_state_t *lrm_state;
+ CRM_CHECK(lrm_state_table != NULL, return NULL);
+
lrm_state = g_hash_table_lookup(lrm_state_table, node_name);
if (!lrm_state) {
lrm_state = lrm_state_create(node_name);
@@ -329,6 +325,9 @@ lrm_state_find_or_create(const char *node_name)
GList *
lrm_state_get_list(void)
{
+ if (lrm_state_table == NULL) {
+ return NULL;
+ }
return g_hash_table_get_values(lrm_state_table);
}
@@ -799,7 +798,7 @@ lrm_state_unregister_rsc(lrm_state_t * lrm_state,
}
if (is_remote_lrmd_ra(NULL, NULL, rsc_id)) {
- lrm_state_destroy(rsc_id);
+ g_hash_table_remove(lrm_state_table, rsc_id);
return pcmk_ok;
}
diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c
index 89cb61f..9557d9e 100644
--- a/daemons/controld/controld_fencing.c
+++ b/daemons/controld/controld_fencing.c
@@ -218,8 +218,11 @@ send_stonith_update(pcmk__graph_action_t *action, const char *target,
CRM_CHECK(target != NULL, return);
CRM_CHECK(uuid != NULL, return);
- /* Make sure the membership and join caches are accurate */
- peer = crm_get_peer_full(0, target, CRM_GET_PEER_ANY);
+ /* Make sure the membership and join caches are accurate.
+ * Also try to find any existing node cache entry by node UUID, in case the
+ * entry doesn't have a uname yet.
+ */
+ peer = pcmk__get_peer_full(0, target, uuid, CRM_GET_PEER_ANY);
CRM_CHECK(peer != NULL, return);
@@ -391,7 +394,7 @@ execute_stonith_cleanup(void)
*/
static stonith_t *stonith_api = NULL;
-static crm_trigger_t *stonith_reconnect = NULL;
+static mainloop_timer_t *controld_fencer_connect_timer = NULL;
static char *te_client_id = NULL;
static gboolean
@@ -422,7 +425,7 @@ fail_incompletable_stonith(pcmk__graph_t *graph)
}
task = crm_element_value(action->xml, XML_LRM_ATTR_TASK);
- if (task && pcmk__str_eq(task, CRM_OP_FENCE, pcmk__str_casei)) {
+ if (pcmk__str_eq(task, PCMK_ACTION_STONITH, pcmk__str_casei)) {
pcmk__set_graph_action_flags(action, pcmk__graph_action_failed);
last_action = action->xml;
pcmk__update_graph(graph, action);
@@ -447,11 +450,12 @@ tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e)
te_cleanup_stonith_history_sync(st, FALSE);
if (pcmk_is_set(controld_globals.fsa_input_register, R_ST_REQUIRED)) {
- crm_crit("Fencing daemon connection failed");
- mainloop_set_trigger(stonith_reconnect);
-
+ crm_err("Lost fencer connection (will attempt to reconnect)");
+ if (!mainloop_timer_running(controld_fencer_connect_timer)) {
+ mainloop_timer_start(controld_fencer_connect_timer);
+ }
} else {
- crm_info("Fencing daemon disconnected");
+ crm_info("Disconnected from fencer");
}
if (stonith_api) {
@@ -515,7 +519,7 @@ handle_fence_notification(stonith_t *st, stonith_event_t *event)
crmd_alert_fencing_op(event);
- if (pcmk__str_eq("on", event->action, pcmk__str_none)) {
+ if (pcmk__str_eq(PCMK_ACTION_ON, event->action, pcmk__str_none)) {
// Unfencing doesn't need special handling, just a log message
if (succeeded) {
crm_notice("%s was unfenced by %s at the request of %s@%s",
@@ -647,14 +651,14 @@ handle_fence_notification(stonith_t *st, stonith_event_t *event)
/*!
* \brief Connect to fencer
*
- * \param[in] user_data If NULL, retry failures now, otherwise retry in main loop
+ * \param[in] user_data If NULL, retry failures now, otherwise retry in mainloop timer
*
- * \return TRUE
+ * \return G_SOURCE_REMOVE on success, G_SOURCE_CONTINUE to retry
* \note If user_data is NULL, this will wait 2s between attempts, for up to
* 30 attempts, meaning the controller could be blocked as long as 58s.
*/
-static gboolean
-te_connect_stonith(gpointer user_data)
+gboolean
+controld_timer_fencer_connect(gpointer user_data)
{
int rc = pcmk_ok;
@@ -662,13 +666,13 @@ te_connect_stonith(gpointer user_data)
stonith_api = stonith_api_new();
if (stonith_api == NULL) {
crm_err("Could not connect to fencer: API memory allocation failed");
- return TRUE;
+ return G_SOURCE_REMOVE;
}
}
if (stonith_api->state != stonith_disconnected) {
crm_trace("Already connected to fencer, no need to retry");
- return TRUE;
+ return G_SOURCE_REMOVE;
}
if (user_data == NULL) {
@@ -681,17 +685,30 @@ te_connect_stonith(gpointer user_data)
} else {
// Non-blocking (retry failures later in main loop)
rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL);
+
+ if (controld_fencer_connect_timer == NULL) {
+ controld_fencer_connect_timer =
+ mainloop_timer_add("controld_fencer_connect", 1000,
+ TRUE, controld_timer_fencer_connect,
+ GINT_TO_POINTER(TRUE));
+ }
+
if (rc != pcmk_ok) {
if (pcmk_is_set(controld_globals.fsa_input_register,
R_ST_REQUIRED)) {
crm_notice("Fencer connection failed (will retry): %s "
CRM_XS " rc=%d", pcmk_strerror(rc), rc);
- mainloop_set_trigger(stonith_reconnect);
+
+ if (!mainloop_timer_running(controld_fencer_connect_timer)) {
+ mainloop_timer_start(controld_fencer_connect_timer);
+ }
+
+ return G_SOURCE_CONTINUE;
} else {
crm_info("Fencer connection failed (ignoring because no longer required): %s "
CRM_XS " rc=%d", pcmk_strerror(rc), rc);
}
- return TRUE;
+ return G_SOURCE_REMOVE;
}
}
@@ -709,23 +726,7 @@ te_connect_stonith(gpointer user_data)
crm_notice("Fencer successfully connected");
}
- return TRUE;
-}
-
-/*!
- \internal
- \brief Schedule fencer connection attempt in main loop
-*/
-void
-controld_trigger_fencer_connect(void)
-{
- if (stonith_reconnect == NULL) {
- stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW,
- te_connect_stonith,
- GINT_TO_POINTER(TRUE));
- }
- controld_set_fsa_input_flags(R_ST_REQUIRED);
- mainloop_set_trigger(stonith_reconnect);
+ return G_SOURCE_REMOVE;
}
void
@@ -745,9 +746,9 @@ controld_disconnect_fencer(bool destroy)
stonith_api->cmds->free(stonith_api);
stonith_api = NULL;
}
- if (stonith_reconnect) {
- mainloop_destroy_trigger(stonith_reconnect);
- stonith_reconnect = NULL;
+ if (controld_fencer_connect_timer) {
+ mainloop_timer_del(controld_fencer_connect_timer);
+ controld_fencer_connect_timer = NULL;
}
if (te_client_id) {
free(te_client_id);
@@ -843,7 +844,7 @@ tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data)
crm_info("Fence operation %d for %s succeeded", data->call_id, target);
if (!(pcmk_is_set(action->flags, pcmk__graph_action_confirmed))) {
te_action_confirmed(action, NULL);
- if (pcmk__str_eq("on", op, pcmk__str_casei)) {
+ if (pcmk__str_eq(PCMK_ACTION_ON, op, pcmk__str_casei)) {
const char *value = NULL;
char *now = pcmk__ttoa(time(NULL));
gboolean is_remote_node = FALSE;
@@ -981,7 +982,7 @@ controld_execute_fence_action(pcmk__graph_t *graph,
priority_delay ? priority_delay : "");
/* Passing NULL means block until we can connect... */
- te_connect_stonith(NULL);
+ controld_timer_fencer_connect(NULL);
pcmk__scan_min_int(priority_delay, &delay_i, 0);
rc = fence_with_delay(target, type, delay_i);
@@ -1000,12 +1001,14 @@ controld_execute_fence_action(pcmk__graph_t *graph,
bool
controld_verify_stonith_watchdog_timeout(const char *value)
{
+ long st_timeout = value? crm_get_msec(value) : 0;
const char *our_nodename = controld_globals.our_nodename;
gboolean rv = TRUE;
- if (stonith_api && (stonith_api->state != stonith_disconnected) &&
- stonith__watchdog_fencing_enabled_for_node_api(stonith_api,
- our_nodename)) {
+ if (st_timeout == 0
+ || (stonith_api && (stonith_api->state != stonith_disconnected) &&
+ stonith__watchdog_fencing_enabled_for_node_api(stonith_api,
+ our_nodename))) {
rv = pcmk__valid_sbd_timeout(value);
}
return rv;
diff --git a/daemons/controld/controld_fencing.h b/daemons/controld/controld_fencing.h
index 86a5050..76779c6 100644
--- a/daemons/controld/controld_fencing.h
+++ b/daemons/controld/controld_fencing.h
@@ -19,7 +19,7 @@ void controld_configure_fencing(GHashTable *options);
void st_fail_count_reset(const char * target);
// stonith API client
-void controld_trigger_fencer_connect(void);
+gboolean controld_timer_fencer_connect(gpointer user_data);
void controld_disconnect_fencer(bool destroy);
int controld_execute_fence_action(pcmk__graph_t *graph,
pcmk__graph_action_t *action);
diff --git a/daemons/controld/controld_fsa.c b/daemons/controld/controld_fsa.c
index 622d1c8..06559b8 100644
--- a/daemons/controld/controld_fsa.c
+++ b/daemons/controld/controld_fsa.c
@@ -205,7 +205,6 @@ s_crmd_fsa(enum crmd_fsa_cause cause)
fsa_data->data_type = fsa_dt_none;
controld_globals.fsa_message_queue
= g_list_append(controld_globals.fsa_message_queue, fsa_data);
- fsa_data = NULL;
}
while ((controld_globals.fsa_message_queue != NULL)
&& !pcmk_is_set(controld_globals.flags, controld_fsa_is_stalled)) {
@@ -275,7 +274,6 @@ s_crmd_fsa(enum crmd_fsa_cause cause)
/* start doing things... */
s_crmd_fsa_actions(fsa_data);
delete_fsa_input(fsa_data);
- fsa_data = NULL;
}
if ((controld_globals.fsa_message_queue != NULL)
@@ -620,11 +618,6 @@ do_state_transition(enum crmd_fsa_state cur_state,
if (next_state != S_ELECTION && cur_state != S_RELEASE_DC) {
controld_stop_current_election_timeout();
}
-#if 0
- if ((controld_globals.fsa_input_register & R_SHUTDOWN)) {
- controld_set_fsa_action_flags(A_DC_TIMER_STOP);
- }
-#endif
if (next_state == S_INTEGRATION) {
controld_set_fsa_action_flags(A_INTEGRATE_TIMER_START);
} else {
diff --git a/daemons/controld/controld_globals.h b/daemons/controld/controld_globals.h
index eff1607..2ff8a57 100644
--- a/daemons/controld/controld_globals.h
+++ b/daemons/controld/controld_globals.h
@@ -45,9 +45,6 @@ typedef struct {
//! Connection to the CIB
cib_t *cib_conn;
- //! CIB connection's client ID
- const char *cib_client_id;
-
// Scheduler
@@ -93,6 +90,9 @@ typedef struct {
//! Max lifetime (in seconds) of a resource's shutdown lock to a node
guint shutdown_lock_limit;
+ //! Node pending timeout
+ guint node_pending_timeout;
+
//! Main event loop
GMainLoop *mainloop;
} controld_globals_t;
diff --git a/daemons/controld/controld_join_client.c b/daemons/controld/controld_join_client.c
index da6a9d6..805ecbd 100644
--- a/daemons/controld/controld_join_client.c
+++ b/daemons/controld/controld_join_client.c
@@ -112,15 +112,6 @@ do_cl_join_offer_respond(long long action,
CRM_CHECK(input != NULL, return);
-#if 0
- if (we are sick) {
- log error;
-
- /* save the request for later? */
- return;
- }
-#endif
-
welcome_from = crm_element_value(input->msg, F_CRM_HOST_FROM);
join_id = crm_element_value(input->msg, F_CRM_JOIN_ID);
crm_trace("Accepting cluster join offer from node %s "CRM_XS" join-%s",
@@ -195,32 +186,34 @@ join_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *
free_xml(generation);
}
-static void
-set_join_state(const char * start_state)
+void
+set_join_state(const char *start_state, const char *node_name, const char *node_uuid,
+ bool remote)
{
if (pcmk__str_eq(start_state, "standby", pcmk__str_casei)) {
crm_notice("Forcing node %s to join in %s state per configured "
- "environment", controld_globals.our_nodename, start_state);
+ "environment", node_name, start_state);
cib__update_node_attr(controld_globals.logger_out,
controld_globals.cib_conn, cib_sync_call,
- XML_CIB_TAG_NODES, controld_globals.our_uuid,
- NULL, NULL, NULL, "standby", "on", NULL, NULL);
+ XML_CIB_TAG_NODES, node_uuid,
+ NULL, NULL, NULL, "standby", "on", NULL,
+ remote ? "remote" : NULL);
} else if (pcmk__str_eq(start_state, "online", pcmk__str_casei)) {
crm_notice("Forcing node %s to join in %s state per configured "
- "environment", controld_globals.our_nodename, start_state);
+ "environment", node_name, start_state);
cib__update_node_attr(controld_globals.logger_out,
controld_globals.cib_conn, cib_sync_call,
- XML_CIB_TAG_NODES, controld_globals.our_uuid,
- NULL, NULL, NULL, "standby", "off", NULL, NULL);
+ XML_CIB_TAG_NODES, node_uuid,
+ NULL, NULL, NULL, "standby", "off", NULL,
+ remote ? "remote" : NULL);
} else if (pcmk__str_eq(start_state, "default", pcmk__str_casei)) {
- crm_debug("Not forcing a starting state on node %s",
- controld_globals.our_nodename);
+ crm_debug("Not forcing a starting state on node %s", node_name);
} else {
crm_warn("Unrecognized start state '%s', using 'default' (%s)",
- start_state, controld_globals.our_nodename);
+ start_state, node_name);
}
}
@@ -335,7 +328,8 @@ do_cl_join_finalize_respond(long long action,
first_join = FALSE;
if (start_state) {
- set_join_state(start_state);
+ set_join_state(start_state, controld_globals.our_nodename,
+ controld_globals.our_uuid, false);
}
}
diff --git a/daemons/controld/controld_join_dc.c b/daemons/controld/controld_join_dc.c
index f82b132..2fe6710 100644
--- a/daemons/controld/controld_join_dc.c
+++ b/daemons/controld/controld_join_dc.c
@@ -172,7 +172,6 @@ start_join_round(void)
max_generation_xml = NULL;
}
controld_clear_fsa_input_flags(R_HAVE_CIB);
- controld_forget_all_cib_replace_calls();
}
/*!
@@ -607,10 +606,6 @@ do_dc_join_finalize(long long action,
rc = controld_globals.cib_conn->cmds->sync_from(controld_globals.cib_conn,
sync_from, NULL, cib_none);
-
- if (pcmk_is_set(controld_globals.fsa_input_register, R_HAVE_CIB)) {
- controld_record_cib_replace_call(rc);
- }
fsa_register_cib_callback(rc, sync_from, finalize_sync_callback);
}
@@ -629,8 +624,6 @@ finalize_sync_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, voi
{
CRM_LOG_ASSERT(-EPERM != rc);
- controld_forget_cib_replace_call(call_id);
-
if (rc != pcmk_ok) {
const char *sync_from = (const char *) user_data;
@@ -674,22 +667,25 @@ finalize_sync_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, voi
}
static void
-join_update_complete_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
+join_node_state_commit_callback(xmlNode *msg, int call_id, int rc,
+ xmlNode *output, void *user_data)
{
- fsa_data_t *msg_data = NULL;
+ const char *node = user_data;
- if (rc == pcmk_ok) {
- crm_debug("join-%d node history update (via CIB call %d) complete",
- current_join_id, call_id);
- check_join_state(controld_globals.fsa_state, __func__);
+ if (rc != pcmk_ok) {
+ fsa_data_t *msg_data = NULL; // for register_fsa_error() macro
- } else {
- crm_err("join-%d node history update (via CIB call %d) failed: %s "
- "(next transition may determine resource status incorrectly)",
- current_join_id, call_id, pcmk_strerror(rc));
+ crm_crit("join-%d node history update (via CIB call %d) for node %s "
+ "failed: %s",
+ current_join_id, call_id, node, pcmk_strerror(rc));
crm_log_xml_debug(msg, "failed");
register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
}
+
+ crm_debug("join-%d node history update (via CIB call %d) for node %s "
+ "complete",
+ current_join_id, call_id, node);
+ check_join_state(controld_globals.fsa_state, __func__);
}
/* A_DC_JOIN_PROCESS_ACK */
@@ -701,33 +697,39 @@ do_dc_join_ack(long long action,
{
int join_id = -1;
ha_msg_input_t *join_ack = fsa_typed_data(fsa_dt_ha_msg);
- enum controld_section_e section = controld_section_lrm;
- const int cib_opts = cib_scope_local|cib_can_create;
const char *op = crm_element_value(join_ack->msg, F_CRM_TASK);
- const char *join_from = crm_element_value(join_ack->msg, F_CRM_HOST_FROM);
+ char *join_from = crm_element_value_copy(join_ack->msg, F_CRM_HOST_FROM);
crm_node_t *peer = NULL;
+ enum controld_section_e section = controld_section_lrm;
+ char *xpath = NULL;
+ xmlNode *state = join_ack->xml;
+ xmlNode *execd_state = NULL;
+
+ cib_t *cib = controld_globals.cib_conn;
+ int rc = pcmk_ok;
+
// Sanity checks
if (join_from == NULL) {
crm_warn("Ignoring message received without node identification");
- return;
+ goto done;
}
if (op == NULL) {
crm_warn("Ignoring message received from %s without task", join_from);
- return;
+ goto done;
}
if (strcmp(op, CRM_OP_JOIN_CONFIRM)) {
crm_debug("Ignoring '%s' message from %s while waiting for '%s'",
op, join_from, CRM_OP_JOIN_CONFIRM);
- return;
+ goto done;
}
if (crm_element_value_int(join_ack->msg, F_CRM_JOIN_ID, &join_id) != 0) {
crm_warn("Ignoring join confirmation from %s without valid join ID",
join_from);
- return;
+ goto done;
}
peer = crm_get_peer(0, join_from);
@@ -736,7 +738,7 @@ do_dc_join_ack(long long action,
"(currently %s not %s)",
join_id, join_from, crm_join_phase_str(peer->join),
crm_join_phase_str(crm_join_finalized));
- return;
+ goto done;
}
if (join_id != current_join_id) {
@@ -744,40 +746,85 @@ do_dc_join_ack(long long action,
"because currently on join-%d",
join_id, join_from, current_join_id);
crm_update_peer_join(__func__, peer, crm_join_nack);
- return;
+ goto done;
}
crm_update_peer_join(__func__, peer, crm_join_confirmed);
/* Update CIB with node's current executor state. A new transition will be
- * triggered later, when the CIB notifies us of the change.
+ * triggered later, when the CIB manager notifies us of the change.
+ *
+ * The delete and modify requests are part of an atomic transaction.
*/
+ rc = cib->cmds->init_transaction(cib);
+ if (rc != pcmk_ok) {
+ goto done;
+ }
+
+ // Delete relevant parts of node's current executor state from CIB
if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) {
section = controld_section_lrm_unlocked;
}
- controld_delete_node_state(join_from, section, cib_scope_local);
+ controld_node_state_deletion_strings(join_from, section, &xpath, NULL);
+
+ rc = cib->cmds->remove(cib, xpath, NULL,
+ cib_scope_local
+ |cib_xpath
+ |cib_multiple
+ |cib_transaction);
+ if (rc != pcmk_ok) {
+ goto done;
+ }
+
+ // Update CIB with node's latest known executor state
if (pcmk__str_eq(join_from, controld_globals.our_nodename,
pcmk__str_casei)) {
- xmlNode *now_dc_lrmd_state = controld_query_executor_state();
-
- if (now_dc_lrmd_state != NULL) {
- crm_debug("Updating local node history for join-%d "
- "from query result", join_id);
- controld_update_cib(XML_CIB_TAG_STATUS, now_dc_lrmd_state, cib_opts,
- join_update_complete_callback);
- free_xml(now_dc_lrmd_state);
+
+ // Use the latest possible state if processing our own join ack
+ execd_state = controld_query_executor_state();
+
+ if (execd_state != NULL) {
+ crm_debug("Updating local node history for join-%d from query "
+ "result",
+ current_join_id);
+ state = execd_state;
+
} else {
crm_warn("Updating local node history from join-%d confirmation "
- "because query failed", join_id);
- controld_update_cib(XML_CIB_TAG_STATUS, join_ack->xml, cib_opts,
- join_update_complete_callback);
+ "because query failed",
+ current_join_id);
}
+
} else {
crm_debug("Updating node history for %s from join-%d confirmation",
- join_from, join_id);
- controld_update_cib(XML_CIB_TAG_STATUS, join_ack->xml, cib_opts,
- join_update_complete_callback);
+ join_from, current_join_id);
+ }
+
+ rc = cib->cmds->modify(cib, XML_CIB_TAG_STATUS, state,
+ cib_scope_local|cib_can_create|cib_transaction);
+ free_xml(execd_state);
+ if (rc != pcmk_ok) {
+ goto done;
+ }
+
+ // Commit the transaction
+ rc = cib->cmds->end_transaction(cib, true, cib_scope_local);
+ fsa_register_cib_callback(rc, join_from, join_node_state_commit_callback);
+
+ if (rc > 0) {
+ // join_from will be freed after callback
+ join_from = NULL;
+ rc = pcmk_ok;
+ }
+
+done:
+ if (rc != pcmk_ok) {
+ crm_crit("join-%d node history update for node %s failed: %s",
+ current_join_id, join_from, pcmk_strerror(rc));
+ register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
}
+ free(join_from);
+ free(xpath);
}
void
@@ -808,7 +855,7 @@ finalize_join_for(gpointer key, gpointer value, gpointer user_data)
*/
crm_trace("Updating node name and UUID in CIB for %s", join_to);
tmp1 = create_xml_node(NULL, XML_CIB_TAG_NODE);
- set_uuid(tmp1, XML_ATTR_ID, join_node);
+ crm_xml_add(tmp1, XML_ATTR_ID, crm_peer_uuid(join_node));
crm_xml_add(tmp1, XML_ATTR_UNAME, join_to);
fsa_cib_anon_update(XML_CIB_TAG_NODES, tmp1);
free_xml(tmp1);
diff --git a/daemons/controld/controld_lrm.h b/daemons/controld/controld_lrm.h
index 25f3db3..c3113e4 100644
--- a/daemons/controld/controld_lrm.h
+++ b/daemons/controld/controld_lrm.h
@@ -109,11 +109,6 @@ gboolean lrm_state_init_local(void);
void lrm_state_destroy_all(void);
/*!
- * \brief Destroy executor connection by node name
- */
-void lrm_state_destroy(const char *node_name);
-
-/*!
* \brief Find lrm_state data by node name
*/
lrm_state_t *lrm_state_find(const char *node_name);
diff --git a/daemons/controld/controld_membership.c b/daemons/controld/controld_membership.c
index 1f7e4c0..f25d1e9 100644
--- a/daemons/controld/controld_membership.c
+++ b/daemons/controld/controld_membership.c
@@ -138,10 +138,8 @@ create_node_state_update(crm_node_t *node, int flags, xmlNode *parent,
pcmk__xe_set_bool_attr(node_state, XML_NODE_IS_REMOTE, true);
}
- set_uuid(node_state, XML_ATTR_ID, node);
-
- if (crm_element_value(node_state, XML_ATTR_ID) == NULL) {
- crm_info("Node update for %s cancelled: no id", node->uname);
+ if (crm_xml_add(node_state, XML_ATTR_ID, crm_peer_uuid(node)) == NULL) {
+ crm_info("Node update for %s cancelled: no ID", node->uname);
free_xml(node_state);
return NULL;
}
@@ -149,17 +147,31 @@ create_node_state_update(crm_node_t *node, int flags, xmlNode *parent,
crm_xml_add(node_state, XML_ATTR_UNAME, node->uname);
if ((flags & node_update_cluster) && node->state) {
- pcmk__xe_set_bool_attr(node_state, XML_NODE_IN_CLUSTER,
- pcmk__str_eq(node->state, CRM_NODE_MEMBER, pcmk__str_casei));
+ if (compare_version(controld_globals.dc_version, "3.18.0") >= 0) {
+ // A value of 0 means the node is not a cluster member.
+ crm_xml_add_ll(node_state, PCMK__XA_IN_CCM, node->when_member);
+
+ } else {
+ pcmk__xe_set_bool_attr(node_state, PCMK__XA_IN_CCM,
+ pcmk__str_eq(node->state, CRM_NODE_MEMBER,
+ pcmk__str_casei));
+ }
}
if (!pcmk_is_set(node->flags, crm_remote_node)) {
if (flags & node_update_peer) {
- value = OFFLINESTATUS;
- if (pcmk_is_set(node->processes, crm_get_cluster_proc())) {
- value = ONLINESTATUS;
+ if (compare_version(controld_globals.dc_version, "3.18.0") >= 0) {
+ // A value of 0 means the peer is offline in CPG.
+ crm_xml_add_ll(node_state, PCMK__XA_CRMD, node->when_online);
+
+ } else {
+ // @COMPAT DCs < 2.1.7 use online/offline rather than timestamp
+ value = OFFLINESTATUS;
+ if (pcmk_is_set(node->processes, crm_get_cluster_proc())) {
+ value = ONLINESTATUS;
+ }
+ crm_xml_add(node_state, PCMK__XA_CRMD, value);
}
- crm_xml_add(node_state, XML_NODE_IS_PEER, value);
}
if (flags & node_update_join) {
@@ -168,11 +180,11 @@ create_node_state_update(crm_node_t *node, int flags, xmlNode *parent,
} else {
value = CRMD_JOINSTATE_MEMBER;
}
- crm_xml_add(node_state, XML_NODE_JOIN_STATE, value);
+ crm_xml_add(node_state, PCMK__XA_JOIN, value);
}
if (flags & node_update_expected) {
- crm_xml_add(node_state, XML_NODE_EXPECTED, node->expected);
+ crm_xml_add(node_state, PCMK__XA_EXPECTED, node->expected);
}
}
@@ -210,7 +222,7 @@ search_conflicting_node_callback(xmlNode * msg, int call_id, int rc,
return;
}
- if (pcmk__str_eq(crm_element_name(output), XML_CIB_TAG_NODE, pcmk__str_casei)) {
+ if (pcmk__xe_is(output, XML_CIB_TAG_NODE)) {
node_xml = output;
} else {
@@ -224,7 +236,7 @@ search_conflicting_node_callback(xmlNode * msg, int call_id, int rc,
crm_node_t *node = NULL;
gboolean known = FALSE;
- if (!pcmk__str_eq(crm_element_name(node_xml), XML_CIB_TAG_NODE, pcmk__str_casei)) {
+ if (!pcmk__xe_is(node_xml, XML_CIB_TAG_NODE)) {
continue;
}
diff --git a/daemons/controld/controld_messages.c b/daemons/controld/controld_messages.c
index 54b27ec..39f3c7a 100644
--- a/daemons/controld/controld_messages.c
+++ b/daemons/controld/controld_messages.c
@@ -328,52 +328,80 @@ route_message(enum crmd_fsa_cause cause, xmlNode * input)
gboolean
relay_message(xmlNode * msg, gboolean originated_locally)
{
- int dest = 1;
+ enum crm_ais_msg_types dest = crm_msg_ais;
bool is_for_dc = false;
bool is_for_dcib = false;
bool is_for_te = false;
bool is_for_crm = false;
bool is_for_cib = false;
bool is_local = false;
- const char *host_to = crm_element_value(msg, F_CRM_HOST_TO);
- const char *sys_to = crm_element_value(msg, F_CRM_SYS_TO);
- const char *sys_from = crm_element_value(msg, F_CRM_SYS_FROM);
- const char *type = crm_element_value(msg, F_TYPE);
- const char *task = crm_element_value(msg, F_CRM_TASK);
- const char *ref = crm_element_value(msg, XML_ATTR_REFERENCE);
+ bool broadcast = false;
+ const char *host_to = NULL;
+ const char *sys_to = NULL;
+ const char *sys_from = NULL;
+ const char *type = NULL;
+ const char *task = NULL;
+ const char *ref = NULL;
+ crm_node_t *node_to = NULL;
+
+ CRM_CHECK(msg != NULL, return TRUE);
+
+ host_to = crm_element_value(msg, F_CRM_HOST_TO);
+ sys_to = crm_element_value(msg, F_CRM_SYS_TO);
+ sys_from = crm_element_value(msg, F_CRM_SYS_FROM);
+ type = crm_element_value(msg, F_TYPE);
+ task = crm_element_value(msg, F_CRM_TASK);
+ ref = crm_element_value(msg, XML_ATTR_REFERENCE);
+
+ broadcast = pcmk__str_empty(host_to);
if (ref == NULL) {
ref = "without reference ID";
}
- if (msg == NULL) {
- crm_warn("Cannot route empty message");
- return TRUE;
-
- } else if (pcmk__str_eq(task, CRM_OP_HELLO, pcmk__str_casei)) {
- crm_trace("No routing needed for hello message %s", ref);
+ if (pcmk__str_eq(task, CRM_OP_HELLO, pcmk__str_casei)) {
+ crm_trace("Received hello %s from %s (no processing needed)",
+ ref, pcmk__s(sys_from, "unidentified source"));
+ crm_log_xml_trace(msg, "hello");
return TRUE;
+ }
- } else if (!pcmk__str_eq(type, T_CRM, pcmk__str_casei)) {
- crm_warn("Received invalid message %s: type '%s' not '" T_CRM "'",
+ // Require message type (set by create_request())
+ if (!pcmk__str_eq(type, T_CRM, pcmk__str_casei)) {
+ crm_warn("Ignoring invalid message %s with type '%s' (not '" T_CRM "')",
ref, pcmk__s(type, ""));
- crm_log_xml_warn(msg, "[bad message type]");
+ crm_log_xml_trace(msg, "ignored");
return TRUE;
+ }
- } else if (sys_to == NULL) {
- crm_warn("Received invalid message %s: no subsystem", ref);
- crm_log_xml_warn(msg, "[no subsystem]");
+ // Require a destination subsystem (also set by create_request())
+ if (sys_to == NULL) {
+ crm_warn("Ignoring invalid message %s with no " F_CRM_SYS_TO, ref);
+ crm_log_xml_trace(msg, "ignored");
return TRUE;
}
+ // Get the message type appropriate to the destination subsystem
+ if (is_corosync_cluster()) {
+ dest = text2msg_type(sys_to);
+ if ((dest < crm_msg_ais) || (dest > crm_msg_stonith_ng)) {
+ /* Unrecognized value, use a sane default
+ *
+ * @TODO Maybe we should bail instead
+ */
+ dest = crm_msg_crmd;
+ }
+ }
+
is_for_dc = (strcasecmp(CRM_SYSTEM_DC, sys_to) == 0);
is_for_dcib = (strcasecmp(CRM_SYSTEM_DCIB, sys_to) == 0);
is_for_te = (strcasecmp(CRM_SYSTEM_TENGINE, sys_to) == 0);
is_for_cib = (strcasecmp(CRM_SYSTEM_CIB, sys_to) == 0);
is_for_crm = (strcasecmp(CRM_SYSTEM_CRMD, sys_to) == 0);
+ // Check whether message should be processed locally
is_local = false;
- if (pcmk__str_empty(host_to)) {
+ if (broadcast) {
if (is_for_dc || is_for_te) {
is_local = false;
@@ -397,6 +425,7 @@ relay_message(xmlNode * msg, gboolean originated_locally)
} else if (pcmk__str_eq(controld_globals.our_nodename, host_to,
pcmk__str_casei)) {
is_local = true;
+
} else if (is_for_crm && pcmk__str_eq(task, CRM_OP_LRM_DELETE, pcmk__str_casei)) {
xmlNode *msg_data = get_message_xml(msg, F_CRM_DATA);
const char *mode = crm_element_value(msg_data, PCMK__XA_MODE);
@@ -407,69 +436,68 @@ relay_message(xmlNode * msg, gboolean originated_locally)
}
}
- if (is_for_dc || is_for_dcib || is_for_te) {
- if (AM_I_DC && is_for_te) {
- crm_trace("Route message %s locally as transition request", ref);
- send_msg_via_ipc(msg, sys_to);
+ // Check whether message should be relayed
- } else if (AM_I_DC) {
+ if (is_for_dc || is_for_dcib || is_for_te) {
+ if (AM_I_DC) {
+ if (is_for_te) {
+ crm_trace("Route message %s locally as transition request",
+ ref);
+ crm_log_xml_trace(msg, sys_to);
+ send_msg_via_ipc(msg, sys_to);
+ return TRUE; // No further processing of message is needed
+ }
crm_trace("Route message %s locally as DC request", ref);
return FALSE; // More to be done by caller
+ }
- } else if (originated_locally && !pcmk__strcase_any_of(sys_from, CRM_SYSTEM_PENGINE,
- CRM_SYSTEM_TENGINE, NULL)) {
-
- if (is_corosync_cluster()) {
- dest = text2msg_type(sys_to);
+ if (originated_locally
+ && !pcmk__strcase_any_of(sys_from, CRM_SYSTEM_PENGINE,
+ CRM_SYSTEM_TENGINE, NULL)) {
+ crm_trace("Relay message %s to DC (via %s)",
+ ref, pcmk__s(host_to, "broadcast"));
+ crm_log_xml_trace(msg, "relayed");
+ if (!broadcast) {
+ node_to = crm_get_peer(0, host_to);
}
- crm_trace("Relay message %s to DC", ref);
- send_cluster_message(host_to ? crm_get_peer(0, host_to) : NULL, dest, msg, TRUE);
-
- } else {
- /* Neither the TE nor the scheduler should be sending messages
- * to DCs on other nodes. By definition, if we are no longer the DC,
- * then the scheduler's or TE's data should be discarded.
- */
- crm_trace("Discard message %s because we are not DC", ref);
+ send_cluster_message(node_to, dest, msg, TRUE);
+ return TRUE;
}
- } else if (is_local && (is_for_crm || is_for_cib)) {
- crm_trace("Route message %s locally as controller request", ref);
- return FALSE; // More to be done by caller
-
- } else if (is_local) {
- crm_trace("Relay message %s locally to %s",
- ref, (sys_to? sys_to : "unknown client"));
- crm_log_xml_trace(msg, "[IPC relay]");
- send_msg_via_ipc(msg, sys_to);
-
- } else {
- crm_node_t *node_to = NULL;
-
- if (is_corosync_cluster()) {
- dest = text2msg_type(sys_to);
+ /* Transition engine and scheduler messages are sent only to the DC on
+ * the same node. If we are no longer the DC, discard this message.
+ */
+ crm_trace("Ignoring message %s because we are no longer DC", ref);
+ crm_log_xml_trace(msg, "ignored");
+ return TRUE; // No further processing of message is needed
+ }
- if (dest == crm_msg_none || dest > crm_msg_stonith_ng) {
- dest = crm_msg_crmd;
- }
+ if (is_local) {
+ if (is_for_crm || is_for_cib) {
+ crm_trace("Route message %s locally as controller request", ref);
+ return FALSE; // More to be done by caller
}
+ crm_trace("Relay message %s locally to %s", ref, sys_to);
+ crm_log_xml_trace(msg, "IPC-relay");
+ send_msg_via_ipc(msg, sys_to);
+ return TRUE;
+ }
- if (host_to) {
- node_to = pcmk__search_cluster_node_cache(0, host_to);
- if (node_to == NULL) {
- crm_warn("Cannot route message %s: Unknown node %s",
- ref, host_to);
- return TRUE;
- }
- crm_trace("Relay message %s to %s",
- ref, (node_to->uname? node_to->uname : "peer"));
- } else {
- crm_trace("Broadcast message %s to all peers", ref);
+ if (!broadcast) {
+ node_to = pcmk__search_cluster_node_cache(0, host_to, NULL);
+ if (node_to == NULL) {
+ crm_warn("Ignoring message %s because node %s is unknown",
+ ref, host_to);
+ crm_log_xml_trace(msg, "ignored");
+ return TRUE;
}
- send_cluster_message(host_to ? node_to : NULL, dest, msg, TRUE);
}
- return TRUE; // No further processing of message is needed
+ crm_trace("Relay message %s to %s",
+ ref, pcmk__s(host_to, "all peers"));
+ crm_log_xml_trace(msg, "relayed");
+ send_cluster_message(node_to, dest, msg, TRUE);
+ return TRUE;
}
// Return true if field contains a positive integer
@@ -546,6 +574,7 @@ controld_authorize_ipc_message(const xmlNode *client_msg, pcmk__client_t *curr_c
}
crm_trace("Validated IPC hello from client %s", client_name);
+ crm_log_xml_trace(client_msg, "hello");
if (curr_client) {
curr_client->userdata = strdup(client_name);
}
@@ -553,6 +582,7 @@ controld_authorize_ipc_message(const xmlNode *client_msg, pcmk__client_t *curr_c
return false;
rejected:
+ crm_log_xml_trace(client_msg, "rejected");
if (curr_client) {
qb_ipcs_disconnect(curr_client->ipcs);
}
@@ -575,7 +605,9 @@ handle_message(xmlNode *msg, enum crmd_fsa_cause cause)
return I_NULL;
}
- crm_err("Unknown message type: %s", type);
+ crm_warn("Ignoring message with unknown " F_CRM_MSG_TYPE " '%s'",
+ pcmk__s(type, ""));
+ crm_log_xml_trace(msg, "bad");
return I_NULL;
}
@@ -701,7 +733,7 @@ handle_lrm_delete(xmlNode *stored_msg)
crm_info("Notifying %s on %s that %s was%s deleted",
from_sys, (from_host? from_host : "local node"), rsc_id,
((rc == pcmk_rc_ok)? "" : " not"));
- op = lrmd_new_event(rsc_id, CRMD_ACTION_DELETE, 0);
+ op = lrmd_new_event(rsc_id, PCMK_ACTION_DELETE, 0);
op->type = lrmd_event_exec_complete;
op->user_data = strdup(transition? transition : FAKE_TE_ID);
op->params = pcmk__strkey_table(free, free);
@@ -732,7 +764,7 @@ handle_remote_state(const xmlNode *msg)
bool remote_is_up = false;
int rc = pcmk_rc_ok;
- rc = pcmk__xe_get_bool_attr(msg, XML_NODE_IN_CLUSTER, &remote_is_up);
+ rc = pcmk__xe_get_bool_attr(msg, PCMK__XA_IN_CCM, &remote_is_up);
CRM_CHECK(remote_uname && rc == pcmk_rc_ok, return I_NULL);
@@ -818,7 +850,7 @@ handle_node_list(const xmlNode *request)
crm_xml_add_ll(xml, XML_ATTR_ID, (long long) node->id); // uint32_t
crm_xml_add(xml, XML_ATTR_UNAME, node->uname);
- crm_xml_add(xml, XML_NODE_IN_CLUSTER, node->state);
+ crm_xml_add(xml, PCMK__XA_IN_CCM, node->state);
}
// Create and send reply
@@ -875,7 +907,7 @@ handle_node_info_request(const xmlNode *msg)
if (node) {
crm_xml_add(reply_data, XML_ATTR_ID, node->uuid);
crm_xml_add(reply_data, XML_ATTR_UNAME, node->uname);
- crm_xml_add(reply_data, XML_NODE_IS_PEER, node->state);
+ crm_xml_add(reply_data, PCMK__XA_CRMD, node->state);
pcmk__xe_set_bool_attr(reply_data, XML_NODE_IS_REMOTE,
pcmk_is_set(node->flags, crm_remote_node));
}
@@ -988,14 +1020,15 @@ handle_request(xmlNode *stored_msg, enum crmd_fsa_cause cause)
/* Optimize this for the DC - it has the most to do */
+ crm_log_xml_trace(stored_msg, "request");
if (op == NULL) {
- crm_log_xml_warn(stored_msg, "[request without " F_CRM_TASK "]");
+ crm_warn("Ignoring request without " F_CRM_TASK);
return I_NULL;
}
if (strcmp(op, CRM_OP_SHUTDOWN_REQ) == 0) {
const char *from = crm_element_value(stored_msg, F_CRM_HOST_FROM);
- crm_node_t *node = pcmk__search_cluster_node_cache(0, from);
+ crm_node_t *node = pcmk__search_cluster_node_cache(0, from, NULL);
pcmk__update_peer_expected(__func__, node, CRMD_JOINSTATE_DOWN);
if(AM_I_DC == FALSE) {
@@ -1062,11 +1095,6 @@ handle_request(xmlNode *stored_msg, enum crmd_fsa_cause cause)
if (controld_globals.fsa_state == S_HALT) {
crm_debug("Forcing an election from S_HALT");
return I_ELECTION;
-#if 0
- } else if (AM_I_DC) {
- /* This is the old way of doing things but what is gained? */
- return I_ELECTION;
-#endif
}
} else if (strcmp(op, CRM_OP_JOIN_OFFER) == 0) {
@@ -1157,8 +1185,9 @@ handle_response(xmlNode *stored_msg)
{
const char *op = crm_element_value(stored_msg, F_CRM_TASK);
+ crm_log_xml_trace(stored_msg, "reply");
if (op == NULL) {
- crm_log_xml_err(stored_msg, "Bad message");
+ crm_warn("Ignoring reply without " F_CRM_TASK);
} else if (AM_I_DC && strcmp(op, CRM_OP_PECALC) == 0) {
// Check whether scheduler answer been superseded by subsequent request
@@ -1295,7 +1324,7 @@ broadcast_remote_state_message(const char *node_name, bool node_up)
node_name, node_up? "coming up" : "going down");
crm_xml_add(msg, XML_ATTR_ID, node_name);
- pcmk__xe_set_bool_attr(msg, XML_NODE_IN_CLUSTER, node_up);
+ pcmk__xe_set_bool_attr(msg, PCMK__XA_IN_CCM, node_up);
if (node_up) {
crm_xml_add(msg, PCMK__XA_CONN_HOST, controld_globals.our_nodename);
diff --git a/daemons/controld/controld_metadata.c b/daemons/controld/controld_metadata.c
index 240a978..c813ceb 100644
--- a/daemons/controld/controld_metadata.c
+++ b/daemons/controld/controld_metadata.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2017-2022 the Pacemaker project contributors
+ * Copyright 2017-2023 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -172,7 +172,7 @@ controld_cache_metadata(GHashTable *mdc, const lrmd_rsc_info_t *rsc,
const char *action_name = crm_element_value(match, "name");
- if (pcmk__str_eq(action_name, CRMD_ACTION_RELOAD_AGENT,
+ if (pcmk__str_eq(action_name, PCMK_ACTION_RELOAD_AGENT,
pcmk__str_none)) {
if (ocf1_1) {
controld_set_ra_flags(md, key, ra_supports_reload_agent);
@@ -181,7 +181,7 @@ controld_cache_metadata(GHashTable *mdc, const lrmd_rsc_info_t *rsc,
"because it does not support OCF 1.1 or later", key);
}
- } else if (!ocf1_1 && pcmk__str_eq(action_name, CRMD_ACTION_RELOAD,
+ } else if (!ocf1_1 && pcmk__str_eq(action_name, PCMK_ACTION_RELOAD,
pcmk__str_casei)) {
controld_set_ra_flags(md, key, ra_supports_legacy_reload);
}
diff --git a/daemons/controld/controld_remote_ra.c b/daemons/controld/controld_remote_ra.c
index f24b755..d692ef6 100644
--- a/daemons/controld/controld_remote_ra.c
+++ b/daemons/controld/controld_remote_ra.c
@@ -280,6 +280,7 @@ remote_node_up(const char *node_name)
int call_opt;
xmlNode *update, *state;
crm_node_t *node;
+ lrm_state_t *connection_rsc = NULL;
CRM_CHECK(node_name != NULL, return);
crm_info("Announcing Pacemaker Remote node %s", node_name);
@@ -301,6 +302,20 @@ remote_node_up(const char *node_name)
purge_remote_node_attrs(call_opt, node);
pcmk__update_peer_state(__func__, node, CRM_NODE_MEMBER, 0);
+ /* Apply any start state that we were given from the environment on the
+ * remote node.
+ */
+ connection_rsc = lrm_state_find(node->uname);
+
+ if (connection_rsc != NULL) {
+ lrmd_t *lrm = connection_rsc->conn;
+ const char *start_state = lrmd__node_start_state(lrm);
+
+ if (start_state) {
+ set_join_state(start_state, node->uname, node->uuid, true);
+ }
+ }
+
/* pacemaker_remote nodes don't participate in the membership layer,
* so cluster nodes don't automatically get notified when they come and go.
* We send a cluster message to the DC, and update the CIB node state entry,
@@ -392,10 +407,11 @@ check_remote_node_state(const remote_ra_cmd_t *cmd)
return;
}
- if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) {
+ if (pcmk__str_eq(cmd->action, PCMK_ACTION_START, pcmk__str_casei)) {
remote_node_up(cmd->rsc_id);
- } else if (pcmk__str_eq(cmd->action, "migrate_from", pcmk__str_casei)) {
+ } else if (pcmk__str_eq(cmd->action, PCMK_ACTION_MIGRATE_FROM,
+ pcmk__str_casei)) {
/* After a successful migration, we don't need to do remote_node_up()
* because the DC already knows the node is up, and we don't want to
* clear LRM history etc. We do need to add the remote node to this
@@ -408,7 +424,7 @@ check_remote_node_state(const remote_ra_cmd_t *cmd)
CRM_CHECK(node != NULL, return);
pcmk__update_peer_state(__func__, node, CRM_NODE_MEMBER, 0);
- } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) {
+ } else if (pcmk__str_eq(cmd->action, PCMK_ACTION_STOP, pcmk__str_casei)) {
lrm_state_t *lrm_state = lrm_state_find(cmd->rsc_id);
remote_ra_data_t *ra_data = lrm_state? lrm_state->remote_ra_data : NULL;
@@ -510,7 +526,8 @@ retry_start_cmd_cb(gpointer data)
return FALSE;
}
cmd = ra_data->cur_cmd;
- if (!pcmk__strcase_any_of(cmd->action, "start", "migrate_from", NULL)) {
+ if (!pcmk__strcase_any_of(cmd->action, PCMK_ACTION_START,
+ PCMK_ACTION_MIGRATE_FROM, NULL)) {
return FALSE;
}
update_remaining_timeout(cmd);
@@ -681,7 +698,8 @@ remote_lrm_op_callback(lrmd_event_data_t * op)
handle_remote_ra_stop(lrm_state, NULL);
remote_node_down(lrm_state->node_name, DOWN_KEEP_LRM);
/* now fake the reply of a successful 'stop' */
- synthesize_lrmd_success(NULL, lrm_state->node_name, "stop");
+ synthesize_lrmd_success(NULL, lrm_state->node_name,
+ PCMK_ACTION_STOP);
}
return;
}
@@ -695,8 +713,9 @@ remote_lrm_op_callback(lrmd_event_data_t * op)
/* Start actions and migrate from actions complete after connection
* comes back to us. */
- if (op->type == lrmd_event_connect && pcmk__strcase_any_of(cmd->action, "start",
- "migrate_from", NULL)) {
+ if ((op->type == lrmd_event_connect)
+ && pcmk__strcase_any_of(cmd->action, PCMK_ACTION_START,
+ PCMK_ACTION_MIGRATE_FROM, NULL)) {
if (op->connection_rc < 0) {
update_remaining_timeout(cmd);
@@ -731,7 +750,9 @@ remote_lrm_op_callback(lrmd_event_data_t * op)
report_remote_ra_result(cmd);
cmd_handled = TRUE;
- } else if (op->type == lrmd_event_poke && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
+ } else if ((op->type == lrmd_event_poke)
+ && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
+ pcmk__str_casei)) {
if (cmd->monitor_timeout_id) {
g_source_remove(cmd->monitor_timeout_id);
@@ -758,7 +779,9 @@ remote_lrm_op_callback(lrmd_event_data_t * op)
}
cmd_handled = TRUE;
- } else if (op->type == lrmd_event_disconnect && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
+ } else if ((op->type == lrmd_event_disconnect)
+ && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
+ pcmk__str_casei)) {
if (pcmk_is_set(ra_data->status, remote_active) &&
!pcmk_is_set(cmd->status, cmd_cancel)) {
pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
@@ -771,7 +794,9 @@ remote_lrm_op_callback(lrmd_event_data_t * op)
}
cmd_handled = TRUE;
- } else if (op->type == lrmd_event_new_client && pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) {
+ } else if ((op->type == lrmd_event_new_client)
+ && pcmk__str_eq(cmd->action, PCMK_ACTION_STOP,
+ pcmk__str_casei)) {
handle_remote_ra_stop(lrm_state, cmd);
cmd_handled = TRUE;
@@ -882,7 +907,8 @@ handle_remote_ra_exec(gpointer user_data)
ra_data->cmds = g_list_remove_link(ra_data->cmds, first);
g_list_free_1(first);
- if (!strcmp(cmd->action, "start") || !strcmp(cmd->action, "migrate_from")) {
+ if (pcmk__str_any_of(cmd->action, PCMK_ACTION_START,
+ PCMK_ACTION_MIGRATE_FROM, NULL)) {
lrm_remote_clear_flags(lrm_state, expect_takeover | takeover_complete);
if (handle_remote_ra_start(lrm_state, cmd,
cmd->timeout) == pcmk_rc_ok) {
@@ -894,7 +920,7 @@ handle_remote_ra_exec(gpointer user_data)
}
report_remote_ra_result(cmd);
- } else if (!strcmp(cmd->action, "monitor")) {
+ } else if (!strcmp(cmd->action, PCMK_ACTION_MONITOR)) {
if (lrm_state_is_connected(lrm_state) == TRUE) {
rc = lrm_state_poke_connection(lrm_state);
@@ -917,7 +943,7 @@ handle_remote_ra_exec(gpointer user_data)
}
report_remote_ra_result(cmd);
- } else if (!strcmp(cmd->action, "stop")) {
+ } else if (!strcmp(cmd->action, PCMK_ACTION_STOP)) {
if (pcmk_is_set(ra_data->status, expect_takeover)) {
/* briefly wait on stop for the takeover event to occur. If the
@@ -933,13 +959,14 @@ handle_remote_ra_exec(gpointer user_data)
handle_remote_ra_stop(lrm_state, cmd);
- } else if (!strcmp(cmd->action, "migrate_to")) {
+ } else if (strcmp(cmd->action, PCMK_ACTION_MIGRATE_TO) == 0) {
lrm_remote_clear_flags(lrm_state, takeover_complete);
lrm_remote_set_flags(lrm_state, expect_takeover);
pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
report_remote_ra_result(cmd);
- } else if (pcmk__str_any_of(cmd->action, CRMD_ACTION_RELOAD,
- CRMD_ACTION_RELOAD_AGENT, NULL)) {
+
+ } else if (pcmk__str_any_of(cmd->action, PCMK_ACTION_RELOAD,
+ PCMK_ACTION_RELOAD_AGENT, NULL)) {
/* Currently the only reloadable parameter is reconnect_interval,
* which is only used by the scheduler via the CIB, so reloads are a
* no-op.
@@ -1029,13 +1056,13 @@ static gboolean
is_remote_ra_supported_action(const char *action)
{
return pcmk__str_any_of(action,
- CRMD_ACTION_START,
- CRMD_ACTION_STOP,
- CRMD_ACTION_STATUS,
- CRMD_ACTION_MIGRATE,
- CRMD_ACTION_MIGRATED,
- CRMD_ACTION_RELOAD_AGENT,
- CRMD_ACTION_RELOAD,
+ PCMK_ACTION_START,
+ PCMK_ACTION_STOP,
+ PCMK_ACTION_MONITOR,
+ PCMK_ACTION_MIGRATE_TO,
+ PCMK_ACTION_MIGRATE_FROM,
+ PCMK_ACTION_RELOAD_AGENT,
+ PCMK_ACTION_RELOAD,
NULL);
}
@@ -1048,7 +1075,9 @@ fail_all_monitor_cmds(GList * list)
for (gIter = list; gIter != NULL; gIter = gIter->next) {
cmd = gIter->data;
- if ((cmd->interval_ms > 0) && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
+ if ((cmd->interval_ms > 0)
+ && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
+ pcmk__str_casei)) {
rm_list = g_list_append(rm_list, cmd);
}
}
@@ -1137,8 +1166,9 @@ handle_dup_monitor(remote_ra_data_t *ra_data, guint interval_ms,
if (ra_data->cur_cmd &&
!pcmk_is_set(ra_data->cur_cmd->status, cmd_cancel) &&
- (ra_data->cur_cmd->interval_ms == interval_ms) &&
- pcmk__str_eq(ra_data->cur_cmd->action, "monitor", pcmk__str_casei)) {
+ (ra_data->cur_cmd->interval_ms == interval_ms)
+ && pcmk__str_eq(ra_data->cur_cmd->action, PCMK_ACTION_MONITOR,
+ pcmk__str_casei)) {
cmd = ra_data->cur_cmd;
goto handle_dup;
@@ -1147,7 +1177,8 @@ handle_dup_monitor(remote_ra_data_t *ra_data, guint interval_ms,
for (gIter = ra_data->recurring_cmds; gIter != NULL; gIter = gIter->next) {
cmd = gIter->data;
if ((cmd->interval_ms == interval_ms)
- && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
+ && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
+ pcmk__str_casei)) {
goto handle_dup;
}
}
@@ -1155,7 +1186,8 @@ handle_dup_monitor(remote_ra_data_t *ra_data, guint interval_ms,
for (gIter = ra_data->cmds; gIter != NULL; gIter = gIter->next) {
cmd = gIter->data;
if ((cmd->interval_ms == interval_ms)
- && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
+ && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
+ pcmk__str_casei)) {
goto handle_dup;
}
}
@@ -1165,7 +1197,7 @@ handle_dup_monitor(remote_ra_data_t *ra_data, guint interval_ms,
handle_dup:
crm_trace("merging duplicate monitor cmd " PCMK__OP_FMT,
- cmd->rsc_id, "monitor", interval_ms);
+ cmd->rsc_id, PCMK_ACTION_MONITOR, interval_ms);
/* update the userdata */
if (userdata) {
@@ -1385,7 +1417,7 @@ remote_ra_maintenance(lrm_state_t * lrm_state, gboolean maintenance)
}
#define XPATH_PSEUDO_MAINTENANCE "//" XML_GRAPH_TAG_PSEUDO_EVENT \
- "[@" XML_LRM_ATTR_TASK "='" CRM_OP_MAINTENANCE_NODES "']/" \
+ "[@" XML_LRM_ATTR_TASK "='" PCMK_ACTION_MAINTENANCE_NODES "']/" \
XML_GRAPH_TAG_MAINTENANCE
/*!
@@ -1403,9 +1435,10 @@ remote_ra_process_maintenance_nodes(xmlNode *xml)
xmlNode *node;
int cnt = 0, cnt_remote = 0;
- for (node =
- first_named_child(getXpathResult(search, 0), XML_CIB_TAG_NODE);
- node != NULL; node = pcmk__xml_next(node)) {
+ for (node = first_named_child(getXpathResult(search, 0),
+ XML_CIB_TAG_NODE);
+ node != NULL; node = crm_next_same_xml(node)) {
+
lrm_state_t *lrm_state = lrm_state_find(ID(node));
cnt++;
diff --git a/daemons/controld/controld_schedulerd.c b/daemons/controld/controld_schedulerd.c
index 912f9a5..8aca83f 100644
--- a/daemons/controld/controld_schedulerd.c
+++ b/daemons/controld/controld_schedulerd.c
@@ -45,11 +45,11 @@ controld_shutdown_schedulerd_ipc(void)
* \internal
* \brief Save CIB query result to file, raising FSA error
*
- * \param[in] msg Ignored
- * \param[in] call_id Call ID of CIB query
- * \param[in] rc Return code of CIB query
- * \param[in,out] output Result of CIB query
- * \param[in] user_data Unique identifier for filename
+ * \param[in] msg Ignored
+ * \param[in] call_id Call ID of CIB query
+ * \param[in] rc Return code of CIB query
+ * \param[in] output Result of CIB query
+ * \param[in] user_data Unique identifier for filename
*
* \note This is intended to be called after a scheduler connection fails.
*/
@@ -90,8 +90,9 @@ handle_disconnect(void)
int rc = pcmk_ok;
char *uuid_str = crm_generate_uuid();
- crm_crit("Connection to the scheduler failed "
- CRM_XS " uuid=%s", uuid_str);
+ crm_crit("Lost connection to the scheduler "
+ CRM_XS " CIB will be saved to " PE_STATE_DIR "/pe-core-%s.bz2",
+ uuid_str);
/*
* The scheduler died...
@@ -107,9 +108,6 @@ handle_disconnect(void)
NULL, NULL,
cib_scope_local);
fsa_register_cib_callback(rc, uuid_str, save_cib_contents);
-
- } else {
- crm_info("Connection to the scheduler released");
}
controld_clear_fsa_input_flags(R_PE_CONNECTED);
@@ -199,9 +197,10 @@ new_schedulerd_ipc_connection(void)
pcmk_register_ipc_callback(schedulerd_api, scheduler_event_callback, NULL);
- rc = pcmk_connect_ipc(schedulerd_api, pcmk_ipc_dispatch_main);
+ rc = pcmk__connect_ipc(schedulerd_api, pcmk_ipc_dispatch_main, 3);
if (rc != pcmk_rc_ok) {
- crm_err("Error connecting to the scheduler: %s", pcmk_rc_str(rc));
+ crm_err("Error connecting to %s: %s",
+ pcmk_ipc_name(schedulerd_api, true), pcmk_rc_str(rc));
return false;
}
diff --git a/daemons/controld/controld_te_actions.c b/daemons/controld/controld_te_actions.c
index d8cfcad..fe6b744 100644
--- a/daemons/controld/controld_te_actions.c
+++ b/daemons/controld/controld_te_actions.c
@@ -47,7 +47,7 @@ execute_pseudo_action(pcmk__graph_t *graph, pcmk__graph_action_t *pseudo)
const char *task = crm_element_value(pseudo->xml, XML_LRM_ATTR_TASK);
/* send to peers as well? */
- if (pcmk__str_eq(task, CRM_OP_MAINTENANCE_NODES, pcmk__str_casei)) {
+ if (pcmk__str_eq(task, PCMK_ACTION_MAINTENANCE_NODES, pcmk__str_casei)) {
GHashTableIter iter;
crm_node_t *node = NULL;
@@ -125,7 +125,7 @@ execute_cluster_action(pcmk__graph_t *graph, pcmk__graph_action_t *action)
router_node = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE);
if (router_node == NULL) {
router_node = on_node;
- if (pcmk__str_eq(task, CRM_OP_LRM_DELETE, pcmk__str_none)) {
+ if (pcmk__str_eq(task, PCMK_ACTION_LRM_DELETE, pcmk__str_none)) {
const char *mode = crm_element_value(action->xml, PCMK__XA_MODE);
if (pcmk__str_eq(mode, XML_TAG_CIB, pcmk__str_none)) {
@@ -148,7 +148,8 @@ execute_cluster_action(pcmk__graph_t *graph, pcmk__graph_action_t *action)
id, task, on_node, (is_local? " locally" : ""),
(no_wait? " without waiting" : ""));
- if (is_local && pcmk__str_eq(task, CRM_OP_SHUTDOWN, pcmk__str_none)) {
+ if (is_local
+ && pcmk__str_eq(task, PCMK_ACTION_DO_SHUTDOWN, pcmk__str_none)) {
/* defer until everything else completes */
crm_info("Controller request '%s' is a local shutdown", id);
graph->completion_action = pcmk__graph_shutdown;
@@ -156,7 +157,7 @@ execute_cluster_action(pcmk__graph_t *graph, pcmk__graph_action_t *action)
te_action_confirmed(action, graph);
return pcmk_rc_ok;
- } else if (pcmk__str_eq(task, CRM_OP_SHUTDOWN, pcmk__str_none)) {
+ } else if (pcmk__str_eq(task, PCMK_ACTION_DO_SHUTDOWN, pcmk__str_none)) {
crm_node_t *peer = crm_get_peer(0, router_node);
pcmk__update_peer_expected(__func__, peer, CRMD_JOINSTATE_DOWN);
@@ -318,7 +319,7 @@ controld_record_action_timeout(pcmk__graph_action_t *action)
int target_rc = get_target_rc(action);
crm_warn("%s %d: %s on %s timed out",
- crm_element_name(action->xml), action->id, task_uuid, target);
+ action->xml->name, action->id, task_uuid, target);
op = synthesize_timeout_event(action, target_rc);
controld_record_action_event(action, op);
@@ -528,9 +529,9 @@ te_update_job_count(pcmk__graph_action_t *action, int offset)
* the connection resources */
target = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE);
- if ((target == NULL) && pcmk__strcase_any_of(task, CRMD_ACTION_MIGRATE,
- CRMD_ACTION_MIGRATED, NULL)) {
-
+ if ((target == NULL)
+ && pcmk__strcase_any_of(task, PCMK_ACTION_MIGRATE_TO,
+ PCMK_ACTION_MIGRATE_FROM, NULL)) {
const char *t1 = crm_meta_value(action->params, XML_LRM_ATTR_MIGRATE_SOURCE);
const char *t2 = crm_meta_value(action->params, XML_LRM_ATTR_MIGRATE_TARGET);
@@ -586,7 +587,8 @@ allowed_on_node(const pcmk__graph_t *graph, const pcmk__graph_action_t *action,
return false;
} else if(graph->migration_limit > 0 && r->migrate_jobs >= graph->migration_limit) {
- if (pcmk__strcase_any_of(task, CRMD_ACTION_MIGRATE, CRMD_ACTION_MIGRATED, NULL)) {
+ if (pcmk__strcase_any_of(task, PCMK_ACTION_MIGRATE_TO,
+ PCMK_ACTION_MIGRATE_FROM, NULL)) {
crm_trace("Peer %s is over their migration job limit of %d (%d): deferring %s",
target, graph->migration_limit, r->migrate_jobs, id);
return false;
@@ -624,8 +626,9 @@ graph_action_allowed(pcmk__graph_t *graph, pcmk__graph_action_t *action)
* the connection resources */
target = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE);
- if ((target == NULL) && pcmk__strcase_any_of(task, CRMD_ACTION_MIGRATE,
- CRMD_ACTION_MIGRATED, NULL)) {
+ if ((target == NULL)
+ && pcmk__strcase_any_of(task, PCMK_ACTION_MIGRATE_TO,
+ PCMK_ACTION_MIGRATE_FROM, NULL)) {
target = crm_meta_value(action->params, XML_LRM_ATTR_MIGRATE_SOURCE);
if (!allowed_on_node(graph, action, target)) {
return false;
diff --git a/daemons/controld/controld_te_callbacks.c b/daemons/controld/controld_te_callbacks.c
index cf9de83..c26e757 100644
--- a/daemons/controld/controld_te_callbacks.c
+++ b/daemons/controld/controld_te_callbacks.c
@@ -225,12 +225,12 @@ process_resource_updates(const char *node, xmlNode *xml, xmlNode *change,
return;
}
- if (strcmp(TYPE(xml), XML_CIB_TAG_LRM) == 0) {
+ if (pcmk__xe_is(xml, XML_CIB_TAG_LRM)) {
xml = first_named_child(xml, XML_LRM_TAG_RESOURCES);
CRM_CHECK(xml != NULL, return);
}
- CRM_CHECK(strcmp(TYPE(xml), XML_LRM_TAG_RESOURCES) == 0, return);
+ CRM_CHECK(pcmk__xe_is(xml, XML_LRM_TAG_RESOURCES), return);
/*
* Updates by, or in response to, TE actions will never contain updates
@@ -558,7 +558,7 @@ te_update_diff(const char *event, xmlNode * msg)
p_del[0], p_del[1], p_del[2], p_add[0], p_add[1], p_add[2],
fsa_state2string(controld_globals.fsa_state));
- crm_element_value_int(diff, "format", &format);
+ crm_element_value_int(diff, PCMK_XA_FORMAT, &format);
switch (format) {
case 1:
te_update_diff_v1(event, diff);
diff --git a/daemons/controld/controld_te_events.c b/daemons/controld/controld_te_events.c
index d4e2b0f..28977c0 100644
--- a/daemons/controld/controld_te_events.c
+++ b/daemons/controld/controld_te_events.c
@@ -111,7 +111,7 @@ fail_incompletable_actions(pcmk__graph_t *graph, const char *down_node)
} else if (action->type == pcmk__cluster_graph_action) {
const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK);
- if (pcmk__str_eq(task, CRM_OP_FENCE, pcmk__str_casei)) {
+ if (pcmk__str_eq(task, PCMK_ACTION_STONITH, pcmk__str_casei)) {
continue;
}
}
@@ -196,16 +196,16 @@ update_failcount(const xmlNode *event, const char *event_node_uuid, int rc,
/* Decide whether update is necessary and what value to use */
if ((interval_ms > 0)
- || pcmk__str_eq(task, CRMD_ACTION_PROMOTE, pcmk__str_none)
- || pcmk__str_eq(task, CRMD_ACTION_DEMOTE, pcmk__str_none)) {
+ || pcmk__str_eq(task, PCMK_ACTION_PROMOTE, pcmk__str_none)
+ || pcmk__str_eq(task, PCMK_ACTION_DEMOTE, pcmk__str_none)) {
do_update = TRUE;
- } else if (pcmk__str_eq(task, CRMD_ACTION_START, pcmk__str_none)) {
+ } else if (pcmk__str_eq(task, PCMK_ACTION_START, pcmk__str_none)) {
do_update = TRUE;
value = pcmk__s(controld_globals.transition_graph->failed_start_offset,
CRM_INFINITY_S);
- } else if (pcmk__str_eq(task, CRMD_ACTION_STOP, pcmk__str_none)) {
+ } else if (pcmk__str_eq(task, PCMK_ACTION_STOP, pcmk__str_none)) {
do_update = TRUE;
value = pcmk__s(controld_globals.transition_graph->failed_stop_offset,
CRM_INFINITY_S);
@@ -314,7 +314,7 @@ get_cancel_action(const char *id, const char *node)
pcmk__graph_action_t *action = (pcmk__graph_action_t *) gIter2->data;
task = crm_element_value(action->xml, XML_LRM_ATTR_TASK);
- if (!pcmk__str_eq(CRMD_ACTION_CANCEL, task, pcmk__str_casei)) {
+ if (!pcmk__str_eq(PCMK_ACTION_CANCEL, task, pcmk__str_casei)) {
continue;
}
diff --git a/daemons/controld/controld_te_utils.c b/daemons/controld/controld_te_utils.c
index ecbc0b2..5a9f029 100644
--- a/daemons/controld/controld_te_utils.c
+++ b/daemons/controld/controld_te_utils.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2004-2022 the Pacemaker project contributors
+ * Copyright 2004-2023 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -17,6 +17,8 @@
//! Triggers transition graph processing
static crm_trigger_t *transition_trigger = NULL;
+static GHashTable *node_pending_timers = NULL;
+
gboolean
stop_te_timer(pcmk__graph_action_t *action)
{
@@ -132,11 +134,13 @@ static struct abort_timer_s {
static gboolean
abort_timer_popped(gpointer data)
{
- if (AM_I_DC && (abort_timer.aborted == FALSE)) {
- abort_transition(abort_timer.priority, abort_timer.action,
- abort_timer.text, NULL);
+ struct abort_timer_s *abort_timer = (struct abort_timer_s *) data; // timer state handed in by the caller (file-level timer or a per-node one)
+
+ if (AM_I_DC && (abort_timer->aborted == FALSE)) { // abort only when AM_I_DC holds and aborted is still FALSE
+ abort_transition(abort_timer->priority, abort_timer->action,
+ abort_timer->text, NULL);
}
- abort_timer.id = 0;
+ abort_timer->id = 0; // a zero id makes free_node_pending_timer() skip g_source_remove()
return FALSE; // do not immediately reschedule timer
}
@@ -158,7 +162,143 @@ abort_after_delay(int abort_priority, enum pcmk__graph_next abort_action,
abort_timer.priority = abort_priority;
abort_timer.action = abort_action;
abort_timer.text = abort_text;
- abort_timer.id = g_timeout_add(delay_ms, abort_timer_popped, NULL);
+ abort_timer.id = g_timeout_add(delay_ms, abort_timer_popped, &abort_timer);
+}
+
+/* Destroy callback for node_pending_timers entries: cancel, then free */
+static void
+free_node_pending_timer(gpointer data)
+{
+    struct abort_timer_s *timer = data;
+
+    if (timer->id != 0) { // id is reset to 0 once the timer has popped
+        g_source_remove(timer->id);
+        timer->id = 0;
+    }
+    free(timer);
+}
+
+/* Timeout callback: the node whose ID is \p key is still pending */
+static gboolean
+node_pending_timer_popped(gpointer key)
+{
+    struct abort_timer_s *timer = NULL;
+
+    if (node_pending_timers != NULL) {
+        timer = g_hash_table_lookup(node_pending_timers, key);
+    }
+    if (timer == NULL) {
+        return FALSE; // no table, or no entry left for this key
+    }
+
+    crm_warn("Node with id '%s' pending timed out (%us) on joining the process "
+             "group",
+             (const char *) key, controld_globals.node_pending_timeout);
+
+    if (controld_globals.node_pending_timeout > 0) {
+        abort_timer_popped(timer); // may abort the transition; zeroes timer->id
+    }
+
+    // The entry is finished either way, so take it out of the table
+    g_hash_table_remove(node_pending_timers, key);
+
+    return FALSE; // do not reschedule timer
+}
+
+/* Arm a one-shot timer, keyed by node->uuid, that pops after \p timeout
+ * seconds (the notice below logs the configured timeout, not \p timeout). */
+static void
+init_node_pending_timer(const crm_node_t *node, guint timeout)
+{
+    const char *label = (node->uname != NULL)? node->uname : "node";
+    struct abort_timer_s *node_pending_timer = NULL;
+    char *key = NULL;
+
+    if (node->uuid == NULL) {
+        return; // nothing to key the timer on
+    }
+
+    if (node_pending_timers != NULL) {
+        if (g_hash_table_lookup(node_pending_timers, node->uuid) != NULL) {
+            return; // this node already has a timer
+        }
+    } else {
+        node_pending_timers = pcmk__strikey_table(free,
+                                                  free_node_pending_timer);
+    }
+
+    crm_notice("Waiting for pending %s with id '%s' to join the process "
+               "group (timeout=%us)",
+               label, node->uuid, controld_globals.node_pending_timeout);
+
+    node_pending_timer = calloc(1, sizeof(struct abort_timer_s));
+    CRM_ASSERT(node_pending_timer != NULL);
+    node_pending_timer->text = "Node pending timed out";
+    node_pending_timer->action = pcmk__graph_restart;
+    node_pending_timer->priority = INFINITY;
+    node_pending_timer->aborted = FALSE;
+
+    key = strdup(node->uuid);
+    CRM_ASSERT(key != NULL);
+    // The same key pointer is stored in the table and handed to the callback
+    g_hash_table_replace(node_pending_timers, key, node_pending_timer);
+    node_pending_timer->id = g_timeout_add_seconds(timeout,
+                                                   node_pending_timer_popped,
+                                                   key);
+    CRM_ASSERT(node_pending_timer->id != 0);
+}
+
+/* Remove the table entry (if any) stored for \p node_uuid; NULL is ignored */
+static void
+remove_node_pending_timer(const char *node_uuid)
+{
+    // Callers pass node->uuid unchecked; never hash a NULL string key
+    if ((node_pending_timers != NULL) && (node_uuid != NULL)) {
+        g_hash_table_remove(node_pending_timers, node_uuid);
+    }
+}
+
+/* Start or drop the node-pending timer for \p node according to its current
+ * membership state and the configured node-pending-timeout. A timer that
+ * already exists for a still-pending node is left untouched.
+ */
+void
+controld_node_pending_timer(const crm_node_t *node)
+{
+    long long seconds_left = 0;
+
+    /* A timer is only wanted for an active cluster node (not a remote node,
+     * and not one that is leaving) that is not yet part of CPG, and only
+     * while node-pending-timeout is enabled.
+     */
+    if (!pcmk_is_set(node->flags, crm_remote_node)
+        && (node->when_member > 1) && (node->when_online <= 0)
+        && (controld_globals.node_pending_timeout != 0)) {
+
+        // Seconds until when_member + node-pending-timeout is reached
+        seconds_left = node->when_member - time(NULL)
+                       + controld_globals.node_pending_timeout;
+    }
+
+    // Not pending, or the deadline has already passed: free any timer
+    if (seconds_left <= 0) {
+        remove_node_pending_timer(node->uuid);
+        return;
+    }
+
+    // Still pending: arm a timer unless this node already has one
+    init_node_pending_timer(node, seconds_left);
+}
+
+/* Destroy the node-pending timer table, if one was ever created, and reset
+ * the file-level pointer so a later call finds nothing to do. */
+void
+controld_free_node_pending_timers(void)
+{
+    if (node_pending_timers != NULL) {
+        g_hash_table_destroy(node_pending_timers);
+        node_pending_timers = NULL;
+    }
}
static const char *
@@ -246,7 +386,7 @@ abort_transition_graph(int abort_priority, enum pcmk__graph_next abort_action,
const xmlNode *search = NULL;
for(search = reason; search; search = search->parent) {
- if (pcmk__str_eq(XML_TAG_DIFF, TYPE(search), pcmk__str_casei)) {
+ if (pcmk__xe_is(search, XML_TAG_DIFF)) {
diff = search;
break;
}
@@ -255,7 +395,7 @@ abort_transition_graph(int abort_priority, enum pcmk__graph_next abort_action,
if(diff) {
xml_patch_versions(diff, add, del);
for(search = reason; search; search = search->parent) {
- if (pcmk__str_eq(XML_DIFF_CHANGE, TYPE(search), pcmk__str_casei)) {
+ if (pcmk__xe_is(search, XML_DIFF_CHANGE)) {
change = search;
break;
}
@@ -276,14 +416,13 @@ abort_transition_graph(int abort_priority, enum pcmk__graph_next abort_action,
do_crm_log(level, "Transition %d aborted by %s.%s: %s "
CRM_XS " cib=%d.%d.%d source=%s:%d path=%s complete=%s",
- controld_globals.transition_graph->id, TYPE(reason),
+ controld_globals.transition_graph->id, reason->name,
ID(reason), abort_text, add[0], add[1], add[2], fn, line,
(const char *) local_path->str,
pcmk__btoa(controld_globals.transition_graph->complete));
g_string_free(local_path, TRUE);
} else {
- const char *kind = NULL;
const char *op = crm_element_value(change, XML_DIFF_OP);
const char *path = crm_element_value(change, XML_DIFF_PATH);
@@ -297,9 +436,9 @@ abort_transition_graph(int abort_priority, enum pcmk__graph_next abort_action,
reason = reason->children;
}
}
+ CRM_CHECK(reason != NULL, goto done);
}
- kind = TYPE(reason);
if(strcmp(op, "delete") == 0) {
const char *shortpath = strrchr(path, '/');
@@ -310,7 +449,7 @@ abort_transition_graph(int abort_priority, enum pcmk__graph_next abort_action,
add[0], add[1], add[2], fn, line, path,
pcmk__btoa(controld_globals.transition_graph->complete));
- } else if (pcmk__str_eq(XML_CIB_TAG_NVPAIR, kind, pcmk__str_none)) {
+ } else if (pcmk__xe_is(reason, XML_CIB_TAG_NVPAIR)) {
do_crm_log(level, "Transition %d aborted by %s doing %s %s=%s: %s "
CRM_XS " cib=%d.%d.%d source=%s:%d path=%s complete=%s",
controld_globals.transition_graph->id,
@@ -320,7 +459,7 @@ abort_transition_graph(int abort_priority, enum pcmk__graph_next abort_action,
abort_text, add[0], add[1], add[2], fn, line, path,
pcmk__btoa(controld_globals.transition_graph->complete));
- } else if (pcmk__str_eq(XML_LRM_TAG_RSC_OP, kind, pcmk__str_none)) {
+ } else if (pcmk__xe_is(reason, XML_LRM_TAG_RSC_OP)) {
const char *magic = crm_element_value(reason, XML_ATTR_TRANSITION_MAGIC);
do_crm_log(level, "Transition %d aborted by operation %s '%s' on %s: %s "
@@ -331,14 +470,15 @@ abort_transition_graph(int abort_priority, enum pcmk__graph_next abort_action,
magic, add[0], add[1], add[2], fn, line,
pcmk__btoa(controld_globals.transition_graph->complete));
- } else if (pcmk__str_any_of(kind, XML_CIB_TAG_STATE, XML_CIB_TAG_NODE, NULL)) {
+ } else if (pcmk__str_any_of((const char *) reason->name,
+ XML_CIB_TAG_STATE, XML_CIB_TAG_NODE, NULL)) {
const char *uname = crm_peer_uname(ID(reason));
do_crm_log(level, "Transition %d aborted by %s '%s' on %s: %s "
CRM_XS " cib=%d.%d.%d source=%s:%d complete=%s",
controld_globals.transition_graph->id,
- kind, op, (uname? uname : ID(reason)), abort_text,
- add[0], add[1], add[2], fn, line,
+ reason->name, op, pcmk__s(uname, ID(reason)),
+ abort_text, add[0], add[1], add[2], fn, line,
pcmk__btoa(controld_globals.transition_graph->complete));
} else {
@@ -347,12 +487,13 @@ abort_transition_graph(int abort_priority, enum pcmk__graph_next abort_action,
do_crm_log(level, "Transition %d aborted by %s.%s '%s': %s "
CRM_XS " cib=%d.%d.%d source=%s:%d path=%s complete=%s",
controld_globals.transition_graph->id,
- TYPE(reason), (id? id : ""), (op? op : "change"),
+ reason->name, pcmk__s(id, ""), pcmk__s(op, "change"),
abort_text, add[0], add[1], add[2], fn, line, path,
pcmk__btoa(controld_globals.transition_graph->complete));
}
}
+done:
if (controld_globals.transition_graph->complete) {
if (controld_get_period_transition_timer() > 0) {
controld_stop_transition_timer();
diff --git a/daemons/controld/controld_throttle.c b/daemons/controld/controld_throttle.c
index 5b7f9c0..a4775e5 100644
--- a/daemons/controld/controld_throttle.c
+++ b/daemons/controld/controld_throttle.c
@@ -154,7 +154,7 @@ throttle_cib_load(float *load)
if(stream == NULL) {
int rc = errno;
- crm_warn("Couldn't read %s: %s (%d)", loadfile, pcmk_strerror(rc), rc);
+ crm_warn("Couldn't read %s: %s (%d)", loadfile, pcmk_rc_str(rc), rc);
free(loadfile); loadfile = NULL;
return FALSE;
}
@@ -220,7 +220,7 @@ throttle_load_avg(float *load)
stream = fopen(loadfile, "r");
if(stream == NULL) {
int rc = errno;
- crm_warn("Couldn't read %s: %s (%d)", loadfile, pcmk_strerror(rc), rc);
+ crm_warn("Couldn't read %s: %s (%d)", loadfile, pcmk_rc_str(rc), rc);
return FALSE;
}
@@ -407,7 +407,7 @@ static void
throttle_update_job_max(const char *preference)
{
long long max = 0LL;
- const char *env_limit = getenv("PCMK_node_action_limit");
+ const char *env_limit = pcmk__env_option(PCMK__ENV_NODE_ACTION_LIMIT);
if (env_limit != NULL) {
preference = env_limit; // Per-node override
diff --git a/daemons/controld/controld_transition.c b/daemons/controld/controld_transition.c
index c8a342c..897c6d3 100644
--- a/daemons/controld/controld_transition.c
+++ b/daemons/controld/controld_transition.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2004-2022 the Pacemaker project contributors
+ * Copyright 2004-2023 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -15,11 +15,6 @@
#include <pacemaker-controld.h>
-static void
-global_cib_callback(const xmlNode * msg, int callid, int rc, xmlNode * output)
-{
-}
-
static pcmk__graph_t *
create_blank_graph(void)
{
@@ -82,12 +77,6 @@ do_te_control(long long action,
crm_err("Could not set CIB notification callback");
init_ok = FALSE;
}
-
- if (cib_conn->cmds->set_op_callback(cib_conn,
- global_cib_callback) != pcmk_ok) {
- crm_err("Could not set CIB global callback");
- init_ok = FALSE;
- }
}
if (init_ok) {
diff --git a/daemons/controld/controld_transition.h b/daemons/controld/controld_transition.h
index 2da4221..0655bd9 100644
--- a/daemons/controld/controld_transition.h
+++ b/daemons/controld/controld_transition.h
@@ -48,6 +48,8 @@ void controld_destroy_transition_trigger(void);
void controld_trigger_graph_as(const char *fn, int line);
void abort_after_delay(int abort_priority, enum pcmk__graph_next abort_action,
const char *abort_text, guint delay_ms);
+void controld_node_pending_timer(const crm_node_t *node);
+void controld_free_node_pending_timers(void);
void abort_transition_graph(int abort_priority,
enum pcmk__graph_next abort_action,
const char *abort_text, const xmlNode *reason,
diff --git a/daemons/controld/controld_utils.c b/daemons/controld/controld_utils.c
index 4ce09d9..9b306ee 100644
--- a/daemons/controld/controld_utils.c
+++ b/daemons/controld/controld_utils.c
@@ -828,7 +828,7 @@ get_node_id(xmlNode *lrm_rsc_op)
{
xmlNode *node = lrm_rsc_op;
- while (node != NULL && !pcmk__str_eq(XML_CIB_TAG_STATE, TYPE(node), pcmk__str_casei)) {
+ while ((node != NULL) && !pcmk__xe_is(node, XML_CIB_TAG_STATE)) {
node = node->parent;
}
diff --git a/daemons/controld/pacemaker-controld.c b/daemons/controld/pacemaker-controld.c
index 5858898..e4a72c2 100644
--- a/daemons/controld/pacemaker-controld.c
+++ b/daemons/controld/pacemaker-controld.c
@@ -112,7 +112,7 @@ main(int argc, char **argv)
goto done;
}
- if (crm_ipc_connect(old_instance)) {
+ if (pcmk__connect_generic_ipc(old_instance) == pcmk_rc_ok) {
/* IPC end-point already up */
crm_ipc_close(old_instance);
crm_ipc_destroy(old_instance);
diff --git a/daemons/controld/pacemaker-controld.h b/daemons/controld/pacemaker-controld.h
index 1484a00..2334cce 100644
--- a/daemons/controld/pacemaker-controld.h
+++ b/daemons/controld/pacemaker-controld.h
@@ -36,4 +36,7 @@ void controld_remove_voter(const char *uname);
void controld_election_fini(void);
void controld_stop_current_election_timeout(void);
+void set_join_state(const char *start_state, const char *node_name,
+ const char *node_uuid, bool remote);
+
#endif