Diffstat (limited to 'daemons/controld/controld_control.c')
-rw-r--r--  daemons/controld/controld_control.c | 287
1 file changed, 56 insertions(+), 231 deletions(-)
diff --git a/daemons/controld/controld_control.c b/daemons/controld/controld_control.c
index 644d686..368659b 100644
--- a/daemons/controld/controld_control.c
+++ b/daemons/controld/controld_control.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2004-2023 the Pacemaker project contributors
+ * Copyright 2004-2024 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -14,7 +14,7 @@
#include <sys/stat.h>
#include <crm/crm.h>
-#include <crm/msg_xml.h>
+#include <crm/common/xml.h>
#include <crm/pengine/rules.h>
#include <crm/cluster/internal.h>
#include <crm/cluster/election_internal.h>
@@ -27,10 +27,10 @@ static qb_ipcs_service_t *ipcs = NULL;
static crm_trigger_t *config_read_trigger = NULL;
#if SUPPORT_COROSYNC
-extern gboolean crm_connect_corosync(crm_cluster_t * cluster);
+extern gboolean crm_connect_corosync(pcmk_cluster_t *cluster);
#endif
-void crm_shutdown(int nsig);
+static void crm_shutdown(int nsig);
static gboolean crm_read_options(gpointer user_data);
/* A_HA_CONNECT */
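The handler loses external linkage here because it is registered only from this file. A minimal sketch of that registration, assuming the usual Pacemaker mainloop signal API (the registration itself is not part of this hunk):

    /* mainloop_add_signal() takes a plain void (*)(int), so the
     * now-static crm_shutdown() can be wired up without being exported. */
    mainloop_add_signal(SIGTERM, crm_shutdown);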
@@ -41,25 +41,25 @@ do_ha_control(long long action,
enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
gboolean registered = FALSE;
- static crm_cluster_t *cluster = NULL;
+ static pcmk_cluster_t *cluster = NULL;
if (cluster == NULL) {
cluster = pcmk_cluster_new();
}
if (action & A_HA_DISCONNECT) {
- crm_cluster_disconnect(cluster);
+ pcmk_cluster_disconnect(cluster);
crm_info("Disconnected from the cluster");
controld_set_fsa_input_flags(R_HA_DISCONNECTED);
}
if (action & A_HA_CONNECT) {
- crm_set_status_callback(&peer_update_callback);
- crm_set_autoreap(FALSE);
+ pcmk__cluster_set_status_callback(&peer_update_callback);
+ pcmk__cluster_set_autoreap(false);
#if SUPPORT_COROSYNC
- if (is_corosync_cluster()) {
+ if (pcmk_get_cluster_layer() == pcmk_cluster_layer_corosync) {
registered = crm_connect_corosync(cluster);
}
#endif // SUPPORT_COROSYNC
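This hunk is part of the wider libcrmcluster rename; as a quick reference, the mapping it applies (taken directly from the changes above):

    /* crm_cluster_t             -> pcmk_cluster_t
     * crm_cluster_disconnect()  -> pcmk_cluster_disconnect()
     * crm_set_status_callback() -> pcmk__cluster_set_status_callback()
     * crm_set_autoreap(FALSE)   -> pcmk__cluster_set_autoreap(false)
     * is_corosync_cluster()     -> pcmk_get_cluster_layer()
     *                              == pcmk_cluster_layer_corosync
     */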
@@ -117,7 +117,7 @@ do_shutdown_req(long long action,
pcmk__s(controld_globals.dc_name, "not set"));
msg = create_request(CRM_OP_SHUTDOWN_REQ, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL);
- if (send_cluster_message(NULL, crm_msg_crmd, msg, TRUE) == FALSE) {
+ if (!pcmk__cluster_send_message(NULL, crm_msg_crmd, msg)) {
register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
}
free_xml(msg);
@@ -241,7 +241,7 @@ crmd_exit(crm_exit_t exit_code)
controld_destroy_transition_trigger();
pcmk__client_cleanup();
- crm_peer_destroy();
+ pcmk__cluster_destroy_node_caches();
controld_free_fsa_timers();
te_cleanup_stonith_history_sync(NULL, TRUE);
@@ -365,7 +365,7 @@ accept_controller_client(qb_ipcs_connection_t *c, uid_t uid, gid_t gid)
{
crm_trace("Accepting new IPC client connection");
if (pcmk__new_client(c, uid, gid) == NULL) {
- return -EIO;
+ return -ENOMEM;
}
return 0;
}
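The errno change reflects what failure means here: pcmk__new_client() returns NULL when the client structure cannot be allocated. A sketch of the libqb contract this relies on, with accept_cb as a hypothetical handler name:

    /* A qb_ipcs connection_accept callback returns 0 to accept the
     * connection or a negative errno to reject it, so an allocation
     * failure maps more precisely to -ENOMEM than to the generic -EIO. */
    static int32_t
    accept_cb(qb_ipcs_connection_t *c, uid_t uid, gid_t gid)
    {
        return (pcmk__new_client(c, uid, gid) == NULL)? -ENOMEM : 0;
    }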
@@ -381,15 +381,17 @@ dispatch_controller_ipc(qb_ipcs_connection_t * c, void *data, size_t size)
xmlNode *msg = pcmk__client_data2xml(client, data, &id, &flags);
if (msg == NULL) {
- pcmk__ipc_send_ack(client, id, flags, "ack", NULL, CRM_EX_PROTOCOL);
+ pcmk__ipc_send_ack(client, id, flags, PCMK__XE_ACK, NULL,
+ CRM_EX_PROTOCOL);
return 0;
}
- pcmk__ipc_send_ack(client, id, flags, "ack", NULL, CRM_EX_INDETERMINATE);
+ pcmk__ipc_send_ack(client, id, flags, PCMK__XE_ACK, NULL,
+ CRM_EX_INDETERMINATE);
CRM_ASSERT(client->user != NULL);
- pcmk__update_acl_user(msg, F_CRM_USER, client->user);
+ pcmk__update_acl_user(msg, PCMK__XA_CRM_USER, client->user);
- crm_xml_add(msg, F_CRM_SYS_FROM, client->id);
+ crm_xml_add(msg, PCMK__XA_CRM_SYS_FROM, client->id);
if (controld_authorize_ipc_message(msg, client, NULL)) {
crm_trace("Processing IPC message from client %s",
pcmk__client_name(client));
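The string literals become shared constants so element and attribute names stay consistent across daemons; the wire format should be unchanged, assuming the constants expand to the old literals:

    /* Assumed expansions (mirroring the replaced literals):
     *   PCMK__XE_ACK          -> "ack"
     *   PCMK__XA_CRM_USER     -> "crm_user"     (was F_CRM_USER)
     *   PCMK__XA_CRM_SYS_FROM -> "crm_sys_from" (was F_CRM_SYS_FROM)
     */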
@@ -515,194 +517,6 @@ do_recover(long long action,
register_fsa_input(C_FSA_INTERNAL, I_TERMINATE, NULL);
}
-static pcmk__cluster_option_t controller_options[] = {
- /* name, old name, type, allowed values,
- * default value, validator,
- * short description,
- * long description
- */
- {
- "dc-version", NULL, "string", NULL, PCMK__VALUE_NONE, NULL,
- N_("Pacemaker version on cluster node elected Designated Controller (DC)"),
- N_("Includes a hash which identifies the exact changeset the code was "
- "built from. Used for diagnostic purposes.")
- },
- {
- "cluster-infrastructure", NULL, "string", NULL, "corosync", NULL,
- N_("The messaging stack on which Pacemaker is currently running"),
- N_("Used for informational and diagnostic purposes.")
- },
- {
- "cluster-name", NULL, "string", NULL, NULL, NULL,
- N_("An arbitrary name for the cluster"),
- N_("This optional value is mostly for users' convenience as desired "
- "in administration, but may also be used in Pacemaker "
- "configuration rules via the #cluster-name node attribute, and "
- "by higher-level tools and resource agents.")
- },
- {
- XML_CONFIG_ATTR_DC_DEADTIME, NULL, "time",
- NULL, "20s", pcmk__valid_interval_spec,
- N_("How long to wait for a response from other nodes during start-up"),
- N_("The optimal value will depend on the speed and load of your network "
- "and the type of switches used.")
- },
- {
- XML_CONFIG_ATTR_RECHECK, NULL, "time",
- N_("Zero disables polling, while positive values are an interval in seconds"
- "(unless other units are specified, for example \"5min\")"),
- "15min", pcmk__valid_interval_spec,
- N_("Polling interval to recheck cluster state and evaluate rules "
- "with date specifications"),
- N_("Pacemaker is primarily event-driven, and looks ahead to know when to "
- "recheck cluster state for failure timeouts and most time-based "
- "rules. However, it will also recheck the cluster after this "
- "amount of inactivity, to evaluate rules with date specifications "
- "and serve as a fail-safe for certain types of scheduler bugs.")
- },
- {
- "load-threshold", NULL, "percentage", NULL,
- "80%", pcmk__valid_percentage,
- N_("Maximum amount of system load that should be used by cluster nodes"),
- N_("The cluster will slow down its recovery process when the amount of "
- "system resources used (currently CPU) approaches this limit"),
- },
- {
- "node-action-limit", NULL, "integer", NULL,
- "0", pcmk__valid_number,
- N_("Maximum number of jobs that can be scheduled per node "
- "(defaults to 2x cores)")
- },
- { XML_CONFIG_ATTR_FENCE_REACTION, NULL, "string", NULL, "stop", NULL,
- N_("How a cluster node should react if notified of its own fencing"),
- N_("A cluster node may receive notification of its own fencing if fencing "
- "is misconfigured, or if fabric fencing is in use that doesn't cut "
- "cluster communication. Allowed values are \"stop\" to attempt to "
- "immediately stop Pacemaker and stay stopped, or \"panic\" to attempt "
- "to immediately reboot the local node, falling back to stop on failure.")
- },
- {
- XML_CONFIG_ATTR_ELECTION_FAIL, NULL, "time", NULL,
- "2min", pcmk__valid_interval_spec,
- "*** Advanced Use Only ***",
- N_("Declare an election failed if it is not decided within this much "
- "time. If you need to adjust this value, it probably indicates "
- "the presence of a bug.")
- },
- {
- XML_CONFIG_ATTR_FORCE_QUIT, NULL, "time", NULL,
- "20min", pcmk__valid_interval_spec,
- "*** Advanced Use Only ***",
- N_("Exit immediately if shutdown does not complete within this much "
- "time. If you need to adjust this value, it probably indicates "
- "the presence of a bug.")
- },
- {
- "join-integration-timeout", "crmd-integration-timeout", "time", NULL,
- "3min", pcmk__valid_interval_spec,
- "*** Advanced Use Only ***",
- N_("If you need to adjust this value, it probably indicates "
- "the presence of a bug.")
- },
- {
- "join-finalization-timeout", "crmd-finalization-timeout", "time", NULL,
- "30min", pcmk__valid_interval_spec,
- "*** Advanced Use Only ***",
- N_("If you need to adjust this value, it probably indicates "
- "the presence of a bug.")
- },
- {
- "transition-delay", "crmd-transition-delay", "time", NULL,
- "0s", pcmk__valid_interval_spec,
- N_("*** Advanced Use Only *** Enabling this option will slow down "
- "cluster recovery under all conditions"),
- N_("Delay cluster recovery for this much time to allow for additional "
- "events to occur. Useful if your configuration is sensitive to "
- "the order in which ping updates arrive.")
- },
- {
- "stonith-watchdog-timeout", NULL, "time", NULL,
- "0", controld_verify_stonith_watchdog_timeout,
- N_("How long before nodes can be assumed to be safely down when "
- "watchdog-based self-fencing via SBD is in use"),
- N_("If this is set to a positive value, lost nodes are assumed to "
- "self-fence using watchdog-based SBD within this much time. This "
- "does not require a fencing resource to be explicitly configured, "
- "though a fence_watchdog resource can be configured, to limit use "
- "to specific nodes. If this is set to 0 (the default), the cluster "
- "will never assume watchdog-based self-fencing. If this is set to a "
- "negative value, the cluster will use twice the local value of the "
- "`SBD_WATCHDOG_TIMEOUT` environment variable if that is positive, "
- "or otherwise treat this as 0. WARNING: When used, this timeout "
- "must be larger than `SBD_WATCHDOG_TIMEOUT` on all nodes that use "
- "watchdog-based SBD, and Pacemaker will refuse to start on any of "
- "those nodes where this is not true for the local value or SBD is "
- "not active. When this is set to a negative value, "
- "`SBD_WATCHDOG_TIMEOUT` must be set to the same value on all nodes "
- "that use SBD, otherwise data corruption or loss could occur.")
- },
- {
- "stonith-max-attempts", NULL, "integer", NULL,
- "10", pcmk__valid_positive_number,
- N_("How many times fencing can fail before it will no longer be "
- "immediately re-attempted on a target")
- },
-
- // Already documented in libpe_status (other values must be kept identical)
- {
- "no-quorum-policy", NULL, "select",
- "stop, freeze, ignore, demote, suicide", "stop", pcmk__valid_quorum,
- N_("What to do when the cluster does not have quorum"), NULL
- },
- {
- XML_CONFIG_ATTR_SHUTDOWN_LOCK, NULL, "boolean", NULL,
- "false", pcmk__valid_boolean,
- N_("Whether to lock resources to a cleanly shut down node"),
- N_("When true, resources active on a node when it is cleanly shut down "
- "are kept \"locked\" to that node (not allowed to run elsewhere) "
- "until they start again on that node after it rejoins (or for at "
- "most shutdown-lock-limit, if set). Stonith resources and "
- "Pacemaker Remote connections are never locked. Clone and bundle "
- "instances and the promoted role of promotable clones are "
- "currently never locked, though support could be added in a future "
- "release.")
- },
- {
- XML_CONFIG_ATTR_SHUTDOWN_LOCK_LIMIT, NULL, "time", NULL,
- "0", pcmk__valid_interval_spec,
- N_("Do not lock resources to a cleanly shut down node longer than "
- "this"),
- N_("If shutdown-lock is true and this is set to a nonzero time "
- "duration, shutdown locks will expire after this much time has "
- "passed since the shutdown was initiated, even if the node has not "
- "rejoined.")
- },
- {
- XML_CONFIG_ATTR_NODE_PENDING_TIMEOUT, NULL, "time", NULL,
- "0", pcmk__valid_interval_spec,
- N_("How long to wait for a node that has joined the cluster to join "
- "the controller process group"),
- N_("Fence nodes that do not join the controller process group within "
- "this much time after joining the cluster, to allow the cluster "
- "to continue managing resources. A value of 0 means never fence "
- "pending nodes. Setting the value to 2h means fence nodes after "
- "2 hours.")
- },
-};
-
-void
-crmd_metadata(void)
-{
- const char *desc_short = "Pacemaker controller options";
- const char *desc_long = "Cluster options used by Pacemaker's controller";
-
- gchar *s = pcmk__format_option_metadata("pacemaker-controld", desc_short,
- desc_long, controller_options,
- PCMK__NELEM(controller_options));
- printf("%s", s);
- g_free(s);
-}
-
static void
config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
{
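The roughly 190 deleted lines drop the controller's private option table and its crmd_metadata() generator; option definitions now live in a single shared table in libcrmcommon. A sketch of the consolidated pattern, assuming the one-argument pcmk__validate_cluster_options() seen later in this patch:

    GHashTable *config_hash = pcmk__strkey_table(free, free);

    /* ... unpack the cluster_property_set nvpairs into config_hash ... */

    /* One call validates every option against the shared table and
     * fills in defaults for anything not already present. */
    pcmk__validate_cluster_options(config_hash);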
@@ -726,49 +540,62 @@ config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void
}
crmconfig = output;
- if ((crmconfig != NULL)
- && !pcmk__xe_is(crmconfig, XML_CIB_TAG_CRMCONFIG)) {
- crmconfig = first_named_child(crmconfig, XML_CIB_TAG_CRMCONFIG);
+ if ((crmconfig != NULL) && !pcmk__xe_is(crmconfig, PCMK_XE_CRM_CONFIG)) {
+ crmconfig = pcmk__xe_first_child(crmconfig, PCMK_XE_CRM_CONFIG, NULL,
+ NULL);
}
if (!crmconfig) {
fsa_data_t *msg_data = NULL;
- crm_err("Local CIB query for " XML_CIB_TAG_CRMCONFIG " section failed");
+ crm_err("Local CIB query for " PCMK_XE_CRM_CONFIG " section failed");
register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
goto bail;
}
crm_debug("Call %d : Parsing CIB options", call_id);
config_hash = pcmk__strkey_table(free, free);
- pe_unpack_nvpairs(crmconfig, crmconfig, XML_CIB_TAG_PROPSET, NULL,
- config_hash, CIB_OPTIONS_FIRST, FALSE, now, NULL);
+ pe_unpack_nvpairs(crmconfig, crmconfig, PCMK_XE_CLUSTER_PROPERTY_SET, NULL,
+ config_hash, PCMK_VALUE_CIB_BOOTSTRAP_OPTIONS, FALSE, now,
+ NULL);
// Validate all options, and use defaults if not already present in hash
- pcmk__validate_cluster_options(config_hash, controller_options,
- PCMK__NELEM(controller_options));
+ pcmk__validate_cluster_options(config_hash);
- value = g_hash_table_lookup(config_hash, "no-quorum-policy");
- if (pcmk__str_eq(value, "suicide", pcmk__str_casei) && pcmk__locate_sbd()) {
+ /* Validate the watchdog timeout in the context of the local node
+ * environment. If invalid, the controller will exit with a fatal error.
+ *
+ * We do this via a wrapper in the controller, so that we call
+ * pcmk__valid_stonith_watchdog_timeout() only if watchdog fencing is
+ * enabled for the local node. Otherwise, we may exit unnecessarily.
+ *
+ * A validator function in libcrmcommon can't act as such a wrapper, because
+ * it doesn't have a stonith API connection or the local node name.
+ */
+ value = g_hash_table_lookup(config_hash, PCMK_OPT_STONITH_WATCHDOG_TIMEOUT);
+ controld_verify_stonith_watchdog_timeout(value);
+
+ value = g_hash_table_lookup(config_hash, PCMK_OPT_NO_QUORUM_POLICY);
+ if (pcmk__str_eq(value, PCMK_VALUE_FENCE_LEGACY, pcmk__str_casei)
+ && (pcmk__locate_sbd() != 0)) {
controld_set_global_flags(controld_no_quorum_suicide);
}
- value = g_hash_table_lookup(config_hash, XML_CONFIG_ATTR_SHUTDOWN_LOCK);
+ value = g_hash_table_lookup(config_hash, PCMK_OPT_SHUTDOWN_LOCK);
if (crm_is_true(value)) {
controld_set_global_flags(controld_shutdown_lock_enabled);
} else {
controld_clear_global_flags(controld_shutdown_lock_enabled);
}
- value = g_hash_table_lookup(config_hash,
- XML_CONFIG_ATTR_SHUTDOWN_LOCK_LIMIT);
- controld_globals.shutdown_lock_limit = crm_parse_interval_spec(value)
- / 1000;
+ value = g_hash_table_lookup(config_hash, PCMK_OPT_SHUTDOWN_LOCK_LIMIT);
+ pcmk_parse_interval_spec(value, &controld_globals.shutdown_lock_limit);
+ controld_globals.shutdown_lock_limit /= 1000;
- value = g_hash_table_lookup(config_hash,
- XML_CONFIG_ATTR_NODE_PENDING_TIMEOUT);
- controld_globals.node_pending_timeout = crm_parse_interval_spec(value) / 1000;
+ value = g_hash_table_lookup(config_hash, PCMK_OPT_NODE_PENDING_TIMEOUT);
+ pcmk_parse_interval_spec(value, &controld_globals.node_pending_timeout);
+ controld_globals.node_pending_timeout /= 1000;
- value = g_hash_table_lookup(config_hash, "cluster-name");
+ value = g_hash_table_lookup(config_hash, PCMK_OPT_CLUSTER_NAME);
pcmk__str_update(&(controld_globals.cluster_name), value);
// Let subcomponents initialize their own static variables
@@ -777,7 +604,7 @@ config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void
controld_configure_fsa_timers(config_hash);
controld_configure_throttle(config_hash);
- alerts = first_named_child(output, XML_CIB_TAG_ALERTS);
+ alerts = pcmk__xe_first_child(output, PCMK_XE_ALERTS, NULL, NULL);
crmd_unpack_alerts(alerts);
controld_set_fsa_input_flags(R_READ_CONFIG);
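Note the interval-parsing change running through this hunk: crm_parse_interval_spec() returned milliseconds directly, while pcmk_parse_interval_spec() reports success through a return code and writes the result via an out-parameter. A sketch, assuming the public signature int pcmk_parse_interval_spec(const char *input, guint *result_ms):

    guint ms = 0U;

    /* Unit suffixes such as "2h" are accepted; the result is written in
     * milliseconds, hence the division by 1000 above to get seconds. */
    if (pcmk_parse_interval_spec("2h", &ms) == pcmk_rc_ok) {
        crm_debug("2h is %u ms (%u s)", ms, ms / 1000);
    }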
@@ -809,8 +636,8 @@ crm_read_options(gpointer user_data)
{
cib_t *cib_conn = controld_globals.cib_conn;
int call_id = cib_conn->cmds->query(cib_conn,
- "//" XML_CIB_TAG_CRMCONFIG
- " | //" XML_CIB_TAG_ALERTS,
+ "//" PCMK_XE_CRM_CONFIG
+ " | //" PCMK_XE_ALERTS,
NULL, cib_xpath|cib_scope_local);
fsa_register_cib_callback(call_id, NULL, config_query_callback);
@@ -829,7 +656,7 @@ do_read_config(long long action,
controld_trigger_config();
}
-void
+static void
crm_shutdown(int nsig)
{
const char *value = NULL;
@@ -856,9 +683,7 @@ crm_shutdown(int nsig)
* config_query_callback() has been run at least once, it doesn't look like
* anything could have changed the timer period since then.
*/
- value = pcmk__cluster_option(NULL, controller_options,
- PCMK__NELEM(controller_options),
- XML_CONFIG_ATTR_FORCE_QUIT);
- default_period_ms = crm_parse_interval_spec(value);
+ value = pcmk__cluster_option(NULL, PCMK_OPT_SHUTDOWN_ESCALATION);
+ pcmk_parse_interval_spec(value, &default_period_ms);
controld_shutdown_start_countdown(default_period_ms);
}
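With the per-daemon tables gone, pcmk__cluster_option() no longer takes a table pointer and element count. A sketch, assuming the reduced signature const char *pcmk__cluster_option(GHashTable *options, const char *name):

    guint default_period_ms = 0U;

    /* With a NULL table, the shared option table's default is returned;
     * here that is the shutdown-escalation interval. */
    const char *value = pcmk__cluster_option(NULL, PCMK_OPT_SHUTDOWN_ESCALATION);

    pcmk_parse_interval_spec(value, &default_period_ms);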