diff options
Diffstat (limited to 'daemons/controld/controld_control.c')
-rw-r--r-- | daemons/controld/controld_control.c | 287 |
1 files changed, 56 insertions, 231 deletions
diff --git a/daemons/controld/controld_control.c b/daemons/controld/controld_control.c index 644d686..368659b 100644 --- a/daemons/controld/controld_control.c +++ b/daemons/controld/controld_control.c @@ -1,5 +1,5 @@ /* - * Copyright 2004-2023 the Pacemaker project contributors + * Copyright 2004-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * @@ -14,7 +14,7 @@ #include <sys/stat.h> #include <crm/crm.h> -#include <crm/msg_xml.h> +#include <crm/common/xml.h> #include <crm/pengine/rules.h> #include <crm/cluster/internal.h> #include <crm/cluster/election_internal.h> @@ -27,10 +27,10 @@ static qb_ipcs_service_t *ipcs = NULL; static crm_trigger_t *config_read_trigger = NULL; #if SUPPORT_COROSYNC -extern gboolean crm_connect_corosync(crm_cluster_t * cluster); +extern gboolean crm_connect_corosync(pcmk_cluster_t *cluster); #endif -void crm_shutdown(int nsig); +static void crm_shutdown(int nsig); static gboolean crm_read_options(gpointer user_data); /* A_HA_CONNECT */ @@ -41,25 +41,25 @@ do_ha_control(long long action, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { gboolean registered = FALSE; - static crm_cluster_t *cluster = NULL; + static pcmk_cluster_t *cluster = NULL; if (cluster == NULL) { cluster = pcmk_cluster_new(); } if (action & A_HA_DISCONNECT) { - crm_cluster_disconnect(cluster); + pcmk_cluster_disconnect(cluster); crm_info("Disconnected from the cluster"); controld_set_fsa_input_flags(R_HA_DISCONNECTED); } if (action & A_HA_CONNECT) { - crm_set_status_callback(&peer_update_callback); - crm_set_autoreap(FALSE); + pcmk__cluster_set_status_callback(&peer_update_callback); + pcmk__cluster_set_autoreap(false); #if SUPPORT_COROSYNC - if (is_corosync_cluster()) { + if (pcmk_get_cluster_layer() == pcmk_cluster_layer_corosync) { registered = crm_connect_corosync(cluster); } #endif // SUPPORT_COROSYNC @@ -117,7 +117,7 @@ do_shutdown_req(long long action, pcmk__s(controld_globals.dc_name, "not set")); msg = create_request(CRM_OP_SHUTDOWN_REQ, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL); - if (send_cluster_message(NULL, crm_msg_crmd, msg, TRUE) == FALSE) { + if (!pcmk__cluster_send_message(NULL, crm_msg_crmd, msg)) { register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } free_xml(msg); @@ -241,7 +241,7 @@ crmd_exit(crm_exit_t exit_code) controld_destroy_transition_trigger(); pcmk__client_cleanup(); - crm_peer_destroy(); + pcmk__cluster_destroy_node_caches(); controld_free_fsa_timers(); te_cleanup_stonith_history_sync(NULL, TRUE); @@ -365,7 +365,7 @@ accept_controller_client(qb_ipcs_connection_t *c, uid_t uid, gid_t gid) { crm_trace("Accepting new IPC client connection"); if (pcmk__new_client(c, uid, gid) == NULL) { - return -EIO; + return -ENOMEM; } return 0; } @@ -381,15 +381,17 @@ dispatch_controller_ipc(qb_ipcs_connection_t * c, void *data, size_t size) xmlNode *msg = pcmk__client_data2xml(client, data, &id, &flags); if (msg == NULL) { - pcmk__ipc_send_ack(client, id, flags, "ack", NULL, CRM_EX_PROTOCOL); + pcmk__ipc_send_ack(client, id, flags, PCMK__XE_ACK, NULL, + CRM_EX_PROTOCOL); return 0; } - pcmk__ipc_send_ack(client, id, flags, "ack", NULL, CRM_EX_INDETERMINATE); + pcmk__ipc_send_ack(client, id, flags, PCMK__XE_ACK, NULL, + CRM_EX_INDETERMINATE); CRM_ASSERT(client->user != NULL); - pcmk__update_acl_user(msg, F_CRM_USER, client->user); + pcmk__update_acl_user(msg, PCMK__XA_CRM_USER, client->user); - crm_xml_add(msg, F_CRM_SYS_FROM, client->id); + crm_xml_add(msg, PCMK__XA_CRM_SYS_FROM, client->id); if (controld_authorize_ipc_message(msg, client, NULL)) { crm_trace("Processing IPC message from client %s", pcmk__client_name(client)); @@ -515,194 +517,6 @@ do_recover(long long action, register_fsa_input(C_FSA_INTERNAL, I_TERMINATE, NULL); } -static pcmk__cluster_option_t controller_options[] = { - /* name, old name, type, allowed values, - * default value, validator, - * short description, - * long description - */ - { - "dc-version", NULL, "string", NULL, PCMK__VALUE_NONE, NULL, - N_("Pacemaker version on cluster node elected Designated Controller (DC)"), - N_("Includes a hash which identifies the exact changeset the code was " - "built from. Used for diagnostic purposes.") - }, - { - "cluster-infrastructure", NULL, "string", NULL, "corosync", NULL, - N_("The messaging stack on which Pacemaker is currently running"), - N_("Used for informational and diagnostic purposes.") - }, - { - "cluster-name", NULL, "string", NULL, NULL, NULL, - N_("An arbitrary name for the cluster"), - N_("This optional value is mostly for users' convenience as desired " - "in administration, but may also be used in Pacemaker " - "configuration rules via the #cluster-name node attribute, and " - "by higher-level tools and resource agents.") - }, - { - XML_CONFIG_ATTR_DC_DEADTIME, NULL, "time", - NULL, "20s", pcmk__valid_interval_spec, - N_("How long to wait for a response from other nodes during start-up"), - N_("The optimal value will depend on the speed and load of your network " - "and the type of switches used.") - }, - { - XML_CONFIG_ATTR_RECHECK, NULL, "time", - N_("Zero disables polling, while positive values are an interval in seconds" - "(unless other units are specified, for example \"5min\")"), - "15min", pcmk__valid_interval_spec, - N_("Polling interval to recheck cluster state and evaluate rules " - "with date specifications"), - N_("Pacemaker is primarily event-driven, and looks ahead to know when to " - "recheck cluster state for failure timeouts and most time-based " - "rules. However, it will also recheck the cluster after this " - "amount of inactivity, to evaluate rules with date specifications " - "and serve as a fail-safe for certain types of scheduler bugs.") - }, - { - "load-threshold", NULL, "percentage", NULL, - "80%", pcmk__valid_percentage, - N_("Maximum amount of system load that should be used by cluster nodes"), - N_("The cluster will slow down its recovery process when the amount of " - "system resources used (currently CPU) approaches this limit"), - }, - { - "node-action-limit", NULL, "integer", NULL, - "0", pcmk__valid_number, - N_("Maximum number of jobs that can be scheduled per node " - "(defaults to 2x cores)") - }, - { XML_CONFIG_ATTR_FENCE_REACTION, NULL, "string", NULL, "stop", NULL, - N_("How a cluster node should react if notified of its own fencing"), - N_("A cluster node may receive notification of its own fencing if fencing " - "is misconfigured, or if fabric fencing is in use that doesn't cut " - "cluster communication. Allowed values are \"stop\" to attempt to " - "immediately stop Pacemaker and stay stopped, or \"panic\" to attempt " - "to immediately reboot the local node, falling back to stop on failure.") - }, - { - XML_CONFIG_ATTR_ELECTION_FAIL, NULL, "time", NULL, - "2min", pcmk__valid_interval_spec, - "*** Advanced Use Only ***", - N_("Declare an election failed if it is not decided within this much " - "time. If you need to adjust this value, it probably indicates " - "the presence of a bug.") - }, - { - XML_CONFIG_ATTR_FORCE_QUIT, NULL, "time", NULL, - "20min", pcmk__valid_interval_spec, - "*** Advanced Use Only ***", - N_("Exit immediately if shutdown does not complete within this much " - "time. If you need to adjust this value, it probably indicates " - "the presence of a bug.") - }, - { - "join-integration-timeout", "crmd-integration-timeout", "time", NULL, - "3min", pcmk__valid_interval_spec, - "*** Advanced Use Only ***", - N_("If you need to adjust this value, it probably indicates " - "the presence of a bug.") - }, - { - "join-finalization-timeout", "crmd-finalization-timeout", "time", NULL, - "30min", pcmk__valid_interval_spec, - "*** Advanced Use Only ***", - N_("If you need to adjust this value, it probably indicates " - "the presence of a bug.") - }, - { - "transition-delay", "crmd-transition-delay", "time", NULL, - "0s", pcmk__valid_interval_spec, - N_("*** Advanced Use Only *** Enabling this option will slow down " - "cluster recovery under all conditions"), - N_("Delay cluster recovery for this much time to allow for additional " - "events to occur. Useful if your configuration is sensitive to " - "the order in which ping updates arrive.") - }, - { - "stonith-watchdog-timeout", NULL, "time", NULL, - "0", controld_verify_stonith_watchdog_timeout, - N_("How long before nodes can be assumed to be safely down when " - "watchdog-based self-fencing via SBD is in use"), - N_("If this is set to a positive value, lost nodes are assumed to " - "self-fence using watchdog-based SBD within this much time. This " - "does not require a fencing resource to be explicitly configured, " - "though a fence_watchdog resource can be configured, to limit use " - "to specific nodes. If this is set to 0 (the default), the cluster " - "will never assume watchdog-based self-fencing. If this is set to a " - "negative value, the cluster will use twice the local value of the " - "`SBD_WATCHDOG_TIMEOUT` environment variable if that is positive, " - "or otherwise treat this as 0. WARNING: When used, this timeout " - "must be larger than `SBD_WATCHDOG_TIMEOUT` on all nodes that use " - "watchdog-based SBD, and Pacemaker will refuse to start on any of " - "those nodes where this is not true for the local value or SBD is " - "not active. When this is set to a negative value, " - "`SBD_WATCHDOG_TIMEOUT` must be set to the same value on all nodes " - "that use SBD, otherwise data corruption or loss could occur.") - }, - { - "stonith-max-attempts", NULL, "integer", NULL, - "10", pcmk__valid_positive_number, - N_("How many times fencing can fail before it will no longer be " - "immediately re-attempted on a target") - }, - - // Already documented in libpe_status (other values must be kept identical) - { - "no-quorum-policy", NULL, "select", - "stop, freeze, ignore, demote, suicide", "stop", pcmk__valid_quorum, - N_("What to do when the cluster does not have quorum"), NULL - }, - { - XML_CONFIG_ATTR_SHUTDOWN_LOCK, NULL, "boolean", NULL, - "false", pcmk__valid_boolean, - N_("Whether to lock resources to a cleanly shut down node"), - N_("When true, resources active on a node when it is cleanly shut down " - "are kept \"locked\" to that node (not allowed to run elsewhere) " - "until they start again on that node after it rejoins (or for at " - "most shutdown-lock-limit, if set). Stonith resources and " - "Pacemaker Remote connections are never locked. Clone and bundle " - "instances and the promoted role of promotable clones are " - "currently never locked, though support could be added in a future " - "release.") - }, - { - XML_CONFIG_ATTR_SHUTDOWN_LOCK_LIMIT, NULL, "time", NULL, - "0", pcmk__valid_interval_spec, - N_("Do not lock resources to a cleanly shut down node longer than " - "this"), - N_("If shutdown-lock is true and this is set to a nonzero time " - "duration, shutdown locks will expire after this much time has " - "passed since the shutdown was initiated, even if the node has not " - "rejoined.") - }, - { - XML_CONFIG_ATTR_NODE_PENDING_TIMEOUT, NULL, "time", NULL, - "0", pcmk__valid_interval_spec, - N_("How long to wait for a node that has joined the cluster to join " - "the controller process group"), - N_("Fence nodes that do not join the controller process group within " - "this much time after joining the cluster, to allow the cluster " - "to continue managing resources. A value of 0 means never fence " - "pending nodes. Setting the value to 2h means fence nodes after " - "2 hours.") - }, -}; - -void -crmd_metadata(void) -{ - const char *desc_short = "Pacemaker controller options"; - const char *desc_long = "Cluster options used by Pacemaker's controller"; - - gchar *s = pcmk__format_option_metadata("pacemaker-controld", desc_short, - desc_long, controller_options, - PCMK__NELEM(controller_options)); - printf("%s", s); - g_free(s); -} - static void config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { @@ -726,49 +540,62 @@ config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void } crmconfig = output; - if ((crmconfig != NULL) - && !pcmk__xe_is(crmconfig, XML_CIB_TAG_CRMCONFIG)) { - crmconfig = first_named_child(crmconfig, XML_CIB_TAG_CRMCONFIG); + if ((crmconfig != NULL) && !pcmk__xe_is(crmconfig, PCMK_XE_CRM_CONFIG)) { + crmconfig = pcmk__xe_first_child(crmconfig, PCMK_XE_CRM_CONFIG, NULL, + NULL); } if (!crmconfig) { fsa_data_t *msg_data = NULL; - crm_err("Local CIB query for " XML_CIB_TAG_CRMCONFIG " section failed"); + crm_err("Local CIB query for " PCMK_XE_CRM_CONFIG " section failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); goto bail; } crm_debug("Call %d : Parsing CIB options", call_id); config_hash = pcmk__strkey_table(free, free); - pe_unpack_nvpairs(crmconfig, crmconfig, XML_CIB_TAG_PROPSET, NULL, - config_hash, CIB_OPTIONS_FIRST, FALSE, now, NULL); + pe_unpack_nvpairs(crmconfig, crmconfig, PCMK_XE_CLUSTER_PROPERTY_SET, NULL, + config_hash, PCMK_VALUE_CIB_BOOTSTRAP_OPTIONS, FALSE, now, + NULL); // Validate all options, and use defaults if not already present in hash - pcmk__validate_cluster_options(config_hash, controller_options, - PCMK__NELEM(controller_options)); + pcmk__validate_cluster_options(config_hash); - value = g_hash_table_lookup(config_hash, "no-quorum-policy"); - if (pcmk__str_eq(value, "suicide", pcmk__str_casei) && pcmk__locate_sbd()) { + /* Validate the watchdog timeout in the context of the local node + * environment. If invalid, the controller will exit with a fatal error. + * + * We do this via a wrapper in the controller, so that we call + * pcmk__valid_stonith_watchdog_timeout() only if watchdog fencing is + * enabled for the local node. Otherwise, we may exit unnecessarily. + * + * A validator function in libcrmcommon can't act as such a wrapper, because + * it doesn't have a stonith API connection or the local node name. + */ + value = g_hash_table_lookup(config_hash, PCMK_OPT_STONITH_WATCHDOG_TIMEOUT); + controld_verify_stonith_watchdog_timeout(value); + + value = g_hash_table_lookup(config_hash, PCMK_OPT_NO_QUORUM_POLICY); + if (pcmk__str_eq(value, PCMK_VALUE_FENCE_LEGACY, pcmk__str_casei) + && (pcmk__locate_sbd() != 0)) { controld_set_global_flags(controld_no_quorum_suicide); } - value = g_hash_table_lookup(config_hash, XML_CONFIG_ATTR_SHUTDOWN_LOCK); + value = g_hash_table_lookup(config_hash, PCMK_OPT_SHUTDOWN_LOCK); if (crm_is_true(value)) { controld_set_global_flags(controld_shutdown_lock_enabled); } else { controld_clear_global_flags(controld_shutdown_lock_enabled); } - value = g_hash_table_lookup(config_hash, - XML_CONFIG_ATTR_SHUTDOWN_LOCK_LIMIT); - controld_globals.shutdown_lock_limit = crm_parse_interval_spec(value) - / 1000; + value = g_hash_table_lookup(config_hash, PCMK_OPT_SHUTDOWN_LOCK_LIMIT); + pcmk_parse_interval_spec(value, &controld_globals.shutdown_lock_limit); + controld_globals.shutdown_lock_limit /= 1000; - value = g_hash_table_lookup(config_hash, - XML_CONFIG_ATTR_NODE_PENDING_TIMEOUT); - controld_globals.node_pending_timeout = crm_parse_interval_spec(value) / 1000; + value = g_hash_table_lookup(config_hash, PCMK_OPT_NODE_PENDING_TIMEOUT); + pcmk_parse_interval_spec(value, &controld_globals.node_pending_timeout); + controld_globals.node_pending_timeout /= 1000; - value = g_hash_table_lookup(config_hash, "cluster-name"); + value = g_hash_table_lookup(config_hash, PCMK_OPT_CLUSTER_NAME); pcmk__str_update(&(controld_globals.cluster_name), value); // Let subcomponents initialize their own static variables @@ -777,7 +604,7 @@ config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void controld_configure_fsa_timers(config_hash); controld_configure_throttle(config_hash); - alerts = first_named_child(output, XML_CIB_TAG_ALERTS); + alerts = pcmk__xe_first_child(output, PCMK_XE_ALERTS, NULL, NULL); crmd_unpack_alerts(alerts); controld_set_fsa_input_flags(R_READ_CONFIG); @@ -809,8 +636,8 @@ crm_read_options(gpointer user_data) { cib_t *cib_conn = controld_globals.cib_conn; int call_id = cib_conn->cmds->query(cib_conn, - "//" XML_CIB_TAG_CRMCONFIG - " | //" XML_CIB_TAG_ALERTS, + "//" PCMK_XE_CRM_CONFIG + " | //" PCMK_XE_ALERTS, NULL, cib_xpath|cib_scope_local); fsa_register_cib_callback(call_id, NULL, config_query_callback); @@ -829,7 +656,7 @@ do_read_config(long long action, controld_trigger_config(); } -void +static void crm_shutdown(int nsig) { const char *value = NULL; @@ -856,9 +683,7 @@ crm_shutdown(int nsig) * config_query_callback() has been run at least once, it doesn't look like * anything could have changed the timer period since then. */ - value = pcmk__cluster_option(NULL, controller_options, - PCMK__NELEM(controller_options), - XML_CONFIG_ATTR_FORCE_QUIT); - default_period_ms = crm_parse_interval_spec(value); + value = pcmk__cluster_option(NULL, PCMK_OPT_SHUTDOWN_ESCALATION); + pcmk_parse_interval_spec(value, &default_period_ms); controld_shutdown_start_countdown(default_period_ms); } |