diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 06:53:20 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 06:53:20 +0000 |
commit | e5a812082ae033afb1eed82c0f2df3d0f6bdc93f (patch) | |
tree | a6716c9275b4b413f6c9194798b34b91affb3cc7 /lib/cluster | |
parent | Initial commit. (diff) | |
download | pacemaker-e5a812082ae033afb1eed82c0f2df3d0f6bdc93f.tar.xz pacemaker-e5a812082ae033afb1eed82c0f2df3d0f6bdc93f.zip |
Adding upstream version 2.1.6.upstream/2.1.6
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'lib/cluster')
-rw-r--r-- | lib/cluster/Makefile.am | 29 | ||||
-rw-r--r-- | lib/cluster/cluster.c | 405 | ||||
-rw-r--r-- | lib/cluster/corosync.c | 814 | ||||
-rw-r--r-- | lib/cluster/cpg.c | 1092 | ||||
-rw-r--r-- | lib/cluster/crmcluster_private.h | 47 | ||||
-rw-r--r-- | lib/cluster/election.c | 727 | ||||
-rw-r--r-- | lib/cluster/membership.c | 1301 |
7 files changed, 4415 insertions, 0 deletions
diff --git a/lib/cluster/Makefile.am b/lib/cluster/Makefile.am new file mode 100644 index 0000000..9225f29 --- /dev/null +++ b/lib/cluster/Makefile.am @@ -0,0 +1,29 @@ +# +# Copyright 2004-2018 the Pacemaker project contributors +# +# The version control history for this file may have further details. +# +# This source code is licensed under the GNU General Public License version 2 +# or later (GPLv2+) WITHOUT ANY WARRANTY. +# +include $(top_srcdir)/mk/common.mk + +noinst_HEADERS = crmcluster_private.h + +## libraries +lib_LTLIBRARIES = libcrmcluster.la + +libcrmcluster_la_LDFLAGS = -version-info 30:0:1 + +libcrmcluster_la_CFLAGS = $(CFLAGS_HARDENED_LIB) +libcrmcluster_la_LDFLAGS += $(LDFLAGS_HARDENED_LIB) + +libcrmcluster_la_LIBADD = $(top_builddir)/lib/common/libcrmcommon.la $(top_builddir)/lib/fencing/libstonithd.la $(CLUSTERLIBS) + +libcrmcluster_la_SOURCES = election.c cluster.c membership.c +if BUILD_CS_SUPPORT +libcrmcluster_la_SOURCES += cpg.c corosync.c +endif + +clean-generic: + rm -f *.log *.debug *.xml *~ diff --git a/lib/cluster/cluster.c b/lib/cluster/cluster.c new file mode 100644 index 0000000..011e053 --- /dev/null +++ b/lib/cluster/cluster.c @@ -0,0 +1,405 @@ +/* + * Copyright 2004-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * + * This source code is licensed under the GNU Lesser General Public License + * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. + */ + +#include <crm_internal.h> +#include <dlfcn.h> + +#include <stdio.h> +#include <unistd.h> +#include <string.h> +#include <stdlib.h> +#include <time.h> +#include <sys/param.h> +#include <sys/types.h> + +#include <crm/crm.h> +#include <crm/msg_xml.h> + +#include <crm/common/ipc.h> +#include <crm/cluster/internal.h> +#include "crmcluster_private.h" + +CRM_TRACE_INIT_DATA(cluster); + +/*! 
+ * \brief Get (and set if needed) a node's UUID + * + * \param[in,out] peer Node to check + * + * \return Node UUID of \p peer, or NULL if unknown + */ +const char * +crm_peer_uuid(crm_node_t *peer) +{ + char *uuid = NULL; + + // Check simple cases first, to avoid any calls that might block + if (peer == NULL) { + return NULL; + } + if (peer->uuid != NULL) { + return peer->uuid; + } + + switch (get_cluster_type()) { + case pcmk_cluster_corosync: +#if SUPPORT_COROSYNC + uuid = pcmk__corosync_uuid(peer); +#endif + break; + + case pcmk_cluster_unknown: + case pcmk_cluster_invalid: + crm_err("Unsupported cluster type"); + break; + } + + peer->uuid = uuid; + return peer->uuid; +} + +/*! + * \brief Connect to the cluster layer + * + * \param[in,out] Initialized cluster object to connect + * + * \return TRUE on success, otherwise FALSE + */ +gboolean +crm_cluster_connect(crm_cluster_t *cluster) +{ + enum cluster_type_e type = get_cluster_type(); + + crm_notice("Connecting to %s cluster infrastructure", + name_for_cluster_type(type)); + switch (type) { + case pcmk_cluster_corosync: +#if SUPPORT_COROSYNC + crm_peer_init(); + return pcmk__corosync_connect(cluster); +#else + break; +#endif // SUPPORT_COROSYNC + default: + break; + } + return FALSE; +} + +/*! + * \brief Disconnect from the cluster layer + * + * \param[in,out] cluster Cluster object to disconnect + */ +void +crm_cluster_disconnect(crm_cluster_t *cluster) +{ + enum cluster_type_e type = get_cluster_type(); + + crm_info("Disconnecting from %s cluster infrastructure", + name_for_cluster_type(type)); + switch (type) { + case pcmk_cluster_corosync: +#if SUPPORT_COROSYNC + crm_peer_destroy(); + pcmk__corosync_disconnect(cluster); +#endif // SUPPORT_COROSYNC + break; + default: + break; + } +} + +/*! 
+ * \brief Allocate a new \p crm_cluster_t object + * + * \return A newly allocated \p crm_cluster_t object (guaranteed not \p NULL) + * \note The caller is responsible for freeing the return value using + * \p pcmk_cluster_free(). + */ +crm_cluster_t * +pcmk_cluster_new(void) +{ + crm_cluster_t *cluster = calloc(1, sizeof(crm_cluster_t)); + + CRM_ASSERT(cluster != NULL); + return cluster; +} + +/*! + * \brief Free a \p crm_cluster_t object and its dynamically allocated members + * + * \param[in,out] cluster Cluster object to free + */ +void +pcmk_cluster_free(crm_cluster_t *cluster) +{ + if (cluster == NULL) { + return; + } + free(cluster->uuid); + free(cluster->uname); + free(cluster); +} + +/*! + * \brief Send an XML message via the cluster messaging layer + * + * \param[in] node Cluster node to send message to + * \param[in] service Message type to use in message host info + * \param[in] data XML message to send + * \param[in] ordered Ignored for currently supported messaging layers + * + * \return TRUE on success, otherwise FALSE + */ +gboolean +send_cluster_message(const crm_node_t *node, enum crm_ais_msg_types service, + xmlNode *data, gboolean ordered) +{ + switch (get_cluster_type()) { + case pcmk_cluster_corosync: +#if SUPPORT_COROSYNC + return pcmk__cpg_send_xml(data, node, service); +#endif + break; + default: + break; + } + return FALSE; +} + +/*! + * \brief Get the local node's name + * + * \return Local node's name + * \note This will fatally exit if local node name cannot be known. + */ +const char * +get_local_node_name(void) +{ + static char *name = NULL; + + if (name == NULL) { + name = get_node_name(0); + } + return name; +} + +/*! + * \brief Get the node name corresponding to a cluster node ID + * + * \param[in] nodeid Node ID to check (or 0 for local node) + * + * \return Node name corresponding to \p nodeid + * \note This will fatally exit if \p nodeid is 0 and local node name cannot be + * known. 
+ */ +char * +get_node_name(uint32_t nodeid) +{ + char *name = NULL; + enum cluster_type_e stack = get_cluster_type(); + + switch (stack) { + case pcmk_cluster_corosync: +#if SUPPORT_COROSYNC + name = pcmk__corosync_name(0, nodeid); + break; +#endif // SUPPORT_COROSYNC + + default: + crm_err("Unknown cluster type: %s (%d)", name_for_cluster_type(stack), stack); + } + + if ((name == NULL) && (nodeid == 0)) { + name = pcmk_hostname(); + if (name == NULL) { + // @TODO Maybe let the caller decide what to do + crm_err("Could not obtain the local %s node name", + name_for_cluster_type(stack)); + crm_exit(CRM_EX_FATAL); + } + crm_notice("Defaulting to uname -n for the local %s node name", + name_for_cluster_type(stack)); + } + + if (name == NULL) { + crm_notice("Could not obtain a node name for %s node with id %u", + name_for_cluster_type(stack), nodeid); + } + return name; +} + +/*! + * \brief Get the node name corresponding to a node UUID + * + * \param[in] uuid UUID of desired node + * + * \return name of desired node + * + * \note This relies on the remote peer cache being populated with all + * remote nodes in the cluster, so callers should maintain that cache. 
+ */ +const char * +crm_peer_uname(const char *uuid) +{ + GHashTableIter iter; + crm_node_t *node = NULL; + + CRM_CHECK(uuid != NULL, return NULL); + + /* remote nodes have the same uname and uuid */ + if (g_hash_table_lookup(crm_remote_peer_cache, uuid)) { + return uuid; + } + + /* avoid blocking calls where possible */ + g_hash_table_iter_init(&iter, crm_peer_cache); + while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { + if (pcmk__str_eq(node->uuid, uuid, pcmk__str_casei)) { + if (node->uname != NULL) { + return node->uname; + } + break; + } + } + node = NULL; + + if (is_corosync_cluster()) { + long long id; + + if ((pcmk__scan_ll(uuid, &id, 0LL) != pcmk_rc_ok) + || (id < 1LL) || (id > UINT32_MAX)) { + crm_err("Invalid Corosync node ID '%s'", uuid); + return NULL; + } + + node = pcmk__search_cluster_node_cache((uint32_t) id, NULL); + if (node != NULL) { + crm_info("Setting uuid for node %s[%u] to %s", + node->uname, node->id, uuid); + node->uuid = strdup(uuid); + return node->uname; + } + return NULL; + } + + return NULL; +} + +/*! + * \brief Add a node's UUID as an XML attribute + * + * \param[in,out] xml XML element to add UUID to + * \param[in] attr XML attribute name to set + * \param[in,out] node Node whose UUID should be used as attribute value + */ +void +set_uuid(xmlNode *xml, const char *attr, crm_node_t *node) +{ + crm_xml_add(xml, attr, crm_peer_uuid(node)); +} + +/*! + * \brief Get a log-friendly string equivalent of a cluster type + * + * \param[in] type Cluster type + * + * \return Log-friendly string corresponding to \p type + */ +const char * +name_for_cluster_type(enum cluster_type_e type) +{ + switch (type) { + case pcmk_cluster_corosync: + return "corosync"; + case pcmk_cluster_unknown: + return "unknown"; + case pcmk_cluster_invalid: + return "invalid"; + } + crm_err("Invalid cluster type: %d", type); + return "invalid"; +} + +/*! 
+ * \brief Get (and validate) the local cluster type + * + * \return Local cluster type + * \note This will fatally exit if the local cluster type is invalid. + */ +enum cluster_type_e +get_cluster_type(void) +{ + bool detected = false; + const char *cluster = NULL; + static enum cluster_type_e cluster_type = pcmk_cluster_unknown; + + /* Return the previous calculation, if any */ + if (cluster_type != pcmk_cluster_unknown) { + return cluster_type; + } + + cluster = pcmk__env_option(PCMK__ENV_CLUSTER_TYPE); + +#if SUPPORT_COROSYNC + /* If nothing is defined in the environment, try corosync (if supported) */ + if (cluster == NULL) { + crm_debug("Testing with Corosync"); + cluster_type = pcmk__corosync_detect(); + if (cluster_type != pcmk_cluster_unknown) { + detected = true; + goto done; + } + } +#endif + + /* Something was defined in the environment, test it against what we support */ + crm_info("Verifying cluster type: '%s'", + ((cluster == NULL)? "-unspecified-" : cluster)); + if (cluster == NULL) { + +#if SUPPORT_COROSYNC + } else if (pcmk__str_eq(cluster, "corosync", pcmk__str_casei)) { + cluster_type = pcmk_cluster_corosync; +#endif + + } else { + cluster_type = pcmk_cluster_invalid; + goto done; /* Keep the compiler happy when no stacks are supported */ + } + + done: + if (cluster_type == pcmk_cluster_unknown) { + crm_notice("Could not determine the current cluster type"); + + } else if (cluster_type == pcmk_cluster_invalid) { + crm_notice("This installation does not support the '%s' cluster infrastructure: terminating.", + cluster); + crm_exit(CRM_EX_FATAL); + + } else { + crm_info("%s an active '%s' cluster", + (detected? "Detected" : "Assuming"), + name_for_cluster_type(cluster_type)); + } + + return cluster_type; +} + +/*! 
+ * \brief Check whether the local cluster is a Corosync cluster + * + * \return TRUE if the local cluster is a Corosync cluster, otherwise FALSE + */ +gboolean +is_corosync_cluster(void) +{ + return get_cluster_type() == pcmk_cluster_corosync; +} diff --git a/lib/cluster/corosync.c b/lib/cluster/corosync.c new file mode 100644 index 0000000..08280ce --- /dev/null +++ b/lib/cluster/corosync.c @@ -0,0 +1,814 @@ +/* + * Copyright 2004-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * + * This source code is licensed under the GNU Lesser General Public License + * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. + */ + +#include <crm_internal.h> + +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <netdb.h> +#include <inttypes.h> // PRIu64 + +#include <bzlib.h> + +#include <crm/common/ipc.h> +#include <crm/cluster/internal.h> +#include <crm/common/mainloop.h> +#include <sys/utsname.h> + +#include <qb/qbipcc.h> +#include <qb/qbutil.h> + +#include <corosync/corodefs.h> +#include <corosync/corotypes.h> +#include <corosync/hdb.h> +#include <corosync/cfg.h> +#include <corosync/cmap.h> +#include <corosync/quorum.h> + +#include <crm/msg_xml.h> + +#include <crm/common/ipc_internal.h> /* PCMK__SPECIAL_PID* */ +#include "crmcluster_private.h" + +static quorum_handle_t pcmk_quorum_handle = 0; + +static gboolean (*quorum_app_callback)(unsigned long long seq, + gboolean quorate) = NULL; + +/*! + * \internal + * \brief Get the Corosync UUID associated with a Pacemaker node + * + * \param[in] node Pacemaker node + * + * \return Newly allocated string with node's Corosync UUID, or NULL if unknown + * \note It is the caller's responsibility to free the result with free(). 
+ */ +char * +pcmk__corosync_uuid(const crm_node_t *node) +{ + if ((node != NULL) && is_corosync_cluster()) { + if (node->id > 0) { + return crm_strdup_printf("%u", node->id); + } else { + crm_info("Node %s is not yet known by Corosync", node->uname); + } + } + return NULL; +} + +static bool +node_name_is_valid(const char *key, const char *name) +{ + int octet; + + if (name == NULL) { + crm_trace("%s is empty", key); + return false; + + } else if (sscanf(name, "%d.%d.%d.%d", &octet, &octet, &octet, &octet) == 4) { + crm_trace("%s contains an IPv4 address (%s), ignoring", key, name); + return false; + + } else if (strstr(name, ":") != NULL) { + crm_trace("%s contains an IPv6 address (%s), ignoring", key, name); + return false; + } + crm_trace("'%s: %s' is valid", key, name); + return true; +} + +/* + * \internal + * \brief Get Corosync node name corresponding to a node ID + * + * \param[in] cmap_handle Connection to Corosync CMAP + * \param[in] nodeid Node ID to check + * + * \return Newly allocated string with name or (if no name) IP address + * associated with first address assigned to a Corosync node ID (or NULL + * if unknown) + * \note It is the caller's responsibility to free the result with free(). + */ +char * +pcmk__corosync_name(uint64_t /*cmap_handle_t */ cmap_handle, uint32_t nodeid) +{ + // Originally based on corosync-quorumtool.c:node_name() + + int lpc = 0; + cs_error_t rc = CS_OK; + int retries = 0; + char *name = NULL; + cmap_handle_t local_handle = 0; + int fd = -1; + uid_t found_uid = 0; + gid_t found_gid = 0; + pid_t found_pid = 0; + int rv; + + if (nodeid == 0) { + nodeid = get_local_nodeid(0); + } + + if (cmap_handle == 0 && local_handle == 0) { + retries = 0; + crm_trace("Initializing CMAP connection"); + do { + rc = pcmk__init_cmap(&local_handle); + if (rc != CS_OK) { + retries++; + crm_debug("API connection setup failed: %s. 
Retrying in %ds", cs_strerror(rc), + retries); + sleep(retries); + } + + } while (retries < 5 && rc != CS_OK); + + if (rc != CS_OK) { + crm_warn("Could not connect to Cluster Configuration Database API, error %s", + cs_strerror(rc)); + local_handle = 0; + } + } + + if (cmap_handle == 0) { + cmap_handle = local_handle; + + rc = cmap_fd_get(cmap_handle, &fd); + if (rc != CS_OK) { + crm_err("Could not obtain the CMAP API connection: %s (%d)", + cs_strerror(rc), rc); + goto bail; + } + + /* CMAP provider run as root (in given user namespace, anyway)? */ + if (!(rv = crm_ipc_is_authentic_process(fd, (uid_t) 0,(gid_t) 0, &found_pid, + &found_uid, &found_gid))) { + crm_err("CMAP provider is not authentic:" + " process %lld (uid: %lld, gid: %lld)", + (long long) PCMK__SPECIAL_PID_AS_0(found_pid), + (long long) found_uid, (long long) found_gid); + goto bail; + } else if (rv < 0) { + crm_err("Could not verify authenticity of CMAP provider: %s (%d)", + strerror(-rv), -rv); + goto bail; + } + } + + while (name == NULL && cmap_handle != 0) { + uint32_t id = 0; + char *key = NULL; + + key = crm_strdup_printf("nodelist.node.%d.nodeid", lpc); + rc = cmap_get_uint32(cmap_handle, key, &id); + crm_trace("Checking %u vs %u from %s", nodeid, id, key); + free(key); + + if (rc != CS_OK) { + break; + } + + if (nodeid == id) { + crm_trace("Searching for node name for %u in nodelist.node.%d %s", + nodeid, lpc, pcmk__s(name, "<null>")); + if (name == NULL) { + key = crm_strdup_printf("nodelist.node.%d.name", lpc); + cmap_get_string(cmap_handle, key, &name); + crm_trace("%s = %s", key, pcmk__s(name, "<null>")); + free(key); + } + if (name == NULL) { + key = crm_strdup_printf("nodelist.node.%d.ring0_addr", lpc); + cmap_get_string(cmap_handle, key, &name); + crm_trace("%s = %s", key, pcmk__s(name, "<null>")); + + if (!node_name_is_valid(key, name)) { + free(name); + name = NULL; + } + free(key); + } + break; + } + + lpc++; + } + +bail: + if(local_handle) { + cmap_finalize(local_handle); + } + + 
if (name == NULL) { + crm_info("Unable to get node name for nodeid %u", nodeid); + } + return name; +} + +/*! + * \internal + * \brief Disconnect from Corosync cluster + * + * \param[in,out] cluster Cluster connection to disconnect + */ +void +pcmk__corosync_disconnect(crm_cluster_t *cluster) +{ + cluster_disconnect_cpg(cluster); + if (pcmk_quorum_handle) { + quorum_finalize(pcmk_quorum_handle); + pcmk_quorum_handle = 0; + } + crm_notice("Disconnected from Corosync"); +} + +/*! + * \internal + * \brief Dispatch function for quorum connection file descriptor + * + * \param[in] user_data Ignored + * + * \return 0 on success, -1 on error (per mainloop_io_t interface) + */ +static int +quorum_dispatch_cb(gpointer user_data) +{ + int rc = quorum_dispatch(pcmk_quorum_handle, CS_DISPATCH_ALL); + + if (rc < 0) { + crm_err("Connection to the Quorum API failed: %d", rc); + quorum_finalize(pcmk_quorum_handle); + pcmk_quorum_handle = 0; + return -1; + } + return 0; +} + +/*! + * \internal + * \brief Notification callback for Corosync quorum connection + * + * \param[in] handle Corosync quorum connection + * \param[in] quorate Whether cluster is quorate + * \param[in] ring_id Corosync ring ID + * \param[in] view_list_entries Number of entries in \p view_list + * \param[in] view_list Corosync node IDs in membership + */ +static void +quorum_notification_cb(quorum_handle_t handle, uint32_t quorate, + uint64_t ring_id, uint32_t view_list_entries, + uint32_t *view_list) +{ + int i; + GHashTableIter iter; + crm_node_t *node = NULL; + static gboolean init_phase = TRUE; + + if (quorate != crm_have_quorum) { + if (quorate) { + crm_notice("Quorum acquired " CRM_XS " membership=%" PRIu64 " members=%lu", + ring_id, (long unsigned int)view_list_entries); + } else { + crm_warn("Quorum lost " CRM_XS " membership=%" PRIu64 " members=%lu", + ring_id, (long unsigned int)view_list_entries); + } + crm_have_quorum = quorate; + + } else { + crm_info("Quorum %s " CRM_XS " membership=%" PRIu64 " 
members=%lu", + (quorate? "retained" : "still lost"), ring_id, + (long unsigned int)view_list_entries); + } + + if (view_list_entries == 0 && init_phase) { + crm_info("Corosync membership is still forming, ignoring"); + return; + } + + init_phase = FALSE; + + /* Reset last_seen for all cached nodes so we can tell which ones aren't + * in the view list */ + g_hash_table_iter_init(&iter, crm_peer_cache); + while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { + node->last_seen = 0; + } + + /* Update the peer cache for each node in view list */ + for (i = 0; i < view_list_entries; i++) { + uint32_t id = view_list[i]; + + crm_debug("Member[%d] %u ", i, id); + + /* Get this node's peer cache entry (adding one if not already there) */ + node = crm_get_peer(id, NULL); + if (node->uname == NULL) { + char *name = pcmk__corosync_name(0, id); + + crm_info("Obtaining name for new node %u", id); + node = crm_get_peer(id, name); + free(name); + } + + /* Update the node state (including updating last_seen to ring_id) */ + pcmk__update_peer_state(__func__, node, CRM_NODE_MEMBER, ring_id); + } + + /* Remove any peer cache entries we didn't update */ + pcmk__reap_unseen_nodes(ring_id); + + if (quorum_app_callback) { + quorum_app_callback(ring_id, quorate); + } +} + +/*! 
+ * \internal + * \brief Connect to Corosync quorum service + * + * \param[in] dispatch Connection dispatch callback + * \param[in] destroy Connection destroy callback + */ +void +pcmk__corosync_quorum_connect(gboolean (*dispatch)(unsigned long long, + gboolean), + void (*destroy)(gpointer)) +{ + cs_error_t rc; + int fd = 0; + int quorate = 0; + uint32_t quorum_type = 0; + struct mainloop_fd_callbacks quorum_fd_callbacks; + uid_t found_uid = 0; + gid_t found_gid = 0; + pid_t found_pid = 0; + int rv; + + quorum_fd_callbacks.dispatch = quorum_dispatch_cb; + quorum_fd_callbacks.destroy = destroy; + + crm_debug("Configuring Pacemaker to obtain quorum from Corosync"); + + { +#if 0 + // New way but not supported by all Corosync 2 versions + quorum_model_v0_data_t quorum_model_data = { + .model = QUORUM_MODEL_V0, + .quorum_notify_fn = quorum_notification_cb, + }; + + rc = quorum_model_initialize(&pcmk_quorum_handle, QUORUM_MODEL_V0, + (quorum_model_data_t *) &quorum_model_data, + &quorum_type, NULL); +#else + quorum_callbacks_t quorum_callbacks = { + .quorum_notify_fn = quorum_notification_cb, + }; + + rc = quorum_initialize(&pcmk_quorum_handle, &quorum_callbacks, + &quorum_type); +#endif + } + + if (rc != CS_OK) { + crm_err("Could not connect to the Quorum API: %s (%d)", + cs_strerror(rc), rc); + goto bail; + + } else if (quorum_type != QUORUM_SET) { + crm_err("Corosync quorum is not configured"); + goto bail; + } + + rc = quorum_fd_get(pcmk_quorum_handle, &fd); + if (rc != CS_OK) { + crm_err("Could not obtain the Quorum API connection: %s (%d)", + strerror(rc), rc); + goto bail; + } + + /* Quorum provider run as root (in given user namespace, anyway)? 
*/ + if (!(rv = crm_ipc_is_authentic_process(fd, (uid_t) 0,(gid_t) 0, &found_pid, + &found_uid, &found_gid))) { + crm_err("Quorum provider is not authentic:" + " process %lld (uid: %lld, gid: %lld)", + (long long) PCMK__SPECIAL_PID_AS_0(found_pid), + (long long) found_uid, (long long) found_gid); + rc = CS_ERR_ACCESS; + goto bail; + } else if (rv < 0) { + crm_err("Could not verify authenticity of Quorum provider: %s (%d)", + strerror(-rv), -rv); + rc = CS_ERR_ACCESS; + goto bail; + } + + rc = quorum_getquorate(pcmk_quorum_handle, &quorate); + if (rc != CS_OK) { + crm_err("Could not obtain the current Quorum API state: %d", rc); + goto bail; + } + + if (quorate) { + crm_notice("Quorum acquired"); + } else { + crm_warn("No quorum"); + } + quorum_app_callback = dispatch; + crm_have_quorum = quorate; + + rc = quorum_trackstart(pcmk_quorum_handle, CS_TRACK_CHANGES | CS_TRACK_CURRENT); + if (rc != CS_OK) { + crm_err("Could not setup Quorum API notifications: %d", rc); + goto bail; + } + + mainloop_add_fd("quorum", G_PRIORITY_HIGH, fd, dispatch, &quorum_fd_callbacks); + + pcmk__corosync_add_nodes(NULL); + + bail: + if (rc != CS_OK) { + quorum_finalize(pcmk_quorum_handle); + } +} + +/*! 
+ * \internal + * \brief Connect to Corosync cluster layer + * + * \param[in,out] cluster Initialized cluster object to connect + */ +gboolean +pcmk__corosync_connect(crm_cluster_t *cluster) +{ + crm_node_t *peer = NULL; + enum cluster_type_e stack = get_cluster_type(); + + crm_peer_init(); + + if (stack != pcmk_cluster_corosync) { + crm_err("Invalid cluster type: %s " CRM_XS " stack=%d", + name_for_cluster_type(stack), stack); + return FALSE; + } + + if (!cluster_connect_cpg(cluster)) { + // Error message was logged by cluster_connect_cpg() + return FALSE; + } + crm_info("Connection to %s established", name_for_cluster_type(stack)); + + cluster->nodeid = get_local_nodeid(0); + if (cluster->nodeid == 0) { + crm_err("Could not determine local node ID"); + return FALSE; + } + + cluster->uname = get_node_name(0); + if (cluster->uname == NULL) { + crm_err("Could not determine local node name"); + return FALSE; + } + + // Ensure local node always exists in peer cache + peer = crm_get_peer(cluster->nodeid, cluster->uname); + cluster->uuid = pcmk__corosync_uuid(peer); + + return TRUE; +} + +/*! + * \internal + * \brief Check whether a Corosync cluster is active + * + * \return pcmk_cluster_corosync if Corosync is found, else pcmk_cluster_unknown + */ +enum cluster_type_e +pcmk__corosync_detect(void) +{ + int rc = CS_OK; + cmap_handle_t handle; + + rc = pcmk__init_cmap(&handle); + + switch(rc) { + case CS_OK: + break; + case CS_ERR_SECURITY: + crm_debug("Failed to initialize the cmap API: Permission denied (%d)", rc); + /* It's there, we just can't talk to it. + * Good enough for us to identify as 'corosync' + */ + return pcmk_cluster_corosync; + + default: + crm_info("Failed to initialize the cmap API: %s (%d)", + pcmk__cs_err_str(rc), rc); + return pcmk_cluster_unknown; + } + + cmap_finalize(handle); + return pcmk_cluster_corosync; +} + +/*! 
+ * \brief Check whether a Corosync cluster peer is active + * + * \param[in] node Node to check + * + * \return TRUE if \p node is an active Corosync peer, otherwise FALSE + */ +gboolean +crm_is_corosync_peer_active(const crm_node_t *node) +{ + if (node == NULL) { + crm_trace("Corosync peer inactive: NULL"); + return FALSE; + + } else if (!pcmk__str_eq(node->state, CRM_NODE_MEMBER, pcmk__str_casei)) { + crm_trace("Corosync peer %s inactive: state=%s", + node->uname, node->state); + return FALSE; + + } else if (!pcmk_is_set(node->processes, crm_proc_cpg)) { + crm_trace("Corosync peer %s inactive: processes=%.16x", + node->uname, node->processes); + return FALSE; + } + return TRUE; +} + +/*! + * \internal + * \brief Load Corosync node list (via CMAP) into peer cache and optionally XML + * + * \param[in,out] xml_parent If not NULL, add <node> entry here for each node + * + * \return true if any nodes were found, false otherwise + */ +bool +pcmk__corosync_add_nodes(xmlNode *xml_parent) +{ + int lpc = 0; + cs_error_t rc = CS_OK; + int retries = 0; + bool any = false; + cmap_handle_t cmap_handle; + int fd = -1; + uid_t found_uid = 0; + gid_t found_gid = 0; + pid_t found_pid = 0; + int rv; + + do { + rc = pcmk__init_cmap(&cmap_handle); + if (rc != CS_OK) { + retries++; + crm_debug("API connection setup failed: %s. Retrying in %ds", cs_strerror(rc), + retries); + sleep(retries); + } + + } while (retries < 5 && rc != CS_OK); + + if (rc != CS_OK) { + crm_warn("Could not connect to Cluster Configuration Database API, error %d", rc); + return false; + } + + rc = cmap_fd_get(cmap_handle, &fd); + if (rc != CS_OK) { + crm_err("Could not obtain the CMAP API connection: %s (%d)", + cs_strerror(rc), rc); + goto bail; + } + + /* CMAP provider run as root (in given user namespace, anyway)? 
*/ + if (!(rv = crm_ipc_is_authentic_process(fd, (uid_t) 0,(gid_t) 0, &found_pid, + &found_uid, &found_gid))) { + crm_err("CMAP provider is not authentic:" + " process %lld (uid: %lld, gid: %lld)", + (long long) PCMK__SPECIAL_PID_AS_0(found_pid), + (long long) found_uid, (long long) found_gid); + goto bail; + } else if (rv < 0) { + crm_err("Could not verify authenticity of CMAP provider: %s (%d)", + strerror(-rv), -rv); + goto bail; + } + + crm_peer_init(); + crm_trace("Initializing Corosync node list"); + for (lpc = 0; TRUE; lpc++) { + uint32_t nodeid = 0; + char *name = NULL; + char *key = NULL; + + key = crm_strdup_printf("nodelist.node.%d.nodeid", lpc); + rc = cmap_get_uint32(cmap_handle, key, &nodeid); + free(key); + + if (rc != CS_OK) { + break; + } + + name = pcmk__corosync_name(cmap_handle, nodeid); + if (name != NULL) { + GHashTableIter iter; + crm_node_t *node = NULL; + + g_hash_table_iter_init(&iter, crm_peer_cache); + while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { + if(node && node->uname && strcasecmp(node->uname, name) == 0) { + if (node->id && node->id != nodeid) { + crm_crit("Nodes %u and %u share the same name '%s': shutting down", node->id, + nodeid, name); + crm_exit(CRM_EX_FATAL); + } + } + } + } + + if (nodeid > 0 || name != NULL) { + crm_trace("Initializing node[%d] %u = %s", lpc, nodeid, name); + crm_get_peer(nodeid, name); + } + + if (nodeid > 0 && name != NULL) { + any = true; + + if (xml_parent) { + xmlNode *node = create_xml_node(xml_parent, XML_CIB_TAG_NODE); + + crm_xml_set_id(node, "%u", nodeid); + crm_xml_add(node, XML_ATTR_UNAME, name); + } + } + + free(name); + } +bail: + cmap_finalize(cmap_handle); + return any; +} + +/*! 
+ * \internal + * \brief Get cluster name from Corosync configuration (via CMAP) + * + * \return Newly allocated string with cluster name if configured, or NULL + */ +char * +pcmk__corosync_cluster_name(void) +{ + cmap_handle_t handle; + char *cluster_name = NULL; + cs_error_t rc = CS_OK; + int fd = -1; + uid_t found_uid = 0; + gid_t found_gid = 0; + pid_t found_pid = 0; + int rv; + + rc = pcmk__init_cmap(&handle); + if (rc != CS_OK) { + crm_info("Failed to initialize the cmap API: %s (%d)", + cs_strerror(rc), rc); + return NULL; + } + + rc = cmap_fd_get(handle, &fd); + if (rc != CS_OK) { + crm_err("Could not obtain the CMAP API connection: %s (%d)", + cs_strerror(rc), rc); + goto bail; + } + + /* CMAP provider run as root (in given user namespace, anyway)? */ + if (!(rv = crm_ipc_is_authentic_process(fd, (uid_t) 0,(gid_t) 0, &found_pid, + &found_uid, &found_gid))) { + crm_err("CMAP provider is not authentic:" + " process %lld (uid: %lld, gid: %lld)", + (long long) PCMK__SPECIAL_PID_AS_0(found_pid), + (long long) found_uid, (long long) found_gid); + goto bail; + } else if (rv < 0) { + crm_err("Could not verify authenticity of CMAP provider: %s (%d)", + strerror(-rv), -rv); + goto bail; + } + + rc = cmap_get_string(handle, "totem.cluster_name", &cluster_name); + if (rc != CS_OK) { + crm_info("Cannot get totem.cluster_name: %s (%d)", cs_strerror(rc), rc); + + } else { + crm_debug("cmap totem.cluster_name = '%s'", cluster_name); + } + +bail: + cmap_finalize(handle); + return cluster_name; +} + +/*! 
+ * \internal + * \brief Check (via CMAP) whether Corosync configuration has a node list + * + * \return true if Corosync has node list, otherwise false + */ +bool +pcmk__corosync_has_nodelist(void) +{ + cs_error_t cs_rc = CS_OK; + int retries = 0; + cmap_handle_t cmap_handle; + cmap_iter_handle_t iter_handle; + char key_name[CMAP_KEYNAME_MAXLEN + 1]; + int fd = -1; + uid_t found_uid = 0; + gid_t found_gid = 0; + pid_t found_pid = 0; + int rc = pcmk_ok; + + static bool got_result = false; + static bool result = false; + + if (got_result) { + return result; + } + + // Connect to CMAP + do { + cs_rc = pcmk__init_cmap(&cmap_handle); + if (cs_rc != CS_OK) { + retries++; + crm_debug("CMAP connection failed: %s (rc=%d, retrying in %ds)", + cs_strerror(cs_rc), cs_rc, retries); + sleep(retries); + } + } while ((retries < 5) && (cs_rc != CS_OK)); + if (cs_rc != CS_OK) { + crm_warn("Assuming Corosync does not have node list: " + "CMAP connection failed (%s) " CRM_XS " rc=%d", + cs_strerror(cs_rc), cs_rc); + return false; + } + + // Get CMAP connection file descriptor + cs_rc = cmap_fd_get(cmap_handle, &fd); + if (cs_rc != CS_OK) { + crm_warn("Assuming Corosync does not have node list: " + "CMAP unusable (%s) " CRM_XS " rc=%d", + cs_strerror(cs_rc), cs_rc); + goto bail; + } + + // Check whether CMAP connection is authentic (i.e. 
provided by root) + rc = crm_ipc_is_authentic_process(fd, (uid_t) 0, (gid_t) 0, + &found_pid, &found_uid, &found_gid); + if (rc == 0) { + crm_warn("Assuming Corosync does not have node list: " + "CMAP provider is inauthentic " + CRM_XS " pid=%lld uid=%lld gid=%lld", + (long long) PCMK__SPECIAL_PID_AS_0(found_pid), + (long long) found_uid, (long long) found_gid); + goto bail; + } else if (rc < 0) { + crm_warn("Assuming Corosync does not have node list: " + "Could not verify CMAP authenticity (%s) " CRM_XS " rc=%d", + pcmk_strerror(rc), rc); + goto bail; + } + + // Check whether nodelist section is presetn + cs_rc = cmap_iter_init(cmap_handle, "nodelist", &iter_handle); + if (cs_rc != CS_OK) { + crm_warn("Assuming Corosync does not have node list: " + "CMAP not readable (%s) " CRM_XS " rc=%d", + cs_strerror(cs_rc), cs_rc); + goto bail; + } + + cs_rc = cmap_iter_next(cmap_handle, iter_handle, key_name, NULL, NULL); + if (cs_rc == CS_OK) { + result = true; + } + + cmap_iter_finalize(cmap_handle, iter_handle); + got_result = true; + crm_debug("Corosync %s node list", (result? "has" : "does not have")); + +bail: + cmap_finalize(cmap_handle); + return result; +} diff --git a/lib/cluster/cpg.c b/lib/cluster/cpg.c new file mode 100644 index 0000000..2af4a50 --- /dev/null +++ b/lib/cluster/cpg.c @@ -0,0 +1,1092 @@ +/* + * Copyright 2004-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * + * This source code is licensed under the GNU Lesser General Public License + * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. 
+ */ + +#include <crm_internal.h> +#include <bzlib.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <netdb.h> + +#include <crm/common/ipc.h> +#include <crm/cluster/internal.h> +#include <crm/common/mainloop.h> +#include <sys/utsname.h> + +#include <qb/qbipc_common.h> +#include <qb/qbipcc.h> +#include <qb/qbutil.h> + +#include <corosync/corodefs.h> +#include <corosync/corotypes.h> +#include <corosync/hdb.h> +#include <corosync/cpg.h> + +#include <crm/msg_xml.h> + +#include <crm/common/ipc_internal.h> /* PCMK__SPECIAL_PID* */ +#include "crmcluster_private.h" + +/* @TODO Once we can update the public API to require crm_cluster_t* in more + * functions, we can ditch this in favor of cluster->cpg_handle. + */ +static cpg_handle_t pcmk_cpg_handle = 0; + +// @TODO These could be moved to crm_cluster_t* at that time as well +static bool cpg_evicted = false; +static GList *cs_message_queue = NULL; +static int cs_message_timer = 0; + +struct pcmk__cpg_host_s { + uint32_t id; + uint32_t pid; + gboolean local; + enum crm_ais_msg_types type; + uint32_t size; + char uname[MAX_NAME]; +} __attribute__ ((packed)); + +typedef struct pcmk__cpg_host_s pcmk__cpg_host_t; + +struct pcmk__cpg_msg_s { + struct qb_ipc_response_header header __attribute__ ((aligned(8))); + uint32_t id; + gboolean is_compressed; + + pcmk__cpg_host_t host; + pcmk__cpg_host_t sender; + + uint32_t size; + uint32_t compressed_size; + /* 584 bytes */ + char data[0]; + +} __attribute__ ((packed)); + +typedef struct pcmk__cpg_msg_s pcmk__cpg_msg_t; + +static void crm_cs_flush(gpointer data); + +#define msg_data_len(msg) (msg->is_compressed?msg->compressed_size:msg->size) + +#define cs_repeat(rc, counter, max, code) do { \ + rc = code; \ + if ((rc == CS_ERR_TRY_AGAIN) || (rc == CS_ERR_QUEUE_FULL)) { \ + counter++; \ + crm_debug("Retrying operation after %ds", counter); \ + sleep(counter); \ + } else { \ + break; \ + } \ + } while (counter < max) + +/*! 
+ * \brief Disconnect from Corosync CPG + * + * \param[in,out] cluster Cluster to disconnect + */ +void +cluster_disconnect_cpg(crm_cluster_t *cluster) +{ + pcmk_cpg_handle = 0; + if (cluster->cpg_handle) { + crm_trace("Disconnecting CPG"); + cpg_leave(cluster->cpg_handle, &cluster->group); + cpg_finalize(cluster->cpg_handle); + cluster->cpg_handle = 0; + + } else { + crm_info("No CPG connection"); + } +} + +/*! + * \brief Get the local Corosync node ID (via CPG) + * + * \param[in] handle CPG connection to use (or 0 to use new connection) + * + * \return Corosync ID of local node (or 0 if not known) + */ +uint32_t +get_local_nodeid(cpg_handle_t handle) +{ + cs_error_t rc = CS_OK; + int retries = 0; + static uint32_t local_nodeid = 0; + cpg_handle_t local_handle = handle; + cpg_model_v1_data_t cpg_model_info = {CPG_MODEL_V1, NULL, NULL, NULL, 0}; + int fd = -1; + uid_t found_uid = 0; + gid_t found_gid = 0; + pid_t found_pid = 0; + int rv; + + if(local_nodeid != 0) { + return local_nodeid; + } + + if(handle == 0) { + crm_trace("Creating connection"); + cs_repeat(rc, retries, 5, cpg_model_initialize(&local_handle, CPG_MODEL_V1, (cpg_model_data_t *)&cpg_model_info, NULL)); + if (rc != CS_OK) { + crm_err("Could not connect to the CPG API: %s (%d)", + cs_strerror(rc), rc); + return 0; + } + + rc = cpg_fd_get(local_handle, &fd); + if (rc != CS_OK) { + crm_err("Could not obtain the CPG API connection: %s (%d)", + cs_strerror(rc), rc); + goto bail; + } + + /* CPG provider run as root (in given user namespace, anyway)? 
*/ + if (!(rv = crm_ipc_is_authentic_process(fd, (uid_t) 0,(gid_t) 0, &found_pid, + &found_uid, &found_gid))) { + crm_err("CPG provider is not authentic:" + " process %lld (uid: %lld, gid: %lld)", + (long long) PCMK__SPECIAL_PID_AS_0(found_pid), + (long long) found_uid, (long long) found_gid); + goto bail; + } else if (rv < 0) { + crm_err("Could not verify authenticity of CPG provider: %s (%d)", + strerror(-rv), -rv); + goto bail; + } + } + + if (rc == CS_OK) { + retries = 0; + crm_trace("Performing lookup"); + cs_repeat(rc, retries, 5, cpg_local_get(local_handle, &local_nodeid)); + } + + if (rc != CS_OK) { + crm_err("Could not get local node id from the CPG API: %s (%d)", + pcmk__cs_err_str(rc), rc); + } + +bail: + if(handle == 0) { + crm_trace("Closing connection"); + cpg_finalize(local_handle); + } + crm_debug("Local nodeid is %u", local_nodeid); + return local_nodeid; +} + +/*! + * \internal + * \brief Callback function for Corosync message queue timer + * + * \param[in] data CPG handle + * + * \return FALSE (to indicate to glib that timer should not be removed) + */ +static gboolean +crm_cs_flush_cb(gpointer data) +{ + cs_message_timer = 0; + crm_cs_flush(data); + return FALSE; +} + +// Send no more than this many CPG messages in one flush +#define CS_SEND_MAX 200 + +/*! 
 * \internal
 * \brief Send messages in Corosync CPG message queue
 *
 * Drains up to CS_SEND_MAX entries from the file-scope cs_message_queue via
 * cpg_mcast_joined(). If anything remains queued (send failure or more than
 * CS_SEND_MAX messages), a one-shot glib timer is armed to retry later.
 *
 * \param[in] data  CPG handle (as cpg_handle_t *)
 */
static void
crm_cs_flush(gpointer data)
{
    unsigned int sent = 0;
    guint queue_len = 0;
    cs_error_t rc = 0;
    cpg_handle_t *handle = (cpg_handle_t *) data;

    // A zeroed handle means the CPG connection was torn down
    if (*handle == 0) {
        crm_trace("Connection is dead");
        return;
    }

    /* Log growth at escalating severities: warn once the queue reaches one
     * full batch, error at every multiple of 1000
     */
    queue_len = g_list_length(cs_message_queue);
    if (((queue_len % 1000) == 0) && (queue_len > 1)) {
        crm_err("CPG queue has grown to %d", queue_len);

    } else if (queue_len == CS_SEND_MAX) {
        crm_warn("CPG queue has grown to %d", queue_len);
    }

    if (cs_message_timer != 0) {
        /* There is already a timer, wait until it goes off */
        crm_trace("Timer active %d", cs_message_timer);
        return;
    }

    while ((cs_message_queue != NULL) && (sent < CS_SEND_MAX)) {
        struct iovec *iov = cs_message_queue->data;

        rc = cpg_mcast_joined(*handle, CPG_TYPE_AGREED, iov, 1);
        if (rc != CS_OK) {
            // Stop on first failure; leftovers are retried via the timer below
            break;
        }

        sent++;
        crm_trace("CPG message sent, size=%llu",
                  (unsigned long long) iov->iov_len);

        // Queue entries own both the iovec and its payload
        cs_message_queue = g_list_remove(cs_message_queue, iov);
        free(iov->iov_base);
        free(iov);
    }

    queue_len -= sent;
    do_crm_log((queue_len > 5)? LOG_INFO : LOG_TRACE,
               "Sent %u CPG message%s (%d still queued): %s (rc=%d)",
               sent, pcmk__plural_s(sent), queue_len, pcmk__cs_err_str(rc),
               (int) rc);

    if (cs_message_queue) {
        uint32_t delay_ms = 100;
        if (rc != CS_OK) {
            /* Proportionally more if sending failed but cap at 1s */
            delay_ms = QB_MIN(1000, CS_SEND_MAX + (10 * queue_len));
        }
        // crm_cs_flush_cb() clears cs_message_timer and re-enters this function
        cs_message_timer = g_timeout_add(delay_ms, crm_cs_flush_cb, data);
    }
}

/*!
 * \internal
 * \brief Dispatch function for CPG handle
 *
 * \param[in,out] user_data  Cluster object
 *
 * \return 0 on success, -1 on error (per mainloop_io_t interface)
 */
static int
pcmk_cpg_dispatch(gpointer user_data)
{
    cs_error_t rc = CS_OK;
    crm_cluster_t *cluster = (crm_cluster_t *) user_data;

    // Process exactly one pending CPG event per dispatch call
    rc = cpg_dispatch(cluster->cpg_handle, CS_DISPATCH_ONE);
    if (rc != CS_OK) {
        crm_err("Connection to the CPG API failed: %s (%d)",
                pcmk__cs_err_str(rc), rc);
        // Invalidate the handle so callers don't reuse a dead connection
        cpg_finalize(cluster->cpg_handle);
        cluster->cpg_handle = 0;
        return -1;

    } else if (cpg_evicted) {
        // Set by pcmk_cpg_membership() when the local node leaves the group
        crm_err("Evicted from CPG membership");
        return -1;
    }
    return 0;
}

/*!
 * \internal
 * \brief Get a log-friendly string describing a CPG message destination
 *
 * \param[in] host  CPG host structure (local flag, uname, and size fields)
 *
 * \return "local", the host's uname, or "<all>" for a broadcast (size == 0)
 */
static inline const char *
ais_dest(const pcmk__cpg_host_t *host)
{
    if (host->local) {
        return "local";
    } else if (host->size > 0) {
        return host->uname;
    } else {
        return "<all>";
    }
}

/*!
 * \internal
 * \brief Map a message type enum value to a readable daemon name
 *
 * \param[in] type  Message type to map
 *
 * \return String equivalent of \p type ("unknown" if unrecognized)
 */
static inline const char *
msg_type2text(enum crm_ais_msg_types type)
{
    const char *text = "unknown";

    switch (type) {
        case crm_msg_none:
            text = "unknown";
            break;
        case crm_msg_ais:
            text = "ais";
            break;
        case crm_msg_cib:
            text = "cib";
            break;
        case crm_msg_crmd:
            text = "crmd";
            break;
        case crm_msg_pe:
            text = "pengine";
            break;
        case crm_msg_te:
            text = "tengine";
            break;
        case crm_msg_lrmd:
            text = "lrmd";
            break;
        case crm_msg_attrd:
            text = "attrd";
            break;
        case crm_msg_stonithd:
            text = "stonithd";
            break;
        case crm_msg_stonith_ng:
            text = "stonith-ng";
            break;
    }
    return text;
}

/*!
+ * \internal + * \brief Check whether a Corosync CPG message is valid + * + * \param[in] msg Corosync CPG message to check + * + * \return true if \p msg is valid, otherwise false + */ +static bool +check_message_sanity(const pcmk__cpg_msg_t *msg) +{ + int32_t payload_size = msg->header.size - sizeof(pcmk__cpg_msg_t); + + if (payload_size < 1) { + crm_err("%sCPG message %d from %s invalid: " + "Claimed size of %d bytes is too small " + CRM_XS " from %s[%u] to %s@%s", + (msg->is_compressed? "Compressed " : ""), + msg->id, ais_dest(&(msg->sender)), + (int) msg->header.size, + msg_type2text(msg->sender.type), msg->sender.pid, + msg_type2text(msg->host.type), ais_dest(&(msg->host))); + return false; + } + + if (msg->header.error != CS_OK) { + crm_err("%sCPG message %d from %s invalid: " + "Sender indicated error %d " + CRM_XS " from %s[%u] to %s@%s", + (msg->is_compressed? "Compressed " : ""), + msg->id, ais_dest(&(msg->sender)), + msg->header.error, + msg_type2text(msg->sender.type), msg->sender.pid, + msg_type2text(msg->host.type), ais_dest(&(msg->host))); + return false; + } + + if (msg_data_len(msg) != payload_size) { + crm_err("%sCPG message %d from %s invalid: " + "Total size %d inconsistent with payload size %d " + CRM_XS " from %s[%u] to %s@%s", + (msg->is_compressed? 
"Compressed " : ""), + msg->id, ais_dest(&(msg->sender)), + (int) msg->header.size, (int) msg_data_len(msg), + msg_type2text(msg->sender.type), msg->sender.pid, + msg_type2text(msg->host.type), ais_dest(&(msg->host))); + return false; + } + + if (!msg->is_compressed && + /* msg->size != (strlen(msg->data) + 1) would be a stronger check, + * but checking the last byte or two should be quick + */ + (((msg->size > 1) && (msg->data[msg->size - 2] == '\0')) + || (msg->data[msg->size - 1] != '\0'))) { + crm_err("CPG message %d from %s invalid: " + "Payload does not end at byte %llu " + CRM_XS " from %s[%u] to %s@%s", + msg->id, ais_dest(&(msg->sender)), + (unsigned long long) msg->size, + msg_type2text(msg->sender.type), msg->sender.pid, + msg_type2text(msg->host.type), ais_dest(&(msg->host))); + return false; + } + + crm_trace("Verified %d-byte %sCPG message %d from %s[%u]@%s to %s@%s", + (int) msg->header.size, (msg->is_compressed? "compressed " : ""), + msg->id, msg_type2text(msg->sender.type), msg->sender.pid, + ais_dest(&(msg->sender)), + msg_type2text(msg->host.type), ais_dest(&(msg->host))); + return true; +} + +/*! + * \brief Extract text data from a Corosync CPG message + * + * \param[in] handle CPG connection (to get local node ID if not known) + * \param[in] nodeid Corosync ID of node that sent message + * \param[in] pid Process ID of message sender (for logging only) + * \param[in,out] content CPG message + * \param[out] kind If not NULL, will be set to CPG header ID + * (which should be an enum crm_ais_msg_class value, + * currently always crm_class_cluster) + * \param[out] from If not NULL, will be set to sender uname + * (valid for the lifetime of \p content) + * + * \return Newly allocated string with message data + * \note It is the caller's responsibility to free the return value with free(). 
+ */ +char * +pcmk_message_common_cs(cpg_handle_t handle, uint32_t nodeid, uint32_t pid, void *content, + uint32_t *kind, const char **from) +{ + char *data = NULL; + pcmk__cpg_msg_t *msg = (pcmk__cpg_msg_t *) content; + + if(handle) { + // Do filtering and field massaging + uint32_t local_nodeid = get_local_nodeid(handle); + const char *local_name = get_local_node_name(); + + if (msg->sender.id > 0 && msg->sender.id != nodeid) { + crm_err("Nodeid mismatch from %d.%d: claimed nodeid=%u", nodeid, pid, msg->sender.id); + return NULL; + + } else if (msg->host.id != 0 && (local_nodeid != msg->host.id)) { + /* Not for us */ + crm_trace("Not for us: %u != %u", msg->host.id, local_nodeid); + return NULL; + } else if (msg->host.size != 0 && !pcmk__str_eq(msg->host.uname, local_name, pcmk__str_casei)) { + /* Not for us */ + crm_trace("Not for us: %s != %s", msg->host.uname, local_name); + return NULL; + } + + msg->sender.id = nodeid; + if (msg->sender.size == 0) { + crm_node_t *peer = crm_get_peer(nodeid, NULL); + + if (peer == NULL) { + crm_err("Peer with nodeid=%u is unknown", nodeid); + + } else if (peer->uname == NULL) { + crm_err("No uname for peer with nodeid=%u", nodeid); + + } else { + crm_notice("Fixing uname for peer with nodeid=%u", nodeid); + msg->sender.size = strlen(peer->uname); + memset(msg->sender.uname, 0, MAX_NAME); + memcpy(msg->sender.uname, peer->uname, msg->sender.size); + } + } + } + + crm_trace("Got new%s message (size=%d, %d, %d)", + msg->is_compressed ? 
" compressed" : "", + msg_data_len(msg), msg->size, msg->compressed_size); + + if (kind != NULL) { + *kind = msg->header.id; + } + if (from != NULL) { + *from = msg->sender.uname; + } + + if (msg->is_compressed && msg->size > 0) { + int rc = BZ_OK; + char *uncompressed = NULL; + unsigned int new_size = msg->size + 1; + + if (!check_message_sanity(msg)) { + goto badmsg; + } + + crm_trace("Decompressing message data"); + uncompressed = calloc(1, new_size); + rc = BZ2_bzBuffToBuffDecompress(uncompressed, &new_size, msg->data, msg->compressed_size, 1, 0); + + if (rc != BZ_OK) { + crm_err("Decompression failed: %s " CRM_XS " bzerror=%d", + bz2_strerror(rc), rc); + free(uncompressed); + goto badmsg; + } + + CRM_ASSERT(rc == BZ_OK); + CRM_ASSERT(new_size == msg->size); + + data = uncompressed; + + } else if (!check_message_sanity(msg)) { + goto badmsg; + + } else { + data = strdup(msg->data); + } + + // Is this necessary? + crm_get_peer(msg->sender.id, msg->sender.uname); + + crm_trace("Payload: %.200s", data); + return data; + + badmsg: + crm_err("Invalid message (id=%d, dest=%s:%s, from=%s:%s.%d):" + " min=%d, total=%d, size=%d, bz2_size=%d", + msg->id, ais_dest(&(msg->host)), msg_type2text(msg->host.type), + ais_dest(&(msg->sender)), msg_type2text(msg->sender.type), + msg->sender.pid, (int)sizeof(pcmk__cpg_msg_t), + msg->header.size, msg->size, msg->compressed_size); + + free(data); + return NULL; +} + +/*! 
+ * \internal + * \brief Compare cpg_address objects by node ID + * + * \param[in] first First cpg_address structure to compare + * \param[in] second Second cpg_address structure to compare + * + * \return Negative number if first's node ID is lower, + * positive number if first's node ID is greater, + * or 0 if both node IDs are equal + */ +static int +cmp_member_list_nodeid(const void *first, const void *second) +{ + const struct cpg_address *const a = *((const struct cpg_address **) first), + *const b = *((const struct cpg_address **) second); + if (a->nodeid < b->nodeid) { + return -1; + } else if (a->nodeid > b->nodeid) { + return 1; + } + /* don't bother with "reason" nor "pid" */ + return 0; +} + +/*! + * \internal + * \brief Get a readable string equivalent of a cpg_reason_t value + * + * \param[in] reason CPG reason value + * + * \return Readable string suitable for logging + */ +static const char * +cpgreason2str(cpg_reason_t reason) +{ + switch (reason) { + case CPG_REASON_JOIN: return " via cpg_join"; + case CPG_REASON_LEAVE: return " via cpg_leave"; + case CPG_REASON_NODEDOWN: return " via cluster exit"; + case CPG_REASON_NODEUP: return " via cluster join"; + case CPG_REASON_PROCDOWN: return " for unknown reason"; + default: break; + } + return ""; +} + +/*! + * \internal + * \brief Get a log-friendly node name + * + * \param[in] peer Node to check + * + * \return Node's uname, or readable string if not known + */ +static inline const char * +peer_name(const crm_node_t *peer) +{ + if (peer == NULL) { + return "unknown node"; + } else if (peer->uname == NULL) { + return "peer node"; + } else { + return peer->uname; + } +} + +/*! 
+ * \internal + * \brief Process a CPG peer's leaving the cluster + * + * \param[in] cpg_group_name CPG group name (for logging) + * \param[in] event_counter Event number (for logging) + * \param[in] local_nodeid Node ID of local node + * \param[in] cpg_peer CPG peer that left + * \param[in] sorted_member_list List of remaining members, qsort()-ed by ID + * \param[in] member_list_entries Number of entries in \p sorted_member_list + */ +static void +node_left(const char *cpg_group_name, int event_counter, + uint32_t local_nodeid, const struct cpg_address *cpg_peer, + const struct cpg_address **sorted_member_list, + size_t member_list_entries) +{ + crm_node_t *peer = pcmk__search_cluster_node_cache(cpg_peer->nodeid, + NULL); + const struct cpg_address **rival = NULL; + + /* Most CPG-related Pacemaker code assumes that only one process on a node + * can be in the process group, but Corosync does not impose this + * limitation, and more than one can be a member in practice due to a + * daemon attempting to start while another instance is already running. + * + * Check for any such duplicate instances, because we don't want to process + * their leaving as if our actual peer left. If the peer that left still has + * an entry in sorted_member_list (with a different PID), we will ignore the + * leaving. + * + * @TODO Track CPG members' PIDs so we can tell exactly who left. 
+ */ + if (peer != NULL) { + rival = bsearch(&cpg_peer, sorted_member_list, member_list_entries, + sizeof(const struct cpg_address *), + cmp_member_list_nodeid); + } + + if (rival == NULL) { + crm_info("Group %s event %d: %s (node %u pid %u) left%s", + cpg_group_name, event_counter, peer_name(peer), + cpg_peer->nodeid, cpg_peer->pid, + cpgreason2str(cpg_peer->reason)); + if (peer != NULL) { + crm_update_peer_proc(__func__, peer, crm_proc_cpg, + OFFLINESTATUS); + } + } else if (cpg_peer->nodeid == local_nodeid) { + crm_warn("Group %s event %d: duplicate local pid %u left%s", + cpg_group_name, event_counter, + cpg_peer->pid, cpgreason2str(cpg_peer->reason)); + } else { + crm_warn("Group %s event %d: " + "%s (node %u) duplicate pid %u left%s (%u remains)", + cpg_group_name, event_counter, peer_name(peer), + cpg_peer->nodeid, cpg_peer->pid, + cpgreason2str(cpg_peer->reason), (*rival)->pid); + } +} + +/*! + * \brief Handle a CPG configuration change event + * + * \param[in] handle CPG connection + * \param[in] cpg_name CPG group name + * \param[in] member_list List of current CPG members + * \param[in] member_list_entries Number of entries in \p member_list + * \param[in] left_list List of CPG members that left + * \param[in] left_list_entries Number of entries in \p left_list + * \param[in] joined_list List of CPG members that joined + * \param[in] joined_list_entries Number of entries in \p joined_list + */ +void +pcmk_cpg_membership(cpg_handle_t handle, + const struct cpg_name *groupName, + const struct cpg_address *member_list, size_t member_list_entries, + const struct cpg_address *left_list, size_t left_list_entries, + const struct cpg_address *joined_list, size_t joined_list_entries) +{ + int i; + gboolean found = FALSE; + static int counter = 0; + uint32_t local_nodeid = get_local_nodeid(handle); + const struct cpg_address **sorted; + + sorted = malloc(member_list_entries * sizeof(const struct cpg_address *)); + CRM_ASSERT(sorted != NULL); + + for (size_t iter = 
0; iter < member_list_entries; iter++) { + sorted[iter] = member_list + iter; + } + /* so that the cross-matching multiply-subscribed nodes is then cheap */ + qsort(sorted, member_list_entries, sizeof(const struct cpg_address *), + cmp_member_list_nodeid); + + for (i = 0; i < left_list_entries; i++) { + node_left(groupName->value, counter, local_nodeid, &left_list[i], + sorted, member_list_entries); + } + free(sorted); + sorted = NULL; + + for (i = 0; i < joined_list_entries; i++) { + crm_info("Group %s event %d: node %u pid %u joined%s", + groupName->value, counter, joined_list[i].nodeid, + joined_list[i].pid, cpgreason2str(joined_list[i].reason)); + } + + for (i = 0; i < member_list_entries; i++) { + crm_node_t *peer = crm_get_peer(member_list[i].nodeid, NULL); + + if (member_list[i].nodeid == local_nodeid + && member_list[i].pid != getpid()) { + // See the note in node_left() + crm_warn("Group %s event %d: detected duplicate local pid %u", + groupName->value, counter, member_list[i].pid); + continue; + } + crm_info("Group %s event %d: %s (node %u pid %u) is member", + groupName->value, counter, peer_name(peer), + member_list[i].nodeid, member_list[i].pid); + + /* If the caller left auto-reaping enabled, this will also update the + * state to member. + */ + peer = crm_update_peer_proc(__func__, peer, crm_proc_cpg, + ONLINESTATUS); + + if (peer && peer->state && strcmp(peer->state, CRM_NODE_MEMBER)) { + /* The node is a CPG member, but we currently think it's not a + * cluster member. This is possible only if auto-reaping was + * disabled. The node may be joining, and we happened to get the CPG + * notification before the quorum notification; or the node may have + * just died, and we are processing its final messages; or a bug + * has affected the peer cache. 
+ */ + time_t now = time(NULL); + + if (peer->when_lost == 0) { + // Track when we first got into this contradictory state + peer->when_lost = now; + + } else if (now > (peer->when_lost + 60)) { + // If it persists for more than a minute, update the state + crm_warn("Node %u is member of group %s but was believed offline", + member_list[i].nodeid, groupName->value); + pcmk__update_peer_state(__func__, peer, CRM_NODE_MEMBER, 0); + } + } + + if (local_nodeid == member_list[i].nodeid) { + found = TRUE; + } + } + + if (!found) { + crm_err("Local node was evicted from group %s", groupName->value); + cpg_evicted = true; + } + + counter++; +} + +/*! + * \brief Connect to Corosync CPG + * + * \param[in,out] cluster Cluster object + * + * \return TRUE on success, otherwise FALSE + */ +gboolean +cluster_connect_cpg(crm_cluster_t *cluster) +{ + cs_error_t rc; + int fd = -1; + int retries = 0; + uint32_t id = 0; + crm_node_t *peer = NULL; + cpg_handle_t handle = 0; + const char *message_name = pcmk__message_name(crm_system_name); + uid_t found_uid = 0; + gid_t found_gid = 0; + pid_t found_pid = 0; + int rv; + + struct mainloop_fd_callbacks cpg_fd_callbacks = { + .dispatch = pcmk_cpg_dispatch, + .destroy = cluster->destroy, + }; + + cpg_model_v1_data_t cpg_model_info = { + .model = CPG_MODEL_V1, + .cpg_deliver_fn = cluster->cpg.cpg_deliver_fn, + .cpg_confchg_fn = cluster->cpg.cpg_confchg_fn, + .cpg_totem_confchg_fn = NULL, + .flags = 0, + }; + + cpg_evicted = false; + cluster->group.length = 0; + cluster->group.value[0] = 0; + + /* group.value is char[128] */ + strncpy(cluster->group.value, message_name, 127); + cluster->group.value[127] = 0; + cluster->group.length = 1 + QB_MIN(127, strlen(cluster->group.value)); + + cs_repeat(rc, retries, 30, cpg_model_initialize(&handle, CPG_MODEL_V1, (cpg_model_data_t *)&cpg_model_info, NULL)); + if (rc != CS_OK) { + crm_err("Could not connect to the CPG API: %s (%d)", + cs_strerror(rc), rc); + goto bail; + } + + rc = cpg_fd_get(handle, 
&fd); + if (rc != CS_OK) { + crm_err("Could not obtain the CPG API connection: %s (%d)", + cs_strerror(rc), rc); + goto bail; + } + + /* CPG provider run as root (in given user namespace, anyway)? */ + if (!(rv = crm_ipc_is_authentic_process(fd, (uid_t) 0,(gid_t) 0, &found_pid, + &found_uid, &found_gid))) { + crm_err("CPG provider is not authentic:" + " process %lld (uid: %lld, gid: %lld)", + (long long) PCMK__SPECIAL_PID_AS_0(found_pid), + (long long) found_uid, (long long) found_gid); + rc = CS_ERR_ACCESS; + goto bail; + } else if (rv < 0) { + crm_err("Could not verify authenticity of CPG provider: %s (%d)", + strerror(-rv), -rv); + rc = CS_ERR_ACCESS; + goto bail; + } + + id = get_local_nodeid(handle); + if (id == 0) { + crm_err("Could not get local node id from the CPG API"); + goto bail; + + } + cluster->nodeid = id; + + retries = 0; + cs_repeat(rc, retries, 30, cpg_join(handle, &cluster->group)); + if (rc != CS_OK) { + crm_err("Could not join the CPG group '%s': %d", message_name, rc); + goto bail; + } + + pcmk_cpg_handle = handle; + cluster->cpg_handle = handle; + mainloop_add_fd("corosync-cpg", G_PRIORITY_MEDIUM, fd, cluster, &cpg_fd_callbacks); + + bail: + if (rc != CS_OK) { + cpg_finalize(handle); + return FALSE; + } + + peer = crm_get_peer(id, NULL); + crm_update_peer_proc(__func__, peer, crm_proc_cpg, ONLINESTATUS); + return TRUE; +} + +/*! + * \internal + * \brief Send an XML message via Corosync CPG + * + * \param[in] msg XML message to send + * \param[in] node Cluster node to send message to + * \param[in] dest Type of message to send + * + * \return TRUE on success, otherwise FALSE + */ +gboolean +pcmk__cpg_send_xml(xmlNode *msg, const crm_node_t *node, + enum crm_ais_msg_types dest) +{ + gboolean rc = TRUE; + char *data = NULL; + + data = dump_xml_unformatted(msg); + rc = send_cluster_text(crm_class_cluster, data, FALSE, node, dest); + free(data); + return rc; +} + +/*! 
+ * \internal + * \brief Send string data via Corosync CPG + * + * \param[in] msg_class Message class (to set as CPG header ID) + * \param[in] data Data to send + * \param[in] local What to set as host "local" value (which is never used) + * \param[in] node Cluster node to send message to + * \param[in] dest Type of message to send + * + * \return TRUE on success, otherwise FALSE + */ +gboolean +send_cluster_text(enum crm_ais_msg_class msg_class, const char *data, + gboolean local, const crm_node_t *node, + enum crm_ais_msg_types dest) +{ + static int msg_id = 0; + static int local_pid = 0; + static int local_name_len = 0; + static const char *local_name = NULL; + + char *target = NULL; + struct iovec *iov; + pcmk__cpg_msg_t *msg = NULL; + enum crm_ais_msg_types sender = text2msg_type(crm_system_name); + + switch (msg_class) { + case crm_class_cluster: + break; + default: + crm_err("Invalid message class: %d", msg_class); + return FALSE; + } + + CRM_CHECK(dest != crm_msg_ais, return FALSE); + + if (local_name == NULL) { + local_name = get_local_node_name(); + } + if ((local_name_len == 0) && (local_name != NULL)) { + local_name_len = strlen(local_name); + } + + if (data == NULL) { + data = ""; + } + + if (local_pid == 0) { + local_pid = getpid(); + } + + if (sender == crm_msg_none) { + sender = local_pid; + } + + msg = calloc(1, sizeof(pcmk__cpg_msg_t)); + + msg_id++; + msg->id = msg_id; + msg->header.id = msg_class; + msg->header.error = CS_OK; + + msg->host.type = dest; + msg->host.local = local; + + if (node) { + if (node->uname) { + target = strdup(node->uname); + msg->host.size = strlen(node->uname); + memset(msg->host.uname, 0, MAX_NAME); + memcpy(msg->host.uname, node->uname, msg->host.size); + } else { + target = crm_strdup_printf("%u", node->id); + } + msg->host.id = node->id; + } else { + target = strdup("all"); + } + + msg->sender.id = 0; + msg->sender.type = sender; + msg->sender.pid = local_pid; + msg->sender.size = local_name_len; + 
memset(msg->sender.uname, 0, MAX_NAME); + if ((local_name != NULL) && (msg->sender.size != 0)) { + memcpy(msg->sender.uname, local_name, msg->sender.size); + } + + msg->size = 1 + strlen(data); + msg->header.size = sizeof(pcmk__cpg_msg_t) + msg->size; + + if (msg->size < CRM_BZ2_THRESHOLD) { + msg = pcmk__realloc(msg, msg->header.size); + memcpy(msg->data, data, msg->size); + + } else { + char *compressed = NULL; + unsigned int new_size = 0; + char *uncompressed = strdup(data); + + if (pcmk__compress(uncompressed, (unsigned int) msg->size, 0, + &compressed, &new_size) == pcmk_rc_ok) { + + msg->header.size = sizeof(pcmk__cpg_msg_t) + new_size; + msg = pcmk__realloc(msg, msg->header.size); + memcpy(msg->data, compressed, new_size); + + msg->is_compressed = TRUE; + msg->compressed_size = new_size; + + } else { + // cppcheck seems not to understand the abort logic in pcmk__realloc + // cppcheck-suppress memleak + msg = pcmk__realloc(msg, msg->header.size); + memcpy(msg->data, data, msg->size); + } + + free(uncompressed); + free(compressed); + } + + iov = calloc(1, sizeof(struct iovec)); + iov->iov_base = msg; + iov->iov_len = msg->header.size; + + if (msg->compressed_size) { + crm_trace("Queueing CPG message %u to %s (%llu bytes, %d bytes compressed payload): %.200s", + msg->id, target, (unsigned long long) iov->iov_len, + msg->compressed_size, data); + } else { + crm_trace("Queueing CPG message %u to %s (%llu bytes, %d bytes payload): %.200s", + msg->id, target, (unsigned long long) iov->iov_len, + msg->size, data); + } + free(target); + + cs_message_queue = g_list_append(cs_message_queue, iov); + crm_cs_flush(&pcmk_cpg_handle); + + return TRUE; +} + +/*! 
+ * \brief Get the message type equivalent of a string + * + * \param[in] text String of message type + * + * \return Message type equivalent of \p text + */ +enum crm_ais_msg_types +text2msg_type(const char *text) +{ + int type = crm_msg_none; + + CRM_CHECK(text != NULL, return type); + text = pcmk__message_name(text); + if (pcmk__str_eq(text, "ais", pcmk__str_casei)) { + type = crm_msg_ais; + } else if (pcmk__str_eq(text, CRM_SYSTEM_CIB, pcmk__str_casei)) { + type = crm_msg_cib; + } else if (pcmk__strcase_any_of(text, CRM_SYSTEM_CRMD, CRM_SYSTEM_DC, NULL)) { + type = crm_msg_crmd; + } else if (pcmk__str_eq(text, CRM_SYSTEM_TENGINE, pcmk__str_casei)) { + type = crm_msg_te; + } else if (pcmk__str_eq(text, CRM_SYSTEM_PENGINE, pcmk__str_casei)) { + type = crm_msg_pe; + } else if (pcmk__str_eq(text, CRM_SYSTEM_LRMD, pcmk__str_casei)) { + type = crm_msg_lrmd; + } else if (pcmk__str_eq(text, CRM_SYSTEM_STONITHD, pcmk__str_casei)) { + type = crm_msg_stonithd; + } else if (pcmk__str_eq(text, "stonith-ng", pcmk__str_casei)) { + type = crm_msg_stonith_ng; + } else if (pcmk__str_eq(text, "attrd", pcmk__str_casei)) { + type = crm_msg_attrd; + + } else { + /* This will normally be a transient client rather than + * a cluster daemon. Set the type to the pid of the client + */ + int scan_rc = sscanf(text, "%d", &type); + + if (scan_rc != 1 || type <= crm_msg_stonith_ng) { + /* Ensure it's sane */ + type = crm_msg_none; + } + } + return type; +} diff --git a/lib/cluster/crmcluster_private.h b/lib/cluster/crmcluster_private.h new file mode 100644 index 0000000..6933b73 --- /dev/null +++ b/lib/cluster/crmcluster_private.h @@ -0,0 +1,47 @@ +/* + * Copyright 2020-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * + * This source code is licensed under the GNU Lesser General Public License + * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. 
+ */ + +#ifndef PCMK__CRMCLUSTER_PRIVATE__H +# define PCMK__CRMCLUSTER_PRIVATE__H + +/* This header is for the sole use of libcrmcluster, so that functions can be + * declared with G_GNUC_INTERNAL for efficiency. + */ + +#include <stdint.h> // uint32_t, uint64_t + +#include <glib.h> // G_GNUC_INTERNAL, gboolean +#include <libxml/tree.h> // xmlNode + +#include <crm/cluster.h> // cluster_type_e, crm_node_t + +G_GNUC_INTERNAL +enum cluster_type_e pcmk__corosync_detect(void); + +G_GNUC_INTERNAL +bool pcmk__corosync_has_nodelist(void); + +G_GNUC_INTERNAL +char *pcmk__corosync_uuid(const crm_node_t *peer); + +G_GNUC_INTERNAL +char *pcmk__corosync_name(uint64_t /*cmap_handle_t */ cmap_handle, + uint32_t nodeid); + +G_GNUC_INTERNAL +gboolean pcmk__corosync_connect(crm_cluster_t *cluster); + +G_GNUC_INTERNAL +void pcmk__corosync_disconnect(crm_cluster_t *cluster); + +G_GNUC_INTERNAL +gboolean pcmk__cpg_send_xml(xmlNode *msg, const crm_node_t *node, + enum crm_ais_msg_types dest); + +#endif // PCMK__CRMCLUSTER_PRIVATE__H diff --git a/lib/cluster/election.c b/lib/cluster/election.c new file mode 100644 index 0000000..ebbae72 --- /dev/null +++ b/lib/cluster/election.c @@ -0,0 +1,727 @@ +/* + * Copyright 2004-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * + * This source code is licensed under the GNU Lesser General Public License + * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. 
+ */ + +#include <crm_internal.h> + +#include <sys/time.h> +#include <sys/resource.h> + +#include <crm/msg_xml.h> +#include <crm/common/xml.h> + +#include <crm/common/mainloop.h> +#include <crm/cluster/internal.h> +#include <crm/cluster/election_internal.h> +#include <crm/crm.h> + +#define STORM_INTERVAL 2 /* in seconds */ + +struct election_s { + enum election_result state; + guint count; // How many times local node has voted + char *name; // Descriptive name for this election + char *uname; // Local node's name + GSourceFunc cb; // Function to call if election is won + GHashTable *voted; // Key = node name, value = how node voted + mainloop_timer_t *timeout; // When to abort if all votes not received + int election_wins; // Track wins, for storm detection + bool wrote_blackbox; // Write a storm blackbox at most once + time_t expires; // When storm detection period ends + time_t last_election_loss; // When dampening period ends +}; + +static void +election_complete(election_t *e) +{ + e->state = election_won; + if (e->cb != NULL) { + e->cb(e); + } + election_reset(e); +} + +static gboolean +election_timer_cb(gpointer user_data) +{ + election_t *e = user_data; + + crm_info("%s timed out, declaring local node as winner", e->name); + election_complete(e); + return FALSE; +} + +/*! + * \brief Get current state of an election + * + * \param[in] e Election object + * + * \return Current state of \e + */ +enum election_result +election_state(const election_t *e) +{ + return (e == NULL)? election_error : e->state; +} + +/*! + * \brief Create a new election object + * + * Every node that wishes to participate in an election must create an election + * object. Typically, this should be done once, at start-up. A caller should + * only create a single election object. 
+ * + * \param[in] name Label for election (for logging) + * \param[in] uname Local node's name + * \param[in] period_ms How long to wait for all peers to vote + * \param[in] cb Function to call if local node wins election + * + * \return Newly allocated election object on success, NULL on error + * \note The caller is responsible for freeing the returned value using + * election_fini(). + */ +election_t * +election_init(const char *name, const char *uname, guint period_ms, GSourceFunc cb) +{ + election_t *e = NULL; + + static guint count = 0; + + CRM_CHECK(uname != NULL, return NULL); + + e = calloc(1, sizeof(election_t)); + if (e == NULL) { + crm_perror(LOG_CRIT, "Cannot create election"); + return NULL; + } + + e->uname = strdup(uname); + if (e->uname == NULL) { + crm_perror(LOG_CRIT, "Cannot create election"); + free(e); + return NULL; + } + + e->name = name? crm_strdup_printf("election-%s", name) + : crm_strdup_printf("election-%u", count++); + e->cb = cb; + e->timeout = mainloop_timer_add(e->name, period_ms, FALSE, + election_timer_cb, e); + crm_trace("Created %s", e->name); + return e; +} + +/*! + * \brief Disregard any previous vote by specified peer + * + * This discards any recorded vote from a specified peer. Election users should + * call this whenever a voting peer becomes inactive. + * + * \param[in,out] e Election object + * \param[in] uname Name of peer to disregard + */ +void +election_remove(election_t *e, const char *uname) +{ + if ((e != NULL) && (uname != NULL) && (e->voted != NULL)) { + crm_trace("Discarding %s (no-)vote from lost peer %s", e->name, uname); + g_hash_table_remove(e->voted, uname); + } +} + +/*! 
+ * \brief Stop election timer and disregard all votes
+ *
+ * \param[in,out] e    Election object
+ */
+void
+election_reset(election_t *e)
+{
+    if (e != NULL) {
+        crm_trace("Resetting election %s", e->name);
+        mainloop_timer_stop(e->timeout);
+        if (e->voted) {
+            crm_trace("Destroying voted cache with %d members", g_hash_table_size(e->voted));
+            g_hash_table_destroy(e->voted);
+            e->voted = NULL;
+        }
+    }
+}
+
+/*!
+ * \brief Free an election object
+ *
+ * Free all memory associated with an election object, stopping its
+ * election timer (if running).
+ *
+ * \param[in,out] e    Election object
+ */
+void
+election_fini(election_t *e)
+{
+    if (e != NULL) {
+        election_reset(e);
+        crm_trace("Destroying %s", e->name);
+        mainloop_timer_del(e->timeout);
+        free(e->uname);
+        free(e->name);
+        free(e);
+    }
+}
+
+/*!
+ * \internal
+ * \brief Start an election's timer
+ *
+ * Safe to call with a NULL election object (does nothing in that case).
+ *
+ * \param[in,out] e    Election object
+ */
+static void
+election_timeout_start(election_t *e)
+{
+    if (e != NULL) {
+        mainloop_timer_start(e->timeout);
+    }
+}
+
+/*!
+ * \brief Stop an election's timer, if running
+ *
+ * \param[in,out] e    Election object
+ */
+void
+election_timeout_stop(election_t *e)
+{
+    if (e != NULL) {
+        mainloop_timer_stop(e->timeout);
+    }
+}
+
+/*!
+ * \brief Change an election's timeout (restarting timer if running) + * + * \param[in,out] e Election object + * \param[in] period New timeout + */ +void +election_timeout_set_period(election_t *e, guint period) +{ + if (e != NULL) { + mainloop_timer_set_period(e->timeout, period); + } else { + crm_err("No election defined"); + } +} + +static int +get_uptime(struct timeval *output) +{ + static time_t expires = 0; + static struct rusage info; + + time_t tm_now = time(NULL); + + if (expires < tm_now) { + int rc = 0; + + info.ru_utime.tv_sec = 0; + info.ru_utime.tv_usec = 0; + rc = getrusage(RUSAGE_SELF, &info); + + output->tv_sec = 0; + output->tv_usec = 0; + + if (rc < 0) { + crm_perror(LOG_ERR, "Could not calculate the current uptime"); + expires = 0; + return -1; + } + + crm_debug("Current CPU usage is: %lds, %ldus", (long)info.ru_utime.tv_sec, + (long)info.ru_utime.tv_usec); + } + + expires = tm_now + STORM_INTERVAL; /* N seconds after the last _access_ */ + output->tv_sec = info.ru_utime.tv_sec; + output->tv_usec = info.ru_utime.tv_usec; + + return 1; +} + +static int +compare_age(struct timeval your_age) +{ + struct timeval our_age; + + get_uptime(&our_age); /* If an error occurred, our_age will be compared as {0,0} */ + + if (our_age.tv_sec > your_age.tv_sec) { + crm_debug("Win: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec); + return 1; + } else if (our_age.tv_sec < your_age.tv_sec) { + crm_debug("Lose: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec); + return -1; + } else if (our_age.tv_usec > your_age.tv_usec) { + crm_debug("Win: %ld.%06ld vs %ld.%06ld (usec)", + (long)our_age.tv_sec, (long)our_age.tv_usec, (long)your_age.tv_sec, (long)your_age.tv_usec); + return 1; + } else if (our_age.tv_usec < your_age.tv_usec) { + crm_debug("Lose: %ld.%06ld vs %ld.%06ld (usec)", + (long)our_age.tv_sec, (long)our_age.tv_usec, (long)your_age.tv_sec, (long)your_age.tv_usec); + return -1; + } + + return 0; +} + +/*! 
+ * \brief Start a new election by offering local node's candidacy + * + * Broadcast a "vote" election message containing the local node's ID, + * (incremented) election counter, and uptime, and start the election timer. + * + * \param[in,out] e Election object + * + * \note Any nodes agreeing to the candidacy will send a "no-vote" reply, and if + * all active peers do so, or if the election times out, the local node + * wins the election. (If we lose to any peer vote, we will stop the + * timer, so a timeout means we did not lose -- either some peer did not + * vote, or we did not call election_check() in time.) + */ +void +election_vote(election_t *e) +{ + struct timeval age; + xmlNode *vote = NULL; + crm_node_t *our_node; + + if (e == NULL) { + crm_trace("Election vote requested, but no election available"); + return; + } + + our_node = crm_get_peer(0, e->uname); + if ((our_node == NULL) || (crm_is_peer_active(our_node) == FALSE)) { + crm_trace("Cannot vote in %s yet: local node not connected to cluster", + e->name); + return; + } + + election_reset(e); + e->state = election_in_progress; + vote = create_request(CRM_OP_VOTE, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL); + + e->count++; + crm_xml_add(vote, F_CRM_ELECTION_OWNER, our_node->uuid); + crm_xml_add_int(vote, F_CRM_ELECTION_ID, e->count); + + get_uptime(&age); + crm_xml_add_timeval(vote, F_CRM_ELECTION_AGE_S, F_CRM_ELECTION_AGE_US, &age); + + send_cluster_message(NULL, crm_msg_crmd, vote, TRUE); + free_xml(vote); + + crm_debug("Started %s round %d", e->name, e->count); + election_timeout_start(e); + return; +} + +/*! + * \brief Check whether local node has won an election + * + * If all known peers have sent no-vote messages, stop the election timer, set + * the election state to won, and call any registered win callback. 
+ * + * \param[in,out] e Election object + * + * \return TRUE if local node has won, FALSE otherwise + * \note If all known peers have sent no-vote messages, but the election owner + * does not call this function, the election will not be won (and the + * callback will not be called) until the election times out. + * \note This should be called when election_count_vote() returns + * \c election_in_progress. + */ +bool +election_check(election_t *e) +{ + int voted_size = 0; + int num_members = 0; + + if (e == NULL) { + crm_trace("Election check requested, but no election available"); + return FALSE; + } + if (e->voted == NULL) { + crm_trace("%s check requested, but no votes received yet", e->name); + return FALSE; + } + + voted_size = g_hash_table_size(e->voted); + num_members = crm_active_peers(); + + /* in the case of #voted > #members, it is better to + * wait for the timeout and give the cluster time to + * stabilize + */ + if (voted_size >= num_members) { + /* we won and everyone has voted */ + election_timeout_stop(e); + if (voted_size > num_members) { + GHashTableIter gIter; + const crm_node_t *node; + char *key = NULL; + + crm_warn("Received too many votes in %s", e->name); + g_hash_table_iter_init(&gIter, crm_peer_cache); + while (g_hash_table_iter_next(&gIter, NULL, (gpointer *) & node)) { + if (crm_is_peer_active(node)) { + crm_warn("* expected vote: %s", node->uname); + } + } + + g_hash_table_iter_init(&gIter, e->voted); + while (g_hash_table_iter_next(&gIter, (gpointer *) & key, NULL)) { + crm_warn("* actual vote: %s", key); + } + + } + + crm_info("%s won by local node", e->name); + election_complete(e); + return TRUE; + + } else { + crm_debug("%s still waiting on %d of %d votes", + e->name, num_members - voted_size, num_members); + } + + return FALSE; +} + +#define LOSS_DAMPEN 2 /* in seconds */ + +struct vote { + const char *op; + const char *from; + const char *version; + const char *election_owner; + int election_id; + struct timeval age; +}; + +/*! 
+ * \brief Unpack an election message + * + * \param[in] e Election object (for logging only) + * \param[in] message Election message XML + * \param[out] vote Parsed fields from message + * + * \return TRUE if election message and election are valid, FALSE otherwise + * \note The parsed struct's pointer members are valid only for the lifetime of + * the message argument. + */ +static bool +parse_election_message(const election_t *e, const xmlNode *message, + struct vote *vote) +{ + CRM_CHECK(message && vote, return FALSE); + + vote->election_id = -1; + vote->age.tv_sec = -1; + vote->age.tv_usec = -1; + + vote->op = crm_element_value(message, F_CRM_TASK); + vote->from = crm_element_value(message, F_CRM_HOST_FROM); + vote->version = crm_element_value(message, F_CRM_VERSION); + vote->election_owner = crm_element_value(message, F_CRM_ELECTION_OWNER); + + crm_element_value_int(message, F_CRM_ELECTION_ID, &(vote->election_id)); + + if ((vote->op == NULL) || (vote->from == NULL) || (vote->version == NULL) + || (vote->election_owner == NULL) || (vote->election_id < 0)) { + + crm_warn("Invalid %s message from %s in %s ", + (vote->op? vote->op : "election"), + (vote->from? vote->from : "unspecified node"), + (e? e->name : "election")); + return FALSE; + } + + // Op-specific validation + + if (pcmk__str_eq(vote->op, CRM_OP_VOTE, pcmk__str_none)) { + // Only vote ops have uptime + crm_element_value_timeval(message, F_CRM_ELECTION_AGE_S, + F_CRM_ELECTION_AGE_US, &(vote->age)); + if ((vote->age.tv_sec < 0) || (vote->age.tv_usec < 0)) { + crm_warn("Cannot count %s %s from %s because it is missing uptime", + (e? e->name : "election"), vote->op, vote->from); + return FALSE; + } + + } else if (!pcmk__str_eq(vote->op, CRM_OP_NOVOTE, pcmk__str_none)) { + crm_info("Cannot process %s message from %s because %s is not a known election op", + (e? 
e->name : "election"), vote->from, vote->op); + return FALSE; + } + + // Election validation + + if (e == NULL) { + crm_info("Cannot count %s from %s because no election available", + vote->op, vote->from); + return FALSE; + } + + /* If the membership cache is NULL, we REALLY shouldn't be voting -- + * the question is how we managed to get here. + */ + if (crm_peer_cache == NULL) { + crm_info("Cannot count %s %s from %s because no peer information available", + e->name, vote->op, vote->from); + return FALSE; + } + return TRUE; +} + +static void +record_vote(election_t *e, struct vote *vote) +{ + char *voter_copy = NULL; + char *vote_copy = NULL; + + CRM_ASSERT(e && vote && vote->from && vote->op); + if (e->voted == NULL) { + e->voted = pcmk__strkey_table(free, free); + } + + voter_copy = strdup(vote->from); + vote_copy = strdup(vote->op); + CRM_ASSERT(voter_copy && vote_copy); + + g_hash_table_replace(e->voted, voter_copy, vote_copy); +} + +static void +send_no_vote(crm_node_t *peer, struct vote *vote) +{ + // @TODO probably shouldn't hardcode CRM_SYSTEM_CRMD and crm_msg_crmd + + xmlNode *novote = create_request(CRM_OP_NOVOTE, NULL, vote->from, + CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL); + + crm_xml_add(novote, F_CRM_ELECTION_OWNER, vote->election_owner); + crm_xml_add_int(novote, F_CRM_ELECTION_ID, vote->election_id); + + send_cluster_message(peer, crm_msg_crmd, novote, TRUE); + free_xml(novote); +} + +/*! + * \brief Process an election message (vote or no-vote) from a peer + * + * \param[in,out] e Election object + * \param[in] message Election message XML from peer + * \param[in] can_win Whether local node is eligible to win + * + * \return Election state after new vote is considered + * \note If the peer message is a vote, and we prefer the peer to win, this will + * send a no-vote reply to the peer. + * \note The situations "we lost to this vote" from "this is a late no-vote + * after we've already lost" both return election_lost. 
If a caller needs + * to distinguish them, it should save the current state before calling + * this function, and then compare the result. + */ +enum election_result +election_count_vote(election_t *e, const xmlNode *message, bool can_win) +{ + int log_level = LOG_INFO; + gboolean done = FALSE; + gboolean we_lose = FALSE; + const char *reason = "unknown"; + bool we_are_owner = FALSE; + crm_node_t *our_node = NULL, *your_node = NULL; + time_t tm_now = time(NULL); + struct vote vote; + + CRM_CHECK(message != NULL, return election_error); + if (parse_election_message(e, message, &vote) == FALSE) { + return election_error; + } + + your_node = crm_get_peer(0, vote.from); + our_node = crm_get_peer(0, e->uname); + we_are_owner = (our_node != NULL) + && pcmk__str_eq(our_node->uuid, vote.election_owner, + pcmk__str_none); + + if (!can_win) { + reason = "Not eligible"; + we_lose = TRUE; + + } else if (our_node == NULL || crm_is_peer_active(our_node) == FALSE) { + reason = "We are not part of the cluster"; + log_level = LOG_ERR; + we_lose = TRUE; + + } else if (we_are_owner && (vote.election_id != e->count)) { + log_level = LOG_TRACE; + reason = "Superseded"; + done = TRUE; + + } else if (your_node == NULL || crm_is_peer_active(your_node) == FALSE) { + /* Possibly we cached the message in the FSA queue at a point that it wasn't */ + reason = "Peer is not part of our cluster"; + log_level = LOG_WARNING; + done = TRUE; + + } else if (pcmk__str_eq(vote.op, CRM_OP_NOVOTE, pcmk__str_none) + || pcmk__str_eq(vote.from, e->uname, pcmk__str_none)) { + /* Receiving our own broadcast vote, or a no-vote from peer, is a vote + * for us to win + */ + if (!we_are_owner) { + crm_warn("Cannot count %s round %d %s from %s because we are not election owner (%s)", + e->name, vote.election_id, vote.op, vote.from, + vote.election_owner); + return election_error; + } + if (e->state != election_in_progress) { + // Should only happen if we already lost + crm_debug("Not counting %s round %d %s from %s 
because no election in progress", + e->name, vote.election_id, vote.op, vote.from); + return e->state; + } + record_vote(e, &vote); + reason = "Recorded"; + done = TRUE; + + } else { + // A peer vote requires a comparison to determine which node is better + int age_result = compare_age(vote.age); + int version_result = compare_version(vote.version, CRM_FEATURE_SET); + + if (version_result < 0) { + reason = "Version"; + we_lose = TRUE; + + } else if (version_result > 0) { + reason = "Version"; + + } else if (age_result < 0) { + reason = "Uptime"; + we_lose = TRUE; + + } else if (age_result > 0) { + reason = "Uptime"; + + } else if (strcasecmp(e->uname, vote.from) > 0) { + reason = "Host name"; + we_lose = TRUE; + + } else { + reason = "Host name"; + } + } + + if (e->expires < tm_now) { + e->election_wins = 0; + e->expires = tm_now + STORM_INTERVAL; + + } else if (done == FALSE && we_lose == FALSE) { + int peers = 1 + g_hash_table_size(crm_peer_cache); + + /* If every node has to vote down every other node, thats N*(N-1) total elections + * Allow some leeway before _really_ complaining + */ + e->election_wins++; + if (e->election_wins > (peers * peers)) { + crm_warn("%s election storm detected: %d wins in %d seconds", + e->name, e->election_wins, STORM_INTERVAL); + e->election_wins = 0; + e->expires = tm_now + STORM_INTERVAL; + if (e->wrote_blackbox == FALSE) { + /* It's questionable whether a black box (from every node in the + * cluster) would be truly helpful in diagnosing an election + * storm. It's also highly doubtful a production environment + * would get multiple election storms from distinct causes, so + * saving one blackbox per process lifetime should be + * sufficient. Alternatives would be to save a timestamp of the + * last blackbox write instead of a boolean, and write a new one + * if some amount of time has passed; or to save a storm count, + * write a blackbox on every Nth occurrence. 
+ */ + crm_write_blackbox(0, NULL); + e->wrote_blackbox = TRUE; + } + } + } + + if (done) { + do_crm_log(log_level + 1, + "Processed %s round %d %s (current round %d) from %s (%s)", + e->name, vote.election_id, vote.op, e->count, vote.from, + reason); + return e->state; + + } else if (we_lose == FALSE) { + /* We track the time of the last election loss to implement an election + * dampening period, reducing the likelihood of an election storm. If + * this node has lost within the dampening period, don't start a new + * election, even if we win against a peer's vote -- the peer we lost to + * should win again. + * + * @TODO This has a problem case: if an election winner immediately + * leaves the cluster, and a new election is immediately called, all + * nodes could lose, with no new winner elected. The ideal solution + * would be to tie the election structure with the peer caches, which + * would allow us to clear the dampening when the previous winner + * leaves (and would allow other improvements as well). + */ + if ((e->last_election_loss == 0) + || ((tm_now - e->last_election_loss) > (time_t) LOSS_DAMPEN)) { + + do_crm_log(log_level, "%s round %d (owner node ID %s) pass: %s from %s (%s)", + e->name, vote.election_id, vote.election_owner, vote.op, + vote.from, reason); + + e->last_election_loss = 0; + election_timeout_stop(e); + + /* Start a new election by voting down this, and other, peers */ + e->state = election_start; + return e->state; + } else { + char *loss_time = ctime(&e->last_election_loss); + + if (loss_time) { + // Show only HH:MM:SS + loss_time += 11; + loss_time[8] = '\0'; + } + crm_info("Ignoring %s round %d (owner node ID %s) pass vs %s because we lost less than %ds ago at %s", + e->name, vote.election_id, vote.election_owner, vote.from, + LOSS_DAMPEN, (loss_time? 
loss_time : "unknown")); + } + } + + e->last_election_loss = tm_now; + + do_crm_log(log_level, "%s round %d (owner node ID %s) lost: %s from %s (%s)", + e->name, vote.election_id, vote.election_owner, vote.op, + vote.from, reason); + + election_reset(e); + send_no_vote(your_node, &vote); + e->state = election_lost; + return e->state; +} + +/*! + * \brief Reset any election dampening currently in effect + * + * \param[in,out] e Election object to clear + */ +void +election_clear_dampening(election_t *e) +{ + e->last_election_loss = 0; +} diff --git a/lib/cluster/membership.c b/lib/cluster/membership.c new file mode 100644 index 0000000..0c54f19 --- /dev/null +++ b/lib/cluster/membership.c @@ -0,0 +1,1301 @@ +/* + * Copyright 2004-2023 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * + * This source code is licensed under the GNU Lesser General Public License + * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. + */ + +#include <crm_internal.h> + +#ifndef _GNU_SOURCE +# define _GNU_SOURCE +#endif + +#include <sys/param.h> +#include <sys/types.h> +#include <stdio.h> +#include <unistd.h> +#include <string.h> +#include <glib.h> +#include <crm/common/ipc.h> +#include <crm/common/xml_internal.h> +#include <crm/cluster/internal.h> +#include <crm/msg_xml.h> +#include <crm/stonith-ng.h> +#include "crmcluster_private.h" + +/* The peer cache remembers cluster nodes that have been seen. + * This is managed mostly automatically by libcluster, based on + * cluster membership events. + * + * Because cluster nodes can have conflicting names or UUIDs, + * the hash table key is a uniquely generated ID. + */ +GHashTable *crm_peer_cache = NULL; + +/* + * The remote peer cache tracks pacemaker_remote nodes. 
While the + * value has the same type as the peer cache's, it is tracked separately for + * three reasons: pacemaker_remote nodes can't have conflicting names or UUIDs, + * so the name (which is also the UUID) is used as the hash table key; there + * is no equivalent of membership events, so management is not automatic; and + * most users of the peer cache need to exclude pacemaker_remote nodes. + * + * That said, using a single cache would be more logical and less error-prone, + * so it would be a good idea to merge them one day. + * + * libcluster provides two avenues for populating the cache: + * crm_remote_peer_get() and crm_remote_peer_cache_remove() directly manage it, + * while crm_remote_peer_cache_refresh() populates it via the CIB. + */ +GHashTable *crm_remote_peer_cache = NULL; + +/* + * The known node cache tracks cluster and remote nodes that have been seen in + * the CIB. It is useful mainly when a caller needs to know about a node that + * may no longer be in the membership, but doesn't want to add the node to the + * main peer cache tables. + */ +static GHashTable *known_node_cache = NULL; + +unsigned long long crm_peer_seq = 0; +gboolean crm_have_quorum = FALSE; +static gboolean crm_autoreap = TRUE; + +// Flag setting and clearing for crm_node_t:flags + +#define set_peer_flags(peer, flags_to_set) do { \ + (peer)->flags = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, \ + "Peer", (peer)->uname, \ + (peer)->flags, (flags_to_set), \ + #flags_to_set); \ + } while (0) + +#define clear_peer_flags(peer, flags_to_clear) do { \ + (peer)->flags = pcmk__clear_flags_as(__func__, __LINE__, \ + LOG_TRACE, \ + "Peer", (peer)->uname, \ + (peer)->flags, (flags_to_clear), \ + #flags_to_clear); \ + } while (0) + +static void update_peer_uname(crm_node_t *node, const char *uname); + +int +crm_remote_peer_cache_size(void) +{ + if (crm_remote_peer_cache == NULL) { + return 0; + } + return g_hash_table_size(crm_remote_peer_cache); +} + +/*! 
+ * \brief Get a remote node peer cache entry, creating it if necessary + * + * \param[in] node_name Name of remote node + * + * \return Cache entry for node on success, NULL (and set errno) otherwise + * + * \note When creating a new entry, this will leave the node state undetermined, + * so the caller should also call pcmk__update_peer_state() if the state + * is known. + */ +crm_node_t * +crm_remote_peer_get(const char *node_name) +{ + crm_node_t *node; + + if (node_name == NULL) { + errno = -EINVAL; + return NULL; + } + + /* Return existing cache entry if one exists */ + node = g_hash_table_lookup(crm_remote_peer_cache, node_name); + if (node) { + return node; + } + + /* Allocate a new entry */ + node = calloc(1, sizeof(crm_node_t)); + if (node == NULL) { + return NULL; + } + + /* Populate the essential information */ + set_peer_flags(node, crm_remote_node); + node->uuid = strdup(node_name); + if (node->uuid == NULL) { + free(node); + errno = -ENOMEM; + return NULL; + } + + /* Add the new entry to the cache */ + g_hash_table_replace(crm_remote_peer_cache, node->uuid, node); + crm_trace("added %s to remote cache", node_name); + + /* Update the entry's uname, ensuring peer status callbacks are called */ + update_peer_uname(node, node_name); + return node; +} + +void +crm_remote_peer_cache_remove(const char *node_name) +{ + if (g_hash_table_remove(crm_remote_peer_cache, node_name)) { + crm_trace("removed %s from remote peer cache", node_name); + } +} + +/*! + * \internal + * \brief Return node status based on a CIB status entry + * + * \param[in] node_state XML of node state + * + * \return CRM_NODE_LOST if XML_NODE_IN_CLUSTER is false in node_state, + * CRM_NODE_MEMBER otherwise + * \note Unlike most boolean XML attributes, this one defaults to true, for + * backward compatibility with older controllers that don't set it. 
+ */ +static const char * +remote_state_from_cib(const xmlNode *node_state) +{ + bool status = false; + + if (pcmk__xe_get_bool_attr(node_state, XML_NODE_IN_CLUSTER, &status) == pcmk_rc_ok && !status) { + return CRM_NODE_LOST; + } else { + return CRM_NODE_MEMBER; + } +} + +/* user data for looping through remote node xpath searches */ +struct refresh_data { + const char *field; /* XML attribute to check for node name */ + gboolean has_state; /* whether to update node state based on XML */ +}; + +/*! + * \internal + * \brief Process one pacemaker_remote node xpath search result + * + * \param[in] result XML search result + * \param[in] user_data what to look for in the XML + */ +static void +remote_cache_refresh_helper(xmlNode *result, void *user_data) +{ + const struct refresh_data *data = user_data; + const char *remote = crm_element_value(result, data->field); + const char *state = NULL; + crm_node_t *node; + + CRM_CHECK(remote != NULL, return); + + /* Determine node's state, if the result has it */ + if (data->has_state) { + state = remote_state_from_cib(result); + } + + /* Check whether cache already has entry for node */ + node = g_hash_table_lookup(crm_remote_peer_cache, remote); + + if (node == NULL) { + /* Node is not in cache, so add a new entry for it */ + node = crm_remote_peer_get(remote); + CRM_ASSERT(node); + if (state) { + pcmk__update_peer_state(__func__, node, state, 0); + } + + } else if (pcmk_is_set(node->flags, crm_node_dirty)) { + /* Node is in cache and hasn't been updated already, so mark it clean */ + clear_peer_flags(node, crm_node_dirty); + if (state) { + pcmk__update_peer_state(__func__, node, state, 0); + } + } +} + +static void +mark_dirty(gpointer key, gpointer value, gpointer user_data) +{ + set_peer_flags((crm_node_t *) value, crm_node_dirty); +} + +static gboolean +is_dirty(gpointer key, gpointer value, gpointer user_data) +{ + return pcmk_is_set(((crm_node_t*)value)->flags, crm_node_dirty); +} + +/*! 
+ * \brief Repopulate the remote peer cache based on CIB XML + * + * \param[in] xmlNode CIB XML to parse + */ +void +crm_remote_peer_cache_refresh(xmlNode *cib) +{ + struct refresh_data data; + + crm_peer_init(); + + /* First, we mark all existing cache entries as dirty, + * so that later we can remove any that weren't in the CIB. + * We don't empty the cache, because we need to detect changes in state. + */ + g_hash_table_foreach(crm_remote_peer_cache, mark_dirty, NULL); + + /* Look for guest nodes and remote nodes in the status section */ + data.field = "id"; + data.has_state = TRUE; + crm_foreach_xpath_result(cib, PCMK__XP_REMOTE_NODE_STATUS, + remote_cache_refresh_helper, &data); + + /* Look for guest nodes and remote nodes in the configuration section, + * because they may have just been added and not have a status entry yet. + * In that case, the cached node state will be left NULL, so that the + * peer status callback isn't called until we're sure the node started + * successfully. + */ + data.field = "value"; + data.has_state = FALSE; + crm_foreach_xpath_result(cib, PCMK__XP_GUEST_NODE_CONFIG, + remote_cache_refresh_helper, &data); + data.field = "id"; + data.has_state = FALSE; + crm_foreach_xpath_result(cib, PCMK__XP_REMOTE_NODE_CONFIG, + remote_cache_refresh_helper, &data); + + /* Remove all old cache entries that weren't seen in the CIB */ + g_hash_table_foreach_remove(crm_remote_peer_cache, is_dirty, NULL); +} + +gboolean +crm_is_peer_active(const crm_node_t * node) +{ + if(node == NULL) { + return FALSE; + } + + if (pcmk_is_set(node->flags, crm_remote_node)) { + /* remote nodes are never considered active members. 
This + * guarantees they will never be considered for DC membership.*/ + return FALSE; + } +#if SUPPORT_COROSYNC + if (is_corosync_cluster()) { + return crm_is_corosync_peer_active(node); + } +#endif + crm_err("Unhandled cluster type: %s", name_for_cluster_type(get_cluster_type())); + return FALSE; +} + +static gboolean +crm_reap_dead_member(gpointer key, gpointer value, gpointer user_data) +{ + crm_node_t *node = value; + crm_node_t *search = user_data; + + if (search == NULL) { + return FALSE; + + } else if (search->id && node->id != search->id) { + return FALSE; + + } else if (search->id == 0 && !pcmk__str_eq(node->uname, search->uname, pcmk__str_casei)) { + return FALSE; + + } else if (crm_is_peer_active(value) == FALSE) { + crm_info("Removing node with name %s and id %u from membership cache", + (node->uname? node->uname : "unknown"), node->id); + return TRUE; + } + return FALSE; +} + +/*! + * \brief Remove all peer cache entries matching a node ID and/or uname + * + * \param[in] id ID of node to remove (or 0 to ignore) + * \param[in] name Uname of node to remove (or NULL to ignore) + * + * \return Number of cache entries removed + */ +guint +reap_crm_member(uint32_t id, const char *name) +{ + int matches = 0; + crm_node_t search = { 0, }; + + if (crm_peer_cache == NULL) { + crm_trace("Membership cache not initialized, ignoring purge request"); + return 0; + } + + search.id = id; + pcmk__str_update(&search.uname, name); + matches = g_hash_table_foreach_remove(crm_peer_cache, crm_reap_dead_member, &search); + if(matches) { + crm_notice("Purged %d peer%s with id=%u%s%s from the membership cache", + matches, pcmk__plural_s(matches), search.id, + (search.uname? " and/or uname=" : ""), + (search.uname? search.uname : "")); + + } else { + crm_info("No peers with id=%u%s%s to purge from the membership cache", + search.id, (search.uname? " and/or uname=" : ""), + (search.uname? 
search.uname : "")); + } + + free(search.uname); + return matches; +} + +static void +count_peer(gpointer key, gpointer value, gpointer user_data) +{ + guint *count = user_data; + crm_node_t *node = value; + + if (crm_is_peer_active(node)) { + *count = *count + 1; + } +} + +guint +crm_active_peers(void) +{ + guint count = 0; + + if (crm_peer_cache) { + g_hash_table_foreach(crm_peer_cache, count_peer, &count); + } + return count; +} + +static void +destroy_crm_node(gpointer data) +{ + crm_node_t *node = data; + + crm_trace("Destroying entry for node %u: %s", node->id, node->uname); + + free(node->uname); + free(node->state); + free(node->uuid); + free(node->expected); + free(node->conn_host); + free(node); +} + +void +crm_peer_init(void) +{ + if (crm_peer_cache == NULL) { + crm_peer_cache = pcmk__strikey_table(free, destroy_crm_node); + } + + if (crm_remote_peer_cache == NULL) { + crm_remote_peer_cache = pcmk__strikey_table(NULL, destroy_crm_node); + } + + if (known_node_cache == NULL) { + known_node_cache = pcmk__strikey_table(free, destroy_crm_node); + } +} + +void +crm_peer_destroy(void) +{ + if (crm_peer_cache != NULL) { + crm_trace("Destroying peer cache with %d members", g_hash_table_size(crm_peer_cache)); + g_hash_table_destroy(crm_peer_cache); + crm_peer_cache = NULL; + } + + if (crm_remote_peer_cache != NULL) { + crm_trace("Destroying remote peer cache with %d members", g_hash_table_size(crm_remote_peer_cache)); + g_hash_table_destroy(crm_remote_peer_cache); + crm_remote_peer_cache = NULL; + } + + if (known_node_cache != NULL) { + crm_trace("Destroying known node cache with %d members", + g_hash_table_size(known_node_cache)); + g_hash_table_destroy(known_node_cache); + known_node_cache = NULL; + } + +} + +static void (*peer_status_callback)(enum crm_status_type, crm_node_t *, + const void *) = NULL; + +/*! 
+ * \brief Set a client function that will be called after peer status changes + * + * \param[in] dispatch Pointer to function to use as callback + * + * \note Previously, client callbacks were responsible for peer cache + * management. This is no longer the case, and client callbacks should do + * only client-specific handling. Callbacks MUST NOT add or remove entries + * in the peer caches. + */ +void +crm_set_status_callback(void (*dispatch) (enum crm_status_type, crm_node_t *, const void *)) +{ + peer_status_callback = dispatch; +} + +/*! + * \brief Tell the library whether to automatically reap lost nodes + * + * If TRUE (the default), calling crm_update_peer_proc() will also update the + * peer state to CRM_NODE_MEMBER or CRM_NODE_LOST, and pcmk__update_peer_state() + * will reap peers whose state changes to anything other than CRM_NODE_MEMBER. + * Callers should leave this enabled unless they plan to manage the cache + * separately on their own. + * + * \param[in] autoreap TRUE to enable automatic reaping, FALSE to disable + */ +void +crm_set_autoreap(gboolean autoreap) +{ + crm_autoreap = autoreap; +} + +static void +dump_peer_hash(int level, const char *caller) +{ + GHashTableIter iter; + const char *id = NULL; + crm_node_t *node = NULL; + + g_hash_table_iter_init(&iter, crm_peer_cache); + while (g_hash_table_iter_next(&iter, (gpointer *) &id, (gpointer *) &node)) { + do_crm_log(level, "%s: Node %u/%s = %p - %s", caller, node->id, node->uname, node, id); + } +} + +static gboolean +hash_find_by_data(gpointer key, gpointer value, gpointer user_data) +{ + return value == user_data; +} + +/*! 
+ * \internal + * \brief Search caches for a node (cluster or Pacemaker Remote) + * + * \param[in] id If not 0, cluster node ID to search for + * \param[in] uname If not NULL, node name to search for + * \param[in] flags Bitmask of enum crm_get_peer_flags + * + * \return Node cache entry if found, otherwise NULL + */ +crm_node_t * +pcmk__search_node_caches(unsigned int id, const char *uname, uint32_t flags) +{ + crm_node_t *node = NULL; + + CRM_ASSERT(id > 0 || uname != NULL); + + crm_peer_init(); + + if ((uname != NULL) && pcmk_is_set(flags, CRM_GET_PEER_REMOTE)) { + node = g_hash_table_lookup(crm_remote_peer_cache, uname); + } + + if ((node == NULL) && pcmk_is_set(flags, CRM_GET_PEER_CLUSTER)) { + node = pcmk__search_cluster_node_cache(id, uname); + } + return node; +} + +/*! + * \brief Get a node cache entry (cluster or Pacemaker Remote) + * + * \param[in] id If not 0, cluster node ID to search for + * \param[in] uname If not NULL, node name to search for + * \param[in] flags Bitmask of enum crm_get_peer_flags + * + * \return (Possibly newly created) node cache entry + */ +crm_node_t * +crm_get_peer_full(unsigned int id, const char *uname, int flags) +{ + crm_node_t *node = NULL; + + CRM_ASSERT(id > 0 || uname != NULL); + + crm_peer_init(); + + if (pcmk_is_set(flags, CRM_GET_PEER_REMOTE)) { + node = g_hash_table_lookup(crm_remote_peer_cache, uname); + } + + if ((node == NULL) && pcmk_is_set(flags, CRM_GET_PEER_CLUSTER)) { + node = crm_get_peer(id, uname); + } + return node; +} + +/*! 
+ * \internal
+ * \brief Search cluster node cache
+ *
+ * \param[in] id     If not 0, cluster node ID to search for
+ * \param[in] uname  If not NULL, node name to search for
+ *
+ * \return Cluster node cache entry if found, otherwise NULL
+ */
+crm_node_t *
+pcmk__search_cluster_node_cache(unsigned int id, const char *uname)
+{
+    GHashTableIter iter;
+    crm_node_t *node = NULL;
+    crm_node_t *by_id = NULL;
+    crm_node_t *by_name = NULL;
+
+    CRM_ASSERT(id > 0 || uname != NULL);
+
+    crm_peer_init();
+
    /* Look the node up both ways: by (case-insensitive) name ... */
+    if (uname != NULL) {
+        g_hash_table_iter_init(&iter, crm_peer_cache);
+        while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
+            if(node->uname && strcasecmp(node->uname, uname) == 0) {
+                crm_trace("Name match: %s = %p", node->uname, node);
+                by_name = node;
+                break;
+            }
+        }
+    }
+
    /* ... and by numeric cluster node ID */
+    if (id > 0) {
+        g_hash_table_iter_init(&iter, crm_peer_cache);
+        while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
+            if(node->id == id) {
+                crm_trace("ID match: %u = %p", node->id, node);
+                by_id = node;
+                break;
+            }
+        }
+    }
+
    /* Reconcile the two lookups: the entries can disagree after a node has
     * changed its ID or name, or when stale entries linger in the cache
     */
+    node = by_id; /* Good default */
+    if(by_id == by_name) {
+        /* Nothing to do if they match (both NULL counts) */
+        crm_trace("Consistent: %p for %u/%s", by_id, id, uname);
+
+    } else if(by_id == NULL && by_name) {
+        crm_trace("Only one: %p for %u/%s", by_name, id, uname);
+
+        if(id && by_name->id) {
            /* The name-matched entry carries a different nonzero ID */
+            dump_peer_hash(LOG_WARNING, __func__);
+            crm_crit("Node %u and %u share the same name '%s'",
+                     id, by_name->id, uname);
+            node = NULL; /* Create a new one */
+
+        } else {
+            node = by_name;
+        }
+
+    } else if(by_name == NULL && by_id) {
+        crm_trace("Only one: %p for %u/%s", by_id, id, uname);
+
+        if(uname && by_id->uname) {
+            dump_peer_hash(LOG_WARNING, __func__);
+            crm_crit("Node '%s' and '%s' share the same cluster nodeid %u: assuming '%s' is correct",
+                     uname, by_id->uname, id, uname);
+        }
+
+    } else if(uname && by_id->uname) {
        /* Both lookups matched different entries */
+        if(pcmk__str_eq(uname, by_id->uname, pcmk__str_casei)) {
            /* Same name on both: the node changed its ID; drop the entry
             * that holds the old ID
             */
+            crm_notice("Node '%s' has changed its ID from %u to %u",
+                       by_id->uname, by_name->id, by_id->id);
+            g_hash_table_foreach_remove(crm_peer_cache, hash_find_by_data, by_name);
+
+        } else {
+            crm_warn("Node '%s' and '%s' share the same cluster nodeid: %u %s", by_id->uname, by_name->uname, id, uname);
+            dump_peer_hash(LOG_INFO, __func__);
+            crm_abort(__FILE__, __func__, __LINE__, "member weirdness", TRUE,
+                      TRUE);
+        }
+
+    } else if(id && by_name->id) {
+        crm_warn("Node %u and %u share the same name: '%s'", by_id->id, by_name->id, uname);
+
+    } else {
+        /* Simple merge */
+
+        /* Only corosync-based clusters use node IDs. The functions that call
+         * pcmk__update_peer_state() and crm_update_peer_proc() only know
+         * nodeid, so 'by_id' is authoritative when merging.
+         */
+        dump_peer_hash(LOG_DEBUG, __func__);
+
+        crm_info("Merging %p into %p", by_name, by_id);
+        g_hash_table_foreach_remove(crm_peer_cache, hash_find_by_data, by_name);
+    }
+
+    return node;
+}
+
+#if SUPPORT_COROSYNC
/* Remove cached *offline* nodes that share this node's uname but carry a
 * different nonzero ID; returns the number of entries removed. Only done
 * when corosync provides a node list to authoritatively resolve names.
 */
+static guint
+remove_conflicting_peer(crm_node_t *node)
+{
+    int matches = 0;
+    GHashTableIter iter;
+    crm_node_t *existing_node = NULL;
+
+    if (node->id == 0 || node->uname == NULL) {
+        return 0;
+    }
+
+    if (!pcmk__corosync_has_nodelist()) {
+        return 0;
+    }
+
+    g_hash_table_iter_init(&iter, crm_peer_cache);
+    while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &existing_node)) {
+        if (existing_node->id > 0
+            && existing_node->id != node->id
+            && existing_node->uname != NULL
+            && strcasecmp(existing_node->uname, node->uname) == 0) {
+
            /* Never remove an entry that is still an active member */
+            if (crm_is_peer_active(existing_node)) {
+                continue;
+            }
+
+            crm_warn("Removing cached offline node %u/%s which has conflicting uname with %u",
+                     existing_node->id, existing_node->uname, node->id);
+
+            g_hash_table_iter_remove(&iter);
+            matches++;
+        }
+    }
+
+    return matches;
+}
+#endif
+
+/*!
+ * \brief Get a cluster node cache entry
+ *
+ * \param[in] id     If not 0, cluster node ID to search for
+ * \param[in] uname  If not NULL, node name to search for
+ *
+ * \return (Possibly newly created) cluster node cache entry
+ */
+/* coverity[-alloc] Memory is referenced in one or both hashtables */
+crm_node_t *
+crm_get_peer(unsigned int id, const char *uname)
+{
+    crm_node_t *node = NULL;
+    char *uname_lookup = NULL;
+
+    CRM_ASSERT(id > 0 || uname != NULL);
+
+    crm_peer_init();
+
+    node = pcmk__search_cluster_node_cache(id, uname);
+
+    /* if uname wasn't provided, and find_peer did not turn up a uname based on id.
+     * we need to do a lookup of the node name using the id in the cluster membership. */
+    if ((node == NULL || node->uname == NULL) && (uname == NULL)) {
+        uname_lookup = get_node_name(id);
+    }
+
+    if (uname_lookup) {
+        uname = uname_lookup;
+        crm_trace("Inferred a name of '%s' for node %u", uname, id);
+
+        /* try to turn up the node one more time now that we know the uname. */
+        if (node == NULL) {
+            node = pcmk__search_cluster_node_cache(id, uname);
+        }
+    }
+
+
+    if (node == NULL) {
        /* No match: create a fresh cache entry. The cache key is a generated
         * UUID, since neither id nor uname is guaranteed at creation time.
         */
+        char *uniqueid = crm_generate_uuid();
+
+        node = calloc(1, sizeof(crm_node_t));
+        CRM_ASSERT(node);
+
+        crm_info("Created entry %s/%p for node %s/%u (%d total)",
+                 uniqueid, node, uname, id, 1 + g_hash_table_size(crm_peer_cache));
+        g_hash_table_replace(crm_peer_cache, uniqueid, node);
+    }
+
    /* Fill in whichever of id/uname the entry is still missing */
+    if(id > 0 && uname && (node->id == 0 || node->uname == NULL)) {
+        crm_info("Node %u is now known as %s", id, uname);
+    }
+
+    if(id > 0 && node->id == 0) {
+        node->id = id;
+    }
+
+    if (uname && (node->uname == NULL)) {
+        update_peer_uname(node, uname);
+    }
+
+    if(node->uuid == NULL) {
+        const char *uuid = crm_peer_uuid(node);
+
+        if (uuid) {
+            crm_info("Node %u has uuid %s", id, uuid);
+
+        } else {
+            crm_info("Cannot obtain a UUID for node %u/%s", id, node->uname);
+        }
+    }
+
+    free(uname_lookup);
+
+    return node;
+}
+
+/*!
+ * \internal
+ * \brief Update a node's uname
+ *
+ * \param[in,out] node   Node object to update
+ * \param[in]     uname  New name to set
+ *
+ * \note This function should not be called within a peer cache iteration,
+ *       because in some cases it can remove conflicting cache entries,
+ *       which would invalidate the iterator.
+ */
+static void
+update_peer_uname(crm_node_t *node, const char *uname)
+{
+    CRM_CHECK(uname != NULL,
+              crm_err("Bug: can't update node name without name"); return);
+    CRM_CHECK(node != NULL,
+              crm_err("Bug: can't update node name to %s without node", uname);
+              return);
+
+    if (pcmk__str_eq(uname, node->uname, pcmk__str_casei)) {
+        crm_debug("Node uname '%s' did not change", uname);
+        return;
+    }
+
    /* Warn once if the new name contains any uppercase ASCII letter */
+    for (const char *c = uname; *c; ++c) {
+        if ((*c >= 'A') && (*c <= 'Z')) {
+            crm_warn("Node names with capitals are discouraged, consider changing '%s'",
+                     uname);
+            break;
+        }
+    }
+
+    pcmk__str_update(&node->uname, uname);
+
    /* Notify the client of the name change before removing any conflicting
     * cache entries
     */
+    if (peer_status_callback != NULL) {
+        peer_status_callback(crm_status_uname, node, NULL);
+    }
+
+#if SUPPORT_COROSYNC
+    if (is_corosync_cluster() && !pcmk_is_set(node->flags, crm_remote_node)) {
+        remove_conflicting_peer(node);
+    }
+#endif
+}
+
+/*!
+ * \internal
+ * \brief Get log-friendly string equivalent of a process flag
+ *
+ * \param[in] proc  Process flag
+ *
+ * \return Log-friendly string equivalent of \p proc
+ */
+static inline const char *
+proc2text(enum crm_proc_flag proc)
+{
+    const char *text = "unknown";
+
+    switch (proc) {
+        case crm_proc_none:
+            text = "none";
+            break;
+        case crm_proc_based:
+            text = "pacemaker-based";
+            break;
+        case crm_proc_controld:
+            text = "pacemaker-controld";
+            break;
+        case crm_proc_schedulerd:
+            text = "pacemaker-schedulerd";
+            break;
+        case crm_proc_execd:
+            text = "pacemaker-execd";
+            break;
+        case crm_proc_attrd:
+            text = "pacemaker-attrd";
+            break;
+        case crm_proc_fenced:
+            text = "pacemaker-fenced";
+            break;
+        case crm_proc_cpg:
+            text = "corosync-cpg";
+            break;
+    }
+    return text;
+}
+
+/*!
+ * \internal
+ * \brief Update a node's process information (and potentially state)
+ *
+ * \param[in]     source  Caller's function name (for log messages)
+ * \param[in,out] node    Node object to update
+ * \param[in]     flag    Bitmask of new process information
+ * \param[in]     status  node status (online, offline, etc.)
+ *
+ * \return NULL if any node was reaped from peer caches, value of node otherwise
+ *
+ * \note If this function returns NULL, the supplied node object was likely
+ *       freed and should not be used again. This function should not be
+ *       called within a cache iteration if reaping is possible, otherwise
+ *       reaping could invalidate the iterator.
+ */
+crm_node_t *
+crm_update_peer_proc(const char *source, crm_node_t * node, uint32_t flag, const char *status)
+{
+    uint32_t last = 0;
+    gboolean changed = FALSE;
+
+    CRM_CHECK(node != NULL, crm_err("%s: Could not set %s to %s for NULL",
+                                    source, proc2text(flag), status);
+              return NULL);
+
+    /* Pacemaker doesn't spawn processes on remote nodes */
+    if (pcmk_is_set(node->flags, crm_remote_node)) {
+        return node;
+    }
+
+    last = node->processes;
    /* A NULL status means "replace the entire process mask with flag";
     * otherwise set or clear just the given flag depending on status
     */
+    if (status == NULL) {
+        node->processes = flag;
+        if (node->processes != last) {
+            changed = TRUE;
+        }
+
+    } else if (pcmk__str_eq(status, ONLINESTATUS, pcmk__str_casei)) {
+        if ((node->processes & flag) != flag) {
+            node->processes = pcmk__set_flags_as(__func__, __LINE__,
+                                                 LOG_TRACE, "Peer process",
+                                                 node->uname, node->processes,
+                                                 flag, "processes");
+            changed = TRUE;
+        }
+
+    } else if (node->processes & flag) {
+        node->processes = pcmk__clear_flags_as(__func__, __LINE__,
+                                               LOG_TRACE, "Peer process",
+                                               node->uname, node->processes,
+                                               flag, "processes");
+        changed = TRUE;
+    }
+
+    if (changed) {
+        if (status == NULL && flag <= crm_proc_none) {
+            crm_info("%s: Node %s[%u] - all processes are now offline", source, node->uname,
+                     node->id);
+        } else {
+            crm_info("%s: Node %s[%u] - %s is now %s", source, node->uname, node->id,
+                     proc2text(flag), status);
+        }
+
+        /* Call the client callback first, then update the peer state,
+         * in case the node will be reaped
+         */
+        if (peer_status_callback != NULL) {
+            peer_status_callback(crm_status_processes, node, &last);
+        }
+
+        /* The client callback shouldn't touch the peer caches,
+         * but as a safety net, bail if the peer cache was destroyed.
+         */
+        if (crm_peer_cache == NULL) {
+            return NULL;
+        }
+
+        if (crm_autoreap) {
+            const char *peer_state = NULL;
+
+            if (pcmk_is_set(node->processes, crm_get_cluster_proc())) {
+                peer_state = CRM_NODE_MEMBER;
+            } else {
+                peer_state = CRM_NODE_LOST;
+            }
            /* May reap the node, in which case NULL is returned */
+            node = pcmk__update_peer_state(__func__, node, peer_state, 0);
+        }
+    } else {
+        crm_trace("%s: Node %s[%u] - %s is unchanged (%s)", source, node->uname, node->id,
+                  proc2text(flag), status);
+    }
+    return node;
+}
+
+/*!
+ * \internal
+ * \brief Update a cluster node cache entry's expected join state
+ *
+ * \param[in]     source    Caller's function name (for logging)
+ * \param[in,out] node      Node to update
+ * \param[in]     expected  Node's new join state
+ */
+void
+pcmk__update_peer_expected(const char *source, crm_node_t *node,
+                           const char *expected)
+{
+    char *last = NULL;
+    gboolean changed = FALSE;
+
+    CRM_CHECK(node != NULL, crm_err("%s: Could not set 'expected' to %s", source, expected);
+              return);
+
+    /* Remote nodes don't participate in joins */
+    if (pcmk_is_set(node->flags, crm_remote_node)) {
+        return;
+    }
+
+    last = node->expected;
+    if (expected != NULL && !pcmk__str_eq(node->expected, expected, pcmk__str_casei)) {
+        node->expected = strdup(expected);
+        changed = TRUE;
+    }
+
+    if (changed) {
        /* 'last' holds the previous value and is freed only on change */
+        crm_info("%s: Node %s[%u] - expected state is now %s (was %s)", source, node->uname, node->id,
+                 expected, last);
+        free(last);
+    } else {
+        crm_trace("%s: Node %s[%u] - expected state is unchanged (%s)", source, node->uname,
+                  node->id, expected);
+    }
+}
+
+/*!
+ * \internal
+ * \brief Update a node's state and membership information
+ *
+ * \param[in]     source      Caller's function name (for log messages)
+ * \param[in,out] node        Node object to update
+ * \param[in]     state       Node's new state
+ * \param[in]     membership  Node's new membership ID
+ * \param[in,out] iter        If not NULL, pointer to node's peer cache iterator
+ *
+ * \return NULL if any node was reaped, value of node otherwise
+ *
+ * \note If this function returns NULL, the supplied node object was likely
+ *       freed and should not be used again. This function may be called from
+ *       within a peer cache iteration if the iterator is supplied.
+ */
+static crm_node_t *
+update_peer_state_iter(const char *source, crm_node_t *node, const char *state,
+                       uint64_t membership, GHashTableIter *iter)
+{
+    gboolean is_member;
+
+    CRM_CHECK(node != NULL,
+              crm_err("Could not set state for unknown host to %s"
+                      CRM_XS " source=%s", state, source);
+              return NULL);
+
+    is_member = pcmk__str_eq(state, CRM_NODE_MEMBER, pcmk__str_casei);
+    if (is_member) {
        /* Becoming a member clears when_lost and records the membership ID */
+        node->when_lost = 0;
+        if (membership) {
+            node->last_seen = membership;
+        }
+    }
+
+    if (state && !pcmk__str_eq(node->state, state, pcmk__str_casei)) {
+        char *last = node->state;
+
+        node->state = strdup(state);
+        crm_notice("Node %s state is now %s " CRM_XS
+                   " nodeid=%u previous=%s source=%s", node->uname, state,
+                   node->id, (last? last : "unknown"), source);
+        if (peer_status_callback != NULL) {
+            peer_status_callback(crm_status_nstate, node, last);
+        }
+        free(last);
+
+        if (crm_autoreap && !is_member
+            && !pcmk_is_set(node->flags, crm_remote_node)) {
+            /* We only autoreap from the peer cache, not the remote peer cache,
+             * because the latter should be managed only by
+             * crm_remote_peer_cache_refresh().
+             */
+            if(iter) {
                /* Remove via the caller's iterator so an in-progress
                 * iteration is not invalidated
                 */
+                crm_notice("Purged 1 peer with id=%u and/or uname=%s from the membership cache", node->id, node->uname);
+                g_hash_table_iter_remove(iter);
+
+            } else {
+                reap_crm_member(node->id, node->uname);
+            }
+            node = NULL;
+        }
+
+    } else {
+        crm_trace("Node %s state is unchanged (%s) " CRM_XS
+                  " nodeid=%u source=%s", node->uname, state, node->id, source);
+    }
+    return node;
+}
+
+/*!
+ * \brief Update a node's state and membership information
+ *
+ * \param[in]     source      Caller's function name (for log messages)
+ * \param[in,out] node        Node object to update
+ * \param[in]     state       Node's new state
+ * \param[in]     membership  Node's new membership ID
+ *
+ * \return NULL if any node was reaped, value of node otherwise
+ *
+ * \note If this function returns NULL, the supplied node object was likely
+ *       freed and should not be used again. This function should not be
+ *       called within a cache iteration if reaping is possible,
+ *       otherwise reaping could invalidate the iterator.
+ */
+crm_node_t *
+pcmk__update_peer_state(const char *source, crm_node_t *node,
+                        const char *state, uint64_t membership)
+{
+    return update_peer_state_iter(source, node, state, membership, NULL);
+}
+
+/*!
+ * \internal
+ * \brief Reap all nodes from cache whose membership information does not match
+ *
+ * \param[in] membership  Membership ID of nodes to keep
+ */
+void
+pcmk__reap_unseen_nodes(uint64_t membership)
+{
+    GHashTableIter iter;
+    crm_node_t *node = NULL;
+
+    crm_trace("Reaping unseen nodes...");
+    g_hash_table_iter_init(&iter, crm_peer_cache);
+    while (g_hash_table_iter_next(&iter, NULL, (gpointer *)&node)) {
+        if (node->last_seen != membership) {
+            if (node->state) {
+                /*
+                 * Calling update_peer_state_iter() allows us to
+                 * remove the node from crm_peer_cache without
+                 * invalidating our iterator
+                 */
+                update_peer_state_iter(__func__, node, CRM_NODE_LOST,
+                                       membership, &iter);
+
+            } else {
+                crm_info("State of node %s[%u] is still unknown",
+                         node->uname, node->id);
+            }
+        }
+    }
+}
+
/* Search the known-node cache by UUID string and/or (case-insensitive) name,
 * reconciling the two lookups when they disagree; returns NULL when no
 * unambiguous match exists
 */
+static crm_node_t *
+find_known_node(const char *id, const char *uname)
+{
+    GHashTableIter iter;
+    crm_node_t *node = NULL;
+    crm_node_t *by_id = NULL;
+    crm_node_t *by_name = NULL;
+
+    if (uname) {
+        g_hash_table_iter_init(&iter, known_node_cache);
+        while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
+            if (node->uname && strcasecmp(node->uname, uname) == 0) {
+                crm_trace("Name match: %s = %p", node->uname, node);
+                by_name = node;
+                break;
+            }
+        }
+    }
+
+    if (id) {
+        g_hash_table_iter_init(&iter, known_node_cache);
+        while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
+            if(strcasecmp(node->uuid, id) == 0) {
+                crm_trace("ID match: %s= %p", id, node);
+                by_id = node;
+                break;
+            }
+        }
+    }
+
+    node = by_id; /* Good default */
+    if (by_id == by_name) {
+        /* Nothing to do if they match (both NULL counts) */
+        crm_trace("Consistent: %p for %s/%s", by_id, id, uname);
+
+    } else if (by_id == NULL && by_name) {
+        crm_trace("Only one: %p for %s/%s", by_name, id, uname);
+
+        if (id) {
+            node = NULL;
+
+        } else {
+            node = by_name;
+        }
+
+    } else if (by_name == NULL && by_id) {
+        crm_trace("Only one: %p for %s/%s", by_id, id, uname);
+
+        if (uname) {
+            node = NULL;
+        }
+
+    } else if (uname && by_id->uname
+               && pcmk__str_eq(uname, by_id->uname, pcmk__str_casei)) {
+        /* Multiple nodes have the same uname in the CIB.
+         * Return by_id. */
+
+    } else if (id && by_name->uuid
+               && pcmk__str_eq(id, by_name->uuid, pcmk__str_casei)) {
+        /* Multiple nodes have the same id in the CIB.
+         * Return by_name. */
+        node = by_name;
+
+    } else {
+        node = NULL;
+    }
+
+    if (node == NULL) {
+        crm_debug("Couldn't find node%s%s%s%s",
+                  id? " " : "",
+                  id? id : "",
+                  uname? " with name " : "",
+                  uname? uname : "");
+    }
+
+    return node;
+}
+
/* Callback for each <node> element in the CIB: add the node to the known-node
 * cache, or mark an existing entry clean so it survives the refresh
 */
+static void
+known_node_cache_refresh_helper(xmlNode *xml_node, void *user_data)
+{
+    const char *id = crm_element_value(xml_node, XML_ATTR_ID);
+    const char *uname = crm_element_value(xml_node, XML_ATTR_UNAME);
+    crm_node_t * node = NULL;
+
+    CRM_CHECK(id != NULL && uname !=NULL, return);
+    node = find_known_node(id, uname);
+
+    if (node == NULL) {
        /* New entry: key is a generated UUID, value owns copies of id/uname */
+        char *uniqueid = crm_generate_uuid();
+
+        node = calloc(1, sizeof(crm_node_t));
+        CRM_ASSERT(node != NULL);
+
+        node->uname = strdup(uname);
+        CRM_ASSERT(node->uname != NULL);
+
+        node->uuid = strdup(id);
+        CRM_ASSERT(node->uuid != NULL);
+
+        g_hash_table_replace(known_node_cache, uniqueid, node);
+
+    } else if (pcmk_is_set(node->flags, crm_node_dirty)) {
+        pcmk__str_update(&node->uname, uname);
+
+        /* Node is in cache and hasn't been updated already, so mark it clean */
+        clear_peer_flags(node, crm_node_dirty);
+    }
+
+}
+
/* Rebuild the known-node cache from the CIB: mark all entries dirty, refresh
 * from the CIB's node configuration, then drop entries still marked dirty
 */
+static void
+refresh_known_node_cache(xmlNode *cib)
+{
+    crm_peer_init();
+
+    g_hash_table_foreach(known_node_cache, mark_dirty, NULL);
+
+    crm_foreach_xpath_result(cib, PCMK__XP_MEMBER_NODE_CONFIG,
+                             known_node_cache_refresh_helper, NULL);
+
+    /* Remove all old cache entries that weren't seen in the CIB */
+    g_hash_table_foreach_remove(known_node_cache, is_dirty, NULL);
+}
+
/* Refresh both the remote peer cache and the known-node cache from the CIB */
+void
+pcmk__refresh_node_caches_from_cib(xmlNode *cib)
+{
+    crm_remote_peer_cache_refresh(cib);
+    refresh_known_node_cache(cib);
+}
+
+/*!
+ * \internal
+ * \brief Search known node cache
+ *
+ * \param[in] id     If not 0, cluster node ID to search for
+ * \param[in] uname  If not NULL, node name to search for
+ * \param[in] flags  Bitmask of enum crm_get_peer_flags
+ *
+ * \return Known node cache entry if found, otherwise NULL
+ */
+crm_node_t *
+pcmk__search_known_node_cache(unsigned int id, const char *uname,
+                              uint32_t flags)
+{
+    crm_node_t *node = NULL;
+    char *id_str = NULL;
+
+    CRM_ASSERT(id > 0 || uname != NULL);
+
    /* Prefer a hit in the peer caches before falling back to the
     * known-node cache (which is keyed by UUID string, not numeric ID)
     */
+    node = pcmk__search_node_caches(id, uname, flags);
+
+    if (node || !(flags & CRM_GET_PEER_CLUSTER)) {
+        return node;
+    }
+
+    if (id > 0) {
+        id_str = crm_strdup_printf("%u", id);
+    }
+
+    node = find_known_node(id_str, uname);
+
+    free(id_str);
+    return node;
+}
+
+
+// Deprecated functions kept only for backward API compatibility
+// LCOV_EXCL_START
+
+#include <crm/cluster/compat.h>
+
/* Deprecated: fence a node by kicking it via the fencer (120s timeout) */
+int
+crm_terminate_member(int nodeid, const char *uname, void *unused)
+{
+    return stonith_api_kick(nodeid, uname, 120, TRUE);
+}
+
/* Deprecated: identical to crm_terminate_member(); the connection argument
 * is ignored
 */
+int
+crm_terminate_member_no_mainloop(int nodeid, const char *uname, int *connection)
+{
+    return stonith_api_kick(nodeid, uname, 120, TRUE);
+}
+
+// LCOV_EXCL_STOP
+// End deprecated API |