diff options
Diffstat (limited to 'lib/cluster/election.c')
-rw-r--r-- | lib/cluster/election.c | 727 |
1 files changed, 727 insertions, 0 deletions
diff --git a/lib/cluster/election.c b/lib/cluster/election.c new file mode 100644 index 0000000..ebbae72 --- /dev/null +++ b/lib/cluster/election.c @@ -0,0 +1,727 @@ +/* + * Copyright 2004-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * + * This source code is licensed under the GNU Lesser General Public License + * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. + */ + +#include <crm_internal.h> + +#include <sys/time.h> +#include <sys/resource.h> + +#include <crm/msg_xml.h> +#include <crm/common/xml.h> + +#include <crm/common/mainloop.h> +#include <crm/cluster/internal.h> +#include <crm/cluster/election_internal.h> +#include <crm/crm.h> + +#define STORM_INTERVAL 2 /* in seconds */ + +struct election_s { + enum election_result state; + guint count; // How many times local node has voted + char *name; // Descriptive name for this election + char *uname; // Local node's name + GSourceFunc cb; // Function to call if election is won + GHashTable *voted; // Key = node name, value = how node voted + mainloop_timer_t *timeout; // When to abort if all votes not received + int election_wins; // Track wins, for storm detection + bool wrote_blackbox; // Write a storm blackbox at most once + time_t expires; // When storm detection period ends + time_t last_election_loss; // When dampening period ends +}; + +static void +election_complete(election_t *e) +{ + e->state = election_won; + if (e->cb != NULL) { + e->cb(e); + } + election_reset(e); +} + +static gboolean +election_timer_cb(gpointer user_data) +{ + election_t *e = user_data; + + crm_info("%s timed out, declaring local node as winner", e->name); + election_complete(e); + return FALSE; +} + +/*! + * \brief Get current state of an election + * + * \param[in] e Election object + * + * \return Current state of \e + */ +enum election_result +election_state(const election_t *e) +{ + return (e == NULL)? election_error : e->state; +} + +/*! + * \brief Create a new election object + * + * Every node that wishes to participate in an election must create an election + * object. Typically, this should be done once, at start-up. A caller should + * only create a single election object. + * + * \param[in] name Label for election (for logging) + * \param[in] uname Local node's name + * \param[in] period_ms How long to wait for all peers to vote + * \param[in] cb Function to call if local node wins election + * + * \return Newly allocated election object on success, NULL on error + * \note The caller is responsible for freeing the returned value using + * election_fini(). + */ +election_t * +election_init(const char *name, const char *uname, guint period_ms, GSourceFunc cb) +{ + election_t *e = NULL; + + static guint count = 0; + + CRM_CHECK(uname != NULL, return NULL); + + e = calloc(1, sizeof(election_t)); + if (e == NULL) { + crm_perror(LOG_CRIT, "Cannot create election"); + return NULL; + } + + e->uname = strdup(uname); + if (e->uname == NULL) { + crm_perror(LOG_CRIT, "Cannot create election"); + free(e); + return NULL; + } + + e->name = name? crm_strdup_printf("election-%s", name) + : crm_strdup_printf("election-%u", count++); + e->cb = cb; + e->timeout = mainloop_timer_add(e->name, period_ms, FALSE, + election_timer_cb, e); + crm_trace("Created %s", e->name); + return e; +} + +/*! + * \brief Disregard any previous vote by specified peer + * + * This discards any recorded vote from a specified peer. Election users should + * call this whenever a voting peer becomes inactive. + * + * \param[in,out] e Election object + * \param[in] uname Name of peer to disregard + */ +void +election_remove(election_t *e, const char *uname) +{ + if ((e != NULL) && (uname != NULL) && (e->voted != NULL)) { + crm_trace("Discarding %s (no-)vote from lost peer %s", e->name, uname); + g_hash_table_remove(e->voted, uname); + } +} + +/*! + * \brief Stop election timer and disregard all votes + * + * \param[in,out] e Election object + */ +void +election_reset(election_t *e) +{ + if (e != NULL) { + crm_trace("Resetting election %s", e->name); + mainloop_timer_stop(e->timeout); + if (e->voted) { + crm_trace("Destroying voted cache with %d members", g_hash_table_size(e->voted)); + g_hash_table_destroy(e->voted); + e->voted = NULL; + } + } +} + +/*! + * \brief Free an election object + * + * Free all memory associated with an election object, stopping its + * election timer (if running). + * + * \param[in,out] e Election object + */ +void +election_fini(election_t *e) +{ + if (e != NULL) { + election_reset(e); + crm_trace("Destroying %s", e->name); + mainloop_timer_del(e->timeout); + free(e->uname); + free(e->name); + free(e); + } +} + +static void +election_timeout_start(election_t *e) +{ + if (e != NULL) { + mainloop_timer_start(e->timeout); + } +} + +/*! + * \brief Stop an election's timer, if running + * + * \param[in,out] e Election object + */ +void +election_timeout_stop(election_t *e) +{ + if (e != NULL) { + mainloop_timer_stop(e->timeout); + } +} + +/*! + * \brief Change an election's timeout (restarting timer if running) + * + * \param[in,out] e Election object + * \param[in] period New timeout + */ +void +election_timeout_set_period(election_t *e, guint period) +{ + if (e != NULL) { + mainloop_timer_set_period(e->timeout, period); + } else { + crm_err("No election defined"); + } +} + +static int +get_uptime(struct timeval *output) +{ + static time_t expires = 0; + static struct rusage info; + + time_t tm_now = time(NULL); + + if (expires < tm_now) { + int rc = 0; + + info.ru_utime.tv_sec = 0; + info.ru_utime.tv_usec = 0; + rc = getrusage(RUSAGE_SELF, &info); + + output->tv_sec = 0; + output->tv_usec = 0; + + if (rc < 0) { + crm_perror(LOG_ERR, "Could not calculate the current uptime"); + expires = 0; + return -1; + } + + crm_debug("Current CPU usage is: %lds, %ldus", (long)info.ru_utime.tv_sec, + (long)info.ru_utime.tv_usec); + } + + expires = tm_now + STORM_INTERVAL; /* N seconds after the last _access_ */ + output->tv_sec = info.ru_utime.tv_sec; + output->tv_usec = info.ru_utime.tv_usec; + + return 1; +} + +static int +compare_age(struct timeval your_age) +{ + struct timeval our_age; + + get_uptime(&our_age); /* If an error occurred, our_age will be compared as {0,0} */ + + if (our_age.tv_sec > your_age.tv_sec) { + crm_debug("Win: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec); + return 1; + } else if (our_age.tv_sec < your_age.tv_sec) { + crm_debug("Lose: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec); + return -1; + } else if (our_age.tv_usec > your_age.tv_usec) { + crm_debug("Win: %ld.%06ld vs %ld.%06ld (usec)", + (long)our_age.tv_sec, (long)our_age.tv_usec, (long)your_age.tv_sec, (long)your_age.tv_usec); + return 1; + } else if (our_age.tv_usec < your_age.tv_usec) { + crm_debug("Lose: %ld.%06ld vs %ld.%06ld (usec)", + (long)our_age.tv_sec, (long)our_age.tv_usec, (long)your_age.tv_sec, (long)your_age.tv_usec); + return -1; + } + + return 0; +} + +/*! + * \brief Start a new election by offering local node's candidacy + * + * Broadcast a "vote" election message containing the local node's ID, + * (incremented) election counter, and uptime, and start the election timer. + * + * \param[in,out] e Election object + * + * \note Any nodes agreeing to the candidacy will send a "no-vote" reply, and if + * all active peers do so, or if the election times out, the local node + * wins the election. (If we lose to any peer vote, we will stop the + * timer, so a timeout means we did not lose -- either some peer did not + * vote, or we did not call election_check() in time.) + */ +void +election_vote(election_t *e) +{ + struct timeval age; + xmlNode *vote = NULL; + crm_node_t *our_node; + + if (e == NULL) { + crm_trace("Election vote requested, but no election available"); + return; + } + + our_node = crm_get_peer(0, e->uname); + if ((our_node == NULL) || (crm_is_peer_active(our_node) == FALSE)) { + crm_trace("Cannot vote in %s yet: local node not connected to cluster", + e->name); + return; + } + + election_reset(e); + e->state = election_in_progress; + vote = create_request(CRM_OP_VOTE, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL); + + e->count++; + crm_xml_add(vote, F_CRM_ELECTION_OWNER, our_node->uuid); + crm_xml_add_int(vote, F_CRM_ELECTION_ID, e->count); + + get_uptime(&age); + crm_xml_add_timeval(vote, F_CRM_ELECTION_AGE_S, F_CRM_ELECTION_AGE_US, &age); + + send_cluster_message(NULL, crm_msg_crmd, vote, TRUE); + free_xml(vote); + + crm_debug("Started %s round %d", e->name, e->count); + election_timeout_start(e); + return; +} + +/*! + * \brief Check whether local node has won an election + * + * If all known peers have sent no-vote messages, stop the election timer, set + * the election state to won, and call any registered win callback. + * + * \param[in,out] e Election object + * + * \return TRUE if local node has won, FALSE otherwise + * \note If all known peers have sent no-vote messages, but the election owner + * does not call this function, the election will not be won (and the + * callback will not be called) until the election times out. + * \note This should be called when election_count_vote() returns + * \c election_in_progress. + */ +bool +election_check(election_t *e) +{ + int voted_size = 0; + int num_members = 0; + + if (e == NULL) { + crm_trace("Election check requested, but no election available"); + return FALSE; + } + if (e->voted == NULL) { + crm_trace("%s check requested, but no votes received yet", e->name); + return FALSE; + } + + voted_size = g_hash_table_size(e->voted); + num_members = crm_active_peers(); + + /* in the case of #voted > #members, it is better to + * wait for the timeout and give the cluster time to + * stabilize + */ + if (voted_size >= num_members) { + /* we won and everyone has voted */ + election_timeout_stop(e); + if (voted_size > num_members) { + GHashTableIter gIter; + const crm_node_t *node; + char *key = NULL; + + crm_warn("Received too many votes in %s", e->name); + g_hash_table_iter_init(&gIter, crm_peer_cache); + while (g_hash_table_iter_next(&gIter, NULL, (gpointer *) & node)) { + if (crm_is_peer_active(node)) { + crm_warn("* expected vote: %s", node->uname); + } + } + + g_hash_table_iter_init(&gIter, e->voted); + while (g_hash_table_iter_next(&gIter, (gpointer *) & key, NULL)) { + crm_warn("* actual vote: %s", key); + } + + } + + crm_info("%s won by local node", e->name); + election_complete(e); + return TRUE; + + } else { + crm_debug("%s still waiting on %d of %d votes", + e->name, num_members - voted_size, num_members); + } + + return FALSE; +} + +#define LOSS_DAMPEN 2 /* in seconds */ + +struct vote { + const char *op; + const char *from; + const char *version; + const char *election_owner; + int election_id; + struct timeval age; +}; + +/*! + * \brief Unpack an election message + * + * \param[in] e Election object (for logging only) + * \param[in] message Election message XML + * \param[out] vote Parsed fields from message + * + * \return TRUE if election message and election are valid, FALSE otherwise + * \note The parsed struct's pointer members are valid only for the lifetime of + * the message argument. + */ +static bool +parse_election_message(const election_t *e, const xmlNode *message, + struct vote *vote) +{ + CRM_CHECK(message && vote, return FALSE); + + vote->election_id = -1; + vote->age.tv_sec = -1; + vote->age.tv_usec = -1; + + vote->op = crm_element_value(message, F_CRM_TASK); + vote->from = crm_element_value(message, F_CRM_HOST_FROM); + vote->version = crm_element_value(message, F_CRM_VERSION); + vote->election_owner = crm_element_value(message, F_CRM_ELECTION_OWNER); + + crm_element_value_int(message, F_CRM_ELECTION_ID, &(vote->election_id)); + + if ((vote->op == NULL) || (vote->from == NULL) || (vote->version == NULL) + || (vote->election_owner == NULL) || (vote->election_id < 0)) { + + crm_warn("Invalid %s message from %s in %s ", + (vote->op? vote->op : "election"), + (vote->from? vote->from : "unspecified node"), + (e? e->name : "election")); + return FALSE; + } + + // Op-specific validation + + if (pcmk__str_eq(vote->op, CRM_OP_VOTE, pcmk__str_none)) { + // Only vote ops have uptime + crm_element_value_timeval(message, F_CRM_ELECTION_AGE_S, + F_CRM_ELECTION_AGE_US, &(vote->age)); + if ((vote->age.tv_sec < 0) || (vote->age.tv_usec < 0)) { + crm_warn("Cannot count %s %s from %s because it is missing uptime", + (e? e->name : "election"), vote->op, vote->from); + return FALSE; + } + + } else if (!pcmk__str_eq(vote->op, CRM_OP_NOVOTE, pcmk__str_none)) { + crm_info("Cannot process %s message from %s because %s is not a known election op", + (e? e->name : "election"), vote->from, vote->op); + return FALSE; + } + + // Election validation + + if (e == NULL) { + crm_info("Cannot count %s from %s because no election available", + vote->op, vote->from); + return FALSE; + } + + /* If the membership cache is NULL, we REALLY shouldn't be voting -- + * the question is how we managed to get here. + */ + if (crm_peer_cache == NULL) { + crm_info("Cannot count %s %s from %s because no peer information available", + e->name, vote->op, vote->from); + return FALSE; + } + return TRUE; +} + +static void +record_vote(election_t *e, struct vote *vote) +{ + char *voter_copy = NULL; + char *vote_copy = NULL; + + CRM_ASSERT(e && vote && vote->from && vote->op); + if (e->voted == NULL) { + e->voted = pcmk__strkey_table(free, free); + } + + voter_copy = strdup(vote->from); + vote_copy = strdup(vote->op); + CRM_ASSERT(voter_copy && vote_copy); + + g_hash_table_replace(e->voted, voter_copy, vote_copy); +} + +static void +send_no_vote(crm_node_t *peer, struct vote *vote) +{ + // @TODO probably shouldn't hardcode CRM_SYSTEM_CRMD and crm_msg_crmd + + xmlNode *novote = create_request(CRM_OP_NOVOTE, NULL, vote->from, + CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL); + + crm_xml_add(novote, F_CRM_ELECTION_OWNER, vote->election_owner); + crm_xml_add_int(novote, F_CRM_ELECTION_ID, vote->election_id); + + send_cluster_message(peer, crm_msg_crmd, novote, TRUE); + free_xml(novote); +} + +/*! + * \brief Process an election message (vote or no-vote) from a peer + * + * \param[in,out] e Election object + * \param[in] message Election message XML from peer + * \param[in] can_win Whether local node is eligible to win + * + * \return Election state after new vote is considered + * \note If the peer message is a vote, and we prefer the peer to win, this will + * send a no-vote reply to the peer. + * \note The situations "we lost to this vote" from "this is a late no-vote + * after we've already lost" both return election_lost. If a caller needs + * to distinguish them, it should save the current state before calling + * this function, and then compare the result. + */ +enum election_result +election_count_vote(election_t *e, const xmlNode *message, bool can_win) +{ + int log_level = LOG_INFO; + gboolean done = FALSE; + gboolean we_lose = FALSE; + const char *reason = "unknown"; + bool we_are_owner = FALSE; + crm_node_t *our_node = NULL, *your_node = NULL; + time_t tm_now = time(NULL); + struct vote vote; + + CRM_CHECK(message != NULL, return election_error); + if (parse_election_message(e, message, &vote) == FALSE) { + return election_error; + } + + your_node = crm_get_peer(0, vote.from); + our_node = crm_get_peer(0, e->uname); + we_are_owner = (our_node != NULL) + && pcmk__str_eq(our_node->uuid, vote.election_owner, + pcmk__str_none); + + if (!can_win) { + reason = "Not eligible"; + we_lose = TRUE; + + } else if (our_node == NULL || crm_is_peer_active(our_node) == FALSE) { + reason = "We are not part of the cluster"; + log_level = LOG_ERR; + we_lose = TRUE; + + } else if (we_are_owner && (vote.election_id != e->count)) { + log_level = LOG_TRACE; + reason = "Superseded"; + done = TRUE; + + } else if (your_node == NULL || crm_is_peer_active(your_node) == FALSE) { + /* Possibly we cached the message in the FSA queue at a point that it wasn't */ + reason = "Peer is not part of our cluster"; + log_level = LOG_WARNING; + done = TRUE; + + } else if (pcmk__str_eq(vote.op, CRM_OP_NOVOTE, pcmk__str_none) + || pcmk__str_eq(vote.from, e->uname, pcmk__str_none)) { + /* Receiving our own broadcast vote, or a no-vote from peer, is a vote + * for us to win + */ + if (!we_are_owner) { + crm_warn("Cannot count %s round %d %s from %s because we are not election owner (%s)", + e->name, vote.election_id, vote.op, vote.from, + vote.election_owner); + return election_error; + } + if (e->state != election_in_progress) { + // Should only happen if we already lost + crm_debug("Not counting %s round %d %s from %s because no election in progress", + e->name, vote.election_id, vote.op, vote.from); + return e->state; + } + record_vote(e, &vote); + reason = "Recorded"; + done = TRUE; + + } else { + // A peer vote requires a comparison to determine which node is better + int age_result = compare_age(vote.age); + int version_result = compare_version(vote.version, CRM_FEATURE_SET); + + if (version_result < 0) { + reason = "Version"; + we_lose = TRUE; + + } else if (version_result > 0) { + reason = "Version"; + + } else if (age_result < 0) { + reason = "Uptime"; + we_lose = TRUE; + + } else if (age_result > 0) { + reason = "Uptime"; + + } else if (strcasecmp(e->uname, vote.from) > 0) { + reason = "Host name"; + we_lose = TRUE; + + } else { + reason = "Host name"; + } + } + + if (e->expires < tm_now) { + e->election_wins = 0; + e->expires = tm_now + STORM_INTERVAL; + + } else if (done == FALSE && we_lose == FALSE) { + int peers = 1 + g_hash_table_size(crm_peer_cache); + + /* If every node has to vote down every other node, thats N*(N-1) total elections + * Allow some leeway before _really_ complaining + */ + e->election_wins++; + if (e->election_wins > (peers * peers)) { + crm_warn("%s election storm detected: %d wins in %d seconds", + e->name, e->election_wins, STORM_INTERVAL); + e->election_wins = 0; + e->expires = tm_now + STORM_INTERVAL; + if (e->wrote_blackbox == FALSE) { + /* It's questionable whether a black box (from every node in the + * cluster) would be truly helpful in diagnosing an election + * storm. It's also highly doubtful a production environment + * would get multiple election storms from distinct causes, so + * saving one blackbox per process lifetime should be + * sufficient. Alternatives would be to save a timestamp of the + * last blackbox write instead of a boolean, and write a new one + * if some amount of time has passed; or to save a storm count, + * write a blackbox on every Nth occurrence. + */ + crm_write_blackbox(0, NULL); + e->wrote_blackbox = TRUE; + } + } + } + + if (done) { + do_crm_log(log_level + 1, + "Processed %s round %d %s (current round %d) from %s (%s)", + e->name, vote.election_id, vote.op, e->count, vote.from, + reason); + return e->state; + + } else if (we_lose == FALSE) { + /* We track the time of the last election loss to implement an election + * dampening period, reducing the likelihood of an election storm. If + * this node has lost within the dampening period, don't start a new + * election, even if we win against a peer's vote -- the peer we lost to + * should win again. + * + * @TODO This has a problem case: if an election winner immediately + * leaves the cluster, and a new election is immediately called, all + * nodes could lose, with no new winner elected. The ideal solution + * would be to tie the election structure with the peer caches, which + * would allow us to clear the dampening when the previous winner + * leaves (and would allow other improvements as well). + */ + if ((e->last_election_loss == 0) + || ((tm_now - e->last_election_loss) > (time_t) LOSS_DAMPEN)) { + + do_crm_log(log_level, "%s round %d (owner node ID %s) pass: %s from %s (%s)", + e->name, vote.election_id, vote.election_owner, vote.op, + vote.from, reason); + + e->last_election_loss = 0; + election_timeout_stop(e); + + /* Start a new election by voting down this, and other, peers */ + e->state = election_start; + return e->state; + } else { + char *loss_time = ctime(&e->last_election_loss); + + if (loss_time) { + // Show only HH:MM:SS + loss_time += 11; + loss_time[8] = '\0'; + } + crm_info("Ignoring %s round %d (owner node ID %s) pass vs %s because we lost less than %ds ago at %s", + e->name, vote.election_id, vote.election_owner, vote.from, + LOSS_DAMPEN, (loss_time? loss_time : "unknown")); + } + } + + e->last_election_loss = tm_now; + + do_crm_log(log_level, "%s round %d (owner node ID %s) lost: %s from %s (%s)", + e->name, vote.election_id, vote.election_owner, vote.op, + vote.from, reason); + + election_reset(e); + send_no_vote(your_node, &vote); + e->state = election_lost; + return e->state; +} + +/*! + * \brief Reset any election dampening currently in effect + * + * \param[in,out] e Election object to clear + */ +void +election_clear_dampening(election_t *e) +{ + e->last_election_loss = 0; +} |