summaryrefslogtreecommitdiffstats
path: root/daemons/controld/controld_timers.c
diff options
context:
space:
mode:
Diffstat (limited to 'daemons/controld/controld_timers.c')
-rw-r--r--daemons/controld/controld_timers.c509
1 files changed, 509 insertions, 0 deletions
diff --git a/daemons/controld/controld_timers.c b/daemons/controld/controld_timers.c
new file mode 100644
index 0000000..a65bef5
--- /dev/null
+++ b/daemons/controld/controld_timers.c
@@ -0,0 +1,509 @@
+/*
+ * Copyright 2004-2022 the Pacemaker project contributors
+ *
+ * The version control history for this file may have further details.
+ *
+ * This source code is licensed under the GNU General Public License version 2
+ * or later (GPLv2+) WITHOUT ANY WARRANTY.
+ */
+
+#include <crm_internal.h>
+
+#include <time.h>
+#include <stdlib.h>
+
+#include <crm/crm.h>
+#include <crm/msg_xml.h>
+#include <pacemaker-controld.h>
+
+//! FSA mainloop timer type (one instance per controller timer)
+typedef struct fsa_timer_s {
+ guint source_id; //!< Mainloop source ID (0 while the timer is not running)
+ guint period_ms; //!< Timer period in milliseconds (0 means do not start)
+ enum crmd_fsa_input fsa_input; //!< Input to register if timer pops
+ gboolean (*callback) (gpointer data); //!< What to do if timer pops
+ bool log_error; //!< Timer popping indicates error
+ int counter; //!< Count of non-error pops, for detecting loops
+} fsa_timer_t;
+
+//! Wait before retrying a failed CIB or executor connection
+static fsa_timer_t *wait_timer = NULL;
+
+//! Periodically re-run scheduler (for date_spec evaluation and as a failsafe)
+static fsa_timer_t *recheck_timer = NULL;
+
+//! Wait at start-up, or after an election, for DC to make contact
+static fsa_timer_t *election_timer = NULL;
+
+//! Delay start of new transition with expectation something else might happen
+static fsa_timer_t *transition_timer = NULL;
+
+//! join-integration-timeout
+static fsa_timer_t *integration_timer = NULL;
+
+//! join-finalization-timeout
+static fsa_timer_t *finalization_timer = NULL;
+
+//! Wait for DC to stop all resources and give us the all-clear to shut down
+fsa_timer_t *shutdown_escalation_timer = NULL;
+
+//! Cluster recheck interval in milliseconds (from configuration)
+static guint recheck_interval_ms = 0;
+
+/*!
+ * \internal
+ * \brief Get a human-readable description of an FSA timer
+ *
+ * \param[in] timer  Timer to describe (compared by address against the
+ *                   timers defined in this file)
+ *
+ * \return Constant description string, or "Unknown Timer" if \p timer is
+ *         not one of this file's timers
+ */
+static const char *
+get_timer_desc(fsa_timer_t * timer)
+{
+    // Entries are checked in order, so the first matching timer wins
+    const struct {
+        const fsa_timer_t *known;
+        const char *desc;
+    } descriptions[] = {
+        { election_timer, "Election Trigger" },
+        { shutdown_escalation_timer, "Shutdown Escalation" },
+        { integration_timer, "Integration Timer" },
+        { finalization_timer, "Finalization Timer" },
+        { transition_timer, "New Transition Timer" },
+        { wait_timer, "Wait Timer" },
+        { recheck_timer, "Cluster Recheck Timer" },
+    };
+    const size_t n_descriptions = sizeof(descriptions)
+                                  / sizeof(descriptions[0]);
+
+    for (size_t i = 0; i < n_descriptions; i++) {
+        if (timer == descriptions[i].known) {
+            return descriptions[i].desc;
+        }
+    }
+    return "Unknown Timer";
+}
+
+/*!
+ * \internal
+ * \brief Stop an FSA timer
+ *
+ * Removes the timer's mainloop source (if any) and resets its source ID to 0,
+ * so that a later start request creates a fresh source.
+ *
+ * \param[in,out] timer  Timer to stop
+ *
+ * \return true if the timer was running, or false otherwise (including when
+ *         \p timer is NULL)
+ */
+static bool
+controld_stop_timer(fsa_timer_t *timer)
+{
+    CRM_CHECK(timer != NULL, return false);
+
+    if (timer->source_id == 0) {
+        crm_trace("%s already stopped (would inject %s if popped after %ums)",
+                  get_timer_desc(timer), fsa_input2string(timer->fsa_input),
+                  timer->period_ms);
+        return false;
+    }
+
+    // source_id is a guint, so it must be formatted with %u rather than %d
+    crm_trace("Stopping %s (would inject %s if popped after %ums, src=%u)",
+              get_timer_desc(timer), fsa_input2string(timer->fsa_input),
+              timer->period_ms, timer->source_id);
+    g_source_remove(timer->source_id);
+    timer->source_id = 0;
+    return true;
+}
+
+/*!
+ * \internal
+ * \brief Start an FSA timer
+ *
+ * Does nothing (other than logging) if the timer is already running or if
+ * its period is 0.
+ *
+ * \param[in,out] timer  Timer to start
+ */
+static void
+controld_start_timer(fsa_timer_t *timer)
+{
+    CRM_CHECK(timer != NULL, return);
+
+    if (timer->source_id != 0) {
+        crm_debug("%s already running (inject %s if pops after %ums, source=%u)",
+                  get_timer_desc(timer), fsa_input2string(timer->fsa_input),
+                  timer->period_ms, timer->source_id);
+        return;
+    }
+
+    if (timer->period_ms == 0) {
+        // A zero period means the timer is disabled, not that it is running
+        crm_debug("Not starting %s (would inject %s) because its period is 0ms",
+                  get_timer_desc(timer), fsa_input2string(timer->fsa_input));
+        return;
+    }
+
+    timer->source_id = g_timeout_add(timer->period_ms, timer->callback,
+                                     (void *) timer);
+    CRM_ASSERT(timer->source_id != 0);
+    crm_debug("Started %s (inject %s if pops after %ums, source=%u)",
+              get_timer_desc(timer), fsa_input2string(timer->fsa_input),
+              timer->period_ms, timer->source_id);
+}
+
+/*!
+ * \internal
+ * \brief FSA action handler that stops and/or starts the join/election timers
+ *
+ * Handles A_DC_TIMER_STOP, A_DC_TIMER_START,
+ * A_FINALIZE_TIMER_STOP, A_FINALIZE_TIMER_START,
+ * A_INTEGRATE_TIMER_STOP, and A_INTEGRATE_TIMER_START.
+ *
+ * At most one stop request and at most one start request are honored per
+ * call, each chosen in the order DC, finalize, integrate.
+ *
+ * \param[in] action         Bitmask of requested FSA actions
+ * \param[in] cause          Cause to use if an election input is registered
+ * \param[in] cur_state      Ignored
+ * \param[in] current_input  Ignored
+ * \param[in] msg_data       Ignored
+ */
+void
+do_timer_control(long long action,
+                 enum crmd_fsa_cause cause,
+                 enum crmd_fsa_state cur_state,
+                 enum crmd_fsa_input current_input, fsa_data_t * msg_data)
+{
+    gboolean stop_result = TRUE;
+
+    // Stop phase: remember whether the stopped timer had been running
+    if (action & A_DC_TIMER_STOP) {
+        stop_result = controld_stop_timer(election_timer);
+    } else if (action & A_FINALIZE_TIMER_STOP) {
+        stop_result = controld_stop_timer(finalization_timer);
+    } else if (action & A_INTEGRATE_TIMER_STOP) {
+        stop_result = controld_stop_timer(integration_timer);
+    }
+
+    /* Start phase: the election timer is started only if the stop phase
+     * succeeded (or did nothing); otherwise fall through to the other timers
+     */
+    if ((action & A_DC_TIMER_START) && stop_result) {
+        controld_start_timer(election_timer);
+        if (AM_I_DC) {
+            // There can be only one DC
+            register_fsa_input(cause, I_ELECTION, NULL);
+        }
+        return;
+    }
+
+    if (action & A_FINALIZE_TIMER_START) {
+        controld_start_timer(finalization_timer);
+        return;
+    }
+
+    if (action & A_INTEGRATE_TIMER_START) {
+        controld_start_timer(integration_timer);
+    }
+}
+
+/*!
+ * \internal
+ * \brief Mainloop callback run when an FSA timer pops
+ *
+ * Logs the event, stops the timer so that it does not fire again, registers
+ * the timer's FSA input (unless the event is discarded for the current FSA
+ * state or the input is I_NULL), and triggers the FSA.
+ *
+ * \param[in,out] data  Timer that popped (an fsa_timer_t)
+ *
+ * \return TRUE always
+ */
+static gboolean
+crm_timer_popped(gpointer data)
+{
+    fsa_timer_t *popped = (fsa_timer_t *) data;
+
+    if (!popped->log_error) {
+        crm_info("%s just popped " CRM_XS " input=%s time=%ums",
+                 get_timer_desc(popped), fsa_input2string(popped->fsa_input),
+                 popped->period_ms);
+        popped->counter++;
+    } else {
+        crm_err("%s just popped in state %s! " CRM_XS " input=%s time=%ums",
+                get_timer_desc(popped),
+                fsa_state2string(controld_globals.fsa_state),
+                fsa_input2string(popped->fsa_input), popped->period_ms);
+    }
+
+    // Too many election timer pops in a row suggests an election loop
+    if ((popped == election_timer) && (election_timer->counter > 5)) {
+        crm_notice("We appear to be in an election loop, something may be wrong");
+        crm_write_blackbox(0, NULL);
+        election_timer->counter = 0;
+    }
+
+    controld_stop_timer(popped); // Make timer _not_ go off again
+
+    if (popped->fsa_input == I_INTEGRATED) {
+        crm_info("Welcomed: %d, Integrated: %d",
+                 crmd_join_phase_count(crm_join_welcomed),
+                 crmd_join_phase_count(crm_join_integrated));
+        if (crmd_join_phase_count(crm_join_welcomed) != 0) {
+            register_fsa_input_before(C_TIMER_POPPED, popped->fsa_input, NULL);
+
+        } else {
+            // If we don't even have ourselves, start again
+            register_fsa_error_adv(C_FSA_INTERNAL, I_ELECTION, NULL, NULL,
+                                   __func__);
+        }
+
+    } else if (((popped == recheck_timer)
+                && (controld_globals.fsa_state != S_IDLE))
+               || ((popped == finalization_timer)
+                   && (controld_globals.fsa_state != S_FINALIZE_JOIN))) {
+        // These timers are relevant only in one particular FSA state
+        crm_debug("Discarding %s event in state: %s",
+                  fsa_input2string(popped->fsa_input),
+                  fsa_state2string(controld_globals.fsa_state));
+
+    } else if (popped->fsa_input != I_NULL) {
+        register_fsa_input(C_TIMER_POPPED, popped->fsa_input, NULL);
+    }
+
+    controld_trigger_fsa();
+
+    return TRUE;
+}
+
+/*!
+ * \internal
+ * \brief Allocate an FSA timer that is configured but not running
+ *
+ * \param[in] period_ms  Initial timer period in milliseconds
+ * \param[in] fsa_input  Input to register if the timer pops
+ * \param[in] log_error  Whether the timer popping indicates an error
+ *
+ * \return Newly allocated timer (to be freed with free()), or NULL if memory
+ *         allocation failed
+ */
+static fsa_timer_t *
+create_fsa_timer(guint period_ms, enum crmd_fsa_input fsa_input,
+                 bool log_error)
+{
+    // calloc() leaves source_id and counter at 0
+    fsa_timer_t *timer = calloc(1, sizeof(*timer));
+
+    if (timer != NULL) {
+        timer->period_ms = period_ms;
+        timer->fsa_input = fsa_input;
+        timer->callback = crm_timer_popped;
+        timer->log_error = log_error;
+    }
+    return timer;
+}
+
+/*!
+ * \internal
+ * \brief Allocate and initialize all FSA timers
+ *
+ * All timers start out stopped. Only the wait timer has a nonzero initial
+ * period.
+ *
+ * \return true on success, or false if memory allocation failed (in which
+ *         case no timers remain allocated)
+ */
+bool
+controld_init_fsa_timers(void)
+{
+    election_timer = create_fsa_timer(0, I_DC_TIMEOUT, false);
+    transition_timer = create_fsa_timer(0, I_PE_CALC, false);
+    integration_timer = create_fsa_timer(0, I_INTEGRATED, true);
+
+    /* We can't use I_FINALIZED here, because that creates a bug in the join
+     * process where a joining node can be stuck in S_PENDING while we think it
+     * is in S_NOT_DC. This created an infinite transition loop in which we
+     * continually send probes which the node NACKs because it's pending.
+     *
+     * If we have nodes where the cluster layer is active but the controller is
+     * not, we can avoid this causing an election/join loop, in the integration
+     * phase.
+     */
+    finalization_timer = create_fsa_timer(0, I_ELECTION, false);
+
+    shutdown_escalation_timer = create_fsa_timer(0, I_STOP, true);
+    wait_timer = create_fsa_timer(2000, I_NULL, false); // Retry after 2s
+    recheck_timer = create_fsa_timer(0, I_PE_CALC, false);
+
+    if ((election_timer != NULL) && (transition_timer != NULL)
+        && (integration_timer != NULL) && (finalization_timer != NULL)
+        && (shutdown_escalation_timer != NULL) && (wait_timer != NULL)
+        && (recheck_timer != NULL)) {
+        return true;
+    }
+
+    // Don't leak the timers that were successfully allocated
+    free(election_timer); election_timer = NULL;
+    free(transition_timer); transition_timer = NULL;
+    free(integration_timer); integration_timer = NULL;
+    free(finalization_timer); finalization_timer = NULL;
+    free(shutdown_escalation_timer); shutdown_escalation_timer = NULL;
+    free(wait_timer); wait_timer = NULL;
+    free(recheck_timer); recheck_timer = NULL;
+    return false;
+}
+
+/*!
+ * \internal
+ * \brief Configure timers based on the CIB
+ *
+ * Each period is parsed from the corresponding option value with
+ * crm_parse_interval_spec(). The recheck interval is only stored here; the
+ * recheck timer's period is set when that timer is started.
+ *
+ * \param[in,out] options  Name/value pairs for configured options
+ */
+void
+controld_configure_fsa_timers(GHashTable *options)
+{
+    const char *value = NULL;
+
+    // Election timer
+    value = g_hash_table_lookup(options, XML_CONFIG_ATTR_DC_DEADTIME);
+    election_timer->period_ms = crm_parse_interval_spec(value);
+
+    // Integration timer
+    value = g_hash_table_lookup(options, "join-integration-timeout");
+    integration_timer->period_ms = crm_parse_interval_spec(value);
+
+    // Finalization timer
+    value = g_hash_table_lookup(options, "join-finalization-timeout");
+    finalization_timer->period_ms = crm_parse_interval_spec(value);
+
+    // Shutdown escalation timer
+    value = g_hash_table_lookup(options, XML_CONFIG_ATTR_FORCE_QUIT);
+    shutdown_escalation_timer->period_ms = crm_parse_interval_spec(value);
+    crm_debug("Shutdown escalation occurs if DC has not responded to request "
+              "in %ums", shutdown_escalation_timer->period_ms);
+
+    // Transition timer
+    value = g_hash_table_lookup(options, "transition-delay");
+    transition_timer->period_ms = crm_parse_interval_spec(value);
+
+    // Recheck interval (a guint, so it must be formatted with %u, not %d)
+    value = g_hash_table_lookup(options, XML_CONFIG_ATTR_RECHECK);
+    recheck_interval_ms = crm_parse_interval_spec(value);
+    crm_debug("Re-run scheduler after %ums of inactivity", recheck_interval_ms);
+}
+
+/*!
+ * \internal
+ * \brief Stop and free all FSA timers
+ *
+ * Every timer is stopped before any timer is freed, and each timer pointer is
+ * reset to NULL afterward.
+ */
+void
+controld_free_fsa_timers(void)
+{
+    fsa_timer_t **all_timers[] = {
+        &transition_timer,
+        &integration_timer,
+        &finalization_timer,
+        &election_timer,
+        &shutdown_escalation_timer,
+        &wait_timer,
+        &recheck_timer,
+    };
+    const size_t n_timers = sizeof(all_timers) / sizeof(all_timers[0]);
+
+    for (size_t i = 0; i < n_timers; i++) {
+        controld_stop_timer(*all_timers[i]);
+    }
+
+    for (size_t i = 0; i < n_timers; i++) {
+        free(*all_timers[i]);
+        *all_timers[i] = NULL;
+    }
+}
+
+/*!
+ * \internal
+ * \brief Check whether the transition timer is started
+ *
+ * The timer counts as started only if it has both a nonzero period and an
+ * active mainloop source.
+ *
+ * \return true if the transition timer is started, or false otherwise
+ */
+bool
+controld_is_started_transition_timer(void)
+{
+    if (transition_timer->period_ms == 0) {
+        return false;
+    }
+    return transition_timer->source_id != 0;
+}
+
+/*!
+ * \internal
+ * \brief Start the recheck timer
+ *
+ * The period is the recheck interval configured in the CIB, shortened to the
+ * transition graph's "recheck by" time if that is sooner. Nothing is started
+ * if the resulting period is 0 (which is always the case when the configured
+ * interval is 0).
+ */
+void
+controld_start_recheck_timer(void)
+{
+    // Default to recheck interval configured in CIB (if any)
+    guint period_ms = recheck_interval_ms;
+
+    // If scheduler supplied a "recheck by" time, check whether that's sooner
+    if (controld_globals.transition_graph->recheck_by > 0) {
+        time_t diff_seconds = controld_globals.transition_graph->recheck_by
+                              - time(NULL);
+
+        if (diff_seconds < 1) {
+            // We're already past the desired time
+            period_ms = 500;
+
+        } else if (diff_seconds > (time_t) (G_MAXUINT / 1000)) {
+            /* Converting to milliseconds would overflow a guint and wrap to a
+             * bogus (possibly very short) period, so clamp instead
+             */
+            period_ms = G_MAXUINT;
+
+        } else {
+            period_ms = (guint) diff_seconds * 1000;
+        }
+
+        // Use "recheck by" only if it's sooner than interval from CIB
+        if (period_ms > recheck_interval_ms) {
+            period_ms = recheck_interval_ms;
+        }
+    }
+
+    if (period_ms > 0) {
+        recheck_timer->period_ms = period_ms;
+        controld_start_timer(recheck_timer);
+    }
+}
+
+/*!
+ * \internal
+ * \brief Start the wait timer (no effect if it is already running)
+ */
+void
+controld_start_wait_timer(void)
+{
+    fsa_timer_t *timer = wait_timer;
+
+    controld_start_timer(timer);
+}
+
+/*!
+ * \internal
+ * \brief Stop the recheck timer
+ *
+ * \return true if the recheck timer had been running, or false otherwise
+ */
+bool
+controld_stop_recheck_timer(void)
+{
+    const bool was_running = controld_stop_timer(recheck_timer);
+
+    return was_running;
+}
+
+/*!
+ * \brief Get the period currently set for the transition timer
+ *
+ * \return The transition timer's period in milliseconds
+ */
+guint
+controld_get_period_transition_timer(void)
+{
+    const guint period_ms = transition_timer->period_ms;
+
+    return period_ms;
+}
+
+/*!
+ * \internal
+ * \brief Clear the election timer's pop counter (used for loop detection)
+ */
+void
+controld_reset_counter_election_timer(void)
+{
+    fsa_timer_t *timer = election_timer;
+
+    timer->counter = 0;
+}
+
+/*!
+ * \internal
+ * \brief Stop the transition timer
+ *
+ * \return true if the transition timer had been running, or false otherwise
+ */
+bool
+controld_stop_transition_timer(void)
+{
+    const bool was_running = controld_stop_timer(transition_timer);
+
+    return was_running;
+}
+
+/*!
+ * \internal
+ * \brief Start the transition timer (no effect if it is already running)
+ */
+void
+controld_start_transition_timer(void)
+{
+    fsa_timer_t *timer = transition_timer;
+
+    controld_start_timer(timer);
+}
+
+/*!
+ * \internal
+ * \brief Begin the shutdown countdown by starting the escalation timer
+ *
+ * \param[in] default_period_ms  Period to give the shutdown escalation timer
+ *                               if its current period is 0
+ */
+void
+controld_shutdown_start_countdown(guint default_period_ms)
+{
+    fsa_timer_t *timer = shutdown_escalation_timer;
+
+    // Fall back to the caller's default only when no period is set
+    if (timer->period_ms == 0) {
+        timer->period_ms = default_period_ms;
+    }
+
+    crm_notice("Initiating controller shutdown sequence " CRM_XS " limit=%ums",
+               timer->period_ms);
+    controld_start_timer(timer);
+}