diff options
Diffstat (limited to 'daemons/controld/controld_timers.c')
-rw-r--r-- | daemons/controld/controld_timers.c | 509 |
1 files changed, 509 insertions, 0 deletions
diff --git a/daemons/controld/controld_timers.c b/daemons/controld/controld_timers.c new file mode 100644 index 0000000..a65bef5 --- /dev/null +++ b/daemons/controld/controld_timers.c @@ -0,0 +1,509 @@ +/* + * Copyright 2004-2022 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * + * This source code is licensed under the GNU General Public License version 2 + * or later (GPLv2+) WITHOUT ANY WARRANTY. + */ + +#include <crm_internal.h> + +#include <time.h> +#include <stdlib.h> + +#include <crm/crm.h> +#include <crm/msg_xml.h> +#include <pacemaker-controld.h> + +//! FSA mainloop timer type +typedef struct fsa_timer_s { + guint source_id; //!< Timer source ID + guint period_ms; //!< Timer period + enum crmd_fsa_input fsa_input; //!< Input to register if timer pops + gboolean (*callback) (gpointer data); //!< What do if timer pops + bool log_error; //!< Timer popping indicates error + int counter; //!< For detecting loops +} fsa_timer_t; + +//! Wait before retrying a failed cib or executor connection +static fsa_timer_t *wait_timer = NULL; + +//! Periodically re-run scheduler (for date_spec evaluation and as a failsafe) +static fsa_timer_t *recheck_timer = NULL; + +//! Wait at start-up, or after an election, for DC to make contact +static fsa_timer_t *election_timer = NULL; + +//! Delay start of new transition with expectation something else might happen +static fsa_timer_t *transition_timer = NULL; + +//! join-integration-timeout +static fsa_timer_t *integration_timer = NULL; + +//! join-finalization-timeout +static fsa_timer_t *finalization_timer = NULL; + +// Wait for DC to stop all resources and give us the all-clear to shut down +fsa_timer_t *shutdown_escalation_timer = NULL; + +//! Cluster recheck interval (from configuration) +static guint recheck_interval_ms = 0; + +static const char * +get_timer_desc(fsa_timer_t * timer) +{ + if (timer == election_timer) { + return "Election Trigger"; + + } else if (timer == shutdown_escalation_timer) { + return "Shutdown Escalation"; + + } else if (timer == integration_timer) { + return "Integration Timer"; + + } else if (timer == finalization_timer) { + return "Finalization Timer"; + + } else if (timer == transition_timer) { + return "New Transition Timer"; + + } else if (timer == wait_timer) { + return "Wait Timer"; + + } else if (timer == recheck_timer) { + return "Cluster Recheck Timer"; + + } + return "Unknown Timer"; +} + +/*! + * \internal + * \brief Stop an FSA timer + * + * \param[in,out] timer Timer to stop + * + * \return true if the timer was running, or false otherwise + */ +static bool +controld_stop_timer(fsa_timer_t *timer) +{ + CRM_CHECK(timer != NULL, return false); + + if (timer->source_id != 0) { + crm_trace("Stopping %s (would inject %s if popped after %ums, src=%d)", + get_timer_desc(timer), fsa_input2string(timer->fsa_input), + timer->period_ms, timer->source_id); + g_source_remove(timer->source_id); + timer->source_id = 0; + + } else { + crm_trace("%s already stopped (would inject %s if popped after %ums)", + get_timer_desc(timer), fsa_input2string(timer->fsa_input), + timer->period_ms); + return false; + } + return true; +} + +/*! + * \internal + * \brief Start an FSA timer + * + * \param[in,out] timer Timer to start + */ +static void +controld_start_timer(fsa_timer_t *timer) +{ + if (timer->source_id == 0 && timer->period_ms > 0) { + timer->source_id = g_timeout_add(timer->period_ms, timer->callback, (void *)timer); + CRM_ASSERT(timer->source_id != 0); + crm_debug("Started %s (inject %s if pops after %ums, source=%d)", + get_timer_desc(timer), fsa_input2string(timer->fsa_input), + timer->period_ms, timer->source_id); + } else { + crm_debug("%s already running (inject %s if pops after %ums, source=%d)", + get_timer_desc(timer), fsa_input2string(timer->fsa_input), + timer->period_ms, timer->source_id); + } +} + +/* A_DC_TIMER_STOP, A_DC_TIMER_START, + * A_FINALIZE_TIMER_STOP, A_FINALIZE_TIMER_START + * A_INTEGRATE_TIMER_STOP, A_INTEGRATE_TIMER_START + */ +void +do_timer_control(long long action, + enum crmd_fsa_cause cause, + enum crmd_fsa_state cur_state, + enum crmd_fsa_input current_input, fsa_data_t * msg_data) +{ + gboolean timer_op_ok = TRUE; + + if (action & A_DC_TIMER_STOP) { + timer_op_ok = controld_stop_timer(election_timer); + + } else if (action & A_FINALIZE_TIMER_STOP) { + timer_op_ok = controld_stop_timer(finalization_timer); + + } else if (action & A_INTEGRATE_TIMER_STOP) { + timer_op_ok = controld_stop_timer(integration_timer); + } + + /* don't start a timer that wasn't already running */ + if (action & A_DC_TIMER_START && timer_op_ok) { + controld_start_timer(election_timer); + if (AM_I_DC) { + /* there can be only one */ + register_fsa_input(cause, I_ELECTION, NULL); + } + + } else if (action & A_FINALIZE_TIMER_START) { + controld_start_timer(finalization_timer); + + } else if (action & A_INTEGRATE_TIMER_START) { + controld_start_timer(integration_timer); + } +} + +static gboolean +crm_timer_popped(gpointer data) +{ + fsa_timer_t *timer = (fsa_timer_t *) data; + + if (timer->log_error) { + crm_err("%s just popped in state %s! " CRM_XS " input=%s time=%ums", + get_timer_desc(timer), + fsa_state2string(controld_globals.fsa_state), + fsa_input2string(timer->fsa_input), timer->period_ms); + } else { + crm_info("%s just popped " CRM_XS " input=%s time=%ums", + get_timer_desc(timer), fsa_input2string(timer->fsa_input), + timer->period_ms); + timer->counter++; + } + + if ((timer == election_timer) && (election_timer->counter > 5)) { + crm_notice("We appear to be in an election loop, something may be wrong"); + crm_write_blackbox(0, NULL); + election_timer->counter = 0; + } + + controld_stop_timer(timer); // Make timer _not_ go off again + + if (timer->fsa_input == I_INTEGRATED) { + crm_info("Welcomed: %d, Integrated: %d", + crmd_join_phase_count(crm_join_welcomed), + crmd_join_phase_count(crm_join_integrated)); + if (crmd_join_phase_count(crm_join_welcomed) == 0) { + // If we don't even have ourselves, start again + register_fsa_error_adv(C_FSA_INTERNAL, I_ELECTION, NULL, NULL, + __func__); + + } else { + register_fsa_input_before(C_TIMER_POPPED, timer->fsa_input, NULL); + } + + } else if ((timer == recheck_timer) + && (controld_globals.fsa_state != S_IDLE)) { + crm_debug("Discarding %s event in state: %s", + fsa_input2string(timer->fsa_input), + fsa_state2string(controld_globals.fsa_state)); + + } else if ((timer == finalization_timer) + && (controld_globals.fsa_state != S_FINALIZE_JOIN)) { + crm_debug("Discarding %s event in state: %s", + fsa_input2string(timer->fsa_input), + fsa_state2string(controld_globals.fsa_state)); + + } else if (timer->fsa_input != I_NULL) { + register_fsa_input(C_TIMER_POPPED, timer->fsa_input, NULL); + } + + controld_trigger_fsa(); + + return TRUE; +} + +bool +controld_init_fsa_timers(void) +{ + transition_timer = calloc(1, sizeof(fsa_timer_t)); + if (transition_timer == NULL) { + return FALSE; + } + + integration_timer = calloc(1, sizeof(fsa_timer_t)); + if (integration_timer == NULL) { + return FALSE; + } + + finalization_timer = calloc(1, sizeof(fsa_timer_t)); + if (finalization_timer == NULL) { + return FALSE; + } + + election_timer = calloc(1, sizeof(fsa_timer_t)); + if (election_timer == NULL) { + return FALSE; + } + + shutdown_escalation_timer = calloc(1, sizeof(fsa_timer_t)); + if (shutdown_escalation_timer == NULL) { + return FALSE; + } + + wait_timer = calloc(1, sizeof(fsa_timer_t)); + if (wait_timer == NULL) { + return FALSE; + } + + recheck_timer = calloc(1, sizeof(fsa_timer_t)); + if (recheck_timer == NULL) { + return FALSE; + } + + election_timer->source_id = 0; + election_timer->period_ms = 0; + election_timer->fsa_input = I_DC_TIMEOUT; + election_timer->callback = crm_timer_popped; + election_timer->log_error = FALSE; + + transition_timer->source_id = 0; + transition_timer->period_ms = 0; + transition_timer->fsa_input = I_PE_CALC; + transition_timer->callback = crm_timer_popped; + transition_timer->log_error = FALSE; + + integration_timer->source_id = 0; + integration_timer->period_ms = 0; + integration_timer->fsa_input = I_INTEGRATED; + integration_timer->callback = crm_timer_popped; + integration_timer->log_error = TRUE; + + finalization_timer->source_id = 0; + finalization_timer->period_ms = 0; + finalization_timer->fsa_input = I_FINALIZED; + finalization_timer->callback = crm_timer_popped; + finalization_timer->log_error = FALSE; + + /* We can't use I_FINALIZED here, because that creates a bug in the join + * process where a joining node can be stuck in S_PENDING while we think it + * is in S_NOT_DC. This created an infinite transition loop in which we + * continually send probes which the node NACKs because it's pending. + * + * If we have nodes where the cluster layer is active but the controller is + * not, we can avoid this causing an election/join loop, in the integration + * phase. + */ + finalization_timer->fsa_input = I_ELECTION; + + shutdown_escalation_timer->source_id = 0; + shutdown_escalation_timer->period_ms = 0; + shutdown_escalation_timer->fsa_input = I_STOP; + shutdown_escalation_timer->callback = crm_timer_popped; + shutdown_escalation_timer->log_error = TRUE; + + wait_timer->source_id = 0; + wait_timer->period_ms = 2000; + wait_timer->fsa_input = I_NULL; + wait_timer->callback = crm_timer_popped; + wait_timer->log_error = FALSE; + + recheck_timer->source_id = 0; + recheck_timer->period_ms = 0; + recheck_timer->fsa_input = I_PE_CALC; + recheck_timer->callback = crm_timer_popped; + recheck_timer->log_error = FALSE; + + return TRUE; +} + +/*! + * \internal + * \brief Configure timers based on the CIB + * + * \param[in,out] options Name/value pairs for configured options + */ +void +controld_configure_fsa_timers(GHashTable *options) +{ + const char *value = NULL; + + // Election timer + value = g_hash_table_lookup(options, XML_CONFIG_ATTR_DC_DEADTIME); + election_timer->period_ms = crm_parse_interval_spec(value); + + // Integration timer + value = g_hash_table_lookup(options, "join-integration-timeout"); + integration_timer->period_ms = crm_parse_interval_spec(value); + + // Finalization timer + value = g_hash_table_lookup(options, "join-finalization-timeout"); + finalization_timer->period_ms = crm_parse_interval_spec(value); + + // Shutdown escalation timer + value = g_hash_table_lookup(options, XML_CONFIG_ATTR_FORCE_QUIT); + shutdown_escalation_timer->period_ms = crm_parse_interval_spec(value); + crm_debug("Shutdown escalation occurs if DC has not responded to request " + "in %ums", shutdown_escalation_timer->period_ms); + + // Transition timer + value = g_hash_table_lookup(options, "transition-delay"); + transition_timer->period_ms = crm_parse_interval_spec(value); + + // Recheck interval + value = g_hash_table_lookup(options, XML_CONFIG_ATTR_RECHECK); + recheck_interval_ms = crm_parse_interval_spec(value); + crm_debug("Re-run scheduler after %dms of inactivity", recheck_interval_ms); +} + +void +controld_free_fsa_timers(void) +{ + controld_stop_timer(transition_timer); + controld_stop_timer(integration_timer); + controld_stop_timer(finalization_timer); + controld_stop_timer(election_timer); + controld_stop_timer(shutdown_escalation_timer); + controld_stop_timer(wait_timer); + controld_stop_timer(recheck_timer); + + free(transition_timer); transition_timer = NULL; + free(integration_timer); integration_timer = NULL; + free(finalization_timer); finalization_timer = NULL; + free(election_timer); election_timer = NULL; + free(shutdown_escalation_timer); shutdown_escalation_timer = NULL; + free(wait_timer); wait_timer = NULL; + free(recheck_timer); recheck_timer = NULL; +} + +/*! + * \internal + * \brief Check whether the transition timer is started + * \return true if the transition timer is started, or false otherwise + */ +bool +controld_is_started_transition_timer(void) +{ + return (transition_timer->period_ms > 0) + && (transition_timer->source_id != 0); +} + +/*! + * \internal + * \brief Start the recheck timer + */ +void +controld_start_recheck_timer(void) +{ + // Default to recheck interval configured in CIB (if any) + guint period_ms = recheck_interval_ms; + + // If scheduler supplied a "recheck by" time, check whether that's sooner + if (controld_globals.transition_graph->recheck_by > 0) { + time_t diff_seconds = controld_globals.transition_graph->recheck_by + - time(NULL); + + if (diff_seconds < 1) { + // We're already past the desired time + period_ms = 500; + } else { + period_ms = (guint) diff_seconds * 1000; + } + + // Use "recheck by" only if it's sooner than interval from CIB + if (period_ms > recheck_interval_ms) { + period_ms = recheck_interval_ms; + } + } + + if (period_ms > 0) { + recheck_timer->period_ms = period_ms; + controld_start_timer(recheck_timer); + } +} + +/*! + * \internal + * \brief Start the wait timer + */ +void +controld_start_wait_timer(void) +{ + controld_start_timer(wait_timer); +} + +/*! + * \internal + * \brief Stop the recheck timer + * + * \return true if the recheck timer was running, or false otherwise + */ +bool +controld_stop_recheck_timer(void) +{ + return controld_stop_timer(recheck_timer); +} + +/*! + * \brief Get the transition timer's configured period + * \return The transition_timer's period + */ +guint +controld_get_period_transition_timer(void) +{ + return transition_timer->period_ms; +} + +/*! + * \internal + * \brief Reset the election timer's counter to 0 + */ +void +controld_reset_counter_election_timer(void) +{ + election_timer->counter = 0; +} + +/*! + * \internal + * \brief Stop the transition timer + * + * \return true if the transition timer was running, or false otherwise + */ +bool +controld_stop_transition_timer(void) +{ + return controld_stop_timer(transition_timer); +} + +/*! + * \internal + * \brief Start the transition timer + */ +void +controld_start_transition_timer(void) +{ + controld_start_timer(transition_timer); +} + +/*! + * \internal + * \brief Start the countdown sequence for a shutdown + * + * \param[in] default_period_ms Period to use if the shutdown escalation + * timer's period is 0 + */ +void +controld_shutdown_start_countdown(guint default_period_ms) +{ + if (shutdown_escalation_timer->period_ms == 0) { + shutdown_escalation_timer->period_ms = default_period_ms; + } + + crm_notice("Initiating controller shutdown sequence " CRM_XS " limit=%ums", + shutdown_escalation_timer->period_ms); + controld_start_timer(shutdown_escalation_timer); +} |