diff options
Diffstat (limited to 'daemons/pacemakerd/pcmkd_subdaemons.c')
-rw-r--r-- | daemons/pacemakerd/pcmkd_subdaemons.c | 116 |
1 files changed, 64 insertions, 52 deletions
diff --git a/daemons/pacemakerd/pcmkd_subdaemons.c b/daemons/pacemakerd/pcmkd_subdaemons.c index 21e432e..5bd3512 100644 --- a/daemons/pacemakerd/pcmkd_subdaemons.c +++ b/daemons/pacemakerd/pcmkd_subdaemons.c @@ -1,5 +1,5 @@ /* - * Copyright 2010-2023 the Pacemaker project contributors + * Copyright 2010-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * @@ -10,6 +10,10 @@ #include <crm_internal.h> #include "pacemakerd.h" +#if SUPPORT_COROSYNC +#include "pcmkd_corosync.h" +#endif + #include <errno.h> #include <grp.h> #include <signal.h> @@ -21,22 +25,25 @@ #include <unistd.h> #include <crm/cluster.h> -#include <crm/msg_xml.h> +#include <crm/common/xml.h> + +enum child_daemon_flags { + child_none = 0, + child_respawn = 1 << 0, + child_needs_cluster = 1 << 1, + child_needs_retry = 1 << 2, + child_active_before_startup = 1 << 3, +}; typedef struct pcmk_child_s { pid_t pid; int respawn_count; - bool respawn; const char *name; const char *uid; const char *command; const char *endpoint; /* IPC server name */ - bool needs_cluster; int check_count; - - /* Anything below here will be dynamically initialized */ - bool needs_retry; - bool active_before_startup; + uint32_t flags; } pcmk_child_t; #define PCMK_PROCESS_CHECK_INTERVAL 1 @@ -48,34 +55,34 @@ typedef struct pcmk_child_s { static pcmk_child_t pcmk_children[] = { { - 0, 0, true, "pacemaker-based", CRM_DAEMON_USER, + 0, 0, "pacemaker-based", CRM_DAEMON_USER, CRM_DAEMON_DIR "/pacemaker-based", PCMK__SERVER_BASED_RO, - true + 0, child_respawn | child_needs_cluster }, { - 0, 0, true, "pacemaker-fenced", NULL, + 0, 0, "pacemaker-fenced", NULL, CRM_DAEMON_DIR "/pacemaker-fenced", "stonith-ng", - true + 0, child_respawn | child_needs_cluster }, { - 0, 0, true, "pacemaker-execd", NULL, + 0, 0, "pacemaker-execd", NULL, CRM_DAEMON_DIR "/pacemaker-execd", CRM_SYSTEM_LRMD, - false + 0, child_respawn }, { - 0, 0, true, "pacemaker-attrd", CRM_DAEMON_USER, - CRM_DAEMON_DIR "/pacemaker-attrd", T_ATTRD, - true + 0, 0, "pacemaker-attrd", CRM_DAEMON_USER, + CRM_DAEMON_DIR "/pacemaker-attrd", PCMK__VALUE_ATTRD, + 0, child_respawn | child_needs_cluster }, { - 0, 0, true, "pacemaker-schedulerd", CRM_DAEMON_USER, + 0, 0, "pacemaker-schedulerd", CRM_DAEMON_USER, CRM_DAEMON_DIR "/pacemaker-schedulerd", CRM_SYSTEM_PENGINE, - false + 0, child_respawn }, { - 0, 0, true, "pacemaker-controld", CRM_DAEMON_USER, + 0, 0, "pacemaker-controld", CRM_DAEMON_USER, CRM_DAEMON_DIR "/pacemaker-controld", CRM_SYSTEM_CRMD, - true + 0, child_respawn | child_needs_cluster }, }; @@ -103,7 +110,7 @@ unsigned int shutdown_complete_state_reported_to = 0; gboolean shutdown_complete_state_reported_client_closed = FALSE; /* state we report when asked via pacemakerd-api status-ping */ -const char *pacemakerd_state = XML_PING_ATTR_PACEMAKERDSTATE_INIT; +const char *pacemakerd_state = PCMK__VALUE_INIT; gboolean running_with_sbd = FALSE; /* local copy */ GMainLoop *mainloop = NULL; @@ -154,7 +161,7 @@ check_next_subdaemon(gpointer user_data) pcmk_children[next_child].pid), pcmk_children[next_child].check_count); stop_child(&pcmk_children[next_child], SIGKILL); - if (pcmk_children[next_child].respawn) { + if (pcmk_is_set(pcmk_children[next_child].flags, child_respawn)) { /* as long as the respawn-limit isn't reached give it another round of check retries */ @@ -166,7 +173,7 @@ check_next_subdaemon(gpointer user_data) (long long) PCMK__SPECIAL_PID_AS_0( pcmk_children[next_child].pid), pcmk_children[next_child].check_count); - if (pcmk_children[next_child].respawn) { + if (pcmk_is_set(pcmk_children[next_child].flags, child_respawn)) { /* as long as the respawn-limit isn't reached and we haven't run out of connect retries we account this as progress we are willing @@ -180,7 +187,7 @@ check_next_subdaemon(gpointer user_data) */ break; case pcmk_rc_ipc_unresponsive: - if (!pcmk_children[next_child].respawn) { + if (!pcmk_is_set(pcmk_children[next_child].flags, child_respawn)) { /* if a subdaemon is down and we don't want it to be restarted this is a success during shutdown. if it isn't restarted anymore @@ -191,7 +198,7 @@ check_next_subdaemon(gpointer user_data) subdaemon_check_progress = time(NULL); } } - if (!pcmk_children[next_child].active_before_startup) { + if (!pcmk_is_set(pcmk_children[next_child].flags, child_active_before_startup)) { crm_trace("found %s[%lld] missing - signal-handler " "will take care of it", pcmk_children[next_child].name, @@ -199,7 +206,7 @@ check_next_subdaemon(gpointer user_data) pcmk_children[next_child].pid)); break; } - if (pcmk_children[next_child].respawn) { + if (pcmk_is_set(pcmk_children[next_child].flags, child_respawn)) { crm_err("%s[%lld] terminated", pcmk_children[next_child].name, (long long) PCMK__SPECIAL_PID_AS_0( @@ -264,14 +271,14 @@ pcmk_child_exit(mainloop_child_t * p, pid_t pid, int core, int signo, int exitco case CRM_EX_FATAL: crm_warn("Shutting cluster down because %s[%d] had fatal failure", name, pid); - child->respawn = false; + child->flags &= ~child_respawn; fatal_error = TRUE; pcmk_shutdown(SIGTERM); break; case CRM_EX_PANIC: crm_emerg("%s[%d] instructed the machine to reset", name, pid); - child->respawn = false; + child->flags &= ~child_respawn; fatal_error = TRUE; pcmk__panic(__func__); pcmk_shutdown(SIGTERM); @@ -291,20 +298,20 @@ static void pcmk_process_exit(pcmk_child_t * child) { child->pid = 0; - child->active_before_startup = false; + child->flags &= ~child_active_before_startup; child->check_count = 0; child->respawn_count += 1; if (child->respawn_count > MAX_RESPAWN) { crm_err("Child respawn count exceeded by %s", child->name); - child->respawn = false; + child->flags &= ~child_respawn; } if (shutdown_trigger) { /* resume step-wise shutdown (returned TRUE yields no parallelizing) */ mainloop_set_trigger(shutdown_trigger); - } else if (!child->respawn) { + } else if (!pcmk_is_set(child->flags, child_respawn)) { /* nothing to do */ } else if (crm_is_true(pcmk__env_option(PCMK__ENV_FAIL_FAST))) { @@ -316,10 +323,10 @@ pcmk_process_exit(pcmk_child_t * child) " appears alright per %s IPC end-point", child->name, child->endpoint); - } else if (child->needs_cluster && !pcmkd_cluster_connected()) { + } else if (pcmk_is_set(child->flags, child_needs_cluster) && !pcmkd_cluster_connected()) { crm_notice("Not respawning %s subdaemon until cluster returns", child->name); - child->needs_retry = true; + child->flags |= child_needs_retry; } else { crm_notice("Respawning %s subdaemon after unexpected exit", @@ -336,7 +343,7 @@ pcmk_shutdown_worker(gpointer user_data) if (phase == PCMK__NELEM(pcmk_children) - 1) { crm_notice("Shutting down Pacemaker"); - pacemakerd_state = XML_PING_ATTR_PACEMAKERDSTATE_SHUTTINGDOWN; + pacemakerd_state = PCMK__VALUE_SHUTTING_DOWN; } for (; phase >= 0; phase--) { @@ -345,7 +352,7 @@ pcmk_shutdown_worker(gpointer user_data) if (child->pid != 0) { time_t now = time(NULL); - if (child->respawn) { + if (pcmk_is_set(child->flags, child_respawn)) { if (child->pid == PCMK__SPECIAL_PID) { crm_warn("The process behind %s IPC cannot be" " terminated, so either wait the graceful" @@ -359,7 +366,7 @@ pcmk_shutdown_worker(gpointer user_data) child->command); } next_log = now + 30; - child->respawn = false; + child->flags &= ~child_respawn; stop_child(child, SIGTERM); if (phase < PCMK_CHILD_CONTROLD) { g_timeout_add(SHUTDOWN_ESCALATION_PERIOD, @@ -381,7 +388,7 @@ pcmk_shutdown_worker(gpointer user_data) } crm_notice("Shutdown complete"); - pacemakerd_state = XML_PING_ATTR_PACEMAKERDSTATE_SHUTDOWNCOMPLETE; + pacemakerd_state = PCMK__VALUE_SHUTDOWN_COMPLETE; if (!fatal_error && running_with_sbd && pcmk__get_sbd_sync_resource_startup() && !shutdown_complete_state_reported_client_closed) { @@ -393,8 +400,12 @@ pcmk_shutdown_worker(gpointer user_data) { const char *delay = pcmk__env_option(PCMK__ENV_SHUTDOWN_DELAY); if(delay) { + long long delay_ms = crm_get_msec(delay); + sync(); - pcmk__sleep_ms(crm_get_msec(delay)); + if (delay_ms > 0) { + pcmk__sleep_ms((unsigned int) QB_MIN(delay_ms, UINT_MAX)); + } } } @@ -427,7 +438,7 @@ start_child(pcmk_child_t * child) const char *env_valgrind = pcmk__env_option(PCMK__ENV_VALGRIND_ENABLED); const char *env_callgrind = pcmk__env_option(PCMK__ENV_CALLGRIND_ENABLED); - child->active_before_startup = false; + child->flags &= ~child_active_before_startup; child->check_count = 0; if (child->command == NULL) { @@ -481,19 +492,20 @@ start_child(pcmk_child_t * child) (void)setsid(); /* Setup the two alternate arg arrays */ - opts_vgrind[0] = strdup(VALGRIND_BIN); + opts_vgrind[0] = pcmk__str_copy(VALGRIND_BIN); if (use_callgrind) { - opts_vgrind[1] = strdup("--tool=callgrind"); - opts_vgrind[2] = strdup("--callgrind-out-file=" CRM_STATE_DIR "/callgrind.out.%p"); - opts_vgrind[3] = strdup(child->command); + opts_vgrind[1] = pcmk__str_copy("--tool=callgrind"); + opts_vgrind[2] = pcmk__str_copy("--callgrind-out-file=" + CRM_STATE_DIR "/callgrind.out.%p"); + opts_vgrind[3] = pcmk__str_copy(child->command); opts_vgrind[4] = NULL; } else { - opts_vgrind[1] = strdup(child->command); + opts_vgrind[1] = pcmk__str_copy(child->command); opts_vgrind[2] = NULL; opts_vgrind[3] = NULL; opts_vgrind[4] = NULL; } - opts_default[0] = strdup(child->command); + opts_default[0] = pcmk__str_copy(child->command); if(gid) { // Drop root group access if not needed @@ -759,7 +771,7 @@ find_and_track_existing_processes(void) (long long) PCMK__SPECIAL_PID_AS_0( pcmk_children[i].pid)); pcmk_children[i].respawn_count = -1; /* 0~keep watching */ - pcmk_children[i].active_before_startup = true; + pcmk_children[i].flags |= child_active_before_startup; break; case pcmk_rc_ipc_pid_only: if (pcmk_children[i].respawn_count == WAIT_TRIES) { @@ -802,7 +814,7 @@ find_and_track_existing_processes(void) gboolean init_children_processes(void *user_data) { - if (is_corosync_cluster()) { + if (pcmk_get_cluster_layer() == pcmk_cluster_layer_corosync) { /* Corosync clusters can drop root group access, because we set * uidgid.gid.${gid}=1 via CMAP, which allows these processes to connect * to corosync. @@ -825,8 +837,8 @@ init_children_processes(void *user_data) * * This may be useful for the daemons to know */ - pcmk__set_env_option(PCMK__ENV_RESPAWNED, "true", false); - pacemakerd_state = XML_PING_ATTR_PACEMAKERDSTATE_RUNNING; + pcmk__set_env_option(PCMK__ENV_RESPAWNED, PCMK_VALUE_TRUE, false); + pacemakerd_state = PCMK__VALUE_RUNNING; return TRUE; } @@ -843,13 +855,13 @@ void restart_cluster_subdaemons(void) { for (int i = 0; i < PCMK__NELEM(pcmk_children); i++) { - if (!pcmk_children[i].needs_retry || pcmk_children[i].pid != 0) { + if (!pcmk_is_set(pcmk_children[i].flags, child_needs_retry) || pcmk_children[i].pid != 0) { continue; } crm_notice("Respawning cluster-based subdaemon: %s", pcmk_children[i].name); if (start_child(&pcmk_children[i])) { - pcmk_children[i].needs_retry = false; + pcmk_children[i].flags &= ~child_needs_retry; } } } |