summaryrefslogtreecommitdiffstats
path: root/daemons/pacemakerd/pcmkd_subdaemons.c
diff options
context:
space:
mode:
Diffstat (limited to 'daemons/pacemakerd/pcmkd_subdaemons.c')
-rw-r--r--daemons/pacemakerd/pcmkd_subdaemons.c888
1 files changed, 888 insertions, 0 deletions
diff --git a/daemons/pacemakerd/pcmkd_subdaemons.c b/daemons/pacemakerd/pcmkd_subdaemons.c
new file mode 100644
index 0000000..3b08ecc
--- /dev/null
+++ b/daemons/pacemakerd/pcmkd_subdaemons.c
@@ -0,0 +1,888 @@
+/*
+ * Copyright 2010-2022 the Pacemaker project contributors
+ *
+ * The version control history for this file may have further details.
+ *
+ * This source code is licensed under the GNU General Public License version 2
+ * or later (GPLv2+) WITHOUT ANY WARRANTY.
+ */
+
+#include <crm_internal.h>
+#include "pacemakerd.h"
+
+#include <errno.h>
+#include <grp.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <crm/cluster.h>
+#include <crm/msg_xml.h>
+
+typedef struct pcmk_child_s {
+ pid_t pid;
+ int respawn_count;
+ bool respawn;
+ const char *name;
+ const char *uid;
+ const char *command;
+ const char *endpoint; /* IPC server name */
+ bool needs_cluster;
+ int check_count;
+
+ /* Anything below here will be dynamically initialized */
+ bool needs_retry;
+ bool active_before_startup;
+} pcmk_child_t;
+
+#define PCMK_PROCESS_CHECK_INTERVAL 1
+#define PCMK_PROCESS_CHECK_RETRIES 5
+#define SHUTDOWN_ESCALATION_PERIOD 180000 /* 3m */
+
+/* Index into the array below */
+#define PCMK_CHILD_CONTROLD 5
+
+static pcmk_child_t pcmk_children[] = {
+ {
+ 0, 0, true, "pacemaker-based", CRM_DAEMON_USER,
+ CRM_DAEMON_DIR "/pacemaker-based", PCMK__SERVER_BASED_RO,
+ true
+ },
+ {
+ 0, 0, true, "pacemaker-fenced", NULL,
+ CRM_DAEMON_DIR "/pacemaker-fenced", "stonith-ng",
+ true
+ },
+ {
+ 0, 0, true, "pacemaker-execd", NULL,
+ CRM_DAEMON_DIR "/pacemaker-execd", CRM_SYSTEM_LRMD,
+ false
+ },
+ {
+ 0, 0, true, "pacemaker-attrd", CRM_DAEMON_USER,
+ CRM_DAEMON_DIR "/pacemaker-attrd", T_ATTRD,
+ true
+ },
+ {
+ 0, 0, true, "pacemaker-schedulerd", CRM_DAEMON_USER,
+ CRM_DAEMON_DIR "/pacemaker-schedulerd", CRM_SYSTEM_PENGINE,
+ false
+ },
+ {
+ 0, 0, true, "pacemaker-controld", CRM_DAEMON_USER,
+ CRM_DAEMON_DIR "/pacemaker-controld", CRM_SYSTEM_CRMD,
+ true
+ },
+};
+
+static char *opts_default[] = { NULL, NULL };
+static char *opts_vgrind[] = { NULL, NULL, NULL, NULL, NULL };
+
+crm_trigger_t *shutdown_trigger = NULL;
+crm_trigger_t *startup_trigger = NULL;
+time_t subdaemon_check_progress = 0;
+
+// Whether we need root group access to talk to cluster layer
+static bool need_root_group = true;
+
+/* When contacted via pacemakerd-api by a client having sbd in
+ * the name we assume it is sbd-daemon which wants to know
+ * if pacemakerd shutdown gracefully.
+ * Thus when everything is shutdown properly pacemakerd
+ * waits till it has reported the graceful completion of
+ * shutdown to sbd and just when sbd-client closes the
+ * connection we can assume that the report has arrived
+ * properly so that pacemakerd can finally exit.
+ * Following two variables are used to track that handshake.
+ */
+unsigned int shutdown_complete_state_reported_to = 0;
+gboolean shutdown_complete_state_reported_client_closed = FALSE;
+
+/* state we report when asked via pacemakerd-api status-ping */
+const char *pacemakerd_state = XML_PING_ATTR_PACEMAKERDSTATE_INIT;
+gboolean running_with_sbd = FALSE; /* local copy */
+
+GMainLoop *mainloop = NULL;
+
+static gboolean fatal_error = FALSE;
+
+static int child_liveness(pcmk_child_t *child);
+static gboolean escalate_shutdown(gpointer data);
+static int start_child(pcmk_child_t * child);
+static void pcmk_child_exit(mainloop_child_t * p, pid_t pid, int core, int signo, int exitcode);
+static void pcmk_process_exit(pcmk_child_t * child);
+static gboolean pcmk_shutdown_worker(gpointer user_data);
+static gboolean stop_child(pcmk_child_t * child, int signal);
+
+static bool
+pcmkd_cluster_connected(void)
+{
+#if SUPPORT_COROSYNC
+ return pcmkd_corosync_connected();
+#else
+ return true;
+#endif
+}
+
+static gboolean
+check_next_subdaemon(gpointer user_data)
+{
+ static int next_child = 0;
+ int rc = child_liveness(&pcmk_children[next_child]);
+
+ crm_trace("Checked %s[%lld]: %s (%d)",
+ pcmk_children[next_child].name,
+ (long long) PCMK__SPECIAL_PID_AS_0(pcmk_children[next_child].pid),
+ pcmk_rc_str(rc), rc);
+
+ switch (rc) {
+ case pcmk_rc_ok:
+ pcmk_children[next_child].check_count = 0;
+ subdaemon_check_progress = time(NULL);
+ break;
+ case pcmk_rc_ipc_pid_only: // This case: it was previously OK
+ pcmk_children[next_child].check_count++;
+ if (pcmk_children[next_child].check_count >= PCMK_PROCESS_CHECK_RETRIES) {
+ crm_err("%s[%lld] is unresponsive to ipc after %d tries but "
+ "we found the pid so have it killed that we can restart",
+ pcmk_children[next_child].name,
+ (long long) PCMK__SPECIAL_PID_AS_0(
+ pcmk_children[next_child].pid),
+ pcmk_children[next_child].check_count);
+ stop_child(&pcmk_children[next_child], SIGKILL);
+ if (pcmk_children[next_child].respawn) {
+ /* as long as the respawn-limit isn't reached
+ give it another round of check retries
+ */
+ pcmk_children[next_child].check_count = 0;
+ }
+ } else {
+ crm_notice("%s[%lld] is unresponsive to ipc after %d tries",
+ pcmk_children[next_child].name,
+ (long long) PCMK__SPECIAL_PID_AS_0(
+ pcmk_children[next_child].pid),
+ pcmk_children[next_child].check_count);
+ if (pcmk_children[next_child].respawn) {
+ /* as long as the respawn-limit isn't reached
+ and we haven't run out of connect retries
+ we account this as progress we are willing
+ to tell to sbd
+ */
+ subdaemon_check_progress = time(NULL);
+ }
+ }
+ /* go to the next child and see if
+ we can make progress there
+ */
+ break;
+ case pcmk_rc_ipc_unresponsive:
+ if (!pcmk_children[next_child].respawn) {
+ /* if a subdaemon is down and we don't want it
+ to be restarted this is a success during
+ shutdown. if it isn't restarted anymore
+ due to MAX_RESPAWN it is
+ rather no success.
+ */
+ if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) {
+ subdaemon_check_progress = time(NULL);
+ }
+ }
+ if (!pcmk_children[next_child].active_before_startup) {
+ crm_trace("found %s[%lld] missing - signal-handler "
+ "will take care of it",
+ pcmk_children[next_child].name,
+ (long long) PCMK__SPECIAL_PID_AS_0(
+ pcmk_children[next_child].pid));
+ break;
+ }
+ if (pcmk_children[next_child].respawn) {
+ crm_err("%s[%lld] terminated",
+ pcmk_children[next_child].name,
+ (long long) PCMK__SPECIAL_PID_AS_0(
+ pcmk_children[next_child].pid));
+ } else {
+ /* orderly shutdown */
+ crm_notice("%s[%lld] terminated",
+ pcmk_children[next_child].name,
+ (long long) PCMK__SPECIAL_PID_AS_0(
+ pcmk_children[next_child].pid));
+ }
+ pcmk_process_exit(&(pcmk_children[next_child]));
+ break;
+ default:
+ crm_exit(CRM_EX_FATAL);
+ break; /* static analysis/noreturn */
+ }
+
+ next_child++;
+ if (next_child >= PCMK__NELEM(pcmk_children)) {
+ next_child = 0;
+ }
+
+ return G_SOURCE_CONTINUE;
+}
+
+static gboolean
+escalate_shutdown(gpointer data)
+{
+ pcmk_child_t *child = data;
+
+ if (child->pid == PCMK__SPECIAL_PID) {
+ pcmk_process_exit(child);
+
+ } else if (child->pid != 0) {
+ /* Use SIGSEGV instead of SIGKILL to create a core so we can see what it was up to */
+ crm_err("Child %s not terminating in a timely manner, forcing", child->name);
+ stop_child(child, SIGSEGV);
+ }
+ return FALSE;
+}
+
+static void
+pcmk_child_exit(mainloop_child_t * p, pid_t pid, int core, int signo, int exitcode)
+{
+ pcmk_child_t *child = mainloop_child_userdata(p);
+ const char *name = mainloop_child_name(p);
+
+ if (signo) {
+ do_crm_log(((signo == SIGKILL)? LOG_WARNING : LOG_ERR),
+ "%s[%d] terminated with signal %d (%s)%s",
+ name, pid, signo, strsignal(signo),
+ (core? " and dumped core" : ""));
+
+ } else {
+ switch(exitcode) {
+ case CRM_EX_OK:
+ crm_info("%s[%d] exited with status %d (%s)",
+ name, pid, exitcode, crm_exit_str(exitcode));
+ break;
+
+ case CRM_EX_FATAL:
+ crm_warn("Shutting cluster down because %s[%d] had fatal failure",
+ name, pid);
+ child->respawn = false;
+ fatal_error = TRUE;
+ pcmk_shutdown(SIGTERM);
+ break;
+
+ case CRM_EX_PANIC:
+ crm_emerg("%s[%d] instructed the machine to reset", name, pid);
+ child->respawn = false;
+ fatal_error = TRUE;
+ pcmk__panic(__func__);
+ pcmk_shutdown(SIGTERM);
+ break;
+
+ default:
+ crm_err("%s[%d] exited with status %d (%s)",
+ name, pid, exitcode, crm_exit_str(exitcode));
+ break;
+ }
+ }
+
+ pcmk_process_exit(child);
+}
+
+static void
+pcmk_process_exit(pcmk_child_t * child)
+{
+ child->pid = 0;
+ child->active_before_startup = false;
+ child->check_count = 0;
+
+ child->respawn_count += 1;
+ if (child->respawn_count > MAX_RESPAWN) {
+ crm_err("Child respawn count exceeded by %s", child->name);
+ child->respawn = false;
+ }
+
+ if (shutdown_trigger) {
+ /* resume step-wise shutdown (returned TRUE yields no parallelizing) */
+ mainloop_set_trigger(shutdown_trigger);
+
+ } else if (!child->respawn) {
+ /* nothing to do */
+
+ } else if (crm_is_true(getenv("PCMK_fail_fast"))) {
+ crm_err("Rebooting system because of %s", child->name);
+ pcmk__panic(__func__);
+
+ } else if (child_liveness(child) == pcmk_rc_ok) {
+ crm_warn("One-off suppressing strict respawning of a child process %s,"
+ " appears alright per %s IPC end-point",
+ child->name, child->endpoint);
+
+ } else if (child->needs_cluster && !pcmkd_cluster_connected()) {
+ crm_notice("Not respawning %s subdaemon until cluster returns",
+ child->name);
+ child->needs_retry = true;
+
+ } else {
+ crm_notice("Respawning %s subdaemon after unexpected exit",
+ child->name);
+ start_child(child);
+ }
+}
+
+static gboolean
+pcmk_shutdown_worker(gpointer user_data)
+{
+ static int phase = PCMK__NELEM(pcmk_children) - 1;
+ static time_t next_log = 0;
+
+ if (phase == PCMK__NELEM(pcmk_children) - 1) {
+ crm_notice("Shutting down Pacemaker");
+ pacemakerd_state = XML_PING_ATTR_PACEMAKERDSTATE_SHUTTINGDOWN;
+ }
+
+ for (; phase >= 0; phase--) {
+ pcmk_child_t *child = &(pcmk_children[phase]);
+
+ if (child->pid != 0) {
+ time_t now = time(NULL);
+
+ if (child->respawn) {
+ if (child->pid == PCMK__SPECIAL_PID) {
+ crm_warn("The process behind %s IPC cannot be"
+ " terminated, so either wait the graceful"
+ " period of %ld s for its native termination"
+ " if it vitally depends on some other daemons"
+ " going down in a controlled way already,"
+ " or locate and kill the correct %s process"
+ " on your own; set PCMK_fail_fast=1 to avoid"
+ " this altogether next time around",
+ child->name, (long) SHUTDOWN_ESCALATION_PERIOD,
+ child->command);
+ }
+ next_log = now + 30;
+ child->respawn = false;
+ stop_child(child, SIGTERM);
+ if (phase < PCMK_CHILD_CONTROLD) {
+ g_timeout_add(SHUTDOWN_ESCALATION_PERIOD,
+ escalate_shutdown, child);
+ }
+
+ } else if (now >= next_log) {
+ next_log = now + 30;
+ crm_notice("Still waiting for %s to terminate "
+ CRM_XS " pid=%lld",
+ child->name, (long long) child->pid);
+ }
+ return TRUE;
+ }
+
+ /* cleanup */
+ crm_debug("%s confirmed stopped", child->name);
+ child->pid = 0;
+ }
+
+ crm_notice("Shutdown complete");
+ pacemakerd_state = XML_PING_ATTR_PACEMAKERDSTATE_SHUTDOWNCOMPLETE;
+ if (!fatal_error && running_with_sbd &&
+ pcmk__get_sbd_sync_resource_startup() &&
+ !shutdown_complete_state_reported_client_closed) {
+ crm_notice("Waiting for SBD to pick up shutdown-complete-state.");
+ return TRUE;
+ }
+
+ {
+ const char *delay = pcmk__env_option(PCMK__ENV_SHUTDOWN_DELAY);
+ if(delay) {
+ sync();
+ pcmk__sleep_ms(crm_get_msec(delay));
+ }
+ }
+
+ g_main_loop_quit(mainloop);
+
+ if (fatal_error) {
+ crm_notice("Shutting down and staying down after fatal error");
+#ifdef SUPPORT_COROSYNC
+ pcmkd_shutdown_corosync();
+#endif
+ crm_exit(CRM_EX_FATAL);
+ }
+
+ return TRUE;
+}
+
+/* TODO once libqb is taught to juggle with IPC end-points carried over as
+ bare file descriptor (https://github.com/ClusterLabs/libqb/issues/325)
+ it shall hand over these descriptors here if/once they are successfully
+ pre-opened in (presumably) child_liveness(), to avoid any remaining
+ room for races */
+ // \return Standard Pacemaker return code
+static int
+start_child(pcmk_child_t * child)
+{
+ uid_t uid = 0;
+ gid_t gid = 0;
+ gboolean use_valgrind = FALSE;
+ gboolean use_callgrind = FALSE;
+ const char *env_valgrind = getenv("PCMK_valgrind_enabled");
+ const char *env_callgrind = getenv("PCMK_callgrind_enabled");
+
+ child->active_before_startup = false;
+ child->check_count = 0;
+
+ if (child->command == NULL) {
+ crm_info("Nothing to do for child \"%s\"", child->name);
+ return pcmk_rc_ok;
+ }
+
+ if (env_callgrind != NULL && crm_is_true(env_callgrind)) {
+ use_callgrind = TRUE;
+ use_valgrind = TRUE;
+
+ } else if (env_callgrind != NULL && strstr(env_callgrind, child->name)) {
+ use_callgrind = TRUE;
+ use_valgrind = TRUE;
+
+ } else if (env_valgrind != NULL && crm_is_true(env_valgrind)) {
+ use_valgrind = TRUE;
+
+ } else if (env_valgrind != NULL && strstr(env_valgrind, child->name)) {
+ use_valgrind = TRUE;
+ }
+
+ if (use_valgrind && strlen(VALGRIND_BIN) == 0) {
+ crm_warn("Cannot enable valgrind for %s:"
+ " The location of the valgrind binary is unknown", child->name);
+ use_valgrind = FALSE;
+ }
+
+ if (child->uid) {
+ if (crm_user_lookup(child->uid, &uid, &gid) < 0) {
+ crm_err("Invalid user (%s) for %s: not found", child->uid, child->name);
+ return EACCES;
+ }
+ crm_info("Using uid=%u and group=%u for process %s", uid, gid, child->name);
+ }
+
+ child->pid = fork();
+ CRM_ASSERT(child->pid != -1);
+
+ if (child->pid > 0) {
+ /* parent */
+ mainloop_child_add(child->pid, 0, child->name, child, pcmk_child_exit);
+
+ crm_info("Forked child %lld for process %s%s",
+ (long long) child->pid, child->name,
+ use_valgrind ? " (valgrind enabled: " VALGRIND_BIN ")" : "");
+ return pcmk_rc_ok;
+
+ } else {
+ /* Start a new session */
+ (void)setsid();
+
+ /* Setup the two alternate arg arrays */
+ opts_vgrind[0] = strdup(VALGRIND_BIN);
+ if (use_callgrind) {
+ opts_vgrind[1] = strdup("--tool=callgrind");
+ opts_vgrind[2] = strdup("--callgrind-out-file=" CRM_STATE_DIR "/callgrind.out.%p");
+ opts_vgrind[3] = strdup(child->command);
+ opts_vgrind[4] = NULL;
+ } else {
+ opts_vgrind[1] = strdup(child->command);
+ opts_vgrind[2] = NULL;
+ opts_vgrind[3] = NULL;
+ opts_vgrind[4] = NULL;
+ }
+ opts_default[0] = strdup(child->command);
+
+ if(gid) {
+ // Drop root group access if not needed
+ if (!need_root_group && (setgid(gid) < 0)) {
+ crm_warn("Could not set group to %d: %s", gid, strerror(errno));
+ }
+
+ /* Initialize supplementary groups to only those always granted to
+ * the user, plus haclient (so we can access IPC).
+ */
+ if (initgroups(child->uid, gid) < 0) {
+ crm_err("Cannot initialize groups for %s: %s (%d)",
+ child->uid, pcmk_rc_str(errno), errno);
+ }
+ }
+
+ if (uid && setuid(uid) < 0) {
+ crm_warn("Could not set user to %s (id %d): %s",
+ child->uid, uid, strerror(errno));
+ }
+
+ pcmk__close_fds_in_child(true);
+
+ pcmk__open_devnull(O_RDONLY); // stdin (fd 0)
+ pcmk__open_devnull(O_WRONLY); // stdout (fd 1)
+ pcmk__open_devnull(O_WRONLY); // stderr (fd 2)
+
+ if (use_valgrind) {
+ (void)execvp(VALGRIND_BIN, opts_vgrind);
+ } else {
+ (void)execvp(child->command, opts_default);
+ }
+ crm_crit("Could not execute %s: %s", child->command, strerror(errno));
+ crm_exit(CRM_EX_FATAL);
+ }
+ return pcmk_rc_ok; /* never reached */
+}
+
+/*!
+ * \internal
+ * \brief Check the liveness of the child based on IPC name and PID if tracked
+ *
+ * \param[in,out] child Child tracked data
+ *
+ * \return Standard Pacemaker return code
+ *
+ * \note Return codes of particular interest include pcmk_rc_ipc_unresponsive
+ * indicating that no trace of IPC liveness was detected,
+ * pcmk_rc_ipc_unauthorized indicating that the IPC endpoint is blocked by
+ * an unauthorized process, and pcmk_rc_ipc_pid_only indicating that
+ * the child is up by PID but not IPC end-point (possibly starting).
+ * \note This function doesn't modify any of \p child members but \c pid,
+ * and is not actively toying with processes as such but invoking
+ * \c stop_child in one particular case (there's for some reason
+ * a different authentic holder of the IPC end-point).
+ */
+static int
+child_liveness(pcmk_child_t *child)
+{
+ uid_t cl_uid = 0;
+ gid_t cl_gid = 0;
+ const uid_t root_uid = 0;
+ const gid_t root_gid = 0;
+ const uid_t *ref_uid;
+ const gid_t *ref_gid;
+ int rc = pcmk_rc_ipc_unresponsive;
+ pid_t ipc_pid = 0;
+
+ if (child->endpoint == NULL
+ && (child->pid <= 0 || child->pid == PCMK__SPECIAL_PID)) {
+ crm_err("Cannot track child %s for missing both API end-point and PID",
+ child->name);
+ rc = EINVAL; // Misuse of function when child is not trackable
+
+ } else if (child->endpoint != NULL) {
+ int legacy_rc = pcmk_ok;
+
+ if (child->uid == NULL) {
+ ref_uid = &root_uid;
+ ref_gid = &root_gid;
+ } else {
+ ref_uid = &cl_uid;
+ ref_gid = &cl_gid;
+ legacy_rc = pcmk_daemon_user(&cl_uid, &cl_gid);
+ }
+
+ if (legacy_rc < 0) {
+ rc = pcmk_legacy2rc(legacy_rc);
+ crm_err("Could not find user and group IDs for user %s: %s "
+ CRM_XS " rc=%d", CRM_DAEMON_USER, pcmk_rc_str(rc), rc);
+ } else {
+ rc = pcmk__ipc_is_authentic_process_active(child->endpoint,
+ *ref_uid, *ref_gid,
+ &ipc_pid);
+ if ((rc == pcmk_rc_ok) || (rc == pcmk_rc_ipc_unresponsive)) {
+ if (child->pid <= 0) {
+ /* If rc is pcmk_rc_ok, ipc_pid is nonzero and this
+ * initializes a new child. If rc is
+ * pcmk_rc_ipc_unresponsive, ipc_pid is zero, and we will
+ * investigate further.
+ */
+ child->pid = ipc_pid;
+ } else if ((ipc_pid != 0) && (child->pid != ipc_pid)) {
+ /* An unexpected (but authorized) process is responding to
+ * IPC. Investigate further.
+ */
+ rc = pcmk_rc_ipc_unresponsive;
+ }
+ }
+ }
+ }
+
+ if (rc == pcmk_rc_ipc_unresponsive) {
+ /* If we get here, a child without IPC is being tracked, no IPC liveness
+ * has been detected, or IPC liveness has been detected with an
+ * unexpected (but authorized) process. This is safe on FreeBSD since
+ * the only change possible from a proper child's PID into "special" PID
+ * of 1 behind more loosely related process.
+ */
+ int ret = pcmk__pid_active(child->pid, child->name);
+
+ if (ipc_pid && ((ret != pcmk_rc_ok)
+ || ipc_pid == PCMK__SPECIAL_PID
+ || (pcmk__pid_active(ipc_pid,
+ child->name) == pcmk_rc_ok))) {
+ /* An unexpected (but authorized) process was detected at the IPC
+ * endpoint, and either it is active, or the child we're tracking is
+ * not.
+ */
+
+ if (ret == pcmk_rc_ok) {
+ /* The child we're tracking is active. Kill it, and adopt the
+ * detected process. This assumes that our children don't fork
+ * (thus getting a different PID owning the IPC), but rather the
+ * tracking got out of sync because of some means external to
+ * Pacemaker, and adopting the detected process is better than
+ * killing it and possibly having to spawn a new child.
+ */
+ /* not possessing IPC, afterall (what about corosync CPG?) */
+ stop_child(child, SIGKILL);
+ }
+ rc = pcmk_rc_ok;
+ child->pid = ipc_pid;
+ } else if (ret == pcmk_rc_ok) {
+ // Our tracked child's PID was found active, but not its IPC
+ rc = pcmk_rc_ipc_pid_only;
+ } else if ((child->pid == 0) && (ret == EINVAL)) {
+ // FreeBSD can return EINVAL
+ rc = pcmk_rc_ipc_unresponsive;
+ } else {
+ switch (ret) {
+ case EACCES:
+ rc = pcmk_rc_ipc_unauthorized;
+ break;
+ case ESRCH:
+ rc = pcmk_rc_ipc_unresponsive;
+ break;
+ default:
+ rc = ret;
+ break;
+ }
+ }
+ }
+ return rc;
+}
+
+/*!
+ * \internal
+ * \brief Initial one-off check of the pre-existing "child" processes
+ *
+ * With "child" process, we mean the subdaemon that defines an API end-point
+ * (all of them do as of the comment) -- the possible complement is skipped
+ * as it is deemed it has no such shared resources to cause conflicts about,
+ * hence it can presumably be started anew without hesitation.
+ * If that won't hold true in the future, the concept of a shared resource
+ * will have to be generalized beyond the API end-point.
+ *
+ * For boundary cases that the "child" is still starting (IPC end-point is yet
+ * to be witnessed), or more rarely (practically FreeBSD only), when there's
+ * a pre-existing "untrackable" authentic process, we give the situation some
+ * time to possibly unfold in the right direction, meaning that said socket
+ * will appear or the unattainable process will disappear per the observable
+ * IPC, respectively.
+ *
+ * \return Standard Pacemaker return code
+ *
+ * \note Since this gets run at the very start, \c respawn_count fields
+ * for particular children get temporarily overloaded with "rounds
+ * of waiting" tracking, restored once we are about to finish with
+ * success (i.e. returning value >=0) and will remain unrestored
+ * otherwise. One way to suppress liveness detection logic for
+ * particular child is to set the said value to a negative number.
+ */
+#define WAIT_TRIES 4 /* together with interleaved sleeps, worst case ~ 1s */
+int
+find_and_track_existing_processes(void)
+{
+ bool wait_in_progress;
+ int rc;
+ size_t i, rounds;
+
+ for (rounds = 1; rounds <= WAIT_TRIES; rounds++) {
+ wait_in_progress = false;
+ for (i = 0; i < PCMK__NELEM(pcmk_children); i++) {
+
+ if ((pcmk_children[i].endpoint == NULL)
+ || (pcmk_children[i].respawn_count < 0)) {
+ continue;
+ }
+
+ rc = child_liveness(&pcmk_children[i]);
+ if (rc == pcmk_rc_ipc_unresponsive) {
+ /* As a speculation, don't give up if there are more rounds to
+ * come for other reasons, but don't artificially wait just
+ * because of this, since we would preferably start ASAP.
+ */
+ continue;
+ }
+
+ pcmk_children[i].respawn_count = rounds;
+ switch (rc) {
+ case pcmk_rc_ok:
+ if (pcmk_children[i].pid == PCMK__SPECIAL_PID) {
+ if (crm_is_true(getenv("PCMK_fail_fast"))) {
+ crm_crit("Cannot reliably track pre-existing"
+ " authentic process behind %s IPC on this"
+ " platform and PCMK_fail_fast requested",
+ pcmk_children[i].endpoint);
+ return EOPNOTSUPP;
+ } else if (pcmk_children[i].respawn_count == WAIT_TRIES) {
+ crm_notice("Assuming pre-existing authentic, though"
+ " on this platform untrackable, process"
+ " behind %s IPC is stable (was in %d"
+ " previous samples) so rather than"
+ " bailing out (PCMK_fail_fast not"
+ " requested), we just switch to a less"
+ " optimal IPC liveness monitoring"
+ " (not very suitable for heavy load)",
+ pcmk_children[i].name, WAIT_TRIES - 1);
+ crm_warn("The process behind %s IPC cannot be"
+ " terminated, so the overall shutdown"
+ " will get delayed implicitly (%ld s),"
+ " which serves as a graceful period for"
+ " its native termination if it vitally"
+ " depends on some other daemons going"
+ " down in a controlled way already",
+ pcmk_children[i].name,
+ (long) SHUTDOWN_ESCALATION_PERIOD);
+ } else {
+ wait_in_progress = true;
+ crm_warn("Cannot reliably track pre-existing"
+ " authentic process behind %s IPC on this"
+ " platform, can still disappear in %d"
+ " attempt(s)", pcmk_children[i].endpoint,
+ WAIT_TRIES - pcmk_children[i].respawn_count);
+ continue;
+ }
+ }
+ crm_notice("Tracking existing %s process (pid=%lld)",
+ pcmk_children[i].name,
+ (long long) PCMK__SPECIAL_PID_AS_0(
+ pcmk_children[i].pid));
+ pcmk_children[i].respawn_count = -1; /* 0~keep watching */
+ pcmk_children[i].active_before_startup = true;
+ break;
+ case pcmk_rc_ipc_pid_only:
+ if (pcmk_children[i].respawn_count == WAIT_TRIES) {
+ crm_crit("%s IPC end-point for existing authentic"
+ " process %lld did not (re)appear",
+ pcmk_children[i].endpoint,
+ (long long) PCMK__SPECIAL_PID_AS_0(
+ pcmk_children[i].pid));
+ return rc;
+ }
+ wait_in_progress = true;
+ crm_warn("Cannot find %s IPC end-point for existing"
+ " authentic process %lld, can still (re)appear"
+ " in %d attempts (?)",
+ pcmk_children[i].endpoint,
+ (long long) PCMK__SPECIAL_PID_AS_0(
+ pcmk_children[i].pid),
+ WAIT_TRIES - pcmk_children[i].respawn_count);
+ continue;
+ default:
+ crm_crit("Checked liveness of %s: %s " CRM_XS " rc=%d",
+ pcmk_children[i].name, pcmk_rc_str(rc), rc);
+ return rc;
+ }
+ }
+ if (!wait_in_progress) {
+ break;
+ }
+ pcmk__sleep_ms(250); // Wait a bit for changes to possibly happen
+ }
+ for (i = 0; i < PCMK__NELEM(pcmk_children); i++) {
+ pcmk_children[i].respawn_count = 0; /* restore pristine state */
+ }
+
+ g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL, check_next_subdaemon,
+ NULL);
+ return pcmk_rc_ok;
+}
+
+gboolean
+init_children_processes(void *user_data)
+{
+ if (is_corosync_cluster()) {
+ /* Corosync clusters can drop root group access, because we set
+ * uidgid.gid.${gid}=1 via CMAP, which allows these processes to connect
+ * to corosync.
+ */
+ need_root_group = false;
+ }
+
+ /* start any children that have not been detected */
+ for (int i = 0; i < PCMK__NELEM(pcmk_children); i++) {
+ if (pcmk_children[i].pid != 0) {
+ /* we are already tracking it */
+ continue;
+ }
+
+ start_child(&(pcmk_children[i]));
+ }
+
+ /* From this point on, any daemons being started will be due to
+ * respawning rather than node start.
+ *
+ * This may be useful for the daemons to know
+ */
+ setenv("PCMK_respawned", "true", 1);
+ pacemakerd_state = XML_PING_ATTR_PACEMAKERDSTATE_RUNNING;
+ return TRUE;
+}
+
+void
+pcmk_shutdown(int nsig)
+{
+ if (shutdown_trigger == NULL) {
+ shutdown_trigger = mainloop_add_trigger(G_PRIORITY_HIGH, pcmk_shutdown_worker, NULL);
+ }
+ mainloop_set_trigger(shutdown_trigger);
+}
+
+void
+restart_cluster_subdaemons(void)
+{
+ for (int i = 0; i < PCMK__NELEM(pcmk_children); i++) {
+ if (!pcmk_children[i].needs_retry || pcmk_children[i].pid != 0) {
+ continue;
+ }
+
+ crm_notice("Respawning cluster-based subdaemon: %s", pcmk_children[i].name);
+ if (start_child(&pcmk_children[i])) {
+ pcmk_children[i].needs_retry = false;
+ }
+ }
+}
+
+static gboolean
+stop_child(pcmk_child_t * child, int signal)
+{
+ if (signal == 0) {
+ signal = SIGTERM;
+ }
+
+ /* why to skip PID of 1?
+ - FreeBSD ~ how untrackable process behind IPC is masqueraded as
+ - elsewhere: how "init" task is designated; in particular, in systemd
+ arrangement of socket-based activation, this is pretty real */
+ if (child->command == NULL || child->pid == PCMK__SPECIAL_PID) {
+ crm_debug("Nothing to do for child \"%s\" (process %lld)",
+ child->name, (long long) PCMK__SPECIAL_PID_AS_0(child->pid));
+ return TRUE;
+ }
+
+ if (child->pid <= 0) {
+ crm_trace("Client %s not running", child->name);
+ return TRUE;
+ }
+
+ errno = 0;
+ if (kill(child->pid, signal) == 0) {
+ crm_notice("Stopping %s "CRM_XS" sent signal %d to process %lld",
+ child->name, signal, (long long) child->pid);
+
+ } else {
+ crm_err("Could not stop %s (process %lld) with signal %d: %s",
+ child->name, (long long) child->pid, signal, strerror(errno));
+ }
+
+ return TRUE;
+}
+