summaryrefslogtreecommitdiffstats
path: root/lib/common/watchdog.c
diff options
context:
space:
mode:
Diffstat (limited to 'lib/common/watchdog.c')
-rw-r--r--lib/common/watchdog.c311
1 files changed, 311 insertions, 0 deletions
diff --git a/lib/common/watchdog.c b/lib/common/watchdog.c
new file mode 100644
index 0000000..ff2d273
--- /dev/null
+++ b/lib/common/watchdog.c
@@ -0,0 +1,311 @@
+/*
+ * Copyright 2013-2023 the Pacemaker project contributors
+ *
+ * The version control history for this file may have further details.
+ *
+ * This source code is licensed under the GNU Lesser General Public License
+ * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
+ */
+
+#include <crm_internal.h>
+
+#include <sched.h>
+#include <sys/ioctl.h>
+#include <sys/reboot.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <dirent.h>
+#include <signal.h>
+
+#ifdef _POSIX_MEMLOCK
+# include <sys/mman.h>
+#endif
+
+static pid_t sbd_pid = 0;
+
+static void
+sysrq_trigger(char t)
+{
+#if HAVE_LINUX_PROCFS
+ FILE *procf;
+
+ // Root can always write here, regardless of kernel.sysrq value
+ procf = fopen("/proc/sysrq-trigger", "a");
+ if (!procf) {
+ crm_perror(LOG_WARNING, "Opening sysrq-trigger failed");
+ return;
+ }
+ crm_info("sysrq-trigger: %c", t);
+ fprintf(procf, "%c\n", t);
+ fclose(procf);
+#endif // HAVE_LINUX_PROCFS
+ return;
+}
+
+
+/*!
+ * \internal
+ * \brief Panic the local host (if root) or tell pacemakerd to do so
+ */
+static void
+panic_local(void)
+{
+ int rc = pcmk_ok;
+ uid_t uid = geteuid();
+ pid_t ppid = getppid();
+
+ if(uid != 0 && ppid > 1) {
+ /* We're a non-root pacemaker daemon (pacemaker-based,
+ * pacemaker-controld, pacemaker-schedulerd, pacemaker-attrd, etc.) with
+ * the original pacemakerd parent.
+ *
+ * Of these, only the controller is likely to be initiating resets.
+ */
+ crm_emerg("Signaling parent %lld to panic", (long long) ppid);
+ crm_exit(CRM_EX_PANIC);
+ return;
+
+ } else if (uid != 0) {
+#if HAVE_LINUX_PROCFS
+ /*
+ * No permissions, and no pacemakerd parent to escalate to.
+ * Track down the new pacemakerd process and send a signal instead.
+ */
+ union sigval signal_value;
+
+ memset(&signal_value, 0, sizeof(signal_value));
+ ppid = pcmk__procfs_pid_of("pacemakerd");
+ crm_emerg("Signaling pacemakerd[%lld] to panic", (long long) ppid);
+
+ if(ppid > 1 && sigqueue(ppid, SIGQUIT, signal_value) < 0) {
+ crm_perror(LOG_EMERG, "Cannot signal pacemakerd[%lld] to panic",
+ (long long) ppid);
+ }
+#endif // HAVE_LINUX_PROCFS
+
+ /* The best we can do now is die */
+ crm_exit(CRM_EX_PANIC);
+ return;
+ }
+
+ /* We're either pacemakerd, or a pacemaker daemon running as root */
+
+ if (pcmk__str_eq("crash", getenv("PCMK_panic_action"), pcmk__str_casei)) {
+ sysrq_trigger('c');
+ } else if (pcmk__str_eq("sync-crash", getenv("PCMK_panic_action"), pcmk__str_casei)) {
+ sync();
+ sysrq_trigger('c');
+ } else {
+ if (pcmk__str_eq("sync-reboot", getenv("PCMK_panic_action"), pcmk__str_casei)) {
+ sync();
+ }
+ sysrq_trigger('b');
+ }
+ /* reboot(RB_HALT_SYSTEM); rc = errno; */
+ reboot(RB_AUTOBOOT);
+ rc = errno;
+
+ crm_emerg("Reboot failed, escalating to parent %lld: %s " CRM_XS " rc=%d",
+ (long long) ppid, pcmk_rc_str(rc), rc);
+
+ if(ppid > 1) {
+ /* child daemon */
+ exit(CRM_EX_PANIC);
+ } else {
+ /* pacemakerd or orphan child */
+ exit(CRM_EX_FATAL);
+ }
+}
+
+/*!
+ * \internal
+ * \brief Tell sbd to kill the local host, then exit
+ */
+static void
+panic_sbd(void)
+{
+ union sigval signal_value;
+ pid_t ppid = getppid();
+
+ crm_emerg("Signaling sbd[%lld] to panic", (long long) sbd_pid);
+
+ memset(&signal_value, 0, sizeof(signal_value));
+ /* TODO: Arrange for a slightly less brutal option? */
+ if(sigqueue(sbd_pid, SIGKILL, signal_value) < 0) {
+ crm_perror(LOG_EMERG, "Cannot signal sbd[%lld] to terminate",
+ (long long) sbd_pid);
+ panic_local();
+ }
+
+ if(ppid > 1) {
+ /* child daemon */
+ exit(CRM_EX_PANIC);
+ } else {
+ /* pacemakerd or orphan child */
+ exit(CRM_EX_FATAL);
+ }
+}
+
+/*!
+ * \internal
+ * \brief Panic the local host
+ *
+ * Panic the local host either by sbd (if running), directly, or by asking
+ * pacemakerd. If trace logging this function, exit instead.
+ *
+ * \param[in] origin Function caller (for logging only)
+ */
+void
+pcmk__panic(const char *origin)
+{
+ /* Ensure sbd_pid is set */
+ (void) pcmk__locate_sbd();
+
+ pcmk__if_tracing(
+ {
+ // getppid() == 1 means our original parent no longer exists
+ crm_emerg("Shutting down instead of panicking the node "
+ CRM_XS " origin=%s sbd=%lld parent=%d",
+ origin, (long long) sbd_pid, getppid());
+ crm_exit(CRM_EX_FATAL);
+ return;
+ },
+ {}
+ );
+
+ if(sbd_pid > 1) {
+ crm_emerg("Signaling sbd[%lld] to panic the system: %s",
+ (long long) sbd_pid, origin);
+ panic_sbd();
+
+ } else {
+ crm_emerg("Panicking the system directly: %s", origin);
+ panic_local();
+ }
+}
+
+/*!
+ * \internal
+ * \brief Return the process ID of sbd (or 0 if it is not running)
+ */
+pid_t
+pcmk__locate_sbd(void)
+{
+ char *pidfile = NULL;
+ char *sbd_path = NULL;
+ int rc;
+
+ if(sbd_pid > 1) {
+ return sbd_pid;
+ }
+
+ /* Look for the pid file */
+ pidfile = crm_strdup_printf(PCMK_RUN_DIR "/sbd.pid");
+ sbd_path = crm_strdup_printf("%s/sbd", SBIN_DIR);
+
+ /* Read the pid file */
+ rc = pcmk__pidfile_matches(pidfile, 0, sbd_path, &sbd_pid);
+ if (rc == pcmk_rc_ok) {
+ crm_trace("SBD detected at pid %lld (via PID file %s)",
+ (long long) sbd_pid, pidfile);
+
+#if HAVE_LINUX_PROCFS
+ } else {
+ /* Fall back to /proc for systems that support it */
+ sbd_pid = pcmk__procfs_pid_of("sbd");
+ crm_trace("SBD detected at pid %lld (via procfs)",
+ (long long) sbd_pid);
+#endif // HAVE_LINUX_PROCFS
+ }
+
+ if(sbd_pid < 0) {
+ sbd_pid = 0;
+ crm_trace("SBD not detected");
+ }
+
+ free(pidfile);
+ free(sbd_path);
+
+ return sbd_pid;
+}
+
+long
+pcmk__get_sbd_timeout(void)
+{
+ static long sbd_timeout = -2;
+
+ if (sbd_timeout == -2) {
+ sbd_timeout = crm_get_msec(getenv("SBD_WATCHDOG_TIMEOUT"));
+ }
+ return sbd_timeout;
+}
+
+bool
+pcmk__get_sbd_sync_resource_startup(void)
+{
+ static int sync_resource_startup = PCMK__SBD_SYNC_DEFAULT;
+ static bool checked_sync_resource_startup = false;
+
+ if (!checked_sync_resource_startup) {
+ const char *sync_env = getenv("SBD_SYNC_RESOURCE_STARTUP");
+
+ if (sync_env == NULL) {
+ crm_trace("Defaulting to %sstart-up synchronization with sbd",
+ (PCMK__SBD_SYNC_DEFAULT? "" : "no "));
+
+ } else if (crm_str_to_boolean(sync_env, &sync_resource_startup) < 0) {
+ crm_warn("Defaulting to %sstart-up synchronization with sbd "
+ "because environment value '%s' is invalid",
+ (PCMK__SBD_SYNC_DEFAULT? "" : "no "), sync_env);
+ }
+ checked_sync_resource_startup = true;
+ }
+ return sync_resource_startup != 0;
+}
+
+long
+pcmk__auto_watchdog_timeout(void)
+{
+ long sbd_timeout = pcmk__get_sbd_timeout();
+
+ return (sbd_timeout <= 0)? 0 : (2 * sbd_timeout);
+}
+
+bool
+pcmk__valid_sbd_timeout(const char *value)
+{
+ long st_timeout = value? crm_get_msec(value) : 0;
+
+ if (st_timeout < 0) {
+ st_timeout = pcmk__auto_watchdog_timeout();
+ crm_debug("Using calculated value %ld for stonith-watchdog-timeout (%s)",
+ st_timeout, value);
+ }
+
+ if (st_timeout == 0) {
+ crm_debug("Watchdog may be enabled but stonith-watchdog-timeout is disabled (%s)",
+ value? value : "default");
+
+ } else if (pcmk__locate_sbd() == 0) {
+ crm_emerg("Shutting down: stonith-watchdog-timeout configured (%s) "
+ "but SBD not active", (value? value : "auto"));
+ crm_exit(CRM_EX_FATAL);
+ return false;
+
+ } else {
+ long sbd_timeout = pcmk__get_sbd_timeout();
+
+ if (st_timeout < sbd_timeout) {
+ crm_emerg("Shutting down: stonith-watchdog-timeout (%s) too short "
+ "(must be >%ldms)", value, sbd_timeout);
+ crm_exit(CRM_EX_FATAL);
+ return false;
+ }
+ crm_info("Watchdog configured with stonith-watchdog-timeout %s and SBD timeout %ldms",
+ value, sbd_timeout);
+ }
+ return true;
+}