diff options
Diffstat (limited to '')
-rw-r--r-- | src/nspawn/nspawn-stub-pid1.c | 191 |
1 files changed, 191 insertions, 0 deletions
diff --git a/src/nspawn/nspawn-stub-pid1.c b/src/nspawn/nspawn-stub-pid1.c new file mode 100644 index 0000000..5d17df3 --- /dev/null +++ b/src/nspawn/nspawn-stub-pid1.c @@ -0,0 +1,191 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include <sys/reboot.h> +#include <sys/wait.h> +#include <sys/prctl.h> +#include <unistd.h> + +#include "def.h" +#include "exit-status.h" +#include "fd-util.h" +#include "log.h" +#include "missing.h" +#include "nspawn-stub-pid1.h" +#include "process-util.h" +#include "signal-util.h" +#include "time-util.h" + +static int reset_environ(const char *new_environment, size_t length) { + unsigned long start, end; + + start = (unsigned long) new_environment; + end = start + length; + + if (prctl(PR_SET_MM, PR_SET_MM_ENV_START, start, 0, 0) < 0) + return -errno; + + if (prctl(PR_SET_MM, PR_SET_MM_ENV_END, end, 0, 0) < 0) + return -errno; + + return 0; +} + +int stub_pid1(sd_id128_t uuid) { + enum { + STATE_RUNNING, + STATE_REBOOT, + STATE_POWEROFF, + } state = STATE_RUNNING; + + sigset_t fullmask, oldmask, waitmask; + usec_t quit_usec = USEC_INFINITY; + pid_t pid; + int r; + + /* The new environment we set up, on the stack. */ + char new_environment[] = + "container=systemd-nspawn\0" + "container_uuid=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"; + + /* Implements a stub PID 1, that reaps all processes and processes a couple of standard signals. This is useful + * for allowing arbitrary processes run in a container, and still have all zombies reaped. */ + + assert_se(sigfillset(&fullmask) >= 0); + assert_se(sigprocmask(SIG_BLOCK, &fullmask, &oldmask) >= 0); + + pid = fork(); + if (pid < 0) + return log_error_errno(errno, "Failed to fork child pid: %m"); + + if (pid == 0) { + /* Return in the child */ + assert_se(sigprocmask(SIG_SETMASK, &oldmask, NULL) >= 0); + setsid(); + return 0; + } + + reset_all_signal_handlers(); + + log_close(); + close_all_fds(NULL, 0); + log_open(); + + /* Flush out /proc/self/environ, so that we don't leak the environment from the host into the container. Also, + * set $container= and $container_uuid= so that clients in the container that query it from /proc/1/environ + * find them set. */ + sd_id128_to_string(uuid, new_environment + sizeof(new_environment) - SD_ID128_STRING_MAX); + reset_environ(new_environment, sizeof(new_environment)); + + (void) rename_process("(sd-stubinit)"); + + assert_se(sigemptyset(&waitmask) >= 0); + assert_se(sigset_add_many(&waitmask, + SIGCHLD, /* posix: process died */ + SIGINT, /* sysv: ctrl-alt-del */ + SIGRTMIN+3, /* systemd: halt */ + SIGRTMIN+4, /* systemd: poweroff */ + SIGRTMIN+5, /* systemd: reboot */ + SIGRTMIN+6, /* systemd: kexec */ + SIGRTMIN+13, /* systemd: halt */ + SIGRTMIN+14, /* systemd: poweroff */ + SIGRTMIN+15, /* systemd: reboot */ + SIGRTMIN+16, /* systemd: kexec */ + -1) >= 0); + + /* Note that we ignore SIGTERM (sysv's reexec), SIGHUP (reload), and all other signals here, since we don't + * support reexec/reloading in this stub process. */ + + for (;;) { + siginfo_t si; + usec_t current_usec; + + si.si_pid = 0; + r = waitid(P_ALL, 0, &si, WEXITED|WNOHANG); + if (r < 0) { + r = log_error_errno(errno, "Failed to reap children: %m"); + goto finish; + } + + current_usec = now(CLOCK_MONOTONIC); + + if (si.si_pid == pid || current_usec >= quit_usec) { + + /* The child we started ourselves died or we reached a timeout. */ + + if (state == STATE_REBOOT) { /* dispatch a queued reboot */ + (void) reboot(RB_AUTOBOOT); + r = log_error_errno(errno, "Failed to reboot: %m"); + goto finish; + + } else if (state == STATE_POWEROFF) + (void) reboot(RB_POWER_OFF); /* if this fails, fall back to normal exit. */ + + if (si.si_pid == pid && si.si_code == CLD_EXITED) + r = si.si_status; /* pass on exit code */ + else + r = EXIT_EXCEPTION; /* signal, coredump, timeout, … */ + + goto finish; + } + if (si.si_pid != 0) + /* We reaped something. Retry until there's nothing more to reap. */ + continue; + + if (quit_usec == USEC_INFINITY) + r = sigwaitinfo(&waitmask, &si); + else { + struct timespec ts; + r = sigtimedwait(&waitmask, &si, timespec_store(&ts, quit_usec - current_usec)); + } + if (r < 0) { + if (errno == EINTR) /* strace -p attach can result in EINTR, let's handle this nicely. */ + continue; + if (errno == EAGAIN) /* timeout reached */ + continue; + + r = log_error_errno(errno, "Failed to wait for signal: %m"); + goto finish; + } + + if (si.si_signo == SIGCHLD) + continue; /* Let's reap this */ + + if (state != STATE_RUNNING) + continue; + + /* Would love to use a switch() statement here, but SIGRTMIN is actually a function call, not a + * constant… */ + + if (si.si_signo == SIGRTMIN+3 || + si.si_signo == SIGRTMIN+4 || + si.si_signo == SIGRTMIN+13 || + si.si_signo == SIGRTMIN+14) + + state = STATE_POWEROFF; + + else if (si.si_signo == SIGINT || + si.si_signo == SIGRTMIN+5 || + si.si_signo == SIGRTMIN+6 || + si.si_signo == SIGRTMIN+15 || + si.si_signo == SIGRTMIN+16) + + state = STATE_REBOOT; + else + assert_not_reached("Got unexpected signal"); + + r = kill_and_sigcont(pid, SIGTERM); + + /* Let's send a SIGHUP after the SIGTERM, as shells tend to ignore SIGTERM but do react to SIGHUP. We + * do it strictly in this order, so that the SIGTERM is dispatched first, and SIGHUP second for those + * processes which handle both. That's because services tend to bind configuration reload or something + * else to SIGHUP. */ + + if (r != -ESRCH) + (void) kill(pid, SIGHUP); + + quit_usec = now(CLOCK_MONOTONIC) + DEFAULT_TIMEOUT_USEC; + } + +finish: + _exit(r < 0 ? EXIT_FAILURE : r); +} |