diff options
Diffstat (limited to 'tests/topotests/munet/mutini.py')
-rwxr-xr-x | tests/topotests/munet/mutini.py | 432 |
1 files changed, 432 insertions, 0 deletions
#!/usr/bin/env python3
# -*- coding: utf-8 eval: (blacken-mode 1) -*-
# SPDX-License-Identifier: GPL-2.0-or-later
#
# January 28 2023, Christian Hopps <chopps@labn.net>
#
# Copyright (c) 2023, LabN Consulting, L.L.C.
#
"""A tiny init for namespaces in python inspired by the C program tini."""


# pylint: disable=global-statement
import argparse
import errno
import logging
import os
import re
import shlex
import signal
import sys

from signal import Signals as S


try:
    from munet import linux
except ModuleNotFoundError:
    # We cannot use relative imports and still run this module directly as a script, and
    # there are some use cases where we want to run this file as a script.
    sys.path.append(os.path.dirname(os.path.realpath(__file__)))
    import linux


class g:
    """Global variables for our program."""

    # PID of the exec'd child we forward signals to; -1 until the fork returns.
    child_pid = -1
    # PID we were started with (1 means we already are an init process).
    orig_pid = os.getpid()
    exit_signal = False
    # Maps pid -> contents of /proc/<pid>/status, filled by pget_pid_status_item().
    pid_status_cache = {}
    # Signals whose handlers we changed and the child must reset before exec.
    restore_signals = set()
    very_verbose = False


# Single-letter flags (as used by unshare(1)) mapped to clone flag values.
unshare_flags = {
    "C": linux.CLONE_NEWCGROUP,
    "i": linux.CLONE_NEWIPC,
    "m": linux.CLONE_NEWNS,
    "n": linux.CLONE_NEWNET,
    "p": linux.CLONE_NEWPID,
    "u": linux.CLONE_NEWUTS,
    "T": linux.CLONE_NEWTIME,
}


# Signals we ignore while acting as init.
ignored_signals = {
    S.SIGTTIN,
    S.SIGTTOU,
}
# Signals left at their default disposition (never forwarded to the child).
abort_signals = {
    S.SIGABRT,
    S.SIGBUS,
    S.SIGFPE,
    S.SIGILL,
    S.SIGKILL,
    S.SIGSEGV,
    S.SIGSTOP,
    S.SIGSYS,
    S.SIGTRAP,
}
no_prop_signals = abort_signals | ignored_signals | {S.SIGCHLD}


def vdebug(*args, **kwargs):
    """Log a debug message, but only when very verbose (-vv) is enabled."""
    if g.very_verbose:
        logging.debug(*args, **kwargs)


def get_pid_status_item(status, stat):
    """Return the value of field `stat` from /proc/<pid>/status text `status`.

    Returns the stripped value, or None if the field is not present.
    """
    # Escape the field name so it is always matched literally.
    m = re.search(rf"(?:^|\n){re.escape(stat)}:\t(.*)(?:\n|$)", status)
    return m.group(1).strip() if m else None


def pget_pid_status_item(pid, stat):
    """Return field `stat` for `pid`, reading and caching /proc/<pid>/status.

    Returns None if the field is not present. Raises OSError if the status file
    cannot be read and is not already cached.
    """
    if pid not in g.pid_status_cache:
        with open(f"/proc/{pid}/status", "r", encoding="utf-8") as f:
            g.pid_status_cache[pid] = f.read().strip()
    # The value is already stripped (or None when missing), do not strip again.
    return get_pid_status_item(g.pid_status_cache[pid], stat)


def get_pid_name(pid):
    """Return the cached process name for `pid`, or str(pid) if not known.

    Only the cache is consulted; this is used after a process has been reaped, when
    its /proc entry can no longer be read.
    """
    status = g.pid_status_cache.get(pid)
    name = get_pid_status_item(status, "Name") if status else None
    return name or str(pid)


# def init_get_child_pids():
#     """Return list of "children" pids.
#     We consider any process with a 0 parent pid to also be our child as it
#     nsentered our pid namespace from an external parent.
#     """
#     g.pid_status_cache.clear()
#     pids = (int(x) for x in os.listdir("/proc") if x.isdigit() and x != "1")
#     return (
#         x for x in pids if x == g.child_pid or pget_pid_status_item(x, "PPid") == "0"
#     )


def _exit_with_code(ec):
    """Log and exit this process with exit code `ec`."""
    logging.debug("exiting with code %s", ec)
    sys.exit(ec)


def exit_with_status(status):
    """Exit mirroring a child's wait status.

    `status` must be a wait status as returned by os.wait()/os.waitpid() (not the
    `si_status` value from os.waitid()). A normal exit propagates the exit code, a
    fatal signal exits with 0x80 | signum, anything else exits with 255.
    """
    if os.WIFEXITED(status):
        ec = os.WEXITSTATUS(status)
    elif os.WIFSIGNALED(status):
        ec = 0x80 | os.WTERMSIG(status)
    else:
        ec = 255
    _exit_with_code(ec)


def waitpid(tag, nohang=False):
    """Reap one exited child; exit ourselves if it is our main child.

    Args:
        tag: string prefixed to log messages to identify the caller.
        nohang: if True do not block when no child has exited yet.

    Returns:
        True if a process other than `g.child_pid` was reaped, False if `nohang`
        was given and no child was ready. Does not return if the reaped process is
        `g.child_pid`.

    Raises:
        ChildProcessError: if there are no children to wait for.
    """
    logging.debug("%s: waitid for exiting process", tag)
    options = os.WEXITED | (os.WNOHANG if nohang else 0)
    idobj = os.waitid(os.P_ALL, 0, options)
    if idobj is None:
        # Only possible with WNOHANG: children exist but none has exited.
        return False

    pid = idobj.si_pid
    status = idobj.si_status

    if pid != g.child_pid:
        pidname = get_pid_name(pid)
        logging.debug(
            "%s: reaped zombie %s (%s) w/ status %s", tag, pid, pidname, status
        )
        return True

    logging.debug("reaped child with status %s", status)
    # With waitid() si_status is the plain exit code when si_code is CLD_EXITED,
    # otherwise (killed/dumped) it is the terminating signal number. It is NOT an
    # encoded wait status so it must not be passed through exit_with_status().
    if idobj.si_code == os.CLD_EXITED:
        _exit_with_code(status)
    _exit_with_code(0x80 | status)
    # NOTREACHED
    return True


def sig_trasmit(signum, _):
    """Signal handler: forward `signum` to our child process.

    If there is no child yet we exit with 0x80 | signum. If the child is already
    gone (ESRCH) the signal is dropped, any other error forwarding is fatal.
    """
    signame = signal.Signals(signum).name
    if g.child_pid == -1:
        # We've received a signal after setting up to be init proc
        # but prior to fork or fork returning with child pid
        logging.debug("received %s prior to child exec, exiting", signame)
        sys.exit(0x80 | signum)

    try:
        os.kill(g.child_pid, signum)
    except OSError as error:
        if error.errno != errno.ESRCH:
            logging.error(
                "error forwarding signal %s to child, exiting: %s", signum, error
            )
            sys.exit(0x80 | signum)
        logging.debug("child pid %s exited prior to signaling", g.child_pid)


def sig_sigchld(signum, _):
    """Signal handler: reap every child that has exited.

    Multiple child exits may be coalesced into a single SIGCHLD delivery, so keep
    reaping (without blocking) until nothing more is ready.
    """
    assert signum == S.SIGCHLD
    reaped = 0
    try:
        while waitpid("SIGCHLD", nohang=True):
            reaped += 1
    except ChildProcessError as error:
        # Running out of children after reaping some is the normal way to finish.
        if not reaped:
            logging.warning("got SIGCHLD but no pid to wait on: %s", error)


def setup_init_signals():
    """Install the signal dispositions we use while acting as init.

    SIGCHLD gets our reaping handler, `ignored_signals` are ignored,
    `abort_signals` are left untouched, real-time signals are skipped and every
    other named signal is forwarded to the child. Each signal we change is recorded
    in `g.restore_signals` so the child can reset it prior to exec.
    """
    valid = set(signal.valid_signals())
    named = {x.value for x in signal.Signals}
    for snum in sorted(named):
        if snum not in valid:
            continue
        if S.SIGRTMIN <= snum <= S.SIGRTMAX:
            continue

        sname = signal.Signals(snum).name
        if snum == S.SIGCHLD:
            vdebug("installing local handler for %s", sname)
            signal.signal(snum, sig_sigchld)
            g.restore_signals.add(snum)
        elif snum in ignored_signals:
            vdebug("installing ignore handler for %s", sname)
            signal.signal(snum, signal.SIG_IGN)
            g.restore_signals.add(snum)
        elif snum in abort_signals:
            vdebug("leaving default handler for %s", sname)
            # signal.signal(snum, signal.SIG_DFL)
        else:
            vdebug("installing trasmit signal handler for %s", sname)
            try:
                signal.signal(snum, sig_trasmit)
                g.restore_signals.add(snum)
            except OSError as error:
                logging.warning(
                    "failed installing signal handler for %s: %s", sname, error
                )


def new_process_group():
    """Create and lead a new process group.

    This function will create a new process group if we are not yet leading one, and
    additionally foreground said process group in our session. This foregrounding
    action is copied from tini, and I believe serves a purpose when serving as init
    for a container (e.g., podman).
    """
    pid = os.getpid()
    try:
        pgid = os.getpgrp()
        if pgid == pid:
            logging.debug("already process group leader %s", pgid)
        else:
            logging.debug("creating new process group %s", pid)
            os.setpgid(pid, 0)
    except OSError as error:
        logging.warning("unable to get new process group: %s", error)
        return

    # Ignore these in order to allow foregrounding, otherwise we'd be stopped by
    # SIGTTOU when changing the foreground group from the background.
    signal.signal(S.SIGTTIN, signal.SIG_IGN)
    signal.signal(S.SIGTTOU, signal.SIG_IGN)
    fd = sys.stdin.fileno()
    if not os.isatty(fd):
        logging.debug("stdin not a tty no foregrounding required")
    else:
        try:
            # This will error if our session no longer associated with controlling tty.
            pgid = os.tcgetpgrp(fd)
            if pgid == pid:
                logging.debug("process group already in foreground %s", pgid)
            else:
                logging.debug("making us the foreground pgid backgrounding %s", pgid)
                os.tcsetpgrp(fd, pid)
        except OSError as error:
            if error.errno == errno.ENOTTY:
                logging.debug("session is no longer associated with controlling tty")
            else:
                logging.warning("unable to foreground pgid %s: %s", pid, error)
    signal.signal(S.SIGTTIN, signal.SIG_DFL)
    signal.signal(S.SIGTTOU, signal.SIG_DFL)


def is_creating_pid_namespace():
    """Return True if our future children will be in a different PID namespace.

    Compares the namespace we are in with the one our children will be created in.

    Raises:
        OSError: if the namespace links under /proc/self/ns cannot be read.
    """
    current = os.readlink("/proc/self/ns/pid")
    for_children = os.readlink("/proc/self/ns/pid_for_children")
    return current != for_children


def be_init(new_pg, exec_args):
    """Become an init process, optionally executing and supervising a command.

    If we are not already pid 1 we first fork (the caller is expected to have
    unshared into a new PID namespace); the original process waits for that child
    and exits with its status, the child continues as pid 1 with a new /proc.

    Args:
        new_pg: if True and there are no `exec_args`, create/lead a new process
            group.
        exec_args: argv list of the command to run, if empty we only reap zombies.

    Does not return.
    """
    #
    # Arrange for us to be killed when our parent dies, this will subsequently also kill
    # all procs in any PID namespace we are init for.
    #
    logging.debug("set us to be SIGKILLed when parent exits")
    linux.set_parent_death_signal(signal.SIGKILL)

    # If we are creating a new PID namespace for children...
    if g.orig_pid != 1:
        logging.debug("started as pid %s", g.orig_pid)
        # assert is_creating_pid_namespace()

        # Fork to become pid 1
        logging.debug("forking to become pid 1")
        child_pid = os.fork()
        if child_pid:
            logging.debug("in parent waiting on child pid %s to exit", child_pid)
            # os.waitpid() returns (pid, status); only the status is a wait status.
            _, status = os.waitpid(child_pid, 0)
            logging.debug("got child exit status %s", status)
            exit_with_status(status)
            # NOTREACHED

        # We must be pid 1 now.
        logging.debug("in child as pid %s", os.getpid())
        assert os.getpid() == 1

        # We need a new /proc now.
        logging.debug("mount new /proc")
        linux.mount("proc", "/proc", "proc")

        # If the parent exits kill us using SIGKILL
        logging.debug("set us to be SIGKILLed when parent exits")
        linux.set_parent_death_signal(signal.SIGKILL)

    if not exec_args:
        if not new_pg:
            logging.debug("no exec args, no new process group")
            # # if 0 == os.getpgid(0):
            # status = os.setpgid(0, 1)
            # logging.debug("os.setpgid(0, 1) == %s", status)
        else:
            logging.debug("no exec args, creating new process group")
            # No exec so we are the "child".
            new_process_group()

        # Reap children as init process
        vdebug("installing local handler for SIGCHLD")
        signal.signal(signal.SIGCHLD, sig_sigchld)

        while True:
            logging.info("init: waiting to reap zombies")
            linux.pause()
        # NOTREACHED

    # Set (parent) signal handlers before any fork to avoid race
    setup_init_signals()

    logging.debug("forking to execute child")
    g.child_pid = os.fork()
    if g.child_pid == 0:
        # In child, restore signals to default handling:
        for snum in g.restore_signals:
            signal.signal(snum, signal.SIG_DFL)

        # XXX is a new pg right?
        new_process_group()
        logging.debug("child: executing '%s'", shlex.join(exec_args))
        os.execvp(exec_args[0], exec_args)
        # NOTREACHED

    while True:
        logging.info("parent: waiting for child pid %s to exit", g.child_pid)
        waitpid("parent")


def unshare(flags):
    """Unshare into new namespaces.

    Args:
        flags: string of single-letter flags, each a key of `unshare_flags`.

    Returns:
        True if a new PID namespace was requested ('p'), otherwise False.

    Raises:
        ValueError: if `flags` contains an unknown flag letter.
    """
    uflags = 0
    for flag in flags:
        if flag not in unshare_flags:
            raise ValueError(f"unknown unshare flag '{flag}'")
        uflags |= unshare_flags[flag]
    new_pid = bool(uflags & linux.CLONE_NEWPID)
    new_mnt = bool(uflags & linux.CLONE_NEWNS)

    logging.debug("unshareing with flags: %s", linux.clone_flag_string(uflags))
    linux.unshare(uflags)

    if new_pid and not new_mnt:
        try:
            # If we are not creating new mount namespace, remount /proc private
            # so that our mount of a new /proc doesn't affect parent namespace
            logging.debug("remount /proc recursive private")
            linux.mount("none", "/proc", None, linux.MS_REC | linux.MS_PRIVATE)
        except OSError as error:
            # EINVAL is OK b/c /proc not mounted may cause an error
            if error.errno != errno.EINVAL:
                raise
    if new_mnt:
        # Remount root as recursive private.
        logging.debug("remount / recursive private")
        linux.mount("none", "/", None, linux.MS_REC | linux.MS_PRIVATE)

    # if new_pid:
    #     logging.debug("mount new /proc")
    #     linux.mount("proc", "/proc", "proc")

    return new_pid


def main():
    """Parse the command line, unshare namespaces as requested, and act as init.

    Exits with 0x80 | SIGINT on KeyboardInterrupt and with 5 on any other
    unexpected exception (or if be_init() were ever to return).
    """
    #
    # Parse CLI args.
    #

    ap = argparse.ArgumentParser()
    ap.add_argument(
        "-P",
        "--no-proc-group",
        action="store_true",
        help="set to inherit the process group",
    )
    valid_flags = "".join(unshare_flags)
    ap.add_argument(
        "--unshare-flags",
        help=(
            f"string of unshare(1) flags. Supported values from '{valid_flags}'."
            " 'm' will remount `/` recursive private. 'p' will remount /proc"
            " and fork, and the child will be signaled to exit on exit of parent.."
        ),
    )
    ap.add_argument(
        "-v", dest="verbose", action="count", default=0, help="more -v's, more verbose"
    )
    ap.add_argument("rest", nargs=argparse.REMAINDER)
    args = ap.parse_args()

    #
    # Setup logging.
    #

    level = logging.DEBUG if args.verbose else logging.INFO
    if args.verbose > 1:
        g.very_verbose = True
    logging.basicConfig(
        level=level, format="%(asctime)s mutini: %(levelname)s: %(message)s"
    )

    #
    # Run program
    #

    status = 5
    try:
        new_pid = False
        if args.unshare_flags:
            new_pid = unshare(args.unshare_flags)

        if g.orig_pid != 1 and not new_pid:
            # Simply hold the namespaces
            while True:
                logging.info("holding namespace waiting to be signaled to exit")
                linux.pause()
            # NOTREACHED

        be_init(not args.no_proc_group, args.rest)
        # NOTREACHED
        logging.critical("Exited from be_init!")
    except KeyboardInterrupt:
        logging.info("exiting (main), received KeyboardInterrupt in main")
        status = 0x80 | signal.SIGINT
    except Exception as error:
        logging.info("exiting (main), do to exception %s", error, exc_info=True)

    sys.exit(status)


if __name__ == "__main__":
    main()