diff options
Diffstat (limited to '')
-rw-r--r-- | src/nspawn/nspawn-bind-user.c | 6 | ||||
-rw-r--r-- | src/nspawn/nspawn-cgroup.c | 95 | ||||
-rw-r--r-- | src/nspawn/nspawn-cgroup.h | 3 | ||||
-rw-r--r-- | src/nspawn/nspawn-gperf.gperf | 2 | ||||
-rw-r--r-- | src/nspawn/nspawn-mount.c | 102 | ||||
-rw-r--r-- | src/nspawn/nspawn-mount.h | 1 | ||||
-rw-r--r-- | src/nspawn/nspawn-network.c | 410 | ||||
-rw-r--r-- | src/nspawn/nspawn-network.h | 2 | ||||
-rw-r--r-- | src/nspawn/nspawn-oci.c | 44 | ||||
-rw-r--r-- | src/nspawn/nspawn-register.c | 19 | ||||
-rw-r--r-- | src/nspawn/nspawn-settings.c | 44 | ||||
-rw-r--r-- | src/nspawn/nspawn-settings.h | 2 | ||||
-rw-r--r-- | src/nspawn/nspawn-setuid.c | 13 | ||||
-rw-r--r-- | src/nspawn/nspawn.c | 1119 | ||||
-rw-r--r-- | src/nspawn/nspawn.h | 1 | ||||
-rw-r--r-- | src/nspawn/test-nspawn-util.c | 2 |
16 files changed, 1290 insertions, 575 deletions
diff --git a/src/nspawn/nspawn-bind-user.c b/src/nspawn/nspawn-bind-user.c index 61d8d30..018e7a3 100644 --- a/src/nspawn/nspawn-bind-user.c +++ b/src/nspawn/nspawn-bind-user.c @@ -153,7 +153,7 @@ static int find_free_uid(const char *directory, uid_t max_uid, uid_t *current_ui assert(directory); assert(current_uid); - for (;; (*current_uid) ++) { + for (;; (*current_uid)++) { if (*current_uid > MAP_UID_MAX || *current_uid > max_uid) return log_error_errno( SYNTHETIC_ERRNO(EBUSY), @@ -388,9 +388,9 @@ int bind_user_setup( if (!c || c->n_data == 0) return 0; - r = userns_mkdir(root, "/run/host", 0755, 0, 0); + r = make_run_host(root); if (r < 0) - return log_error_errno(r, "Failed to create /run/host: %m"); + return r; r = userns_mkdir(root, "/run/host/home", 0755, 0, 0); if (r < 0) diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c index a500243..4f28b4a 100644 --- a/src/nspawn/nspawn-cgroup.c +++ b/src/nspawn/nspawn-cgroup.c @@ -13,6 +13,7 @@ #include "mountpoint-util.h" #include "nspawn-cgroup.h" #include "nspawn-mount.h" +#include "nsresource.h" #include "path-util.h" #include "rm-rf.h" #include "string-util.h" @@ -46,38 +47,6 @@ static int chown_cgroup_path(const char *path, uid_t uid_shift) { return 0; } -int chown_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift) { - _cleanup_free_ char *path = NULL, *fs = NULL; - int r; - - r = cg_pid_get_path(NULL, pid, &path); - if (r < 0) - return log_error_errno(r, "Failed to get container cgroup path: %m"); - - r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs); - if (r < 0) - return log_error_errno(r, "Failed to get file system path for container cgroup: %m"); - - r = chown_cgroup_path(fs, uid_shift); - if (r < 0) - return log_error_errno(r, "Failed to chown() cgroup %s: %m", fs); - - if (unified_requested == CGROUP_UNIFIED_SYSTEMD || (unified_requested == CGROUP_UNIFIED_NONE && cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)) { - _cleanup_free_ char *lfs = NULL; - /* Always propagate access rights from unified to legacy controller */ - - r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, NULL, &lfs); - if (r < 0) - return log_error_errno(r, "Failed to get file system path for container cgroup: %m"); - - r = chown_cgroup_path(lfs, uid_shift); - if (r < 0) - return log_error_errno(r, "Failed to chown() cgroup %s: %m", lfs); - } - - return 0; -} - int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift) { _cleanup_free_ char *cgroup = NULL; char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1]; @@ -142,7 +111,14 @@ finish: return r; } -int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested) { +int create_subcgroup( + pid_t pid, + bool keep_unit, + CGroupUnified unified_requested, + uid_t uid_shift, + int userns_fd, + bool privileged) { + _cleanup_free_ char *cgroup = NULL, *payload = NULL; CGroupMask supported; char *e; @@ -185,13 +161,54 @@ int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested) if (!payload) return log_oom(); - r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, payload, pid); + if (privileged) + r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, payload, pid); + else + r = cg_create(SYSTEMD_CGROUP_CONTROLLER, payload); if (r < 0) return log_error_errno(r, "Failed to create %s subcgroup: %m", payload); + if (privileged) { + _cleanup_free_ char *fs = NULL; + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, payload, NULL, &fs); + if (r < 0) + return log_error_errno(r, "Failed to get file system path for container cgroup: %m"); + + r = chown_cgroup_path(fs, uid_shift); + if (r < 0) + return log_error_errno(r, "Failed to chown() cgroup %s: %m", fs); + + } else if (userns_fd >= 0) { + _cleanup_close_ int cgroup_fd = -EBADF; + + cgroup_fd = cg_path_open(SYSTEMD_CGROUP_CONTROLLER, payload); + if (cgroup_fd < 0) + return log_error_errno(cgroup_fd, "Failed to open cgroup %s: %m", payload); + + r = cg_fd_attach(cgroup_fd, pid); + if (r < 0) + return log_error_errno(r, "Failed to add process " PID_FMT " to cgroup %s: %m", pid, payload); + + r = nsresource_add_cgroup(userns_fd, cgroup_fd); + if (r < 0) + return log_error_errno(r, "Failed to add cgroup %s to userns: %m", payload); + } + + if (unified_requested == CGROUP_UNIFIED_SYSTEMD || (unified_requested == CGROUP_UNIFIED_NONE && cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)) { + _cleanup_free_ char *lfs = NULL; + /* Always propagate access rights from unified to legacy controller */ + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER_LEGACY, payload, NULL, &lfs); + if (r < 0) + return log_error_errno(r, "Failed to get file system path for container cgroup: %m"); + + r = chown_cgroup_path(lfs, uid_shift); + if (r < 0) + return log_error_errno(r, "Failed to chown() cgroup %s: %m", lfs); + } + if (keep_unit) { _cleanup_free_ char *supervisor = NULL; - supervisor = path_join(cgroup, "supervisor"); if (!supervisor) return log_oom(); @@ -265,7 +282,7 @@ static int mount_legacy_cgroup_hierarchy( to = strjoina(strempty(dest), "/sys/fs/cgroup/", hierarchy); - r = path_is_mount_point(to, dest, 0); + r = path_is_mount_point_full(to, dest, /* flags = */ 0); if (r < 0 && r != -ENOENT) return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to); if (r > 0) @@ -317,7 +334,7 @@ static int mount_legacy_cgns_supported( (void) mkdir_p(cgroup_root, 0755); /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */ - r = path_is_mount_point(cgroup_root, dest, AT_SYMLINK_FOLLOW); + r = path_is_mount_point_full(cgroup_root, dest, AT_SYMLINK_FOLLOW); if (r < 0) return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m"); if (r == 0) { @@ -427,7 +444,7 @@ static int mount_legacy_cgns_unsupported( (void) mkdir_p(cgroup_root, 0755); /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */ - r = path_is_mount_point(cgroup_root, dest, AT_SYMLINK_FOLLOW); + r = path_is_mount_point_full(cgroup_root, dest, AT_SYMLINK_FOLLOW); if (r < 0) return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m"); if (r == 0) { @@ -529,7 +546,7 @@ static int mount_unified_cgroups(const char *dest) { (void) mkdir_p(p, 0755); - r = path_is_mount_point(p, dest, AT_SYMLINK_FOLLOW); + r = path_is_mount_point_full(p, dest, AT_SYMLINK_FOLLOW); if (r < 0) return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p); if (r > 0) { diff --git a/src/nspawn/nspawn-cgroup.h b/src/nspawn/nspawn-cgroup.h index 3f5ba62..7e2cd53 100644 --- a/src/nspawn/nspawn-cgroup.h +++ b/src/nspawn/nspawn-cgroup.h @@ -6,9 +6,8 @@ #include "cgroup-util.h" -int chown_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift); int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift); -int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested); +int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested, uid_t uid_shift, int userns_fd, bool privileged); int mount_cgroups(const char *dest, CGroupUnified unified_requested, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool use_cgns); int mount_systemd_cgroup_writable(const char *dest, CGroupUnified unified_requested); diff --git a/src/nspawn/nspawn-gperf.gperf b/src/nspawn/nspawn-gperf.gperf index 9e1210f..123ef0c 100644 --- a/src/nspawn/nspawn-gperf.gperf +++ b/src/nspawn/nspawn-gperf.gperf @@ -58,7 +58,7 @@ Exec.OOMScoreAdjust, config_parse_oom_score_adjust, 0, Exec.CPUAffinity, config_parse_cpu_affinity, 0, 0 Exec.ResolvConf, config_parse_resolv_conf, 0, offsetof(Settings, resolv_conf) Exec.LinkJournal, config_parse_link_journal, 0, 0 -Exec.Timezone, config_parse_timezone, 0, offsetof(Settings, timezone) +Exec.Timezone, config_parse_timezone_mode, 0, offsetof(Settings, timezone) Exec.SuppressSync, config_parse_tristate, 0, offsetof(Settings, suppress_sync) Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only) Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode) diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c index 470f477..c2bd4f6 100644 --- a/src/nspawn/nspawn-mount.c +++ b/src/nspawn/nspawn-mount.c @@ -245,7 +245,7 @@ int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) assert(l); assert(n); - r = extract_many_words(&s, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL); + r = extract_many_words(&s, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination); if (r < 0) return r; if (r == 0) @@ -444,22 +444,38 @@ int tmpfs_patch_options( } int mount_sysfs(const char *dest, MountSettingsMask mount_settings) { - const char *full, *top; - int r; + _cleanup_free_ char *top = NULL, *full = NULL;; unsigned long extra_flags = 0; + int r; - top = prefix_roota(dest, "/sys"); - r = path_is_fs_type(top, SYSFS_MAGIC); + top = path_join(dest, "/sys"); + if (!top) + return log_oom(); + + r = path_is_mount_point(top); if (r < 0) - return log_error_errno(r, "Failed to determine filesystem type of %s: %m", top); - /* /sys might already be mounted as sysfs by the outer child in the - * !netns case. In this case, it's all good. Don't touch it because we - * don't have the right to do so, see https://github.com/systemd/systemd/issues/1555. - */ - if (r > 0) - return 0; + return log_error_errno(r, "Failed to determine if '%s' is a mountpoint: %m", top); + if (r == 0) { + /* If this is not a mount point yet, then mount a tmpfs there */ + r = mount_nofollow_verbose(LOG_ERR, "tmpfs", top, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0555" TMPFS_LIMITS_SYS); + if (r < 0) + return r; + } else { + r = path_is_fs_type(top, SYSFS_MAGIC); + if (r < 0) + return log_error_errno(r, "Failed to determine filesystem type of %s: %m", top); + + /* /sys/ might already be mounted as sysfs by the outer child in the !netns case. In this case, it's + * all good. Don't touch it because we don't have the right to do so, see + * https://github.com/systemd/systemd/issues/1555. + */ + if (r > 0) + return 0; + } - full = prefix_roota(top, "/full"); + full = path_join(top, "/full"); + if (!full) + return log_oom(); (void) mkdir(full, 0755); @@ -501,10 +517,11 @@ int mount_sysfs(const char *dest, MountSettingsMask mount_settings) { if (rmdir(full) < 0) return log_error_errno(errno, "Failed to remove %s: %m", full); - /* Create mountpoint for cgroups. Otherwise we are not allowed since we - * remount /sys read-only. - */ - const char *x = prefix_roota(top, "/fs/cgroup"); + /* Create mountpoint for cgroups. Otherwise we are not allowed since we remount /sys/ read-only. */ + _cleanup_free_ char *x = path_join(top, "/fs/cgroup"); + if (!x) + return log_oom(); + (void) mkdir_p(x, 0755); return mount_nofollow_verbose(LOG_ERR, NULL, top, NULL, @@ -541,7 +558,7 @@ int mount_all(const char *dest, } MountPoint; static const MountPoint mount_table[] = { - /* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing) */ + /* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing when we are privileged) */ { "proc", "/proc", "proc", NULL, PROC_DEFAULT_MOUNT_FLAGS, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_MKDIR|MOUNT_FOLLOW_SYMLINKS }, /* we follow symlinks here since not following them requires /proc/ already being mounted, which we don't have here. */ @@ -575,15 +592,15 @@ int mount_all(const char *dest, { "mqueue", "/dev/mqueue", "mqueue", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_IN_USERNS|MOUNT_MKDIR }, - /* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing) */ + /* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing when we are privileged) */ { "tmpfs", "/tmp", "tmpfs", "mode=01777" NESTED_TMPFS_LIMITS, MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL|MOUNT_APPLY_TMPFS_TMP|MOUNT_MKDIR }, { "tmpfs", "/sys", "tmpfs", "mode=0555" TMPFS_LIMITS_SYS, MS_NOSUID|MS_NOEXEC|MS_NODEV, - MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS|MOUNT_MKDIR }, + MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS|MOUNT_MKDIR|MOUNT_PRIVILEGED }, { "sysfs", "/sys", "sysfs", NULL, SYS_DEFAULT_MOUNT_FLAGS, - MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO|MOUNT_MKDIR }, /* skipped if above was mounted */ + MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO|MOUNT_MKDIR|MOUNT_PRIVILEGED }, /* skipped if above was mounted */ { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, - MOUNT_FATAL|MOUNT_MKDIR }, /* skipped if above was mounted */ + MOUNT_FATAL|MOUNT_MKDIR|MOUNT_PRIVILEGED }, /* skipped if above was mounted */ { "tmpfs", "/dev", "tmpfs", "mode=0755" TMPFS_LIMITS_PRIVATE_DEV, MS_NOSUID|MS_STRICTATIME, MOUNT_FATAL|MOUNT_MKDIR }, { "tmpfs", "/dev/shm", "tmpfs", "mode=01777" NESTED_TMPFS_LIMITS, MS_NOSUID|MS_NODEV|MS_STRICTATIME, @@ -604,11 +621,11 @@ int mount_all(const char *dest, MOUNT_FATAL|MOUNT_IN_USERNS }, #if HAVE_SELINUX { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, - MOUNT_MKDIR }, /* Bind mount first (mkdir/chown the mount point in case /sys/ is mounted as minimal skeleton tmpfs) */ + MOUNT_MKDIR|MOUNT_PRIVILEGED }, /* Bind mount first (mkdir/chown the mount point in case /sys/ is mounted as minimal skeleton tmpfs) */ { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, - 0 }, /* Then, make it r/o (don't mkdir/chown the mount point here, the previous entry already did that) */ + MOUNT_PRIVILEGED }, /* Then, make it r/o (don't mkdir/chown the mount point here, the previous entry already did that) */ { NULL, "/sys/fs/selinux", NULL, NULL, MS_PRIVATE, - 0 }, /* Turn off propagation (we only want that for the mount propagation tunnel dir) */ + MOUNT_PRIVILEGED }, /* Turn off propagation (we only want that for the mount propagation tunnel dir) */ #endif }; @@ -617,6 +634,7 @@ int mount_all(const char *dest, bool ro = FLAGS_SET(mount_settings, MOUNT_APPLY_APIVFS_RO); bool in_userns = FLAGS_SET(mount_settings, MOUNT_IN_USERNS); bool tmpfs_tmp = FLAGS_SET(mount_settings, MOUNT_APPLY_TMPFS_TMP); + bool privileged = FLAGS_SET(mount_settings, MOUNT_PRIVILEGED); int r; for (size_t k = 0; k < ELEMENTSOF(mount_table); k++) { @@ -624,6 +642,10 @@ int mount_all(const char *dest, bool fatal = FLAGS_SET(mount_table[k].mount_settings, MOUNT_FATAL); const char *o; + /* If we are not privileged but the entry is marked as privileged and to be mounted outside the user namespace, then skip it */ + if (!privileged && FLAGS_SET(mount_table[k].mount_settings, MOUNT_PRIVILEGED) && !FLAGS_SET(mount_table[k].mount_settings, MOUNT_IN_USERNS)) + continue; + if (in_userns != FLAGS_SET(mount_table[k].mount_settings, MOUNT_IN_USERNS)) continue; @@ -642,7 +664,7 @@ int mount_all(const char *dest, /* Skip this entry if it is not a remount. */ if (mount_table[k].what) { - r = path_is_mount_point(where, NULL, 0); + r = path_is_mount_point(where); if (r < 0 && r != -ENOENT) return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where); if (r > 0) @@ -742,6 +764,8 @@ static int parse_mount_bind_options(const char *options, unsigned long *mount_fl new_idmapping = REMOUNT_IDMAPPING_NONE; else if (streq(word, "rootidmap")) new_idmapping = REMOUNT_IDMAPPING_HOST_OWNER; + else if (streq(word, "owneridmap")) + new_idmapping = REMOUNT_IDMAPPING_HOST_OWNER_TO_TARGET_OWNER; else return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid bind mount option: %s", word); @@ -759,6 +783,7 @@ static int mount_bind(const char *dest, CustomMount *m, uid_t uid_shift, uid_t u _cleanup_free_ char *mount_opts = NULL, *where = NULL; unsigned long mount_flags = MS_BIND | MS_REC; struct stat source_st, dest_st; + uid_t dest_uid = UID_INVALID; int r; RemountIdmapping idmapping = REMOUNT_IDMAPPING_NONE; @@ -787,6 +812,8 @@ static int mount_bind(const char *dest, CustomMount *m, uid_t uid_shift, uid_t u if (stat(where, &dest_st) < 0) return log_error_errno(errno, "Failed to stat %s: %m", where); + dest_uid = dest_st.st_uid; + if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot bind mount directory %s on file %s.", @@ -815,6 +842,8 @@ static int mount_bind(const char *dest, CustomMount *m, uid_t uid_shift, uid_t u if (chown(where, uid_shift, uid_shift) < 0) return log_error_errno(errno, "Failed to chown %s: %m", where); + + dest_uid = uid_shift; } r = mount_nofollow_verbose(LOG_ERR, m->source, where, NULL, mount_flags, mount_opts); @@ -828,7 +857,7 @@ static int mount_bind(const char *dest, CustomMount *m, uid_t uid_shift, uid_t u } if (idmapping != REMOUNT_IDMAPPING_NONE) { - r = remount_idmap(STRV_MAKE(where), uid_shift, uid_range, source_st.st_uid, idmapping); + r = remount_idmap(STRV_MAKE(where), uid_shift, uid_range, source_st.st_uid, dest_uid, idmapping); if (r < 0) return log_error_errno(r, "Failed to map ids for bind mount %s: %m", where); } @@ -1388,17 +1417,30 @@ int wipe_fully_visible_fs(int mntns_fd) { _cleanup_close_ int orig_mntns_fd = -EBADF; int r, rr; - r = namespace_open(0, NULL, &orig_mntns_fd, NULL, NULL, NULL); + r = namespace_open(0, + /* ret_pidns_fd = */ NULL, + &orig_mntns_fd, + /* ret_netns_fd = */ NULL, + /* ret_userns_fd = */ NULL, + /* ret_root_fd = */ NULL); if (r < 0) return log_error_errno(r, "Failed to pin originating mount namespace: %m"); - r = namespace_enter(-EBADF, mntns_fd, -EBADF, -EBADF, -EBADF); + r = namespace_enter(/* pidns_fd = */ -EBADF, + mntns_fd, + /* netns_fd = */ -EBADF, + /* userns_fd = */ -EBADF, + /* root_fd = */ -EBADF); if (r < 0) return log_error_errno(r, "Failed to enter mount namespace: %m"); rr = do_wipe_fully_visible_fs(); - r = namespace_enter(-EBADF, orig_mntns_fd, -EBADF, -EBADF, -EBADF); + r = namespace_enter(/* pidns_fd = */ -EBADF, + orig_mntns_fd, + /* netns_fd = */ -EBADF, + /* userns_fd = */ -EBADF, + /* root_fd = */ -EBADF); if (r < 0) return log_error_errno(r, "Failed to enter original mount namespace: %m"); diff --git a/src/nspawn/nspawn-mount.h b/src/nspawn/nspawn-mount.h index bf5e47d..54dafa7 100644 --- a/src/nspawn/nspawn-mount.h +++ b/src/nspawn/nspawn-mount.h @@ -20,6 +20,7 @@ typedef enum MountSettingsMask { MOUNT_TOUCH = 1 << 9, /* if set, touch file to mount over first */ MOUNT_PREFIX_ROOT = 1 << 10,/* if set, prefix the source path with the container's root directory */ MOUNT_FOLLOW_SYMLINKS = 1 << 11,/* if set, we'll follow symlinks for the mount target */ + MOUNT_PRIVILEGED = 1 << 12,/* if set, we'll only mount this in the outer child if we are running in privileged mode */ } MountSettingsMask; typedef enum CustomMountType { diff --git a/src/nspawn/nspawn-network.c b/src/nspawn/nspawn-network.c index c661f1d..ec5d396 100644 --- a/src/nspawn/nspawn-network.c +++ b/src/nspawn/nspawn-network.c @@ -1,23 +1,34 @@ /* SPDX-License-Identifier: LGPL-2.1-or-later */ +/* Make sure the net/if.h header is included before any linux/ one */ #include <net/if.h> #include <linux/if.h> +#include <linux/nl80211.h> #include <linux/veth.h> #include <sys/file.h> +#include <sys/mount.h> #include "sd-device.h" #include "sd-id128.h" #include "sd-netlink.h" #include "alloc-util.h" +#include "device-private.h" +#include "device-util.h" #include "ether-addr-util.h" +#include "fd-util.h" #include "hexdecoct.h" #include "lock-util.h" #include "missing_network.h" +#include "mkdir.h" +#include "mount-util.h" +#include "namespace-util.h" #include "netif-naming-scheme.h" +#include "netif-util.h" #include "netlink-util.h" #include "nspawn-network.h" #include "parse-util.h" +#include "process-util.h" #include "siphash24.h" #include "socket-netlink.h" #include "socket-util.h" @@ -31,7 +42,6 @@ #define VETH_EXTRA_HOST_HASH_KEY SD_ID128_MAKE(48,c7,f6,b7,ea,9d,4c,9e,b7,28,d4,de,91,d5,bf,66) #define VETH_EXTRA_CONTAINER_HASH_KEY SD_ID128_MAKE(af,50,17,61,ce,f9,4d,35,84,0d,2b,20,54,be,ce,59) #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f) -#define SHORTEN_IFNAME_HASH_KEY SD_ID128_MAKE(e1,90,a4,04,a8,ef,4b,51,8c,cc,c3,3a,9f,11,fc,a2) static int remove_one_link(sd_netlink *rtnl, const char *name) { _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; @@ -57,51 +67,6 @@ static int remove_one_link(sd_netlink *rtnl, const char *name) { return 1; } -static int generate_mac( - const char *machine_name, - struct ether_addr *mac, - sd_id128_t hash_key, - uint64_t idx) { - - uint64_t result; - size_t l, sz; - uint8_t *v, *i; - int r; - - l = strlen(machine_name); - sz = sizeof(sd_id128_t) + l; - if (idx > 0) - sz += sizeof(idx); - - v = newa(uint8_t, sz); - - /* fetch some persistent data unique to the host */ - r = sd_id128_get_machine((sd_id128_t*) v); - if (r < 0) - return r; - - /* combine with some data unique (on this host) to this - * container instance */ - i = mempcpy(v + sizeof(sd_id128_t), machine_name, l); - if (idx > 0) { - idx = htole64(idx); - memcpy(i, &idx, sizeof(idx)); - } - - /* Let's hash the host machine ID plus the container name. We - * use a fixed, but originally randomly created hash key here. */ - result = htole64(siphash24(v, sz, hash_key.bytes)); - - assert_cc(ETH_ALEN <= sizeof(result)); - memcpy(mac->ether_addr_octet, &result, ETH_ALEN); - - /* see eth_random_addr in the kernel */ - mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */ - mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */ - - return 0; -} - static int set_alternative_ifname(sd_netlink *rtnl, const char *ifname, const char *altifname) { int r; @@ -200,39 +165,6 @@ static int add_veth( return 0; } -static int shorten_ifname(char *ifname) { - char new_ifname[IFNAMSIZ]; - - assert(ifname); - - if (strlen(ifname) < IFNAMSIZ) /* Name is short enough */ - return 0; - - if (naming_scheme_has(NAMING_NSPAWN_LONG_HASH)) { - uint64_t h; - - /* Calculate 64-bit hash value */ - h = siphash24(ifname, strlen(ifname), SHORTEN_IFNAME_HASH_KEY.bytes); - - /* Set the final four bytes (i.e. 32-bit) to the lower 24bit of the hash, encoded in url-safe base64 */ - memcpy(new_ifname, ifname, IFNAMSIZ - 5); - new_ifname[IFNAMSIZ - 5] = urlsafe_base64char(h >> 18); - new_ifname[IFNAMSIZ - 4] = urlsafe_base64char(h >> 12); - new_ifname[IFNAMSIZ - 3] = urlsafe_base64char(h >> 6); - new_ifname[IFNAMSIZ - 2] = urlsafe_base64char(h); - } else - /* On old nspawn versions we just truncated the name, provide compatibility */ - memcpy(new_ifname, ifname, IFNAMSIZ-1); - - new_ifname[IFNAMSIZ - 1] = 0; - - /* Log the incident to make it more discoverable */ - log_warning("Network interface name '%s' has been changed to '%s' to fit length constraints.", ifname, new_ifname); - - strcpy(ifname, new_ifname); - return 1; -} - int setup_veth(const char *machine_name, pid_t pid, char iface_name[IFNAMSIZ], @@ -252,18 +184,18 @@ int setup_veth(const char *machine_name, /* Use two different interface name prefixes depending whether * we are in bridge mode or not. */ n = strjoina(bridge ? "vb-" : "ve-", machine_name); - r = shorten_ifname(n); + r = net_shorten_ifname(n, /* check_naming_scheme= */ true); if (r > 0) a = strjoina(bridge ? "vb-" : "ve-", machine_name); if (ether_addr_is_null(provided_mac)){ - r = generate_mac(machine_name, &mac_container, CONTAINER_HASH_KEY, 0); + r = net_generate_mac(machine_name, &mac_container, CONTAINER_HASH_KEY, 0); if (r < 0) return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m"); } else mac_container = *provided_mac; - r = generate_mac(machine_name, &mac_host, HOST_HASH_KEY, 0); + r = net_generate_mac(machine_name, &mac_host, HOST_HASH_KEY, 0); if (r < 0) return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m"); @@ -306,11 +238,11 @@ int setup_veth_extra( STRV_FOREACH_PAIR(a, b, pairs) { struct ether_addr mac_host, mac_container; - r = generate_mac(machine_name, &mac_container, VETH_EXTRA_CONTAINER_HASH_KEY, idx); + r = net_generate_mac(machine_name, &mac_container, VETH_EXTRA_CONTAINER_HASH_KEY, idx); if (r < 0) return log_error_errno(r, "Failed to generate predictable MAC address for container side of extra veth link: %m"); - r = generate_mac(machine_name, &mac_host, VETH_EXTRA_HOST_HASH_KEY, idx); + r = net_generate_mac(machine_name, &mac_host, VETH_EXTRA_HOST_HASH_KEY, idx); if (r < 0) return log_error_errno(r, "Failed to generate predictable MAC address for host side of extra veth link: %m"); @@ -480,7 +412,7 @@ static int test_network_interface_initialized(const char *name) { if (r < 0) return log_error_errno(r, "Failed to get device %s: %m", name); - r = sd_device_get_is_initialized(d); + r = device_is_processed(d); if (r < 0) return log_error_errno(r, "Failed to determine whether interface %s is initialized: %m", name); if (r == 0) @@ -505,42 +437,302 @@ int test_network_interfaces_initialized(char **iface_pairs) { return 0; } -int move_network_interfaces(int netns_fd, char **iface_pairs) { +int resolve_network_interface_names(char **iface_pairs) { _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; int r; - if (strv_isempty(iface_pairs)) - return 0; + /* Due to a bug in kernel fixed by 8e15aee621618a3ee3abecaf1fd8c1428098b7ef (v6.6, backported to + * 6.1.60 and 6.5.9), an interface with alternative names cannot be resolved by the alternative name + * if the interface is moved to another network namespace. Hence, we need to adjust the provided + * names before moving interfaces to container namespace. */ - r = sd_netlink_open(&rtnl); + STRV_FOREACH_PAIR(from, to, iface_pairs) { + _cleanup_free_ char *name = NULL; + _cleanup_strv_free_ char **altnames = NULL; + + r = rtnl_resolve_ifname_full(&rtnl, _RESOLVE_IFNAME_ALL, *from, &name, &altnames); + if (r < 0) + return r; + + /* Always use the resolved name for 'from'. */ + free_and_replace(*from, name); + + /* If the name 'to' is assigned as an alternative name, we cannot rename the interface. + * Hence, use the assigned interface name (including the alternative names) as is, and + * use the resolved name for 'to'. */ + if (strv_contains(altnames, *to)) { + r = free_and_strdup_warn(to, *from); + if (r < 0) + return r; + } + } + return 0; +} + +static int netns_child_begin(int netns_fd, int *ret_original_netns_fd) { + _cleanup_close_ int original_netns_fd = -EBADF; + int r; + + assert(netns_fd >= 0); + + if (ret_original_netns_fd) { + r = namespace_open(0, + /* ret_pidns_fd = */ NULL, + /* ret_mntns_fd = */ NULL, + &original_netns_fd, + /* ret_userns_fd = */ NULL, + /* ret_root_fd = */ NULL); + if (r < 0) + return log_error_errno(r, "Failed to open original network namespace: %m"); + } + + r = namespace_enter(/* pidns_fd = */ -EBADF, + /* mntns_fd = */ -EBADF, + netns_fd, + /* userns_fd = */ -EBADF, + /* root_fd = */ -EBADF); if (r < 0) - return log_error_errno(r, "Failed to connect to netlink: %m"); + return log_error_errno(r, "Failed to enter child network namespace: %m"); - STRV_FOREACH_PAIR(i, b, iface_pairs) { - _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; - int ifi; + r = umount_recursive("/sys/", /* flags = */ 0); + if (r < 0) + log_debug_errno(r, "Failed to unmount directories below /sys/, ignoring: %m"); - ifi = rtnl_resolve_interface_or_warn(&rtnl, *i); - if (ifi < 0) - return ifi; + (void) mkdir_p("/sys/", 0755); + + /* Populate new sysfs instance associated with the client netns, to make sd_device usable. */ + r = mount_nofollow_verbose(LOG_ERR, "sysfs", "/sys/", "sysfs", + MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, /* opts = */ NULL); + if (r < 0) + return log_error_errno(r, "Failed to mount sysfs on /sys/: %m"); + + /* udev_avaliable() might be called previously and the result may be cached. + * Now, we (re-)mount sysfs. Hence, we need to reset the cache. */ + reset_cached_udev_availability(); + + if (ret_original_netns_fd) + *ret_original_netns_fd = TAKE_FD(original_netns_fd); + + return 0; +} + +static int netns_fork_and_wait(int netns_fd, int *ret_original_netns_fd) { + int r; + + assert(netns_fd >= 0); + + r = safe_fork("(sd-netns)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_WAIT|FORK_LOG|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE, NULL); + if (r < 0) + return log_error_errno(r, "Failed to fork process (sd-netns): %m"); + if (r == 0) { + if (netns_child_begin(netns_fd, ret_original_netns_fd) < 0) + _exit(EXIT_FAILURE); + + return 0; + } + + if (ret_original_netns_fd) + *ret_original_netns_fd = -EBADF; - r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi); + return 1; +} + +static int move_wlan_interface_impl(sd_netlink **genl, int netns_fd, sd_device *dev) { + _cleanup_(sd_netlink_unrefp) sd_netlink *our_genl = NULL; + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(netns_fd >= 0); + assert(dev); + + if (!genl) + genl = &our_genl; + if (!*genl) { + r = sd_genl_socket_open(genl); if (r < 0) - return log_error_errno(r, "Failed to allocate netlink message: %m"); + return log_error_errno(r, "Failed to connect to generic netlink: %m"); + } + + r = sd_genl_message_new(*genl, NL80211_GENL_NAME, NL80211_CMD_SET_WIPHY_NETNS, &m); + if (r < 0) + return log_device_error_errno(dev, r, "Failed to allocate netlink message: %m"); + + uint32_t phy_index; + r = device_get_sysattr_u32(dev, "phy80211/index", &phy_index); + if (r < 0) + return log_device_error_errno(dev, r, "Failed to get phy index: %m"); + + r = sd_netlink_message_append_u32(m, NL80211_ATTR_WIPHY, phy_index); + if (r < 0) + return log_device_error_errno(dev, r, "Failed to append phy index to netlink message: %m"); + + r = sd_netlink_message_append_u32(m, NL80211_ATTR_NETNS_FD, netns_fd); + if (r < 0) + return log_device_error_errno(dev, r, "Failed to append namespace fd to netlink message: %m"); + + r = sd_netlink_call(*genl, m, 0, NULL); + if (r < 0) + return log_device_error_errno(dev, r, "Failed to move interface to namespace: %m"); + + return 0; +} + +static int move_wlan_interface_one( + sd_netlink **rtnl, + sd_netlink **genl, + int *temp_netns_fd, + int netns_fd, + sd_device *dev, + const char *name) { + + int r; + + assert(rtnl); + assert(genl); + assert(temp_netns_fd); + assert(netns_fd >= 0); + assert(dev); - r = sd_netlink_message_append_u32(m, IFLA_NET_NS_FD, netns_fd); + if (!name) + return move_wlan_interface_impl(genl, netns_fd, dev); + + /* The command NL80211_CMD_SET_WIPHY_NETNS takes phy instead of network interface, and does not take + * an interface name in the passed network namespace. Hence, we need to move the phy and interface to + * a temporary network namespace, rename the interface in it, and move them to the requested netns. */ + + if (*temp_netns_fd < 0) { + r = netns_acquire(); if (r < 0) - return log_error_errno(r, "Failed to append namespace fd to netlink message: %m"); + return log_error_errno(r, "Failed to acquire new network namespace: %m"); + *temp_netns_fd = r; + } - if (!streq(*b, *i)) { - r = sd_netlink_message_append_string(m, IFLA_IFNAME, *b); - if (r < 0) - return log_error_errno(r, "Failed to add netlink interface name: %m"); + r = move_wlan_interface_impl(genl, *temp_netns_fd, dev); + if (r < 0) + return r; + + const char *sysname; + r = sd_device_get_sysname(dev, &sysname); + if (r < 0) + return log_device_error_errno(dev, r, "Failed to get interface name: %m"); + + r = netns_fork_and_wait(*temp_netns_fd, NULL); + if (r < 0) + return log_error_errno(r, "Failed to fork process (nspawn-rename-wlan): %m"); + if (r == 0) { + _cleanup_(sd_device_unrefp) sd_device *temp_dev = NULL; + + r = rtnl_rename_link(NULL, sysname, name); + if (r < 0) { + log_error_errno(r, "Failed to rename network interface '%s' to '%s': %m", sysname, name); + goto finalize; } - r = sd_netlink_call(rtnl, m, 0, NULL); + r = sd_device_new_from_ifname(&temp_dev, name); + if (r < 0) { + log_error_errno(r, "Failed to acquire device '%s': %m", name); + goto finalize; + } + + r = move_wlan_interface_impl(NULL, netns_fd, temp_dev); + + finalize: + _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS); + } + + return 0; +} + +static int move_network_interface_one(sd_netlink **rtnl, int netns_fd, sd_device *dev, const char *name) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + assert(rtnl); + assert(netns_fd >= 0); + assert(dev); + + if (!*rtnl) { + r = sd_netlink_open(rtnl); if (r < 0) - return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i); + return log_error_errno(r, "Failed to connect to rtnetlink: %m"); + } + + int ifindex; + r = sd_device_get_ifindex(dev, &ifindex); + if (r < 0) + return log_device_error_errno(dev, r, "Failed to get ifindex: %m"); + + r = sd_rtnl_message_new_link(*rtnl, &m, RTM_SETLINK, ifindex); + if (r < 0) + return log_device_error_errno(dev, r, "Failed to allocate netlink message: %m"); + + r = sd_netlink_message_append_u32(m, IFLA_NET_NS_FD, netns_fd); + if (r < 0) + return log_device_error_errno(dev, r, "Failed to append namespace fd to netlink message: %m"); + + if (name) { + r = sd_netlink_message_append_string(m, IFLA_IFNAME, name); + if (r < 0) + return log_device_error_errno(dev, r, "Failed to add netlink interface name: %m"); + } + + r = sd_netlink_call(*rtnl, m, 0, NULL); + if (r < 0) + return log_device_error_errno(dev, r, "Failed to move interface to namespace: %m"); + + return 0; +} + +int move_network_interfaces(int netns_fd, char **iface_pairs) { + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL, *genl = NULL; + _cleanup_close_ int temp_netns_fd = -EBADF; + int r; + + assert(netns_fd >= 0); + + if (strv_isempty(iface_pairs)) + return 0; + + STRV_FOREACH_PAIR(from, to, iface_pairs) { + _cleanup_(sd_device_unrefp) sd_device *dev = NULL; + const char *name; + + name = streq(*from, *to) ? NULL : *to; + + r = sd_device_new_from_ifname(&dev, *from); + if (r < 0) + return log_error_errno(r, "Unknown interface name %s: %m", *from); + + if (device_is_devtype(dev, "wlan")) + r = move_wlan_interface_one(&rtnl, &genl, &temp_netns_fd, netns_fd, dev, name); + else + r = move_network_interface_one(&rtnl, netns_fd, dev, name); + if (r < 0) + return r; + } + + return 0; +} + +int move_back_network_interfaces(int child_netns_fd, char **interface_pairs) { + _cleanup_close_ int parent_netns_fd = -EBADF; + int r; + + assert(child_netns_fd >= 0); + + if (strv_isempty(interface_pairs)) + return 0; + + r = netns_fork_and_wait(child_netns_fd, &parent_netns_fd); + if (r < 0) + return r; + if (r == 0) { + /* Reverse network interfaces pair list so that interfaces get their initial name back. + * This is about ensuring interfaces get their old name back when being moved back. */ + interface_pairs = strv_reverse(interface_pairs); + + r = move_network_interfaces(parent_netns_fd, interface_pairs); + _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS); } return 0; @@ -568,7 +760,7 @@ int setup_macvlan(const char *machine_name, pid_t pid, char **iface_pairs) { if (ifi < 0) return ifi; - r = generate_mac(machine_name, &mac, MACVLAN_HASH_KEY, idx++); + r = net_generate_mac(machine_name, &mac, MACVLAN_HASH_KEY, idx++); if (r < 0) return log_error_errno(r, "Failed to create MACVLAN MAC address: %m"); @@ -584,7 +776,7 @@ int setup_macvlan(const char *machine_name, pid_t pid, char **iface_pairs) { if (!n) return log_oom(); - shortened = shorten_ifname(n); + shortened = net_shorten_ifname(n, /* check_naming_scheme= */ true); r = sd_netlink_message_append_string(m, IFLA_IFNAME, n); if (r < 0) @@ -661,7 +853,7 @@ int setup_ipvlan(const char *machine_name, pid_t pid, char **iface_pairs) { if (!n) return log_oom(); - shortened = shorten_ifname(n); + shortened = net_shorten_ifname(n, /* check_naming_scheme= */ true); r = sd_netlink_message_append_string(m, IFLA_IFNAME, n); if (r < 0) diff --git a/src/nspawn/nspawn-network.h b/src/nspawn/nspawn-network.h index a785f8e..840fe15 100644 --- a/src/nspawn/nspawn-network.h +++ b/src/nspawn/nspawn-network.h @@ -8,6 +8,7 @@ #include "ether-addr-util.h" int test_network_interfaces_initialized(char **iface_pairs); +int resolve_network_interface_names(char **iface_pairs); int setup_veth(const char *machine_name, pid_t pid, char iface_name[IFNAMSIZ], bool bridge, const struct ether_addr *provided_mac); int setup_veth_extra(const char *machine_name, pid_t pid, char **pairs); @@ -19,6 +20,7 @@ int setup_macvlan(const char *machine_name, pid_t pid, char **iface_pairs); int setup_ipvlan(const char *machine_name, pid_t pid, char **iface_pairs); int move_network_interfaces(int netns_fd, char **iface_pairs); +int move_back_network_interfaces(int child_netns_fd, char **interface_pairs); int veth_extra_parse(char ***l, const char *p); diff --git a/src/nspawn/nspawn-oci.c b/src/nspawn/nspawn-oci.c index 8f1ac7c..a00934c 100644 --- a/src/nspawn/nspawn-oci.c +++ b/src/nspawn/nspawn-oci.c @@ -409,18 +409,18 @@ static int oci_user(const char *name, JsonVariant *v, JsonDispatchFlags flags, v static int oci_process(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { static const JsonDispatch table[] = { - { "terminal", JSON_VARIANT_BOOLEAN, oci_terminal, 0, 0 }, - { "consoleSize", JSON_VARIANT_OBJECT, oci_console_size, 0, 0 }, - { "cwd", JSON_VARIANT_STRING, oci_absolute_path, offsetof(Settings, working_directory), 0 }, - { "env", JSON_VARIANT_ARRAY, oci_env, offsetof(Settings, environment), 0 }, - { "args", JSON_VARIANT_ARRAY, oci_args, offsetof(Settings, parameters), 0 }, - { "rlimits", JSON_VARIANT_ARRAY, oci_rlimits, 0, 0 }, - { "apparmorProfile", JSON_VARIANT_STRING, oci_unsupported, 0, JSON_PERMISSIVE }, - { "capabilities", JSON_VARIANT_OBJECT, oci_capabilities, 0, 0 }, - { "noNewPrivileges", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(Settings, no_new_privileges), 0 }, - { "oomScoreAdj", JSON_VARIANT_INTEGER, oci_oom_score_adj, 0, 0 }, - { "selinuxLabel", JSON_VARIANT_STRING, oci_unsupported, 0, JSON_PERMISSIVE }, - { "user", JSON_VARIANT_OBJECT, oci_user, 0, 0 }, + { "terminal", JSON_VARIANT_BOOLEAN, oci_terminal, 0, 0 }, + { "consoleSize", JSON_VARIANT_OBJECT, oci_console_size, 0, 0 }, + { "cwd", JSON_VARIANT_STRING, oci_absolute_path, offsetof(Settings, working_directory), 0 }, + { "env", JSON_VARIANT_ARRAY, oci_env, offsetof(Settings, environment), 0 }, + { "args", JSON_VARIANT_ARRAY, oci_args, offsetof(Settings, parameters), 0 }, + { "rlimits", JSON_VARIANT_ARRAY, oci_rlimits, 0, 0 }, + { "apparmorProfile", JSON_VARIANT_STRING, oci_unsupported, 0, JSON_PERMISSIVE }, + { "capabilities", JSON_VARIANT_OBJECT, oci_capabilities, 0, 0 }, + { "noNewPrivileges", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(Settings, no_new_privileges), 0 }, + { "oomScoreAdj", JSON_VARIANT_INTEGER, oci_oom_score_adj, 0, 0 }, + { "selinuxLabel", JSON_VARIANT_STRING, oci_unsupported, 0, JSON_PERMISSIVE }, + { "user", JSON_VARIANT_OBJECT, oci_user, 0, 0 }, {} }; @@ -432,8 +432,8 @@ static int oci_root(const char *name, JsonVariant *v, JsonDispatchFlags flags, v int r; static const JsonDispatch table[] = { - { "path", JSON_VARIANT_STRING, json_dispatch_string, offsetof(Settings, root) }, - { "readonly", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(Settings, read_only) }, + { "path", JSON_VARIANT_STRING, json_dispatch_string, offsetof(Settings, root) }, + { "readonly", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(Settings, read_only) }, {} }; @@ -863,7 +863,7 @@ static int oci_devices(const char *name, JsonVariant *v, JsonDispatchFlags flags if (node->major == UINT_MAX || node->minor == UINT_MAX) { r = json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), - "Major/minor required when device node is device node"); + "Major/minor required when device node is device node."); goto fail_element; } @@ -1148,7 +1148,7 @@ static int oci_cgroup_memory_limit(const char *name, JsonVariant *v, JsonDispatc if (!json_variant_is_unsigned(v)) return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), - "Memory limit is not an unsigned integer"); + "Memory limit is not an unsigned integer."); k = json_variant_unsigned(v); if (k >= UINT64_MAX) @@ -1588,7 +1588,7 @@ static int oci_sysctl(const char *name, JsonVariant *v, JsonDispatchFlags flags, return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), "sysctl key invalid, refusing: %s", k); - r = strv_extend_strv(&s->sysctl, STRV_MAKE(k, m), false); + r = strv_extend_many(&s->sysctl, k, m); if (r < 0) return log_oom(); } @@ -1716,7 +1716,7 @@ static int oci_seccomp_archs(const char *name, JsonVariant *v, JsonDispatchFlags if (!json_variant_is_string(e)) return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), - "Architecture entry is not a string"); + "Architecture entry is not a string."); r = oci_seccomp_arch_from_string(json_variant_string(e), &a); if (r < 0) @@ -1837,10 +1837,8 @@ static int oci_seccomp_syscalls(const char *name, JsonVariant *v, JsonDispatchFl if (r < 0) return r; - if (strv_isempty(rule.names)) { - json_log(e, flags, 0, "System call name list is empty."); - return -EINVAL; - } + if (strv_isempty(rule.names)) + return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "System call name list is empty."); STRV_FOREACH(i, rule.names) { int nr; @@ -2082,7 +2080,7 @@ static int oci_hooks_array(const char *name, JsonVariant *v, JsonDispatchFlags f return r; } - (*n_array) ++; + (*n_array)++; } return 0; diff --git a/src/nspawn/nspawn-register.c b/src/nspawn/nspawn-register.c index 66962d7..b63516d 100644 --- a/src/nspawn/nspawn-register.c +++ b/src/nspawn/nspawn-register.c @@ -297,15 +297,12 @@ int allocate_scope( description = strjoina("Container ", machine_name); - if (allow_pidfd) { - _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; - r = pidref_set_pid(&pidref, pid); - if (r < 0) - return log_error_errno(r, "Failed to allocate PID reference: %m"); + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + r = pidref_set_pid(&pidref, pid); + if (r < 0) + return log_error_errno(r, "Failed to allocate PID reference: %m"); - r = bus_append_scope_pidref(m, &pidref); - } else - r = sd_bus_message_append(m, "(sv)", "PIDs", "au", 1, pid); + r = bus_append_scope_pidref(m, &pidref, allow_pidfd); if (r < 0) return bus_log_create_error(r); @@ -368,7 +365,11 @@ int allocate_scope( if (r < 0) return bus_log_parse_error(r); - r = bus_wait_for_jobs_one(w, object, false, NULL); + r = bus_wait_for_jobs_one( + w, + object, + BUS_WAIT_JOBS_LOG_ERROR, + /* extra_args= */ NULL); if (r < 0) return r; diff --git a/src/nspawn/nspawn-settings.c b/src/nspawn/nspawn-settings.c index 161b1c1..132a543 100644 --- a/src/nspawn/nspawn-settings.c +++ b/src/nspawn/nspawn-settings.c @@ -835,21 +835,21 @@ int config_parse_cpu_affinity( DEFINE_CONFIG_PARSE_ENUM(config_parse_resolv_conf, resolv_conf_mode, ResolvConfMode, "Failed to parse resolv.conf mode"); static const char *const resolv_conf_mode_table[_RESOLV_CONF_MODE_MAX] = { - [RESOLV_CONF_OFF] = "off", - [RESOLV_CONF_COPY_HOST] = "copy-host", - [RESOLV_CONF_COPY_STATIC] = "copy-static", - [RESOLV_CONF_COPY_UPLINK] = "copy-uplink", - [RESOLV_CONF_COPY_STUB] = "copy-stub", - [RESOLV_CONF_REPLACE_HOST] = "replace-host", + [RESOLV_CONF_OFF] = "off", + [RESOLV_CONF_COPY_HOST] = "copy-host", + [RESOLV_CONF_COPY_STATIC] = "copy-static", + [RESOLV_CONF_COPY_UPLINK] = "copy-uplink", + [RESOLV_CONF_COPY_STUB] = "copy-stub", + [RESOLV_CONF_REPLACE_HOST] = "replace-host", [RESOLV_CONF_REPLACE_STATIC] = "replace-static", [RESOLV_CONF_REPLACE_UPLINK] = "replace-uplink", - [RESOLV_CONF_REPLACE_STUB] = "replace-stub", - [RESOLV_CONF_BIND_HOST] = "bind-host", - [RESOLV_CONF_BIND_STATIC] = "bind-static", - [RESOLV_CONF_BIND_UPLINK] = "bind-uplink", - [RESOLV_CONF_BIND_STUB] = "bind-stub", - [RESOLV_CONF_DELETE] = "delete", - [RESOLV_CONF_AUTO] = "auto", + [RESOLV_CONF_REPLACE_STUB] = "replace-stub", + [RESOLV_CONF_BIND_HOST] = "bind-host", + [RESOLV_CONF_BIND_STATIC] = "bind-static", + [RESOLV_CONF_BIND_UPLINK] = "bind-uplink", + [RESOLV_CONF_BIND_STUB] = "bind-stub", + [RESOLV_CONF_DELETE] = "delete", + [RESOLV_CONF_AUTO] = "auto", }; DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(resolv_conf_mode, ResolvConfMode, RESOLV_CONF_AUTO); @@ -914,15 +914,15 @@ int config_parse_link_journal( return 0; } -DEFINE_CONFIG_PARSE_ENUM(config_parse_timezone, timezone_mode, TimezoneMode, "Failed to parse timezone mode"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_timezone_mode, timezone_mode, TimezoneMode, "Failed to parse timezone mode"); static const char *const timezone_mode_table[_TIMEZONE_MODE_MAX] = { - [TIMEZONE_OFF] = "off", - [TIMEZONE_COPY] = "copy", - [TIMEZONE_BIND] = "bind", + [TIMEZONE_OFF] = "off", + [TIMEZONE_COPY] = "copy", + [TIMEZONE_BIND] = "bind", [TIMEZONE_SYMLINK] = "symlink", - [TIMEZONE_DELETE] = "delete", - [TIMEZONE_AUTO] = "auto", + [TIMEZONE_DELETE] = "delete", + [TIMEZONE_AUTO] = "auto", }; DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(timezone_mode, TimezoneMode, TIMEZONE_AUTO); @@ -930,10 +930,10 @@ DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(timezone_mode, TimezoneMode, TIMEZONE_AU DEFINE_CONFIG_PARSE_ENUM(config_parse_userns_ownership, user_namespace_ownership, UserNamespaceOwnership, "Failed to parse user namespace ownership mode"); static const char *const user_namespace_ownership_table[_USER_NAMESPACE_OWNERSHIP_MAX] = { - [USER_NAMESPACE_OWNERSHIP_OFF] = "off", + [USER_NAMESPACE_OWNERSHIP_OFF] = "off", [USER_NAMESPACE_OWNERSHIP_CHOWN] = "chown", - [USER_NAMESPACE_OWNERSHIP_MAP] = "map", - [USER_NAMESPACE_OWNERSHIP_AUTO] = "auto", + [USER_NAMESPACE_OWNERSHIP_MAP] = "map", + [USER_NAMESPACE_OWNERSHIP_AUTO] = "auto", }; DEFINE_STRING_TABLE_LOOKUP(user_namespace_ownership, UserNamespaceOwnership); diff --git a/src/nspawn/nspawn-settings.h b/src/nspawn/nspawn-settings.h index 8edf8a3..0bcb285 100644 --- a/src/nspawn/nspawn-settings.h +++ b/src/nspawn/nspawn-settings.h @@ -268,7 +268,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_oom_score_adjust); CONFIG_PARSER_PROTOTYPE(config_parse_cpu_affinity); CONFIG_PARSER_PROTOTYPE(config_parse_resolv_conf); CONFIG_PARSER_PROTOTYPE(config_parse_link_journal); -CONFIG_PARSER_PROTOTYPE(config_parse_timezone); +CONFIG_PARSER_PROTOTYPE(config_parse_timezone_mode); CONFIG_PARSER_PROTOTYPE(config_parse_userns_chown); CONFIG_PARSER_PROTOTYPE(config_parse_userns_ownership); CONFIG_PARSER_PROTOTYPE(config_parse_bind_user); diff --git a/src/nspawn/nspawn-setuid.c b/src/nspawn/nspawn-setuid.c index 2d67c3d..e350b22 100644 --- a/src/nspawn/nspawn-setuid.c +++ b/src/nspawn/nspawn-setuid.c @@ -56,6 +56,8 @@ int change_uid_gid_raw( size_t n_supplementary_gids, bool chown_stdio) { + int r; + if (!uid_is_valid(uid)) uid = 0; if (!gid_is_valid(gid)) @@ -67,14 +69,9 @@ int change_uid_gid_raw( (void) fchown(STDERR_FILENO, uid, gid); } - if (setgroups(n_supplementary_gids, supplementary_gids) < 0) - return log_error_errno(errno, "Failed to set auxiliary groups: %m"); - - if (setresgid(gid, gid, gid) < 0) - return log_error_errno(errno, "setresgid() failed: %m"); - - if (setresuid(uid, uid, uid) < 0) - return log_error_errno(errno, "setresuid() failed: %m"); + r = fully_set_uid_gid(uid, gid, supplementary_gids, n_supplementary_gids); + if (r < 0) + return log_error_errno(r, "Changing privileges failed: %m"); return 0; } diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index e46cc1c..5842d3b 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -1,10 +1,7 @@ /* SPDX-License-Identifier: LGPL-2.1-or-later */ -#if HAVE_BLKID -#endif #include <errno.h> #include <getopt.h> -#include <linux/fs.h> #include <linux/loop.h> #if HAVE_SELINUX #include <selinux/selinux.h> @@ -12,6 +9,7 @@ #include <stdlib.h> #include <sys/file.h> #include <sys/ioctl.h> +#include <sys/mount.h> #include <sys/personality.h> #include <sys/prctl.h> #include <sys/types.h> @@ -19,6 +17,8 @@ #include <termios.h> #include <unistd.h> +#include <linux/fs.h> /* Must be included after <sys/mount.h> */ + #include "sd-bus.h" #include "sd-daemon.h" #include "sd-id128.h" @@ -84,6 +84,7 @@ #include "nspawn-stub-pid1.h" #include "nspawn-util.h" #include "nspawn.h" +#include "nsresource.h" #include "nulstr-util.h" #include "os-util.h" #include "pager.h" @@ -112,6 +113,7 @@ #include "umask-util.h" #include "unit-name.h" #include "user-util.h" +#include "vpick.h" /* The notify socket inside the container it can use to talk to nspawn using the sd_notify(3) protocol */ #define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify" @@ -229,13 +231,14 @@ static DeviceNode* arg_extra_nodes = NULL; static size_t arg_n_extra_nodes = 0; static char **arg_sysctl = NULL; static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID; -static MachineCredential *arg_credentials = NULL; -static size_t arg_n_credentials = 0; +static MachineCredentialContext arg_credentials = {}; static char **arg_bind_user = NULL; static bool arg_suppress_sync = false; static char *arg_settings_filename = NULL; static Architecture arg_architecture = _ARCHITECTURE_INVALID; static ImagePolicy *arg_image_policy = NULL; +static char *arg_background = NULL; +static bool arg_privileged = false; STATIC_DESTRUCTOR_REGISTER(arg_directory, freep); STATIC_DESTRUCTOR_REGISTER(arg_template, freep); @@ -266,11 +269,13 @@ STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep); #if HAVE_SECCOMP STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep); #endif +STATIC_DESTRUCTOR_REGISTER(arg_credentials, machine_credential_context_done); STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset); STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep); STATIC_DESTRUCTOR_REGISTER(arg_bind_user, strv_freep); STATIC_DESTRUCTOR_REGISTER(arg_settings_filename, freep); STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep); +STATIC_DESTRUCTOR_REGISTER(arg_background, freep); static int handle_arg_console(const char *arg) { if (streq(arg, "help")) { @@ -289,7 +294,7 @@ static int handle_arg_console(const char *arg) { else if (streq(arg, "passive")) arg_console_mode = CONSOLE_PASSIVE; else if (streq(arg, "pipe")) { - if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0) + if (isatty(STDIN_FILENO) && isatty(STDOUT_FILENO)) log_full(arg_quiet ? LOG_DEBUG : LOG_NOTICE, "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. " "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. " @@ -297,7 +302,7 @@ static int handle_arg_console(const char *arg) { arg_console_mode = CONSOLE_PIPE; } else if (streq(arg, "autopipe")) { - if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0) + if (isatty(STDIN_FILENO) && isatty(STDOUT_FILENO)) arg_console_mode = CONSOLE_INTERACTIVE; else arg_console_mode = CONSOLE_PIPE; @@ -324,8 +329,8 @@ static int help(void) { " --version Print version string\n" " -q --quiet Do not show status information\n" " --no-pager Do not pipe output into a pager\n" - " --settings=BOOLEAN Load additional settings from .nspawn file\n\n" - "%3$sImage:%4$s\n" + " --settings=BOOLEAN Load additional settings from .nspawn file\n" + "\n%3$sImage:%4$s\n" " -D --directory=PATH Root directory for the container\n" " --template=PATH Initialize root directory from template directory,\n" " if missing\n" @@ -344,8 +349,8 @@ static int help(void) { " 'base64:'\n" " --verity-data=PATH Specify hash device for verity\n" " --pivot-root=PATH[:PATH]\n" - " Pivot root to given directory in the container\n\n" - "%3$sExecution:%4$s\n" + " Pivot root to given directory in the container\n" + "\n%3$sExecution:%4$s\n" " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n" " -b --boot Boot up full system (i.e. invoke init)\n" " --chdir=PATH Set working directory in the container\n" @@ -354,18 +359,18 @@ static int help(void) { " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n" " --notify-ready=BOOLEAN Receive notifications from the child init process\n" " --suppress-sync=BOOLEAN\n" - " Suppress any form of disk data synchronization\n\n" - "%3$sSystem Identity:%4$s\n" + " Suppress any form of disk data synchronization\n" + "\n%3$sSystem Identity:%4$s\n" " -M --machine=NAME Set the machine name for the container\n" " --hostname=NAME Override the hostname for the container\n" - " --uuid=UUID Set a specific machine UUID for the container\n\n" - "%3$sProperties:%4$s\n" + " --uuid=UUID Set a specific machine UUID for the container\n" + "\n%3$sProperties:%4$s\n" " -S --slice=SLICE Place the container in the specified slice\n" " --property=NAME=VALUE Set scope unit property\n" " --register=BOOLEAN Register container as machine\n" " --keep-unit Do not register a scope for the machine, reuse\n" - " the service unit nspawn is running in\n\n" - "%3$sUser Namespacing:%4$s\n" + " the service unit nspawn is running in\n" + "\n%3$sUser Namespacing:%4$s\n" " --private-users=no Run without user namespacing\n" " --private-users=yes|pick|identity\n" " Run within user namespace, autoselect UID/GID range\n" @@ -375,8 +380,8 @@ static int help(void) { " Adjust ('chown') or map ('map') OS tree ownership\n" " to private UID/GID range\n" " -U Equivalent to --private-users=pick and\n" - " --private-users-ownership=auto\n\n" - "%3$sNetworking:%4$s\n" + " --private-users-ownership=auto\n" + "\n%3$sNetworking:%4$s\n" " --private-network Disable network in container\n" " --network-interface=HOSTIF[:CONTAINERIF]\n" " Assign an existing network interface to the\n" @@ -401,8 +406,8 @@ static int help(void) { " Set network namespace to the one represented by\n" " the specified kernel namespace file node\n" " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n" - " Expose a container IP port on the host\n\n" - "%3$sSecurity:%4$s\n" + " Expose a container IP port on the host\n" + "\n%3$sSecurity:%4$s\n" " --capability=CAP In addition to the default, retain specified\n" " capability\n" " --drop-capability=CAP Drop the specified capability from the default set\n" @@ -417,20 +422,20 @@ static int help(void) { " processes in the container\n" " -L --selinux-apifs-context=SECLABEL\n" " Set the SELinux security context to be used by\n" - " API/tmpfs file systems in the container\n\n" - "%3$sResources:%4$s\n" + " API/tmpfs file systems in the container\n" + "\n%3$sResources:%4$s\n" " --rlimit=NAME=LIMIT Set a resource limit for the payload\n" " --oom-score-adjust=VALUE\n" " Adjust the OOM score value for the payload\n" " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n" - " --personality=ARCH Pick personality for this container\n\n" - "%3$sIntegration:%4$s\n" + " --personality=ARCH Pick personality for this container\n" + "\n%3$sIntegration:%4$s\n" " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n" " --timezone=MODE Select mode of /etc/localtime initialization\n" " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n" " host, try-guest, try-host\n" - " -j Equivalent to --link-journal=try-guest\n\n" - "%3$sMounts:%4$s\n" + " -j Equivalent to --link-journal=try-guest\n" + "\n%3$sMounts:%4$s\n" " --bind=PATH[:PATH[:OPTIONS]]\n" " Bind mount a file or directory from the host into\n" " the container\n" @@ -444,12 +449,13 @@ static int help(void) { " the container\n" " --overlay-ro=PATH[:PATH...]:PATH\n" " Similar, but creates a read-only overlay mount\n" - " --bind-user=NAME Bind user from host to container\n\n" - "%3$sInput/Output:%4$s\n" + " --bind-user=NAME Bind user from host to container\n" + "\n%3$sInput/Output:%4$s\n" " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n" " set up for the container.\n" - " -P --pipe Equivalent to --console=pipe\n\n" - "%3$sCredentials:%4$s\n" + " -P --pipe Equivalent to --console=pipe\n" + " --background=COLOR Set ANSI color for background\n" + "\n%3$sCredentials:%4$s\n" " --set-credential=ID:VALUE\n" " Pass a credential with literal value to container.\n" " --load-credential=ID:PATH\n" @@ -514,6 +520,12 @@ static int detect_unified_cgroup_hierarchy_from_environment(void) { static int detect_unified_cgroup_hierarchy_from_image(const char *directory) { int r; + if (!arg_privileged) { + /* We only support the unified mode when running unprivileged */ + arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL; + return 0; + } + /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd * in the image actually supports. */ r = cg_all_unified(); @@ -615,7 +627,6 @@ static int parse_mount_settings_env(void) { e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE"); if (streq_ptr(e, "network")) arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS; - else if (e) { r = parse_boolean(e); if (r < 0) @@ -744,6 +755,7 @@ static int parse_argv(int argc, char *argv[]) { ARG_BIND_USER, ARG_SUPPRESS_SYNC, ARG_IMAGE_POLICY, + ARG_BACKGROUND, }; static const struct option options[] = { @@ -818,6 +830,7 @@ static int parse_argv(int argc, char *argv[]) { { "bind-user", required_argument, NULL, ARG_BIND_USER }, { "suppress-sync", required_argument, NULL, ARG_SUPPRESS_SYNC }, { "image-policy", required_argument, NULL, ARG_IMAGE_POLICY }, + { "background", required_argument, NULL, ARG_BACKGROUND }, {} }; @@ -1249,33 +1262,11 @@ static int parse_argv(int argc, char *argv[]) { arg_uid_shift = 0; arg_uid_range = UINT32_C(0x10000); } else { - _cleanup_free_ char *buffer = NULL; - const char *range, *shift; - /* anything else: User namespacing on, UID range is explicitly configured */ - - range = strchr(optarg, ':'); - if (range) { - buffer = strndup(optarg, range - optarg); - if (!buffer) - return log_oom(); - shift = buffer; - - range++; - r = safe_atou32(range, &arg_uid_range); - if (r < 0) - return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range); - } else - shift = optarg; - - r = parse_uid(shift, &arg_uid_shift); + r = parse_userns_uid_range(optarg, &arg_uid_shift, &arg_uid_range); if (r < 0) - return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg); - + return r; arg_userns_mode = USER_NAMESPACE_FIXED; - - if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range)) - return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID range cannot be empty or go beyond " UID_FMT ".", UID_INVALID); } arg_settings_mask |= SETTING_USERNS; @@ -1362,17 +1353,27 @@ static int parse_argv(int argc, char *argv[]) { break; - case ARG_CHDIR: + case ARG_CHDIR: { + _cleanup_free_ char *wd = NULL; + if (!path_is_absolute(optarg)) return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Working directory %s is not an absolute path.", optarg); - r = free_and_strdup(&arg_chdir, optarg); + r = path_simplify_alloc(optarg, &wd); if (r < 0) - return log_oom(); + return log_error_errno(r, "Failed to simplify path %s: %m", optarg); + + if (!path_is_normalized(wd)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Working directory path is not normalized: %s", wd); + + if (path_below_api_vfs(wd)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Working directory is below API VFS, refusing: %s", wd); + free_and_replace(arg_chdir, wd); arg_settings_mask |= SETTING_WORKING_DIRECTORY; break; + } case ARG_PIVOT_ROOT: r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg); @@ -1395,7 +1396,7 @@ static int parse_argv(int argc, char *argv[]) { _cleanup_free_ void *k = NULL; size_t l; - r = unhexmem(optarg, strlen(optarg), &k, &l); + r = unhexmem(optarg, &k, &l); if (r < 0) return log_error_errno(r, "Failed to parse root hash: %s", optarg); if (l < sizeof(sd_id128_t)) @@ -1412,7 +1413,7 @@ static int parse_argv(int argc, char *argv[]) { void *p; if ((value = startswith(optarg, "base64:"))) { - r = unbase64mem(value, strlen(value), &p, &l); + r = unbase64mem(value, &p, &l); if (r < 0) return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg); @@ -1568,7 +1569,7 @@ static int parse_argv(int argc, char *argv[]) { break; case ARG_SET_CREDENTIAL: - r = machine_credential_set(&arg_credentials, &arg_n_credentials, optarg); + r = machine_credential_set(&arg_credentials, optarg); if (r < 0) return r; @@ -1576,7 +1577,7 @@ static int parse_argv(int argc, char *argv[]) { break; case ARG_LOAD_CREDENTIAL: - r = machine_credential_load(&arg_credentials, &arg_n_credentials, optarg); + r = machine_credential_load(&arg_credentials, optarg); if (r < 0) return r; @@ -1607,6 +1608,12 @@ static int parse_argv(int argc, char *argv[]) { return r; break; + case ARG_BACKGROUND: + r = free_and_strdup_warn(&arg_background, optarg); + if (r < 0) + return r; + break; + case '?': return -EINVAL; @@ -1653,6 +1660,21 @@ static int parse_argv(int argc, char *argv[]) { static int verify_arguments(void) { int r; + SET_FLAG(arg_mount_settings, MOUNT_PRIVILEGED, arg_privileged); + + if (!arg_privileged) { + /* machined is not accessible to unpriv clients */ + if (arg_register) { + log_notice("Automatically implying --register=no, since machined is not accessible to unprivileged clients."); + arg_register = false; + } + + if (!arg_private_network) { + log_notice("Automatically implying --private-network, since mounting /sys/ in an unprivileged user namespaces requires network namespacing."); + arg_private_network = true; + } + } + if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) { /* If we are running the stub init in the container, we don't need to look at what the init * in the container supports, because we are not using it. Let's immediately pick the right @@ -2184,7 +2206,7 @@ static int copy_devnodes(const char *dest) { if (mknod(to, st.st_mode, st.st_rdev) < 0) { /* Explicitly warn the user when /dev is already populated. */ if (errno == EEXIST) - log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest); + log_notice("%s/dev/ is pre-mounted and pre-populated. If a pre-mounted /dev/ is provided it needs to be an unpopulated file system.", dest); if (errno != EPERM) return log_error_errno(errno, "mknod(%s) failed: %m", to); @@ -2365,18 +2387,44 @@ static int setup_keyring(void) { return 0; } +int make_run_host(const char *root) { + int r; + + assert(root); + + r = userns_mkdir(root, "/run/host", 0755, 0, 0); + if (r < 0) + return log_error_errno(r, "Failed to create /run/host/: %m"); + + return 0; +} + static int setup_credentials(const char *root) { + bool world_readable = false; const char *q; int r; - if (arg_n_credentials <= 0) + if (arg_credentials.n_credentials == 0) return 0; - r = userns_mkdir(root, "/run/host", 0755, 0, 0); + /* If starting a single-process container as a non-root user, the uid will only be resolved after we + * are inside the inner child, when credential directories and files are already read-only, so they + * are unusable as the single process won't have access to them. We also don't have access to the + * uid that will actually be used from here, as we are setting credentials up from the outer child. + * In order to make them usable as requested by the configuration, make them world readable in that + * case, as by definition there are no other processes in that case besides the one being started, + * which is being configured to be able to access credentials, and any of its children which will + * inherit its privileges anyway. To ensure this, also enforce (and document) that + * --no-new-privileges is necessary for this combination to work. */ + if (arg_no_new_privileges && !isempty(arg_user) && !STR_IN_SET(arg_user, "root", "0") && + arg_start_mode == START_PID1) + world_readable = true; + + r = make_run_host(root); if (r < 0) - return log_error_errno(r, "Failed to create /run/host: %m"); + return r; - r = userns_mkdir(root, "/run/host/credentials", 0700, 0, 0); + r = userns_mkdir(root, "/run/host/credentials", world_readable ? 0777 : 0700, 0, 0); if (r < 0) return log_error_errno(r, "Failed to create /run/host/credentials: %m"); @@ -2385,23 +2433,23 @@ static int setup_credentials(const char *root) { if (r < 0) return r; - for (size_t i = 0; i < arg_n_credentials; i++) { + FOREACH_ARRAY(cred, arg_credentials.credentials, arg_credentials.n_credentials) { _cleanup_free_ char *j = NULL; _cleanup_close_ int fd = -EBADF; - j = path_join(q, arg_credentials[i].id); + j = path_join(q, cred->id); if (!j) return log_oom(); - fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, 0600); + fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, world_readable ? 0666 : 0600); if (fd < 0) return log_error_errno(errno, "Failed to create credential file %s: %m", j); - r = loop_write(fd, arg_credentials[i].data, arg_credentials[i].size); + r = loop_write(fd, cred->data, cred->size); if (r < 0) return log_error_errno(r, "Failed to write credential to file %s: %m", j); - if (fchmod(fd, 0400) < 0) + if (fchmod(fd, world_readable ? 0444 : 0400) < 0) return log_error_errno(errno, "Failed to adjust access mode of %s: %m", j); if (arg_userns_mode != USER_NAMESPACE_NO) { @@ -2410,7 +2458,7 @@ static int setup_credentials(const char *root) { } } - if (chmod(q, 0500) < 0) + if (chmod(q, world_readable ? 0555 : 0500) < 0) return log_error_errno(errno, "Failed to adjust access mode of %s: %m", q); r = userns_lchown(q, 0, 0); @@ -2536,7 +2584,7 @@ static int setup_journal(const char *directory) { p = strjoina("/var/log/journal/", SD_ID128_TO_STRING(arg_uuid)); q = prefix_roota(directory, p); - if (path_is_mount_point(p, NULL, 0) > 0) { + if (path_is_mount_point(p) > 0) { if (try) return 0; @@ -2544,7 +2592,7 @@ static int setup_journal(const char *directory) { "%s: already a mount point, refusing to use for journal", p); } - if (path_is_mount_point(q, NULL, 0) > 0) { + if (path_is_mount_point(q) > 0) { if (try) return 0; @@ -2620,7 +2668,7 @@ static int setup_journal(const char *directory) { r = mount_nofollow_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL); if (r < 0) - return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m"); + return log_error_errno(r, "Failed to bind mount journal from host into guest: %m"); return 0; } @@ -2680,6 +2728,9 @@ static int reset_audit_loginuid(void) { if ((arg_clone_ns_flags & CLONE_NEWPID) == 0) return 0; + if (!arg_privileged) + return 0; + r = read_one_line_file("/proc/self/loginuid", &p); if (r == -ENOENT) return 0; @@ -2709,14 +2760,19 @@ static int mount_tunnel_dig(const char *root) { const char *p, *q; int r; + if (!arg_privileged) { + log_debug("Not digging mount tunnel, because running unprivileged."); + return 0; + } + (void) mkdir_p("/run/systemd/nspawn/", 0755); (void) mkdir_p("/run/systemd/nspawn/propagate", 0600); p = strjoina("/run/systemd/nspawn/propagate/", arg_machine); (void) mkdir_p(p, 0600); - r = userns_mkdir(root, "/run/host", 0755, 0, 0); + r = make_run_host(root); if (r < 0) - return log_error_errno(r, "Failed to create /run/host: %m"); + return r; r = userns_mkdir(root, NSPAWN_MOUNT_TUNNEL, 0600, 0, 0); if (r < 0) @@ -2737,6 +2793,11 @@ static int mount_tunnel_dig(const char *root) { static int mount_tunnel_open(void) { int r; + if (!arg_privileged) { + log_debug("Not opening up mount tunnel, because running unprivileged."); + return 0; + } + r = mount_follow_verbose(LOG_ERR, NULL, NSPAWN_MOUNT_TUNNEL, NULL, MS_SLAVE, NULL); if (r < 0) return r; @@ -2913,14 +2974,72 @@ static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *erro return 0; } +static int pick_paths(void) { + int r; + + if (arg_directory) { + _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL; + PickFilter filter = pick_filter_image_dir; + + filter.architecture = arg_architecture; + + r = path_pick_update_warn( + &arg_directory, + &filter, + PICK_ARCHITECTURE|PICK_TRIES, + &result); + if (r < 0) { + /* Accept ENOENT here so that the --template= logic can work */ + if (r != -ENOENT) + return r; + } else + arg_architecture = result.architecture; + } + + if (arg_image) { + _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL; + PickFilter filter = pick_filter_image_raw; + + filter.architecture = arg_architecture; + + r = path_pick_update_warn( + &arg_image, + &filter, + PICK_ARCHITECTURE|PICK_TRIES, + &result); + if (r < 0) + return r; + + arg_architecture = result.architecture; + } + + if (arg_template) { + _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL; + PickFilter filter = pick_filter_image_dir; + + filter.architecture = arg_architecture; + + r = path_pick_update_warn( + &arg_template, + &filter, + PICK_ARCHITECTURE, + &result); + if (r < 0) + return r; + + arg_architecture = result.architecture; + } + + return 0; +} + static int determine_names(void) { int r; if (arg_template && !arg_directory && arg_machine) { - /* If --template= was specified then we should not - * search for a machine, but instead create a new one - * in /var/lib/machine. */ + /* If --template= was specified then we should not search for a machine, but instead create a + * new one in /var/lib/machine. */ arg_directory = path_join("/var/lib/machines", arg_machine); if (!arg_directory) @@ -2957,9 +3076,11 @@ static int determine_names(void) { } if (!arg_machine) { - if (arg_directory && path_equal(arg_directory, "/")) + if (arg_directory && path_equal(arg_directory, "/")) { arg_machine = gethostname_malloc(); - else if (arg_image) { + if (!arg_machine) + return log_oom(); + } else if (arg_image) { char *e; r = path_extract_filename(arg_image, &arg_machine); @@ -3198,20 +3319,32 @@ static int inner_child( return r; if (!arg_network_namespace_path && arg_private_network) { - r = unshare(CLONE_NEWNET); + _cleanup_close_ int netns_fd = -EBADF; + + if (arg_privileged) { + if (unshare(CLONE_NEWNET) < 0) + return log_error_errno(errno, "Failed to unshare network namespace: %m"); + } + + netns_fd = namespace_open_by_type(NAMESPACE_NET); + if (netns_fd < 0) + return log_error_errno(netns_fd, "Failed to open newly allocate network namespace: %m"); + + r = send_one_fd(fd_inner_socket, netns_fd, 0); if (r < 0) - return log_error_errno(errno, "Failed to unshare network namespace: %m"); + return log_error_errno(r, "Failed to send network namespace to supervisor: %m"); /* Tell the parent that it can setup network interfaces. */ (void) barrier_place(barrier); /* #3 */ } - r = mount_sysfs(NULL, arg_mount_settings); - if (r < 0) - return r; + if (arg_privileged) { + r = mount_sysfs(NULL, arg_mount_settings); + if (r < 0) + return r; + } - /* Wait until we are cgroup-ified, so that we - * can mount the right cgroup path writable */ + /* Wait until we are cgroup-ified, so that we can mount the right cgroup path writable */ if (!barrier_place_and_sync(barrier)) /* #4 */ return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early"); @@ -3396,7 +3529,7 @@ static int inner_child( if (asprintf(envp + n_env++, "container_uuid=%s", SD_ID128_TO_UUID_STRING(arg_uuid)) < 0) return log_oom(); - if (fdset_size(fds) > 0) { + if (!fdset_isempty(fds)) { r = fdset_cloexec(fds, false); if (r < 0) return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors."); @@ -3408,7 +3541,7 @@ static int inner_child( if (asprintf(envp + n_env++, "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0) return log_oom(); - if (arg_n_credentials > 0) { + if (arg_credentials.n_credentials > 0) { envp[n_env] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials"); if (!envp[n_env]) return log_oom(); @@ -3430,6 +3563,9 @@ static int inner_child( if (!barrier_place_and_sync(barrier)) /* #5 */ return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early"); + /* Note, this should be done this late (💣 and not moved earlier! 💣), so that all namespacing + * changes are already in effect by now, so that any resolved paths here definitely reference + * resources inside the container, and not outside of them. */ if (arg_chdir) if (chdir(arg_chdir) < 0) return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir); @@ -3509,11 +3645,11 @@ static int inner_child( return log_error_errno(errno, "execv(%s) failed: %m", exec_target); } -static int setup_notify_child(void) { +static int setup_notify_child(const void *directory) { _cleanup_close_ int fd = -EBADF; - static const union sockaddr_union sa = { + _cleanup_free_ char *j = NULL; + union sockaddr_union sa = { .un.sun_family = AF_UNIX, - .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH, }; int r; @@ -3521,14 +3657,26 @@ static int setup_notify_child(void) { if (fd < 0) return log_error_errno(errno, "Failed to allocate notification socket: %m"); - (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755); - (void) sockaddr_un_unlink(&sa.un); + if (directory) { + j = path_join(directory, NSPAWN_NOTIFY_SOCKET_PATH); + if (!j) + return log_oom(); + } - r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)); + r = sockaddr_un_set_path(&sa.un, j ?: NSPAWN_NOTIFY_SOCKET_PATH); if (r < 0) - return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m"); + return log_error_errno(r, "Failed to set AF_UNIX path to %s: %m", j ?: NSPAWN_NOTIFY_SOCKET_PATH); - r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0); + (void) mkdir_parents(sa.un.sun_path, 0755); + (void) sockaddr_un_unlink(&sa.un); + + WITH_UMASK(0577) { /* only set "w" bit, which is all that's necessary for connecting from the container */ + r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)); + if (r < 0) + return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m"); + } + + r = userns_lchown(sa.un.sun_path, 0, 0); if (r < 0) return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m"); @@ -3539,6 +3687,125 @@ static int setup_notify_child(void) { return TAKE_FD(fd); } +static int setup_unix_export_dir_outside(char **ret) { + int r; + + assert(ret); + + if (!arg_privileged) { + log_debug("Not digging socket tunnel, because running unprivileged."); + return 0; + } + + _cleanup_free_ char *p = NULL; + p = path_join("/run/systemd/nspawn/unix-export", arg_machine); + if (!p) + return log_oom(); + + r = path_is_mount_point(p); + if (r > 0) + return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Mount point '%s' exists already, refusing.", p); + if (r < 0 && r != -ENOENT) + return log_error_errno(r, "Failed to detect if '%s' is a mount point: %m", p); + + r = mkdir_p(p, 0755); + if (r < 0) + return log_error_errno(r, "Failed to create '%s': %m", p); + + _cleanup_(rmdir_and_freep) char *q = TAKE_PTR(p); + + /* Mount the "unix export" directory really tiny, just 64 inodes. We mark the superblock writable + * (since the container shall bind sockets into it). */ + r = mount_nofollow_verbose( + LOG_ERR, + "tmpfs", + q, + "tmpfs", + MS_NODEV|MS_NOEXEC|MS_NOSUID|ms_nosymfollow_supported(), + "size=4M,nr_inodes=64,mode=0755"); + if (r < 0) + return r; + + _cleanup_(umount_and_rmdir_and_freep) char *w = TAKE_PTR(q); + + /* After creating the superblock we change the bind mount to be read-only. This means that the fs + * itself is writable, but not through the mount accessible from the host. */ + r = mount_nofollow_verbose( + LOG_ERR, + /* source= */ NULL, + w, + /* fstype= */ NULL, + MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID|ms_nosymfollow_supported(), + /* options= */ NULL); + if (r < 0) + return r; + + *ret = TAKE_PTR(w); + return 0; +} + +static int setup_unix_export_host_inside(const char *directory, const char *unix_export_path) { + int r; + + assert(directory); + + if (!arg_privileged) + return 0; + + assert(unix_export_path); + + r = make_run_host(directory); + if (r < 0) + return r; + + _cleanup_free_ char *p = path_join(directory, "run/host/unix-export"); + if (!p) + return log_oom(); + + if (mkdir(p, 0755) < 0) + return log_error_errno(errno, "Failed to create '%s': %m", p); + + r = mount_nofollow_verbose( + LOG_ERR, + unix_export_path, + p, + /* fstype= */ NULL, + MS_BIND, + /* options= */ NULL); + if (r < 0) + return r; + + r = mount_nofollow_verbose( + LOG_ERR, + /* source= */ NULL, + p, + /* fstype= */ NULL, + MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID|ms_nosymfollow_supported(), + /* options= */ NULL); + if (r < 0) + return r; + + r = userns_lchown(p, 0, 0); + if (r < 0) + return log_error_errno(r, "Failed to chown '%s': %m", p); + + return 0; +} + +static DissectImageFlags determine_dissect_image_flags(void) { + return + DISSECT_IMAGE_GENERIC_ROOT | + DISSECT_IMAGE_REQUIRE_ROOT | + DISSECT_IMAGE_RELAX_VAR_CHECK | + DISSECT_IMAGE_USR_NO_ROOT | + DISSECT_IMAGE_DISCARD_ON_LOOP | + DISSECT_IMAGE_ADD_PARTITION_DEVICES | + DISSECT_IMAGE_PIN_PARTITION_DEVICES | + (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS) | + DISSECT_IMAGE_ALLOW_USERSPACE_VERITY | + (arg_console_mode == CONSOLE_INTERACTIVE ? DISSECT_IMAGE_ALLOW_INTERACTIVE_AUTH : 0); +} + static int outer_child( Barrier *barrier, const char *directory, @@ -3546,7 +3813,8 @@ static int outer_child( int fd_outer_socket, int fd_inner_socket, FDSet *fds, - int netns_fd) { + int netns_fd, + const char *unix_export_path) { _cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL; _cleanup_strv_free_ char **os_release_pairs = NULL; @@ -3599,10 +3867,8 @@ static int outer_child( arg_uid_shift, arg_uid_range, /* userns_fd= */ -EBADF, + determine_dissect_image_flags()| DISSECT_IMAGE_MOUNT_ROOT_ONLY| - DISSECT_IMAGE_DISCARD_ON_LOOP| - DISSECT_IMAGE_USR_NO_ROOT| - (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)| (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0)); if (r < 0) return r; @@ -3613,7 +3879,12 @@ static int outer_child( return r; if (arg_userns_mode != USER_NAMESPACE_NO) { - r = namespace_open(0, NULL, &mntns_fd, NULL, NULL, NULL); + r = namespace_open(0, + /* ret_pidns_fd = */ NULL, + &mntns_fd, + /* ret_netns_fd = */ NULL, + /* ret_userns_fd = */ NULL, + /* ret_root_fd = */ NULL); if (r < 0) return log_error_errno(r, "Failed to pin outer mount namespace: %m"); @@ -3752,7 +4023,7 @@ static int outer_child( dirs[i] = NULL; - r = remount_idmap(dirs, arg_uid_shift, arg_uid_range, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT); + r = remount_idmap(dirs, arg_uid_shift, arg_uid_range, UID_INVALID, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT); if (r == -EINVAL || ERRNO_IS_NEG_NOT_SUPPORTED(r)) { /* This might fail because the kernel or file system doesn't support idmapping. We * can't really distinguish this nicely, nor do we have any guarantees about the @@ -3773,21 +4044,17 @@ static int outer_child( if (dissected_image) { /* Now we know the uid shift, let's now mount everything else that might be in the image. */ - r = dissected_image_mount( + r = dissected_image_mount_and_warn( dissected_image, directory, arg_uid_shift, arg_uid_range, /* userns_fd= */ -EBADF, + determine_dissect_image_flags()| DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY| - DISSECT_IMAGE_DISCARD_ON_LOOP| - DISSECT_IMAGE_USR_NO_ROOT| - (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)| (idmap ? DISSECT_IMAGE_MOUNT_IDMAPPED : 0)); - if (r == -EUCLEAN) - return log_error_errno(r, "File system check for image failed: %m"); if (r < 0) - return log_error_errno(r, "Failed to mount image file system: %m"); + return r; } if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) { @@ -3840,6 +4107,10 @@ static int outer_child( p = prefix_roota(directory, "/run/host"); (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift); + r = setup_unix_export_host_inside(directory, unix_export_path); + if (r < 0) + return r; + r = setup_pts(directory); if (r < 0) return r; @@ -3889,11 +4160,11 @@ static int outer_child( /* The same stuff as the $container env var, but nicely readable for the entire payload */ p = prefix_roota(directory, "/run/host/container-manager"); - (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE); + (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_MODE_0444); /* The same stuff as the $container_uuid env var */ p = prefix_roota(directory, "/run/host/container-uuid"); - (void) write_string_filef(p, WRITE_STRING_FILE_CREATE, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)); + (void) write_string_filef(p, WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_MODE_0444, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)); if (!arg_use_cgns) { r = mount_cgroups( @@ -3908,47 +4179,59 @@ static int outer_child( return r; } - /* Mark everything as shared so our mounts get propagated down. This is required to make new bind - * mounts available in systemd services inside the container that create a new mount namespace. See - * https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this - * will inherit the shared propagation mode. - * - * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root - * directory mount to root later on. - * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251 - */ - r = mount_switch_root(directory, MS_SHARED); - if (r < 0) - return log_error_errno(r, "Failed to move root directory: %m"); + /* We have different codepaths here for privileged and non-privileged mode. In privileged mode we'll + * now switch into the target directory, and then do the final setup from there. If a user namespace + * is then allocated for the container, the root mount and everything else will be out of reach for + * it. For unprivileged containers we cannot do that however, since we couldn't mount a sysfs and + * procfs then anymore, since that only works if there's an unobstructed instance currently + * visible. Hence there we do it the other way round: we first allocate a new set of namespaces + * (and fork for it) for which we then mount sysfs/procfs, and only then switch root. */ - /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a - * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into - * the container. */ - r = mount_tunnel_open(); - if (r < 0) - return r; + if (arg_privileged) { + /* Mark everything as shared so our mounts get propagated down. This is required to make new + * bind mounts available in systemd services inside the container that create a new mount + * namespace. See https://github.com/systemd/systemd/issues/3860 Further submounts (such as + * /dev/) done after this will inherit the shared propagation mode. + * + * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root + * directory mount to root later on. + * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251 + */ + r = mount_switch_root(directory, MS_SHARED); + if (r < 0) + return log_error_errno(r, "Failed to move root directory: %m"); - if (arg_userns_mode != USER_NAMESPACE_NO) { - /* In order to mount procfs and sysfs in an unprivileged container the kernel - * requires that a fully visible instance is already present in the target mount - * namespace. Mount one here so the inner child can mount its own instances. Later - * we umount the temporary instances created here before we actually exec the - * payload. Since the rootfs is shared the umount will propagate into the container. - * Note, the inner child wouldn't be able to unmount the instances on its own since - * it doesn't own the originating mount namespace. IOW, the outer child needs to do - * this. */ - r = pin_fully_visible_fs(); + /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a + * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into + * the container. */ + r = mount_tunnel_open(); if (r < 0) return r; - } - fd = setup_notify_child(); + if (arg_userns_mode != USER_NAMESPACE_NO) { + /* In order to mount procfs and sysfs in an unprivileged container the kernel + * requires that a fully visible instance is already present in the target mount + * namespace. Mount one here so the inner child can mount its own instances. Later + * we umount the temporary instances created here before we actually exec the + * payload. Since the rootfs is shared the umount will propagate into the container. + * Note, the inner child wouldn't be able to unmount the instances on its own since + * it doesn't own the originating mount namespace. IOW, the outer child needs to do + * this. */ + r = pin_fully_visible_fs(); + if (r < 0) + return r; + } + + fd = setup_notify_child(NULL); + } else + fd = setup_notify_child(directory); if (fd < 0) return fd; pid = raw_clone(SIGCHLD|CLONE_NEWNS| arg_clone_ns_flags | - (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0)); + (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0) | + ((arg_private_network && !arg_privileged) ? CLONE_NEWNET : 0)); if (pid < 0) return log_error_errno(errno, "Failed to fork inner child: %m"); if (pid == 0) { @@ -3958,11 +4241,35 @@ static int outer_child( * user if user namespaces are turned on. */ if (arg_network_namespace_path) { - r = namespace_enter(-1, -1, netns_fd, -1, -1); + r = namespace_enter(/* pidns_fd = */ -EBADF, + /* mntns_fd = */ -EBADF, + netns_fd, + /* userns_fd = */ -EBADF, + /* root_fd = */ -EBADF); if (r < 0) return log_error_errno(r, "Failed to join network namespace: %m"); } + if (!arg_privileged) { + /* In unprivileged operation, sysfs + procfs are special, we'll have to mount them + * inside the inner namespaces, but before we switch root. Hence do so here. */ + _cleanup_free_ char *j = path_join(directory, "/proc"); + if (!j) + return log_oom(); + + r = mount_follow_verbose(LOG_ERR, "proc", j, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL); + if (r < 0) + return r; + + r = mount_sysfs(directory, arg_mount_settings); + if (r < 0) + return r; + + r = mount_switch_root(directory, MS_SHARED); + if (r < 0) + return log_error_errno(r, "Failed to move root directory: %m"); + } + r = inner_child(barrier, fd_inner_socket, fds, os_release_pairs); if (r < 0) _exit(EXIT_FAILURE); @@ -4030,13 +4337,13 @@ static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) { return r; /* Make some superficial checks whether the range is currently known in the user database */ - if (getpwuid(candidate)) + if (getpwuid_malloc(candidate, /* ret= */ NULL) >= 0) goto next; - if (getpwuid(candidate + UINT32_C(0xFFFE))) + if (getpwuid_malloc(candidate + UINT32_C(0xFFFE), /* ret= */ NULL) >= 0) goto next; - if (getgrgid(candidate)) + if (getgrgid_malloc(candidate, /* ret= */ NULL) >= 0) goto next; - if (getgrgid(candidate + UINT32_C(0xFFFE))) + if (getgrgid_malloc(candidate + UINT32_C(0xFFFE), /* ret= */ NULL) >= 0) goto next; *ret_lock_file = lf; @@ -4217,6 +4524,17 @@ static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t r if (!tags) return log_oom(); + if (DEBUG_LOGGING) { + _cleanup_free_ char *joined = strv_join(tags, " "); + + if (joined) { + _cleanup_free_ char *j = cescape(joined); + free_and_replace(joined, j); + } + + log_debug("Got sd_notify() message: %s", strnull(joined)); + } + if (strv_contains(tags, "READY=1")) { r = sd_notify(false, "READY=1\n"); if (r < 0) @@ -4233,6 +4551,9 @@ static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t r static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) { int r; + if (fd < 0) + return 0; + r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid); if (r < 0) return log_error_errno(r, "Failed to allocate notify event source: %m"); @@ -4242,6 +4563,25 @@ static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, return 0; } +static void set_window_title(PTYForward *f) { + _cleanup_free_ char *hn = NULL, *dot = NULL; + + assert(f); + + (void) gethostname_strict(&hn); + + if (emoji_enabled()) + dot = strjoin(special_glyph(SPECIAL_GLYPH_BLUE_CIRCLE), " "); + + if (hn) + (void) pty_forward_set_titlef(f, "%sContainer %s on %s", strempty(dot), arg_machine, hn); + else + (void) pty_forward_set_titlef(f, "%sContainer %s", strempty(dot), arg_machine); + + if (dot) + (void) pty_forward_set_title_prefix(f, dot); +} + static int merge_settings(Settings *settings, const char *path) { int rl; @@ -4457,7 +4797,7 @@ static int merge_settings(Settings *settings, const char *path) { #endif } - for (rl = 0; rl < _RLIMIT_MAX; rl ++) { + for (rl = 0; rl < _RLIMIT_MAX; rl++) { if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl))) continue; @@ -4593,26 +4933,28 @@ static int load_settings(void) { return 0; /* We first look in the admin's directories in /etc and /run */ - FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") { - _cleanup_free_ char *j = NULL; + if (arg_privileged) { + FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") { + _cleanup_free_ char *j = NULL; - j = path_join(i, arg_settings_filename); - if (!j) - return log_oom(); + j = path_join(i, arg_settings_filename); + if (!j) + return log_oom(); - f = fopen(j, "re"); - if (f) { - p = TAKE_PTR(j); + f = fopen(j, "re"); + if (f) { + p = TAKE_PTR(j); - /* By default, we trust configuration from /etc and /run */ - if (arg_settings_trusted < 0) - arg_settings_trusted = true; + /* By default, we trust configuration from /etc and /run */ + if (arg_settings_trusted < 0) + arg_settings_trusted = true; - break; - } + break; + } - if (errno != ENOENT) - return log_error_errno(errno, "Failed to open %s: %m", j); + if (errno != ENOENT) + return log_error_errno(errno, "Failed to open %s: %m", j); + } } if (!f) { @@ -4672,10 +5014,14 @@ static int load_oci_bundle(void) { static int run_container( DissectedImage *dissected_image, + int userns_fd, FDSet *fds, - char veth_name[IFNAMSIZ], bool *veth_created, + char veth_name[IFNAMSIZ], + bool *veth_created, struct ExposeArgs *expose_args, - int *master, pid_t *pid, int *ret) { + int *master, + pid_t *pid, + int *ret) { static const struct sigaction sa = { .sa_handler = nop_signal_handler, @@ -4691,6 +5037,7 @@ static int run_container( _cleanup_close_ int notify_socket = -EBADF, mntns_fd = -EBADF, fd_kmsg_fifo = -EBADF; _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL; _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL; + _cleanup_(umount_and_rmdir_and_freep) char *unix_export_host_dir = NULL; _cleanup_(sd_event_unrefp) sd_event *event = NULL; _cleanup_(pty_forward_freep) PTYForward *forward = NULL; _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; @@ -4706,6 +5053,11 @@ static int run_container( assert_se(sigemptyset(&mask_chld) == 0); assert_se(sigaddset(&mask_chld, SIGCHLD) == 0); + /* Set up the unix export host directory on the host first */ + r = setup_unix_export_dir_outside(&unix_export_host_dir); + if (r < 0) + return r; + if (arg_userns_mode == USER_NAMESPACE_PICK) { /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely * check with getpwuid() if the specific user already exists. Note that /etc might be @@ -4754,11 +5106,44 @@ static int run_container( "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path); } - *pid = raw_clone(SIGCHLD|CLONE_NEWNS); - if (*pid < 0) - return log_error_errno(errno, "clone() failed%s: %m", - errno == EINVAL ? - ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : ""); + if (arg_privileged) { + assert(userns_fd < 0); + + /* If we have no user namespace then we'll clone and create a new mount namespace right-away. */ + + *pid = raw_clone(SIGCHLD|CLONE_NEWNS); + if (*pid < 0) + return log_error_errno(errno, "clone() failed%s: %m", + errno == EINVAL ? + ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : ""); + } else { + assert(userns_fd >= 0); + + /* If we have a user namespace then we'll clone() first, and then join the user namespace, + * and then open the mount namespace, so that it is owned by the user namespace */ + + *pid = raw_clone(SIGCHLD); + if (*pid < 0) + return log_error_errno(errno, "clone() failed: %m"); + + if (*pid == 0) { + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + log_error_errno(errno, "Failed to join allocate user namespace: %m"); + _exit(EXIT_FAILURE); + } + + r = reset_uid_gid(); + if (r < 0) { + log_error_errno(r, "Failed to reset UID/GID to root: %m"); + _exit(EXIT_FAILURE); + } + + if (unshare(CLONE_NEWNS) < 0) { + log_error_errno(errno, "Failed to unshare file system namespace: %m"); + _exit(EXIT_FAILURE); + } + } + } if (*pid == 0) { /* The outer child only has a file system namespace. */ @@ -4776,7 +5161,8 @@ static int run_container( fd_outer_socket_pair[1], fd_inner_socket_pair[1], fds, - child_netns_fd); + child_netns_fd, + unix_export_host_dir); if (r < 0) _exit(EXIT_FAILURE); @@ -4894,14 +5280,13 @@ static int run_container( /* Wait until the child has unshared its network namespace. */ if (!barrier_place_and_sync(&barrier)) /* #3 */ return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early"); - } - if (child_netns_fd < 0) { - /* Make sure we have an open file descriptor to the child's network - * namespace so it stays alive even if the child exits. */ - r = namespace_open(*pid, NULL, NULL, &child_netns_fd, NULL, NULL); - if (r < 0) - return log_error_errno(r, "Failed to open child network namespace: %m"); + /* Make sure we have an open file descriptor to the child's network namespace so it + * stays alive even if the child exits. */ + assert(child_netns_fd < 0); + child_netns_fd = receive_one_fd(fd_inner_socket_pair[0], 0); + if (child_netns_fd < 0) + return log_error_errno(r, "Failed to receive child network namespace: %m"); } r = move_network_interfaces(child_netns_fd, arg_network_interfaces); @@ -4909,12 +5294,29 @@ static int run_container( return r; if (arg_network_veth) { - r = setup_veth(arg_machine, *pid, veth_name, - arg_network_bridge || arg_network_zone, &arg_network_provided_mac); - if (r < 0) - return r; - else if (r > 0) - ifi = r; + if (arg_privileged) { + r = setup_veth(arg_machine, *pid, veth_name, + arg_network_bridge || arg_network_zone, &arg_network_provided_mac); + if (r < 0) + return r; + else if (r > 0) + ifi = r; + } else { + _cleanup_free_ char *host_ifname = NULL; + + r = nsresource_add_netif(userns_fd, child_netns_fd, /* namespace_ifname= */ NULL, &host_ifname, /* ret_namespace_ifname= */ NULL); + if (r < 0) + return log_error_errno(r, "Failed to add network interface to container: %m"); + + ifi = if_nametoindex(host_ifname); + if (ifi == 0) + return log_error_errno(errno, "Failed to resolve interface '%s': %m", host_ifname); + + if (strlen(host_ifname) >= IFNAMSIZ) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Host interface name too long?"); + + strcpy(veth_name, host_ifname); + } if (arg_network_bridge) { /* Add the interface to a bridge */ @@ -4953,9 +5355,12 @@ static int run_container( } if (arg_register || !arg_keep_unit) { - r = sd_bus_default_system(&bus); + if (arg_privileged) + r = sd_bus_default_system(&bus); + else + r = sd_bus_default_user(&bus); if (r < 0) - return log_error_errno(r, "Failed to open system bus: %m"); + return log_error_errno(r, "Failed to open bus: %m"); r = sd_bus_set_close_on_exit(bus, false); if (r < 0) @@ -5016,7 +5421,13 @@ static int run_container( } else if (arg_slice || arg_property) log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect."); - r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy); + r = create_subcgroup( + *pid, + arg_keep_unit, + arg_unified_cgroup_hierarchy, + arg_uid_shift, + userns_fd, + arg_privileged); if (r < 0) return r; @@ -5024,14 +5435,8 @@ static int run_container( if (r < 0) return r; - r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift); - if (r < 0) - return r; - - /* Notify the child that the parent is ready with all - * its setup (including cgroup-ification), and that - * the child can now hand over control to the code to - * run inside the container. */ + /* Notify the child that the parent is ready with all its setup (including cgroup-ification), and + * that the child can now hand over control to the code to run inside the container. */ (void) barrier_place(&barrier); /* #4 */ /* Block SIGCHLD here, before notifying child. @@ -5146,9 +5551,23 @@ static int run_container( return log_error_errno(r, "Failed to create PTY forwarder: %m"); if (arg_console_width != UINT_MAX || arg_console_height != UINT_MAX) - (void) pty_forward_set_width_height(forward, - arg_console_width, - arg_console_height); + (void) pty_forward_set_width_height( + forward, + arg_console_width, + arg_console_height); + + if (!arg_background && shall_tint_background()) { + _cleanup_free_ char *bg = NULL; + + r = terminal_tint_color(220 /* blue */, &bg); + if (r < 0) + log_debug_errno(r, "Failed to determine terminal background color, not tinting."); + else + (void) pty_forward_set_background_color(forward, bg); + } else if (!isempty(arg_background)) + (void) pty_forward_set_background_color(forward, arg_background); + + set_window_title(forward); break; default: @@ -5183,38 +5602,10 @@ static int run_container( fd_kmsg_fifo = safe_close(fd_kmsg_fifo); - if (arg_private_network) { - /* Move network interfaces back to the parent network namespace. We use `safe_fork` - * to avoid having to move the parent to the child network namespace. */ - r = safe_fork(NULL, FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_WAIT|FORK_LOG, NULL); + if (arg_private_network && arg_privileged) { + r = move_back_network_interfaces(child_netns_fd, arg_network_interfaces); if (r < 0) return r; - - if (r == 0) { - _cleanup_close_ int parent_netns_fd = -EBADF; - - r = namespace_open(getpid_cached(), NULL, NULL, &parent_netns_fd, NULL, NULL); - if (r < 0) { - log_error_errno(r, "Failed to open parent network namespace: %m"); - _exit(EXIT_FAILURE); - } - - r = namespace_enter(-1, -1, child_netns_fd, -1, -1); - if (r < 0) { - log_error_errno(r, "Failed to enter child network namespace: %m"); - _exit(EXIT_FAILURE); - } - - /* Reverse network interfaces pair list so that interfaces get their initial name back. - * This is about ensuring interfaces get their old name back when being moved back. */ - arg_network_interfaces = strv_reverse(arg_network_interfaces); - - r = move_network_interfaces(parent_netns_fd, arg_network_interfaces); - if (r < 0) - log_error_errno(r, "Failed to move network interfaces back to parent network namespace: %m"); - - _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS); - } } r = wait_for_container(TAKE_PID(*pid), &container_status); @@ -5288,7 +5679,7 @@ static int initialize_rlimits(void) { * don't read the other limits from PID 1 but prefer the static table above. */ }; - int rl; + int rl, r; for (rl = 0; rl < _RLIMIT_MAX; rl++) { /* Let's only fill in what the user hasn't explicitly configured anyway */ @@ -5299,8 +5690,9 @@ static int initialize_rlimits(void) { if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) { /* For these two let's read the limits off PID 1. See above for an explanation. */ - if (prlimit(1, rl, NULL, &buffer) < 0) - return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl)); + r = pid_getrlimit(1, rl, &buffer); + if (r < 0) + return log_error_errno(r, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl)); v = &buffer; } else if (rl == RLIMIT_NOFILE) { @@ -5351,6 +5743,10 @@ static int cant_be_in_netns(void) { if (r == -ENOENT || ERRNO_IS_NEG_DISCONNECT(r)) return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev."); + if (ERRNO_IS_NEG_PRIVILEGE(r)) { + log_debug_errno(r, "Can't connect to udev control socket, assuming we are in same netns."); + return 0; + } if (r < 0) return log_error_errno(r, "Failed to connect socket to udev control socket: %m"); @@ -5369,7 +5765,7 @@ static int cant_be_in_netns(void) { static int run(int argc, char *argv[]) { bool remove_directory = false, remove_image = false, veth_created = false, remove_tmprootdir = false; - _cleanup_close_ int master = -EBADF; + _cleanup_close_ int master = -EBADF, userns_fd = -EBADF; _cleanup_fdset_free_ FDSet *fds = NULL; int r, n_fd_passed, ret = EXIT_SUCCESS; char veth_name[IFNAMSIZ] = ""; @@ -5381,20 +5777,14 @@ static int run(int argc, char *argv[]) { _cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL; pid_t pid = 0; - log_parse_environment(); - log_open(); + log_setup(); + + arg_privileged = getuid() == 0; r = parse_argv(argc, argv); if (r <= 0) goto finish; - if (geteuid() != 0) { - r = log_warning_errno(SYNTHETIC_ERRNO(EPERM), - argc >= 2 ? "Need to be root." : - "Need to be root (and some arguments are usually required).\nHint: try --help"); - goto finish; - } - r = cant_be_in_netns(); if (r < 0) goto finish; @@ -5407,6 +5797,10 @@ static int run(int argc, char *argv[]) { if (r < 0) goto finish; + r = pick_paths(); + if (r < 0) + goto finish; + r = determine_names(); if (r < 0) goto finish; @@ -5421,7 +5815,7 @@ static int run(int argc, char *argv[]) { if (!arg_private_network && arg_userns_mode != USER_NAMESPACE_NO && arg_uid_shift > 0) arg_caps_retain &= ~(UINT64_C(1) << CAP_NET_BIND_SERVICE); - r = cg_unified(); + r = cg_unified(); /* initialize cache early */ if (r < 0) { log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m"); goto finish; @@ -5431,6 +5825,10 @@ static int run(int argc, char *argv[]) { if (r < 0) goto finish; + r = resolve_network_interface_names(arg_network_interfaces); + if (r < 0) + goto finish; + r = verify_network_interfaces_initialized(); if (r < 0) goto finish; @@ -5438,6 +5836,16 @@ static int run(int argc, char *argv[]) { /* Reapply environment settings. */ (void) detect_unified_cgroup_hierarchy_from_environment(); + if (!arg_privileged) { + r = cg_all_unified(); + if (r < 0) { + log_error_errno(r, "Failed to determine if we are in unified cgroupv2 mode: %m"); + goto finish; + } + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Unprivileged operation only supported in unified cgroupv2 mode."); + } + /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if * the result is closed. Note that the container payload child will reset signal mask+handler anyway, * so just turning this off here means we only turn it off in nspawn itself, not any children. */ @@ -5457,9 +5865,21 @@ static int run(int argc, char *argv[]) { * the child. Functions like copy_devnodes() change the umask temporarily. */ umask(0022); + if (arg_console_mode < 0) + arg_console_mode = isatty(STDIN_FILENO) && isatty(STDOUT_FILENO) ? + CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY; + + if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */ + arg_quiet = true; + if (arg_directory) { assert(!arg_image); + if (!arg_privileged) { + r = log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Invoking container from plain directory tree is currently not supported if called without privileges."); + goto finish; + } + /* Safety precaution: let's not allow running images from the live host OS image, as long as * /var from the host will propagate into container dynamically (because bad things happen if * two systems write to the same /var). Let's allow it for the special cases where /var is @@ -5480,7 +5900,7 @@ static int run(int argc, char *argv[]) { /* If the specified path is a mount point we generate the new snapshot immediately * inside it under a random name. However if the specified is not a mount point we * create the new snapshot in the parent directory, just next to it. */ - r = path_is_mount_point(arg_directory, NULL, 0); + r = path_is_mount_point(arg_directory); if (r < 0) { log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory); goto finish; @@ -5496,7 +5916,11 @@ static int run(int argc, char *argv[]) { /* We take an exclusive lock on this image, since it's our private, ephemeral copy * only owned by us and no one else. */ - r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock); + r = image_path_lock( + np, + LOCK_EX|LOCK_NB, + arg_privileged ? &tree_global_lock : NULL, + &tree_local_lock); if (r < 0) { log_error_errno(r, "Failed to lock %s: %m", np); goto finish; @@ -5528,7 +5952,11 @@ static int run(int argc, char *argv[]) { if (r < 0) goto finish; - r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock); + r = image_path_lock( + arg_directory, + (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, + arg_privileged ? &tree_global_lock : NULL, + &tree_local_lock); if (r == -EBUSY) { log_error_errno(r, "Directory tree %s is currently busy.", arg_directory); goto finish; @@ -5620,15 +6048,12 @@ static int run(int argc, char *argv[]) { } else { DissectImageFlags dissect_image_flags = - DISSECT_IMAGE_GENERIC_ROOT | - DISSECT_IMAGE_REQUIRE_ROOT | - DISSECT_IMAGE_RELAX_VAR_CHECK | - DISSECT_IMAGE_USR_NO_ROOT | - DISSECT_IMAGE_ADD_PARTITION_DEVICES | - DISSECT_IMAGE_PIN_PARTITION_DEVICES; + determine_dissect_image_flags(); + assert(arg_image); assert(!arg_template); + r = chase_and_update(&arg_image, 0); if (r < 0) goto finish; @@ -5643,9 +6068,13 @@ static int run(int argc, char *argv[]) { } /* Always take an exclusive lock on our own ephemeral copy. */ - r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock); + r = image_path_lock( + np, + LOCK_EX|LOCK_NB, + arg_privileged ? &tree_global_lock : NULL, + &tree_local_lock); if (r < 0) { - r = log_error_errno(r, "Failed to create image lock: %m"); + log_error_errno(r, "Failed to create image lock: %m"); goto finish; } @@ -5668,13 +6097,17 @@ static int run(int argc, char *argv[]) { free_and_replace(arg_image, np); remove_image = true; } else { - r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock); + r = image_path_lock( + arg_image, + (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, + arg_privileged ? &tree_global_lock : NULL, + &tree_local_lock); if (r == -EBUSY) { - r = log_error_errno(r, "Disk image %s is currently busy.", arg_image); + log_error_errno(r, "Disk image %s is currently busy.", arg_image); goto finish; } if (r < 0) { - r = log_error_errno(r, "Failed to create image lock: %m"); + log_error_errno(r, "Failed to create image lock: %m"); goto finish; } @@ -5703,56 +6136,80 @@ static int run(int argc, char *argv[]) { goto finish; } - r = loop_device_make_by_path( - arg_image, - arg_read_only ? O_RDONLY : O_RDWR, - /* sector_size= */ UINT32_MAX, - FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN, - LOCK_SH, - &loop); - if (r < 0) { - log_error_errno(r, "Failed to set up loopback block device: %m"); - goto finish; - } + if (arg_privileged) { + r = loop_device_make_by_path( + arg_image, + arg_read_only ? O_RDONLY : O_RDWR, + /* sector_size= */ UINT32_MAX, + FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN, + LOCK_SH, + &loop); + if (r < 0) { + log_error_errno(r, "Failed to set up loopback block device: %m"); + goto finish; + } - r = dissect_loop_device_and_warn( - loop, - &arg_verity_settings, - /* mount_options=*/ NULL, - arg_image_policy ?: &image_policy_container, - dissect_image_flags, - &dissected_image); - if (r == -ENOPKG) { - /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */ - log_notice("Note that the disk image needs to\n" - " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n" - " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n" - " c) or follow https://uapi-group.org/specifications/specs/discoverable_partitions_specification\n" - " d) or contain a file system without a partition table\n" - "in order to be bootable with systemd-nspawn."); - goto finish; - } - if (r < 0) - goto finish; + r = dissect_loop_device_and_warn( + loop, + &arg_verity_settings, + /* mount_options=*/ NULL, + arg_image_policy ?: &image_policy_container, + dissect_image_flags, + &dissected_image); + if (r == -ENOPKG) { + /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */ + log_notice("Note that the disk image needs to\n" + " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n" + " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n" + " c) or follow https://uapi-group.org/specifications/specs/discoverable_partitions_specification\n" + " d) or contain a file system without a partition table\n" + "in order to be bootable with systemd-nspawn."); + goto finish; + } + if (r < 0) + goto finish; - r = dissected_image_load_verity_sig_partition( - dissected_image, - loop->fd, - &arg_verity_settings); - if (r < 0) - goto finish; + r = dissected_image_load_verity_sig_partition( + dissected_image, + loop->fd, + &arg_verity_settings); + if (r < 0) + goto finish; - if (dissected_image->has_verity && !arg_verity_settings.root_hash && !dissected_image->has_verity_sig) - log_notice("Note: image %s contains verity information, but no root hash specified and no embedded " - "root hash signature found! Proceeding without integrity checking.", arg_image); + if (dissected_image->has_verity && !arg_verity_settings.root_hash && !dissected_image->has_verity_sig) + log_notice("Note: image %s contains verity information, but no root hash specified and no embedded " + "root hash signature found! Proceeding without integrity checking.", arg_image); - r = dissected_image_decrypt_interactively( - dissected_image, - NULL, - &arg_verity_settings, - 0); - if (r < 0) - goto finish; + r = dissected_image_decrypt_interactively( + dissected_image, + NULL, + &arg_verity_settings, + dissect_image_flags); + if (r < 0) + goto finish; + } else { + _cleanup_free_ char *userns_name = strjoin("nspawn-", arg_machine); + if (!userns_name) { + r = log_oom(); + goto finish; + } + + /* if we are unprivileged, let's allocate a 64K userns first */ + userns_fd = nsresource_allocate_userns(userns_name, UINT64_C(0x10000)); + if (userns_fd < 0) { + r = log_error_errno(userns_fd, "Failed to allocate user namespace with 64K users: %m"); + goto finish; + } + + r = mountfsd_mount_image( + arg_image, + userns_fd, + arg_image_policy, + dissect_image_flags, + &dissected_image); + if (r < 0) + goto finish; + } /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */ if (remove_image && unlink(arg_image) >= 0) @@ -5766,19 +6223,20 @@ static int run(int argc, char *argv[]) { if (r < 0) goto finish; - if (arg_console_mode < 0) - arg_console_mode = - isatty(STDIN_FILENO) > 0 && - isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY; + if (!arg_quiet) { + const char *t = arg_image ?: arg_directory; + _cleanup_free_ char *u = NULL; + (void) terminal_urlify_path(t, t, &u); - if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */ - arg_quiet = true; + log_info("%s %sSpawning container %s on %s.%s", + special_glyph(SPECIAL_GLYPH_LIGHT_SHADE), ansi_grey(), arg_machine, u ?: t, ansi_normal()); - if (!arg_quiet) - log_info("Spawning container %s on %s.\nPress Ctrl-] three times within 1s to kill container.", - arg_machine, arg_image ?: arg_directory); + if (arg_console_mode == CONSOLE_INTERACTIVE) + log_info("%s %sPress %sCtrl-]%s three times within 1s to kill container.%s", + special_glyph(SPECIAL_GLYPH_LIGHT_SHADE), ansi_grey(), ansi_highlight(), ansi_grey(), ansi_normal()); + } - assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, SIGRTMIN+18, -1) >= 0); + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, SIGRTMIN+18) >= 0); r = make_reaper_process(true); if (r < 0) { @@ -5795,11 +6253,13 @@ static int run(int argc, char *argv[]) { expose_args.fw_ctx = fw_ctx; } for (;;) { - r = run_container(dissected_image, - fds, - veth_name, &veth_created, - &expose_args, &master, - &pid, &ret); + r = run_container( + dissected_image, + userns_fd, + fds, + veth_name, &veth_created, + &expose_args, &master, + &pid, &ret); if (r <= 0) break; } @@ -5841,25 +6301,30 @@ finish: log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir); } - if (arg_machine) { + if (arg_machine && arg_privileged) { const char *p; p = strjoina("/run/systemd/nspawn/propagate/", arg_machine); (void) rm_rf(p, REMOVE_ROOT); + + p = strjoina("/run/systemd/nspawn/unix-export/", arg_machine); + (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW); + (void) rmdir(p); } expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET, &expose_args.address4); expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6); - if (veth_created) - (void) remove_veth_links(veth_name, arg_network_veth_extra); - (void) remove_bridge(arg_network_zone); + if (arg_privileged) { + if (veth_created) + (void) remove_veth_links(veth_name, arg_network_veth_extra); + (void) remove_bridge(arg_network_zone); + } custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts); expose_port_free_all(arg_expose_ports); rlimit_free_all(arg_rlimit); device_node_array_free(arg_extra_nodes, arg_n_extra_nodes); - machine_credential_free_all(arg_credentials, arg_n_credentials); if (r < 0) return r; diff --git a/src/nspawn/nspawn.h b/src/nspawn/nspawn.h index 27fb0b4..556f8ee 100644 --- a/src/nspawn/nspawn.h +++ b/src/nspawn/nspawn.h @@ -5,3 +5,4 @@ int userns_lchown(const char *p, uid_t uid, gid_t gid); int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid); +int make_run_host(const char *root); diff --git a/src/nspawn/test-nspawn-util.c b/src/nspawn/test-nspawn-util.c index 08c8050..533edde 100644 --- a/src/nspawn/test-nspawn-util.c +++ b/src/nspawn/test-nspawn-util.c @@ -8,7 +8,7 @@ TEST(systemd_installation_has_version) { int r; - FOREACH_STRING(version, "0", "231", STRINGIFY(PROJECT_VERSION), "999") { + FOREACH_STRING(version, "0", "231", PROJECT_VERSION_FULL, "999") { r = systemd_installation_has_version(saved_argv[1], version); assert_se(r >= 0); log_info("%s has systemd >= %s: %s", |