16 files changed, 1290 insertions, 575 deletions
diff --git a/src/nspawn/nspawn-bind-user.c b/src/nspawn/nspawn-bind-user.c
index 61d8d30..018e7a3 100644
--- a/src/nspawn/nspawn-bind-user.c
+++ b/src/nspawn/nspawn-bind-user.c
@@ -153,7 +153,7 @@ static int find_free_uid(const char *directory, uid_t max_uid, uid_t *current_ui
         assert(directory);
         assert(current_uid);
 
-        for (;; (*current_uid) ++) {
+        for (;; (*current_uid)++) {
                 if (*current_uid > MAP_UID_MAX || *current_uid > max_uid)
                         return log_error_errno(
                                         SYNTHETIC_ERRNO(EBUSY),
@@ -388,9 +388,9 @@ int bind_user_setup(
         if (!c || c->n_data == 0)
                 return 0;
 
-        r = userns_mkdir(root, "/run/host", 0755, 0, 0);
+        r = make_run_host(root);
         if (r < 0)
-                return log_error_errno(r, "Failed to create /run/host: %m");
+                return r;
 
         r = userns_mkdir(root, "/run/host/home", 0755, 0, 0);
         if (r < 0)
diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c
index a500243..4f28b4a 100644
--- a/src/nspawn/nspawn-cgroup.c
+++ b/src/nspawn/nspawn-cgroup.c
@@ -13,6 +13,7 @@
 #include "mountpoint-util.h"
 #include "nspawn-cgroup.h"
 #include "nspawn-mount.h"
+#include "nsresource.h"
 #include "path-util.h"
 #include "rm-rf.h"
 #include "string-util.h"
@@ -46,38 +47,6 @@ static int chown_cgroup_path(const char *path, uid_t uid_shift) {
         return 0;
 }
 
-int chown_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift) {
-        _cleanup_free_ char *path = NULL, *fs = NULL;
-        int r;
-
-        r = cg_pid_get_path(NULL, pid, &path);
-        if (r < 0)
-                return log_error_errno(r, "Failed to get container cgroup path: %m");
-
-        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
-        if (r < 0)
-                return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
-
-        r = chown_cgroup_path(fs, uid_shift);
-        if (r < 0)
-                return log_error_errno(r, "Failed to chown() cgroup %s: %m", fs);
-
-        if (unified_requested == CGROUP_UNIFIED_SYSTEMD || (unified_requested == CGROUP_UNIFIED_NONE && cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)) {
-                _cleanup_free_ char *lfs = NULL;
-                /* Always propagate access rights from unified to legacy controller */
-
-                r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, NULL, &lfs);
-                if (r < 0)
-                        return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
-
-                r = chown_cgroup_path(lfs, uid_shift);
-                if (r < 0)
-                        return log_error_errno(r, "Failed to chown() cgroup %s: %m", lfs);
-        }
-
-        return 0;
-}
-
 int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift) {
         _cleanup_free_ char *cgroup = NULL;
         char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1];
@@ -142,7 +111,14 @@ finish:
         return r;
 }
 
-int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested) {
+int create_subcgroup(
+                pid_t pid,
+                bool keep_unit,
+                CGroupUnified unified_requested,
+                uid_t uid_shift,
+                int userns_fd,
+                bool privileged) {
+
         _cleanup_free_ char *cgroup = NULL, *payload = NULL;
         CGroupMask supported;
         char *e;
@@ -185,13 +161,54 @@ int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested)
         if (!payload)
                 return log_oom();
 
-        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, payload, pid);
+        if (privileged)
+                r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, payload, pid);
+        else
+                r = cg_create(SYSTEMD_CGROUP_CONTROLLER, payload);
         if (r < 0)
                 return log_error_errno(r, "Failed to create %s subcgroup: %m", payload);
 
+        if (privileged) {
+                _cleanup_free_ char *fs = NULL;
+                r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, payload, NULL, &fs);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
+
+                r = chown_cgroup_path(fs, uid_shift);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to chown() cgroup %s: %m", fs);
+
+        } else if (userns_fd >= 0) {
+                _cleanup_close_ int cgroup_fd = -EBADF;
+
+                cgroup_fd = cg_path_open(SYSTEMD_CGROUP_CONTROLLER, payload);
+                if (cgroup_fd < 0)
+                        return log_error_errno(cgroup_fd, "Failed to open cgroup %s: %m", payload);
+
+                r = cg_fd_attach(cgroup_fd, pid);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to add process " PID_FMT " to cgroup %s: %m", pid, payload);
+
+                r = nsresource_add_cgroup(userns_fd, cgroup_fd);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to add cgroup %s to userns: %m", payload);
+        }
+
+        if (unified_requested == CGROUP_UNIFIED_SYSTEMD || (unified_requested == CGROUP_UNIFIED_NONE && cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)) {
+                _cleanup_free_ char *lfs = NULL;
+                /* Always propagate access rights from unified to legacy controller */
+
+                r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER_LEGACY, payload, NULL, &lfs);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
+
+                r = chown_cgroup_path(lfs, uid_shift);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to chown() cgroup %s: %m", lfs);
+        }
+
         if (keep_unit) {
                 _cleanup_free_ char *supervisor = NULL;
-
                 supervisor = path_join(cgroup, "supervisor");
                 if (!supervisor)
                         return log_oom();
@@ -265,7 +282,7 @@ static int mount_legacy_cgroup_hierarchy(
 
         to = strjoina(strempty(dest), "/sys/fs/cgroup/", hierarchy);
 
-        r = path_is_mount_point(to, dest, 0);
+        r = path_is_mount_point_full(to, dest, /* flags = */ 0);
         if (r < 0 && r != -ENOENT)
                 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
         if (r > 0)
@@ -317,7 +334,7 @@ static int mount_legacy_cgns_supported(
         (void) mkdir_p(cgroup_root, 0755);
 
         /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
-        r = path_is_mount_point(cgroup_root, dest, AT_SYMLINK_FOLLOW);
+        r = path_is_mount_point_full(cgroup_root, dest, AT_SYMLINK_FOLLOW);
         if (r < 0)
                 return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
         if (r == 0) {
@@ -427,7 +444,7 @@ static int mount_legacy_cgns_unsupported(
         (void) mkdir_p(cgroup_root, 0755);
 
         /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
-        r = path_is_mount_point(cgroup_root, dest, AT_SYMLINK_FOLLOW);
+        r = path_is_mount_point_full(cgroup_root, dest, AT_SYMLINK_FOLLOW);
         if (r < 0)
                 return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
         if (r == 0) {
@@ -529,7 +546,7 @@ static int mount_unified_cgroups(const char *dest) {
 
         (void) mkdir_p(p, 0755);
 
-        r = path_is_mount_point(p, dest, AT_SYMLINK_FOLLOW);
+        r = path_is_mount_point_full(p, dest, AT_SYMLINK_FOLLOW);
         if (r < 0)
                 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
         if (r > 0) {
diff --git a/src/nspawn/nspawn-cgroup.h b/src/nspawn/nspawn-cgroup.h
index 3f5ba62..7e2cd53 100644
--- a/src/nspawn/nspawn-cgroup.h
+++ b/src/nspawn/nspawn-cgroup.h
@@ -6,9 +6,8 @@
 
 #include "cgroup-util.h"
 
-int chown_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift);
 int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift);
-int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested);
+int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested, uid_t uid_shift, int userns_fd, bool privileged);
 
 int mount_cgroups(const char *dest, CGroupUnified unified_requested, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool use_cgns);
 int mount_systemd_cgroup_writable(const char *dest, CGroupUnified unified_requested);
diff --git a/src/nspawn/nspawn-gperf.gperf b/src/nspawn/nspawn-gperf.gperf
index 9e1210f..123ef0c 100644
--- a/src/nspawn/nspawn-gperf.gperf
+++ b/src/nspawn/nspawn-gperf.gperf
@@ -58,7 +58,7 @@ Exec.OOMScoreAdjust,          config_parse_oom_score_adjust, 0,
 Exec.CPUAffinity,             config_parse_cpu_affinity,   0,                        0
 Exec.ResolvConf,              config_parse_resolv_conf,    0,                        offsetof(Settings, resolv_conf)
 Exec.LinkJournal,             config_parse_link_journal,   0,                        0
-Exec.Timezone,                config_parse_timezone,       0,                        offsetof(Settings, timezone)
+Exec.Timezone,                config_parse_timezone_mode,  0,                        offsetof(Settings, timezone)
 Exec.SuppressSync,            config_parse_tristate,       0,                        offsetof(Settings, suppress_sync)
 Files.ReadOnly,               config_parse_tristate,       0,                        offsetof(Settings, read_only)
 Files.Volatile,               config_parse_volatile_mode,  0,                        offsetof(Settings, volatile_mode)
diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c
index 470f477..c2bd4f6 100644
--- a/src/nspawn/nspawn-mount.c
+++ b/src/nspawn/nspawn-mount.c
@@ -245,7 +245,7 @@ int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only)
         assert(l);
         assert(n);
 
-        r = extract_many_words(&s, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL);
+        r = extract_many_words(&s, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination);
         if (r < 0)
                 return r;
         if (r == 0)
@@ -444,22 +444,38 @@ int tmpfs_patch_options(
 }
 
 int mount_sysfs(const char *dest, MountSettingsMask mount_settings) {
-        const char *full, *top;
-        int r;
+        _cleanup_free_ char *top = NULL, *full = NULL;;
         unsigned long extra_flags = 0;
+        int r;
 
-        top = prefix_roota(dest, "/sys");
-        r = path_is_fs_type(top, SYSFS_MAGIC);
+        top = path_join(dest, "/sys");
+        if (!top)
+                return log_oom();
+
+        r = path_is_mount_point(top);
         if (r < 0)
-                return log_error_errno(r, "Failed to determine filesystem type of %s: %m", top);
-        /* /sys might already be mounted as sysfs by the outer child in the
-         * !netns case. In this case, it's all good. Don't touch it because we
-         * don't have the right to do so, see https://github.com/systemd/systemd/issues/1555.
-         */
-        if (r > 0)
-                return 0;
+                return log_error_errno(r, "Failed to determine if '%s' is a mountpoint: %m", top);
+        if (r == 0) {
+                /* If this is not a mount point yet, then mount a tmpfs there */
+                r = mount_nofollow_verbose(LOG_ERR, "tmpfs", top, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0555" TMPFS_LIMITS_SYS);
+                if (r < 0)
+                        return r;
+        } else {
+                r = path_is_fs_type(top, SYSFS_MAGIC);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to determine filesystem type of %s: %m", top);
+
+                /* /sys/ might already be mounted as sysfs by the outer child in the !netns case. In this case, it's
+                 * all good. Don't touch it because we don't have the right to do so, see
+                 * https://github.com/systemd/systemd/issues/1555.
+                 */
+                if (r > 0)
+                        return 0;
+        }
 
-        full = prefix_roota(top, "/full");
+        full = path_join(top, "/full");
+        if (!full)
+                return log_oom();
 
         (void) mkdir(full, 0755);
 
@@ -501,10 +517,11 @@ int mount_sysfs(const char *dest, MountSettingsMask mount_settings) {
         if (rmdir(full) < 0)
                 return log_error_errno(errno, "Failed to remove %s: %m", full);
 
-        /* Create mountpoint for cgroups. Otherwise we are not allowed since we
-         * remount /sys read-only.
-         */
-        const char *x = prefix_roota(top, "/fs/cgroup");
+        /* Create mountpoint for cgroups. Otherwise we are not allowed since we remount /sys/ read-only. */
+        _cleanup_free_ char *x = path_join(top, "/fs/cgroup");
+        if (!x)
+                return log_oom();
+
         (void) mkdir_p(x, 0755);
 
         return mount_nofollow_verbose(LOG_ERR, NULL, top, NULL,
@@ -541,7 +558,7 @@ int mount_all(const char *dest,
         } MountPoint;
 
         static const MountPoint mount_table[] = {
-                /* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing) */
+                /* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing when we are privileged) */
                 { "proc",            "/proc",           "proc",  NULL,        PROC_DEFAULT_MOUNT_FLAGS,
                   MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_MKDIR|MOUNT_FOLLOW_SYMLINKS }, /* we follow symlinks here since not following them requires /proc/ already being mounted, which we don't have here. */
 
@@ -575,15 +592,15 @@ int mount_all(const char *dest,
                 { "mqueue",                 "/dev/mqueue",                  "mqueue", NULL,                            MS_NOSUID|MS_NOEXEC|MS_NODEV,
                   MOUNT_IN_USERNS|MOUNT_MKDIR },
 
-                /* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing) */
+                /* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing when we are privileged) */
                 { "tmpfs",                  "/tmp",                         "tmpfs", "mode=01777" NESTED_TMPFS_LIMITS, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
                   MOUNT_FATAL|MOUNT_APPLY_TMPFS_TMP|MOUNT_MKDIR },
                 { "tmpfs",                  "/sys",                         "tmpfs", "mode=0555" TMPFS_LIMITS_SYS,     MS_NOSUID|MS_NOEXEC|MS_NODEV,
-                  MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS|MOUNT_MKDIR },
+                  MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS|MOUNT_MKDIR|MOUNT_PRIVILEGED },
                 { "sysfs",                  "/sys",                         "sysfs", NULL,                             SYS_DEFAULT_MOUNT_FLAGS,
-                  MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO|MOUNT_MKDIR },    /* skipped if above was mounted */
+                  MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO|MOUNT_MKDIR|MOUNT_PRIVILEGED },    /* skipped if above was mounted */
                 { "sysfs",                  "/sys",                         "sysfs", NULL,                             MS_NOSUID|MS_NOEXEC|MS_NODEV,
-                  MOUNT_FATAL|MOUNT_MKDIR },                          /* skipped if above was mounted */
+                  MOUNT_FATAL|MOUNT_MKDIR|MOUNT_PRIVILEGED },                          /* skipped if above was mounted */
                 { "tmpfs",                  "/dev",                         "tmpfs", "mode=0755" TMPFS_LIMITS_PRIVATE_DEV, MS_NOSUID|MS_STRICTATIME,
                   MOUNT_FATAL|MOUNT_MKDIR },
                 { "tmpfs",                  "/dev/shm",                     "tmpfs", "mode=01777" NESTED_TMPFS_LIMITS, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
@@ -604,11 +621,11 @@ int mount_all(const char *dest,
                   MOUNT_FATAL|MOUNT_IN_USERNS },
 #if HAVE_SELINUX
                 { "/sys/fs/selinux",        "/sys/fs/selinux",              NULL,    NULL,                             MS_BIND,
-                  MOUNT_MKDIR },  /* Bind mount first (mkdir/chown the mount point in case /sys/ is mounted as minimal skeleton tmpfs) */
+                  MOUNT_MKDIR|MOUNT_PRIVILEGED },  /* Bind mount first (mkdir/chown the mount point in case /sys/ is mounted as minimal skeleton tmpfs) */
                 { NULL,                     "/sys/fs/selinux",              NULL,    NULL,                             MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
-                  0 },            /* Then, make it r/o (don't mkdir/chown the mount point here, the previous entry already did that) */
+                  MOUNT_PRIVILEGED },              /* Then, make it r/o (don't mkdir/chown the mount point here, the previous entry already did that) */
                 { NULL,                     "/sys/fs/selinux",              NULL,    NULL,                             MS_PRIVATE,
-                  0 },            /* Turn off propagation (we only want that for the mount propagation tunnel dir) */
+                  MOUNT_PRIVILEGED },              /* Turn off propagation (we only want that for the mount propagation tunnel dir) */
 #endif
         };
 
@@ -617,6 +634,7 @@ int mount_all(const char *dest,
         bool ro = FLAGS_SET(mount_settings, MOUNT_APPLY_APIVFS_RO);
         bool in_userns = FLAGS_SET(mount_settings, MOUNT_IN_USERNS);
         bool tmpfs_tmp = FLAGS_SET(mount_settings, MOUNT_APPLY_TMPFS_TMP);
+        bool privileged = FLAGS_SET(mount_settings, MOUNT_PRIVILEGED);
         int r;
 
         for (size_t k = 0; k < ELEMENTSOF(mount_table); k++) {
@@ -624,6 +642,10 @@ int mount_all(const char *dest,
                 bool fatal = FLAGS_SET(mount_table[k].mount_settings, MOUNT_FATAL);
                 const char *o;
 
+                /* If we are not privileged but the entry is marked as privileged and to be mounted outside the user namespace, then skip it */
+                if (!privileged && FLAGS_SET(mount_table[k].mount_settings, MOUNT_PRIVILEGED) && !FLAGS_SET(mount_table[k].mount_settings, MOUNT_IN_USERNS))
+                        continue;
+
                 if (in_userns != FLAGS_SET(mount_table[k].mount_settings, MOUNT_IN_USERNS))
                         continue;
 
@@ -642,7 +664,7 @@ int mount_all(const char *dest,
 
                 /* Skip this entry if it is not a remount. */
                 if (mount_table[k].what) {
-                        r = path_is_mount_point(where, NULL, 0);
+                        r = path_is_mount_point(where);
                         if (r < 0 && r != -ENOENT)
                                 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
                         if (r > 0)
@@ -742,6 +764,8 @@ static int parse_mount_bind_options(const char *options, unsigned long *mount_fl
                         new_idmapping = REMOUNT_IDMAPPING_NONE;
                 else if (streq(word, "rootidmap"))
                         new_idmapping = REMOUNT_IDMAPPING_HOST_OWNER;
+                else if (streq(word, "owneridmap"))
+                        new_idmapping = REMOUNT_IDMAPPING_HOST_OWNER_TO_TARGET_OWNER;
                 else
                         return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
                                                "Invalid bind mount option: %s", word);
@@ -759,6 +783,7 @@ static int mount_bind(const char *dest, CustomMount *m, uid_t uid_shift, uid_t u
         _cleanup_free_ char *mount_opts = NULL, *where = NULL;
         unsigned long mount_flags = MS_BIND | MS_REC;
         struct stat source_st, dest_st;
+        uid_t dest_uid = UID_INVALID;
         int r;
         RemountIdmapping idmapping = REMOUNT_IDMAPPING_NONE;
 
@@ -787,6 +812,8 @@ static int mount_bind(const char *dest, CustomMount *m, uid_t uid_shift, uid_t u
                 if (stat(where, &dest_st) < 0)
                         return log_error_errno(errno, "Failed to stat %s: %m", where);
 
+                dest_uid = dest_st.st_uid;
+
                 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode))
                         return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
                                                "Cannot bind mount directory %s on file %s.",
@@ -815,6 +842,8 @@ static int mount_bind(const char *dest, CustomMount *m, uid_t uid_shift, uid_t u
 
                 if (chown(where, uid_shift, uid_shift) < 0)
                         return log_error_errno(errno, "Failed to chown %s: %m", where);
+
+                dest_uid = uid_shift;
         }
 
         r = mount_nofollow_verbose(LOG_ERR, m->source, where, NULL, mount_flags, mount_opts);
@@ -828,7 +857,7 @@ static int mount_bind(const char *dest, CustomMount *m, uid_t uid_shift, uid_t u
         }
 
         if (idmapping != REMOUNT_IDMAPPING_NONE) {
-                r = remount_idmap(STRV_MAKE(where), uid_shift, uid_range, source_st.st_uid, idmapping);
+                r = remount_idmap(STRV_MAKE(where), uid_shift, uid_range, source_st.st_uid, dest_uid, idmapping);
                 if (r < 0)
                         return log_error_errno(r, "Failed to map ids for bind mount %s: %m", where);
         }
@@ -1388,17 +1417,30 @@ int wipe_fully_visible_fs(int mntns_fd) {
         _cleanup_close_ int orig_mntns_fd = -EBADF;
         int r, rr;
 
-        r = namespace_open(0, NULL, &orig_mntns_fd, NULL, NULL, NULL);
+        r = namespace_open(0,
+                           /* ret_pidns_fd = */ NULL,
+                           &orig_mntns_fd,
+                           /* ret_netns_fd = */ NULL,
+                           /* ret_userns_fd = */ NULL,
+                           /* ret_root_fd = */ NULL);
         if (r < 0)
                 return log_error_errno(r, "Failed to pin originating mount namespace: %m");
 
-        r = namespace_enter(-EBADF, mntns_fd, -EBADF, -EBADF, -EBADF);
+        r = namespace_enter(/* pidns_fd = */ -EBADF,
+                            mntns_fd,
+                            /* netns_fd = */ -EBADF,
+                            /* userns_fd = */ -EBADF,
+                            /* root_fd = */ -EBADF);
         if (r < 0)
                 return log_error_errno(r, "Failed to enter mount namespace: %m");
 
         rr = do_wipe_fully_visible_fs();
 
-        r = namespace_enter(-EBADF, orig_mntns_fd, -EBADF, -EBADF, -EBADF);
+        r = namespace_enter(/* pidns_fd = */ -EBADF,
+                            orig_mntns_fd,
+                            /* netns_fd = */ -EBADF,
+                            /* userns_fd = */ -EBADF,
+                            /* root_fd = */ -EBADF);
         if (r < 0)
                 return log_error_errno(r, "Failed to enter original mount namespace: %m");
 
diff --git a/src/nspawn/nspawn-mount.h b/src/nspawn/nspawn-mount.h
index bf5e47d..54dafa7 100644
--- a/src/nspawn/nspawn-mount.h
+++ b/src/nspawn/nspawn-mount.h
@@ -20,6 +20,7 @@ typedef enum MountSettingsMask {
         MOUNT_TOUCH              = 1 << 9, /* if set, touch file to mount over first */
         MOUNT_PREFIX_ROOT        = 1 << 10,/* if set, prefix the source path with the container's root directory */
         MOUNT_FOLLOW_SYMLINKS    = 1 << 11,/* if set, we'll follow symlinks for the mount target */
+        MOUNT_PRIVILEGED         = 1 << 12,/* if set, we'll only mount this in the outer child if we are running in privileged mode */
 } MountSettingsMask;
 
 typedef enum CustomMountType {
diff --git a/src/nspawn/nspawn-network.c b/src/nspawn/nspawn-network.c
index c661f1d..ec5d396 100644
--- a/src/nspawn/nspawn-network.c
+++ b/src/nspawn/nspawn-network.c
@@ -1,23 +1,34 @@
 /* SPDX-License-Identifier: LGPL-2.1-or-later */
 
+/* Make sure the net/if.h header is included before any linux/ one */
 #include <net/if.h>
 #include <linux/if.h>
+#include <linux/nl80211.h>
 #include <linux/veth.h>
 #include <sys/file.h>
+#include <sys/mount.h>
 
 #include "sd-device.h"
 #include "sd-id128.h"
 #include "sd-netlink.h"
 
 #include "alloc-util.h"
+#include "device-private.h"
+#include "device-util.h"
 #include "ether-addr-util.h"
+#include "fd-util.h"
 #include "hexdecoct.h"
 #include "lock-util.h"
 #include "missing_network.h"
+#include "mkdir.h"
+#include "mount-util.h"
+#include "namespace-util.h"
 #include "netif-naming-scheme.h"
+#include "netif-util.h"
 #include "netlink-util.h"
 #include "nspawn-network.h"
 #include "parse-util.h"
+#include "process-util.h"
 #include "siphash24.h"
 #include "socket-netlink.h"
 #include "socket-util.h"
@@ -31,7 +42,6 @@
 #define VETH_EXTRA_HOST_HASH_KEY SD_ID128_MAKE(48,c7,f6,b7,ea,9d,4c,9e,b7,28,d4,de,91,d5,bf,66)
 #define VETH_EXTRA_CONTAINER_HASH_KEY SD_ID128_MAKE(af,50,17,61,ce,f9,4d,35,84,0d,2b,20,54,be,ce,59)
 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
-#define SHORTEN_IFNAME_HASH_KEY SD_ID128_MAKE(e1,90,a4,04,a8,ef,4b,51,8c,cc,c3,3a,9f,11,fc,a2)
 
 static int remove_one_link(sd_netlink *rtnl, const char *name) {
         _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
@@ -57,51 +67,6 @@ static int remove_one_link(sd_netlink *rtnl, const char *name) {
         return 1;
 }
 
-static int generate_mac(
-                const char *machine_name,
-                struct ether_addr *mac,
-                sd_id128_t hash_key,
-                uint64_t idx) {
-
-        uint64_t result;
-        size_t l, sz;
-        uint8_t *v, *i;
-        int r;
-
-        l = strlen(machine_name);
-        sz = sizeof(sd_id128_t) + l;
-        if (idx > 0)
-                sz += sizeof(idx);
-
-        v = newa(uint8_t, sz);
-
-        /* fetch some persistent data unique to the host */
-        r = sd_id128_get_machine((sd_id128_t*) v);
-        if (r < 0)
-                return r;
-
-        /* combine with some data unique (on this host) to this
-         * container instance */
-        i = mempcpy(v + sizeof(sd_id128_t), machine_name, l);
-        if (idx > 0) {
-                idx = htole64(idx);
-                memcpy(i, &idx, sizeof(idx));
-        }
-
-        /* Let's hash the host machine ID plus the container name. We
-         * use a fixed, but originally randomly created hash key here. */
-        result = htole64(siphash24(v, sz, hash_key.bytes));
-
-        assert_cc(ETH_ALEN <= sizeof(result));
-        memcpy(mac->ether_addr_octet, &result, ETH_ALEN);
-
-        /* see eth_random_addr in the kernel */
-        mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
-        mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
-
-        return 0;
-}
-
 static int set_alternative_ifname(sd_netlink *rtnl, const char *ifname, const char *altifname) {
         int r;
 
@@ -200,39 +165,6 @@ static int add_veth(
         return 0;
 }
 
-static int shorten_ifname(char *ifname) {
-        char new_ifname[IFNAMSIZ];
-
-        assert(ifname);
-
-        if (strlen(ifname) < IFNAMSIZ) /* Name is short enough */
-                return 0;
-
-        if (naming_scheme_has(NAMING_NSPAWN_LONG_HASH)) {
-                uint64_t h;
-
-                /* Calculate 64-bit hash value */
-                h = siphash24(ifname, strlen(ifname), SHORTEN_IFNAME_HASH_KEY.bytes);
-
-                /* Set the final four bytes (i.e. 32-bit) to the lower 24bit of the hash, encoded in url-safe base64 */
-                memcpy(new_ifname, ifname, IFNAMSIZ - 5);
-                new_ifname[IFNAMSIZ - 5] = urlsafe_base64char(h >> 18);
-                new_ifname[IFNAMSIZ - 4] = urlsafe_base64char(h >> 12);
-                new_ifname[IFNAMSIZ - 3] = urlsafe_base64char(h >> 6);
-                new_ifname[IFNAMSIZ - 2] = urlsafe_base64char(h);
-        } else
-                /* On old nspawn versions we just truncated the name, provide compatibility */
-                memcpy(new_ifname, ifname, IFNAMSIZ-1);
-
-        new_ifname[IFNAMSIZ - 1] = 0;
-
-        /* Log the incident to make it more discoverable */
-        log_warning("Network interface name '%s' has been changed to '%s' to fit length constraints.", ifname, new_ifname);
-
-        strcpy(ifname, new_ifname);
-        return 1;
-}
-
 int setup_veth(const char *machine_name,
                pid_t pid,
                char iface_name[IFNAMSIZ],
@@ -252,18 +184,18 @@ int setup_veth(const char *machine_name,
         /* Use two different interface name prefixes depending whether
          * we are in bridge mode or not. */
         n = strjoina(bridge ? "vb-" : "ve-", machine_name);
-        r = shorten_ifname(n);
+        r = net_shorten_ifname(n, /* check_naming_scheme= */ true);
         if (r > 0)
                 a = strjoina(bridge ? "vb-" : "ve-", machine_name);
 
         if (ether_addr_is_null(provided_mac)){
-                r = generate_mac(machine_name, &mac_container, CONTAINER_HASH_KEY, 0);
+                r = net_generate_mac(machine_name, &mac_container, CONTAINER_HASH_KEY, 0);
                 if (r < 0)
                         return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
         } else
                 mac_container = *provided_mac;
 
-        r = generate_mac(machine_name, &mac_host, HOST_HASH_KEY, 0);
+        r = net_generate_mac(machine_name, &mac_host, HOST_HASH_KEY, 0);
         if (r < 0)
                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
 
@@ -306,11 +238,11 @@ int setup_veth_extra(
         STRV_FOREACH_PAIR(a, b, pairs) {
                 struct ether_addr mac_host, mac_container;
 
-                r = generate_mac(machine_name, &mac_container, VETH_EXTRA_CONTAINER_HASH_KEY, idx);
+                r = net_generate_mac(machine_name, &mac_container, VETH_EXTRA_CONTAINER_HASH_KEY, idx);
                 if (r < 0)
                         return log_error_errno(r, "Failed to generate predictable MAC address for container side of extra veth link: %m");
 
-                r = generate_mac(machine_name, &mac_host, VETH_EXTRA_HOST_HASH_KEY, idx);
+                r = net_generate_mac(machine_name, &mac_host, VETH_EXTRA_HOST_HASH_KEY, idx);
                 if (r < 0)
                         return log_error_errno(r, "Failed to generate predictable MAC address for host side of extra veth link: %m");
 
@@ -480,7 +412,7 @@ static int test_network_interface_initialized(const char *name) {
         if (r < 0)
                 return log_error_errno(r, "Failed to get device %s: %m", name);
 
-        r = sd_device_get_is_initialized(d);
+        r = device_is_processed(d);
         if (r < 0)
                 return log_error_errno(r, "Failed to determine whether interface %s is initialized: %m", name);
         if (r == 0)
@@ -505,42 +437,302 @@ int test_network_interfaces_initialized(char **iface_pairs) {
         return 0;
 }
 
-int move_network_interfaces(int netns_fd, char **iface_pairs) {
+int resolve_network_interface_names(char **iface_pairs) {
         _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
         int r;
 
-        if (strv_isempty(iface_pairs))
-                return 0;
+        /* Due to a bug in kernel fixed by 8e15aee621618a3ee3abecaf1fd8c1428098b7ef (v6.6, backported to
+         * 6.1.60 and 6.5.9), an interface with alternative names cannot be resolved by the alternative name
+         * if the interface is moved to another network namespace. Hence, we need to adjust the provided
+         * names before moving interfaces to container namespace. */
 
-        r = sd_netlink_open(&rtnl);
+        STRV_FOREACH_PAIR(from, to, iface_pairs) {
+                _cleanup_free_ char *name = NULL;
+                _cleanup_strv_free_ char **altnames = NULL;
+
+                r = rtnl_resolve_ifname_full(&rtnl, _RESOLVE_IFNAME_ALL, *from, &name, &altnames);
+                if (r < 0)
+                        return r;
+
+                /* Always use the resolved name for 'from'. */
+                free_and_replace(*from, name);
+
+                /* If the name 'to' is assigned as an alternative name, we cannot rename the interface.
+                 * Hence, use the assigned interface name (including the alternative names) as is, and
+                 * use the resolved name for 'to'. */
+                if (strv_contains(altnames, *to)) {
+                        r = free_and_strdup_warn(to, *from);
+                        if (r < 0)
+                                return r;
+                }
+        }
+        return 0;
+}
+
+static int netns_child_begin(int netns_fd, int *ret_original_netns_fd) {
+        _cleanup_close_ int original_netns_fd = -EBADF;
+        int r;
+
+        assert(netns_fd >= 0);
+
+        if (ret_original_netns_fd) {
+                r = namespace_open(0,
+                                   /* ret_pidns_fd = */ NULL,
+                                   /* ret_mntns_fd = */ NULL,
+                                   &original_netns_fd,
+                                   /* ret_userns_fd = */ NULL,
+                                   /* ret_root_fd = */ NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to open original network namespace: %m");
+        }
+
+        r = namespace_enter(/* pidns_fd = */ -EBADF,
+                            /* mntns_fd = */ -EBADF,
+                            netns_fd,
+                            /* userns_fd = */ -EBADF,
+                            /* root_fd = */ -EBADF);
         if (r < 0)
-                return log_error_errno(r, "Failed to connect to netlink: %m");
+                return log_error_errno(r, "Failed to enter child network namespace: %m");
 
-        STRV_FOREACH_PAIR(i, b, iface_pairs) {
-                _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
-                int ifi;
+        r = umount_recursive("/sys/", /* flags = */ 0);
+        if (r < 0)
+                log_debug_errno(r, "Failed to unmount directories below /sys/, ignoring: %m");
 
-                ifi = rtnl_resolve_interface_or_warn(&rtnl, *i);
-                if (ifi < 0)
-                        return ifi;
+        (void) mkdir_p("/sys/", 0755);
+
+        /* Populate new sysfs instance associated with the client netns, to make sd_device usable. */
+        r = mount_nofollow_verbose(LOG_ERR, "sysfs", "/sys/", "sysfs",
+                                   MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, /* opts = */ NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to mount sysfs on /sys/: %m");
+
+        /* udev_avaliable() might be called previously and the result may be cached.
+         * Now, we (re-)mount sysfs. Hence, we need to reset the cache. */
+        reset_cached_udev_availability();
+
+        if (ret_original_netns_fd)
+                *ret_original_netns_fd = TAKE_FD(original_netns_fd);
+
+        return 0;
+}
+
+static int netns_fork_and_wait(int netns_fd, int *ret_original_netns_fd) {
+        int r;
+
+        assert(netns_fd >= 0);
+
+        r = safe_fork("(sd-netns)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_WAIT|FORK_LOG|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to fork process (sd-netns): %m");
+        if (r == 0) {
+                if (netns_child_begin(netns_fd, ret_original_netns_fd) < 0)
+                        _exit(EXIT_FAILURE);
+
+                return 0;
+        }
+
+        if (ret_original_netns_fd)
+                *ret_original_netns_fd = -EBADF;
 
-                r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
+        return 1;
+}
+
+static int move_wlan_interface_impl(sd_netlink **genl, int netns_fd, sd_device *dev) {
+        _cleanup_(sd_netlink_unrefp) sd_netlink *our_genl = NULL;
+        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
+        int r;
+
+        assert(netns_fd >= 0);
+        assert(dev);
+
+        if (!genl)
+                genl = &our_genl;
+        if (!*genl) {
+                r = sd_genl_socket_open(genl);
                 if (r < 0)
-                        return log_error_errno(r, "Failed to allocate netlink message: %m");
+                        return log_error_errno(r, "Failed to connect to generic netlink: %m");
+        }
+
+        r = sd_genl_message_new(*genl, NL80211_GENL_NAME, NL80211_CMD_SET_WIPHY_NETNS, &m);
+        if (r < 0)
+                return log_device_error_errno(dev, r, "Failed to allocate netlink message: %m");
+
+        uint32_t phy_index;
+        r = device_get_sysattr_u32(dev, "phy80211/index", &phy_index);
+        if (r < 0)
+                return log_device_error_errno(dev, r, "Failed to get phy index: %m");
+
+        r = sd_netlink_message_append_u32(m, NL80211_ATTR_WIPHY, phy_index);
+        if (r < 0)
+                return log_device_error_errno(dev, r, "Failed to append phy index to netlink message: %m");
+
+        r = sd_netlink_message_append_u32(m, NL80211_ATTR_NETNS_FD, netns_fd);
+        if (r < 0)
+                return log_device_error_errno(dev, r, "Failed to append namespace fd to netlink message: %m");
+
+        r = sd_netlink_call(*genl, m, 0, NULL);
+        if (r < 0)
+                return log_device_error_errno(dev, r, "Failed to move interface to namespace: %m");
+
+        return 0;
+}
+
+static int move_wlan_interface_one(
+                        sd_netlink **rtnl,
+                        sd_netlink **genl,
+                        int *temp_netns_fd,
+                        int netns_fd,
+                        sd_device *dev,
+                        const char *name) {
+
+        int r;
+
+        assert(rtnl);
+        assert(genl);
+        assert(temp_netns_fd);
+        assert(netns_fd >= 0);
+        assert(dev);
 
-                r = sd_netlink_message_append_u32(m, IFLA_NET_NS_FD, netns_fd);
+        if (!name)
+                return move_wlan_interface_impl(genl, netns_fd, dev);
+
+        /* The command NL80211_CMD_SET_WIPHY_NETNS takes phy instead of network interface, and does not take
+         * an interface name in the passed network namespace. Hence, we need to move the phy and interface to
+         * a temporary network namespace, rename the interface in it, and move them to the requested netns. */
+
+        if (*temp_netns_fd < 0) {
+                r = netns_acquire();
                 if (r < 0)
-                        return log_error_errno(r, "Failed to append namespace fd to netlink message: %m");
+                        return log_error_errno(r, "Failed to acquire new network namespace: %m");
+                *temp_netns_fd = r;
+        }
 
-                if (!streq(*b, *i)) {
-                        r = sd_netlink_message_append_string(m, IFLA_IFNAME, *b);
-                        if (r < 0)
-                                return log_error_errno(r, "Failed to add netlink interface name: %m");
+        r = move_wlan_interface_impl(genl, *temp_netns_fd, dev);
+        if (r < 0)
+                return r;
+
+        const char *sysname;
+        r = sd_device_get_sysname(dev, &sysname);
+        if (r < 0)
+                return log_device_error_errno(dev, r, "Failed to get interface name: %m");
+
+        r = netns_fork_and_wait(*temp_netns_fd, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to fork process (nspawn-rename-wlan): %m");
+        if (r == 0) {
+                _cleanup_(sd_device_unrefp) sd_device *temp_dev = NULL;
+
+                r = rtnl_rename_link(NULL, sysname, name);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to rename network interface '%s' to '%s': %m", sysname, name);
+                        goto finalize;
                 }
 
-                r = sd_netlink_call(rtnl, m, 0, NULL);
+                r = sd_device_new_from_ifname(&temp_dev, name);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to acquire device '%s': %m", name);
+                        goto finalize;
+                }
+
+                r = move_wlan_interface_impl(NULL, netns_fd, temp_dev);
+
+        finalize:
+                _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
+        }
+
+        return 0;
+}
+
+static int move_network_interface_one(sd_netlink **rtnl, int netns_fd, sd_device *dev, const char *name) {
+        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
+        int r;
+
+        assert(rtnl);
+        assert(netns_fd >= 0);
+        assert(dev);
+
+        if (!*rtnl) {
+                r = sd_netlink_open(rtnl);
                 if (r < 0)
-                        return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
+                        return log_error_errno(r, "Failed to connect to rtnetlink: %m");
+        }
+
+        int ifindex;
+        r = sd_device_get_ifindex(dev, &ifindex);
+        if (r < 0)
+                return log_device_error_errno(dev, r, "Failed to get ifindex: %m");
+
+        r = sd_rtnl_message_new_link(*rtnl, &m, RTM_SETLINK, ifindex);
+        if (r < 0)
+                return log_device_error_errno(dev, r, "Failed to allocate netlink message: %m");
+
+        r = sd_netlink_message_append_u32(m, IFLA_NET_NS_FD, netns_fd);
+        if (r < 0)
+                return log_device_error_errno(dev, r, "Failed to append namespace fd to netlink message: %m");
+
+        if (name) {
+                r = sd_netlink_message_append_string(m, IFLA_IFNAME, name);
+                if (r < 0)
+                        return log_device_error_errno(dev, r, "Failed to add netlink interface name: %m");
+        }
+
+        r = sd_netlink_call(*rtnl, m, 0, NULL);
+        if (r < 0)
+                return log_device_error_errno(dev, r, "Failed to move interface to namespace: %m");
+
+        return 0;
+}
+
+int move_network_interfaces(int netns_fd, char **iface_pairs) {
+        _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL, *genl = NULL;
+        _cleanup_close_ int temp_netns_fd = -EBADF;
+        int r;
+
+        assert(netns_fd >= 0);
+
+        if (strv_isempty(iface_pairs))
+                return 0;
+
+        STRV_FOREACH_PAIR(from, to, iface_pairs) {
+                _cleanup_(sd_device_unrefp) sd_device *dev = NULL;
+                const char *name;
+
+                name = streq(*from, *to) ? NULL : *to;
+
+                r = sd_device_new_from_ifname(&dev, *from);
+                if (r < 0)
+                        return log_error_errno(r, "Unknown interface name %s: %m", *from);
+
+                if (device_is_devtype(dev, "wlan"))
+                        r = move_wlan_interface_one(&rtnl, &genl, &temp_netns_fd, netns_fd, dev, name);
+                else
+                        r = move_network_interface_one(&rtnl, netns_fd, dev, name);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+int move_back_network_interfaces(int child_netns_fd, char **interface_pairs) {
+        _cleanup_close_ int parent_netns_fd = -EBADF;
+        int r;
+
+        assert(child_netns_fd >= 0);
+
+        if (strv_isempty(interface_pairs))
+                return 0;
+
+        r = netns_fork_and_wait(child_netns_fd, &parent_netns_fd);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                /* Reverse network interfaces pair list so that interfaces get their initial name back.
+                 * This is about ensuring interfaces get their old name back when being moved back. */
+                interface_pairs = strv_reverse(interface_pairs);
+
+                r = move_network_interfaces(parent_netns_fd, interface_pairs);
+                _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
         }
 
         return 0;
@@ -568,7 +760,7 @@ int setup_macvlan(const char *machine_name, pid_t pid, char **iface_pairs) {
                 if (ifi < 0)
                         return ifi;
 
-                r = generate_mac(machine_name, &mac, MACVLAN_HASH_KEY, idx++);
+                r = net_generate_mac(machine_name, &mac, MACVLAN_HASH_KEY, idx++);
                 if (r < 0)
                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
 
@@ -584,7 +776,7 @@ int setup_macvlan(const char *machine_name, pid_t pid, char **iface_pairs) {
                 if (!n)
                         return log_oom();
 
-                shortened = shorten_ifname(n);
+                shortened = net_shorten_ifname(n, /* check_naming_scheme= */ true);
 
                 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
                 if (r < 0)
@@ -661,7 +853,7 @@ int setup_ipvlan(const char *machine_name, pid_t pid, char **iface_pairs) {
                 if (!n)
                         return log_oom();
 
-                shortened = shorten_ifname(n);
+                shortened = net_shorten_ifname(n, /* check_naming_scheme= */ true);
 
                 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
                 if (r < 0)
diff --git a/src/nspawn/nspawn-network.h b/src/nspawn/nspawn-network.h
index a785f8e..840fe15 100644
--- a/src/nspawn/nspawn-network.h
+++ b/src/nspawn/nspawn-network.h
@@ -8,6 +8,7 @@
 #include "ether-addr-util.h"
 
 int test_network_interfaces_initialized(char **iface_pairs);
+int resolve_network_interface_names(char **iface_pairs);
 
 int setup_veth(const char *machine_name, pid_t pid, char iface_name[IFNAMSIZ], bool bridge, const struct ether_addr *provided_mac);
 int setup_veth_extra(const char *machine_name, pid_t pid, char **pairs);
@@ -19,6 +20,7 @@ int setup_macvlan(const char *machine_name, pid_t pid, char **iface_pairs);
 int setup_ipvlan(const char *machine_name, pid_t pid, char **iface_pairs);
 
 int move_network_interfaces(int netns_fd, char **iface_pairs);
+int move_back_network_interfaces(int child_netns_fd, char **interface_pairs);
 
 int veth_extra_parse(char ***l, const char *p);
 
diff --git a/src/nspawn/nspawn-oci.c b/src/nspawn/nspawn-oci.c
index 8f1ac7c..a00934c 100644
--- a/src/nspawn/nspawn-oci.c
+++ b/src/nspawn/nspawn-oci.c
@@ -409,18 +409,18 @@ static int oci_user(const char *name, JsonVariant *v, JsonDispatchFlags flags, v
 static int oci_process(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
 
         static const JsonDispatch table[] = {
-                { "terminal",        JSON_VARIANT_BOOLEAN, oci_terminal,          0,                                     0               },
-                { "consoleSize",     JSON_VARIANT_OBJECT,  oci_console_size,      0,                                     0               },
-                { "cwd",             JSON_VARIANT_STRING,  oci_absolute_path,     offsetof(Settings, working_directory), 0               },
-                { "env",             JSON_VARIANT_ARRAY,   oci_env,               offsetof(Settings, environment),       0               },
-                { "args",            JSON_VARIANT_ARRAY,   oci_args,              offsetof(Settings, parameters),        0               },
-                { "rlimits",         JSON_VARIANT_ARRAY,   oci_rlimits,           0,                                     0               },
-                { "apparmorProfile", JSON_VARIANT_STRING,  oci_unsupported,       0,                                     JSON_PERMISSIVE },
-                { "capabilities",    JSON_VARIANT_OBJECT,  oci_capabilities,      0,                                     0               },
-                { "noNewPrivileges", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(Settings, no_new_privileges), 0               },
-                { "oomScoreAdj",     JSON_VARIANT_INTEGER, oci_oom_score_adj,     0,                                     0               },
-                { "selinuxLabel",    JSON_VARIANT_STRING,  oci_unsupported,       0,                                     JSON_PERMISSIVE },
-                { "user",            JSON_VARIANT_OBJECT,  oci_user,              0,                                     0               },
+                { "terminal",        JSON_VARIANT_BOOLEAN, oci_terminal,           0,                                     0               },
+                { "consoleSize",     JSON_VARIANT_OBJECT,  oci_console_size,       0,                                     0               },
+                { "cwd",             JSON_VARIANT_STRING,  oci_absolute_path,      offsetof(Settings, working_directory), 0               },
+                { "env",             JSON_VARIANT_ARRAY,   oci_env,                offsetof(Settings, environment),       0               },
+                { "args",            JSON_VARIANT_ARRAY,   oci_args,               offsetof(Settings, parameters),        0               },
+                { "rlimits",         JSON_VARIANT_ARRAY,   oci_rlimits,            0,                                     0               },
+                { "apparmorProfile", JSON_VARIANT_STRING,  oci_unsupported,        0,                                     JSON_PERMISSIVE },
+                { "capabilities",    JSON_VARIANT_OBJECT,  oci_capabilities,       0,                                     0               },
+                { "noNewPrivileges", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(Settings, no_new_privileges), 0               },
+                { "oomScoreAdj",     JSON_VARIANT_INTEGER, oci_oom_score_adj,      0,                                     0               },
+                { "selinuxLabel",    JSON_VARIANT_STRING,  oci_unsupported,        0,                                     JSON_PERMISSIVE },
+                { "user",            JSON_VARIANT_OBJECT,  oci_user,               0,                                     0               },
                 {}
         };
 
@@ -432,8 +432,8 @@ static int oci_root(const char *name, JsonVariant *v, JsonDispatchFlags flags, v
         int r;
 
         static const JsonDispatch table[] = {
-                { "path",     JSON_VARIANT_STRING,  json_dispatch_string,  offsetof(Settings, root)      },
-                { "readonly", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(Settings, read_only) },
+                { "path",     JSON_VARIANT_STRING,  json_dispatch_string,   offsetof(Settings, root)      },
+                { "readonly", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(Settings, read_only) },
                 {}
         };
 
@@ -863,7 +863,7 @@ static int oci_devices(const char *name, JsonVariant *v, JsonDispatchFlags flags
 
                         if (node->major == UINT_MAX || node->minor == UINT_MAX) {
                                 r = json_log(e, flags, SYNTHETIC_ERRNO(EINVAL),
-                                             "Major/minor required when device node is device node");
+                                             "Major/minor required when device node is device node.");
                                 goto fail_element;
                         }
 
@@ -1148,7 +1148,7 @@ static int oci_cgroup_memory_limit(const char *name, JsonVariant *v, JsonDispatc
 
         if (!json_variant_is_unsigned(v))
                 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
-                                "Memory limit is not an unsigned integer");
+                                "Memory limit is not an unsigned integer.");
 
         k = json_variant_unsigned(v);
         if (k >= UINT64_MAX)
@@ -1588,7 +1588,7 @@ static int oci_sysctl(const char *name, JsonVariant *v, JsonDispatchFlags flags,
                         return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
                                         "sysctl key invalid, refusing: %s", k);
 
-                r = strv_extend_strv(&s->sysctl, STRV_MAKE(k, m), false);
+                r = strv_extend_many(&s->sysctl, k, m);
                 if (r < 0)
                         return log_oom();
         }
@@ -1716,7 +1716,7 @@ static int oci_seccomp_archs(const char *name, JsonVariant *v, JsonDispatchFlags
 
                 if (!json_variant_is_string(e))
                         return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL),
-                                        "Architecture entry is not a string");
+                                        "Architecture entry is not a string.");
 
                 r = oci_seccomp_arch_from_string(json_variant_string(e), &a);
                 if (r < 0)
@@ -1837,10 +1837,8 @@ static int oci_seccomp_syscalls(const char *name, JsonVariant *v, JsonDispatchFl
                 if (r < 0)
                         return r;
 
-                if (strv_isempty(rule.names)) {
-                        json_log(e, flags, 0, "System call name list is empty.");
-                        return -EINVAL;
-                }
+                if (strv_isempty(rule.names))
+                        return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "System call name list is empty.");
 
                 STRV_FOREACH(i, rule.names) {
                         int nr;
@@ -2082,7 +2080,7 @@ static int oci_hooks_array(const char *name, JsonVariant *v, JsonDispatchFlags f
                         return r;
                 }
 
-                (*n_array) ++;
+                (*n_array)++;
         }
 
         return 0;
diff --git a/src/nspawn/nspawn-register.c b/src/nspawn/nspawn-register.c
index 66962d7..b63516d 100644
--- a/src/nspawn/nspawn-register.c
+++ b/src/nspawn/nspawn-register.c
@@ -297,15 +297,12 @@ int allocate_scope(
 
         description = strjoina("Container ", machine_name);
 
-        if (allow_pidfd) {
-                _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
-                r = pidref_set_pid(&pidref, pid);
-                if (r < 0)
-                        return log_error_errno(r, "Failed to allocate PID reference: %m");
+        _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+        r = pidref_set_pid(&pidref, pid);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate PID reference: %m");
 
-                r = bus_append_scope_pidref(m, &pidref);
-        } else
-                r = sd_bus_message_append(m, "(sv)", "PIDs", "au", 1, pid);
+        r = bus_append_scope_pidref(m, &pidref, allow_pidfd);
         if (r < 0)
                 return bus_log_create_error(r);
 
@@ -368,7 +365,11 @@ int allocate_scope(
         if (r < 0)
                 return bus_log_parse_error(r);
 
-        r = bus_wait_for_jobs_one(w, object, false, NULL);
+        r = bus_wait_for_jobs_one(
+                        w,
+                        object,
+                        BUS_WAIT_JOBS_LOG_ERROR,
+                        /* extra_args= */ NULL);
         if (r < 0)
                 return r;
 
diff --git a/src/nspawn/nspawn-settings.c b/src/nspawn/nspawn-settings.c
index 161b1c1..132a543 100644
--- a/src/nspawn/nspawn-settings.c
+++ b/src/nspawn/nspawn-settings.c
@@ -835,21 +835,21 @@ int config_parse_cpu_affinity(
 DEFINE_CONFIG_PARSE_ENUM(config_parse_resolv_conf, resolv_conf_mode, ResolvConfMode, "Failed to parse resolv.conf mode");
 
 static const char *const resolv_conf_mode_table[_RESOLV_CONF_MODE_MAX] = {
-        [RESOLV_CONF_OFF] = "off",
-        [RESOLV_CONF_COPY_HOST] = "copy-host",
-        [RESOLV_CONF_COPY_STATIC] = "copy-static",
-        [RESOLV_CONF_COPY_UPLINK] = "copy-uplink",
-        [RESOLV_CONF_COPY_STUB] = "copy-stub",
-        [RESOLV_CONF_REPLACE_HOST] = "replace-host",
+        [RESOLV_CONF_OFF]            = "off",
+        [RESOLV_CONF_COPY_HOST]      = "copy-host",
+        [RESOLV_CONF_COPY_STATIC]    = "copy-static",
+        [RESOLV_CONF_COPY_UPLINK]    = "copy-uplink",
+        [RESOLV_CONF_COPY_STUB]      = "copy-stub",
+        [RESOLV_CONF_REPLACE_HOST]   = "replace-host",
         [RESOLV_CONF_REPLACE_STATIC] = "replace-static",
         [RESOLV_CONF_REPLACE_UPLINK] = "replace-uplink",
-        [RESOLV_CONF_REPLACE_STUB] = "replace-stub",
-        [RESOLV_CONF_BIND_HOST] = "bind-host",
-        [RESOLV_CONF_BIND_STATIC] = "bind-static",
-        [RESOLV_CONF_BIND_UPLINK] = "bind-uplink",
-        [RESOLV_CONF_BIND_STUB] = "bind-stub",
-        [RESOLV_CONF_DELETE] = "delete",
-        [RESOLV_CONF_AUTO] = "auto",
+        [RESOLV_CONF_REPLACE_STUB]   = "replace-stub",
+        [RESOLV_CONF_BIND_HOST]      = "bind-host",
+        [RESOLV_CONF_BIND_STATIC]    = "bind-static",
+        [RESOLV_CONF_BIND_UPLINK]    = "bind-uplink",
+        [RESOLV_CONF_BIND_STUB]      = "bind-stub",
+        [RESOLV_CONF_DELETE]         = "delete",
+        [RESOLV_CONF_AUTO]           = "auto",
 };
 
 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(resolv_conf_mode, ResolvConfMode, RESOLV_CONF_AUTO);
@@ -914,15 +914,15 @@ int config_parse_link_journal(
         return 0;
 }
 
-DEFINE_CONFIG_PARSE_ENUM(config_parse_timezone, timezone_mode, TimezoneMode, "Failed to parse timezone mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_timezone_mode, timezone_mode, TimezoneMode, "Failed to parse timezone mode");
 
 static const char *const timezone_mode_table[_TIMEZONE_MODE_MAX] = {
-        [TIMEZONE_OFF] = "off",
-        [TIMEZONE_COPY] = "copy",
-        [TIMEZONE_BIND] = "bind",
+        [TIMEZONE_OFF]     = "off",
+        [TIMEZONE_COPY]    = "copy",
+        [TIMEZONE_BIND]    = "bind",
         [TIMEZONE_SYMLINK] = "symlink",
-        [TIMEZONE_DELETE] = "delete",
-        [TIMEZONE_AUTO] = "auto",
+        [TIMEZONE_DELETE]  = "delete",
+        [TIMEZONE_AUTO]    = "auto",
 };
 
 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(timezone_mode, TimezoneMode, TIMEZONE_AUTO);
@@ -930,10 +930,10 @@ DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(timezone_mode, TimezoneMode, TIMEZONE_AU
 DEFINE_CONFIG_PARSE_ENUM(config_parse_userns_ownership, user_namespace_ownership, UserNamespaceOwnership, "Failed to parse user namespace ownership mode");
 
 static const char *const user_namespace_ownership_table[_USER_NAMESPACE_OWNERSHIP_MAX] = {
-        [USER_NAMESPACE_OWNERSHIP_OFF] = "off",
+        [USER_NAMESPACE_OWNERSHIP_OFF]   = "off",
         [USER_NAMESPACE_OWNERSHIP_CHOWN] = "chown",
-        [USER_NAMESPACE_OWNERSHIP_MAP] = "map",
-        [USER_NAMESPACE_OWNERSHIP_AUTO] = "auto",
+        [USER_NAMESPACE_OWNERSHIP_MAP]   = "map",
+        [USER_NAMESPACE_OWNERSHIP_AUTO]  = "auto",
 };
 
 DEFINE_STRING_TABLE_LOOKUP(user_namespace_ownership, UserNamespaceOwnership);
diff --git a/src/nspawn/nspawn-settings.h b/src/nspawn/nspawn-settings.h
index 8edf8a3..0bcb285 100644
--- a/src/nspawn/nspawn-settings.h
+++ b/src/nspawn/nspawn-settings.h
@@ -268,7 +268,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_oom_score_adjust);
 CONFIG_PARSER_PROTOTYPE(config_parse_cpu_affinity);
 CONFIG_PARSER_PROTOTYPE(config_parse_resolv_conf);
 CONFIG_PARSER_PROTOTYPE(config_parse_link_journal);
-CONFIG_PARSER_PROTOTYPE(config_parse_timezone);
+CONFIG_PARSER_PROTOTYPE(config_parse_timezone_mode);
 CONFIG_PARSER_PROTOTYPE(config_parse_userns_chown);
 CONFIG_PARSER_PROTOTYPE(config_parse_userns_ownership);
 CONFIG_PARSER_PROTOTYPE(config_parse_bind_user);
diff --git a/src/nspawn/nspawn-setuid.c b/src/nspawn/nspawn-setuid.c
index 2d67c3d..e350b22 100644
--- a/src/nspawn/nspawn-setuid.c
+++ b/src/nspawn/nspawn-setuid.c
@@ -56,6 +56,8 @@ int change_uid_gid_raw(
                 size_t n_supplementary_gids,
                 bool chown_stdio) {
 
+        int r;
+
         if (!uid_is_valid(uid))
                 uid = 0;
         if (!gid_is_valid(gid))
@@ -67,14 +69,9 @@ int change_uid_gid_raw(
                 (void) fchown(STDERR_FILENO, uid, gid);
         }
 
-        if (setgroups(n_supplementary_gids, supplementary_gids) < 0)
-                return log_error_errno(errno, "Failed to set auxiliary groups: %m");
-
-        if (setresgid(gid, gid, gid) < 0)
-                return log_error_errno(errno, "setresgid() failed: %m");
-
-        if (setresuid(uid, uid, uid) < 0)
-                return log_error_errno(errno, "setresuid() failed: %m");
+        r = fully_set_uid_gid(uid, gid, supplementary_gids, n_supplementary_gids);
+        if (r < 0)
+                return log_error_errno(r, "Changing privileges failed: %m");
 
         return 0;
 }
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index e46cc1c..5842d3b 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -1,10 +1,7 @@
 /* SPDX-License-Identifier: LGPL-2.1-or-later */
 
-#if HAVE_BLKID
-#endif
 #include <errno.h>
 #include <getopt.h>
-#include <linux/fs.h>
 #include <linux/loop.h>
 #if HAVE_SELINUX
 #include <selinux/selinux.h>
@@ -12,6 +9,7 @@
 #include <stdlib.h>
 #include <sys/file.h>
 #include <sys/ioctl.h>
+#include <sys/mount.h>
 #include <sys/personality.h>
 #include <sys/prctl.h>
 #include <sys/types.h>
@@ -19,6 +17,8 @@
 #include <termios.h>
 #include <unistd.h>
 
+#include <linux/fs.h> /* Must be included after <sys/mount.h> */
+
 #include "sd-bus.h"
 #include "sd-daemon.h"
 #include "sd-id128.h"
@@ -84,6 +84,7 @@
 #include "nspawn-stub-pid1.h"
 #include "nspawn-util.h"
 #include "nspawn.h"
+#include "nsresource.h"
 #include "nulstr-util.h"
 #include "os-util.h"
 #include "pager.h"
@@ -112,6 +113,7 @@
 #include "umask-util.h"
 #include "unit-name.h"
 #include "user-util.h"
+#include "vpick.h"
 
 /* The notify socket inside the container it can use to talk to nspawn using the sd_notify(3) protocol */
 #define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"
@@ -229,13 +231,14 @@ static DeviceNode* arg_extra_nodes = NULL;
 static size_t arg_n_extra_nodes = 0;
 static char **arg_sysctl = NULL;
 static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
-static MachineCredential *arg_credentials = NULL;
-static size_t arg_n_credentials = 0;
+static MachineCredentialContext arg_credentials = {};
 static char **arg_bind_user = NULL;
 static bool arg_suppress_sync = false;
 static char *arg_settings_filename = NULL;
 static Architecture arg_architecture = _ARCHITECTURE_INVALID;
 static ImagePolicy *arg_image_policy = NULL;
+static char *arg_background = NULL;
+static bool arg_privileged = false;
 
 STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
 STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
@@ -266,11 +269,13 @@ STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep);
 #if HAVE_SECCOMP
 STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
 #endif
+STATIC_DESTRUCTOR_REGISTER(arg_credentials, machine_credential_context_done);
 STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
 STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
 STATIC_DESTRUCTOR_REGISTER(arg_bind_user, strv_freep);
 STATIC_DESTRUCTOR_REGISTER(arg_settings_filename, freep);
 STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep);
+STATIC_DESTRUCTOR_REGISTER(arg_background, freep);
 
 static int handle_arg_console(const char *arg) {
         if (streq(arg, "help")) {
@@ -289,7 +294,7 @@ static int handle_arg_console(const char *arg) {
         else if (streq(arg, "passive"))
                 arg_console_mode = CONSOLE_PASSIVE;
         else if (streq(arg, "pipe")) {
-                if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
+                if (isatty(STDIN_FILENO) && isatty(STDOUT_FILENO))
                         log_full(arg_quiet ? LOG_DEBUG : LOG_NOTICE,
                                  "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. "
                                  "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. "
@@ -297,7 +302,7 @@ static int handle_arg_console(const char *arg) {
 
                 arg_console_mode = CONSOLE_PIPE;
         } else if (streq(arg, "autopipe")) {
-                if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
+                if (isatty(STDIN_FILENO) && isatty(STDOUT_FILENO))
                         arg_console_mode = CONSOLE_INTERACTIVE;
                 else
                         arg_console_mode = CONSOLE_PIPE;
@@ -324,8 +329,8 @@ static int help(void) {
                "     --version              Print version string\n"
                "  -q --quiet                Do not show status information\n"
                "     --no-pager             Do not pipe output into a pager\n"
-               "     --settings=BOOLEAN     Load additional settings from .nspawn file\n\n"
-               "%3$sImage:%4$s\n"
+               "     --settings=BOOLEAN     Load additional settings from .nspawn file\n"
+               "\n%3$sImage:%4$s\n"
                "  -D --directory=PATH       Root directory for the container\n"
                "     --template=PATH        Initialize root directory from template directory,\n"
                "                            if missing\n"
@@ -344,8 +349,8 @@ static int help(void) {
                "                            'base64:'\n"
                "     --verity-data=PATH     Specify hash device for verity\n"
                "     --pivot-root=PATH[:PATH]\n"
-               "                            Pivot root to given directory in the container\n\n"
-               "%3$sExecution:%4$s\n"
+               "                            Pivot root to given directory in the container\n"
+               "\n%3$sExecution:%4$s\n"
                "  -a --as-pid2              Maintain a stub init as PID1, invoke binary as PID2\n"
                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
                "     --chdir=PATH           Set working directory in the container\n"
@@ -354,18 +359,18 @@ static int help(void) {
                "     --kill-signal=SIGNAL   Select signal to use for shutting down PID 1\n"
                "     --notify-ready=BOOLEAN Receive notifications from the child init process\n"
                "     --suppress-sync=BOOLEAN\n"
-               "                            Suppress any form of disk data synchronization\n\n"
-               "%3$sSystem Identity:%4$s\n"
+               "                            Suppress any form of disk data synchronization\n"
+               "\n%3$sSystem Identity:%4$s\n"
                "  -M --machine=NAME         Set the machine name for the container\n"
                "     --hostname=NAME        Override the hostname for the container\n"
-               "     --uuid=UUID            Set a specific machine UUID for the container\n\n"
-               "%3$sProperties:%4$s\n"
+               "     --uuid=UUID            Set a specific machine UUID for the container\n"
+               "\n%3$sProperties:%4$s\n"
                "  -S --slice=SLICE          Place the container in the specified slice\n"
                "     --property=NAME=VALUE  Set scope unit property\n"
                "     --register=BOOLEAN     Register container as machine\n"
                "     --keep-unit            Do not register a scope for the machine, reuse\n"
-               "                            the service unit nspawn is running in\n\n"
-               "%3$sUser Namespacing:%4$s\n"
+               "                            the service unit nspawn is running in\n"
+               "\n%3$sUser Namespacing:%4$s\n"
                "     --private-users=no     Run without user namespacing\n"
                "     --private-users=yes|pick|identity\n"
                "                            Run within user namespace, autoselect UID/GID range\n"
@@ -375,8 +380,8 @@ static int help(void) {
                "                            Adjust ('chown') or map ('map') OS tree ownership\n"
                "                            to private UID/GID range\n"
                "  -U                        Equivalent to --private-users=pick and\n"
-               "                            --private-users-ownership=auto\n\n"
-               "%3$sNetworking:%4$s\n"
+               "                            --private-users-ownership=auto\n"
+               "\n%3$sNetworking:%4$s\n"
                "     --private-network      Disable network in container\n"
                "     --network-interface=HOSTIF[:CONTAINERIF]\n"
                "                            Assign an existing network interface to the\n"
@@ -401,8 +406,8 @@ static int help(void) {
                "                            Set network namespace to the one represented by\n"
                "                            the specified kernel namespace file node\n"
                "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
-               "                            Expose a container IP port on the host\n\n"
-               "%3$sSecurity:%4$s\n"
+               "                            Expose a container IP port on the host\n"
+               "\n%3$sSecurity:%4$s\n"
                "     --capability=CAP       In addition to the default, retain specified\n"
                "                            capability\n"
                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
@@ -417,20 +422,20 @@ static int help(void) {
                "                            processes in the container\n"
                "  -L --selinux-apifs-context=SECLABEL\n"
                "                            Set the SELinux security context to be used by\n"
-               "                            API/tmpfs file systems in the container\n\n"
-               "%3$sResources:%4$s\n"
+               "                            API/tmpfs file systems in the container\n"
+               "\n%3$sResources:%4$s\n"
                "     --rlimit=NAME=LIMIT    Set a resource limit for the payload\n"
                "     --oom-score-adjust=VALUE\n"
                "                            Adjust the OOM score value for the payload\n"
                "     --cpu-affinity=CPUS    Adjust the CPU affinity of the container\n"
-               "     --personality=ARCH     Pick personality for this container\n\n"
-               "%3$sIntegration:%4$s\n"
+               "     --personality=ARCH     Pick personality for this container\n"
+               "\n%3$sIntegration:%4$s\n"
                "     --resolv-conf=MODE     Select mode of /etc/resolv.conf initialization\n"
                "     --timezone=MODE        Select mode of /etc/localtime initialization\n"
                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, \n"
                "                            host, try-guest, try-host\n"
-               "  -j                        Equivalent to --link-journal=try-guest\n\n"
-               "%3$sMounts:%4$s\n"
+               "  -j                        Equivalent to --link-journal=try-guest\n"
+               "\n%3$sMounts:%4$s\n"
                "     --bind=PATH[:PATH[:OPTIONS]]\n"
                "                            Bind mount a file or directory from the host into\n"
                "                            the container\n"
@@ -444,12 +449,13 @@ static int help(void) {
                "                            the container\n"
                "     --overlay-ro=PATH[:PATH...]:PATH\n"
                "                            Similar, but creates a read-only overlay mount\n"
-               "     --bind-user=NAME       Bind user from host to container\n\n"
-               "%3$sInput/Output:%4$s\n"
+               "     --bind-user=NAME       Bind user from host to container\n"
+               "\n%3$sInput/Output:%4$s\n"
                "     --console=MODE         Select how stdin/stdout/stderr and /dev/console are\n"
                "                            set up for the container.\n"
-               "  -P --pipe                 Equivalent to --console=pipe\n\n"
-               "%3$sCredentials:%4$s\n"
+               "  -P --pipe                 Equivalent to --console=pipe\n"
+               "     --background=COLOR     Set ANSI color for background\n"
+               "\n%3$sCredentials:%4$s\n"
                "     --set-credential=ID:VALUE\n"
                "                            Pass a credential with literal value to container.\n"
                "     --load-credential=ID:PATH\n"
@@ -514,6 +520,12 @@ static int detect_unified_cgroup_hierarchy_from_environment(void) {
 static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
         int r;
 
+        if (!arg_privileged) {
+                /* We only support the unified mode when running unprivileged */
+                arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
+                return 0;
+        }
+
         /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
          * in the image actually supports. */
         r = cg_all_unified();
@@ -615,7 +627,6 @@ static int parse_mount_settings_env(void) {
         e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
         if (streq_ptr(e, "network"))
                 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
-
         else if (e) {
                 r = parse_boolean(e);
                 if (r < 0)
@@ -744,6 +755,7 @@ static int parse_argv(int argc, char *argv[]) {
                 ARG_BIND_USER,
                 ARG_SUPPRESS_SYNC,
                 ARG_IMAGE_POLICY,
+                ARG_BACKGROUND,
         };
 
         static const struct option options[] = {
@@ -818,6 +830,7 @@ static int parse_argv(int argc, char *argv[]) {
                 { "bind-user",              required_argument, NULL, ARG_BIND_USER              },
                 { "suppress-sync",          required_argument, NULL, ARG_SUPPRESS_SYNC          },
                 { "image-policy",           required_argument, NULL, ARG_IMAGE_POLICY           },
+                { "background",             required_argument, NULL, ARG_BACKGROUND             },
                 {}
         };
 
@@ -1249,33 +1262,11 @@ static int parse_argv(int argc, char *argv[]) {
                                 arg_uid_shift = 0;
                                 arg_uid_range = UINT32_C(0x10000);
                         } else {
-                                _cleanup_free_ char *buffer = NULL;
-                                const char *range, *shift;
-
                                 /* anything else: User namespacing on, UID range is explicitly configured */
-
-                                range = strchr(optarg, ':');
-                                if (range) {
-                                        buffer = strndup(optarg, range - optarg);
-                                        if (!buffer)
-                                                return log_oom();
-                                        shift = buffer;
-
-                                        range++;
-                                        r = safe_atou32(range, &arg_uid_range);
-                                        if (r < 0)
-                                                return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
-                                } else
-                                        shift = optarg;
-
-                                r = parse_uid(shift, &arg_uid_shift);
+                                r = parse_userns_uid_range(optarg, &arg_uid_shift, &arg_uid_range);
                                 if (r < 0)
-                                        return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
-
+                                        return r;
                                 arg_userns_mode = USER_NAMESPACE_FIXED;
-
-                                if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
-                                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID range cannot be empty or go beyond " UID_FMT ".", UID_INVALID);
                         }
 
                         arg_settings_mask |= SETTING_USERNS;
@@ -1362,17 +1353,27 @@ static int parse_argv(int argc, char *argv[]) {
 
                         break;
 
-                case ARG_CHDIR:
+                case ARG_CHDIR: {
+                        _cleanup_free_ char *wd = NULL;
+
                         if (!path_is_absolute(optarg))
                                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
                                                        "Working directory %s is not an absolute path.", optarg);
 
-                        r = free_and_strdup(&arg_chdir, optarg);
+                        r = path_simplify_alloc(optarg, &wd);
                         if (r < 0)
-                                return log_oom();
+                                return log_error_errno(r, "Failed to simplify path %s: %m", optarg);
+
+                        if (!path_is_normalized(wd))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Working directory path is not normalized: %s", wd);
+
+                        if (path_below_api_vfs(wd))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Working directory is below API VFS, refusing: %s", wd);
 
+                        free_and_replace(arg_chdir, wd);
                         arg_settings_mask |= SETTING_WORKING_DIRECTORY;
                         break;
+                }
 
                 case ARG_PIVOT_ROOT:
                         r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
@@ -1395,7 +1396,7 @@ static int parse_argv(int argc, char *argv[]) {
                         _cleanup_free_ void *k = NULL;
                         size_t l;
 
-                        r = unhexmem(optarg, strlen(optarg), &k, &l);
+                        r = unhexmem(optarg, &k, &l);
                         if (r < 0)
                                 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
                         if (l < sizeof(sd_id128_t))
@@ -1412,7 +1413,7 @@ static int parse_argv(int argc, char *argv[]) {
                         void *p;
 
                         if ((value = startswith(optarg, "base64:"))) {
-                                r = unbase64mem(value, strlen(value), &p, &l);
+                                r = unbase64mem(value, &p, &l);
                                 if (r < 0)
                                         return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
 
@@ -1568,7 +1569,7 @@ static int parse_argv(int argc, char *argv[]) {
                         break;
 
                 case ARG_SET_CREDENTIAL:
-                        r = machine_credential_set(&arg_credentials, &arg_n_credentials, optarg);
+                        r = machine_credential_set(&arg_credentials, optarg);
                         if (r < 0)
                                 return r;
 
@@ -1576,7 +1577,7 @@ static int parse_argv(int argc, char *argv[]) {
                         break;
 
                 case ARG_LOAD_CREDENTIAL:
-                        r = machine_credential_load(&arg_credentials, &arg_n_credentials, optarg);
+                        r = machine_credential_load(&arg_credentials, optarg);
                         if (r < 0)
                                 return r;
 
@@ -1607,6 +1608,12 @@ static int parse_argv(int argc, char *argv[]) {
                                 return r;
                         break;
 
+                case ARG_BACKGROUND:
+                        r = free_and_strdup_warn(&arg_background, optarg);
+                        if (r < 0)
+                                return r;
+                        break;
+
                 case '?':
                         return -EINVAL;
 
@@ -1653,6 +1660,21 @@ static int parse_argv(int argc, char *argv[]) {
 static int verify_arguments(void) {
         int r;
 
+        SET_FLAG(arg_mount_settings, MOUNT_PRIVILEGED, arg_privileged);
+
+        if (!arg_privileged) {
+                /* machined is not accessible to unpriv clients */
+                if (arg_register) {
+                        log_notice("Automatically implying --register=no, since machined is not accessible to unprivileged clients.");
+                        arg_register = false;
+                }
+
+                if (!arg_private_network) {
+                        log_notice("Automatically implying --private-network, since mounting /sys/ in an unprivileged user namespaces requires network namespacing.");
+                        arg_private_network = true;
+                }
+        }
+
         if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
                 /* If we are running the stub init in the container, we don't need to look at what the init
                  * in the container supports, because we are not using it. Let's immediately pick the right
@@ -2184,7 +2206,7 @@ static int copy_devnodes(const char *dest) {
                         if (mknod(to, st.st_mode, st.st_rdev) < 0) {
                                 /* Explicitly warn the user when /dev is already populated. */
                                 if (errno == EEXIST)
-                                        log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
+                                        log_notice("%s/dev/ is pre-mounted and pre-populated. If a pre-mounted /dev/ is provided it needs to be an unpopulated file system.", dest);
                                 if (errno != EPERM)
                                         return log_error_errno(errno, "mknod(%s) failed: %m", to);
 
@@ -2365,18 +2387,44 @@ static int setup_keyring(void) {
         return 0;
 }
 
+int make_run_host(const char *root) {
+        int r;
+
+        assert(root);
+
+        r = userns_mkdir(root, "/run/host", 0755, 0, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create /run/host/: %m");
+
+        return 0;
+}
+
 static int setup_credentials(const char *root) {
+        bool world_readable = false;
         const char *q;
         int r;
 
-        if (arg_n_credentials <= 0)
+        if (arg_credentials.n_credentials == 0)
                 return 0;
 
-        r = userns_mkdir(root, "/run/host", 0755, 0, 0);
+        /* If starting a single-process container as a non-root user, the uid will only be resolved after we
+         * are inside the inner child, when credential directories and files are already read-only, so they
+         * are unusable as the single process won't have access to them. We also don't have access to the
+         * uid that will actually be used from here, as we are setting credentials up from the outer child.
+         * In order to make them usable as requested by the configuration, make them world readable in that
+         * case, as by definition there are no other processes in that case besides the one being started,
+         * which is being configured to be able to access credentials, and any of its children which will
+         * inherit its privileges anyway. To ensure this, also enforce (and document) that
+         * --no-new-privileges is necessary for this combination to work. */
+        if (arg_no_new_privileges && !isempty(arg_user) && !STR_IN_SET(arg_user, "root", "0") &&
+            arg_start_mode == START_PID1)
+                world_readable = true;
+
+        r = make_run_host(root);
         if (r < 0)
-                return log_error_errno(r, "Failed to create /run/host: %m");
+                return r;
 
-        r = userns_mkdir(root, "/run/host/credentials", 0700, 0, 0);
+        r = userns_mkdir(root, "/run/host/credentials", world_readable ? 0777 : 0700, 0, 0);
         if (r < 0)
                 return log_error_errno(r, "Failed to create /run/host/credentials: %m");
 
@@ -2385,23 +2433,23 @@ static int setup_credentials(const char *root) {
         if (r < 0)
                 return r;
 
-        for (size_t i = 0; i < arg_n_credentials; i++) {
+        FOREACH_ARRAY(cred, arg_credentials.credentials, arg_credentials.n_credentials) {
                 _cleanup_free_ char *j = NULL;
                 _cleanup_close_ int fd = -EBADF;
 
-                j = path_join(q, arg_credentials[i].id);
+                j = path_join(q, cred->id);
                 if (!j)
                         return log_oom();
 
-                fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, 0600);
+                fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, world_readable ? 0666 : 0600);
                 if (fd < 0)
                         return log_error_errno(errno, "Failed to create credential file %s: %m", j);
 
-                r = loop_write(fd, arg_credentials[i].data, arg_credentials[i].size);
+                r = loop_write(fd, cred->data, cred->size);
                 if (r < 0)
                         return log_error_errno(r, "Failed to write credential to file %s: %m", j);
 
-                if (fchmod(fd, 0400) < 0)
+                if (fchmod(fd, world_readable ? 0444 : 0400) < 0)
                         return log_error_errno(errno, "Failed to adjust access mode of %s: %m", j);
 
                 if (arg_userns_mode != USER_NAMESPACE_NO) {
@@ -2410,7 +2458,7 @@ static int setup_credentials(const char *root) {
                 }
         }
 
-        if (chmod(q, 0500) < 0)
+        if (chmod(q, world_readable ? 0555 : 0500) < 0)
                 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", q);
 
         r = userns_lchown(q, 0, 0);
@@ -2536,7 +2584,7 @@ static int setup_journal(const char *directory) {
         p = strjoina("/var/log/journal/", SD_ID128_TO_STRING(arg_uuid));
         q = prefix_roota(directory, p);
 
-        if (path_is_mount_point(p, NULL, 0) > 0) {
+        if (path_is_mount_point(p) > 0) {
                 if (try)
                         return 0;
 
@@ -2544,7 +2592,7 @@ static int setup_journal(const char *directory) {
                                        "%s: already a mount point, refusing to use for journal", p);
         }
 
-        if (path_is_mount_point(q, NULL, 0) > 0) {
+        if (path_is_mount_point(q) > 0) {
                 if (try)
                         return 0;
 
@@ -2620,7 +2668,7 @@ static int setup_journal(const char *directory) {
 
         r = mount_nofollow_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
         if (r < 0)
-                return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
+                return log_error_errno(r, "Failed to bind mount journal from host into guest: %m");
 
         return 0;
 }
@@ -2680,6 +2728,9 @@ static int reset_audit_loginuid(void) {
         if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
                 return 0;
 
+        if (!arg_privileged)
+                return 0;
+
         r = read_one_line_file("/proc/self/loginuid", &p);
         if (r == -ENOENT)
                 return 0;
@@ -2709,14 +2760,19 @@ static int mount_tunnel_dig(const char *root) {
         const char *p, *q;
         int r;
 
+        if (!arg_privileged) {
+                log_debug("Not digging mount tunnel, because running unprivileged.");
+                return 0;
+        }
+
         (void) mkdir_p("/run/systemd/nspawn/", 0755);
         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
         p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
         (void) mkdir_p(p, 0600);
 
-        r = userns_mkdir(root, "/run/host", 0755, 0, 0);
+        r = make_run_host(root);
         if (r < 0)
-                return log_error_errno(r, "Failed to create /run/host: %m");
+                return r;
 
         r = userns_mkdir(root, NSPAWN_MOUNT_TUNNEL, 0600, 0, 0);
         if (r < 0)
@@ -2737,6 +2793,11 @@ static int mount_tunnel_dig(const char *root) {
 static int mount_tunnel_open(void) {
         int r;
 
+        if (!arg_privileged) {
+                log_debug("Not opening up mount tunnel, because running unprivileged.");
+                return 0;
+        }
+
         r = mount_follow_verbose(LOG_ERR, NULL, NSPAWN_MOUNT_TUNNEL, NULL, MS_SLAVE, NULL);
         if (r < 0)
                 return r;
@@ -2913,14 +2974,72 @@ static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *erro
         return 0;
 }
 
+static int pick_paths(void) {
+        int r;
+
+        if (arg_directory) {
+                _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
+                PickFilter filter = pick_filter_image_dir;
+
+                filter.architecture = arg_architecture;
+
+                r = path_pick_update_warn(
+                                &arg_directory,
+                                &filter,
+                                PICK_ARCHITECTURE|PICK_TRIES,
+                                &result);
+                if (r < 0) {
+                        /* Accept ENOENT here so that the --template= logic can work */
+                        if (r != -ENOENT)
+                                return r;
+                } else
+                        arg_architecture = result.architecture;
+        }
+
+        if (arg_image) {
+                _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
+                PickFilter filter = pick_filter_image_raw;
+
+                filter.architecture = arg_architecture;
+
+                r = path_pick_update_warn(
+                                &arg_image,
+                                &filter,
+                                PICK_ARCHITECTURE|PICK_TRIES,
+                                &result);
+                if (r < 0)
+                        return r;
+
+                arg_architecture = result.architecture;
+        }
+
+        if (arg_template) {
+                _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
+                PickFilter filter = pick_filter_image_dir;
+
+                filter.architecture = arg_architecture;
+
+                r = path_pick_update_warn(
+                                &arg_template,
+                                &filter,
+                                PICK_ARCHITECTURE,
+                                &result);
+                if (r < 0)
+                        return r;
+
+                arg_architecture = result.architecture;
+        }
+
+        return 0;
+}
+
 static int determine_names(void) {
         int r;
 
         if (arg_template && !arg_directory && arg_machine) {
 
-                /* If --template= was specified then we should not
-                 * search for a machine, but instead create a new one
-                 * in /var/lib/machine. */
+                /* If --template= was specified then we should not search for a machine, but instead create a
+                 * new one in /var/lib/machine. */
 
                 arg_directory = path_join("/var/lib/machines", arg_machine);
                 if (!arg_directory)
@@ -2957,9 +3076,11 @@ static int determine_names(void) {
         }
 
         if (!arg_machine) {
-                if (arg_directory && path_equal(arg_directory, "/"))
+                if (arg_directory && path_equal(arg_directory, "/")) {
                         arg_machine = gethostname_malloc();
-                else if (arg_image) {
+                        if (!arg_machine)
+                                return log_oom();
+                } else if (arg_image) {
                         char *e;
 
                         r = path_extract_filename(arg_image, &arg_machine);
@@ -3198,20 +3319,32 @@ static int inner_child(
                 return r;
 
         if (!arg_network_namespace_path && arg_private_network) {
-                r = unshare(CLONE_NEWNET);
+                _cleanup_close_ int netns_fd = -EBADF;
+
+                if (arg_privileged) {
+                        if (unshare(CLONE_NEWNET) < 0)
+                                return log_error_errno(errno, "Failed to unshare network namespace: %m");
+                }
+
+                netns_fd = namespace_open_by_type(NAMESPACE_NET);
+                if (netns_fd < 0)
+                        return log_error_errno(netns_fd, "Failed to open newly allocate network namespace: %m");
+
+                r = send_one_fd(fd_inner_socket, netns_fd, 0);
                 if (r < 0)
-                        return log_error_errno(errno, "Failed to unshare network namespace: %m");
+                        return log_error_errno(r, "Failed to send network namespace to supervisor: %m");
 
                 /* Tell the parent that it can setup network interfaces. */
                 (void) barrier_place(barrier); /* #3 */
         }
 
-        r = mount_sysfs(NULL, arg_mount_settings);
-        if (r < 0)
-                return r;
+        if (arg_privileged) {
+                r = mount_sysfs(NULL, arg_mount_settings);
+                if (r < 0)
+                        return r;
+        }
 
-        /* Wait until we are cgroup-ified, so that we
-         * can mount the right cgroup path writable */
+        /* Wait until we are cgroup-ified, so that we can mount the right cgroup path writable */
         if (!barrier_place_and_sync(barrier)) /* #4 */
                 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
                                        "Parent died too early");
@@ -3396,7 +3529,7 @@ static int inner_child(
         if (asprintf(envp + n_env++, "container_uuid=%s", SD_ID128_TO_UUID_STRING(arg_uuid)) < 0)
                 return log_oom();
 
-        if (fdset_size(fds) > 0) {
+        if (!fdset_isempty(fds)) {
                 r = fdset_cloexec(fds, false);
                 if (r < 0)
                         return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
@@ -3408,7 +3541,7 @@ static int inner_child(
         if (asprintf(envp + n_env++, "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
                 return log_oom();
 
-        if (arg_n_credentials > 0) {
+        if (arg_credentials.n_credentials > 0) {
                 envp[n_env] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
                 if (!envp[n_env])
                         return log_oom();
@@ -3430,6 +3563,9 @@ static int inner_child(
         if (!barrier_place_and_sync(barrier)) /* #5 */
                 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
 
+        /* Note, this should be done this late (💣 and not moved earlier! 💣), so that all namespacing
+         * changes are already in effect by now, so that any resolved paths here definitely reference
+         * resources inside the container, and not outside of them. */
         if (arg_chdir)
                 if (chdir(arg_chdir) < 0)
                         return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
@@ -3509,11 +3645,11 @@ static int inner_child(
         return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
 }
 
-static int setup_notify_child(void) {
+static int setup_notify_child(const void *directory) {
         _cleanup_close_ int fd = -EBADF;
-        static const union sockaddr_union sa = {
+        _cleanup_free_ char *j = NULL;
+        union sockaddr_union sa = {
                 .un.sun_family = AF_UNIX,
-                .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
         };
         int r;
 
@@ -3521,14 +3657,26 @@ static int setup_notify_child(void) {
         if (fd < 0)
                 return log_error_errno(errno, "Failed to allocate notification socket: %m");
 
-        (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
-        (void) sockaddr_un_unlink(&sa.un);
+        if (directory) {
+                j = path_join(directory, NSPAWN_NOTIFY_SOCKET_PATH);
+                if (!j)
+                        return log_oom();
+        }
 
-        r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
+        r = sockaddr_un_set_path(&sa.un, j ?: NSPAWN_NOTIFY_SOCKET_PATH);
         if (r < 0)
-                return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
+                return log_error_errno(r, "Failed to set AF_UNIX path to %s: %m", j ?: NSPAWN_NOTIFY_SOCKET_PATH);
 
-        r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
+        (void) mkdir_parents(sa.un.sun_path, 0755);
+        (void) sockaddr_un_unlink(&sa.un);
+
+        WITH_UMASK(0577) { /* only set "w" bit, which is all that's necessary for connecting from the container */
+                r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
+                if (r < 0)
+                        return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
+        }
+
+        r = userns_lchown(sa.un.sun_path, 0, 0);
         if (r < 0)
                 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
 
@@ -3539,6 +3687,125 @@ static int setup_notify_child(void) {
         return TAKE_FD(fd);
 }
 
+static int setup_unix_export_dir_outside(char **ret) {
+        int r;
+
+        assert(ret);
+
+        if (!arg_privileged) {
+                log_debug("Not digging socket tunnel, because running unprivileged.");
+                return 0;
+        }
+
+        _cleanup_free_ char *p = NULL;
+        p = path_join("/run/systemd/nspawn/unix-export", arg_machine);
+        if (!p)
+                return log_oom();
+
+        r = path_is_mount_point(p);
+        if (r > 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Mount point '%s' exists already, refusing.", p);
+        if (r < 0 && r != -ENOENT)
+                return log_error_errno(r, "Failed to detect if '%s' is a mount point: %m", p);
+
+        r = mkdir_p(p, 0755);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create '%s': %m", p);
+
+        _cleanup_(rmdir_and_freep) char *q = TAKE_PTR(p);
+
+        /* Mount the "unix export" directory really tiny, just 64 inodes. We mark the superblock writable
+         * (since the container shall bind sockets into it). */
+        r = mount_nofollow_verbose(
+                        LOG_ERR,
+                        "tmpfs",
+                        q,
+                        "tmpfs",
+                        MS_NODEV|MS_NOEXEC|MS_NOSUID|ms_nosymfollow_supported(),
+                        "size=4M,nr_inodes=64,mode=0755");
+        if (r < 0)
+                return r;
+
+        _cleanup_(umount_and_rmdir_and_freep) char *w = TAKE_PTR(q);
+
+        /* After creating the superblock we change the bind mount to be read-only. This means that the fs
+         * itself is writable, but not through the mount accessible from the host. */
+        r = mount_nofollow_verbose(
+                        LOG_ERR,
+                        /* source= */ NULL,
+                        w,
+                        /* fstype= */ NULL,
+                        MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID|ms_nosymfollow_supported(),
+                        /* options= */ NULL);
+        if (r < 0)
+                return r;
+
+        *ret = TAKE_PTR(w);
+        return 0;
+}
+
+static int setup_unix_export_host_inside(const char *directory, const char *unix_export_path) {
+        int r;
+
+        assert(directory);
+
+        if (!arg_privileged)
+                return 0;
+
+        assert(unix_export_path);
+
+        r = make_run_host(directory);
+        if (r < 0)
+                return r;
+
+        _cleanup_free_ char *p = path_join(directory, "run/host/unix-export");
+        if (!p)
+                return log_oom();
+
+        if (mkdir(p, 0755) < 0)
+                return log_error_errno(errno, "Failed to create '%s': %m", p);
+
+        r = mount_nofollow_verbose(
+                        LOG_ERR,
+                        unix_export_path,
+                        p,
+                        /* fstype= */ NULL,
+                        MS_BIND,
+                        /* options= */ NULL);
+        if (r < 0)
+                return r;
+
+        r = mount_nofollow_verbose(
+                        LOG_ERR,
+                        /* source= */ NULL,
+                        p,
+                        /* fstype= */ NULL,
+                        MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID|ms_nosymfollow_supported(),
+                        /* options= */ NULL);
+        if (r < 0)
+                return r;
+
+        r = userns_lchown(p, 0, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to chown '%s': %m", p);
+
+        return 0;
+}
+
+static DissectImageFlags determine_dissect_image_flags(void) {
+        return
+                DISSECT_IMAGE_GENERIC_ROOT |
+                DISSECT_IMAGE_REQUIRE_ROOT |
+                DISSECT_IMAGE_RELAX_VAR_CHECK |
+                DISSECT_IMAGE_USR_NO_ROOT |
+                DISSECT_IMAGE_DISCARD_ON_LOOP |
+                DISSECT_IMAGE_ADD_PARTITION_DEVICES |
+                DISSECT_IMAGE_PIN_PARTITION_DEVICES |
+                (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS) |
+                DISSECT_IMAGE_ALLOW_USERSPACE_VERITY |
+                (arg_console_mode == CONSOLE_INTERACTIVE ? DISSECT_IMAGE_ALLOW_INTERACTIVE_AUTH : 0);
+}
+
 static int outer_child(
                 Barrier *barrier,
                 const char *directory,
@@ -3546,7 +3813,8 @@ static int outer_child(
                 int fd_outer_socket,
                 int fd_inner_socket,
                 FDSet *fds,
-                int netns_fd) {
+                int netns_fd,
+                const char *unix_export_path) {
 
         _cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
         _cleanup_strv_free_ char **os_release_pairs = NULL;
@@ -3599,10 +3867,8 @@ static int outer_child(
                                 arg_uid_shift,
                                 arg_uid_range,
                                 /* userns_fd= */ -EBADF,
+                                determine_dissect_image_flags()|
                                 DISSECT_IMAGE_MOUNT_ROOT_ONLY|
-                                DISSECT_IMAGE_DISCARD_ON_LOOP|
-                                DISSECT_IMAGE_USR_NO_ROOT|
-                                (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
                                 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
                 if (r < 0)
                         return r;
@@ -3613,7 +3879,12 @@ static int outer_child(
                 return r;
 
         if (arg_userns_mode != USER_NAMESPACE_NO) {
-                r = namespace_open(0, NULL, &mntns_fd, NULL, NULL, NULL);
+                r = namespace_open(0,
+                                   /* ret_pidns_fd = */ NULL,
+                                   &mntns_fd,
+                                   /* ret_netns_fd = */ NULL,
+                                   /* ret_userns_fd = */ NULL,
+                                   /* ret_root_fd = */ NULL);
                 if (r < 0)
                         return log_error_errno(r, "Failed to pin outer mount namespace: %m");
 
@@ -3752,7 +4023,7 @@ static int outer_child(
 
                 dirs[i] = NULL;
 
-                r = remount_idmap(dirs, arg_uid_shift, arg_uid_range, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT);
+                r = remount_idmap(dirs, arg_uid_shift, arg_uid_range, UID_INVALID, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT);
                 if (r == -EINVAL || ERRNO_IS_NEG_NOT_SUPPORTED(r)) {
                         /* This might fail because the kernel or file system doesn't support idmapping. We
                          * can't really distinguish this nicely, nor do we have any guarantees about the
@@ -3773,21 +4044,17 @@ static int outer_child(
 
         if (dissected_image) {
                 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
-                r = dissected_image_mount(
+                r = dissected_image_mount_and_warn(
                                 dissected_image,
                                 directory,
                                 arg_uid_shift,
                                 arg_uid_range,
                                 /* userns_fd= */ -EBADF,
+                                determine_dissect_image_flags()|
                                 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|
-                                DISSECT_IMAGE_DISCARD_ON_LOOP|
-                                DISSECT_IMAGE_USR_NO_ROOT|
-                                (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
                                 (idmap ? DISSECT_IMAGE_MOUNT_IDMAPPED : 0));
-                if (r == -EUCLEAN)
-                        return log_error_errno(r, "File system check for image failed: %m");
                 if (r < 0)
-                        return log_error_errno(r, "Failed to mount image file system: %m");
+                        return r;
         }
 
         if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
@@ -3840,6 +4107,10 @@ static int outer_child(
         p = prefix_roota(directory, "/run/host");
         (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
 
+        r = setup_unix_export_host_inside(directory, unix_export_path);
+        if (r < 0)
+                return r;
+
         r = setup_pts(directory);
         if (r < 0)
                 return r;
@@ -3889,11 +4160,11 @@ static int outer_child(
 
         /* The same stuff as the $container env var, but nicely readable for the entire payload */
         p = prefix_roota(directory, "/run/host/container-manager");
-        (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE);
+        (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_MODE_0444);
 
         /* The same stuff as the $container_uuid env var */
         p = prefix_roota(directory, "/run/host/container-uuid");
-        (void) write_string_filef(p, WRITE_STRING_FILE_CREATE, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid));
+        (void) write_string_filef(p, WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_MODE_0444, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid));
 
         if (!arg_use_cgns) {
                 r = mount_cgroups(
@@ -3908,47 +4179,59 @@ static int outer_child(
                         return r;
         }
 
-        /* Mark everything as shared so our mounts get propagated down. This is required to make new bind
-         * mounts available in systemd services inside the container that create a new mount namespace.  See
-         * https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this
-         * will inherit the shared propagation mode.
-         *
-         * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
-         * directory mount to root later on.
-         * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
-         */
-        r = mount_switch_root(directory, MS_SHARED);
-        if (r < 0)
-                return log_error_errno(r, "Failed to move root directory: %m");
+        /* We have different codepaths here for privileged and non-privileged mode. In privileged mode we'll
+         * now switch into the target directory, and then do the final setup from there. If a user namespace
+         * is then allocated for the container, the root mount and everything else will be out of reach for
+         * it. For unprivileged containers we cannot do that however, since we couldn't mount a sysfs and
+         * procfs then anymore, since that only works if there's an unobstructed instance currently
+         * visible. Hence there we do it the other way round: we first allocate a new set of namespaces
+         * (and fork for it) for which we then mount sysfs/procfs, and only then switch root. */
 
-        /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a
-         * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into
-         * the container. */
-        r = mount_tunnel_open();
-        if (r < 0)
-                return r;
+        if (arg_privileged) {
+                /* Mark everything as shared so our mounts get propagated down. This is required to make new
+                 * bind mounts available in systemd services inside the container that create a new mount
+                 * namespace.  See https://github.com/systemd/systemd/issues/3860 Further submounts (such as
+                 * /dev/) done after this will inherit the shared propagation mode.
+                 *
+                 * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
+                 * directory mount to root later on.
+                 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
+                 */
+                r = mount_switch_root(directory, MS_SHARED);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to move root directory: %m");
 
-        if (arg_userns_mode != USER_NAMESPACE_NO) {
-                /* In order to mount procfs and sysfs in an unprivileged container the kernel
-                 * requires that a fully visible instance is already present in the target mount
-                 * namespace. Mount one here so the inner child can mount its own instances. Later
-                 * we umount the temporary instances created here before we actually exec the
-                 * payload. Since the rootfs is shared the umount will propagate into the container.
-                 * Note, the inner child wouldn't be able to unmount the instances on its own since
-                 * it doesn't own the originating mount namespace. IOW, the outer child needs to do
-                 * this. */
-                r = pin_fully_visible_fs();
+                /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a
+                 * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into
+                 * the container. */
+                r = mount_tunnel_open();
                 if (r < 0)
                         return r;
-        }
 
-        fd = setup_notify_child();
+                if (arg_userns_mode != USER_NAMESPACE_NO) {
+                        /* In order to mount procfs and sysfs in an unprivileged container the kernel
+                         * requires that a fully visible instance is already present in the target mount
+                         * namespace. Mount one here so the inner child can mount its own instances. Later
+                         * we umount the temporary instances created here before we actually exec the
+                         * payload. Since the rootfs is shared the umount will propagate into the container.
+                         * Note, the inner child wouldn't be able to unmount the instances on its own since
+                         * it doesn't own the originating mount namespace. IOW, the outer child needs to do
+                         * this. */
+                        r = pin_fully_visible_fs();
+                        if (r < 0)
+                                return r;
+                }
+
+                fd = setup_notify_child(NULL);
+        } else
+                fd = setup_notify_child(directory);
         if (fd < 0)
                 return fd;
 
         pid = raw_clone(SIGCHLD|CLONE_NEWNS|
                         arg_clone_ns_flags |
-                        (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
+                        (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0) |
+                        ((arg_private_network && !arg_privileged) ? CLONE_NEWNET : 0));
         if (pid < 0)
                 return log_error_errno(errno, "Failed to fork inner child: %m");
         if (pid == 0) {
@@ -3958,11 +4241,35 @@ static int outer_child(
                  * user if user namespaces are turned on. */
 
                 if (arg_network_namespace_path) {
-                        r = namespace_enter(-1, -1, netns_fd, -1, -1);
+                        r = namespace_enter(/* pidns_fd = */ -EBADF,
+                                            /* mntns_fd = */ -EBADF,
+                                            netns_fd,
+                                            /* userns_fd = */ -EBADF,
+                                            /* root_fd = */ -EBADF);
                         if (r < 0)
                                 return log_error_errno(r, "Failed to join network namespace: %m");
                 }
 
+                if (!arg_privileged) {
+                        /* In unprivileged operation, sysfs + procfs are special, we'll have to mount them
+                         * inside the inner namespaces, but before we switch root. Hence do so here. */
+                        _cleanup_free_ char *j = path_join(directory, "/proc");
+                        if (!j)
+                                return log_oom();
+
+                        r = mount_follow_verbose(LOG_ERR, "proc", j, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
+                        if (r < 0)
+                                return r;
+
+                        r = mount_sysfs(directory, arg_mount_settings);
+                        if (r < 0)
+                                return r;
+
+                        r = mount_switch_root(directory, MS_SHARED);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to move root directory: %m");
+                }
+
                 r = inner_child(barrier, fd_inner_socket, fds, os_release_pairs);
                 if (r < 0)
                         _exit(EXIT_FAILURE);
@@ -4030,13 +4337,13 @@ static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
                         return r;
 
                 /* Make some superficial checks whether the range is currently known in the user database */
-                if (getpwuid(candidate))
+                if (getpwuid_malloc(candidate, /* ret= */ NULL) >= 0)
                         goto next;
-                if (getpwuid(candidate + UINT32_C(0xFFFE)))
+                if (getpwuid_malloc(candidate + UINT32_C(0xFFFE), /* ret= */ NULL) >= 0)
                         goto next;
-                if (getgrgid(candidate))
+                if (getgrgid_malloc(candidate, /* ret= */ NULL) >= 0)
                         goto next;
-                if (getgrgid(candidate + UINT32_C(0xFFFE)))
+                if (getgrgid_malloc(candidate + UINT32_C(0xFFFE), /* ret= */ NULL) >= 0)
                         goto next;
 
                 *ret_lock_file = lf;
@@ -4217,6 +4524,17 @@ static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t r
         if (!tags)
                 return log_oom();
 
+        if (DEBUG_LOGGING) {
+                _cleanup_free_ char *joined = strv_join(tags, " ");
+
+                if (joined) {
+                        _cleanup_free_ char *j = cescape(joined);
+                        free_and_replace(joined, j);
+                }
+
+                log_debug("Got sd_notify() message: %s", strnull(joined));
+        }
+
         if (strv_contains(tags, "READY=1")) {
                 r = sd_notify(false, "READY=1\n");
                 if (r < 0)
@@ -4233,6 +4551,9 @@ static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t r
 static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
         int r;
 
+        if (fd < 0)
+                return 0;
+
         r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
         if (r < 0)
                 return log_error_errno(r, "Failed to allocate notify event source: %m");
@@ -4242,6 +4563,25 @@ static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid,
         return 0;
 }
 
+static void set_window_title(PTYForward *f) {
+        _cleanup_free_ char *hn = NULL, *dot = NULL;
+
+        assert(f);
+
+        (void) gethostname_strict(&hn);
+
+        if (emoji_enabled())
+                dot = strjoin(special_glyph(SPECIAL_GLYPH_BLUE_CIRCLE), " ");
+
+        if (hn)
+                (void) pty_forward_set_titlef(f, "%sContainer %s on %s", strempty(dot), arg_machine, hn);
+        else
+                (void) pty_forward_set_titlef(f, "%sContainer %s", strempty(dot), arg_machine);
+
+        if (dot)
+                (void) pty_forward_set_title_prefix(f, dot);
+}
+
 static int merge_settings(Settings *settings, const char *path) {
         int rl;
 
@@ -4457,7 +4797,7 @@ static int merge_settings(Settings *settings, const char *path) {
 #endif
         }
 
-        for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
+        for (rl = 0; rl < _RLIMIT_MAX; rl++) {
                 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
                         continue;
 
@@ -4593,26 +4933,28 @@ static int load_settings(void) {
                 return 0;
 
         /* We first look in the admin's directories in /etc and /run */
-        FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
-                _cleanup_free_ char *j = NULL;
+        if (arg_privileged) {
+                FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
+                        _cleanup_free_ char *j = NULL;
 
-                j = path_join(i, arg_settings_filename);
-                if (!j)
-                        return log_oom();
+                        j = path_join(i, arg_settings_filename);
+                        if (!j)
+                                return log_oom();
 
-                f = fopen(j, "re");
-                if (f) {
-                        p = TAKE_PTR(j);
+                        f = fopen(j, "re");
+                        if (f) {
+                                p = TAKE_PTR(j);
 
-                        /* By default, we trust configuration from /etc and /run */
-                        if (arg_settings_trusted < 0)
-                                arg_settings_trusted = true;
+                                /* By default, we trust configuration from /etc and /run */
+                                if (arg_settings_trusted < 0)
+                                        arg_settings_trusted = true;
 
-                        break;
-                }
+                                break;
+                        }
 
-                if (errno != ENOENT)
-                        return log_error_errno(errno, "Failed to open %s: %m", j);
+                        if (errno != ENOENT)
+                                return log_error_errno(errno, "Failed to open %s: %m", j);
+                }
         }
 
         if (!f) {
@@ -4672,10 +5014,14 @@ static int load_oci_bundle(void) {
 
 static int run_container(
                DissectedImage *dissected_image,
+               int userns_fd,
                FDSet *fds,
-               char veth_name[IFNAMSIZ], bool *veth_created,
+               char veth_name[IFNAMSIZ],
+               bool *veth_created,
                struct ExposeArgs *expose_args,
-               int *master, pid_t *pid, int *ret) {
+               int *master,
+               pid_t *pid,
+               int *ret) {
 
         static const struct sigaction sa = {
                 .sa_handler = nop_signal_handler,
@@ -4691,6 +5037,7 @@ static int run_container(
         _cleanup_close_ int notify_socket = -EBADF, mntns_fd = -EBADF, fd_kmsg_fifo = -EBADF;
         _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
         _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
+        _cleanup_(umount_and_rmdir_and_freep) char *unix_export_host_dir = NULL;
         _cleanup_(sd_event_unrefp) sd_event *event = NULL;
         _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
         _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
@@ -4706,6 +5053,11 @@ static int run_container(
         assert_se(sigemptyset(&mask_chld) == 0);
         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
 
+        /* Set up the unix export host directory on the host first */
+        r = setup_unix_export_dir_outside(&unix_export_host_dir);
+        if (r < 0)
+                return r;
+
         if (arg_userns_mode == USER_NAMESPACE_PICK) {
                 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
                  * check with getpwuid() if the specific user already exists. Note that /etc might be
@@ -4754,11 +5106,44 @@ static int run_container(
                                                "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
         }
 
-        *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
-        if (*pid < 0)
-                return log_error_errno(errno, "clone() failed%s: %m",
-                                       errno == EINVAL ?
-                                       ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
+        if (arg_privileged) {
+                assert(userns_fd < 0);
+
+                /* If we have no user namespace then we'll clone and create a new mount namespace right-away. */
+
+                *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
+                if (*pid < 0)
+                        return log_error_errno(errno, "clone() failed%s: %m",
+                                               errno == EINVAL ?
+                                               ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
+        } else {
+                assert(userns_fd >= 0);
+
+                /* If we have a user namespace then we'll clone() first, and then join the user namespace,
+                 * and then open the mount namespace, so that it is owned by the user namespace */
+
+                *pid = raw_clone(SIGCHLD);
+                if (*pid < 0)
+                        return log_error_errno(errno, "clone() failed: %m");
+
+                if (*pid == 0) {
+                        if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+                                log_error_errno(errno, "Failed to join allocate user namespace: %m");
+                                _exit(EXIT_FAILURE);
+                        }
+
+                        r = reset_uid_gid();
+                        if (r < 0) {
+                                log_error_errno(r, "Failed to reset UID/GID to root: %m");
+                                _exit(EXIT_FAILURE);
+                        }
+
+                        if (unshare(CLONE_NEWNS) < 0) {
+                                log_error_errno(errno, "Failed to unshare file system namespace: %m");
+                                _exit(EXIT_FAILURE);
+                        }
+                }
+        }
 
         if (*pid == 0) {
                 /* The outer child only has a file system namespace. */
@@ -4776,7 +5161,8 @@ static int run_container(
                                 fd_outer_socket_pair[1],
                                 fd_inner_socket_pair[1],
                                 fds,
-                                child_netns_fd);
+                                child_netns_fd,
+                                unix_export_host_dir);
                 if (r < 0)
                         _exit(EXIT_FAILURE);
 
@@ -4894,14 +5280,13 @@ static int run_container(
                         /* Wait until the child has unshared its network namespace. */
                         if (!barrier_place_and_sync(&barrier)) /* #3 */
                                 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
-                }
 
-                if (child_netns_fd < 0) {
-                        /* Make sure we have an open file descriptor to the child's network
-                         * namespace so it stays alive even if the child exits. */
-                        r = namespace_open(*pid, NULL, NULL, &child_netns_fd, NULL, NULL);
-                        if (r < 0)
-                                return log_error_errno(r, "Failed to open child network namespace: %m");
+                        /* Make sure we have an open file descriptor to the child's network namespace so it
+                         * stays alive even if the child exits. */
+                        assert(child_netns_fd < 0);
+                        child_netns_fd = receive_one_fd(fd_inner_socket_pair[0], 0);
+                        if (child_netns_fd < 0)
+                                return log_error_errno(r, "Failed to receive child network namespace: %m");
                 }
 
                 r = move_network_interfaces(child_netns_fd, arg_network_interfaces);
@@ -4909,12 +5294,29 @@ static int run_container(
                         return r;
 
                 if (arg_network_veth) {
-                        r = setup_veth(arg_machine, *pid, veth_name,
-                                       arg_network_bridge || arg_network_zone, &arg_network_provided_mac);
-                        if (r < 0)
-                                return r;
-                        else if (r > 0)
-                                ifi = r;
+                        if (arg_privileged) {
+                                r = setup_veth(arg_machine, *pid, veth_name,
+                                               arg_network_bridge || arg_network_zone, &arg_network_provided_mac);
+                                if (r < 0)
+                                        return r;
+                                else if (r > 0)
+                                        ifi = r;
+                        } else {
+                                _cleanup_free_ char *host_ifname = NULL;
+
+                                r = nsresource_add_netif(userns_fd, child_netns_fd, /* namespace_ifname= */ NULL, &host_ifname, /* ret_namespace_ifname= */ NULL);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to add network interface to container: %m");
+
+                                ifi = if_nametoindex(host_ifname);
+                                if (ifi == 0)
+                                        return log_error_errno(errno, "Failed to resolve interface '%s': %m", host_ifname);
+
+                                if (strlen(host_ifname) >= IFNAMSIZ)
+                                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Host interface name too long?");
+
+                                strcpy(veth_name, host_ifname);
+                        }
 
                         if (arg_network_bridge) {
                                 /* Add the interface to a bridge */
@@ -4953,9 +5355,12 @@ static int run_container(
         }
 
         if (arg_register || !arg_keep_unit) {
-                r = sd_bus_default_system(&bus);
+                if (arg_privileged)
+                        r = sd_bus_default_system(&bus);
+                else
+                        r = sd_bus_default_user(&bus);
                 if (r < 0)
-                        return log_error_errno(r, "Failed to open system bus: %m");
+                        return log_error_errno(r, "Failed to open bus: %m");
 
                 r = sd_bus_set_close_on_exit(bus, false);
                 if (r < 0)
@@ -5016,7 +5421,13 @@ static int run_container(
         } else if (arg_slice || arg_property)
                 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
 
-        r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
+        r = create_subcgroup(
+                        *pid,
+                        arg_keep_unit,
+                        arg_unified_cgroup_hierarchy,
+                        arg_uid_shift,
+                        userns_fd,
+                        arg_privileged);
         if (r < 0)
                 return r;
 
@@ -5024,14 +5435,8 @@ static int run_container(
         if (r < 0)
                 return r;
 
-        r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
-        if (r < 0)
-                return r;
-
-        /* Notify the child that the parent is ready with all
-         * its setup (including cgroup-ification), and that
-         * the child can now hand over control to the code to
-         * run inside the container. */
+        /* Notify the child that the parent is ready with all its setup (including cgroup-ification), and
+         * that the child can now hand over control to the code to run inside the container. */
         (void) barrier_place(&barrier); /* #4 */
 
         /* Block SIGCHLD here, before notifying child.
@@ -5146,9 +5551,23 @@ static int run_container(
                                 return log_error_errno(r, "Failed to create PTY forwarder: %m");
 
                         if (arg_console_width != UINT_MAX || arg_console_height != UINT_MAX)
-                                (void) pty_forward_set_width_height(forward,
-                                                                    arg_console_width,
-                                                                    arg_console_height);
+                                (void) pty_forward_set_width_height(
+                                                forward,
+                                                arg_console_width,
+                                                arg_console_height);
+
+                        if (!arg_background && shall_tint_background()) {
+                                _cleanup_free_ char *bg = NULL;
+
+                                r = terminal_tint_color(220 /* blue */, &bg);
+                                if (r < 0)
+                                        log_debug_errno(r, "Failed to determine terminal background color, not tinting.");
+                                else
+                                        (void) pty_forward_set_background_color(forward, bg);
+                        } else if (!isempty(arg_background))
+                                (void) pty_forward_set_background_color(forward, arg_background);
+
+                        set_window_title(forward);
                         break;
 
                 default:
@@ -5183,38 +5602,10 @@ static int run_container(
 
         fd_kmsg_fifo = safe_close(fd_kmsg_fifo);
 
-        if (arg_private_network) {
-                /* Move network interfaces back to the parent network namespace. We use `safe_fork`
-                 * to avoid having to move the parent to the child network namespace. */
-                r = safe_fork(NULL, FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_WAIT|FORK_LOG, NULL);
+        if (arg_private_network && arg_privileged) {
+                r = move_back_network_interfaces(child_netns_fd, arg_network_interfaces);
                 if (r < 0)
                         return r;
-
-                if (r == 0) {
-                        _cleanup_close_ int parent_netns_fd = -EBADF;
-
-                        r = namespace_open(getpid_cached(), NULL, NULL, &parent_netns_fd, NULL, NULL);
-                        if (r < 0) {
-                                log_error_errno(r, "Failed to open parent network namespace: %m");
-                                _exit(EXIT_FAILURE);
-                        }
-
-                        r = namespace_enter(-1, -1, child_netns_fd, -1, -1);
-                        if (r < 0) {
-                                log_error_errno(r, "Failed to enter child network namespace: %m");
-                                _exit(EXIT_FAILURE);
-                        }
-
-                        /* Reverse network interfaces pair list so that interfaces get their initial name back.
-                         * This is about ensuring interfaces get their old name back when being moved back. */
-                        arg_network_interfaces = strv_reverse(arg_network_interfaces);
-
-                        r = move_network_interfaces(parent_netns_fd, arg_network_interfaces);
-                        if (r < 0)
-                                log_error_errno(r, "Failed to move network interfaces back to parent network namespace: %m");
-
-                        _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
-                }
         }
 
         r = wait_for_container(TAKE_PID(*pid), &container_status);
@@ -5288,7 +5679,7 @@ static int initialize_rlimits(void) {
                  * don't read the other limits from PID 1 but prefer the static table above. */
         };
 
-        int rl;
+        int rl, r;
 
         for (rl = 0; rl < _RLIMIT_MAX; rl++) {
                 /* Let's only fill in what the user hasn't explicitly configured anyway */
@@ -5299,8 +5690,9 @@ static int initialize_rlimits(void) {
                         if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
                                 /* For these two let's read the limits off PID 1. See above for an explanation. */
 
-                                if (prlimit(1, rl, NULL, &buffer) < 0)
-                                        return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
+                                r = pid_getrlimit(1, rl, &buffer);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
 
                                 v = &buffer;
                         } else if (rl == RLIMIT_NOFILE) {
@@ -5351,6 +5743,10 @@ static int cant_be_in_netns(void) {
         if (r == -ENOENT || ERRNO_IS_NEG_DISCONNECT(r))
                 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
                                        "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
+        if (ERRNO_IS_NEG_PRIVILEGE(r)) {
+                log_debug_errno(r, "Can't connect to udev control socket, assuming we are in same netns.");
+                return 0;
+        }
         if (r < 0)
                 return log_error_errno(r, "Failed to connect socket to udev control socket: %m");
 
@@ -5369,7 +5765,7 @@ static int cant_be_in_netns(void) {
 
 static int run(int argc, char *argv[]) {
         bool remove_directory = false, remove_image = false, veth_created = false, remove_tmprootdir = false;
-        _cleanup_close_ int master = -EBADF;
+        _cleanup_close_ int master = -EBADF, userns_fd = -EBADF;
         _cleanup_fdset_free_ FDSet *fds = NULL;
         int r, n_fd_passed, ret = EXIT_SUCCESS;
         char veth_name[IFNAMSIZ] = "";
@@ -5381,20 +5777,14 @@ static int run(int argc, char *argv[]) {
         _cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL;
         pid_t pid = 0;
 
-        log_parse_environment();
-        log_open();
+        log_setup();
+
+        arg_privileged = getuid() == 0;
 
         r = parse_argv(argc, argv);
         if (r <= 0)
                 goto finish;
 
-        if (geteuid() != 0) {
-                r = log_warning_errno(SYNTHETIC_ERRNO(EPERM),
-                                      argc >= 2 ? "Need to be root." :
-                                      "Need to be root (and some arguments are usually required).\nHint: try --help");
-                goto finish;
-        }
-
         r = cant_be_in_netns();
         if (r < 0)
                 goto finish;
@@ -5407,6 +5797,10 @@ static int run(int argc, char *argv[]) {
         if (r < 0)
                 goto finish;
 
+        r = pick_paths();
+        if (r < 0)
+                goto finish;
+
         r = determine_names();
         if (r < 0)
                 goto finish;
@@ -5421,7 +5815,7 @@ static int run(int argc, char *argv[]) {
         if (!arg_private_network && arg_userns_mode != USER_NAMESPACE_NO && arg_uid_shift > 0)
                 arg_caps_retain &= ~(UINT64_C(1) << CAP_NET_BIND_SERVICE);
 
-        r = cg_unified();
+        r = cg_unified(); /* initialize cache early */
         if (r < 0) {
                 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
                 goto finish;
@@ -5431,6 +5825,10 @@ static int run(int argc, char *argv[]) {
         if (r < 0)
                 goto finish;
 
+        r = resolve_network_interface_names(arg_network_interfaces);
+        if (r < 0)
+                goto finish;
+
         r = verify_network_interfaces_initialized();
         if (r < 0)
                 goto finish;
@@ -5438,6 +5836,16 @@ static int run(int argc, char *argv[]) {
         /* Reapply environment settings. */
         (void) detect_unified_cgroup_hierarchy_from_environment();
 
+        if (!arg_privileged) {
+                r = cg_all_unified();
+                if (r < 0) {
+                        log_error_errno(r, "Failed to determine if we are in unified cgroupv2 mode: %m");
+                        goto finish;
+                }
+                if (r == 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Unprivileged operation only supported in unified cgroupv2 mode.");
+        }
+
         /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
          * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
          * so just turning this off here means we only turn it off in nspawn itself, not any children. */
@@ -5457,9 +5865,21 @@ static int run(int argc, char *argv[]) {
         * the child. Functions like copy_devnodes() change the umask temporarily. */
         umask(0022);
 
+        if (arg_console_mode < 0)
+                arg_console_mode = isatty(STDIN_FILENO) && isatty(STDOUT_FILENO) ?
+                                   CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
+
+        if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
+                arg_quiet = true;
+
         if (arg_directory) {
                 assert(!arg_image);
 
+                if (!arg_privileged) {
+                        r = log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Invoking container from plain directory tree is currently not supported if called without privileges.");
+                        goto finish;
+                }
+
                 /* Safety precaution: let's not allow running images from the live host OS image, as long as
                  * /var from the host will propagate into container dynamically (because bad things happen if
                  * two systems write to the same /var). Let's allow it for the special cases where /var is
@@ -5480,7 +5900,7 @@ static int run(int argc, char *argv[]) {
                         /* If the specified path is a mount point we generate the new snapshot immediately
                          * inside it under a random name. However if the specified is not a mount point we
                          * create the new snapshot in the parent directory, just next to it. */
-                        r = path_is_mount_point(arg_directory, NULL, 0);
+                        r = path_is_mount_point(arg_directory);
                         if (r < 0) {
                                 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
                                 goto finish;
@@ -5496,7 +5916,11 @@ static int run(int argc, char *argv[]) {
 
                         /* We take an exclusive lock on this image, since it's our private, ephemeral copy
                          * only owned by us and no one else. */
-                        r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
+                        r = image_path_lock(
+                                        np,
+                                        LOCK_EX|LOCK_NB,
+                                        arg_privileged ? &tree_global_lock : NULL,
+                                        &tree_local_lock);
                         if (r < 0) {
                                 log_error_errno(r, "Failed to lock %s: %m", np);
                                 goto finish;
@@ -5528,7 +5952,11 @@ static int run(int argc, char *argv[]) {
                         if (r < 0)
                                 goto finish;
 
-                        r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
+                        r = image_path_lock(
+                                        arg_directory,
+                                        (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB,
+                                        arg_privileged ? &tree_global_lock : NULL,
+                                        &tree_local_lock);
                         if (r == -EBUSY) {
                                 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
                                 goto finish;
@@ -5620,15 +6048,12 @@ static int run(int argc, char *argv[]) {
 
         } else {
                 DissectImageFlags dissect_image_flags =
-                        DISSECT_IMAGE_GENERIC_ROOT |
-                        DISSECT_IMAGE_REQUIRE_ROOT |
-                        DISSECT_IMAGE_RELAX_VAR_CHECK |
-                        DISSECT_IMAGE_USR_NO_ROOT |
-                        DISSECT_IMAGE_ADD_PARTITION_DEVICES |
-                        DISSECT_IMAGE_PIN_PARTITION_DEVICES;
+                        determine_dissect_image_flags();
+
                 assert(arg_image);
                 assert(!arg_template);
 
+
                 r = chase_and_update(&arg_image, 0);
                 if (r < 0)
                         goto finish;
@@ -5643,9 +6068,13 @@ static int run(int argc, char *argv[]) {
                         }
 
                         /* Always take an exclusive lock on our own ephemeral copy. */
-                        r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
+                        r = image_path_lock(
+                                        np,
+                                        LOCK_EX|LOCK_NB,
+                                        arg_privileged ? &tree_global_lock : NULL,
+                                        &tree_local_lock);
                         if (r < 0) {
-                                r = log_error_errno(r, "Failed to create image lock: %m");
+                                log_error_errno(r, "Failed to create image lock: %m");
                                 goto finish;
                         }
 
@@ -5668,13 +6097,17 @@ static int run(int argc, char *argv[]) {
                         free_and_replace(arg_image, np);
                         remove_image = true;
                 } else {
-                        r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
+                        r = image_path_lock(
+                                        arg_image,
+                                        (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB,
+                                        arg_privileged ? &tree_global_lock : NULL,
+                                        &tree_local_lock);
                         if (r == -EBUSY) {
-                                r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
+                                log_error_errno(r, "Disk image %s is currently busy.", arg_image);
                                 goto finish;
                         }
                         if (r < 0) {
-                                r = log_error_errno(r, "Failed to create image lock: %m");
+                                log_error_errno(r, "Failed to create image lock: %m");
                                 goto finish;
                         }
 
@@ -5703,56 +6136,80 @@ static int run(int argc, char *argv[]) {
                         goto finish;
                 }
 
-                r = loop_device_make_by_path(
-                                arg_image,
-                                arg_read_only ? O_RDONLY : O_RDWR,
-                                /* sector_size= */ UINT32_MAX,
-                                FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
-                                LOCK_SH,
-                                &loop);
-                if (r < 0) {
-                        log_error_errno(r, "Failed to set up loopback block device: %m");
-                        goto finish;
-                }
+                if (arg_privileged) {
+                        r = loop_device_make_by_path(
+                                        arg_image,
+                                        arg_read_only ? O_RDONLY : O_RDWR,
+                                        /* sector_size= */ UINT32_MAX,
+                                        FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
+                                        LOCK_SH,
+                                        &loop);
+                        if (r < 0) {
+                                log_error_errno(r, "Failed to set up loopback block device: %m");
+                                goto finish;
+                        }
 
-                r = dissect_loop_device_and_warn(
-                                loop,
-                                &arg_verity_settings,
-                                /* mount_options=*/ NULL,
-                                arg_image_policy ?: &image_policy_container,
-                                dissect_image_flags,
-                                &dissected_image);
-                if (r == -ENOPKG) {
-                        /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
-                        log_notice("Note that the disk image needs to\n"
-                                   "    a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
-                                   "    b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
-                                   "    c) or follow https://uapi-group.org/specifications/specs/discoverable_partitions_specification\n"
-                                   "    d) or contain a file system without a partition table\n"
-                                   "in order to be bootable with systemd-nspawn.");
-                        goto finish;
-                }
-                if (r < 0)
-                        goto finish;
+                        r = dissect_loop_device_and_warn(
+                                        loop,
+                                        &arg_verity_settings,
+                                        /* mount_options=*/ NULL,
+                                        arg_image_policy ?: &image_policy_container,
+                                        dissect_image_flags,
+                                        &dissected_image);
+                        if (r == -ENOPKG) {
+                                /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
+                                log_notice("Note that the disk image needs to\n"
+                                           "    a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
+                                           "    b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
+                                           "    c) or follow https://uapi-group.org/specifications/specs/discoverable_partitions_specification\n"
+                                           "    d) or contain a file system without a partition table\n"
+                                           "in order to be bootable with systemd-nspawn.");
+                                goto finish;
+                        }
+                        if (r < 0)
+                                goto finish;
 
-                r = dissected_image_load_verity_sig_partition(
-                                dissected_image,
-                                loop->fd,
-                                &arg_verity_settings);
-                if (r < 0)
-                        goto finish;
+                        r = dissected_image_load_verity_sig_partition(
+                                        dissected_image,
+                                        loop->fd,
+                                        &arg_verity_settings);
+                        if (r < 0)
+                                goto finish;
 
-                if (dissected_image->has_verity && !arg_verity_settings.root_hash && !dissected_image->has_verity_sig)
-                        log_notice("Note: image %s contains verity information, but no root hash specified and no embedded "
-                                   "root hash signature found! Proceeding without integrity checking.", arg_image);
+                        if (dissected_image->has_verity && !arg_verity_settings.root_hash && !dissected_image->has_verity_sig)
+                                log_notice("Note: image %s contains verity information, but no root hash specified and no embedded "
+                                           "root hash signature found! Proceeding without integrity checking.", arg_image);
 
-                r = dissected_image_decrypt_interactively(
-                                dissected_image,
-                                NULL,
-                                &arg_verity_settings,
-                                0);
-                if (r < 0)
-                        goto finish;
+                        r = dissected_image_decrypt_interactively(
+                                        dissected_image,
+                                        NULL,
+                                        &arg_verity_settings,
+                                        dissect_image_flags);
+                        if (r < 0)
+                                goto finish;
+                } else {
+                        _cleanup_free_ char *userns_name = strjoin("nspawn-", arg_machine);
+                        if (!userns_name) {
+                                r = log_oom();
+                                goto finish;
+                        }
+
+                        /* if we are unprivileged, let's allocate a 64K userns first */
+                        userns_fd = nsresource_allocate_userns(userns_name, UINT64_C(0x10000));
+                        if (userns_fd < 0) {
+                                r = log_error_errno(userns_fd, "Failed to allocate user namespace with 64K users: %m");
+                                goto finish;
+                        }
+
+                        r = mountfsd_mount_image(
+                                        arg_image,
+                                        userns_fd,
+                                        arg_image_policy,
+                                        dissect_image_flags,
+                                        &dissected_image);
+                        if (r < 0)
+                                goto finish;
+                }
 
                 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
                 if (remove_image && unlink(arg_image) >= 0)
@@ -5766,19 +6223,20 @@ static int run(int argc, char *argv[]) {
         if (r < 0)
                 goto finish;
 
-        if (arg_console_mode < 0)
-                arg_console_mode =
-                        isatty(STDIN_FILENO) > 0 &&
-                        isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
+        if (!arg_quiet) {
+                const char *t = arg_image ?: arg_directory;
+                _cleanup_free_ char *u = NULL;
+                (void) terminal_urlify_path(t, t, &u);
 
-        if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
-                arg_quiet = true;
+                log_info("%s %sSpawning container %s on %s.%s",
+                         special_glyph(SPECIAL_GLYPH_LIGHT_SHADE), ansi_grey(), arg_machine, u ?: t, ansi_normal());
 
-        if (!arg_quiet)
-                log_info("Spawning container %s on %s.\nPress Ctrl-] three times within 1s to kill container.",
-                         arg_machine, arg_image ?: arg_directory);
+                if (arg_console_mode == CONSOLE_INTERACTIVE)
+                        log_info("%s %sPress %sCtrl-]%s three times within 1s to kill container.%s",
+                                 special_glyph(SPECIAL_GLYPH_LIGHT_SHADE), ansi_grey(), ansi_highlight(), ansi_grey(), ansi_normal());
+        }
 
-        assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, SIGRTMIN+18, -1) >= 0);
+        assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, SIGRTMIN+18) >= 0);
 
         r = make_reaper_process(true);
         if (r < 0) {
@@ -5795,11 +6253,13 @@ static int run(int argc, char *argv[]) {
                 expose_args.fw_ctx = fw_ctx;
         }
         for (;;) {
-                r = run_container(dissected_image,
-                                  fds,
-                                  veth_name, &veth_created,
-                                  &expose_args, &master,
-                                  &pid, &ret);
+                r = run_container(
+                                dissected_image,
+                                userns_fd,
+                                fds,
+                                veth_name, &veth_created,
+                                &expose_args, &master,
+                                &pid, &ret);
                 if (r <= 0)
                         break;
         }
@@ -5841,25 +6301,30 @@ finish:
                         log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
         }
 
-        if (arg_machine) {
+        if (arg_machine && arg_privileged) {
                 const char *p;
 
                 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
                 (void) rm_rf(p, REMOVE_ROOT);
+
+                p = strjoina("/run/systemd/nspawn/unix-export/", arg_machine);
+                (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
+                (void) rmdir(p);
         }
 
         expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET,  &expose_args.address4);
         expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6);
 
-        if (veth_created)
-                (void) remove_veth_links(veth_name, arg_network_veth_extra);
-        (void) remove_bridge(arg_network_zone);
+        if (arg_privileged) {
+                if (veth_created)
+                        (void) remove_veth_links(veth_name, arg_network_veth_extra);
+                (void) remove_bridge(arg_network_zone);
+        }
 
         custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
         expose_port_free_all(arg_expose_ports);
         rlimit_free_all(arg_rlimit);
         device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
-        machine_credential_free_all(arg_credentials, arg_n_credentials);
 
         if (r < 0)
                 return r;
diff --git a/src/nspawn/nspawn.h b/src/nspawn/nspawn.h
index 27fb0b4..556f8ee 100644
--- a/src/nspawn/nspawn.h
+++ b/src/nspawn/nspawn.h
@@ -5,3 +5,4 @@
 
 int userns_lchown(const char *p, uid_t uid, gid_t gid);
 int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid);
+int make_run_host(const char *root);
diff --git a/src/nspawn/test-nspawn-util.c b/src/nspawn/test-nspawn-util.c
index 08c8050..533edde 100644
--- a/src/nspawn/test-nspawn-util.c
+++ b/src/nspawn/test-nspawn-util.c
@@ -8,7 +8,7 @@
 TEST(systemd_installation_has_version) {
         int r;
 
-        FOREACH_STRING(version, "0", "231", STRINGIFY(PROJECT_VERSION), "999") {
+        FOREACH_STRING(version, "0", "231", PROJECT_VERSION_FULL, "999") {
                 r = systemd_installation_has_version(saved_argv[1], version);
                 assert_se(r >= 0);
                 log_info("%s has systemd >= %s: %s",