1 files changed, 252 insertions, 38 deletions
diff --git a/src/basic/namespace-util.c b/src/basic/namespace-util.c
index 2101f61..5b4e43f 100644
--- a/src/basic/namespace-util.c
+++ b/src/basic/namespace-util.c
@@ -11,6 +11,7 @@
 #include "missing_magic.h"
 #include "missing_sched.h"
 #include "namespace-util.h"
+#include "parse-util.h"
 #include "process-util.h"
 #include "stat-util.h"
 #include "stdio-util.h"
@@ -33,71 +34,86 @@ const struct namespace_info namespace_info[] = {
 
 #define pid_namespace_path(pid, type) procfs_file_alloca(pid, namespace_info[type].proc_path)
 
-int namespace_open(pid_t pid, int *pidns_fd, int *mntns_fd, int *netns_fd, int *userns_fd, int *root_fd) {
-        _cleanup_close_ int pidnsfd = -EBADF, mntnsfd = -EBADF, netnsfd = -EBADF, usernsfd = -EBADF;
-        int rfd = -EBADF;
+static NamespaceType clone_flag_to_namespace_type(unsigned long clone_flag) {
+        for (NamespaceType t = 0; t < _NAMESPACE_TYPE_MAX; t++)
+                if (((namespace_info[t].clone_flag ^ clone_flag) & (CLONE_NEWCGROUP|CLONE_NEWIPC|CLONE_NEWNET|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUSER|CLONE_NEWUTS|CLONE_NEWTIME)) == 0)
+                        return t;
+
+        return _NAMESPACE_TYPE_INVALID;
+}
+
+int namespace_open(
+                pid_t pid,
+                int *ret_pidns_fd,
+                int *ret_mntns_fd,
+                int *ret_netns_fd,
+                int *ret_userns_fd,
+                int *ret_root_fd) {
+
+        _cleanup_close_ int pidns_fd = -EBADF, mntns_fd = -EBADF, netns_fd = -EBADF,
+                userns_fd = -EBADF, root_fd = -EBADF;
 
         assert(pid >= 0);
 
-        if (mntns_fd) {
-                const char *mntns;
+        if (ret_pidns_fd) {
+                const char *pidns;
 
-                mntns = pid_namespace_path(pid, NAMESPACE_MOUNT);
-                mntnsfd = open(mntns, O_RDONLY|O_NOCTTY|O_CLOEXEC);
-                if (mntnsfd < 0)
+                pidns = pid_namespace_path(pid, NAMESPACE_PID);
+                pidns_fd = open(pidns, O_RDONLY|O_NOCTTY|O_CLOEXEC);
+                if (pidns_fd < 0)
                         return -errno;
         }
 
-        if (pidns_fd) {
-                const char *pidns;
+        if (ret_mntns_fd) {
+                const char *mntns;
 
-                pidns = pid_namespace_path(pid, NAMESPACE_PID);
-                pidnsfd = open(pidns, O_RDONLY|O_NOCTTY|O_CLOEXEC);
-                if (pidnsfd < 0)
+                mntns = pid_namespace_path(pid, NAMESPACE_MOUNT);
+                mntns_fd = open(mntns, O_RDONLY|O_NOCTTY|O_CLOEXEC);
+                if (mntns_fd < 0)
                         return -errno;
         }
 
-        if (netns_fd) {
+        if (ret_netns_fd) {
                 const char *netns;
 
                 netns = pid_namespace_path(pid, NAMESPACE_NET);
-                netnsfd = open(netns, O_RDONLY|O_NOCTTY|O_CLOEXEC);
-                if (netnsfd < 0)
+                netns_fd = open(netns, O_RDONLY|O_NOCTTY|O_CLOEXEC);
+                if (netns_fd < 0)
                         return -errno;
         }
 
-        if (userns_fd) {
+        if (ret_userns_fd) {
                 const char *userns;
 
                 userns = pid_namespace_path(pid, NAMESPACE_USER);
-                usernsfd = open(userns, O_RDONLY|O_NOCTTY|O_CLOEXEC);
-                if (usernsfd < 0 && errno != ENOENT)
+                userns_fd = open(userns, O_RDONLY|O_NOCTTY|O_CLOEXEC);
+                if (userns_fd < 0 && errno != ENOENT)
                         return -errno;
         }
 
-        if (root_fd) {
+        if (ret_root_fd) {
                 const char *root;
 
                 root = procfs_file_alloca(pid, "root");
-                rfd = open(root, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY);
-                if (rfd < 0)
+                root_fd = open(root, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY);
+                if (root_fd < 0)
                         return -errno;
         }
 
-        if (pidns_fd)
-                *pidns_fd = TAKE_FD(pidnsfd);
+        if (ret_pidns_fd)
+                *ret_pidns_fd = TAKE_FD(pidns_fd);
 
-        if (mntns_fd)
-                *mntns_fd = TAKE_FD(mntnsfd);
+        if (ret_mntns_fd)
+                *ret_mntns_fd = TAKE_FD(mntns_fd);
 
-        if (netns_fd)
-                *netns_fd = TAKE_FD(netnsfd);
+        if (ret_netns_fd)
+                *ret_netns_fd = TAKE_FD(netns_fd);
 
-        if (userns_fd)
-                *userns_fd = TAKE_FD(usernsfd);
+        if (ret_userns_fd)
+                *ret_userns_fd = TAKE_FD(userns_fd);
 
-        if (root_fd)
-                *root_fd = TAKE_FD(rfd);
+        if (ret_root_fd)
+                *ret_root_fd = TAKE_FD(root_fd);
 
         return 0;
 }
@@ -206,6 +222,88 @@ int detach_mount_namespace(void) {
         return 0;
 }
 
+int detach_mount_namespace_harder(uid_t target_uid, gid_t target_gid) {
+        int r;
+
+        /* Tried detach_mount_namespace() first. If that doesn't work due to permissions, opens up an
+         * unprivileged user namespace with a mapping of the originating UID/GID to the specified target
+         * UID/GID. Then, tries detach_mount_namespace() again.
+         *
+         * Or in other words: tries much harder to get a mount namespace, making use of unprivileged user
+         * namespaces if need be.
+         *
+         * Note that after this function completed:
+         *
+         *    → if we had privs, afterwards uids/gids on files and processes are as before
+         *
+         *    → if we had no privs, our own id and all our files will show up owned by target_uid/target_gid,
+         *    and everything else owned by nobody.
+         *
+         * Yes, that's quite a difference. */
+
+        if (!uid_is_valid(target_uid))
+                return -EINVAL;
+        if (!gid_is_valid(target_gid))
+                return -EINVAL;
+
+        r = detach_mount_namespace();
+        if (r != -EPERM)
+                return r;
+
+        if (unshare(CLONE_NEWUSER) < 0)
+                return log_debug_errno(errno, "Failed to acquire user namespace: %m");
+
+        r = write_string_filef("/proc/self/uid_map", 0,
+                               UID_FMT " " UID_FMT " 1\n", target_uid, getuid());
+        if (r < 0)
+                return log_debug_errno(r, "Failed to write uid map: %m");
+
+        r = write_string_file("/proc/self/setgroups", "deny", 0);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to write setgroups file: %m");
+
+        r = write_string_filef("/proc/self/gid_map", 0,
+                               GID_FMT " " GID_FMT " 1\n", target_gid, getgid());
+        if (r < 0)
+                return log_debug_errno(r, "Failed to write gid map: %m");
+
+        return detach_mount_namespace();
+}
+
+int detach_mount_namespace_userns(int userns_fd) {
+        int r;
+
+        assert(userns_fd >= 0);
+
+        if (setns(userns_fd, CLONE_NEWUSER) < 0)
+                return log_debug_errno(errno, "Failed to join user namespace: %m");
+
+        r = reset_uid_gid();
+        if (r < 0)
+                return log_debug_errno(r, "Failed to become root in user namespace: %m");
+
+        return detach_mount_namespace();
+}
+
+int userns_acquire_empty(void) {
+        _cleanup_(sigkill_waitp) pid_t pid = 0;
+        _cleanup_close_ int userns_fd = -EBADF;
+        int r;
+
+        r = safe_fork("(sd-mkuserns)", FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGKILL|FORK_NEW_USERNS, &pid);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                /* Child. We do nothing here, just freeze until somebody kills us. */
+                freeze();
+
+        r = namespace_open(pid, NULL, NULL, NULL, &userns_fd, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to open userns fd: %m");
+
+        return TAKE_FD(userns_fd);
+}
+
 int userns_acquire(const char *uid_map, const char *gid_map) {
         char path[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(pid_t) + 1];
         _cleanup_(sigkill_waitp) pid_t pid = 0;
@@ -221,7 +319,7 @@ int userns_acquire(const char *uid_map, const char *gid_map) {
 
         r = safe_fork("(sd-mkuserns)", FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGKILL|FORK_NEW_USERNS, &pid);
         if (r < 0)
-                return r;
+                return log_debug_errno(r, "Failed to fork process (sd-mkuserns): %m");
         if (r == 0)
                 /* Child. We do nothing here, just freeze until somebody kills us. */
                 freeze();
@@ -229,19 +327,50 @@ int userns_acquire(const char *uid_map, const char *gid_map) {
         xsprintf(path, "/proc/" PID_FMT "/uid_map", pid);
         r = write_string_file(path, uid_map, WRITE_STRING_FILE_DISABLE_BUFFER);
         if (r < 0)
-                return log_error_errno(r, "Failed to write UID map: %m");
+                return log_debug_errno(r, "Failed to write UID map: %m");
 
         xsprintf(path, "/proc/" PID_FMT "/gid_map", pid);
         r = write_string_file(path, gid_map, WRITE_STRING_FILE_DISABLE_BUFFER);
         if (r < 0)
-                return log_error_errno(r, "Failed to write GID map: %m");
-
-        r = namespace_open(pid, NULL, NULL, NULL, &userns_fd, NULL);
+                return log_debug_errno(r, "Failed to write GID map: %m");
+
+        r = namespace_open(pid,
+                           /* ret_pidns_fd = */ NULL,
+                           /* ret_mntns_fd = */ NULL,
+                           /* ret_netns_fd = */ NULL,
+                           &userns_fd,
+                           /* ret_root_fd = */ NULL);
         if (r < 0)
-                return log_error_errno(r, "Failed to open userns fd: %m");
+                return log_debug_errno(r, "Failed to open userns fd: %m");
 
         return TAKE_FD(userns_fd);
+}
 
+int netns_acquire(void) {
+        _cleanup_(sigkill_waitp) pid_t pid = 0;
+        _cleanup_close_ int netns_fd = -EBADF;
+        int r;
+
+        /* Forks off a process in a new network namespace, acquires a network namespace fd, and then kills
+         * the process again. This way we have a netns fd that is not bound to any process. */
+
+        r = safe_fork("(sd-mknetns)", FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGKILL|FORK_NEW_NETNS, &pid);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to fork process (sd-mknetns): %m");
+        if (r == 0)
+                /* Child. We do nothing here, just freeze until somebody kills us. */
+                freeze();
+
+        r = namespace_open(pid,
+                           /* ret_pidns_fd = */ NULL,
+                           /* ret_mntns_fd = */ NULL,
+                           &netns_fd,
+                           /* ret_userns_fd = */ NULL,
+                           /* ret_root_fd = */ NULL);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to open netns fd: %m");
+
+        return TAKE_FD(netns_fd);
 }
 
 int in_same_namespace(pid_t pid1, pid_t pid2, NamespaceType type) {
@@ -267,3 +396,88 @@ int in_same_namespace(pid_t pid1, pid_t pid2, NamespaceType type) {
 
         return stat_inode_same(&ns_st1, &ns_st2);
 }
+
+int parse_userns_uid_range(const char *s, uid_t *ret_uid_shift, uid_t *ret_uid_range) {
+        _cleanup_free_ char *buffer = NULL;
+        const char *range, *shift;
+        int r;
+        uid_t uid_shift, uid_range = 65536;
+
+        assert(s);
+
+        range = strchr(s, ':');
+        if (range) {
+                buffer = strndup(s, range - s);
+                if (!buffer)
+                        return log_oom();
+                shift = buffer;
+
+                range++;
+                r = safe_atou32(range, &uid_range);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
+        } else
+                shift = s;
+
+        r = parse_uid(shift, &uid_shift);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse UID \"%s\": %m", s);
+
+        if (!userns_shift_range_valid(uid_shift, uid_range))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID range cannot be empty or go beyond " UID_FMT ".", UID_INVALID);
+
+        if (ret_uid_shift)
+                *ret_uid_shift = uid_shift;
+
+        if (ret_uid_range)
+                *ret_uid_range = uid_range;
+
+        return 0;
+}
+
+int namespace_open_by_type(NamespaceType type) {
+        const char *p;
+        int fd;
+
+        assert(type >= 0);
+        assert(type < _NAMESPACE_TYPE_MAX);
+
+        p = pid_namespace_path(0, type);
+
+        fd = RET_NERRNO(open(p, O_RDONLY|O_NOCTTY|O_CLOEXEC));
+        if (fd == -ENOENT && proc_mounted() == 0)
+                return -ENOSYS;
+
+        return fd;
+}
+
+int is_our_namespace(int fd, NamespaceType request_type) {
+        int clone_flag;
+
+        assert(fd >= 0);
+
+        clone_flag = ioctl(fd, NS_GET_NSTYPE);
+        if (clone_flag < 0)
+                return -errno;
+
+        NamespaceType found_type = clone_flag_to_namespace_type(clone_flag);
+        if (found_type < 0)
+                return -EBADF; /* Uh? Unknown namespace type? */
+
+        if (request_type >= 0 && request_type != found_type) /* It's a namespace, but not of the right type? */
+                return -EUCLEAN;
+
+        struct stat st_fd, st_ours;
+        if (fstat(fd, &st_fd) < 0)
+                return -errno;
+
+        const char *p = pid_namespace_path(0, found_type);
+        if (stat(p, &st_ours) < 0) {
+                if (errno == ENOENT)
+                        return proc_mounted() == 0 ? -ENOSYS : -ENOENT;
+
+                return -errno;
+        }
+
+        return stat_inode_same(&st_ours, &st_fd);
+}