diff options
Diffstat (limited to 'sys-utils/unshare.c')
-rw-r--r-- | sys-utils/unshare.c | 1134 |
1 files changed, 1134 insertions, 0 deletions
diff --git a/sys-utils/unshare.c b/sys-utils/unshare.c new file mode 100644 index 0000000..29fad71 --- /dev/null +++ b/sys-utils/unshare.c @@ -0,0 +1,1134 @@ +/* + * unshare(1) - command-line interface for unshare(2) + * + * Copyright (C) 2009 Mikhail Gusarov <dottedmag@dottedmag.net> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <errno.h> +#include <getopt.h> +#include <poll.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/eventfd.h> +#include <sys/wait.h> +#include <sys/mount.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/prctl.h> +#include <grp.h> + +/* we only need some defines missing in sys/mount.h, no libmount linkage */ +#include <libmount.h> + +#include "nls.h" +#include "c.h" +#include "caputils.h" +#include "closestream.h" +#include "namespace.h" +#include "pidfd-utils.h" +#include "exec_shell.h" +#include "xalloc.h" +#include "pathnames.h" +#include "all-io.h" +#include "signames.h" +#include "strutils.h" +#include "pwdutils.h" + +/* synchronize parent and child by pipe */ +#define PIPE_SYNC_BYTE 0x06 + +/* 'private' is kernel default */ +#define UNSHARE_PROPAGATION_DEFAULT (MS_REC | MS_PRIVATE) + +/* /proc namespace files and mountpoints for binds */ +static struct namespace_file { + int type; /* CLONE_NEW* */ + const char *name; /* ns/<type> */ + const char *target; /* user specified target for bind mount */ +} namespace_files[] = { + { .type = CLONE_NEWUSER, .name = "ns/user" }, + { .type = CLONE_NEWCGROUP,.name = "ns/cgroup" }, + { .type = CLONE_NEWIPC, .name = "ns/ipc" }, + { .type = CLONE_NEWUTS, .name = "ns/uts" }, + { .type = CLONE_NEWNET, .name = "ns/net" }, + { .type = CLONE_NEWPID, .name = "ns/pid_for_children" }, + { .type = CLONE_NEWNS, .name = "ns/mnt" }, + { .type = CLONE_NEWTIME, .name = "ns/time_for_children" }, + { .name = NULL } +}; + +static int npersists; /* number of persistent namespaces */ + +enum { + SETGROUPS_NONE = -1, + SETGROUPS_DENY = 0, + SETGROUPS_ALLOW = 1, +}; + +static const char *setgroups_strings[] = +{ + [SETGROUPS_DENY] = "deny", + [SETGROUPS_ALLOW] = "allow" +}; + +static int setgroups_str2id(const char *str) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(setgroups_strings); i++) + if (strcmp(str, setgroups_strings[i]) == 0) + return i; + + errx(EXIT_FAILURE, _("unsupported --setgroups argument '%s'"), str); +} + +static void setgroups_control(int action) +{ + const char *file = _PATH_PROC_SETGROUPS; + const char *cmd; + int fd; + + if (action < 0 || (size_t) action >= ARRAY_SIZE(setgroups_strings)) + return; + cmd = setgroups_strings[action]; + + fd = open(file, O_WRONLY); + if (fd < 0) { + if (errno == ENOENT) + return; + err(EXIT_FAILURE, _("cannot open %s"), file); + } + + if (write_all(fd, cmd, strlen(cmd))) + err(EXIT_FAILURE, _("write failed %s"), file); + close(fd); +} + +static void map_id(const char *file, uint32_t from, uint32_t to) +{ + char *buf; + int fd; + + fd = open(file, O_WRONLY); + if (fd < 0) + err(EXIT_FAILURE, _("cannot open %s"), file); + + xasprintf(&buf, "%u %u 1", from, to); + if (write_all(fd, buf, strlen(buf))) + err(EXIT_FAILURE, _("write failed %s"), file); + free(buf); + close(fd); +} + +static unsigned long parse_propagation(const char *str) +{ + size_t i; + static const struct prop_opts { + const char *name; + unsigned long flag; + } opts[] = { + { "slave", MS_REC | MS_SLAVE }, + { "private", MS_REC | MS_PRIVATE }, + { "shared", MS_REC | MS_SHARED }, + { "unchanged", 0 } + }; + + for (i = 0; i < ARRAY_SIZE(opts); i++) { + if (strcmp(opts[i].name, str) == 0) + return opts[i].flag; + } + + errx(EXIT_FAILURE, _("unsupported propagation mode: %s"), str); +} + +static void set_propagation(unsigned long flags) +{ + if (flags == 0) + return; + + if (mount("none", "/", NULL, flags, NULL) != 0) + err(EXIT_FAILURE, _("cannot change root filesystem propagation")); +} + + +static int set_ns_target(int type, const char *path) +{ + struct namespace_file *ns; + + for (ns = namespace_files; ns->name; ns++) { + if (ns->type != type) + continue; + ns->target = path; + npersists++; + return 0; + } + + return -EINVAL; +} + +static int bind_ns_files(pid_t pid) +{ + struct namespace_file *ns; + char src[PATH_MAX]; + + for (ns = namespace_files; ns->name; ns++) { + if (!ns->target) + continue; + + snprintf(src, sizeof(src), "/proc/%u/%s", (unsigned) pid, ns->name); + + if (mount(src, ns->target, NULL, MS_BIND, NULL) != 0) + err(EXIT_FAILURE, _("mount %s on %s failed"), src, ns->target); + } + + return 0; +} + +static ino_t get_mnt_ino(pid_t pid) +{ + struct stat st; + char path[PATH_MAX]; + + snprintf(path, sizeof(path), "/proc/%u/ns/mnt", (unsigned) pid); + + if (stat(path, &st) != 0) + err(EXIT_FAILURE, _("stat of %s failed"), path); + return st.st_ino; +} + +static void settime(time_t offset, clockid_t clk_id) +{ + char buf[sizeof(stringify_value(ULONG_MAX)) * 3]; + int fd, len; + + len = snprintf(buf, sizeof(buf), "%d %" PRId64 " 0", clk_id, (int64_t) offset); + + fd = open("/proc/self/timens_offsets", O_WRONLY); + if (fd < 0) + err(EXIT_FAILURE, _("failed to open /proc/self/timens_offsets")); + + if (write(fd, buf, len) != len) + err(EXIT_FAILURE, _("failed to write to /proc/self/timens_offsets")); + + close(fd); +} + +/** + * waitchild() - Wait for a process to exit successfully + * @pid: PID of the process to wait for + * + * Wait for a process to exit successfully. If it exits with a non-zero return + * code, then exit() with the same status. + */ +static void waitchild(int pid) +{ + int rc, status; + + do { + rc = waitpid(pid, &status, 0); + if (rc < 0) { + if (errno == EINTR) + continue; + err(EXIT_FAILURE, _("waitpid failed")); + } + if (WIFEXITED(status) && + WEXITSTATUS(status) != EXIT_SUCCESS) + exit(WEXITSTATUS(status)); + } while (rc < 0); +} + +/** + * sync_with_child() - Tell our child we're ready and wait for it to exit + * @pid: The pid of our child + * @fd: A file descriptor created with eventfd() + * + * This tells a child created with fork_and_wait() that we are ready for it to + * continue. Once we have done that, wait for our child to exit. + */ +static void sync_with_child(pid_t pid, int fd) +{ + uint64_t ch = PIPE_SYNC_BYTE; + + write_all(fd, &ch, sizeof(ch)); + close(fd); + + waitchild(pid); +} + +/** + * fork_and_wait() - Fork and wait to be sync'd with + * @fd - A file descriptor created with eventfd() which should be passed to + * sync_with_child() + * + * This creates an eventfd and forks. The parent process returns immediately, + * but the child waits for a %PIPE_SYNC_BYTE on the eventfd before returning. + * This allows the parent to perform some tasks before the child starts its + * work. The parent should call sync_with_child() once it is ready for the + * child to continue. + * + * Return: The pid from fork() + */ +static pid_t fork_and_wait(int *fd) +{ + pid_t pid; + uint64_t ch; + + *fd = eventfd(0, 0); + if (*fd < 0) + err(EXIT_FAILURE, _("eventfd failed")); + + pid = fork(); + if (pid < 0) + err(EXIT_FAILURE, _("fork failed")); + + if (!pid) { + /* wait for the our parent to tell us to continue */ + if (read_all(*fd, (char *)&ch, sizeof(ch)) != sizeof(ch) || + ch != PIPE_SYNC_BYTE) + err(EXIT_FAILURE, _("failed to read eventfd")); + close(*fd); + } + + return pid; +} + +static pid_t bind_ns_files_from_child(int *fd) +{ + pid_t child, ppid = getpid(); + ino_t ino = get_mnt_ino(ppid); + + child = fork_and_wait(fd); + if (child) + return child; + + if (get_mnt_ino(ppid) == ino) + exit(EXIT_FAILURE); + bind_ns_files(ppid); + exit(EXIT_SUCCESS); +} + +static uid_t get_user(const char *s, const char *err) +{ + struct passwd *pw; + char *buf = NULL; + uid_t ret; + + pw = xgetpwnam(s, &buf); + if (pw) { + ret = pw->pw_uid; + free(pw); + free(buf); + } else { + ret = strtoul_or_err(s, err); + } + + return ret; +} + +static gid_t get_group(const char *s, const char *err) +{ + struct group *gr; + char *buf = NULL; + gid_t ret; + + gr = xgetgrnam(s, &buf); + if (gr) { + ret = gr->gr_gid; + free(gr); + free(buf); + } else { + ret = strtoul_or_err(s, err); + } + + return ret; +} + +/** + * struct map_range - A range of IDs to map + * @outer: First ID mapped on the outside of the namespace + * @inner: First ID mapped on the inside of the namespace + * @count: Length of the inside and outside ranges + * + * A range of uids/gids to map using new[gu]idmap. + */ +struct map_range { + unsigned int outer; + unsigned int inner; + unsigned int count; +}; + +#define UID_BUFSIZ sizeof(stringify_value(ULONG_MAX)) + +/** + * get_map_range() - Parse a mapping range from a string + * @s: A string of the format inner:outer:count or outer,inner,count + * + * Parse a string of the form inner:outer:count or outer,inner,count into + * a new mapping range. + * + * Return: A new &struct map_range + */ +static struct map_range *get_map_range(const char *s) +{ + int end; + struct map_range *ret; + + ret = xmalloc(sizeof(*ret)); + + if (sscanf(s, "%u:%u:%u%n", &ret->inner, &ret->outer, &ret->count, + &end) >= 3 && !s[end]) + return ret; /* inner:outer:count */ + + if (sscanf(s, "%u,%u,%u%n", &ret->outer, &ret->inner, &ret->count, + &end) >= 3 && !s[end]) + return ret; /* outer,inner,count */ + + errx(EXIT_FAILURE, _("invalid mapping '%s'"), s); +} + +/** + * read_subid_range() - Look up a user's sub[gu]id range + * @filename: The file to look up the range from. This should be either + * ``/etc/subuid`` or ``/etc/subgid``. + * @uid: The uid of the user whose range we should look up. + * + * This finds the first subid range matching @uid in @filename. + */ +static struct map_range *read_subid_range(char *filename, uid_t uid) +{ + char *line = NULL, *pwbuf; + FILE *idmap; + size_t n = 0; + struct passwd *pw; + struct map_range *map; + + map = xmalloc(sizeof(*map)); + map->inner = -1; + + pw = xgetpwuid(uid, &pwbuf); + if (!pw) + errx(EXIT_FAILURE, _("you (user %d) don't exist."), uid); + + idmap = fopen(filename, "r"); + if (!idmap) + err(EXIT_FAILURE, _("could not open '%s'"), filename); + + /* + * Each line in sub[ug]idmap looks like + * username:subuid:count + * OR + * uid:subuid:count + */ + while (getline(&line, &n, idmap) != -1) { + char *rest, *s; + + rest = strchr(line, ':'); + if (!rest) + continue; + *rest = '\0'; + + if (strcmp(line, pw->pw_name) && + strtoul(line, NULL, 10) != pw->pw_uid) + continue; + + s = rest + 1; + rest = strchr(s, ':'); + if (!rest) + continue; + *rest = '\0'; + map->outer = strtoul_or_err(s, _("failed to parse subid map")); + + s = rest + 1; + rest = strchr(s, '\n'); + if (rest) + *rest = '\0'; + map->count = strtoul_or_err(s, _("failed to parse subid map")); + + fclose(idmap); + free(pw); + free(pwbuf); + + return map; + } + + errx(EXIT_FAILURE, _("no line matching user \"%s\" in %s"), + pw->pw_name, filename); +} + +/** + * map_ids() - Create a new uid/gid map + * @idmapper: Either newuidmap or newgidmap + * @ppid: Pid to set the map for + * @outer: ID outside the namespace for a single map. + * @inner: ID inside the namespace for a single map. May be -1 to only use @map. + * @map: A range of IDs to map + * + * This creates a new uid/gid map for @ppid using @idmapper. The ID @outer in + * the parent (our) namespace is mapped to the ID @inner in the child (@ppid's) + * namespace. In addition, the range of IDs beginning at @map->outer is mapped + * to the range of IDs beginning at @map->inner. The tricky bit is that we + * cannot let these mappings overlap. We accomplish this by removing a "hole" + * from @map, if @outer or @inner overlap it. This may result in one less than + * @map->count IDs being mapped from @map. The unmapped IDs are always the + * topmost IDs of the mapping (either in the parent or the child namespace). + * + * Most of the time, this function will be called with @map->outer as some + * large ID, @map->inner as 0, and @map->count as a large number (at least + * 1000, but less than @map->outer). Typically, there will be no conflict with + * @outer. However, @inner may split the mapping for e.g. --map-current-user. + * + * This function always exec()s or errors out and does not return. + */ +static void __attribute__((__noreturn__)) +map_ids(const char *idmapper, int ppid, unsigned int outer, unsigned int inner, + struct map_range *map) +{ + /* idmapper + pid + 4 * map + NULL */ + char *argv[15]; + /* argv - idmapper - "1" - NULL */ + char args[12][UID_BUFSIZ]; + int i = 0, j = 0; + struct map_range lo, mid, hi; + unsigned int inner_offset, outer_offset; + + /* Some helper macros to reduce bookkeeping */ +#define push_str(s) do { \ + argv[i++] = s; \ +} while (0) +#define push_ul(x) do { \ + snprintf(args[j], sizeof(args[j]), "%u", x); \ + push_str(args[j++]); \ +} while (0) + + push_str(xstrdup(idmapper)); + push_ul(ppid); + if ((int)inner == -1) { + /* + * If we don't have a "single" mapping, then we can just use map + * directly, starting inner IDs from zero for an auto mapping + */ + push_ul(map->inner + 1 ? map->inner : 0); + push_ul(map->outer); + push_ul(map->count); + push_str(NULL); + + execvp(idmapper, argv); + errexec(idmapper); + } + + /* + * Start inner IDs from zero for an auto mapping; otherwise, if the two + * fixed mappings overlap, remove an ID from map + */ + if (map->inner + 1 == 0) + map->inner = 0; + else if ((outer >= map->outer && outer <= map->outer + map->count) || + (inner >= map->inner && inner <= map->inner + map->count)) + map->count--; + + /* Determine where the splits between lo, mid, and hi will be */ + outer_offset = min(outer > map->outer ? outer - map->outer : 0, + map->count); + inner_offset = min(inner > map->inner ? inner - map->inner : 0, + map->count); + + /* + * In the worst case, we need three mappings: + * From the bottom of map to either inner or outer + */ + lo.outer = map->outer; + lo.inner = map->inner; + lo.count = min(inner_offset, outer_offset); + + /* From the lower of inner or outer to the higher */ + mid.outer = lo.outer + lo.count; + mid.outer += mid.outer == outer; + mid.inner = lo.inner + lo.count; + mid.inner += mid.inner == inner; + mid.count = abs_diff(outer_offset, inner_offset); + + /* And from the higher of inner or outer to the end of the map */ + hi.outer = mid.outer + mid.count; + hi.outer += hi.outer == outer; + hi.inner = mid.inner + mid.count; + hi.inner += hi.inner == inner; + hi.count = map->count - lo.count - mid.count; + + push_ul(inner); + push_ul(outer); + push_str("1"); + /* new[gu]idmap doesn't like zero-length mappings, so skip them */ + if (lo.count) { + push_ul(lo.inner); + push_ul(lo.outer); + push_ul(lo.count); + } + if (mid.count) { + push_ul(mid.inner); + push_ul(mid.outer); + push_ul(mid.count); + } + if (hi.count) { + push_ul(hi.inner); + push_ul(hi.outer); + push_ul(hi.count); + } + push_str(NULL); + execvp(idmapper, argv); + errexec(idmapper); +} + +/** + * map_ids_from_child() - Set up a new uid/gid map + * @fd: The eventfd to wait on + * @mapuser: The user to map the current user to (or -1) + * @usermap: The range of UIDs to map (or %NULL) + * @mapgroup: The group to map the current group to (or -1) + * @groupmap: The range of GIDs to map (or %NULL) + * + * fork_and_wait() for our parent to call sync_with_child() on @fd. Upon + * recieving the go-ahead, use newuidmap and newgidmap to set the uid/gid map + * for our parent's PID. + * + * Return: The pid of the child. + */ +static pid_t map_ids_from_child(int *fd, uid_t mapuser, + struct map_range *usermap, gid_t mapgroup, + struct map_range *groupmap) +{ + pid_t child, pid = 0; + pid_t ppid = getpid(); + + child = fork_and_wait(fd); + if (child) + return child; + + /* Avoid forking more than we need to */ + if (usermap && groupmap) { + pid = fork(); + if (pid < 0) + err(EXIT_FAILURE, _("fork failed")); + if (pid) + waitchild(pid); + } + + if (!pid && usermap) + map_ids("newuidmap", ppid, geteuid(), mapuser, usermap); + if (groupmap) + map_ids("newgidmap", ppid, getegid(), mapgroup, groupmap); + exit(EXIT_SUCCESS); +} + +static void __attribute__((__noreturn__)) usage(void) +{ + FILE *out = stdout; + + fputs(USAGE_HEADER, out); + fprintf(out, _(" %s [options] [<program> [<argument>...]]\n"), + program_invocation_short_name); + + fputs(USAGE_SEPARATOR, out); + fputs(_("Run a program with some namespaces unshared from the parent.\n"), out); + + fputs(USAGE_OPTIONS, out); + fputs(_(" -m, --mount[=<file>] unshare mounts namespace\n"), out); + fputs(_(" -u, --uts[=<file>] unshare UTS namespace (hostname etc)\n"), out); + fputs(_(" -i, --ipc[=<file>] unshare System V IPC namespace\n"), out); + fputs(_(" -n, --net[=<file>] unshare network namespace\n"), out); + fputs(_(" -p, --pid[=<file>] unshare pid namespace\n"), out); + fputs(_(" -U, --user[=<file>] unshare user namespace\n"), out); + fputs(_(" -C, --cgroup[=<file>] unshare cgroup namespace\n"), out); + fputs(_(" -T, --time[=<file>] unshare time namespace\n"), out); + fputs(USAGE_SEPARATOR, out); + fputs(_(" -f, --fork fork before launching <program>\n"), out); + fputs(_(" --map-user=<uid>|<name> map current user to uid (implies --user)\n"), out); + fputs(_(" --map-group=<gid>|<name> map current group to gid (implies --user)\n"), out); + fputs(_(" -r, --map-root-user map current user to root (implies --user)\n"), out); + fputs(_(" -c, --map-current-user map current user to itself (implies --user)\n"), out); + fputs(_(" --map-auto map users and groups automatically (implies --user)\n"), out); + fputs(_(" --map-users=<inneruid>:<outeruid>:<count>\n" + " map count users from outeruid to inneruid (implies --user)\n"), out); + fputs(_(" --map-groups=<innergid>:<outergid>:<count>\n" + " map count groups from outergid to innergid (implies --user)\n"), out); + fputs(USAGE_SEPARATOR, out); + fputs(_(" --kill-child[=<signame>] when dying, kill the forked child (implies --fork)\n" + " defaults to SIGKILL\n"), out); + fputs(_(" --mount-proc[=<dir>] mount proc filesystem first (implies --mount)\n"), out); + fputs(_(" --propagation slave|shared|private|unchanged\n" + " modify mount propagation in mount namespace\n"), out); + fputs(_(" --setgroups allow|deny control the setgroups syscall in user namespaces\n"), out); + fputs(_(" --keep-caps retain capabilities granted in user namespaces\n"), out); + fputs(USAGE_SEPARATOR, out); + fputs(_(" -R, --root=<dir> run the command with root directory set to <dir>\n"), out); + fputs(_(" -w, --wd=<dir> change working directory to <dir>\n"), out); + fputs(_(" -S, --setuid <uid> set uid in entered namespace\n"), out); + fputs(_(" -G, --setgid <gid> set gid in entered namespace\n"), out); + fputs(_(" --monotonic <offset> set clock monotonic offset (seconds) in time namespaces\n"), out); + fputs(_(" --boottime <offset> set clock boottime offset (seconds) in time namespaces\n"), out); + + fputs(USAGE_SEPARATOR, out); + printf(USAGE_HELP_OPTIONS(27)); + printf(USAGE_MAN_TAIL("unshare(1)")); + + exit(EXIT_SUCCESS); +} + +int main(int argc, char *argv[]) +{ + enum { + OPT_MOUNTPROC = CHAR_MAX + 1, + OPT_PROPAGATION, + OPT_SETGROUPS, + OPT_KILLCHILD, + OPT_KEEPCAPS, + OPT_MONOTONIC, + OPT_BOOTTIME, + OPT_MAPUSER, + OPT_MAPUSERS, + OPT_MAPGROUP, + OPT_MAPGROUPS, + OPT_MAPAUTO, + }; + static const struct option longopts[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, 'V' }, + + { "mount", optional_argument, NULL, 'm' }, + { "uts", optional_argument, NULL, 'u' }, + { "ipc", optional_argument, NULL, 'i' }, + { "net", optional_argument, NULL, 'n' }, + { "pid", optional_argument, NULL, 'p' }, + { "user", optional_argument, NULL, 'U' }, + { "cgroup", optional_argument, NULL, 'C' }, + { "time", optional_argument, NULL, 'T' }, + + { "fork", no_argument, NULL, 'f' }, + { "kill-child", optional_argument, NULL, OPT_KILLCHILD }, + { "mount-proc", optional_argument, NULL, OPT_MOUNTPROC }, + { "map-user", required_argument, NULL, OPT_MAPUSER }, + { "map-users", required_argument, NULL, OPT_MAPUSERS }, + { "map-group", required_argument, NULL, OPT_MAPGROUP }, + { "map-groups", required_argument, NULL, OPT_MAPGROUPS }, + { "map-root-user", no_argument, NULL, 'r' }, + { "map-current-user", no_argument, NULL, 'c' }, + { "map-auto", no_argument, NULL, OPT_MAPAUTO }, + { "propagation", required_argument, NULL, OPT_PROPAGATION }, + { "setgroups", required_argument, NULL, OPT_SETGROUPS }, + { "keep-caps", no_argument, NULL, OPT_KEEPCAPS }, + { "setuid", required_argument, NULL, 'S' }, + { "setgid", required_argument, NULL, 'G' }, + { "root", required_argument, NULL, 'R' }, + { "wd", required_argument, NULL, 'w' }, + { "monotonic", required_argument, NULL, OPT_MONOTONIC }, + { "boottime", required_argument, NULL, OPT_BOOTTIME }, + { NULL, 0, NULL, 0 } + }; + + int setgrpcmd = SETGROUPS_NONE; + int unshare_flags = 0; + int c, forkit = 0; + uid_t mapuser = -1; + gid_t mapgroup = -1; + struct map_range *usermap = NULL; + struct map_range *groupmap = NULL; + int kill_child_signo = 0; /* 0 means --kill-child was not used */ + const char *procmnt = NULL; + const char *newroot = NULL; + const char *newdir = NULL; + pid_t pid_bind = 0, pid_idmap = 0; + pid_t pid = 0; +#ifdef UL_HAVE_PIDFD + int fd_parent_pid = -1; +#endif + int fd_idmap, fd_bind = -1; + sigset_t sigset, oldsigset; + int status; + unsigned long propagation = UNSHARE_PROPAGATION_DEFAULT; + int force_uid = 0, force_gid = 0; + uid_t uid = 0, real_euid = geteuid(); + gid_t gid = 0, real_egid = getegid(); + int keepcaps = 0; + time_t monotonic = 0; + time_t boottime = 0; + int force_monotonic = 0; + int force_boottime = 0; + + setlocale(LC_ALL, ""); + bindtextdomain(PACKAGE, LOCALEDIR); + textdomain(PACKAGE); + close_stdout_atexit(); + + while ((c = getopt_long(argc, argv, "+fhVmuinpCTUrR:w:S:G:c", longopts, NULL)) != -1) { + switch (c) { + case 'f': + forkit = 1; + break; + case 'm': + unshare_flags |= CLONE_NEWNS; + if (optarg) + set_ns_target(CLONE_NEWNS, optarg); + break; + case 'u': + unshare_flags |= CLONE_NEWUTS; + if (optarg) + set_ns_target(CLONE_NEWUTS, optarg); + break; + case 'i': + unshare_flags |= CLONE_NEWIPC; + if (optarg) + set_ns_target(CLONE_NEWIPC, optarg); + break; + case 'n': + unshare_flags |= CLONE_NEWNET; + if (optarg) + set_ns_target(CLONE_NEWNET, optarg); + break; + case 'p': + unshare_flags |= CLONE_NEWPID; + if (optarg) + set_ns_target(CLONE_NEWPID, optarg); + break; + case 'U': + unshare_flags |= CLONE_NEWUSER; + if (optarg) + set_ns_target(CLONE_NEWUSER, optarg); + break; + case 'C': + unshare_flags |= CLONE_NEWCGROUP; + if (optarg) + set_ns_target(CLONE_NEWCGROUP, optarg); + break; + case 'T': + unshare_flags |= CLONE_NEWTIME; + if (optarg) + set_ns_target(CLONE_NEWTIME, optarg); + break; + case OPT_MOUNTPROC: + unshare_flags |= CLONE_NEWNS; + procmnt = optarg ? optarg : "/proc"; + break; + case OPT_MAPUSER: + unshare_flags |= CLONE_NEWUSER; + mapuser = get_user(optarg, _("failed to parse uid")); + break; + case OPT_MAPGROUP: + unshare_flags |= CLONE_NEWUSER; + mapgroup = get_group(optarg, _("failed to parse gid")); + break; + case 'r': + unshare_flags |= CLONE_NEWUSER; + mapuser = 0; + mapgroup = 0; + break; + case 'c': + unshare_flags |= CLONE_NEWUSER; + mapuser = real_euid; + mapgroup = real_egid; + break; + case OPT_MAPUSERS: + unshare_flags |= CLONE_NEWUSER; + if (!strcmp(optarg, "auto")) + usermap = read_subid_range(_PATH_SUBUID, real_euid); + else + usermap = get_map_range(optarg); + break; + case OPT_MAPGROUPS: + unshare_flags |= CLONE_NEWUSER; + if (!strcmp(optarg, "auto")) + groupmap = read_subid_range(_PATH_SUBGID, real_euid); + else + groupmap = get_map_range(optarg); + break; + case OPT_MAPAUTO: + unshare_flags |= CLONE_NEWUSER; + usermap = read_subid_range(_PATH_SUBUID, real_euid); + groupmap = read_subid_range(_PATH_SUBGID, real_euid); + break; + case OPT_SETGROUPS: + setgrpcmd = setgroups_str2id(optarg); + break; + case OPT_PROPAGATION: + propagation = parse_propagation(optarg); + break; + case OPT_KILLCHILD: + forkit = 1; + if (optarg) { + if ((kill_child_signo = signame_to_signum(optarg)) < 0) + errx(EXIT_FAILURE, _("unknown signal: %s"), + optarg); + } else { + kill_child_signo = SIGKILL; + } + break; + case OPT_KEEPCAPS: + keepcaps = 1; + cap_last_cap(); /* Force last cap to be cached before we fork. */ + break; + case 'S': + uid = strtoul_or_err(optarg, _("failed to parse uid")); + force_uid = 1; + break; + case 'G': + gid = strtoul_or_err(optarg, _("failed to parse gid")); + force_gid = 1; + break; + case 'R': + newroot = optarg; + break; + case 'w': + newdir = optarg; + break; + case OPT_MONOTONIC: + monotonic = strtoul_or_err(optarg, _("failed to parse monotonic offset")); + force_monotonic = 1; + break; + case OPT_BOOTTIME: + boottime = strtoul_or_err(optarg, _("failed to parse boottime offset")); + force_boottime = 1; + break; + + case 'h': + usage(); + case 'V': + print_version(EXIT_SUCCESS); + default: + errtryhelp(EXIT_FAILURE); + } + } + + if ((force_monotonic || force_boottime) && !(unshare_flags & CLONE_NEWTIME)) + errx(EXIT_FAILURE, _("options --monotonic and --boottime require " + "unsharing of a time namespace (-T)")); + + /* clear any inherited settings */ + signal(SIGCHLD, SIG_DFL); + + if (npersists && (unshare_flags & CLONE_NEWNS)) + pid_bind = bind_ns_files_from_child(&fd_bind); + + if (usermap || groupmap) + pid_idmap = map_ids_from_child(&fd_idmap, mapuser, usermap, + mapgroup, groupmap); + + if (-1 == unshare(unshare_flags)) + err(EXIT_FAILURE, _("unshare failed")); + + /* Tell child we've called unshare() */ + if (usermap || groupmap) + sync_with_child(pid_idmap, fd_idmap); + + if (force_boottime) + settime(boottime, CLOCK_BOOTTIME); + + if (force_monotonic) + settime(monotonic, CLOCK_MONOTONIC); + + if (forkit) { + if (sigemptyset(&sigset) != 0 || + sigaddset(&sigset, SIGINT) != 0 || + sigaddset(&sigset, SIGTERM) != 0 || + sigprocmask(SIG_BLOCK, &sigset, &oldsigset) != 0) + err(EXIT_FAILURE, _("sigprocmask block failed")); +#ifdef UL_HAVE_PIDFD + if (kill_child_signo != 0) { + /* make a connection to the original process (parent) */ + fd_parent_pid = pidfd_open(getpid(), 0); + if (0 > fd_parent_pid) + err(EXIT_FAILURE, _("pidfd_open failed")); + } +#endif + /* force child forking before mountspace binding so + * pid_for_children is populated */ + pid = fork(); + + switch(pid) { + case -1: + err(EXIT_FAILURE, _("fork failed")); + case 0: /* child */ + if (sigprocmask(SIG_SETMASK, &oldsigset, NULL)) + err(EXIT_FAILURE, + _("sigprocmask restore failed")); + if (npersists && (unshare_flags & CLONE_NEWNS)) + close(fd_bind); + break; + default: /* parent */ + break; + } + } + + if (npersists && (pid || !forkit)) { + /* run in parent */ + if (pid_bind && (unshare_flags & CLONE_NEWNS)) + sync_with_child(pid_bind, fd_bind); + else + /* simple way, just bind */ + bind_ns_files(getpid()); + } + + if (pid) { + if (waitpid(pid, &status, 0) == -1) + err(EXIT_FAILURE, _("waitpid failed")); + + if (WIFEXITED(status)) + return WEXITSTATUS(status); + if (WIFSIGNALED(status)) { + + /* Ensure the signal that terminated the child will + * also terminate the parent. */ + + int termsig = WTERMSIG(status); + + if (signal(termsig, SIG_DFL) == SIG_ERR || + sigemptyset(&sigset) != 0 || + sigaddset(&sigset, termsig) != 0 || + sigprocmask(SIG_UNBLOCK, &sigset, NULL) != 0) + err(EXIT_FAILURE, + _("sigprocmask unblock failed")); + + kill(getpid(), termsig); + } + err(EXIT_FAILURE, _("child exit failed")); + } + + if (kill_child_signo != 0) { + if (prctl(PR_SET_PDEATHSIG, kill_child_signo) < 0) + err(EXIT_FAILURE, "prctl failed"); +#ifdef UL_HAVE_PIDFD + /* Use poll() to check that there is still the original parent. */ + if (fd_parent_pid != -1) { + struct pollfd pollfds[1] = { + { .fd = fd_parent_pid, .events = POLLIN } + }; + int nfds = poll(pollfds, 1, 0); + + if (0 > nfds) + err(EXIT_FAILURE, "poll parent pidfd failed"); + + /* If the child was re-parented before prctl(2) was called, the + * new parent will likely not be interested in the precise exit + * status of the orphan. + */ + if (nfds) + exit(EXIT_FAILURE); + + close(fd_parent_pid); + fd_parent_pid = -1; + } +#endif + } + + if (mapuser != (uid_t) -1 && !usermap) + map_id(_PATH_PROC_UIDMAP, mapuser, real_euid); + + /* Since Linux 3.19 unprivileged writing of /proc/self/gid_map + * has been disabled unless /proc/self/setgroups is written + * first to permanently disable the ability to call setgroups + * in that user namespace. */ + if (mapgroup != (gid_t) -1 && !groupmap) { + if (setgrpcmd == SETGROUPS_ALLOW) + errx(EXIT_FAILURE, _("options --setgroups=allow and " + "--map-group are mutually exclusive")); + setgroups_control(SETGROUPS_DENY); + map_id(_PATH_PROC_GIDMAP, mapgroup, real_egid); + } + + if (setgrpcmd != SETGROUPS_NONE) + setgroups_control(setgrpcmd); + + if ((unshare_flags & CLONE_NEWNS) && propagation) + set_propagation(propagation); + + if (newroot) { + if (chroot(newroot) != 0) + err(EXIT_FAILURE, + _("cannot change root directory to '%s'"), newroot); + newdir = newdir ?: "/"; + } + if (newdir && chdir(newdir)) + err(EXIT_FAILURE, _("cannot chdir to '%s'"), newdir); + + if (procmnt) { + /* When not changing root and using the default propagation flags + then the recursive propagation change of root will + automatically change that of an existing proc mount. */ + if (!newroot && propagation != (MS_PRIVATE|MS_REC)) { + int rc = mount("none", procmnt, NULL, MS_PRIVATE|MS_REC, NULL); + + /* Custom procmnt means that proc is very likely not mounted, causing EINVAL. + Ignoring the error in this specific instance is considered safe. */ + if(rc != 0 && errno != EINVAL) + err(EXIT_FAILURE, _("cannot change %s filesystem propagation"), procmnt); + } + + if (mount("proc", procmnt, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) != 0) + err(EXIT_FAILURE, _("mount %s failed"), procmnt); + } + + if (force_gid) { + if (setgroups(0, NULL) != 0) /* drop supplementary groups */ + err(EXIT_FAILURE, _("setgroups failed")); + if (setgid(gid) < 0) /* change GID */ + err(EXIT_FAILURE, _("setgid failed")); + } + if (force_uid && setuid(uid) < 0) /* change UID */ + err(EXIT_FAILURE, _("setuid failed")); + + /* We use capabilities system calls to propagate the permitted + * capabilities into the ambient set because we have already + * forked so are in async-signal-safe context. */ + if (keepcaps && (unshare_flags & CLONE_NEWUSER)) { + struct __user_cap_header_struct header = { + .version = _LINUX_CAPABILITY_VERSION_3, + .pid = 0, + }; + + struct __user_cap_data_struct payload[_LINUX_CAPABILITY_U32S_3] = {{ 0 }}; + uint64_t effective, cap; + + if (capget(&header, payload) < 0) + err(EXIT_FAILURE, _("capget failed")); + + /* In order the make capabilities ambient, we first need to ensure + * that they are all inheritable. */ + payload[0].inheritable = payload[0].permitted; + payload[1].inheritable = payload[1].permitted; + + if (capset(&header, payload) < 0) + err(EXIT_FAILURE, _("capset failed")); + + effective = ((uint64_t)payload[1].effective << 32) | (uint64_t)payload[0].effective; + + for (cap = 0; cap < (sizeof(effective) * 8); cap++) { + /* This is the same check as cap_valid(), but using + * the runtime value for the last valid cap. */ + if (cap > (uint64_t) cap_last_cap()) + continue; + + if ((effective & (1 << cap)) + && prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0, 0) < 0) + err(EXIT_FAILURE, _("prctl(PR_CAP_AMBIENT) failed")); + } + } + + if (optind < argc) { + execvp(argv[optind], argv + optind); + errexec(argv[optind]); + } + exec_shell(); +} |