/*
 * unshare(1) - command-line interface for unshare(2)
 *
 * Copyright (C) 2009 Mikhail Gusarov <dottedmag@dottedmag.net>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2, or (at your option) any
 * later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include <errno.h>
#include <getopt.h>
#include <poll.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/eventfd.h>
#include <sys/wait.h>
#include <sys/mount.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/prctl.h>
#include <grp.h>

/* we only need some defines missing in sys/mount.h, no libmount linkage */
#include <libmount.h>

#include "nls.h"
#include "c.h"
#include "caputils.h"
#include "closestream.h"
#include "namespace.h"
#include "pidfd-utils.h"
#include "exec_shell.h"
#include "xalloc.h"
#include "pathnames.h"
#include "all-io.h"
#include "signames.h"
#include "strutils.h"
#include "pwdutils.h"

/*
 * Token used to synchronize parent and child: sync_with_child() writes it to
 * the eventfd and fork_and_wait() expects to read exactly this value back.
 * (The name is historical; the channel is an eventfd, not a pipe.)
 */
#define PIPE_SYNC_BYTE	0x06

/*
 * Propagation applied to the root mount of a new mount namespace when
 * --propagation is not given; 'private' is kernel default
 */
#define UNSHARE_PROPAGATION_DEFAULT	(MS_REC | MS_PRIVATE)

/*
 * Namespaces that can be made persistent. Each entry pairs a CLONE_NEW* flag
 * with the matching file below /proc/<pid>/ and, once set_ns_target() has
 * been called for it, the user supplied bind-mount target. The table ends
 * with an entry whose name is NULL.
 */
static struct namespace_file {
	int		type;		/* CLONE_NEW* flag */
	const char	*name;		/* path relative to /proc/<pid>/ */
	const char	*target;	/* bind-mount target, NULL when unused */
} namespace_files[] = {
	{ CLONE_NEWUSER,	"ns/user",		NULL },
	{ CLONE_NEWCGROUP,	"ns/cgroup",		NULL },
	{ CLONE_NEWIPC,		"ns/ipc",		NULL },
	{ CLONE_NEWUTS,		"ns/uts",		NULL },
	{ CLONE_NEWNET,		"ns/net",		NULL },
	{ CLONE_NEWPID,		"ns/pid_for_children",	NULL },
	{ CLONE_NEWNS,		"ns/mnt",		NULL },
	{ CLONE_NEWTIME,	"ns/time_for_children",	NULL },
	{ 0,			NULL,			NULL }
};

static int npersists = 0;	/* how many namespaces were given a bind-mount target */

/*
 * Actions for the setgroups proc file. SETGROUPS_NONE means that no action
 * was requested; the other two double as indexes into setgroups_strings[].
 */
enum {
	SETGROUPS_NONE = -1,
	SETGROUPS_DENY,
	SETGROUPS_ALLOW
};

/* Keyword written for each action, indexed by the SETGROUPS_* value. */
static const char *setgroups_strings[] = {
	[SETGROUPS_ALLOW] = "allow",
	[SETGROUPS_DENY]  = "deny"
};

/*
 * Translate a --setgroups keyword into its index in setgroups_strings[].
 * An unknown keyword terminates the program with an error message.
 */
static int setgroups_str2id(const char *str)
{
	size_t id = 0;

	while (id < ARRAY_SIZE(setgroups_strings)) {
		if (!strcmp(setgroups_strings[id], str))
			return id;
		id++;
	}

	errx(EXIT_FAILURE, _("unsupported --setgroups argument '%s'"), str);
}

/*
 * Write the keyword belonging to @action into the setgroups proc file.
 *
 * An @action without an entry in setgroups_strings[] is silently ignored,
 * and so is a missing proc file (ENOENT). Any other open or write failure
 * is fatal.
 */
static void setgroups_control(int action)
{
	const char *path = _PATH_PROC_SETGROUPS;
	const char *keyword;
	int fd;

	if (action < 0)
		return;
	if ((size_t) action >= ARRAY_SIZE(setgroups_strings))
		return;
	keyword = setgroups_strings[action];

	fd = open(path, O_WRONLY);
	if (fd < 0 && errno == ENOENT)
		return;
	if (fd < 0)
		err(EXIT_FAILURE, _("cannot open %s"), path);

	if (write_all(fd, keyword, strlen(keyword)) != 0)
		err(EXIT_FAILURE, _("write failed %s"), path);
	close(fd);
}

/*
 * Write the single-ID mapping "<from> <to> 1" into @file. Failing to open
 * or to write the file is fatal.
 */
static void map_id(const char *file, uint32_t from, uint32_t to)
{
	char *mapping = NULL;
	int fd = open(file, O_WRONLY);

	if (fd < 0)
		err(EXIT_FAILURE, _("cannot open %s"), file);

	xasprintf(&mapping, "%u %u 1", from, to);
	if (write_all(fd, mapping, strlen(mapping)) != 0)
		err(EXIT_FAILURE, _("write failed %s"), file);

	close(fd);
	free(mapping);
}

/*
 * Convert a --propagation keyword into mount(2) flags. "unchanged" maps to
 * 0; an unknown keyword terminates the program.
 */
static unsigned long parse_propagation(const char *str)
{
	static const struct prop_opts {
		const char *name;
		unsigned long flag;
	} opts[] = {
		{ "slave",	MS_REC | MS_SLAVE },
		{ "private",	MS_REC | MS_PRIVATE },
		{ "shared",	MS_REC | MS_SHARED },
		{ "unchanged",	0 }
	};
	const struct prop_opts *const end = opts + ARRAY_SIZE(opts);
	const struct prop_opts *opt;

	for (opt = opts; opt < end; opt++)
		if (!strcmp(str, opt->name))
			return opt->flag;

	errx(EXIT_FAILURE, _("unsupported propagation mode: %s"), str);
}

/*
 * Apply the propagation @flags to the root mount. A value of 0 means
 * "leave as is" and no mount(2) call is made.
 */
static void set_propagation(unsigned long flags)
{
	if (flags != 0 && mount("none", "/", NULL, flags, NULL) != 0)
		err(EXIT_FAILURE, _("cannot change root filesystem propagation"));
}


/*
 * Remember @path as the bind-mount target for the namespace @type
 * (a CLONE_NEW* flag) and bump the number of persistent namespaces.
 *
 * Returns 0 on success or -EINVAL when @type is not in namespace_files[].
 */
static int set_ns_target(int type, const char *path)
{
	struct namespace_file *entry = namespace_files;

	while (entry->name && entry->type != type)
		entry++;

	if (!entry->name)
		return -EINVAL;

	entry->target = path;
	npersists++;
	return 0;
}

/*
 * Bind-mount /proc/<pid>/<ns file> onto the requested target for every
 * namespace that has one. A failing mount is fatal; otherwise 0 is returned.
 */
static int bind_ns_files(pid_t pid)
{
	char nsfile[PATH_MAX];
	struct namespace_file *entry;

	for (entry = namespace_files; entry->name; entry++) {
		if (entry->target) {
			snprintf(nsfile, sizeof(nsfile), "/proc/%u/%s",
				 (unsigned) pid, entry->name);

			if (mount(nsfile, entry->target, NULL, MS_BIND, NULL) != 0)
				err(EXIT_FAILURE, _("mount %s on %s failed"),
				    nsfile, entry->target);
		}
	}

	return 0;
}

/*
 * Return the inode number of /proc/<pid>/ns/mnt; a stat(2) failure is fatal.
 */
static ino_t get_mnt_ino(pid_t pid)
{
	char nsfile[PATH_MAX];
	struct stat sb;

	snprintf(nsfile, sizeof(nsfile), "/proc/%u/ns/mnt", (unsigned) pid);

	if (stat(nsfile, &sb) == 0)
		return sb.st_ino;

	err(EXIT_FAILURE, _("stat of %s failed"), nsfile);
}

/*
 * Write "<clk_id> <offset> 0" to /proc/self/timens_offsets. The third field
 * is always written as 0. Open and write failures are fatal.
 */
static void settime(int64_t offset, clockid_t clk_id)
{
	char line[sizeof(stringify_value(ULONG_MAX)) * 3];
	int fd;
	const int nbytes = snprintf(line, sizeof(line), "%d %" PRId64 " 0",
				    clk_id, offset);

	fd = open("/proc/self/timens_offsets", O_WRONLY);
	if (fd < 0)
		err(EXIT_FAILURE, _("failed to open /proc/self/timens_offsets"));

	if (write(fd, line, nbytes) != nbytes)
		err(EXIT_FAILURE, _("failed to write to /proc/self/timens_offsets"));

	close(fd);
}

/**
 * waitchild() - Reap a child and insist on a zero exit code
 * @pid: PID of the process to wait for
 *
 * Block until @pid changes state, retrying when waitpid() is interrupted by
 * a signal. If the child exited with a non-zero code, exit() with that same
 * code. Only a normal exit is inspected; any other status returns to the
 * caller.
 */
static void waitchild(int pid)
{
	int status;

	for (;;) {
		if (waitpid(pid, &status, 0) >= 0)
			break;
		if (errno != EINTR)
			err(EXIT_FAILURE, _("waitpid failed"));
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) != EXIT_SUCCESS)
		exit(WEXITSTATUS(status));
}

/**
 * sync_with_child() - Release a waiting child and reap it
 * @pid: The pid of our child
 * @fd: The eventfd handed out by fork_and_wait()
 *
 * Write %PIPE_SYNC_BYTE to @fd so that the child blocked in fork_and_wait()
 * continues, close @fd, and then wait for the child via waitchild(). The
 * result of the write is not checked.
 */
static void sync_with_child(pid_t pid, int fd)
{
	const uint64_t token = PIPE_SYNC_BYTE;

	write_all(fd, &token, sizeof(token));
	close(fd);

	waitchild(pid);
}

/**
 * fork_and_wait() - Fork a child that blocks until it is released
 * @fd - Receives the eventfd which the parent later passes to
 *       sync_with_child()
 *
 * Create an eventfd and fork. The parent returns right away with the child's
 * pid. The child blocks until it reads %PIPE_SYNC_BYTE from the eventfd,
 * closes its copy of the descriptor and only then returns (with 0). This
 * gives the parent time to do its own work before the child starts. Failure
 * to create the eventfd, to fork, or to read the expected token is fatal.
 *
 * Return: The pid from fork()
 */
static pid_t fork_and_wait(int *fd)
{
	uint64_t token;
	pid_t child;

	*fd = eventfd(0, 0);
	if (*fd < 0)
		err(EXIT_FAILURE, _("eventfd failed"));

	child = fork();
	if (child < 0)
		err(EXIT_FAILURE, _("fork failed"));
	if (child > 0)
		return child;

	/* child: block until the parent tells us to continue */
	if (read_all(*fd, (char *)&token, sizeof(token)) != sizeof(token) ||
	    token != PIPE_SYNC_BYTE)
		err(EXIT_FAILURE, _("failed to read eventfd"));
	close(*fd);

	return 0;
}

/*
 * Fork a helper that bind-mounts the caller's namespace files.
 *
 * The caller's mount namespace inode is recorded before forking. Once the
 * helper has been released through @fd it compares that inode with the
 * caller's current one: if it is unchanged the helper exits with failure,
 * otherwise it runs bind_ns_files() for the caller's pid and exits with
 * success. In the calling process the helper's pid is returned.
 */
static pid_t bind_ns_files_from_child(int *fd)
{
	const pid_t parent = getpid();
	const ino_t before = get_mnt_ino(parent);
	const pid_t helper = fork_and_wait(fd);

	if (helper != 0)
		return helper;

	/* only the released helper gets here */
	if (before == get_mnt_ino(parent))
		exit(EXIT_FAILURE);

	bind_ns_files(parent);
	exit(EXIT_SUCCESS);
}

/*
 * Resolve @s to a uid: first as a user name via xgetpwnam(), and if no such
 * user is found, as a number via strtoul_or_err() with @err as the message.
 */
static uid_t get_user(const char *s, const char *err)
{
	char *pwbuf = NULL;
	struct passwd *pw = xgetpwnam(s, &pwbuf);
	uid_t uid;

	if (!pw)
		return strtoul_or_err(s, err);

	uid = pw->pw_uid;
	free(pw);
	free(pwbuf);
	return uid;
}

/*
 * Resolve @s to a gid: first as a group name via xgetgrnam(), and if no such
 * group is found, as a number via strtoul_or_err() with @err as the message.
 */
static gid_t get_group(const char *s, const char *err)
{
	char *grbuf = NULL;
	struct group *gr = xgetgrnam(s, &grbuf);
	gid_t gid;

	if (!gr)
		return strtoul_or_err(s, err);

	gid = gr->gr_gid;
	free(gr);
	free(grbuf);
	return gid;
}

/**
 * struct map_range - One contiguous run of IDs to map
 * @outer: First ID of the run as seen outside the namespace
 * @inner: First ID of the run as seen inside the namespace
 * @count: Number of consecutive IDs covered on both sides
 * @next: Following run in the singly linked chain, or %NULL
 *
 * Chains of these describe the uid/gid maps handed to new[gu]idmap.
 */
struct map_range {
	unsigned outer;
	unsigned inner;
	unsigned count;
	struct map_range *next;
};

/*
 * Prepend a heap-allocated copy of @map to @chain. The copy's next pointer
 * is set to the previous head; the value in @map.next is ignored.
 */
static void insert_map_range(struct map_range **chain, struct map_range map)
{
	struct map_range *node = xmalloc(sizeof(*node));

	*node = map;
	node->next = *chain;
	*chain = node;
}

/**
 * get_map_range() - Parse a mapping range from a string
 * @s: A string of the format inner:outer:count or outer,inner,count
 *
 * Parse a string of the form inner:outer:count or outer,inner,count into
 * a new mapping range.
 *
 * Return: A struct map_range
 */
static struct map_range get_map_range(const char *s)
{
	int end;
	struct map_range ret = { .next = NULL };

	if (sscanf(s, "%u:%u:%u%n", &ret.inner, &ret.outer, &ret.count,
		   &end) >= 3 && !s[end])
		return ret; /* inner:outer:count */

	if (sscanf(s, "%u,%u,%u%n", &ret.outer, &ret.inner, &ret.count,
		   &end) >= 3 && !s[end])
		return ret; /* outer,inner,count */

	errx(EXIT_FAILURE, _("invalid mapping '%s'"), s);
}

/**
 * read_subid_range() - Look up a user's sub[gu]id range
 * @filename: The file to look up the range from. This should be either
 *            ``/etc/subuid`` or ``/etc/subgid``.
 * @uid: The uid of the user whose range we should look up.
 *
 * This finds the first subid range matching @uid in @filename. A line
 * matches when its first field is either the user's login name or, written
 * entirely as a decimal number, the user's uid.
 *
 * The program exits with an error if @uid has no passwd entry, @filename
 * cannot be opened, a matching line has malformed numbers, or no line
 * matches.
 *
 * Return: A struct map_range whose @outer and @count come from the file and
 * whose @inner is (unsigned) -1, the marker add_single_map_range() replaces
 * with 0.
 */
static struct map_range read_subid_range(char *filename, uid_t uid)
{
	char *line = NULL, *pwbuf = NULL;
	FILE *idmap;
	size_t n = 0;
	struct passwd *pw;
	struct map_range map = { .inner = -1, .next = NULL };

	pw = xgetpwuid(uid, &pwbuf);
	if (!pw)
		errx(EXIT_FAILURE, _("you (user %d) don't exist."), uid);

	idmap = fopen(filename, "r");
	if (!idmap)
		err(EXIT_FAILURE, _("could not open '%s'"), filename);

	/*
	* Each line in sub[ug]idmap looks like
	* username:subuid:count
	* OR
	* uid:subuid:count
	*/
	while (getline(&line, &n, idmap) != -1) {
		char *rest, *s;

		rest = strchr(line, ':');
		if (!rest)
			continue;
		*rest = '\0';

		if (strcmp(line, pw->pw_name) != 0) {
			char *end = NULL;
			unsigned long id;

			/*
			 * Not our login name, so the owner field must be our
			 * uid written as a complete number. Without the
			 * end-pointer check every non-numeric name would parse
			 * as 0 and wrongly match uid 0.
			 */
			errno = 0;
			id = strtoul(line, &end, 10);
			if (errno || end == line || *end != '\0' ||
			    id != pw->pw_uid)
				continue;
		}

		s = rest + 1;
		rest = strchr(s, ':');
		if (!rest)
			continue;
		*rest = '\0';
		map.outer = strtoul_or_err(s, _("failed to parse subid map"));

		s = rest + 1;
		rest = strchr(s, '\n');
		if (rest)
			*rest = '\0';
		map.count = strtoul_or_err(s, _("failed to parse subid map"));

		fclose(idmap);
		free(line);	/* getline() buffer */
		free(pw);
		free(pwbuf);

		return map;
	}

	errx(EXIT_FAILURE, _("no line matching user \"%s\" in %s"),
	pw->pw_name, filename);
}

/**
 * read_kernel_map() - Read all available IDs from the kernel
 * @chain: destination list to receive pass-through ID mappings
 * @filename: either /proc/self/uid_map or /proc/self/gid_map
 *
 * This is used by --map-users=all and --map-groups=all to construct
 * pass-through mappings for all IDs available in the parent namespace.
 */
static void read_kernel_map(struct map_range **chain, char *filename)
{
	char *line = NULL;
	size_t size = 0;
	FILE *idmap;

	idmap = fopen(filename, "r");
	if (!idmap)
		err(EXIT_FAILURE, _("could not open '%s'"), filename);

	while (getline(&line, &size, idmap) != -1) {
		unsigned int start, count;
		if (sscanf(line, " %u %*u %u", &start, &count) < 2)
			continue;
		insert_map_range(chain, (struct map_range) {
			.inner = start,
			.outer = start,
			.count = count
		});
	}

	fclose(idmap);
	free(line);
}

/**
 * add_single_map_range() - Add a single-ID map into a list without overlap
 * @chain: A linked list of ID range mappings
 * @outer: ID outside the namespace for a single map.
 * @inner: ID inside the namespace for a single map, or -1 for no map.
 *
 * Prepend a mapping to @chain for the single ID @outer to the single ID
 * @inner. The tricky bit is that we cannot let existing mappings overlap it.
 * We accomplish this by removing a "hole" from each existing range @map, if
 * @outer or @inner overlap it. This may result in one less than @map->count
 * IDs being mapped from @map. The unmapped IDs are always the topmost IDs
 * of the mapping (either in the parent or the child namespace).
 *
 * Most of the time, this function will be called with a single mapping range
 * @map, @map->outer as some large ID, @map->inner as 0, and @map->count as a
 * large number (at least 1000, but less than @map->outer). Typically, there
 * will be no conflict with @outer. However, @inner may split the mapping for
 * e.g. --map-current-user.
 *
 * The input chain is consumed: every node of it is freed, and @chain is
 * rebuilt from newly allocated nodes. Each input range is replaced by up to
 * three pieces (lo, mid, hi); empty pieces are dropped.
 */

static void add_single_map_range(struct map_range **chain, unsigned int outer,
				 unsigned int inner)
{
	struct map_range *map = *chain;

	/*
	 * No single mapping requested (inner is all-ones): give outer the
	 * same all-ones marker.
	 */
	if (inner + 1 == 0)
		outer = (unsigned int) -1;
	/* The output chain is built from scratch below */
	*chain = NULL;

	while (map) {
		/* next is saved now because map is freed at the end of the pass */
		struct map_range lo = { 0 }, mid = { 0 }, hi = { 0 },
				 *next = map->next;
		unsigned int inner_offset, outer_offset;

		/*
		 * Start inner IDs from zero for an auto mapping; otherwise, if
		 * the single mapping exists and overlaps the range, remove an ID
		 */
		if (map->inner + 1 == 0)
			map->inner = 0;
		else if (inner + 1 != 0 &&
		         ((outer >= map->outer && outer <= map->outer + map->count) ||
			  (inner >= map->inner && inner <= map->inner + map->count)))
			map->count--;

		/*
		 * Determine where the splits between lo, mid, and hi will be.
		 * Each offset is the distance of the single ID from the start
		 * of the range, clamped to [0, map->count].
		 */
		outer_offset = min(outer > map->outer ? outer - map->outer : 0,
				   map->count);
		inner_offset = min(inner > map->inner ? inner - map->inner : 0,
				   map->count);

		/*
		 * In the worst case, we need three mappings:
		 * From the bottom of map to either inner or outer
		 */
		lo.outer = map->outer;
		lo.inner = map->inner;
		lo.count = min(inner_offset, outer_offset);

		/*
		 * From the lower of inner or outer to the higher. The "+="
		 * lines step over the single ID when a piece would start
		 * exactly on it.
		 */
		mid.outer = lo.outer + lo.count;
		mid.outer += mid.outer == outer;
		mid.inner = lo.inner + lo.count;
		mid.inner += mid.inner == inner;
		mid.count = abs_diff(outer_offset, inner_offset);

		/* And from the higher of inner or outer to the end of the map */
		hi.outer = mid.outer + mid.count;
		hi.outer += hi.outer == outer;
		hi.inner = mid.inner + mid.count;
		hi.inner += hi.inner == inner;
		hi.count = map->count - lo.count - mid.count;

		/*
		 * Insert non-empty mappings into the output chain. Insertion
		 * prepends, so hi goes first to leave lo ahead of mid ahead of
		 * hi.
		 */
		if (hi.count)
			insert_map_range(chain, hi);
		if (mid.count)
			insert_map_range(chain, mid);
		if (lo.count)
			insert_map_range(chain, lo);

		free(map);
		map = next;
	}

	if (inner + 1 != 0) {
		/* Insert single ID mapping as the first entry in the chain */
		insert_map_range(chain, (struct map_range) {
			.inner = inner,
			.outer = outer,
			.count = 1
		});
	}
}

/**
 * map_ids_external() - Install a uid/gid map through a setuid helper
 * @idmapper: Either newuidmap or newgidmap
 * @ppid: Pid to set the map for
 * @chain: A linked list of ID range mappings
 *
 * Build the command line "@idmapper @ppid inner outer count ..." with one
 * number triple per entry of @chain and exec it.
 *
 * This function always exec()s or errors out and does not return.
 */
static void __attribute__((__noreturn__))
map_ids_external(const char *idmapper, int ppid, struct map_range *chain)
{
	struct map_range *map;
	unsigned int nargs = 3;	/* program name, pid, terminating NULL */
	unsigned int pos = 0;
	char **args;

	for (map = chain; map; map = map->next)
		nargs += 3;

	args = xcalloc(nargs, sizeof(*args));
	args[pos++] = xstrdup(idmapper);
	xasprintf(&args[pos++], "%u", ppid);

	for (map = chain; map; map = map->next) {
		xasprintf(&args[pos++], "%u", map->inner);
		xasprintf(&args[pos++], "%u", map->outer);
		xasprintf(&args[pos++], "%u", map->count);
	}
	args[pos] = NULL;

	execvp(idmapper, args);
	errexec(idmapper);
}

/**
 * map_ids_internal() - Create a new uid/gid map using root privilege
 * @type: Either uid_map or gid_map
 * @ppid: Pid to set the map for
 * @chain: A linked list of ID range mappings
 *
 * This creates a new uid/gid map for @ppid using a privileged write to
 * /proc/@ppid/@type to set a mapping for each of the ranges in @chain.
 * All ranges are formatted into one buffer ("inner outer count" per line)
 * and written with a single write_all() call. The program exits with an
 * error if the text does not fit the buffer or the file cannot be opened or
 * written.
 */
static void map_ids_internal(const char *type, int ppid, struct map_range *chain)
{
	int count, fd;
	unsigned int length = 0;
	char buffer[4096], *path;

	xasprintf(&path, "/proc/%u/%s", ppid, type);
	for (struct map_range *map = chain; map; map = map->next) {
		count = snprintf(buffer + length, sizeof(buffer) - length,
				 "%u %u %u\n",
				 map->inner, map->outer, map->count);
		/*
		 * snprintf() returns the length it wanted to write, without
		 * the terminating NUL. A value equal to the remaining space
		 * therefore already means the last character was cut off, so
		 * the comparison must be ">=". This also keeps the total
		 * strictly below sizeof(buffer).
		 */
		if (count < 0 || (size_t) count >= sizeof(buffer) - length)
			errx(EXIT_FAILURE,
				_("%s too large for kernel 4k limit"), path);
		length += count;
	}

	fd = open(path, O_WRONLY | O_CLOEXEC | O_NOCTTY);
	if (fd < 0)
		err(EXIT_FAILURE, _("failed to open %s"), path);
	if (write_all(fd, buffer, length) < 0)
		err(EXIT_FAILURE, _("failed to write %s"), path);
	close(fd);
	free(path);
}

/**
 * map_ids_from_child() - Fork a helper that installs the uid/gid maps
 * @fd: Receives the eventfd the caller must pass to sync_with_child()
 * @mapuser: The user to map the current user to (or -1)
 * @usermap: The range of UIDs to map (or %NULL)
 * @mapgroup: The group to map the current group to (or -1)
 * @groupmap: The range of GIDs to map (or %NULL)
 *
 * The helper is created with fork_and_wait(), so it stays blocked until the
 * caller invokes sync_with_child() on @fd. After receiving the go-ahead it
 * merges the single-ID mappings into the ranges and sets the maps for the
 * caller's pid: by writing the proc files directly when running as root,
 * otherwise by executing newuidmap and newgidmap.
 *
 * Return: The pid of the child.
 */
static pid_t map_ids_from_child(int *fd, uid_t mapuser,
				struct map_range *usermap, gid_t mapgroup,
				struct map_range *groupmap)
{
	const pid_t target = getpid();
	pid_t uidworker = 0;
	pid_t helper;

	helper = fork_and_wait(fd);
	if (helper != 0)
		return helper;

	/* Everything below runs in the helper only */
	if (usermap)
		add_single_map_range(&usermap, geteuid(), mapuser);
	if (groupmap)
		add_single_map_range(&groupmap, getegid(), mapgroup);

	if (geteuid() == 0) {
		if (usermap)
			map_ids_internal("uid_map", target, usermap);
		if (groupmap)
			map_ids_internal("gid_map", target, groupmap);
		exit(EXIT_SUCCESS);
	}

	/*
	 * map_ids_external() never returns, so a second process is needed
	 * only when both maps have to be installed.
	 */
	if (usermap && groupmap) {
		uidworker = fork();
		if (uidworker < 0)
			err(EXIT_FAILURE, _("fork failed"));
		if (uidworker > 0)
			waitchild(uidworker);
	}

	if (usermap && uidworker == 0)
		map_ids_external("newuidmap", target, usermap);
	if (groupmap)
		map_ids_external("newgidmap", target, groupmap);
	exit(EXIT_SUCCESS);
}

/* Print the help text to stdout and exit successfully. */
static void __attribute__((__noreturn__)) usage(void)
{
	FILE *fp = stdout;

	/* synopsis and one-line description */
	fputs(USAGE_HEADER, fp);
	fprintf(fp, _(" %s [options] [<program> [<argument>...]]\n"),
		program_invocation_short_name);

	fputs(USAGE_SEPARATOR, fp);
	fputs(_("Run a program with some namespaces unshared from the parent.\n"), fp);

	/* namespace selection */
	fputs(USAGE_OPTIONS, fp);
	fputs(_(" -m, --mount[=<file>]      unshare mounts namespace\n"), fp);
	fputs(_(" -u, --uts[=<file>]        unshare UTS namespace (hostname etc)\n"), fp);
	fputs(_(" -i, --ipc[=<file>]        unshare System V IPC namespace\n"), fp);
	fputs(_(" -n, --net[=<file>]        unshare network namespace\n"), fp);
	fputs(_(" -p, --pid[=<file>]        unshare pid namespace\n"), fp);
	fputs(_(" -U, --user[=<file>]       unshare user namespace\n"), fp);
	fputs(_(" -C, --cgroup[=<file>]     unshare cgroup namespace\n"), fp);
	fputs(_(" -T, --time[=<file>]       unshare time namespace\n"), fp);

	/* forking and ID mapping */
	fputs(USAGE_SEPARATOR, fp);
	fputs(_(" -f, --fork                fork before launching <program>\n"), fp);
	fputs(_(" --map-user=<uid>|<name>   map current user to uid (implies --user)\n"), fp);
	fputs(_(" --map-group=<gid>|<name>  map current group to gid (implies --user)\n"), fp);
	fputs(_(" -r, --map-root-user       map current user to root (implies --user)\n"), fp);
	fputs(_(" -c, --map-current-user    map current user to itself (implies --user)\n"), fp);
	fputs(_(" --map-auto                map users and groups automatically (implies --user)\n"), fp);
	fputs(_(" --map-users=<inneruid>:<outeruid>:<count>\n"
		"                           map count users from outeruid to inneruid (implies --user)\n"), fp);
	fputs(_(" --map-groups=<innergid>:<outergid>:<count>\n"
		"                           map count groups from outergid to innergid (implies --user)\n"), fp);

	/* child handling, mounts and capabilities */
	fputs(USAGE_SEPARATOR, fp);
	fputs(_(" --kill-child[=<signame>]  when dying, kill the forked child (implies --fork)\n"
		"                             defaults to SIGKILL\n"), fp);
	fputs(_(" --mount-proc[=<dir>]      mount proc filesystem first (implies --mount)\n"), fp);
	fputs(_(" --propagation slave|shared|private|unchanged\n"
		"                           modify mount propagation in mount namespace\n"), fp);
	fputs(_(" --setgroups allow|deny    control the setgroups syscall in user namespaces\n"), fp);
	fputs(_(" --keep-caps               retain capabilities granted in user namespaces\n"), fp);

	/* environment of the launched program */
	fputs(USAGE_SEPARATOR, fp);
	fputs(_(" -R, --root=<dir>          run the command with root directory set to <dir>\n"), fp);
	fputs(_(" -w, --wd=<dir>            change working directory to <dir>\n"), fp);
	fputs(_(" -S, --setuid <uid>        set uid in entered namespace\n"), fp);
	fputs(_(" -G, --setgid <gid>        set gid in entered namespace\n"), fp);
	fputs(_(" --monotonic <offset>      set clock monotonic offset (seconds) in time namespaces\n"), fp);
	fputs(_(" --boottime <offset>       set clock boottime offset (seconds) in time namespaces\n"), fp);

	fputs(USAGE_SEPARATOR, fp);
	fprintf(fp, USAGE_HELP_OPTIONS(27));
	fprintf(fp, USAGE_MAN_TAIL("unshare(1)"));

	exit(EXIT_SUCCESS);
}

/*
 * Entry point. In order:
 *   1. parse the command line into unshare flags and settings;
 *   2. fork the helper processes (namespace bind mounts, ID maps) that must
 *      exist before unshare() is called;
 *   3. call unshare() and release the ID-map helper;
 *   4. optionally fork, with the parent waiting for the child and mirroring
 *      how it terminated;
 *   5. in the process that continues: write single-ID maps and the
 *      setgroups setting, adjust mount propagation, root and working
 *      directory, mount proc, switch gid/uid, raise ambient capabilities;
 *   6. exec the requested program, or a shell when none was given.
 */
int main(int argc, char *argv[])
{
	/* Identifiers for options that only have a long form */
	enum {
		OPT_MOUNTPROC = CHAR_MAX + 1,
		OPT_PROPAGATION,
		OPT_SETGROUPS,
		OPT_KILLCHILD,
		OPT_KEEPCAPS,
		OPT_MONOTONIC,
		OPT_BOOTTIME,
		OPT_MAPUSER,
		OPT_MAPUSERS,
		OPT_MAPGROUP,
		OPT_MAPGROUPS,
		OPT_MAPAUTO,
	};
	static const struct option longopts[] = {
		{ "help",          no_argument,       NULL, 'h'             },
		{ "version",       no_argument,       NULL, 'V'             },

		{ "mount",         optional_argument, NULL, 'm'             },
		{ "uts",           optional_argument, NULL, 'u'             },
		{ "ipc",           optional_argument, NULL, 'i'             },
		{ "net",           optional_argument, NULL, 'n'             },
		{ "pid",           optional_argument, NULL, 'p'             },
		{ "user",          optional_argument, NULL, 'U'             },
		{ "cgroup",        optional_argument, NULL, 'C'             },
		{ "time",          optional_argument, NULL, 'T'             },

		{ "fork",          no_argument,       NULL, 'f'             },
		{ "kill-child",    optional_argument, NULL, OPT_KILLCHILD   },
		{ "mount-proc",    optional_argument, NULL, OPT_MOUNTPROC   },
		{ "map-user",      required_argument, NULL, OPT_MAPUSER     },
		{ "map-users",     required_argument, NULL, OPT_MAPUSERS    },
		{ "map-group",     required_argument, NULL, OPT_MAPGROUP    },
		{ "map-groups",    required_argument, NULL, OPT_MAPGROUPS   },
		{ "map-root-user", no_argument,       NULL, 'r'             },
		{ "map-current-user", no_argument,    NULL, 'c'             },
		{ "map-auto",      no_argument,       NULL, OPT_MAPAUTO     },
		{ "propagation",   required_argument, NULL, OPT_PROPAGATION },
		{ "setgroups",     required_argument, NULL, OPT_SETGROUPS   },
		{ "keep-caps",     no_argument,       NULL, OPT_KEEPCAPS    },
		{ "setuid",	   required_argument, NULL, 'S'		    },
		{ "setgid",	   required_argument, NULL, 'G'		    },
		{ "root",	   required_argument, NULL, 'R'		    },
		{ "wd",		   required_argument, NULL, 'w'		    },
		{ "monotonic",     required_argument, NULL, OPT_MONOTONIC   },
		{ "boottime",      required_argument, NULL, OPT_BOOTTIME    },
		{ NULL, 0, NULL, 0 }
	};

	int setgrpcmd = SETGROUPS_NONE;
	int unshare_flags = 0;
	int c, forkit = 0;
	/* -1 means "no single-ID mapping requested" */
	uid_t mapuser = -1;
	gid_t mapgroup = -1;
	struct map_range *usermap = NULL;
	struct map_range *groupmap = NULL;
	int kill_child_signo = 0; /* 0 means --kill-child was not used */
	const char *procmnt = NULL;
	const char *newroot = NULL;
	const char *newdir = NULL;
	pid_t pid_bind = 0, pid_idmap = 0;
	pid_t pid = 0;
#ifdef UL_HAVE_PIDFD
	int fd_parent_pid = -1;
#endif
	/* fd_idmap is only assigned and used when usermap or groupmap is set */
	int fd_idmap, fd_bind = -1;
	sigset_t sigset, oldsigset;
	int status;
	unsigned long propagation = UNSHARE_PROPAGATION_DEFAULT;
	int force_uid = 0, force_gid = 0;
	/* effective IDs captured up front, before any namespace or ID change */
	uid_t uid = 0, real_euid = geteuid();
	gid_t gid = 0, real_egid = getegid();
	int keepcaps = 0;
	int64_t monotonic = 0;
	int64_t boottime = 0;
	int force_monotonic = 0;
	int force_boottime = 0;

	setlocale(LC_ALL, "");
	bindtextdomain(PACKAGE, LOCALEDIR);
	textdomain(PACKAGE);
	close_stdout_atexit();

	while ((c = getopt_long(argc, argv, "+fhVmuinpCTUrR:w:S:G:c", longopts, NULL)) != -1) {
		switch (c) {
		case 'f':
			forkit = 1;
			break;
		/*
		 * Namespace options: set the CLONE_NEW* flag and, when a file
		 * argument was given, remember it as bind-mount target.
		 */
		case 'm':
			unshare_flags |= CLONE_NEWNS;
			if (optarg)
				set_ns_target(CLONE_NEWNS, optarg);
			break;
		case 'u':
			unshare_flags |= CLONE_NEWUTS;
			if (optarg)
				set_ns_target(CLONE_NEWUTS, optarg);
			break;
		case 'i':
			unshare_flags |= CLONE_NEWIPC;
			if (optarg)
				set_ns_target(CLONE_NEWIPC, optarg);
			break;
		case 'n':
			unshare_flags |= CLONE_NEWNET;
			if (optarg)
				set_ns_target(CLONE_NEWNET, optarg);
			break;
		case 'p':
			unshare_flags |= CLONE_NEWPID;
			if (optarg)
				set_ns_target(CLONE_NEWPID, optarg);
			break;
		case 'U':
			unshare_flags |= CLONE_NEWUSER;
			if (optarg)
				set_ns_target(CLONE_NEWUSER, optarg);
			break;
		case 'C':
			unshare_flags |= CLONE_NEWCGROUP;
			if (optarg)
				set_ns_target(CLONE_NEWCGROUP, optarg);
			break;
		case 'T':
			unshare_flags |= CLONE_NEWTIME;
			if (optarg)
				set_ns_target(CLONE_NEWTIME, optarg);
			break;
		case OPT_MOUNTPROC:
			unshare_flags |= CLONE_NEWNS;
			procmnt = optarg ? optarg : "/proc";
			break;
		/* All mapping options imply a new user namespace */
		case OPT_MAPUSER:
			unshare_flags |= CLONE_NEWUSER;
			mapuser = get_user(optarg, _("failed to parse uid"));
			break;
		case OPT_MAPGROUP:
			unshare_flags |= CLONE_NEWUSER;
			mapgroup = get_group(optarg, _("failed to parse gid"));
			break;
		case 'r':
			unshare_flags |= CLONE_NEWUSER;
			mapuser = 0;
			mapgroup = 0;
			break;
		case 'c':
			unshare_flags |= CLONE_NEWUSER;
			mapuser = real_euid;
			mapgroup = real_egid;
			break;
		case OPT_MAPUSERS:
			unshare_flags |= CLONE_NEWUSER;
			if (!strcmp(optarg, "auto"))
				insert_map_range(&usermap,
					read_subid_range(_PATH_SUBUID, real_euid));
			else if (!strcmp(optarg, "all"))
				read_kernel_map(&usermap, _PATH_PROC_UIDMAP);
			else
				insert_map_range(&usermap, get_map_range(optarg));
			break;
		case OPT_MAPGROUPS:
			unshare_flags |= CLONE_NEWUSER;
			/* the subgid file is looked up by user as well, hence real_euid */
			if (!strcmp(optarg, "auto"))
				insert_map_range(&groupmap,
					read_subid_range(_PATH_SUBGID, real_euid));
			else if (!strcmp(optarg, "all"))
				read_kernel_map(&groupmap, _PATH_PROC_GIDMAP);
			else
				insert_map_range(&groupmap, get_map_range(optarg));
			break;
		case OPT_MAPAUTO:
			unshare_flags |= CLONE_NEWUSER;
			insert_map_range(&usermap, read_subid_range(_PATH_SUBUID, real_euid));
			insert_map_range(&groupmap, read_subid_range(_PATH_SUBGID, real_euid));
			break;
		case OPT_SETGROUPS:
			setgrpcmd = setgroups_str2id(optarg);
			break;
		case OPT_PROPAGATION:
			propagation = parse_propagation(optarg);
			break;
		case OPT_KILLCHILD:
			forkit = 1;
			if (optarg) {
				if ((kill_child_signo = signame_to_signum(optarg)) < 0)
					errx(EXIT_FAILURE, _("unknown signal: %s"),
					     optarg);
			} else {
				kill_child_signo = SIGKILL;
			}
			break;
                case OPT_KEEPCAPS:
			keepcaps = 1;
			cap_last_cap(); /* Force last cap to be cached before we fork. */
			break;
		case 'S':
			uid = strtoul_or_err(optarg, _("failed to parse uid"));
			force_uid = 1;
			break;
		case 'G':
			gid = strtoul_or_err(optarg, _("failed to parse gid"));
			force_gid = 1;
			break;
		case 'R':
			newroot = optarg;
			break;
		case 'w':
			newdir = optarg;
			break;
                case OPT_MONOTONIC:
			monotonic = strtos64_or_err(optarg, _("failed to parse monotonic offset"));
			force_monotonic = 1;
			break;
                case OPT_BOOTTIME:
			boottime = strtos64_or_err(optarg, _("failed to parse boottime offset"));
			force_boottime = 1;
			break;

		/* the calls in the remaining cases are not written to return */
		case 'h':
			usage();
		case 'V':
			print_version(EXIT_SUCCESS);
		default:
			errtryhelp(EXIT_FAILURE);
		}
	}

	if ((force_monotonic || force_boottime) && !(unshare_flags & CLONE_NEWTIME))
		errx(EXIT_FAILURE, _("options --monotonic and --boottime require "
			"unsharing of a time namespace (-T)"));

	/* clear any inherited settings */
	signal(SIGCHLD, SIG_DFL);

	/*
	 * Both helpers are forked before unshare() and stay blocked in
	 * fork_and_wait() until sync_with_child() releases them.
	 */
	if (npersists && (unshare_flags & CLONE_NEWNS))
		pid_bind = bind_ns_files_from_child(&fd_bind);

	if (usermap || groupmap)
		pid_idmap = map_ids_from_child(&fd_idmap, mapuser, usermap,
					       mapgroup, groupmap);

	if (-1 == unshare(unshare_flags))
		err(EXIT_FAILURE, _("unshare failed"));

	/* Tell child we've called unshare() */
	if (usermap || groupmap)
		sync_with_child(pid_idmap, fd_idmap);

	if (force_boottime)
		settime(boottime, CLOCK_BOOTTIME);

	if (force_monotonic)
		settime(monotonic, CLOCK_MONOTONIC);

	if (forkit) {
		/*
		 * Block SIGINT and SIGTERM across the fork; the child restores
		 * the previous mask, the parent keeps them blocked.
		 */
		if (sigemptyset(&sigset) != 0 ||
			sigaddset(&sigset, SIGINT) != 0 ||
			sigaddset(&sigset, SIGTERM) != 0 ||
			sigprocmask(SIG_BLOCK, &sigset, &oldsigset) != 0)
			err(EXIT_FAILURE, _("sigprocmask block failed"));
#ifdef UL_HAVE_PIDFD
		if (kill_child_signo != 0) {
			/* make a connection to the original process (parent) */
			fd_parent_pid = pidfd_open(getpid(), 0);
			if (0 > fd_parent_pid)
				err(EXIT_FAILURE, _("pidfd_open failed"));
		}
#endif
		/* force child forking before mountspace binding so
		 * pid_for_children is populated */
		pid = fork();

		switch(pid) {
		case -1:
			err(EXIT_FAILURE, _("fork failed"));
		case 0:	/* child */
			if (sigprocmask(SIG_SETMASK, &oldsigset, NULL))
				err(EXIT_FAILURE,
					_("sigprocmask restore failed"));
			/* the child does not use the bind helper's eventfd */
			if (npersists && (unshare_flags & CLONE_NEWNS))
				close(fd_bind);
			break;
		default: /* parent */
			break;
		}
	}

	if (npersists && (pid || !forkit)) {
		/* run in parent */
		if (pid_bind && (unshare_flags & CLONE_NEWNS))
			sync_with_child(pid_bind, fd_bind);
		else
			/* simple way, just bind */
			bind_ns_files(getpid());
	}

	/* Parent of a forked child: wait and mirror how the child ended */
	if (pid) {
		if (waitpid(pid, &status, 0) == -1)
			err(EXIT_FAILURE, _("waitpid failed"));

		if (WIFEXITED(status))
			return WEXITSTATUS(status);
		if (WIFSIGNALED(status)) {

			/* Ensure the signal that terminated the child will
			 * also terminate the parent. */

			int termsig = WTERMSIG(status);

			if (termsig != SIGKILL && signal(termsig, SIG_DFL) == SIG_ERR)
				err(EXIT_FAILURE,
					_("signal handler reset failed"));
			if (sigemptyset(&sigset) != 0 ||
				sigaddset(&sigset, termsig) != 0 ||
				sigprocmask(SIG_UNBLOCK, &sigset, NULL) != 0)
				err(EXIT_FAILURE,
					_("sigprocmask unblock failed"));

			kill(getpid(), termsig);
		}
		err(EXIT_FAILURE, _("child exit failed"));
	}

	/*
	 * From here on we are either the forked child or, without --fork,
	 * the original process.
	 */
	if (kill_child_signo != 0) {
		if (prctl(PR_SET_PDEATHSIG, kill_child_signo) < 0)
			err(EXIT_FAILURE, "prctl failed");
#ifdef UL_HAVE_PIDFD
		/* Use poll() to check that there is still the original parent. */
		if (fd_parent_pid != -1) {
			struct pollfd pollfds[1] = {
				{ .fd = fd_parent_pid, .events = POLLIN	}
			};
			int nfds = poll(pollfds, 1, 0);

			if (0 > nfds)
				err(EXIT_FAILURE, "poll parent pidfd failed");

			/* If the child was re-parented before prctl(2) was called, the
			 * new parent will likely not be interested in the precise exit
			 * status of the orphan.
			 */
			if (nfds)
				exit(EXIT_FAILURE);

			close(fd_parent_pid);
			fd_parent_pid = -1;
		}
#endif
	}

	/*
	 * A single-ID mapping is written here only when no range map was
	 * given; with a range map the helper forked earlier handles it.
	 */
        if (mapuser != (uid_t) -1 && !usermap)
		map_id(_PATH_PROC_UIDMAP, mapuser, real_euid);

        /* Since Linux 3.19 unprivileged writing of /proc/self/gid_map
         * has been disabled unless /proc/self/setgroups is written
         * first to permanently disable the ability to call setgroups
         * in that user namespace. */
	if (mapgroup != (gid_t) -1 && !groupmap) {
		if (setgrpcmd == SETGROUPS_ALLOW)
			errx(EXIT_FAILURE, _("options --setgroups=allow and "
					"--map-group are mutually exclusive"));
		setgroups_control(SETGROUPS_DENY);
		map_id(_PATH_PROC_GIDMAP, mapgroup, real_egid);
	}

	if (setgrpcmd != SETGROUPS_NONE)
		setgroups_control(setgrpcmd);

	if ((unshare_flags & CLONE_NEWNS) && propagation)
		set_propagation(propagation);

	if (newroot) {
		if (chroot(newroot) != 0)
			err(EXIT_FAILURE,
			    _("cannot change root directory to '%s'"), newroot);
		/* without --wd, start at the top of the new root */
		newdir = newdir ?: "/";
	}
	if (newdir && chdir(newdir))
		err(EXIT_FAILURE, _("cannot chdir to '%s'"), newdir);

	if (procmnt) {
		/* When not changing root and using the default propagation flags
		   then the recursive propagation change of root will
		   automatically change that of an existing proc mount. */
		if (!newroot && propagation != (MS_PRIVATE|MS_REC)) {
			int rc = mount("none", procmnt, NULL, MS_PRIVATE|MS_REC, NULL);

			/* Custom procmnt means that proc is very likely not mounted, causing EINVAL.
			   Ignoring the error in this specific instance is considered safe. */
			if(rc != 0 && errno != EINVAL)
				err(EXIT_FAILURE, _("cannot change %s filesystem propagation"), procmnt);
		}

		if (mount("proc", procmnt, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) != 0)
			err(EXIT_FAILURE, _("mount %s failed"), procmnt);
	}

	/* the gid is switched before the uid */
	if (force_gid) {
		if (setgroups(0, NULL) != 0)	/* drop supplementary groups */
			err(EXIT_FAILURE, _("setgroups failed"));
		if (setgid(gid) < 0)		/* change GID */
			err(EXIT_FAILURE, _("setgid failed"));
	}
	if (force_uid && setuid(uid) < 0)	/* change UID */
		err(EXIT_FAILURE, _("setuid failed"));

	if (keepcaps && (unshare_flags & CLONE_NEWUSER))
		cap_permitted_to_ambient();

	/* Run the given program, or a shell when none was given */
	if (optind < argc) {
		execvp(argv[optind], argv + optind);
		errexec(argv[optind]);
	}
	exec_shell();
}