diff options
Diffstat (limited to 'man2/pivot_root.2')
-rw-r--r-- | man2/pivot_root.2 | 409 |
1 files changed, 409 insertions, 0 deletions
diff --git a/man2/pivot_root.2 b/man2/pivot_root.2 new file mode 100644 index 0000000..a4077ef --- /dev/null +++ b/man2/pivot_root.2 @@ -0,0 +1,409 @@ +.\" Copyright (C) 2019 Michael Kerrisk <mtk.manpages@gmail.com> +.\" A very few fragments remain from an earlier page written by +.\" Werner Almesberger in 2000 +.\" +.\" SPDX-License-Identifier: Linux-man-pages-copyleft +.\" +.TH pivot_root 2 2023-05-03 "Linux man-pages 6.05.01" +.SH NAME +pivot_root \- change the root mount +.SH LIBRARY +Standard C library +.RI ( libc ", " \-lc ) +.SH SYNOPSIS +.nf +.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */" +.B #include <unistd.h> +.PP +.BI "int syscall(SYS_pivot_root, const char *" new_root \ +", const char *" put_old ); +.fi +.PP +.IR Note : +glibc provides no wrapper for +.BR pivot_root (), +necessitating the use of +.BR syscall (2). +.SH DESCRIPTION +.BR pivot_root () +changes the root mount in the mount namespace of the calling process. +More precisely, it moves the root mount to the +directory \fIput_old\fP and makes \fInew_root\fP the new root mount. +The calling process must have the +.B CAP_SYS_ADMIN +capability in the user namespace that owns the caller's mount namespace. +.PP +.BR pivot_root () +changes the root directory and the current working directory +of each process or thread in the same mount namespace to +.I new_root +if they point to the old root directory. +(See also NOTES.) +On the other hand, +.BR pivot_root () +does not change the caller's current working directory +(unless it is on the old root directory), +and thus it should be followed by a +\fBchdir("/")\fP call. +.PP +The following restrictions apply: +.IP \[bu] 3 +.I new_root +and +.I put_old +must be directories. +.IP \[bu] +.I new_root +and +.I put_old +must not be on the same mount as the current root. +.IP \[bu] +\fIput_old\fP must be at or underneath \fInew_root\fP; +that is, adding some nonnegative +number of "\fI/..\fP" suffixes to the pathname pointed to by +.I put_old +must yield the same directory as \fInew_root\fP. +.IP \[bu] +.I new_root +must be a path to a mount point, but can't be +.IR """/""" . +A path that is not already a mount point can be converted into one by +bind mounting the path onto itself. +.IP \[bu] +The propagation type of the parent mount of +.I new_root +and the parent mount of the current root directory must not be +.BR MS_SHARED ; +similarly, if +.I put_old +is an existing mount point, its propagation type must not be +.BR MS_SHARED . +These restrictions ensure that +.BR pivot_root () +never propagates any changes to another mount namespace. +.IP \[bu] +The current root directory must be a mount point. +.SH RETURN VALUE +On success, zero is returned. +On error, \-1 is returned, and +\fIerrno\fP is set to indicate the error. +.SH ERRORS +.BR pivot_root () +may fail with any of the same errors as +.BR stat (2). +Additionally, it may fail with the following errors: +.TP +.B EBUSY +.\" Reconfirmed that the following error occurs on Linux 5.0 by +.\" specifying 'new_root' as "/rootfs" and 'put_old' as +.\" "/rootfs/oldrootfs", and *not* bind mounting "/rootfs" on top of +.\" itself. Of course, this is an odd situation, since a later check +.\" in the kernel code will in any case yield EINVAL if 'new_root' is +.\" not a mount point. However, when the system call was first added, +.\" 'new_root' was not required to be a mount point. So, this +.\" error is nowadays probably just the result of crufty accumulation. +.\" This error can also occur if we bind mount "/" on top of itself +.\" and try to specify "/" as the 'new' (again, an odd situation). So, +.\" the EBUSY check in the kernel does still seem necessary to prevent +.\" that case. Furthermore, the "or put_old" piece is probably +.\" redundant text (although the check is in the kernel), since, +.\" in another check, 'put_old' is required to be under 'new_root'. +.I new_root +or +.I put_old +is on the current root mount. +(This error covers the pathological case where +.I new_root +is +.IR """/""" .) +.TP +.B EINVAL +.I new_root +is not a mount point. +.TP +.B EINVAL +\fIput_old\fP is not at or underneath \fInew_root\fP. +.TP +.B EINVAL +The current root directory is not a mount point +(because of an earlier +.BR chroot (2)). +.TP +.B EINVAL +The current root is on the rootfs (initial ramfs) mount; see NOTES. +.TP +.B EINVAL +Either the mount point at +.IR new_root , +or the parent mount of that mount point, +has propagation type +.BR MS_SHARED . +.TP +.B EINVAL +.I put_old +is a mount point and has the propagation type +.BR MS_SHARED . +.TP +.B ENOTDIR +\fInew_root\fP or \fIput_old\fP is not a directory. +.TP +.B EPERM +The calling process does not have the +.B CAP_SYS_ADMIN +capability. +.SH STANDARDS +Linux. +.SH HISTORY +Linux 2.3.41. +.SH NOTES +A command-line interface for this system call is provided by +.BR pivot_root (8). +.PP +.BR pivot_root () +allows the caller to switch to a new root filesystem while at the same time +placing the old root mount at a location under +.I new_root +from where it can subsequently be unmounted. +(The fact that it moves all processes that have a root directory +or current working directory on the old root directory to the +new root frees the old root directory of users, +allowing the old root mount to be unmounted more easily.) +.PP +One use of +.BR pivot_root () +is during system startup, when the +system mounts a temporary root filesystem (e.g., an +.BR initrd (4)), +then mounts the real root filesystem, and eventually turns the latter into +the root directory of all relevant processes and threads. +A modern use is to set up a root filesystem during +the creation of a container. +.PP +The fact that +.BR pivot_root () +modifies process root and current working directories in the +manner noted in DESCRIPTION +is necessary in order to prevent kernel threads from keeping the old +root mount busy with their root and current working directories, +even if they never access +the filesystem in any way. +.PP +The rootfs (initial ramfs) cannot be +.BR pivot_root ()ed. +The recommended method of changing the root filesystem in this case is +to delete everything in rootfs, overmount rootfs with the new root, attach +.IR stdin / stdout / stderr +to the new +.IR /dev/console , +and exec the new +.BR init (1). +Helper programs for this process exist; see +.BR switch_root (8). +.\" +.SS pivot_root(\[dq].\[dq], \[dq].\[dq]) +.I new_root +and +.I put_old +may be the same directory. +In particular, the following sequence allows a pivot-root operation +without needing to create and remove a temporary directory: +.PP +.in +4n +.EX +chdir(new_root); +pivot_root(".", "."); +umount2(".", MNT_DETACH); +.EE +.in +.PP +This sequence succeeds because the +.BR pivot_root () +call stacks the old root mount point +on top of the new root mount point at +.IR / . +At that point, the calling process's root directory and current +working directory refer to the new root mount point +.RI ( new_root ). +During the subsequent +.BR umount () +call, resolution of +.I """.""" +starts with +.I new_root +and then moves up the list of mounts stacked at +.IR / , +with the result that old root mount point is unmounted. +.\" +.SS Historical notes +For many years, this manual page carried the following text: +.RS +.PP +.BR pivot_root () +may or may not change the current root and the current +working directory of any processes or threads which use the old +root directory. +The caller of +.BR pivot_root () +must ensure that processes with root or current working directory +at the old root operate correctly in either case. +An easy way to ensure this is to change their +root and current working directory to \fInew_root\fP before invoking +.BR pivot_root (). +.RE +.PP +This text, written before the system call implementation was +even finalized in the kernel, was probably intended to warn users +at that time that the implementation might change before final release. +However, the behavior stated in DESCRIPTION +has remained consistent since this system call +was first implemented and will not change now. +.SH EXAMPLES +.\" FIXME +.\" Would it be better, because simpler, to use unshare(2) +.\" rather than clone(2) in the example below? +The program below demonstrates the use of +.BR pivot_root () +inside a mount namespace that is created using +.BR clone (2). +After pivoting to the root directory named in the program's +first command-line argument, the child created by +.BR clone (2) +then executes the program named in the remaining command-line arguments. +.PP +We demonstrate the program by creating a directory that will serve as +the new root filesystem and placing a copy of the (statically linked) +.BR busybox (1) +executable in that directory. +.PP +.in +4n +.EX +$ \fBmkdir /tmp/rootfs\fP +$ \fBls \-id /tmp/rootfs\fP # Show inode number of new root directory +319459 /tmp/rootfs +$ \fBcp $(which busybox) /tmp/rootfs\fP +$ \fBPS1=\[aq]bbsh$ \[aq] sudo ./pivot_root_demo /tmp/rootfs /busybox sh\fP +bbsh$ \fBPATH=/\fP +bbsh$ \fBbusybox ln busybox ln\fP +bbsh$ \fBln busybox echo\fP +bbsh$ \fBln busybox ls\fP +bbsh$ \fBls\fP +busybox echo ln ls +bbsh$ \fBls \-id /\fP # Compare with inode number above +319459 / +bbsh$ \fBecho \[aq]hello world\[aq]\fP +hello world +.EE +.in +.SS Program source +\& +.PP +.\" SRC BEGIN (pivot_root.c) +.EX +/* pivot_root_demo.c */ +\& +#define _GNU_SOURCE +#include <err.h> +#include <limits.h> +#include <sched.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <sys/mount.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/wait.h> +#include <unistd.h> +\& +static int +pivot_root(const char *new_root, const char *put_old) +{ + return syscall(SYS_pivot_root, new_root, put_old); +} +\& +#define STACK_SIZE (1024 * 1024) +\& +static int /* Startup function for cloned child */ +child(void *arg) +{ + char path[PATH_MAX]; + char **args = arg; + char *new_root = args[0]; + const char *put_old = "/oldrootfs"; +\& + /* Ensure that \[aq]new_root\[aq] and its parent mount don\[aq]t have + shared propagation (which would cause pivot_root() to + return an error), and prevent propagation of mount + events to the initial mount namespace. */ +\& + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) == \-1) + err(EXIT_FAILURE, "mount\-MS_PRIVATE"); +\& + /* Ensure that \[aq]new_root\[aq] is a mount point. */ +\& + if (mount(new_root, new_root, NULL, MS_BIND, NULL) == \-1) + err(EXIT_FAILURE, "mount\-MS_BIND"); +\& + /* Create directory to which old root will be pivoted. */ +\& + snprintf(path, sizeof(path), "%s/%s", new_root, put_old); + if (mkdir(path, 0777) == \-1) + err(EXIT_FAILURE, "mkdir"); +\& + /* And pivot the root filesystem. */ +\& + if (pivot_root(new_root, path) == \-1) + err(EXIT_FAILURE, "pivot_root"); +\& + /* Switch the current working directory to "/". */ +\& + if (chdir("/") == \-1) + err(EXIT_FAILURE, "chdir"); +\& + /* Unmount old root and remove mount point. */ +\& + if (umount2(put_old, MNT_DETACH) == \-1) + perror("umount2"); + if (rmdir(put_old) == \-1) + perror("rmdir"); +\& + /* Execute the command specified in argv[1]... */ +\& + execv(args[1], &args[1]); + err(EXIT_FAILURE, "execv"); +} +\& +int +main(int argc, char *argv[]) +{ + char *stack; +\& + /* Create a child process in a new mount namespace. */ +\& + stack = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, \-1, 0); + if (stack == MAP_FAILED) + err(EXIT_FAILURE, "mmap"); +\& + if (clone(child, stack + STACK_SIZE, + CLONE_NEWNS | SIGCHLD, &argv[1]) == \-1) + err(EXIT_FAILURE, "clone"); +\& + /* Parent falls through to here; wait for child. */ +\& + if (wait(NULL) == \-1) + err(EXIT_FAILURE, "wait"); +\& + exit(EXIT_SUCCESS); +} +.EE +.\" SRC END +.SH SEE ALSO +.BR chdir (2), +.BR chroot (2), +.BR mount (2), +.BR stat (2), +.BR initrd (4), +.BR mount_namespaces (7), +.BR pivot_root (8), +.BR switch_root (8) |