diff options
Diffstat (limited to 'man2/memfd_create.2')
-rw-r--r-- | man2/memfd_create.2 | 545 |
1 files changed, 545 insertions, 0 deletions
diff --git a/man2/memfd_create.2 b/man2/memfd_create.2 new file mode 100644 index 0000000..fb18abc --- /dev/null +++ b/man2/memfd_create.2 @@ -0,0 +1,545 @@ +.\" Copyright (C) 2014 Michael Kerrisk <mtk.manpages@gmail.com> +.\" and Copyright (C) 2014 David Herrmann <dh.herrmann@gmail.com> +.\" +.\" SPDX-License-Identifier: GPL-2.0-or-later +.\" +.TH memfd_create 2 2023-05-03 "Linux man-pages 6.05.01" +.SH NAME +memfd_create \- create an anonymous file +.SH LIBRARY +Standard C library +.RI ( libc ", " \-lc ) +.SH SYNOPSIS +.nf +.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */" +.B #include <sys/mman.h> +.PP +.BI "int memfd_create(const char *" name ", unsigned int " flags ");" +.fi +.SH DESCRIPTION +.BR memfd_create () +creates an anonymous file and returns a file descriptor that refers to it. +The file behaves like a regular file, and so can be modified, +truncated, memory-mapped, and so on. +However, unlike a regular file, +it lives in RAM and has a volatile backing storage. +Once all references to the file are dropped, it is automatically released. +Anonymous memory is used for all backing pages of the file. +Therefore, files created by +.BR memfd_create () +have the same semantics as other anonymous +.\" David Herrmann: +.\" memfd uses VM_NORESERVE so each page is accounted on first access. +.\" This means, the overcommit-limits (see __vm_enough_memory()) and the +.\" memory-cgroup limits (mem_cgroup_try_charge()) are applied. Note that +.\" those are accounted on "current" and "current->mm", that is, the +.\" process doing the first page access. +memory allocations such as those allocated using +.BR mmap (2) +with the +.B MAP_ANONYMOUS +flag. +.PP +The initial size of the file is set to 0. +Following the call, the file size should be set using +.BR ftruncate (2). +(Alternatively, the file may be populated by calls to +.BR write (2) +or similar.) 
+.PP +The name supplied in +.I name +is used as a filename and will be displayed +as the target of the corresponding symbolic link in the directory +.IR /proc/self/fd/ . +The displayed name is always prefixed with +.I memfd: +and serves only for debugging purposes. +Names do not affect the behavior of the file descriptor, +and as such multiple files can have the same name without any side effects. +.PP +The following values may be bitwise ORed in +.I flags +to change the behavior of +.BR memfd_create (): +.TP +.B MFD_CLOEXEC +Set the close-on-exec +.RB ( FD_CLOEXEC ) +flag on the new file descriptor. +See the description of the +.B O_CLOEXEC +flag in +.BR open (2) +for reasons why this may be useful. +.TP +.B MFD_ALLOW_SEALING +Allow sealing operations on this file. +See the discussion of the +.B F_ADD_SEALS +and +.B F_GET_SEALS +operations in +.BR fcntl (2), +and also NOTES, below. +The initial set of seals is empty. +If this flag is not set, the initial set of seals will be +.BR F_SEAL_SEAL , +meaning that no other seals can be set on the file. +.\" FIXME Why is the MFD_ALLOW_SEALING behavior not simply the default? +.\" Is it worth adding some text explaining this? +.TP +.BR MFD_HUGETLB " (since Linux 4.14)" +.\" commit 749df87bd7bee5a79cef073f5d032ddb2b211de8 +The anonymous file will be created in the hugetlbfs filesystem using +huge pages. +See the Linux kernel source file +.I Documentation/admin\-guide/mm/hugetlbpage.rst +for more information about hugetlbfs. +.\" commit 47b9012ecdc747f6936395265e677d41e11a31ff +Specifying both +.B MFD_HUGETLB +and +.B MFD_ALLOW_SEALING +in +.I flags +is supported since Linux 4.16. +.TP +.BR MFD_HUGE_2MB ", " MFD_HUGE_1GB ", " "..." +Used in conjunction with +.B MFD_HUGETLB +to select alternative hugetlb page sizes (respectively, 2\ MB, 1\ GB, ...) +on systems that support multiple hugetlb page sizes. +Definitions for known +huge page sizes are included in the header file +.I <linux/memfd.h>. 
+.IP +For details on encoding huge page sizes not included in the header file, +see the discussion of the similarly named constants in +.BR mmap (2). +.PP +Unused bits in +.I flags +must be 0. +.PP +As its return value, +.BR memfd_create () +returns a new file descriptor that can be used to refer to the file. +This file descriptor is opened for both reading and writing +.RB ( O_RDWR ) +and +.B O_LARGEFILE +is set for the file descriptor. +.PP +With respect to +.BR fork (2) +and +.BR execve (2), +the usual semantics apply for the file descriptor created by +.BR memfd_create (). +A copy of the file descriptor is inherited by the child produced by +.BR fork (2) +and refers to the same file. +The file descriptor is preserved across +.BR execve (2), +unless the close-on-exec flag has been set. +.SH RETURN VALUE +On success, +.BR memfd_create () +returns a new file descriptor. +On error, \-1 is returned and +.I errno +is set to indicate the error. +.SH ERRORS +.TP +.B EFAULT +The address in +.I name +points to invalid memory. +.TP +.B EINVAL +.I flags +included unknown bits. +.TP +.B EINVAL +.I name +was too long. +(The limit is +.\" NAME_MAX - strlen("memfd:") +249 bytes, excluding the terminating null byte.) +.TP +.B EINVAL +On kernels before Linux 4.16, both +.B MFD_HUGETLB +and +.B MFD_ALLOW_SEALING +were specified in +.IR flags . +.TP +.B EMFILE +The per-process limit on the number of open file descriptors has been reached. +.TP +.B ENFILE +The system-wide limit on the total number of open files has been reached. +.TP +.B ENOMEM +There was insufficient memory to create a new anonymous file. +.TP +.B EPERM +The +.B MFD_HUGETLB +flag was specified, but the caller was not privileged (did not have the +.B CAP_IPC_LOCK +capability) +and was not a member of the +.I sysctl_hugetlb_shm_group +group; see the description of +.I /proc/sys/vm/hugetlb_shm_group +in +.BR proc (5). +.SH STANDARDS +Linux. +.SH HISTORY +Linux 3.17, +glibc 2.27.
+.SH NOTES +.\" See also http://lwn.net/Articles/593918/ +.\" and http://lwn.net/Articles/594919/ and http://lwn.net/Articles/591108/ +The +.BR memfd_create () +system call provides a simple alternative to manually mounting a +.BR tmpfs (5) +filesystem and creating and opening a file in that filesystem. +The primary purpose of +.BR memfd_create () +is to create files and associated file descriptors that are +used with the file-sealing APIs provided by +.BR fcntl (2). +.PP +The +.BR memfd_create () +system call also has uses without file sealing +(which is why file sealing is disabled, unless explicitly requested with the +.B MFD_ALLOW_SEALING +flag). +In particular, it can be used as an alternative to creating files in +.I /tmp +or as an alternative to using the +.BR open (2) +.B O_TMPFILE +flag in cases where there is no intention to actually link the +resulting file into the filesystem. +.SS File sealing +In the absence of file sealing, +processes that communicate via shared memory must either trust each other, +or take measures to deal with the possibility that an untrusted peer +may manipulate the shared memory region in problematic ways. +For example, an untrusted peer might modify the contents of the +shared memory at any time, or shrink the shared memory region. +The former possibility leaves the local process vulnerable to +time-of-check-to-time-of-use race conditions +(typically dealt with by copying data from +the shared memory region before checking and using it). +The latter possibility leaves the local process vulnerable to +.B SIGBUS +signals when an attempt is made to access a now-nonexistent +location in the shared memory region. +(Dealing with this possibility necessitates the use of a handler for the +.B SIGBUS +signal.) +.PP +Dealing with untrusted peers imposes extra complexity on +code that employs shared memory.
+File sealing enables that extra complexity to be eliminated, +by allowing a process to operate secure in the knowledge that +its peer can't modify the shared memory in an undesired fashion. +.PP +An example of the usage of the sealing mechanism is as follows: +.IP (1) 5 +The first process creates a +.BR tmpfs (5) +file using +.BR memfd_create (). +The call yields a file descriptor used in subsequent steps. +.IP (2) +The first process +sizes the file created in the previous step using +.BR ftruncate (2), +maps it using +.BR mmap (2), +and populates the shared memory with the desired data. +.IP (3) +The first process uses the +.BR fcntl (2) +.B F_ADD_SEALS +operation to place one or more seals on the file, +in order to restrict further modifications on the file. +(If placing the seal +.BR F_SEAL_WRITE , +then it will be necessary to first unmap the shared writable mapping +created in the previous step. +Alternatively, behavior similar to +.B F_SEAL_WRITE +can be achieved by using +.BR F_SEAL_FUTURE_WRITE , +which will prevent future writes via +.BR mmap (2) +and +.BR write (2) +from succeeding while keeping existing shared writable mappings.) +.IP (4) +A second process obtains a file descriptor for the +.BR tmpfs (5) +file and maps it. +Among the possible ways in which this could happen are the following: +.RS +.IP \[bu] 3 +The process that called +.BR memfd_create () +could transfer the resulting file descriptor to the second process +via a UNIX domain socket (see +.BR unix (7) +and +.BR cmsg (3)). +The second process then maps the file using +.BR mmap (2). +.IP \[bu] +The second process is created via +.BR fork (2) +and thus automatically inherits the file descriptor and mapping. +(Note that in this case and the next, +there is a natural trust relationship between the two processes, +since they are running under the same user ID. +Therefore, file sealing would not normally be necessary.)
+.IP \[bu] +The second process opens the file +.IR /proc/ pid /fd/ fd , +where +.I pid +is the PID of the first process (the one that called +.BR memfd_create ()), +and +.I fd +is the number of the file descriptor returned by the call to +.BR memfd_create () +in that process. +The second process then maps the file using +.BR mmap (2). +.RE +.IP (5) +The second process uses the +.BR fcntl (2) +.B F_GET_SEALS +operation to retrieve the bit mask of seals +that has been applied to the file. +This bit mask can be inspected in order to determine +what kinds of restrictions have been placed on file modifications. +If desired, the second process can apply further seals +to impose additional restrictions (so long as the +.B F_SEAL_SEAL +seal has not yet been applied). +.SH EXAMPLES +Below are shown two example programs that demonstrate the use of +.BR memfd_create () +and the file sealing API. +.PP +The first program, +.IR t_memfd_create.c , +creates a +.BR tmpfs (5) +file using +.BR memfd_create (), +sets a size for the file, maps it into memory, +and optionally places some seals on the file. +The program accepts up to three command-line arguments, +of which the first two are required. +The first argument is the name to associate with the file, +the second argument is the size to be set for the file, +and the optional third argument is a string of characters that specify +seals to be set on the file. +.PP +The second program, +.IR t_get_seals.c , +can be used to open an existing file that was created via +.BR memfd_create () +and inspect the set of seals that have been applied to that file. +.PP +The following shell session demonstrates the use of these programs. +First we create a +.BR tmpfs (5) +file and set some seals on it: +.PP +.in +4n +.EX +$ \fB./t_memfd_create my_memfd_file 4096 sw &\fP +[1] 11775 +PID: 11775; fd: 3; /proc/11775/fd/3 +.EE +.in +.PP +At this point, the +.I t_memfd_create +program continues to run in the background.
+From another program, we can obtain a file descriptor for the +file created by +.BR memfd_create () +by opening the +.IR /proc/ pid /fd +file that corresponds to the file descriptor opened by +.BR memfd_create (). +Using that pathname, we inspect the content of the +.IR /proc/ pid /fd +symbolic link, and use our +.I t_get_seals +program to view the seals that have been placed on the file: +.PP +.in +4n +.EX +$ \fBreadlink /proc/11775/fd/3\fP +/memfd:my_memfd_file (deleted) +$ \fB./t_get_seals /proc/11775/fd/3\fP +Existing seals: WRITE SHRINK +.EE +.in +.SS Program source: t_memfd_create.c +\& +.\" SRC BEGIN (t_memfd_create.c) +.EX +#define _GNU_SOURCE +#include <err.h> +#include <fcntl.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <unistd.h> +\& +int +main(int argc, char *argv[]) +{ + int fd; + char *name, *seals_arg; + ssize_t len; + unsigned int seals; +\& + if (argc < 3) { + fprintf(stderr, "%s name size [seals]\en", argv[0]); + fprintf(stderr, "\et\[aq]seals\[aq] can contain any of the " + "following characters:\en"); + fprintf(stderr, "\et\etg \- F_SEAL_GROW\en"); + fprintf(stderr, "\et\ets \- F_SEAL_SHRINK\en"); + fprintf(stderr, "\et\etw \- F_SEAL_WRITE\en"); + fprintf(stderr, "\et\etW \- F_SEAL_FUTURE_WRITE\en"); + fprintf(stderr, "\et\etS \- F_SEAL_SEAL\en"); + exit(EXIT_FAILURE); + } +\& + name = argv[1]; + len = atoi(argv[2]); + seals_arg = argv[3]; +\& + /* Create an anonymous file in tmpfs; allow seals to be + placed on the file. */ +\& + fd = memfd_create(name, MFD_ALLOW_SEALING); + if (fd == \-1) + err(EXIT_FAILURE, "memfd_create"); +\& + /* Size the file as specified on the command line. */ +\& + if (ftruncate(fd, len) == \-1) + err(EXIT_FAILURE, "truncate"); +\& + printf("PID: %jd; fd: %d; /proc/%jd/fd/%d\en", + (intmax_t) getpid(), fd, (intmax_t) getpid(), fd); +\& + /* Code to map the file and populate the mapping with data + omitted. 
*/ +\& + /* If a \[aq]seals\[aq] command\-line argument was supplied, set some + seals on the file. */ +\& + if (seals_arg != NULL) { + seals = 0; +\& + if (strchr(seals_arg, \[aq]g\[aq]) != NULL) + seals |= F_SEAL_GROW; + if (strchr(seals_arg, \[aq]s\[aq]) != NULL) + seals |= F_SEAL_SHRINK; + if (strchr(seals_arg, \[aq]w\[aq]) != NULL) + seals |= F_SEAL_WRITE; + if (strchr(seals_arg, \[aq]W\[aq]) != NULL) + seals |= F_SEAL_FUTURE_WRITE; + if (strchr(seals_arg, \[aq]S\[aq]) != NULL) + seals |= F_SEAL_SEAL; +\& + if (fcntl(fd, F_ADD_SEALS, seals) == \-1) + err(EXIT_FAILURE, "fcntl"); + } +\& + /* Keep running, so that the file created by memfd_create() + continues to exist. */ +\& + pause(); +\& + exit(EXIT_SUCCESS); +} +.EE +.\" SRC END +.SS Program source: t_get_seals.c +\& +.\" SRC BEGIN (t_get_seals.c) +.EX +#define _GNU_SOURCE +#include <err.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +\& +int +main(int argc, char *argv[]) +{ + int fd; + unsigned int seals; +\& + if (argc != 2) { + fprintf(stderr, "%s /proc/PID/fd/FD\en", argv[0]); + exit(EXIT_FAILURE); + } +\& + fd = open(argv[1], O_RDWR); + if (fd == \-1) + err(EXIT_FAILURE, "open"); +\& + seals = fcntl(fd, F_GET_SEALS); + if (seals == \-1) + err(EXIT_FAILURE, "fcntl"); +\& + printf("Existing seals:"); + if (seals & F_SEAL_SEAL) + printf(" SEAL"); + if (seals & F_SEAL_GROW) + printf(" GROW"); + if (seals & F_SEAL_WRITE) + printf(" WRITE"); + if (seals & F_SEAL_FUTURE_WRITE) + printf(" FUTURE_WRITE"); + if (seals & F_SEAL_SHRINK) + printf(" SHRINK"); + printf("\en"); +\& + /* Code to map the file and access the contents of the + resulting mapping omitted. */ +\& + exit(EXIT_SUCCESS); +} +.EE +.\" SRC END +.SH SEE ALSO +.BR fcntl (2), +.BR ftruncate (2), +.BR memfd_secret (2), +.BR mmap (2), +.BR shmget (2), +.BR shm_open (3) |