summaryrefslogtreecommitdiffstats
path: root/man2/memfd_create.2
diff options
context:
space:
mode:
Diffstat (limited to 'man2/memfd_create.2')
-rw-r--r--man2/memfd_create.2545
1 files changed, 545 insertions, 0 deletions
diff --git a/man2/memfd_create.2 b/man2/memfd_create.2
new file mode 100644
index 0000000..fb18abc
--- /dev/null
+++ b/man2/memfd_create.2
@@ -0,0 +1,545 @@
+.\" Copyright (C) 2014 Michael Kerrisk <mtk.manpages@gmail.com>
+.\" and Copyright (C) 2014 David Herrmann <dh.herrmann@gmail.com>
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.TH memfd_create 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+memfd_create \- create an anonymous file
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
+.B #include <sys/mman.h>
+.PP
+.BI "int memfd_create(const char *" name ", unsigned int " flags ");"
+.fi
+.SH DESCRIPTION
+.BR memfd_create ()
+creates an anonymous file and returns a file descriptor that refers to it.
+The file behaves like a regular file, and so can be modified,
+truncated, memory-mapped, and so on.
+However, unlike a regular file,
+it lives in RAM and has a volatile backing storage.
+Once all references to the file are dropped, it is automatically released.
+Anonymous memory is used for all backing pages of the file.
+Therefore, files created by
+.BR memfd_create ()
+have the same semantics as other anonymous
+.\" David Herrmann:
+.\" memfd uses VM_NORESERVE so each page is accounted on first access.
+.\" This means, the overcommit-limits (see __vm_enough_memory()) and the
+.\" memory-cgroup limits (mem_cgroup_try_charge()) are applied. Note that
+.\" those are accounted on "current" and "current->mm", that is, the
+.\" process doing the first page access.
+memory allocations such as those allocated using
+.BR mmap (2)
+with the
+.B MAP_ANONYMOUS
+flag.
+.PP
+The initial size of the file is set to 0.
+Following the call, the file size should be set using
+.BR ftruncate (2).
+(Alternatively, the file may be populated by calls to
+.BR write (2)
+or similar.)
+.PP
+The name supplied in
+.I name
+is used as a filename and will be displayed
+as the target of the corresponding symbolic link in the directory
+.IR /proc/self/fd/ .
+The displayed name is always prefixed with
+.I memfd:
+and serves only for debugging purposes.
+Names do not affect the behavior of the file descriptor,
+and as such multiple files can have the same name without any side effects.
+.PP
+The following values may be bitwise ORed in
+.I flags
+to change the behavior of
+.BR memfd_create ():
+.TP
+.B MFD_CLOEXEC
+Set the close-on-exec
+.RB ( FD_CLOEXEC )
+flag on the new file descriptor.
+See the description of the
+.B O_CLOEXEC
+flag in
+.BR open (2)
+for reasons why this may be useful.
+.TP
+.B MFD_ALLOW_SEALING
+Allow sealing operations on this file.
+See the discussion of the
+.B F_ADD_SEALS
+and
+.B F_GET_SEALS
+operations in
+.BR fcntl (2),
+and also NOTES, below.
+The initial set of seals is empty.
+If this flag is not set, the initial set of seals will be
+.BR F_SEAL_SEAL ,
+meaning that no other seals can be set on the file.
+.\" FIXME Why is the MFD_ALLOW_SEALING behavior not simply the default?
+.\" Is it worth adding some text explaining this?
+.TP
+.BR MFD_HUGETLB " (since Linux 4.14)"
+.\" commit 749df87bd7bee5a79cef073f5d032ddb2b211de8
+The anonymous file will be created in the hugetlbfs filesystem using
+huge pages.
+See the Linux kernel source file
+.I Documentation/admin\-guide/mm/hugetlbpage.rst
+for more information about hugetlbfs.
+.\" commit 47b9012ecdc747f6936395265e677d41e11a31ff
+Specifying both
+.B MFD_HUGETLB
+and
+.B MFD_ALLOW_SEALING
+in
+.I flags
+is supported since Linux 4.16.
+.TP
+.BR MFD_HUGE_2MB ", " MFD_HUGE_1GB ", " "..."
+Used in conjunction with
+.B MFD_HUGETLB
+to select alternative hugetlb page sizes (respectively, 2\ MB, 1\ GB, ...)
+on systems that support multiple hugetlb page sizes.
+Definitions for known
+huge page sizes are included in the header file
+.I <linux/memfd.h>.
+.IP
+For details on encoding huge page sizes not included in the header file,
+see the discussion of the similarly named constants in
+.BR mmap (2).
+.PP
+Unused bits in
+.I flags
+must be 0.
+.PP
+As its return value,
+.BR memfd_create ()
+returns a new file descriptor that can be used to refer to the file.
+This file descriptor is opened for both reading and writing
+.RB ( O_RDWR )
+and
+.B O_LARGEFILE
+is set for the file descriptor.
+.PP
+With respect to
+.BR fork (2)
+and
+.BR execve (2),
+the usual semantics apply for the file descriptor created by
+.BR memfd_create ().
+A copy of the file descriptor is inherited by the child produced by
+.BR fork (2)
+and refers to the same file.
+The file descriptor is preserved across
+.BR execve (2),
+unless the close-on-exec flag has been set.
+.SH RETURN VALUE
+On success,
+.BR memfd_create ()
+returns a new file descriptor.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+The address in
+.I name
+points to invalid memory.
+.TP
+.B EINVAL
+.I flags
+included unknown bits.
+.TP
+.B EINVAL
+.I name
+was too long.
+(The limit is
+.\" NAME_MAX - strlen("memfd:")
+249 bytes, excluding the terminating null byte.)
+.TP
+.B EINVAL
+Both
+.B MFD_HUGETLB
+and
+.B MFD_ALLOW_SEALING
+were specified in
+.IR flags .
+.TP
+.B EMFILE
+The per-process limit on the number of open file descriptors has been reached.
+.TP
+.B ENFILE
+The system-wide limit on the total number of open files has been reached.
+.TP
+.B ENOMEM
+There was insufficient memory to create a new anonymous file.
+.TP
+.B EPERM
+The
+.B MFD_HUGETLB
+flag was specified, but the caller was not privileged (did not have the
+.B CAP_IPC_LOCK
+capability)
+and is not a member of the
+.I sysctl_hugetlb_shm_group
+group; see the description of
+.I /proc/sys/vm/sysctl_hugetlb_shm_group
+in
+.BR proc (5).
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 3.17,
+glibc 2.27.
+.SH NOTES
+.\" See also http://lwn.net/Articles/593918/
+.\" and http://lwn.net/Articles/594919/ and http://lwn.net/Articles/591108/
+The
+.BR memfd_create ()
+system call provides a simple alternative to manually mounting a
+.BR tmpfs (5)
+filesystem and creating and opening a file in that filesystem.
+The primary purpose of
+.BR memfd_create ()
+is to create files and associated file descriptors that are
+used with the file-sealing APIs provided by
+.BR fcntl (2).
+.PP
+The
+.BR memfd_create ()
+system call also has uses without file sealing
+(which is why file-sealing is disabled, unless explicitly requested with the
+.B MFD_ALLOW_SEALING
+flag).
+In particular, it can be used as an alternative to creating files in
+.I tmp
+or as an alternative to using the
+.BR open (2)
+.B O_TMPFILE
+in cases where there is no intention to actually link the
+resulting file into the filesystem.
+.SS File sealing
+In the absence of file sealing,
+processes that communicate via shared memory must either trust each other,
+or take measures to deal with the possibility that an untrusted peer
+may manipulate the shared memory region in problematic ways.
+For example, an untrusted peer might modify the contents of the
+shared memory at any time, or shrink the shared memory region.
+The former possibility leaves the local process vulnerable to
+time-of-check-to-time-of-use race conditions
+(typically dealt with by copying data from
+the shared memory region before checking and using it).
+The latter possibility leaves the local process vulnerable to
+.B SIGBUS
+signals when an attempt is made to access a now-nonexistent
+location in the shared memory region.
+(Dealing with this possibility necessitates the use of a handler for the
+.B SIGBUS
+signal.)
+.PP
+Dealing with untrusted peers imposes extra complexity on
+code that employs shared memory.
+Memory sealing enables that extra complexity to be eliminated,
+by allowing a process to operate secure in the knowledge that
+its peer can't modify the shared memory in an undesired fashion.
+.PP
+An example of the usage of the sealing mechanism is as follows:
+.IP (1) 5
+The first process creates a
+.BR tmpfs (5)
+file using
+.BR memfd_create ().
+The call yields a file descriptor used in subsequent steps.
+.IP (2)
+The first process
+sizes the file created in the previous step using
+.BR ftruncate (2),
+maps it using
+.BR mmap (2),
+and populates the shared memory with the desired data.
+.IP (3)
+The first process uses the
+.BR fcntl (2)
+.B F_ADD_SEALS
+operation to place one or more seals on the file,
+in order to restrict further modifications on the file.
+(If placing the seal
+.BR F_SEAL_WRITE ,
+then it will be necessary to first unmap the shared writable mapping
+created in the previous step.
+Otherwise, behavior similar to
+.B F_SEAL_WRITE
+can be achieved by using
+.BR F_SEAL_FUTURE_WRITE ,
+which will prevent future writes via
+.BR mmap (2)
+and
+.BR write (2)
+from succeeding while keeping existing shared writable mappings).
+.IP (4)
+A second process obtains a file descriptor for the
+.BR tmpfs (5)
+file and maps it.
+Among the possible ways in which this could happen are the following:
+.RS
+.IP \[bu] 3
+The process that called
+.BR memfd_create ()
+could transfer the resulting file descriptor to the second process
+via a UNIX domain socket (see
+.BR unix (7)
+and
+.BR cmsg (3)).
+The second process then maps the file using
+.BR mmap (2).
+.IP \[bu]
+The second process is created via
+.BR fork (2)
+and thus automatically inherits the file descriptor and mapping.
+(Note that in this case and the next,
+there is a natural trust relationship between the two processes,
+since they are running under the same user ID.
+Therefore, file sealing would not normally be necessary.)
+.IP \[bu]
+The second process opens the file
+.IR /proc/ pid /fd/ fd,
+where
+.I <pid>
+is the PID of the first process (the one that called
+.BR memfd_create ()),
+and
+.I <fd>
+is the number of the file descriptor returned by the call to
+.BR memfd_create ()
+in that process.
+The second process then maps the file using
+.BR mmap (2).
+.RE
+.IP (5)
+The second process uses the
+.BR fcntl (2)
+.B F_GET_SEALS
+operation to retrieve the bit mask of seals
+that has been applied to the file.
+This bit mask can be inspected in order to determine
+what kinds of restrictions have been placed on file modifications.
+If desired, the second process can apply further seals
+to impose additional restrictions (so long as the
+.B F_SEAL_SEAL
+seal has not yet been applied).
+.SH EXAMPLES
+Below are shown two example programs that demonstrate the use of
+.BR memfd_create ()
+and the file sealing API.
+.PP
+The first program,
+.IR t_memfd_create.c ,
+creates a
+.BR tmpfs (5)
+file using
+.BR memfd_create (),
+sets a size for the file, maps it into memory,
+and optionally places some seals on the file.
+The program accepts up to three command-line arguments,
+of which the first two are required.
+The first argument is the name to associate with the file,
+the second argument is the size to be set for the file,
+and the optional third argument is a string of characters that specify
+seals to be set on the file.
+.PP
+The second program,
+.IR t_get_seals.c ,
+can be used to open an existing file that was created via
+.BR memfd_create ()
+and inspect the set of seals that have been applied to that file.
+.PP
+The following shell session demonstrates the use of these programs.
+First we create a
+.BR tmpfs (5)
+file and set some seals on it:
+.PP
+.in +4n
+.EX
+$ \fB./t_memfd_create my_memfd_file 4096 sw &\fP
+[1] 11775
+PID: 11775; fd: 3; /proc/11775/fd/3
+.EE
+.in
+.PP
+At this point, the
+.I t_memfd_create
+program continues to run in the background.
+From another program, we can obtain a file descriptor for the
+file created by
+.BR memfd_create ()
+by opening the
+.IR /proc/ pid /fd
+file that corresponds to the file descriptor opened by
+.BR memfd_create ().
+Using that pathname, we inspect the content of the
+.IR /proc/ pid /fd
+symbolic link, and use our
+.I t_get_seals
+program to view the seals that have been placed on the file:
+.PP
+.in +4n
+.EX
+$ \fBreadlink /proc/11775/fd/3\fP
+/memfd:my_memfd_file (deleted)
+$ \fB./t_get_seals /proc/11775/fd/3\fP
+Existing seals: WRITE SHRINK
+.EE
+.in
+.SS Program source: t_memfd_create.c
+\&
+.\" SRC BEGIN (t_memfd_create.c)
+.EX
+#define _GNU_SOURCE
+#include <err.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+\&
+int
+main(int argc, char *argv[])
+{
+ int fd;
+ char *name, *seals_arg;
+ ssize_t len;
+ unsigned int seals;
+\&
+ if (argc < 3) {
+ fprintf(stderr, "%s name size [seals]\en", argv[0]);
+ fprintf(stderr, "\et\[aq]seals\[aq] can contain any of the "
+ "following characters:\en");
+ fprintf(stderr, "\et\etg \- F_SEAL_GROW\en");
+ fprintf(stderr, "\et\ets \- F_SEAL_SHRINK\en");
+ fprintf(stderr, "\et\etw \- F_SEAL_WRITE\en");
+ fprintf(stderr, "\et\etW \- F_SEAL_FUTURE_WRITE\en");
+ fprintf(stderr, "\et\etS \- F_SEAL_SEAL\en");
+ exit(EXIT_FAILURE);
+ }
+\&
+ name = argv[1];
+ len = atoi(argv[2]);
+ seals_arg = argv[3];
+\&
+ /* Create an anonymous file in tmpfs; allow seals to be
+ placed on the file. */
+\&
+ fd = memfd_create(name, MFD_ALLOW_SEALING);
+ if (fd == \-1)
+ err(EXIT_FAILURE, "memfd_create");
+\&
+ /* Size the file as specified on the command line. */
+\&
+ if (ftruncate(fd, len) == \-1)
+ err(EXIT_FAILURE, "truncate");
+\&
+ printf("PID: %jd; fd: %d; /proc/%jd/fd/%d\en",
+ (intmax_t) getpid(), fd, (intmax_t) getpid(), fd);
+\&
+ /* Code to map the file and populate the mapping with data
+ omitted. */
+\&
+ /* If a \[aq]seals\[aq] command\-line argument was supplied, set some
+ seals on the file. */
+\&
+ if (seals_arg != NULL) {
+ seals = 0;
+\&
+ if (strchr(seals_arg, \[aq]g\[aq]) != NULL)
+ seals |= F_SEAL_GROW;
+ if (strchr(seals_arg, \[aq]s\[aq]) != NULL)
+ seals |= F_SEAL_SHRINK;
+ if (strchr(seals_arg, \[aq]w\[aq]) != NULL)
+ seals |= F_SEAL_WRITE;
+ if (strchr(seals_arg, \[aq]W\[aq]) != NULL)
+ seals |= F_SEAL_FUTURE_WRITE;
+ if (strchr(seals_arg, \[aq]S\[aq]) != NULL)
+ seals |= F_SEAL_SEAL;
+\&
+ if (fcntl(fd, F_ADD_SEALS, seals) == \-1)
+ err(EXIT_FAILURE, "fcntl");
+ }
+\&
+ /* Keep running, so that the file created by memfd_create()
+ continues to exist. */
+\&
+ pause();
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SS Program source: t_get_seals.c
+\&
+.\" SRC BEGIN (t_get_seals.c)
+.EX
+#define _GNU_SOURCE
+#include <err.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+\&
+int
+main(int argc, char *argv[])
+{
+ int fd;
+ unsigned int seals;
+\&
+ if (argc != 2) {
+ fprintf(stderr, "%s /proc/PID/fd/FD\en", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ fd = open(argv[1], O_RDWR);
+ if (fd == \-1)
+ err(EXIT_FAILURE, "open");
+\&
+ seals = fcntl(fd, F_GET_SEALS);
+ if (seals == \-1)
+ err(EXIT_FAILURE, "fcntl");
+\&
+ printf("Existing seals:");
+ if (seals & F_SEAL_SEAL)
+ printf(" SEAL");
+ if (seals & F_SEAL_GROW)
+ printf(" GROW");
+ if (seals & F_SEAL_WRITE)
+ printf(" WRITE");
+ if (seals & F_SEAL_FUTURE_WRITE)
+ printf(" FUTURE_WRITE");
+ if (seals & F_SEAL_SHRINK)
+ printf(" SHRINK");
+ printf("\en");
+\&
+ /* Code to map the file and access the contents of the
+ resulting mapping omitted. */
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR fcntl (2),
+.BR ftruncate (2),
+.BR memfd_secret (2),
+.BR mmap (2),
+.BR shmget (2),
+.BR shm_open (3)