summaryrefslogtreecommitdiffstats
path: root/man2/userfaultfd.2
diff options
context:
space:
mode:
Diffstat (limited to 'man2/userfaultfd.2')
-rw-r--r--man2/userfaultfd.2943
1 files changed, 943 insertions, 0 deletions
diff --git a/man2/userfaultfd.2 b/man2/userfaultfd.2
new file mode 100644
index 0000000..82903c6
--- /dev/null
+++ b/man2/userfaultfd.2
@@ -0,0 +1,943 @@
+.\" Copyright (c) 2016, IBM Corporation.
+.\" Written by Mike Rapoport <rppt@linux.vnet.ibm.com>
+.\" and Copyright (C) 2017 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH userfaultfd 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+userfaultfd \- create a file descriptor for handling page faults in user space
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <fcntl.h>" " /* Definition of " O_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.BR "#include <linux/userfaultfd.h>" " /* Definition of " UFFD_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_userfaultfd, int " flags );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR userfaultfd (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+.BR userfaultfd ()
+creates a new userfaultfd object that can be used for delegation of page-fault
+handling to a user-space application,
+and returns a file descriptor that refers to the new object.
+The new userfaultfd object is configured using
+.BR ioctl (2).
+.PP
+Once the userfaultfd object is configured, the application can use
+.BR read (2)
+to receive userfaultfd notifications.
+The reads from userfaultfd may be blocking or non-blocking,
+depending on the value of
+.I flags
+used for the creation of the userfaultfd or subsequent calls to
+.BR fcntl (2).
+.PP
+The following values may be bitwise ORed in
+.I flags
+to change the behavior of
+.BR userfaultfd ():
+.TP
+.B O_CLOEXEC
+Enable the close-on-exec flag for the new userfaultfd file descriptor.
+See the description of the
+.B O_CLOEXEC
+flag in
+.BR open (2).
+.TP
+.B O_NONBLOCK
+Enables non-blocking operation for the userfaultfd object.
+See the description of the
+.B O_NONBLOCK
+flag in
+.BR open (2).
+.TP
+.B UFFD_USER_MODE_ONLY
+This is an userfaultfd-specific flag that was introduced in Linux 5.11.
+When set, the userfaultfd object will only be able to handle
+page faults originated from the user space on the registered regions.
+When a kernel-originated fault was triggered
+on the registered range with this userfaultfd, a
+.B SIGBUS
+signal will be delivered.
+.PP
+When the last file descriptor referring to a userfaultfd object is closed,
+all memory ranges that were registered with the object are unregistered
+and unread events are flushed.
+.\"
+.PP
+Userfaultfd supports three modes of registration:
+.TP
+.BR UFFDIO_REGISTER_MODE_MISSING " (since Linux 4.10)"
+When registered with
+.B UFFDIO_REGISTER_MODE_MISSING
+mode, user-space will receive a page-fault notification
+when a missing page is accessed.
+The faulted thread will be stopped from execution until the page fault is
+resolved from user-space by either an
+.B UFFDIO_COPY
+or an
+.B UFFDIO_ZEROPAGE
+ioctl.
+.TP
+.BR UFFDIO_REGISTER_MODE_MINOR " (since Linux 5.13)"
+When registered with
+.B UFFDIO_REGISTER_MODE_MINOR
+mode, user-space will receive a page-fault notification
+when a minor page fault occurs.
+That is,
+when a backing page is in the page cache,
+but page table entries don't yet exist.
+The faulted thread will be stopped from execution
+until the page fault is resolved from user-space by an
+.B UFFDIO_CONTINUE
+ioctl.
+.TP
+.BR UFFDIO_REGISTER_MODE_WP " (since Linux 5.7)"
+When registered with
+.B UFFDIO_REGISTER_MODE_WP
+mode, user-space will receive a page-fault notification
+when a write-protected page is written.
+The faulted thread will be stopped from execution
+until user-space write-unprotects the page using an
+.B UFFDIO_WRITEPROTECT
+ioctl.
+.PP
+Multiple modes can be enabled at the same time for the same memory range.
+.PP
+Since Linux 4.14, a userfaultfd page-fault notification can selectively embed
+faulting thread ID information into the notification.
+One needs to enable this feature explicitly using the
+.B UFFD_FEATURE_THREAD_ID
+feature bit when initializing the userfaultfd context.
+By default, thread ID reporting is disabled.
+.SS Usage
+The userfaultfd mechanism is designed to allow a thread in a multithreaded
+program to perform user-space paging for the other threads in the process.
+When a page fault occurs for one of the regions registered
+to the userfaultfd object,
+the faulting thread is put to sleep and
+an event is generated that can be read via the userfaultfd file descriptor.
+The fault-handling thread reads events from this file descriptor and services
+them using the operations described in
+.BR ioctl_userfaultfd (2).
+When servicing the page fault events,
+the fault-handling thread can trigger a wake-up for the sleeping thread.
+.PP
+It is possible for the faulting threads and the fault-handling threads
+to run in the context of different processes.
+In this case, these threads may belong to different programs,
+and the program that executes the faulting threads
+will not necessarily cooperate with the program that handles the page faults.
+In such non-cooperative mode,
+the process that monitors userfaultfd and handles page faults
+needs to be aware of the changes in the virtual memory layout
+of the faulting process to avoid memory corruption.
+.PP
+Since Linux 4.11,
+userfaultfd can also notify the fault-handling threads about changes
+in the virtual memory layout of the faulting process.
+In addition, if the faulting process invokes
+.BR fork (2),
+the userfaultfd objects associated with the parent may be duplicated
+into the child process and the userfaultfd monitor will be notified
+(via the
+.B UFFD_EVENT_FORK
+described below)
+about the file descriptor associated with the userfault objects
+created for the child process,
+which allows the userfaultfd monitor to perform user-space paging
+for the child process.
+Unlike page faults which have to be synchronous and require an
+explicit or implicit wakeup,
+all other events are delivered asynchronously and
+the non-cooperative process resumes execution as
+soon as the userfaultfd manager executes
+.BR read (2).
+The userfaultfd manager should carefully synchronize calls to
+.B UFFDIO_COPY
+with the processing of events.
+.PP
+The current asynchronous model of the event delivery is optimal for
+single threaded non-cooperative userfaultfd manager implementations.
+.\" Regarding the preceding sentence, Mike Rapoport says:
+.\" The major point here is that current events delivery model could be
+.\" problematic for multi-threaded monitor. I even suspect that it would be
+.\" impossible to ensure synchronization between page faults and non-page
+.\" fault events in multi-threaded monitor.
+.\" .PP
+.\" FIXME elaborate about non-cooperating mode, describe its limitations
+.\" for kernels before Linux 4.11, features added in Linux 4.11
+.\" and limitations remaining in Linux 4.11
+.\" Maybe it's worth adding a dedicated sub-section...
+.\"
+.PP
+Since Linux 5.7, userfaultfd is able to do
+synchronous page dirty tracking using the new write-protect register mode.
+One should check against the feature bit
+.B UFFD_FEATURE_PAGEFAULT_FLAG_WP
+before using this feature.
+Similar to the original userfaultfd missing mode, the write-protect mode will
+generate a userfaultfd notification when the protected page is written.
+The user needs to resolve the page fault by unprotecting the faulted page and
+kicking the faulted thread to continue.
+For more information,
+please refer to the "Userfaultfd write-protect mode" section.
+.\"
+.SS Userfaultfd operation
+After the userfaultfd object is created with
+.BR userfaultfd (),
+the application must enable it using the
+.B UFFDIO_API
+.BR ioctl (2)
+operation.
+This operation allows a handshake between the kernel and user space
+to determine the API version and supported features.
+This operation must be performed before any of the other
+.BR ioctl (2)
+operations described below (or those operations fail with the
+.B EINVAL
+error).
+.PP
+After a successful
+.B UFFDIO_API
+operation,
+the application then registers memory address ranges using the
+.B UFFDIO_REGISTER
+.BR ioctl (2)
+operation.
+After successful completion of a
+.B UFFDIO_REGISTER
+operation,
+a page fault occurring in the requested memory range, and satisfying
+the mode defined at the registration time, will be forwarded by the kernel to
+the user-space application.
+The application can then use the
+.B UFFDIO_COPY ,
+.B UFFDIO_ZEROPAGE ,
+or
+.B UFFDIO_CONTINUE
+.BR ioctl (2)
+operations to resolve the page fault.
+.PP
+Since Linux 4.14, if the application sets the
+.B UFFD_FEATURE_SIGBUS
+feature bit using the
+.B UFFDIO_API
+.BR ioctl (2),
+no page-fault notification will be forwarded to user space.
+Instead a
+.B SIGBUS
+signal is delivered to the faulting process.
+With this feature,
+userfaultfd can be used for robustness purposes to simply catch
+any access to areas within the registered address range that do not
+have pages allocated, without having to listen to userfaultfd events.
+No userfaultfd monitor will be required for dealing with such memory
+accesses.
+For example, this feature can be useful for applications that
+want to prevent the kernel from automatically allocating pages and filling
+holes in sparse files when the hole is accessed through a memory mapping.
+.PP
+The
+.B UFFD_FEATURE_SIGBUS
+feature is implicitly inherited through
+.BR fork (2)
+if used in combination with
+.BR UFFD_FEATURE_FORK .
+.PP
+Details of the various
+.BR ioctl (2)
+operations can be found in
+.BR ioctl_userfaultfd (2).
+.PP
+Since Linux 4.11, events other than page-fault may enabled during
+.B UFFDIO_API
+operation.
+.PP
+Up to Linux 4.11,
+userfaultfd can be used only with anonymous private memory mappings.
+Since Linux 4.11,
+userfaultfd can be also used with hugetlbfs and shared memory mappings.
+.\"
+.SS Userfaultfd write-protect mode (since Linux 5.7)
+Since Linux 5.7, userfaultfd supports write-protect mode for anonymous memory.
+The user needs to first check availability of this feature using
+.B UFFDIO_API
+ioctl against the feature bit
+.B UFFD_FEATURE_PAGEFAULT_FLAG_WP
+before using this feature.
+.PP
+Since Linux 5.19,
+the write-protection mode was also supported on
+shmem and hugetlbfs memory types.
+It can be detected with the feature bit
+.BR UFFD_FEATURE_WP_HUGETLBFS_SHMEM .
+.PP
+To register with userfaultfd write-protect mode, the user needs to initiate the
+.B UFFDIO_REGISTER
+ioctl with mode
+.B UFFDIO_REGISTER_MODE_WP
+set.
+Note that it is legal to monitor the same memory range with multiple modes.
+For example, the user can do
+.B UFFDIO_REGISTER
+with the mode set to
+.BR "UFFDIO_REGISTER_MODE_MISSING | UFFDIO_REGISTER_MODE_WP" .
+When there is only
+.B UFFDIO_REGISTER_MODE_WP
+registered, user-space will
+.I not
+receive any notification when a missing page is written.
+Instead, user-space will receive a write-protect page-fault notification
+only when an existing but write-protected page got written.
+.PP
+After the
+.B UFFDIO_REGISTER
+ioctl completed with
+.B UFFDIO_REGISTER_MODE_WP
+mode set,
+the user can write-protect any existing memory within the range using the ioctl
+.B UFFDIO_WRITEPROTECT
+where
+.I uffdio_writeprotect.mode
+should be set to
+.BR UFFDIO_WRITEPROTECT_MODE_WP .
+.PP
+When a write-protect event happens,
+user-space will receive a page-fault notification whose
+.I uffd_msg.pagefault.flags
+will be with
+.B UFFD_PAGEFAULT_FLAG_WP
+flag set.
+Note: since only writes can trigger this kind of fault,
+write-protect notifications will always have the
+.B UFFD_PAGEFAULT_FLAG_WRITE
+bit set along with the
+.B UFFD_PAGEFAULT_FLAG_WP
+bit.
+.PP
+To resolve a write-protection page fault, the user should initiate another
+.B UFFDIO_WRITEPROTECT
+ioctl, whose
+.I uffd_msg.pagefault.flags
+should have the flag
+.B UFFDIO_WRITEPROTECT_MODE_WP
+cleared upon the faulted page or range.
+.\"
+.SS Userfaultfd minor fault mode (since Linux 5.13)
+Since Linux 5.13,
+userfaultfd supports minor fault mode.
+In this mode,
+fault messages are produced not for major faults
+(where the page was missing),
+but rather for minor faults,
+where a page exists in the page cache,
+but the page table entries are not yet present.
+The user needs to first check availability of this feature using the
+.B UFFDIO_API
+ioctl with the appropriate feature bits set before using this feature:
+.B UFFD_FEATURE_MINOR_HUGETLBFS
+since Linux 5.13,
+or
+.B UFFD_FEATURE_MINOR_SHMEM
+since Linux 5.14.
+.PP
+To register with userfaultfd minor fault mode,
+the user needs to initiate the
+.B UFFDIO_REGISTER
+ioctl with mode
+.B UFFD_REGISTER_MODE_MINOR
+set.
+.PP
+When a minor fault occurs,
+user-space will receive a page-fault notification
+whose
+.I uffd_msg.pagefault.flags
+will have the
+.B UFFD_PAGEFAULT_FLAG_MINOR
+flag set.
+.PP
+To resolve a minor page fault,
+the handler should decide whether or not
+the existing page contents need to be modified first.
+If so,
+this should be done in-place via a second,
+non-userfaultfd-registered mapping
+to the same backing page
+(e.g., by mapping the shmem or hugetlbfs file twice).
+Once the page is considered "up to date",
+the fault can be resolved by initiating an
+.B UFFDIO_CONTINUE
+ioctl,
+which installs the page table entries and
+(by default)
+wakes up the faulting thread(s).
+.PP
+Minor fault mode supports only hugetlbfs-backed (since Linux 5.13)
+and shmem-backed (since Linux 5.14) memory.
+.\"
+.SS Reading from the userfaultfd structure
+Each
+.BR read (2)
+from the userfaultfd file descriptor returns one or more
+.I uffd_msg
+structures, each of which describes a page-fault event
+or an event required for the non-cooperative userfaultfd usage:
+.PP
+.in +4n
+.EX
+struct uffd_msg {
+ __u8 event; /* Type of event */
+ ...
+ union {
+ struct {
+ __u64 flags; /* Flags describing fault */
+ __u64 address; /* Faulting address */
+ union {
+ __u32 ptid; /* Thread ID of the fault */
+ } feat;
+ } pagefault;
+\&
+ struct { /* Since Linux 4.11 */
+ __u32 ufd; /* Userfault file descriptor
+ of the child process */
+ } fork;
+\&
+ struct { /* Since Linux 4.11 */
+ __u64 from; /* Old address of remapped area */
+ __u64 to; /* New address of remapped area */
+ __u64 len; /* Original mapping length */
+ } remap;
+\&
+ struct { /* Since Linux 4.11 */
+ __u64 start; /* Start address of removed area */
+ __u64 end; /* End address of removed area */
+ } remove;
+ ...
+ } arg;
+\&
+ /* Padding fields omitted */
+} __packed;
+.EE
+.in
+.PP
+If multiple events are available and the supplied buffer is large enough,
+.BR read (2)
+returns as many events as will fit in the supplied buffer.
+If the buffer supplied to
+.BR read (2)
+is smaller than the size of the
+.I uffd_msg
+structure, the
+.BR read (2)
+fails with the error
+.BR EINVAL .
+.PP
+The fields set in the
+.I uffd_msg
+structure are as follows:
+.TP
+.I event
+The type of event.
+Depending of the event type,
+different fields of the
+.I arg
+union represent details required for the event processing.
+The non-page-fault events are generated only when appropriate feature
+is enabled during API handshake with
+.B UFFDIO_API
+.BR ioctl (2).
+.IP
+The following values can appear in the
+.I event
+field:
+.RS
+.TP
+.BR UFFD_EVENT_PAGEFAULT " (since Linux 4.3)"
+A page-fault event.
+The page-fault details are available in the
+.I pagefault
+field.
+.TP
+.BR UFFD_EVENT_FORK " (since Linux 4.11)"
+Generated when the faulting process invokes
+.BR fork (2)
+(or
+.BR clone (2)
+without the
+.B CLONE_VM
+flag).
+The event details are available in the
+.I fork
+field.
+.\" FIXME describe duplication of userfault file descriptor during fork
+.TP
+.BR UFFD_EVENT_REMAP " (since Linux 4.11)"
+Generated when the faulting process invokes
+.BR mremap (2).
+The event details are available in the
+.I remap
+field.
+.TP
+.BR UFFD_EVENT_REMOVE " (since Linux 4.11)"
+Generated when the faulting process invokes
+.BR madvise (2)
+with
+.B MADV_DONTNEED
+or
+.B MADV_REMOVE
+advice.
+The event details are available in the
+.I remove
+field.
+.TP
+.BR UFFD_EVENT_UNMAP " (since Linux 4.11)"
+Generated when the faulting process unmaps a memory range,
+either explicitly using
+.BR munmap (2)
+or implicitly during
+.BR mmap (2)
+or
+.BR mremap (2).
+The event details are available in the
+.I remove
+field.
+.RE
+.TP
+.I pagefault.address
+The address that triggered the page fault.
+.TP
+.I pagefault.flags
+A bit mask of flags that describe the event.
+For
+.BR UFFD_EVENT_PAGEFAULT ,
+the following flag may appear:
+.RS
+.TP
+.B UFFD_PAGEFAULT_FLAG_WP
+If this flag is set, then the fault was a write-protect fault.
+.TP
+.B UFFD_PAGEFAULT_FLAG_MINOR
+If this flag is set, then the fault was a minor fault.
+.TP
+.B UFFD_PAGEFAULT_FLAG_WRITE
+If this flag is set, then the fault was a write fault.
+.PP
+If neither
+.B UFFD_PAGEFAULT_FLAG_WP
+nor
+.B UFFD_PAGEFAULT_FLAG_MINOR
+are set, then the fault was a missing fault.
+.RE
+.TP
+.I pagefault.feat.pid
+The thread ID that triggered the page fault.
+.TP
+.I fork.ufd
+The file descriptor associated with the userfault object
+created for the child created by
+.BR fork (2).
+.TP
+.I remap.from
+The original address of the memory range that was remapped using
+.BR mremap (2).
+.TP
+.I remap.to
+The new address of the memory range that was remapped using
+.BR mremap (2).
+.TP
+.I remap.len
+The original length of the memory range that was remapped using
+.BR mremap (2).
+.TP
+.I remove.start
+The start address of the memory range that was freed using
+.BR madvise (2)
+or unmapped
+.TP
+.I remove.end
+The end address of the memory range that was freed using
+.BR madvise (2)
+or unmapped
+.PP
+A
+.BR read (2)
+on a userfaultfd file descriptor can fail with the following errors:
+.TP
+.B EINVAL
+The userfaultfd object has not yet been enabled using the
+.B UFFDIO_API
+.BR ioctl (2)
+operation
+.PP
+If the
+.B O_NONBLOCK
+flag is enabled in the associated open file description,
+the userfaultfd file descriptor can be monitored with
+.BR poll (2),
+.BR select (2),
+and
+.BR epoll (7).
+When events are available, the file descriptor indicates as readable.
+If the
+.B O_NONBLOCK
+flag is not enabled, then
+.BR poll (2)
+(always) indicates the file as having a
+.B POLLERR
+condition, and
+.BR select (2)
+indicates the file descriptor as both readable and writable.
+.\" FIXME What is the reason for this seemingly odd behavior with respect
+.\" to the O_NONBLOCK flag? (see userfaultfd_poll() in fs/userfaultfd.c).
+.\" Something needs to be said about this.
+.SH RETURN VALUE
+On success,
+.BR userfaultfd ()
+returns a new file descriptor that refers to the userfaultfd object.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EINVAL
+An unsupported value was specified in
+.IR flags .
+.TP
+.B EMFILE
+The per-process limit on the number of open file descriptors has been
+reached
+.TP
+.B ENFILE
+The system-wide limit on the total number of open files has been
+reached.
+.TP
+.B ENOMEM
+Insufficient kernel memory was available.
+.TP
+.BR EPERM " (since Linux 5.2)"
+.\" cefdca0a86be517bc390fc4541e3674b8e7803b0
+The caller is not privileged (does not have the
+.B CAP_SYS_PTRACE
+capability in the initial user namespace), and
+.I /proc/sys/vm/unprivileged_userfaultfd
+has the value 0.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 4.3.
+.PP
+Support for hugetlbfs and shared memory areas and
+non-page-fault events was added in Linux 4.11
+.SH NOTES
+The userfaultfd mechanism can be used as an alternative to
+traditional user-space paging techniques based on the use of the
+.B SIGSEGV
+signal and
+.BR mmap (2).
+It can also be used to implement lazy restore
+for checkpoint/restore mechanisms,
+as well as post-copy migration to allow (nearly) uninterrupted execution
+when transferring virtual machines and Linux containers
+from one host to another.
+.SH BUGS
+If the
+.B UFFD_FEATURE_EVENT_FORK
+is enabled and a system call from the
+.BR fork (2)
+family is interrupted by a signal or failed, a stale userfaultfd descriptor
+might be created.
+In this case, a spurious
+.B UFFD_EVENT_FORK
+will be delivered to the userfaultfd monitor.
+.SH EXAMPLES
+The program below demonstrates the use of the userfaultfd mechanism.
+The program creates two threads, one of which acts as the
+page-fault handler for the process, for the pages in a demand-page zero
+region created using
+.BR mmap (2).
+.PP
+The program takes one command-line argument,
+which is the number of pages that will be created in a mapping
+whose page faults will be handled via userfaultfd.
+After creating a userfaultfd object,
+the program then creates an anonymous private mapping of the specified size
+and registers the address range of that mapping using the
+.B UFFDIO_REGISTER
+.BR ioctl (2)
+operation.
+The program then creates a second thread that will perform the
+task of handling page faults.
+.PP
+The main thread then walks through the pages of the mapping fetching
+bytes from successive pages.
+Because the pages have not yet been accessed,
+the first access of a byte in each page will trigger a page-fault event
+on the userfaultfd file descriptor.
+.PP
+Each of the page-fault events is handled by the second thread,
+which sits in a loop processing input from the userfaultfd file descriptor.
+In each loop iteration, the second thread first calls
+.BR poll (2)
+to check the state of the file descriptor,
+and then reads an event from the file descriptor.
+All such events should be
+.B UFFD_EVENT_PAGEFAULT
+events,
+which the thread handles by copying a page of data into
+the faulting region using the
+.B UFFDIO_COPY
+.BR ioctl (2)
+operation.
+.PP
+The following is an example of what we see when running the program:
+.PP
+.in +4n
+.EX
+$ \fB./userfaultfd_demo 3\fP
+Address returned by mmap() = 0x7fd30106c000
+\&
+fault_handler_thread():
+ poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
+ UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106c00f
+ (uffdio_copy.copy returned 4096)
+Read address 0x7fd30106c00f in main(): A
+Read address 0x7fd30106c40f in main(): A
+Read address 0x7fd30106c80f in main(): A
+Read address 0x7fd30106cc0f in main(): A
+\&
+fault_handler_thread():
+ poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
+ UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106d00f
+ (uffdio_copy.copy returned 4096)
+Read address 0x7fd30106d00f in main(): B
+Read address 0x7fd30106d40f in main(): B
+Read address 0x7fd30106d80f in main(): B
+Read address 0x7fd30106dc0f in main(): B
+\&
+fault_handler_thread():
+ poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
+ UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106e00f
+ (uffdio_copy.copy returned 4096)
+Read address 0x7fd30106e00f in main(): C
+Read address 0x7fd30106e40f in main(): C
+Read address 0x7fd30106e80f in main(): C
+Read address 0x7fd30106ec0f in main(): C
+.EE
+.in
+.SS Program source
+\&
+.\" SRC BEGIN (userfaultfd.c)
+.EX
+/* userfaultfd_demo.c
+\&
+ Licensed under the GNU General Public License version 2 or later.
+*/
+#define _GNU_SOURCE
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <linux/userfaultfd.h>
+#include <poll.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+\&
+static int page_size;
+\&
+static void *
+fault_handler_thread(void *arg)
+{
+ int nready;
+ long uffd; /* userfaultfd file descriptor */
+ ssize_t nread;
+ struct pollfd pollfd;
+ struct uffdio_copy uffdio_copy;
+\&
+ static int fault_cnt = 0; /* Number of faults so far handled */
+ static char *page = NULL;
+ static struct uffd_msg msg; /* Data read from userfaultfd */
+\&
+ uffd = (long) arg;
+\&
+ /* Create a page that will be copied into the faulting region. */
+\&
+ if (page == NULL) {
+ page = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, \-1, 0);
+ if (page == MAP_FAILED)
+ err(EXIT_FAILURE, "mmap");
+ }
+\&
+ /* Loop, handling incoming events on the userfaultfd
+ file descriptor. */
+\&
+ for (;;) {
+\&
+ /* See what poll() tells us about the userfaultfd. */
+\&
+ pollfd.fd = uffd;
+ pollfd.events = POLLIN;
+ nready = poll(&pollfd, 1, \-1);
+ if (nready == \-1)
+ err(EXIT_FAILURE, "poll");
+\&
+ printf("\enfault_handler_thread():\en");
+ printf(" poll() returns: nready = %d; "
+ "POLLIN = %d; POLLERR = %d\en", nready,
+ (pollfd.revents & POLLIN) != 0,
+ (pollfd.revents & POLLERR) != 0);
+\&
+ /* Read an event from the userfaultfd. */
+\&
+ nread = read(uffd, &msg, sizeof(msg));
+ if (nread == 0) {
+ printf("EOF on userfaultfd!\en");
+ exit(EXIT_FAILURE);
+ }
+\&
+ if (nread == \-1)
+ err(EXIT_FAILURE, "read");
+\&
+ /* We expect only one kind of event; verify that assumption. */
+\&
+ if (msg.event != UFFD_EVENT_PAGEFAULT) {
+ fprintf(stderr, "Unexpected event on userfaultfd\en");
+ exit(EXIT_FAILURE);
+ }
+\&
+ /* Display info about the page\-fault event. */
+\&
+ printf(" UFFD_EVENT_PAGEFAULT event: ");
+ printf("flags = %"PRIx64"; ", msg.arg.pagefault.flags);
+ printf("address = %"PRIx64"\en", msg.arg.pagefault.address);
+\&
+ /* Copy the page pointed to by \[aq]page\[aq] into the faulting
+ region. Vary the contents that are copied in, so that it
+ is more obvious that each fault is handled separately. */
+\&
+ memset(page, \[aq]A\[aq] + fault_cnt % 20, page_size);
+ fault_cnt++;
+\&
+ uffdio_copy.src = (unsigned long) page;
+\&
+ /* We need to handle page faults in units of pages(!).
+ So, round faulting address down to page boundary. */
+\&
+ uffdio_copy.dst = (unsigned long) msg.arg.pagefault.address &
+ \[ti](page_size \- 1);
+ uffdio_copy.len = page_size;
+ uffdio_copy.mode = 0;
+ uffdio_copy.copy = 0;
+ if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == \-1)
+ err(EXIT_FAILURE, "ioctl\-UFFDIO_COPY");
+\&
+ printf(" (uffdio_copy.copy returned %"PRId64")\en",
+ uffdio_copy.copy);
+ }
+}
+\&
+int
+main(int argc, char *argv[])
+{
+ int s;
+ char c;
+ char *addr; /* Start of region handled by userfaultfd */
+ long uffd; /* userfaultfd file descriptor */
+ size_t len, l; /* Length of region handled by userfaultfd */
+ pthread_t thr; /* ID of thread that handles page faults */
+ struct uffdio_api uffdio_api;
+ struct uffdio_register uffdio_register;
+\&
+ if (argc != 2) {
+ fprintf(stderr, "Usage: %s num\-pages\en", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ page_size = sysconf(_SC_PAGE_SIZE);
+ len = strtoull(argv[1], NULL, 0) * page_size;
+\&
+ /* Create and enable userfaultfd object. */
+\&
+ uffd = syscall(SYS_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+ if (uffd == \-1)
+ err(EXIT_FAILURE, "userfaultfd");
+\&
+ uffdio_api.api = UFFD_API;
+ uffdio_api.features = 0;
+ if (ioctl(uffd, UFFDIO_API, &uffdio_api) == \-1)
+ err(EXIT_FAILURE, "ioctl\-UFFDIO_API");
+\&
+ /* Create a private anonymous mapping. The memory will be
+ demand\-zero paged\-\-that is, not yet allocated. When we
+ actually touch the memory, it will be allocated via
+ the userfaultfd. */
+\&
+ addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, \-1, 0);
+ if (addr == MAP_FAILED)
+ err(EXIT_FAILURE, "mmap");
+\&
+ printf("Address returned by mmap() = %p\en", addr);
+\&
+ /* Register the memory range of the mapping we just created for
+ handling by the userfaultfd object. In mode, we request to track
+ missing pages (i.e., pages that have not yet been faulted in). */
+\&
+ uffdio_register.range.start = (unsigned long) addr;
+ uffdio_register.range.len = len;
+ uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+ if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == \-1)
+ err(EXIT_FAILURE, "ioctl\-UFFDIO_REGISTER");
+\&
+ /* Create a thread that will process the userfaultfd events. */
+\&
+ s = pthread_create(&thr, NULL, fault_handler_thread, (void *) uffd);
+ if (s != 0) {
+ errc(EXIT_FAILURE, s, "pthread_create");
+ }
+\&
+ /* Main thread now touches memory in the mapping, touching
+ locations 1024 bytes apart. This will trigger userfaultfd
+ events for all pages in the region. */
+\&
+ l = 0xf; /* Ensure that faulting address is not on a page
+ boundary, in order to test that we correctly
+ handle that case in fault_handling_thread(). */
+ while (l < len) {
+ c = addr[l];
+ printf("Read address %p in %s(): ", addr + l, __func__);
+ printf("%c\en", c);
+ l += 1024;
+ usleep(100000); /* Slow things down a little */
+ }
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR fcntl (2),
+.BR ioctl (2),
+.BR ioctl_userfaultfd (2),
+.BR madvise (2),
+.BR mmap (2)
+.PP
+.I Documentation/admin\-guide/mm/userfaultfd.rst
+in the Linux kernel source tree