summaryrefslogtreecommitdiffstats
path: root/man2/perf_event_open.2
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-15 19:40:15 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-15 19:40:15 +0000
commit399644e47874bff147afb19c89228901ac39340e (patch)
tree1c4c0b733f4c16b5783b41bebb19194a9ef62ad1 /man2/perf_event_open.2
parentInitial commit. (diff)
downloadmanpages-399644e47874bff147afb19c89228901ac39340e.tar.xz
manpages-399644e47874bff147afb19c89228901ac39340e.zip
Adding upstream version 6.05.01.upstream/6.05.01
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'man2/perf_event_open.2')
-rw-r--r--man2/perf_event_open.23989
1 files changed, 3989 insertions, 0 deletions
diff --git a/man2/perf_event_open.2 b/man2/perf_event_open.2
new file mode 100644
index 0000000..d9e7877
--- /dev/null
+++ b/man2/perf_event_open.2
@@ -0,0 +1,3989 @@
+.\" Copyright (c) 2012, Vincent Weaver
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" This document is based on the perf_event.h header file, the
+.\" tools/perf/design.txt file, and a lot of bitter experience.
+.\"
+.TH perf_event_open 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+perf_event_open \- set up performance monitoring
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/perf_event.h>" " /* Definition of " PERF_* " constants */"
+.BR "#include <linux/hw_breakpoint.h>" " /* Definition of " HW_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_perf_event_open, struct perf_event_attr *" attr ,
+.BI " pid_t " pid ", int " cpu ", int " group_fd \
+", unsigned long " flags );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR perf_event_open (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+Given a list of parameters,
+.BR perf_event_open ()
+returns a file descriptor, for use in subsequent system calls
+.RB ( read "(2), " mmap "(2), " prctl "(2), " fcntl "(2), etc.)."
+.PP
+A call to
+.BR perf_event_open ()
+creates a file descriptor that allows measuring performance
+information.
+Each file descriptor corresponds to one
+event that is measured; these can be grouped together
+to measure multiple events simultaneously.
+.PP
+Events can be enabled and disabled in two ways: via
+.BR ioctl (2)
+and via
+.BR prctl (2).
+When an event is disabled it does not count or generate overflows but does
+continue to exist and maintain its count value.
+.PP
+Events come in two flavors: counting and sampled.
+A
+.I counting
+event is one that is used for counting the aggregate number of events
+that occur.
+In general, counting event results are gathered with a
+.BR read (2)
+call.
+A
+.I sampling
+event periodically writes measurements to a buffer that can then
+be accessed via
+.BR mmap (2).
+.SS Arguments
+The
+.I pid
+and
+.I cpu
+arguments allow specifying which process and CPU to monitor:
+.TP
+.BR "pid == 0" " and " "cpu == \-1"
+This measures the calling process/thread on any CPU.
+.TP
+.BR "pid == 0" " and " "cpu >= 0"
+This measures the calling process/thread only
+when running on the specified CPU.
+.TP
+.BR "pid > 0" " and " "cpu == \-1"
+This measures the specified process/thread on any CPU.
+.TP
+.BR "pid > 0" " and " "cpu >= 0"
+This measures the specified process/thread only
+when running on the specified CPU.
+.TP
+.BR "pid == \-1" " and " "cpu >= 0"
+This measures all processes/threads on the specified CPU.
+This requires
+.B CAP_PERFMON
+(since Linux 5.8) or
+.B CAP_SYS_ADMIN
+capability or a
+.I /proc/sys/kernel/perf_event_paranoid
+value of less than 1.
+.TP
+.BR "pid == \-1" " and " "cpu == \-1"
+This setting is invalid and will return an error.
+.PP
+When
+.I pid
+is greater than zero, permission to perform this system call
+is governed by
+.B CAP_PERFMON
+(since Linux 5.9) and a ptrace access mode
+.B PTRACE_MODE_READ_REALCREDS
+check on older Linux versions; see
+.BR ptrace (2).
+.PP
+The
+.I group_fd
+argument allows event groups to be created.
+An event group has one event which is the group leader.
+The leader is created first, with
+.IR group_fd " = \-1."
+The rest of the group members are created with subsequent
+.BR perf_event_open ()
+calls with
+.I group_fd
+being set to the file descriptor of the group leader.
+(A single event on its own is created with
+.IR group_fd " = \-1"
+and is considered to be a group with only 1 member.)
+An event group is scheduled onto the CPU as a unit:
+it will be put onto the CPU
+only if all of the events in the group can be put onto the CPU.
+This means that the values of the member events can be meaningfully compared
+\[em]added, divided (to get ratios), and so on\[em]
+with each other,
+since they have counted events for the same set of executed instructions.
+.PP
+The
+.I flags
+argument is formed by ORing together zero or more of the following values:
+.TP
+.BR PERF_FLAG_FD_CLOEXEC " (since Linux 3.14)"
+.\" commit a21b0b354d4ac39be691f51c53562e2c24443d9e
+This flag enables the close-on-exec flag for the created
+event file descriptor,
+so that the file descriptor is automatically closed on
+.BR execve (2).
+Setting the close-on-exec flag at creation time, rather than later with
+.BR fcntl (2),
+avoids potential race conditions where the calling thread invokes
+.BR perf_event_open ()
+and
+.BR fcntl (2)
+at the same time as another thread calls
+.BR fork (2)
+then
+.BR execve (2).
+.TP
+.B PERF_FLAG_FD_NO_GROUP
+This flag tells the event to ignore the
+.I group_fd
+parameter except for the purpose of setting up output redirection
+using the
+.B PERF_FLAG_FD_OUTPUT
+flag.
+.TP
+.BR PERF_FLAG_FD_OUTPUT " (broken since Linux 2.6.35)"
+.\" commit ac9721f3f54b27a16c7e1afb2481e7ee95a70318
+This flag re-routes the event's sampled output to instead
+be included in the mmap buffer of the event specified by
+.IR group_fd .
+.TP
+.BR PERF_FLAG_PID_CGROUP " (since Linux 2.6.39)"
+.\" commit e5d1367f17ba6a6fed5fd8b74e4d5720923e0c25
+This flag activates per-container system-wide monitoring.
+A container
+is an abstraction that isolates a set of resources for finer-grained
+control (CPUs, memory, etc.).
+In this mode, the event is measured
+only if the thread running on the monitored CPU belongs to the designated
+container (cgroup).
+The cgroup is identified by passing a file descriptor
+opened on its directory in the cgroupfs filesystem.
+For instance, if the
+cgroup to monitor is called
+.IR test ,
+then a file descriptor opened on
+.I /dev/cgroup/test
+(assuming cgroupfs is mounted on
+.IR /dev/cgroup )
+must be passed as the
+.I pid
+parameter.
+cgroup monitoring is available only
+for system-wide events and may therefore require extra permissions.
+.PP
+The
+.I perf_event_attr
+structure provides detailed configuration information
+for the event being created.
+.PP
+.in +4n
+.EX
+struct perf_event_attr {
+ __u32 type; /* Type of event */
+ __u32 size; /* Size of attribute structure */
+ __u64 config; /* Type\-specific configuration */
+\&
+ union {
+ __u64 sample_period; /* Period of sampling */
+ __u64 sample_freq; /* Frequency of sampling */
+ };
+\&
+ __u64 sample_type; /* Specifies values included in sample */
+ __u64 read_format; /* Specifies values returned in read */
+\&
+ __u64 disabled : 1, /* off by default */
+ inherit : 1, /* children inherit it */
+ pinned : 1, /* must always be on PMU */
+ exclusive : 1, /* only group on PMU */
+ exclude_user : 1, /* don\[aq]t count user */
+ exclude_kernel : 1, /* don\[aq]t count kernel */
+ exclude_hv : 1, /* don\[aq]t count hypervisor */
+ exclude_idle : 1, /* don\[aq]t count when idle */
+ mmap : 1, /* include mmap data */
+ comm : 1, /* include comm data */
+ freq : 1, /* use freq, not period */
+ inherit_stat : 1, /* per task counts */
+ enable_on_exec : 1, /* next exec enables */
+ task : 1, /* trace fork/exit */
+ watermark : 1, /* wakeup_watermark */
+ precise_ip : 2, /* skid constraint */
+ mmap_data : 1, /* non\-exec mmap data */
+ sample_id_all : 1, /* sample_type all events */
+ exclude_host : 1, /* don\[aq]t count in host */
+ exclude_guest : 1, /* don\[aq]t count in guest */
+ exclude_callchain_kernel : 1,
+ /* exclude kernel callchains */
+ exclude_callchain_user : 1,
+ /* exclude user callchains */
+ mmap2 : 1, /* include mmap with inode data */
+ comm_exec : 1, /* flag comm events that are
+ due to exec */
+ use_clockid : 1, /* use clockid for time fields */
+ context_switch : 1, /* context switch data */
+ write_backward : 1, /* Write ring buffer from end
+ to beginning */
+ namespaces : 1, /* include namespaces data */
+ ksymbol : 1, /* include ksymbol events */
+ bpf_event : 1, /* include bpf events */
+ aux_output : 1, /* generate AUX records
+ instead of events */
+ cgroup : 1, /* include cgroup events */
+ text_poke : 1, /* include text poke events */
+ build_id : 1, /* use build id in mmap2 events */
+ inherit_thread : 1, /* children only inherit */
+ /* if cloned with CLONE_THREAD */
+ remove_on_exec : 1, /* event is removed from task
+ on exec */
+ sigtrap : 1, /* send synchronous SIGTRAP
+ on event */
+\&
+ __reserved_1 : 26;
+\&
+ union {
+ __u32 wakeup_events; /* wakeup every n events */
+ __u32 wakeup_watermark; /* bytes before wakeup */
+ };
+\&
+ __u32 bp_type; /* breakpoint type */
+\&
+ union {
+ __u64 bp_addr; /* breakpoint address */
+ __u64 kprobe_func; /* for perf_kprobe */
+ __u64 uprobe_path; /* for perf_uprobe */
+ __u64 config1; /* extension of config */
+ };
+\&
+ union {
+ __u64 bp_len; /* breakpoint length */
+ __u64 kprobe_addr; /* with kprobe_func == NULL */
+ __u64 probe_offset; /* for perf_[k,u]probe */
+ __u64 config2; /* extension of config1 */
+ };
+ __u64 branch_sample_type; /* enum perf_branch_sample_type */
+ __u64 sample_regs_user; /* user regs to dump on samples */
+ __u32 sample_stack_user; /* size of stack to dump on
+ samples */
+ __s32 clockid; /* clock to use for time fields */
+ __u64 sample_regs_intr; /* regs to dump on samples */
+ __u32 aux_watermark; /* aux bytes before wakeup */
+ __u16 sample_max_stack; /* max frames in callchain */
+ __u16 __reserved_2; /* align to u64 */
+ __u32 aux_sample_size; /* max aux sample size */
+ __u32 __reserved_3; /* align to u64 */
+ __u64 sig_data; /* user data for sigtrap */
+\&
+};
+.EE
+.in
+.PP
+The fields of the
+.I perf_event_attr
+structure are described in more detail below:
+.TP
+.I type
+This field specifies the overall event type.
+It has one of the following values:
+.RS
+.TP
+.B PERF_TYPE_HARDWARE
+This indicates one of the "generalized" hardware events provided
+by the kernel.
+See the
+.I config
+field definition for more details.
+.TP
+.B PERF_TYPE_SOFTWARE
+This indicates one of the software-defined events provided by the kernel
+(even if no hardware support is available).
+.TP
+.B PERF_TYPE_TRACEPOINT
+This indicates a tracepoint
+provided by the kernel tracepoint infrastructure.
+.TP
+.B PERF_TYPE_HW_CACHE
+This indicates a hardware cache event.
+This has a special encoding, described in the
+.I config
+field definition.
+.TP
+.B PERF_TYPE_RAW
+This indicates a "raw" implementation-specific event in the
+.IR config " field."
+.TP
+.BR PERF_TYPE_BREAKPOINT " (since Linux 2.6.33)"
+.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e
+This indicates a hardware breakpoint as provided by the CPU.
+Breakpoints can be read/write accesses to an address as well as
+execution of an instruction address.
+.TP
+dynamic PMU
+Since Linux 2.6.38,
+.\" commit 2e80a82a49c4c7eca4e35734380f28298ba5db19
+.BR perf_event_open ()
+can support multiple PMUs.
+To enable this, a value exported by the kernel can be used in the
+.I type
+field to indicate which PMU to use.
+The value to use can be found in the sysfs filesystem:
+there is a subdirectory per PMU instance under
+.IR /sys/bus/event_source/devices .
+In each subdirectory there is a
+.I type
+file whose content is an integer that can be used in the
+.I type
+field.
+For instance,
+.I /sys/bus/event_source/devices/cpu/type
+contains the value for the core CPU PMU, which is usually 4.
+.TP
+.BR kprobe " and " uprobe " (since Linux 4.17)"
+.\" commit 65074d43fc77bcae32776724b7fa2696923c78e4
+.\" commit e12f03d7031a977356e3d7b75a68c2185ff8d155
+.\" commit 33ea4b24277b06dbc55d7f5772a46f029600255e
+These two dynamic PMUs create a kprobe/uprobe and attach it to the
+file descriptor generated by perf_event_open.
+The kprobe/uprobe will be destroyed on the destruction of the file descriptor.
+See fields
+.IR kprobe_func ,
+.IR uprobe_path ,
+.IR kprobe_addr ,
+and
+.I probe_offset
+for more details.
+.RE
+.TP
+.I "size"
+The size of the
+.I perf_event_attr
+structure for forward/backward compatibility.
+Set this using
+.I sizeof(struct perf_event_attr)
+to allow the kernel to see
+the struct size at the time of compilation.
+.IP
+The related define
+.B PERF_ATTR_SIZE_VER0
+is set to 64; this was the size of the first published struct.
+.B PERF_ATTR_SIZE_VER1
+is 72, corresponding to the addition of breakpoints in Linux 2.6.33.
+.\" commit cb5d76999029ae7a517cb07dfa732c1b5a934fc2
+.\" this was added much later when PERF_ATTR_SIZE_VER2 happened
+.\" but the actual attr_size had increased in Linux 2.6.33
+.B PERF_ATTR_SIZE_VER2
+is 80 corresponding to the addition of branch sampling in Linux 3.4.
+.\" commit cb5d76999029ae7a517cb07dfa732c1b5a934fc2
+.B PERF_ATTR_SIZE_VER3
+is 96 corresponding to the addition
+of
+.I sample_regs_user
+and
+.I sample_stack_user
+in Linux 3.7.
+.\" commit 1659d129ed014b715b0b2120e6fd929bdd33ed03
+.B PERF_ATTR_SIZE_VER4
+is 104 corresponding to the addition of
+.I sample_regs_intr
+in Linux 3.19.
+.\" commit 60e2364e60e86e81bc6377f49779779e6120977f
+.B PERF_ATTR_SIZE_VER5
+is 112 corresponding to the addition of
+.I aux_watermark
+in Linux 4.1.
+.\" commit 1a5941312414c71dece6717da9a0fa1303127afa
+.TP
+.I "config"
+This specifies which event you want, in conjunction with
+the
+.I type
+field.
+The
+.I config1
+and
+.I config2
+fields are also taken into account in cases where 64 bits is not
+enough to fully specify the event.
+The encoding of these fields is event dependent.
+.IP
+There are various ways to set the
+.I config
+field that are dependent on the value of the previously
+described
+.I type
+field.
+What follows are various possible settings for
+.I config
+separated out by
+.IR type .
+.IP
+If
+.I type
+is
+.BR PERF_TYPE_HARDWARE ,
+we are measuring one of the generalized hardware CPU events.
+Not all of these are available on all platforms.
+Set
+.I config
+to one of the following:
+.RS 12
+.TP
+.B PERF_COUNT_HW_CPU_CYCLES
+Total cycles.
+Be wary of what happens during CPU frequency scaling.
+.TP
+.B PERF_COUNT_HW_INSTRUCTIONS
+Retired instructions.
+Be careful, these can be affected by various
+issues, most notably hardware interrupt counts.
+.TP
+.B PERF_COUNT_HW_CACHE_REFERENCES
+Cache accesses.
+Usually this indicates Last Level Cache accesses but this may
+vary depending on your CPU.
+This may include prefetches and coherency messages; again this
+depends on the design of your CPU.
+.TP
+.B PERF_COUNT_HW_CACHE_MISSES
+Cache misses.
+Usually this indicates Last Level Cache misses; this is intended to be
+used in conjunction with the
+.B PERF_COUNT_HW_CACHE_REFERENCES
+event to calculate cache miss rates.
+.TP
+.B PERF_COUNT_HW_BRANCH_INSTRUCTIONS
+Retired branch instructions.
+Prior to Linux 2.6.35, this used
+the wrong event on AMD processors.
+.\" commit f287d332ce835f77a4f5077d2c0ef1e3f9ea42d2
+.TP
+.B PERF_COUNT_HW_BRANCH_MISSES
+Mispredicted branch instructions.
+.TP
+.B PERF_COUNT_HW_BUS_CYCLES
+Bus cycles, which can be different from total cycles.
+.TP
+.BR PERF_COUNT_HW_STALLED_CYCLES_FRONTEND " (since Linux 3.0)"
+.\" commit 8f62242246351b5a4bc0c1f00c0c7003edea128a
+Stalled cycles during issue.
+.TP
+.BR PERF_COUNT_HW_STALLED_CYCLES_BACKEND " (since Linux 3.0)"
+.\" commit 8f62242246351b5a4bc0c1f00c0c7003edea128a
+Stalled cycles during retirement.
+.TP
+.BR PERF_COUNT_HW_REF_CPU_CYCLES " (since Linux 3.3)"
+.\" commit c37e17497e01fc0f5d2d6feb5723b210b3ab8890
+Total cycles; not affected by CPU frequency scaling.
+.RE
+.IP
+If
+.I type
+is
+.BR PERF_TYPE_SOFTWARE ,
+we are measuring software events provided by the kernel.
+Set
+.I config
+to one of the following:
+.RS 12
+.TP
+.B PERF_COUNT_SW_CPU_CLOCK
+This reports the CPU clock, a high-resolution per-CPU timer.
+.TP
+.B PERF_COUNT_SW_TASK_CLOCK
+This reports a clock count specific to the task that is running.
+.TP
+.B PERF_COUNT_SW_PAGE_FAULTS
+This reports the number of page faults.
+.TP
+.B PERF_COUNT_SW_CONTEXT_SWITCHES
+This counts context switches.
+Until Linux 2.6.34, these were all reported as user-space
+events; after that they are reported as happening in the kernel.
+.\" commit e49a5bd38159dfb1928fd25b173bc9de4bbadb21
+.TP
+.B PERF_COUNT_SW_CPU_MIGRATIONS
+This reports the number of times the process
+has migrated to a new CPU.
+.TP
+.B PERF_COUNT_SW_PAGE_FAULTS_MIN
+This counts the number of minor page faults.
+These did not require disk I/O to handle.
+.TP
+.B PERF_COUNT_SW_PAGE_FAULTS_MAJ
+This counts the number of major page faults.
+These required disk I/O to handle.
+.TP
+.BR PERF_COUNT_SW_ALIGNMENT_FAULTS " (since Linux 2.6.33)"
+.\" commit f7d7986060b2890fc26db6ab5203efbd33aa2497
+This counts the number of alignment faults.
+These happen when unaligned memory accesses happen; the kernel
+can handle these but it reduces performance.
+This happens only on some architectures (never on x86).
+.TP
+.BR PERF_COUNT_SW_EMULATION_FAULTS " (since Linux 2.6.33)"
+.\" commit f7d7986060b2890fc26db6ab5203efbd33aa2497
+This counts the number of emulation faults.
+The kernel sometimes traps on unimplemented instructions
+and emulates them for user space.
+This can negatively impact performance.
+.TP
+.BR PERF_COUNT_SW_DUMMY " (since Linux 3.12)"
+.\" commit fa0097ee690693006ab1aea6c01ad3c851b65c77
+This is a placeholder event that counts nothing.
+Informational sample record types such as mmap or comm
+must be associated with an active event.
+This dummy event allows gathering such records without requiring
+a counting event.
+.TP
+.BR PERF_COUNT_SW_BPF_OUTPUT " (since Linux 4.4)"
+.\" commit a43eec304259a6c637f4014a6d4767159b6a3aa3
+This is used to generate raw sample data from BPF.
+BPF programs can write to this event using
+.B bpf_perf_event_output
+helper.
+.TP
+.BR PERF_COUNT_SW_CGROUP_SWITCHES " (since Linux 5.13)"
+.\" commit d0d1dd628527c77db2391ce0293c1ed344b2365f
+This counts context switches to a task in a different cgroup.
+In other words, if the next task is in the same cgroup,
+it won't count the switch.
+.RE
+.PP
+.RS
+If
+.I type
+is
+.BR PERF_TYPE_TRACEPOINT ,
+then we are measuring kernel tracepoints.
+The value to use in
+.I config
+can be obtained from under debugfs
+.I tracing/events/*/*/id
+if ftrace is enabled in the kernel.
+.RE
+.PP
+.RS
+If
+.I type
+is
+.BR PERF_TYPE_HW_CACHE ,
+then we are measuring a hardware CPU cache event.
+To calculate the appropriate
+.I config
+value, use the following equation:
+.RS 4
+.PP
+.in +4n
+.EX
+config = (perf_hw_cache_id) |
+ (perf_hw_cache_op_id << 8) |
+ (perf_hw_cache_op_result_id << 16);
+.EE
+.in
+.PP
+where
+.I perf_hw_cache_id
+is one of:
+.RS 4
+.TP
+.B PERF_COUNT_HW_CACHE_L1D
+for measuring Level 1 Data Cache
+.TP
+.B PERF_COUNT_HW_CACHE_L1I
+for measuring Level 1 Instruction Cache
+.TP
+.B PERF_COUNT_HW_CACHE_LL
+for measuring Last-Level Cache
+.TP
+.B PERF_COUNT_HW_CACHE_DTLB
+for measuring the Data TLB
+.TP
+.B PERF_COUNT_HW_CACHE_ITLB
+for measuring the Instruction TLB
+.TP
+.B PERF_COUNT_HW_CACHE_BPU
+for measuring the branch prediction unit
+.TP
+.BR PERF_COUNT_HW_CACHE_NODE " (since Linux 3.1)"
+.\" commit 89d6c0b5bdbb1927775584dcf532d98b3efe1477
+for measuring local memory accesses
+.RE
+.PP
+and
+.I perf_hw_cache_op_id
+is one of:
+.RS 4
+.TP
+.B PERF_COUNT_HW_CACHE_OP_READ
+for read accesses
+.TP
+.B PERF_COUNT_HW_CACHE_OP_WRITE
+for write accesses
+.TP
+.B PERF_COUNT_HW_CACHE_OP_PREFETCH
+for prefetch accesses
+.RE
+.PP
+and
+.I perf_hw_cache_op_result_id
+is one of:
+.RS 4
+.TP
+.B PERF_COUNT_HW_CACHE_RESULT_ACCESS
+to measure accesses
+.TP
+.B PERF_COUNT_HW_CACHE_RESULT_MISS
+to measure misses
+.RE
+.RE
+.PP
+If
+.I type
+is
+.BR PERF_TYPE_RAW ,
+then a custom "raw"
+.I config
+value is needed.
+Most CPUs support events that are not covered by the "generalized" events.
+These are implementation defined; see your CPU manual (for example
+the Intel Volume 3B documentation or the AMD BIOS and Kernel Developer
+Guide).
+The libpfm4 library can be used to translate from the name in the
+architectural manuals to the raw hex value
+.BR perf_event_open ()
+expects in this field.
+.PP
+If
+.I type
+is
+.BR PERF_TYPE_BREAKPOINT ,
+then leave
+.I config
+set to zero.
+Its parameters are set in other places.
+.PP
+If
+.I type
+is
+.B kprobe
+or
+.BR uprobe ,
+set
+.I retprobe
+(bit 0 of
+.IR config ,
+see
+.IR /sys/bus/event_source/devices/[k,u]probe/format/retprobe )
+for kretprobe/uretprobe.
+See fields
+.IR kprobe_func ,
+.IR uprobe_path ,
+.IR kprobe_addr ,
+and
+.I probe_offset
+for more details.
+.RE
+.TP
+.IR kprobe_func ", " uprobe_path ", " kprobe_addr ", and " probe_offset
+These fields describe the kprobe/uprobe for dynamic PMUs
+.B kprobe
+and
+.BR uprobe .
+For
+.BR kprobe :
+use
+.I kprobe_func
+and
+.IR probe_offset ,
+or use
+.I kprobe_addr
+and leave
+.I kprobe_func
+as NULL.
+For
+.BR uprobe :
+use
+.I uprobe_path
+and
+.IR probe_offset .
+.TP
+.IR sample_period ", " sample_freq
+A "sampling" event is one that generates an overflow notification
+every N events, where N is given by
+.IR sample_period .
+A sampling event has
+.IR sample_period " > 0."
+When an overflow occurs, requested data is recorded
+in the mmap buffer.
+The
+.I sample_type
+field controls what data is recorded on each overflow.
+.IP
+.I sample_freq
+can be used if you wish to use frequency rather than period.
+In this case, you set the
+.I freq
+flag.
+The kernel will adjust the sampling period
+to try and achieve the desired rate.
+The rate of adjustment is a
+timer tick.
+.TP
+.I sample_type
+The various bits in this field specify which values to include
+in the sample.
+They will be recorded in a ring-buffer,
+which is available to user space using
+.BR mmap (2).
+The order in which the values are saved in the
+sample is documented in the MMAP Layout subsection below;
+it is not the
+.I "enum perf_event_sample_format"
+order.
+.RS
+.TP
+.B PERF_SAMPLE_IP
+Records instruction pointer.
+.TP
+.B PERF_SAMPLE_TID
+Records the process and thread IDs.
+.TP
+.B PERF_SAMPLE_TIME
+Records a timestamp.
+.TP
+.B PERF_SAMPLE_ADDR
+Records an address, if applicable.
+.TP
+.B PERF_SAMPLE_READ
+Records counter values for all events in a group, not just the group leader.
+.TP
+.B PERF_SAMPLE_CALLCHAIN
+Records the callchain (stack backtrace).
+.TP
+.B PERF_SAMPLE_ID
+Records a unique ID for the opened event's group leader.
+.TP
+.B PERF_SAMPLE_CPU
+Records CPU number.
+.TP
+.B PERF_SAMPLE_PERIOD
+Records the current sampling period.
+.TP
+.B PERF_SAMPLE_STREAM_ID
+Records a unique ID for the opened event.
+Unlike
+.B PERF_SAMPLE_ID
+the actual ID is returned, not the group leader.
+This ID is the same as the one returned by
+.BR PERF_FORMAT_ID .
+.TP
+.B PERF_SAMPLE_RAW
+Records additional data, if applicable.
+Usually returned by tracepoint events.
+.TP
+.BR PERF_SAMPLE_BRANCH_STACK " (since Linux 3.4)"
+.\" commit bce38cd53e5ddba9cb6d708c4ef3d04a4016ec7e
+This provides a record of recent branches, as provided
+by CPU branch sampling hardware (such as Intel Last Branch Record).
+Not all hardware supports this feature.
+.IP
+See the
+.I branch_sample_type
+field for how to filter which branches are reported.
+.TP
+.BR PERF_SAMPLE_REGS_USER " (since Linux 3.7)"
+.\" commit 4018994f3d8785275ef0e7391b75c3462c029e56
+Records the current user-level CPU register state
+(the values in the process before the kernel was called).
+.TP
+.BR PERF_SAMPLE_STACK_USER " (since Linux 3.7)"
+.\" commit c5ebcedb566ef17bda7b02686e0d658a7bb42ee7
+Records the user level stack, allowing stack unwinding.
+.TP
+.BR PERF_SAMPLE_WEIGHT " (since Linux 3.10)"
+.\" commit c3feedf2aaf9ac8bad6f19f5d21e4ee0b4b87e9c
+Records a hardware provided weight value that expresses how
+costly the sampled event was.
+This allows the hardware to highlight expensive events in
+a profile.
+.TP
+.BR PERF_SAMPLE_DATA_SRC " (since Linux 3.10)"
+.\" commit d6be9ad6c960f43800a6f118932bc8a5a4eadcd1
+Records the data source: where in the memory hierarchy
+the data associated with the sampled instruction came from.
+This is available only if the underlying hardware
+supports this feature.
+.TP
+.BR PERF_SAMPLE_IDENTIFIER " (since Linux 3.12)"
+.\" commit ff3d527cebc1fa3707c617bfe9e74f53fcfb0955
+Places the
+.B SAMPLE_ID
+value in a fixed position in the record,
+either at the beginning (for sample events) or at the end
+(if a non-sample event).
+.IP
+This was necessary because a sample stream may have
+records from various different event sources with different
+.I sample_type
+settings.
+Parsing the event stream properly was not possible because the
+format of the record was needed to find
+.BR SAMPLE_ID ,
+but
+the format could not be found without knowing what
+event the sample belonged to (causing a circular
+dependency).
+.IP
+The
+.B PERF_SAMPLE_IDENTIFIER
+setting makes the event stream always parsable
+by putting
+.B SAMPLE_ID
+in a fixed location, even though
+it means having duplicate
+.B SAMPLE_ID
+values in records.
+.TP
+.BR PERF_SAMPLE_TRANSACTION " (since Linux 3.13)"
+.\" commit fdfbbd07e91f8fe387140776f3fd94605f0c89e5
+Records reasons for transactional memory abort events
+(for example, from Intel TSX transactional memory support).
+.IP
+The
+.I precise_ip
+setting must be greater than 0 and a transactional memory abort
+event must be measured or no values will be recorded.
+Also note that some perf_event measurements, such as sampled
+cycle counting, may cause extraneous aborts (by causing an
+interrupt during a transaction).
+.TP
+.BR PERF_SAMPLE_REGS_INTR " (since Linux 3.19)"
+.\" commit 60e2364e60e86e81bc6377f49779779e6120977f
+Records a subset of the current CPU register state
+as specified by
+.IR sample_regs_intr .
+Unlike
+.B PERF_SAMPLE_REGS_USER
+the register values will return kernel register
+state if the overflow happened while kernel
+code is running.
+If the CPU supports hardware sampling of
+register state (i.e., PEBS on Intel x86) and
+.I precise_ip
+is set higher than zero then the register
+values returned are those captured by
+hardware at the time of the sampled
+instruction's retirement.
+.TP
+.BR PERF_SAMPLE_PHYS_ADDR " (since Linux 4.13)"
+.\" commit fc7ce9c74c3ad232b084d80148654f926d01ece7
+Records physical address of data like in
+.BR PERF_SAMPLE_ADDR .
+.TP
+.BR PERF_SAMPLE_CGROUP " (since Linux 5.7)"
+.\" commit 96aaab686505c449e24d76e76507290dcc30e008
+Records (perf_event) cgroup ID of the process.
+This corresponds to the
+.I id
+field in the
+.B PERF_RECORD_CGROUP
+event.
+.TP
+.BR PERF_SAMPLE_DATA_PAGE_SIZE " (since Linux 5.11)"
+.\" commit 8d97e71811aaafe4abf611dc24822fd6e73df1a1
+Records page size of data like in
+.BR PERF_SAMPLE_ADDR .
+.TP
+.BR PERF_SAMPLE_CODE_PAGE_SIZE " (since Linux 5.11)"
+.\" commit 995f088efebe1eba0282a6ffa12411b37f8990c2
+Records page size of ip like in
+.BR PERF_SAMPLE_IP .
+.TP
+.BR PERF_SAMPLE_WEIGHT_STRUCT " (since Linux 5.12)"
+.\" commit 2a6c6b7d7ad346f0679d0963cb19b3f0ea7ef32c
+Records hardware provided weight values like in
+.BR PERF_SAMPLE_WEIGHT ,
+but it can represent multiple values in a struct.
+This shares the same space as
+.BR PERF_SAMPLE_WEIGHT ,
+so users can apply either of those,
+not both.
+It has the following format and
+the meaning of each field is
+dependent on the hardware implementation.
+.PP
+.in +4n
+.EX
+union perf_sample_weight {
+ u64 full; /* PERF_SAMPLE_WEIGHT */
+ struct { /* PERF_SAMPLE_WEIGHT_STRUCT */
+ u32 var1_dw;
+ u16 var2_w;
+ u16 var3_w;
+ };
+};
+.EE
+.in
+.RE
+.TP
+.I read_format
+This field specifies the format of the data returned by
+.BR read (2)
+on a
+.BR perf_event_open ()
+file descriptor.
+.RS
+.TP
+.B PERF_FORMAT_TOTAL_TIME_ENABLED
+Adds the 64-bit
+.I time_enabled
+field.
+This can be used to calculate estimated totals if
+the PMU is overcommitted and multiplexing is happening.
+.TP
+.B PERF_FORMAT_TOTAL_TIME_RUNNING
+Adds the 64-bit
+.I time_running
+field.
+This can be used to calculate estimated totals if
+the PMU is overcommitted and multiplexing is happening.
+.TP
+.B PERF_FORMAT_ID
+Adds a 64-bit unique value that corresponds to the event group.
+.TP
+.B PERF_FORMAT_GROUP
+Allows all counter values in an event group to be read with one read.
+.TP
+.BR PERF_FORMAT_LOST " (since Linux 6.0)"
+.\" commit 119a784c81270eb88e573174ed2209225d646656
+Adds a 64-bit value that is the number of lost samples for this event.
+This would be only meaningful when
+.I sample_period
+or
+.I sample_freq
+is set.
+.RE
+.TP
+.I disabled
+The
+.I disabled
+bit specifies whether the counter starts out disabled or enabled.
+If disabled, the event can later be enabled by
+.BR ioctl (2),
+.BR prctl (2),
+or
+.IR enable_on_exec .
+.IP
+When creating an event group, typically the group leader is initialized
+with
+.I disabled
+set to 1 and any child events are initialized with
+.I disabled
+set to 0.
+Despite
+.I disabled
+being 0, the child events will not start until the group leader
+is enabled.
+.TP
+.I inherit
+The
+.I inherit
+bit specifies that this counter should count events of child
+tasks as well as the task specified.
+This applies only to new children, not to any existing children at
+the time the counter is created (nor to any new children of
+existing children).
+.IP
+Inherit does not work for some combinations of
+.I read_format
+values, such as
+.BR PERF_FORMAT_GROUP .
+.TP
+.I pinned
+The
+.I pinned
+bit specifies that the counter should always be on the CPU if at all
+possible.
+It applies only to hardware counters and only to group leaders.
+If a pinned counter cannot be put onto the CPU (e.g., because there are
+not enough hardware counters or because of a conflict with some other
+event), then the counter goes into an 'error' state, where reads
+return end-of-file (i.e.,
+.BR read (2)
+returns 0) until the counter is subsequently enabled or disabled.
+.TP
+.I exclusive
+The
+.I exclusive
+bit specifies that when this counter's group is on the CPU,
+it should be the only group using the CPU's counters.
+In the future this may allow monitoring programs to
+support PMU features that need to run alone so that they do not
+disrupt other hardware counters.
+.IP
+Note that many unexpected situations may prevent events with the
+.I exclusive
+bit set from ever running.
+This includes any users running a system-wide
+measurement as well as any kernel use of the performance counters
+(including the commonly enabled NMI Watchdog Timer interface).
+.TP
+.I exclude_user
+If this bit is set, the count excludes events that happen in user space.
+.TP
+.I exclude_kernel
+If this bit is set, the count excludes events that happen in kernel space.
+.TP
+.I exclude_hv
+If this bit is set, the count excludes events that happen in the
+hypervisor.
+This is mainly for PMUs that have built-in support for handling this
+(such as POWER).
+Extra support is needed for handling hypervisor measurements on most
+machines.
+.TP
+.I exclude_idle
+If set, don't count when the CPU is running the idle task.
+While you can currently enable this for any event type, it is ignored
+for all but software events.
+.TP
+.I mmap
+The
+.I mmap
+bit enables generation of
+.B PERF_RECORD_MMAP
+samples for every
+.BR mmap (2)
+call that has
+.B PROT_EXEC
+set.
+This allows tools to notice new executable code being mapped into
+a program (dynamic shared libraries for example)
+so that addresses can be mapped back to the original code.
+.TP
+.I comm
+The
+.I comm
+bit enables tracking of process command name as modified by the
+.BR execve (2)
+and
+.BR prctl (PR_SET_NAME)
+system calls as well as writing to
+.IR /proc/self/comm .
+If the
+.I comm_exec
+flag is also successfully set (possible since Linux 3.16),
+.\" commit 82b897782d10fcc4930c9d4a15b175348fdd2871
+then the misc flag
+.B PERF_RECORD_MISC_COMM_EXEC
+can be used to differentiate the
+.BR execve (2)
+case from the others.
+.TP
+.I freq
+If this bit is set, then
+.I sample_frequency
+not
+.I sample_period
+is used when setting up the sampling interval.
+.TP
+.I inherit_stat
+This bit enables saving of event counts on context switch for
+inherited tasks.
+This is meaningful only if the
+.I inherit
+field is set.
+.TP
+.I enable_on_exec
+If this bit is set, a counter is automatically
+enabled after a call to
+.BR execve (2).
+.TP
+.I task
+If this bit is set, then
+fork/exit notifications are included in the ring buffer.
+.TP
+.I watermark
+If set, have an overflow notification happen when we cross the
+.I wakeup_watermark
+boundary.
+Otherwise, overflow notifications happen after
+.I wakeup_events
+samples.
+.TP
+.IR precise_ip " (since Linux 2.6.35)"
+.\" commit ab608344bcbde4f55ec4cd911b686b0ce3eae076
+This controls the amount of skid.
+Skid is how many instructions
+execute between an event of interest happening and the kernel
+being able to stop and record the event.
+Smaller skid is
+better and allows more accurate reporting of which events
+correspond to which instructions, but hardware is often limited
+with how small this can be.
+.IP
+The possible values of this field are the following:
+.RS
+.TP
+.B 0
+.B SAMPLE_IP
+can have arbitrary skid.
+.TP
+.B 1
+.B SAMPLE_IP
+must have constant skid.
+.TP
+.B 2
+.B SAMPLE_IP
+requested to have 0 skid.
+.TP
+.B 3
+.B SAMPLE_IP
+must have 0 skid.
+See also the description of
+.BR PERF_RECORD_MISC_EXACT_IP .
+.RE
+.TP
+.IR mmap_data " (since Linux 2.6.36)"
+.\" commit 3af9e859281bda7eb7c20b51879cf43aa788ac2e
+This is the counterpart of the
+.I mmap
+field.
+This enables generation of
+.B PERF_RECORD_MMAP
+samples for
+.BR mmap (2)
+calls that do not have
+.B PROT_EXEC
+set (for example data and SysV shared memory).
+.TP
+.IR sample_id_all " (since Linux 2.6.38)"
+.\" commit c980d1091810df13f21aabbce545fd98f545bbf7
+If set, then TID, TIME, ID, STREAM_ID, and CPU can
+additionally be included in
+.RB non- PERF_RECORD_SAMPLE s
+if the corresponding
+.I sample_type
+is selected.
+.IP
+If
+.B PERF_SAMPLE_IDENTIFIER
+is specified, then an additional ID value is included
+as the last value to ease parsing the record stream.
+This may lead to the
+.I id
+value appearing twice.
+.IP
+The layout is described by this pseudo-structure:
+.IP
+.in +4n
+.EX
+struct sample_id {
+ { u32 pid, tid; } /* if PERF_SAMPLE_TID set */
+ { u64 time; } /* if PERF_SAMPLE_TIME set */
+ { u64 id; } /* if PERF_SAMPLE_ID set */
+ { u64 stream_id;} /* if PERF_SAMPLE_STREAM_ID set */
+ { u32 cpu, res; } /* if PERF_SAMPLE_CPU set */
+ { u64 id; } /* if PERF_SAMPLE_IDENTIFIER set */
+};
+.EE
+.in
+.TP
+.IR exclude_host " (since Linux 3.2)"
+.\" commit a240f76165e6255384d4bdb8139895fac7988799
+When conducting measurements that include processes running
+VM instances (i.e., have executed a
+.B KVM_RUN
+.BR ioctl (2)),
+only measure events happening inside a guest instance.
+This is only meaningful outside the guests; this setting does
+not change counts gathered inside of a guest.
+Currently, this functionality is x86 only.
+.TP
+.IR exclude_guest " (since Linux 3.2)"
+.\" commit a240f76165e6255384d4bdb8139895fac7988799
+When conducting measurements that include processes running
+VM instances (i.e., have executed a
+.B KVM_RUN
+.BR ioctl (2)),
+do not measure events happening inside guest instances.
+This is only meaningful outside the guests; this setting does
+not change counts gathered inside of a guest.
+Currently, this functionality is x86 only.
+.TP
+.IR exclude_callchain_kernel " (since Linux 3.7)"
+.\" commit d077526485d5c9b12fe85d0b2b3b7041e6bc5f91
+Do not include kernel callchains.
+.TP
+.IR exclude_callchain_user " (since Linux 3.7)"
+.\" commit d077526485d5c9b12fe85d0b2b3b7041e6bc5f91
+Do not include user callchains.
+.TP
+.IR mmap2 " (since Linux 3.16)"
+.\" commit 13d7a2410fa637f450a29ecb515ac318ee40c741
+.\" This is tricky; was committed during 3.12 development
+.\" but right before release was disabled.
+.\" So while you could select mmap2 starting with Linux 3.12
+.\" it did not work until Linux 3.16
+.\" commit a5a5ba72843dd05f991184d6cb9a4471acce1005
+Generate an extended executable mmap record that contains enough
+additional information to uniquely identify shared mappings.
+The
+.I mmap
+flag must also be set for this to work.
+.TP
+.IR comm_exec " (since Linux 3.16)"
+.\" commit 82b897782d10fcc4930c9d4a15b175348fdd2871
+This is purely a feature-detection flag, it does not change
+kernel behavior.
+If this flag can successfully be set, then, when
+.I comm
+is enabled, the
+.B PERF_RECORD_MISC_COMM_EXEC
+flag will be set in the
+.I misc
+field of a comm record header if the rename event being
+reported was caused by a call to
+.BR execve (2).
+This allows tools to distinguish between the various
+types of process renaming.
+.TP
+.IR use_clockid " (since Linux 4.1)"
+.\" commit 34f439278cef7b1177f8ce24f9fc81dfc6221d3b
+This allows selecting which internal Linux clock to use
+when generating timestamps via the
+.I clockid
+field.
+This can make it easier to correlate perf sample times with
+timestamps generated by other tools.
+.TP
+.IR context_switch " (since Linux 4.3)"
+.\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4
+This enables the generation of
+.B PERF_RECORD_SWITCH
+records when a context switch occurs.
+It also enables the generation of
+.B PERF_RECORD_SWITCH_CPU_WIDE
+records when sampling in CPU-wide mode.
+This functionality is in addition to existing tracepoint and
+software events for measuring context switches.
+The advantage of this method is that it will give full
+information even with strict
+.I perf_event_paranoid
+settings.
+.TP
+.IR write_backward " (since Linux 4.6)"
+.\" commit 9ecda41acb971ebd07c8fb35faf24005c0baea12
+This causes the ring buffer to be written from the end to the beginning.
+This is to support reading from an overwritable ring buffer.
+.TP
+.IR namespaces " (since Linux 4.11)"
+.\" commit e422267322cd319e2695a535e47c5b1feeac45eb
+This enables the generation of
+.B PERF_RECORD_NAMESPACES
+records when a task enters a new namespace.
+Each namespace has a combination of device and inode numbers.
+.TP
+.IR ksymbol " (since Linux 5.0)"
+.\" commit 76193a94522f1d4edf2447a536f3f796ce56343b
+This enables the generation of
+.B PERF_RECORD_KSYMBOL
+records when new kernel symbols are registered or unregistered.
+This is helpful for analyzing dynamic kernel functions like eBPF.
+.TP
+.IR bpf_event " (since Linux 5.0)"
+.\" commit 6ee52e2a3fe4ea35520720736e6791df1fb67106
+This enables the generation of
+.B PERF_RECORD_BPF_EVENT
+records when an eBPF program is loaded or unloaded.
+.TP
+.IR aux_output " (since Linux 5.4)"
+.\" commit ab43762ef010967e4ccd53627f70a2eecbeafefb
+This allows normal (non-AUX) events to generate data for AUX events
+if the hardware supports it.
+.TP
+.IR cgroup " (since Linux 5.7)"
+.\" commit 96aaab686505c449e24d76e76507290dcc30e008
+This enables the generation of
+.B PERF_RECORD_CGROUP
+records when a new cgroup is created (and activated).
+.TP
+.IR text_poke " (since Linux 5.8)"
+.\" commit e17d43b93e544f5016c0251d2074c15568d5d963
+This enables the generation of
+.B PERF_RECORD_TEXT_POKE
+records when there's a change to the kernel text
+(i.e., self-modifying code).
+.TP
+.IR build_id " (since Linux 5.12)"
+.\" commit 88a16a1309333e43d328621ece3e9fa37027e8eb
+This changes the contents in the
+.B PERF_RECORD_MMAP2
+to have a build-id instead of device and inode numbers.
+.TP
+.IR inherit_thread " (since Linux 5.13)"
+.\" commit 2b26f0aa004995f49f7b6f4100dd0e4c39a9ed5f
+This disables the inheritance of the event to a child process.
+Only new threads in the same process
+(which are cloned with
+.BR CLONE_THREAD )
+will inherit the event.
+.TP
+.IR remove_on_exec " (since Linux 5.13)"
+.\" commit 2e498d0a74e5b88a6689ae1b811f247f91ff188e
+This closes the event when it starts a new process image by
+.BR execve (2).
+.TP
+.IR sigtrap " (since Linux 5.13)"
+.\" commit 97ba62b278674293762c3d91f724f1bb922f04e0
+This enables synchronous signal delivery of
+.B SIGTRAP
+on event overflow.
+.TP
+.IR wakeup_events ", " wakeup_watermark
+This union sets how many samples
+.RI ( wakeup_events )
+or bytes
+.RI ( wakeup_watermark )
+happen before an overflow notification happens.
+Which one is used is selected by the
+.I watermark
+bit flag.
+.IP
+.I wakeup_events
+counts only
+.B PERF_RECORD_SAMPLE
+record types.
+To receive overflow notification for all
+.B PERF_RECORD
+types choose watermark and set
+.I wakeup_watermark
+to 1.
+.IP
+Prior to Linux 3.0, setting
+.\" commit f506b3dc0ec454a16d40cab9ee5d75435b39dc50
+.I wakeup_events
+to 0 resulted in no overflow notifications;
+more recent kernels treat 0 the same as 1.
+.TP
+.IR bp_type " (since Linux 2.6.33)"
+.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e
+This chooses the breakpoint type.
+It is one of:
+.RS
+.TP
+.B HW_BREAKPOINT_EMPTY
+No breakpoint.
+.TP
+.B HW_BREAKPOINT_R
+Count when we read the memory location.
+.TP
+.B HW_BREAKPOINT_W
+Count when we write the memory location.
+.TP
+.B HW_BREAKPOINT_RW
+Count when we read or write the memory location.
+.TP
+.B HW_BREAKPOINT_X
+Count when we execute code at the memory location.
+.PP
+The values can be combined via a bitwise or, but the
+combination of
+.B HW_BREAKPOINT_R
+or
+.B HW_BREAKPOINT_W
+with
+.B HW_BREAKPOINT_X
+is not allowed.
+.RE
+.TP
+.IR bp_addr " (since Linux 2.6.33)"
+.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e
+This is the address of the breakpoint.
+For execution breakpoints, this is the memory address of the instruction
+of interest; for read and write breakpoints, it is the memory address
+of the memory location of interest.
+.TP
+.IR config1 " (since Linux 2.6.39)"
+.\" commit a7e3ed1e470116c9d12c2f778431a481a6be8ab6
+.I config1
+is used for setting events that need an extra register or otherwise
+do not fit in the regular config field.
+Raw OFFCORE_EVENTS on Nehalem/Westmere/SandyBridge use this field
+on Linux 3.3 and later kernels.
+.TP
+.IR bp_len " (since Linux 2.6.33)"
+.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e
+.I bp_len
+is the length of the breakpoint being measured if
+.I type
+is
+.BR PERF_TYPE_BREAKPOINT .
+Options are
+.BR HW_BREAKPOINT_LEN_1 ,
+.BR HW_BREAKPOINT_LEN_2 ,
+.BR HW_BREAKPOINT_LEN_4 ,
+and
+.BR HW_BREAKPOINT_LEN_8 .
+For an execution breakpoint, set this to
+.IR sizeof(long) .
+.TP
+.IR config2 " (since Linux 2.6.39)"
+.\" commit a7e3ed1e470116c9d12c2f778431a481a6be8ab6
+.I config2
+is a further extension of the
+.I config1
+field.
+.TP
+.IR branch_sample_type " (since Linux 3.4)"
+.\" commit bce38cd53e5ddba9cb6d708c4ef3d04a4016ec7e
+If
+.B PERF_SAMPLE_BRANCH_STACK
+is enabled, then this specifies what branches to include
+in the branch record.
+.IP
+The first part of the value is the privilege level, which
+is a combination of one or more of the values listed below.
+If the user does not set privilege level explicitly, the kernel
+will use the event's privilege level.
+Event and branch privilege levels do not have to match.
+.RS
+.TP
+.B PERF_SAMPLE_BRANCH_USER
+Branch target is in user space.
+.TP
+.B PERF_SAMPLE_BRANCH_KERNEL
+Branch target is in kernel space.
+.TP
+.B PERF_SAMPLE_BRANCH_HV
+Branch target is in hypervisor.
+.TP
+.B PERF_SAMPLE_BRANCH_PLM_ALL
+A convenience value that is the three preceding values ORed together.
+.PP
+In addition to the privilege value, at least one of the
+following bits must be set.
+.TP
+.B PERF_SAMPLE_BRANCH_ANY
+Any branch type.
+.TP
+.B PERF_SAMPLE_BRANCH_ANY_CALL
+Any call branch (includes direct calls, indirect calls, and far jumps).
+.TP
+.B PERF_SAMPLE_BRANCH_IND_CALL
+Indirect calls.
+.TP
+.BR PERF_SAMPLE_BRANCH_CALL " (since Linux 4.4)"
+.\" commit c229bf9dc179d2023e185c0f705bdf68484c1e73
+Direct calls.
+.TP
+.B PERF_SAMPLE_BRANCH_ANY_RETURN
+Any return branch.
+.TP
+.BR PERF_SAMPLE_BRANCH_IND_JUMP " (since Linux 4.2)"
+.\" commit c9fdfa14c3792c0160849c484e83aa57afd80ccc
+Indirect jumps.
+.TP
+.BR PERF_SAMPLE_BRANCH_COND " (since Linux 3.16)"
+.\" commit bac52139f0b7ab31330e98fd87fc5a2664951050
+Conditional branches.
+.TP
+.BR PERF_SAMPLE_BRANCH_ABORT_TX " (since Linux 3.11)"
+.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
+Transactional memory aborts.
+.TP
+.BR PERF_SAMPLE_BRANCH_IN_TX " (since Linux 3.11)"
+.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
+Branch in transactional memory transaction.
+.TP
+.BR PERF_SAMPLE_BRANCH_NO_TX " (since Linux 3.11)"
+.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
+Branch not in transactional memory transaction.
+.TP
+.BR PERF_SAMPLE_BRANCH_CALL_STACK " (since Linux 4.1)"
+.\" commit 2c44b1936bb3b135a3fac8b3493394d42e51cf70
+Branch is part of a hardware-generated call stack.
+This requires hardware support, currently only found
+on Intel x86 Haswell or newer.
+.RE
+.TP
+.IR sample_regs_user " (since Linux 3.7)"
+.\" commit 4018994f3d8785275ef0e7391b75c3462c029e56
+This bit mask defines the set of user CPU registers to dump on samples.
+The layout of the register mask is architecture-specific and
+is described in the kernel header file
+.IR arch/ARCH/include/uapi/asm/perf_regs.h .
+.TP
+.IR sample_stack_user " (since Linux 3.7)"
+.\" commit c5ebcedb566ef17bda7b02686e0d658a7bb42ee7
+This defines the size of the user stack to dump if
+.B PERF_SAMPLE_STACK_USER
+is specified.
+.TP
+.IR clockid " (since Linux 4.1)"
+.\" commit 34f439278cef7b1177f8ce24f9fc81dfc6221d3b
+If
+.I use_clockid
+is set, then this field selects which internal Linux timer to
+use for timestamps.
+The available timers are defined in
+.IR linux/time.h ,
+with
+.BR CLOCK_MONOTONIC ,
+.BR CLOCK_MONOTONIC_RAW ,
+.BR CLOCK_REALTIME ,
+.BR CLOCK_BOOTTIME ,
+and
+.B CLOCK_TAI
+currently supported.
+.TP
+.IR aux_watermark " (since Linux 4.1)"
+.\" commit 1a5941312414c71dece6717da9a0fa1303127afa
+This specifies how much data is required to trigger a
+.B PERF_RECORD_AUX
+sample.
+.TP
+.IR sample_max_stack " (since Linux 4.8)"
+.\" commit 97c79a38cd454602645f0470ffb444b3b75ce574
+When
+.I sample_type
+includes
+.BR PERF_SAMPLE_CALLCHAIN ,
+this field specifies how many stack frames to report when
+generating the callchain.
+.TP
+.IR aux_sample_size " (since Linux 5.5)"
+.\" commit a4faf00d994c40e64f656805ac375c65e324eefb
+When
+.B PERF_SAMPLE_AUX
+flag is set,
+specify the desired size of AUX data.
+Note that the data actually captured can be smaller than the specified size.
+.TP
+.IR sig_data " (since Linux 5.13)"
+.\" commit 97ba62b278674293762c3d91f724f1bb922f04e0
+This data will be copied to user's signal handler
+(through
+.I si_perf
+in the
+.IR siginfo_t )
+to disambiguate which event triggered the signal.
+.SS Reading results
+Once a
+.BR perf_event_open ()
+file descriptor has been opened, the values
+of the events can be read from the file descriptor.
+The values that are there are specified by the
+.I read_format
+field in the
+.I attr
+structure at open time.
+.PP
+If you attempt to read into a buffer that is not big enough to hold the
+data, the error
+.B ENOSPC
+results.
+.PP
+Here is the layout of the data returned by a read:
+.IP \[bu] 3
+If
+.B PERF_FORMAT_GROUP
+was specified to allow reading all events in a group at once:
+.IP
+.in +4n
+.EX
+struct read_format {
+ u64 nr; /* The number of events */
+ u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
+ u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
+ struct {
+ u64 value; /* The value of the event */
+ u64 id; /* if PERF_FORMAT_ID */
+ u64 lost; /* if PERF_FORMAT_LOST */
+ } values[nr];
+};
+.EE
+.in
+.IP \[bu]
+If
+.B PERF_FORMAT_GROUP
+was
+.I not
+specified:
+.IP
+.in +4n
+.EX
+struct read_format {
+ u64 value; /* The value of the event */
+ u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
+ u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
+ u64 id; /* if PERF_FORMAT_ID */
+ u64 lost; /* if PERF_FORMAT_LOST */
+};
+.EE
+.in
+.PP
+The values read are as follows:
+.TP
+.I nr
+The number of events in this file descriptor.
+Available only if
+.B PERF_FORMAT_GROUP
+was specified.
+.TP
+.IR time_enabled ", " time_running
+Total time the event was enabled and running.
+Normally these values are the same.
+Multiplexing happens if the number of events is more than the
+number of available PMU counter slots.
+In that case the events run only part of the time and the
+.I time_enabled
+and
+.I time_running
+values can be used to scale an estimated value for the count.
+.TP
+.I value
+An unsigned 64-bit value containing the counter result.
+.TP
+.I id
+A globally unique value for this particular event; only present if
+.B PERF_FORMAT_ID
+was specified in
+.IR read_format .
+.TP
+.I lost
+The number of lost samples of this event;
+only present if
+.B PERF_FORMAT_LOST
+was specified in
+.IR read_format .
+.SS MMAP layout
+When using
+.BR perf_event_open ()
+in sampled mode, asynchronous events
+(like counter overflow or
+.B PROT_EXEC
+mmap tracking)
+are logged into a ring-buffer.
+This ring-buffer is created and accessed through
+.BR mmap (2).
+.PP
+The mmap size should be 1+2\[ha]n pages, where the first page is a
+metadata page
+.RI ( "struct perf_event_mmap_page" )
+that contains various
+bits of information such as where the ring-buffer head is.
+.PP
+Before Linux 2.6.39, there is a bug that means you must allocate an mmap
+ring buffer when sampling even if you do not plan to access it.
+.PP
+The structure of the first metadata mmap page is as follows:
+.PP
+.in +4n
+.EX
+struct perf_event_mmap_page {
+ __u32 version; /* version number of this structure */
+ __u32 compat_version; /* lowest version this is compat with */
+ __u32 lock; /* seqlock for synchronization */
+ __u32 index; /* hardware counter identifier */
+ __s64 offset; /* add to hardware counter value */
+ __u64 time_enabled; /* time event active */
+ __u64 time_running; /* time event on CPU */
+ union {
+ __u64 capabilities;
+ struct {
+ __u64 cap_usr_time / cap_usr_rdpmc / cap_bit0 : 1,
+ cap_bit0_is_deprecated : 1,
+ cap_user_rdpmc : 1,
+ cap_user_time : 1,
+ cap_user_time_zero : 1,
+ };
+ };
+ __u16 pmc_width;
+ __u16 time_shift;
+ __u32 time_mult;
+ __u64 time_offset;
+ __u64 __reserved[120]; /* Pad to 1 k */
+ __u64 data_head; /* head in the data section */
+ __u64 data_tail; /* user\-space written tail */
+ __u64 data_offset; /* where the buffer starts */
+ __u64 data_size; /* data buffer size */
+ __u64 aux_head;
+ __u64 aux_tail;
+ __u64 aux_offset;
+ __u64 aux_size;
+\&
+};
+.EE
+.in
+.PP
+The following list describes the fields in the
+.I perf_event_mmap_page
+structure in more detail:
+.TP
+.I version
+Version number of this structure.
+.TP
+.I compat_version
+The lowest version this is compatible with.
+.TP
+.I lock
+A seqlock for synchronization.
+.TP
+.I index
+A unique hardware counter identifier.
+.TP
+.I offset
+When using rdpmc for reads this offset value
+must be added to the one returned by rdpmc to get
+the current total event count.
+.TP
+.I time_enabled
+Time the event was active.
+.TP
+.I time_running
+Time the event was running.
+.TP
+.IR cap_usr_time " / " cap_usr_rdpmc " / " cap_bit0 " (since Linux 3.4)"
+.\" commit c7206205d00ab375839bd6c7ddb247d600693c09
+There was a bug in the definition of
+.I cap_usr_time
+and
+.I cap_usr_rdpmc
+from Linux 3.4 until Linux 3.11.
+Both bits were defined to point to the same location, so it was
+impossible to know if
+.I cap_usr_time
+or
+.I cap_usr_rdpmc
+were actually set.
+.IP
+Starting with Linux 3.12, these are renamed to
+.\" commit fa7315871046b9a4c48627905691dbde57e51033
+.I cap_bit0
+and you should use the
+.I cap_user_time
+and
+.I cap_user_rdpmc
+fields instead.
+.TP
+.IR cap_bit0_is_deprecated " (since Linux 3.12)"
+.\" commit fa7315871046b9a4c48627905691dbde57e51033
+If set, this bit indicates that the kernel supports
+the properly separated
+.I cap_user_time
+and
+.I cap_user_rdpmc
+bits.
+.IP
+If not-set, it indicates an older kernel where
+.I cap_usr_time
+and
+.I cap_usr_rdpmc
+map to the same bit and thus both features should
+be used with caution.
+.TP
+.IR cap_user_rdpmc " (since Linux 3.12)"
+.\" commit fa7315871046b9a4c48627905691dbde57e51033
+If the hardware supports user-space read of performance counters
+without syscall (this is the "rdpmc" instruction on x86), then
+the following code can be used to do a read:
+.IP
+.in +4n
+.EX
+u32 seq, time_mult, time_shift, idx, width;
+u64 count, enabled, running;
+u64 cyc, time_offset;
+\&
+do {
+ seq = pc\->lock;
+ barrier();
+ enabled = pc\->time_enabled;
+ running = pc\->time_running;
+\&
+ if (pc\->cap_usr_time && enabled != running) {
+ cyc = rdtsc();
+ time_offset = pc\->time_offset;
+ time_mult = pc\->time_mult;
+ time_shift = pc\->time_shift;
+ }
+\&
+ idx = pc\->index;
+ count = pc\->offset;
+\&
+ if (pc\->cap_usr_rdpmc && idx) {
+ width = pc\->pmc_width;
+ count += rdpmc(idx \- 1);
+ }
+\&
+ barrier();
+} while (pc\->lock != seq);
+.EE
+.in
+.TP
+.IR cap_user_time " (since Linux 3.12)"
+.\" commit fa7315871046b9a4c48627905691dbde57e51033
+This bit indicates the hardware has a constant, nonstop
+timestamp counter (TSC on x86).
+.TP
+.IR cap_user_time_zero " (since Linux 3.12)"
+.\" commit fa7315871046b9a4c48627905691dbde57e51033
+Indicates the presence of
+.I time_zero
+which allows mapping timestamp values to
+the hardware clock.
+.TP
+.I pmc_width
+If
+.IR cap_user_rdpmc ,
+this field provides the bit-width of the value
+read using the rdpmc or equivalent instruction.
+This can be used to sign extend the result like:
+.IP
+.in +4n
+.EX
+pmc <<= 64 \- pmc_width;
+pmc >>= 64 \- pmc_width; // signed shift right
+count += pmc;
+.EE
+.in
+.TP
+.IR time_shift ", " time_mult ", " time_offset
+.IP
+If
+.IR cap_user_time ,
+these fields can be used to compute the time
+delta since
+.I time_enabled
+(in nanoseconds) using rdtsc or similar.
+.IP
+.in +4n
+.EX
+u64 quot, rem;
+u64 delta;
+\&
+quot = cyc >> time_shift;
+rem = cyc & (((u64)1 << time_shift) \- 1);
+delta = time_offset + quot * time_mult +
+ ((rem * time_mult) >> time_shift);
+.EE
+.in
+.IP
+Where
+.IR time_offset ,
+.IR time_mult ,
+.IR time_shift ,
+and
+.I cyc
+are read in the
+seqcount loop described above.
+This delta can then be added to
+enabled and possibly running (if idx), improving the scaling:
+.IP
+.in +4n
+.EX
+enabled += delta;
+if (idx)
+ running += delta;
+quot = count / running;
+rem = count % running;
+count = quot * enabled + (rem * enabled) / running;
+.EE
+.in
+.TP
+.IR time_zero " (since Linux 3.12)"
+.\" commit fa7315871046b9a4c48627905691dbde57e51033
+.IP
+If
+.I cap_user_time_zero
+is set, then the hardware clock (the TSC timestamp counter on x86)
+can be calculated from the
+.IR time_zero ,
+.IR time_mult ,
+and
+.I time_shift
+values:
+.IP
+.in +4n
+.EX
+time = timestamp \- time_zero;
+quot = time / time_mult;
+rem = time % time_mult;
+cyc = (quot << time_shift) + (rem << time_shift) / time_mult;
+.EE
+.in
+.IP
+And vice versa:
+.IP
+.in +4n
+.EX
+quot = cyc >> time_shift;
+rem = cyc & (((u64)1 << time_shift) \- 1);
+timestamp = time_zero + quot * time_mult +
+ ((rem * time_mult) >> time_shift);
+.EE
+.in
+.TP
+.I data_head
+This points to the head of the data section.
+The value continuously increases, it does not wrap.
+The value needs to be manually wrapped by the size of the mmap buffer
+before accessing the samples.
+.IP
+On SMP-capable platforms, after reading the
+.I data_head
+value,
+user space should issue an rmb().
+.TP
+.I data_tail
+When the mapping is
+.BR PROT_WRITE ,
+the
+.I data_tail
+value should be written by user space to reflect the last read data.
+In this case, the kernel will not overwrite unread data.
+.TP
+.IR data_offset " (since Linux 4.1)"
+.\" commit e8c6deac69629c0cb97c3d3272f8631ef17f8f0f
+Contains the offset of the location in the mmap buffer
+where perf sample data begins.
+.TP
+.IR data_size " (since Linux 4.1)"
+.\" commit e8c6deac69629c0cb97c3d3272f8631ef17f8f0f
+Contains the size of the perf sample region within
+the mmap buffer.
+.TP
+.IR aux_head ", " aux_tail ", " aux_offset ", " aux_size " (since Linux 4.1)"
+.\" commit 45bfb2e50471abbbfd83d40d28c986078b0d24ff
+The AUX region allows
+.BR mmap (2)-ing
+a separate sample buffer for
+high-bandwidth data streams (separate from the main perf sample buffer).
+An example of a high-bandwidth stream is instruction tracing support,
+as is found in newer Intel processors.
+.IP
+To set up an AUX area, first
+.I aux_offset
+needs to be set with an offset greater than
+.IR data_offset + data_size
+and
+.I aux_size
+needs to be set to the desired buffer size.
+The desired offset and size must be page aligned, and the size
+must be a power of two.
+These values are then passed to mmap in order to map the AUX buffer.
+Pages in the AUX buffer are included as part of the
+.B RLIMIT_MEMLOCK
+resource limit (see
+.BR setrlimit (2)),
+and also as part of the
+.I perf_event_mlock_kb
+allowance.
+.IP
+By default, the AUX buffer will be truncated if it will not fit
+in the available space in the ring buffer.
+If the AUX buffer is mapped as a read only buffer, then it will
+operate in ring buffer mode where old data will be overwritten
+by new.
+In overwrite mode, it might not be possible to infer where the
+new data began, and it is the consumer's job to disable
+measurement while reading to avoid possible data races.
+.IP
+The
+.I aux_head
+and
+.I aux_tail
+ring buffer pointers have the same behavior and ordering
+rules as the previously described
+.I data_head
+and
+.IR data_tail .
+.PP
+The following 2\[ha]n ring-buffer pages have the layout described below.
+.PP
+If
+.I perf_event_attr.sample_id_all
+is set, then all event types will
+have the sample_type selected fields related to where/when (identity)
+an event took place (TID, TIME, ID, CPU, STREAM_ID) described in
+.B PERF_RECORD_SAMPLE
+below; they will be stashed just after the
+.I perf_event_header
+and the fields already present for the existing
+fields, that is, at the end of the payload.
+This allows a newer perf.data
+file to be supported by older perf tools, with the new optional
+fields being ignored.
+.PP
+The mmap values start with a header:
+.PP
+.in +4n
+.EX
+struct perf_event_header {
+ __u32 type;
+ __u16 misc;
+ __u16 size;
+};
+.EE
+.in
+.PP
+Below, we describe the
+.I perf_event_header
+fields in more detail.
+For ease of reading,
+the fields with shorter descriptions are presented first.
+.TP
+.I size
+This indicates the size of the record.
+.TP
+.I misc
+The
+.I misc
+field contains additional information about the sample.
+.IP
+The CPU mode can be determined from this value by masking with
+.B PERF_RECORD_MISC_CPUMODE_MASK
+and looking for one of the following (note these are not
+bit masks, only one can be set at a time):
+.RS
+.TP
+.B PERF_RECORD_MISC_CPUMODE_UNKNOWN
+Unknown CPU mode.
+.TP
+.B PERF_RECORD_MISC_KERNEL
+Sample happened in the kernel.
+.TP
+.B PERF_RECORD_MISC_USER
+Sample happened in user code.
+.TP
+.B PERF_RECORD_MISC_HYPERVISOR
+Sample happened in the hypervisor.
+.TP
+.BR PERF_RECORD_MISC_GUEST_KERNEL " (since Linux 2.6.35)"
+.\" commit 39447b386c846bbf1c56f6403c5282837486200f
+Sample happened in the guest kernel.
+.TP
+.BR PERF_RECORD_MISC_GUEST_USER " (since Linux 2.6.35)"
+.\" commit 39447b386c846bbf1c56f6403c5282837486200f
+Sample happened in guest user code.
+.RE
+.PP
+.RS
+Since the following three statuses are generated by
+different record types, they alias to the same bit:
+.TP
+.BR PERF_RECORD_MISC_MMAP_DATA " (since Linux 3.10)"
+.\" commit 2fe85427e3bf65d791700d065132772fc26e4d75
+This is set when the mapping is not executable;
+otherwise the mapping is executable.
+.TP
+.BR PERF_RECORD_MISC_COMM_EXEC " (since Linux 3.16)"
+.\" commit 82b897782d10fcc4930c9d4a15b175348fdd2871
+This is set for a
+.B PERF_RECORD_COMM
+record on kernels more recent than Linux 3.16
+if a process name change was caused by an
+.BR execve (2)
+system call.
+.TP
+.BR PERF_RECORD_MISC_SWITCH_OUT " (since Linux 4.3)"
+.\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4
+When a
+.B PERF_RECORD_SWITCH
+or
+.B PERF_RECORD_SWITCH_CPU_WIDE
+record is generated, this bit indicates that the
+context switch is away from the current process
+(instead of into the current process).
+.RE
+.PP
+.RS
+In addition, the following bits can be set:
+.TP
+.B PERF_RECORD_MISC_EXACT_IP
+This indicates that the content of
+.B PERF_SAMPLE_IP
+points
+to the actual instruction that triggered the event.
+See also
+.IR perf_event_attr.precise_ip .
+.TP
+.BR PERF_RECORD_MISC_SWITCH_OUT_PREEMPT " (since Linux 4.17)"
+.\" commit 101592b4904ecf6b8ed2a4784d41d180319d95a1
+When a
+.B PERF_RECORD_SWITCH
+or
+.B PERF_RECORD_SWITCH_CPU_WIDE
+record is generated,
+this indicates the context switch was a preemption.
+.TP
+.BR PERF_RECORD_MISC_MMAP_BUILD_ID " (since Linux 5.12)"
+.\" commit 88a16a1309333e43d328621ece3e9fa37027e8eb
+This indicates that the content of
+.B PERF_RECORD_MMAP2
+contains build-ID data instead of device major and minor numbers
+as well as the inode number.
+.TP
+.BR PERF_RECORD_MISC_EXT_RESERVED " (since Linux 2.6.35)"
+.\" commit 1676b8a077c352085d52578fb4f29350b58b6e74
+This indicates there is extended data available (currently not used).
+.TP
+.B PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT
+.\" commit 930e6fcd2bcce9bcd9d4aa7e755678d33f3fe6f4
+This bit is not set by the kernel.
+It is reserved for the user-space perf utility to indicate that
+.IR /proc/ pid /maps
+parsing was taking too long and was stopped, and thus the mmap
+records may be truncated.
+.RE
+.TP
+.I type
+The
+.I type
+value is one of the below.
+The values in the corresponding record (that follows the header)
+depend on the
+.I type
+selected as shown.
+.RS
+.TP 4
+.B PERF_RECORD_MMAP
+The MMAP events record the
+.B PROT_EXEC
+mappings so that we can correlate
+user-space IPs to code.
+They have the following structure:
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u32 pid, tid;
+ u64 addr;
+ u64 len;
+ u64 pgoff;
+ char filename[];
+};
+.EE
+.in
+.RS
+.TP
+.I pid
+is the process ID.
+.TP
+.I tid
+is the thread ID.
+.TP
+.I addr
+is the address of the allocated memory.
+.TP
+.I len
+is the length of the allocated memory.
+.TP
+.I pgoff
+is the page offset of the allocated memory.
+.TP
+.I filename
+is a string describing the backing of the allocated memory.
+.RE
+.TP
+.B PERF_RECORD_LOST
+This record indicates when events are lost.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u64 id;
+ u64 lost;
+ struct sample_id sample_id;
+};
+.EE
+.in
+.RS
+.TP
+.I id
+is the unique event ID for the samples that were lost.
+.TP
+.I lost
+is the number of events that were lost.
+.RE
+.TP
+.B PERF_RECORD_COMM
+This record indicates a change in the process name.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u32 pid;
+ u32 tid;
+ char comm[];
+ struct sample_id sample_id;
+};
+.EE
+.in
+.RS
+.TP
+.I pid
+is the process ID.
+.TP
+.I tid
+is the thread ID.
+.TP
+.I comm
+is a string containing the new name of the process.
+.RE
+.TP
+.B PERF_RECORD_EXIT
+This record indicates a process exit event.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u32 pid, ppid;
+ u32 tid, ptid;
+ u64 time;
+ struct sample_id sample_id;
+};
+.EE
+.in
+.TP
+.BR PERF_RECORD_THROTTLE ", " PERF_RECORD_UNTHROTTLE
+This record indicates a throttle/unthrottle event.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u64 time;
+ u64 id;
+ u64 stream_id;
+ struct sample_id sample_id;
+};
+.EE
+.in
+.TP
+.B PERF_RECORD_FORK
+This record indicates a fork event.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u32 pid, ppid;
+ u32 tid, ptid;
+ u64 time;
+ struct sample_id sample_id;
+};
+.EE
+.in
+.TP
+.B PERF_RECORD_READ
+This record indicates a read event.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u32 pid, tid;
+ struct read_format values;
+ struct sample_id sample_id;
+};
+.EE
+.in
+.TP
+.B PERF_RECORD_SAMPLE
+This record indicates a sample.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u64 sample_id; /* if PERF_SAMPLE_IDENTIFIER */
+ u64 ip; /* if PERF_SAMPLE_IP */
+ u32 pid, tid; /* if PERF_SAMPLE_TID */
+ u64 time; /* if PERF_SAMPLE_TIME */
+ u64 addr; /* if PERF_SAMPLE_ADDR */
+ u64 id; /* if PERF_SAMPLE_ID */
+ u64 stream_id; /* if PERF_SAMPLE_STREAM_ID */
+ u32 cpu, res; /* if PERF_SAMPLE_CPU */
+ u64 period; /* if PERF_SAMPLE_PERIOD */
+ struct read_format v;
+ /* if PERF_SAMPLE_READ */
+ u64 nr; /* if PERF_SAMPLE_CALLCHAIN */
+ u64 ips[nr]; /* if PERF_SAMPLE_CALLCHAIN */
+ u32 size; /* if PERF_SAMPLE_RAW */
+ char data[size]; /* if PERF_SAMPLE_RAW */
+ u64 bnr; /* if PERF_SAMPLE_BRANCH_STACK */
+ struct perf_branch_entry lbr[bnr];
+ /* if PERF_SAMPLE_BRANCH_STACK */
+ u64 abi; /* if PERF_SAMPLE_REGS_USER */
+ u64 regs[weight(mask)];
+ /* if PERF_SAMPLE_REGS_USER */
+ u64 size; /* if PERF_SAMPLE_STACK_USER */
+ char data[size]; /* if PERF_SAMPLE_STACK_USER */
+ u64 dyn_size; /* if PERF_SAMPLE_STACK_USER &&
+ size != 0 */
+ union perf_sample_weight weight;
+ /* if PERF_SAMPLE_WEIGHT */
+ /* || PERF_SAMPLE_WEIGHT_STRUCT */
+ u64 data_src; /* if PERF_SAMPLE_DATA_SRC */
+ u64 transaction; /* if PERF_SAMPLE_TRANSACTION */
+ u64 abi; /* if PERF_SAMPLE_REGS_INTR */
+ u64 regs[weight(mask)];
+ /* if PERF_SAMPLE_REGS_INTR */
+ u64 phys_addr; /* if PERF_SAMPLE_PHYS_ADDR */
+ u64 cgroup; /* if PERF_SAMPLE_CGROUP */
+ u64 data_page_size;
+ /* if PERF_SAMPLE_DATA_PAGE_SIZE */
+ u64 code_page_size;
+ /* if PERF_SAMPLE_CODE_PAGE_SIZE */
+ u64 size; /* if PERF_SAMPLE_AUX */
+ char data[size]; /* if PERF_SAMPLE_AUX */
+};
+.EE
+.in
+.RS 4
+.TP 4
+.I sample_id
+If
+.B PERF_SAMPLE_IDENTIFIER
+is enabled, a 64-bit unique ID is included.
+This is a duplication of the
+.B PERF_SAMPLE_ID
+.I id
+value, but included at the beginning of the sample
+so parsers can easily obtain the value.
+.TP
+.I ip
+If
+.B PERF_SAMPLE_IP
+is enabled, then a 64-bit instruction
+pointer value is included.
+.TP
+.IR pid ", " tid
+If
+.B PERF_SAMPLE_TID
+is enabled, then a 32-bit process ID
+and 32-bit thread ID are included.
+.TP
+.I time
+If
+.B PERF_SAMPLE_TIME
+is enabled, then a 64-bit timestamp
+is included.
+This is obtained via local_clock() which is a hardware timestamp
+if available and the jiffies value if not.
+.TP
+.I addr
+If
+.B PERF_SAMPLE_ADDR
+is enabled, then a 64-bit address is included.
+This is usually the address of a tracepoint,
+breakpoint, or software event; otherwise the value is 0.
+.TP
+.I id
+If
+.B PERF_SAMPLE_ID
+is enabled, a 64-bit unique ID is included.
+If the event is a member of an event group, the group leader ID is returned.
+This ID is the same as the one returned by
+.BR PERF_FORMAT_ID .
+.TP
+.I stream_id
+If
+.B PERF_SAMPLE_STREAM_ID
+is enabled, a 64-bit unique ID is included.
+Unlike
+.B PERF_SAMPLE_ID
+the actual ID is returned, not the group leader.
+This ID is the same as the one returned by
+.BR PERF_FORMAT_ID .
+.TP
+.IR cpu ", " res
+If
+.B PERF_SAMPLE_CPU
+is enabled, this is a 32-bit value indicating
+which CPU was being used, in addition to a reserved (unused)
+32-bit value.
+.TP
+.I period
+If
+.B PERF_SAMPLE_PERIOD
+is enabled, a 64-bit value indicating
+the current sampling period is written.
+.TP
+.I v
+If
+.B PERF_SAMPLE_READ
+is enabled, a structure of type read_format
+is included which has values for all events in the event group.
+The values included depend on the
+.I read_format
+value used at
+.BR perf_event_open ()
+time.
+.TP
+.IR nr ", " ips[nr]
+If
+.B PERF_SAMPLE_CALLCHAIN
+is enabled, then a 64-bit number is included
+which indicates how many 64-bit instruction pointers follow.
+This is the current callchain.
+.TP
+.IR size ", " data[size]
+If
+.B PERF_SAMPLE_RAW
+is enabled, then a 32-bit value indicating size
+is included followed by an array of 8-bit values of length size.
+The values are padded with 0 to have 64-bit alignment.
+.IP
+This RAW record data is opaque with respect to the ABI.
+The ABI doesn't make any promises with respect to the stability
+of its content, it may vary depending
+on event, hardware, and kernel version.
+.TP
+.IR bnr ", " lbr[bnr]
+If
+.B PERF_SAMPLE_BRANCH_STACK
+is enabled, then a 64-bit value indicating
+the number of records is included, followed by
+.I bnr
+.I perf_branch_entry
+structures which each include the fields:
+.RS
+.TP
+.I from
+This indicates the source instruction (may not be a branch).
+.TP
+.I to
+The branch target.
+.TP
+.I mispred
+The branch target was mispredicted.
+.TP
+.I predicted
+The branch target was predicted.
+.TP
+.IR in_tx " (since Linux 3.11)"
+.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
+The branch was in a transactional memory transaction.
+.TP
+.IR abort " (since Linux 3.11)"
+.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
+The branch was in an aborted transactional memory transaction.
+.TP
+.IR cycles " (since Linux 4.3)"
+.\" commit 71ef3c6b9d4665ee7afbbe4c208a98917dcfc32f
+This reports the number of cycles elapsed since the
+previous branch stack update.
+.PP
+The entries are from most to least recent, so the first entry
+has the most recent branch.
+.PP
+Support for
+.IR mispred ,
+.IR predicted ,
+and
+.I cycles
+is optional; if not supported, those
+values will be 0.
+.PP
+The type of branches recorded is specified by the
+.I branch_sample_type
+field.
+.RE
+.TP
+.IR abi ", " regs[weight(mask)]
+If
+.B PERF_SAMPLE_REGS_USER
+is enabled, then the user CPU registers are recorded.
+.IP
+The
+.I abi
+field is one of
+.BR PERF_SAMPLE_REGS_ABI_NONE ,
+.BR PERF_SAMPLE_REGS_ABI_32 ,
+or
+.BR PERF_SAMPLE_REGS_ABI_64 .
+.IP
+The
+.I regs
+field is an array of the CPU registers that were specified by
+the
+.I sample_regs_user
+attr field.
+The number of values is the number of bits set in the
+.I sample_regs_user
+bit mask.
+.TP
+.IR size ", " data[size] ", " dyn_size
+If
+.B PERF_SAMPLE_STACK_USER
+is enabled, then the user stack is recorded.
+This can be used to generate stack backtraces.
+.I size
+is the size requested by the user in
+.I sample_stack_user
+or else the maximum record size.
+.I data
+is the stack data (a raw dump of the memory pointed to by the
+stack pointer at the time of sampling).
+.I dyn_size
+is the amount of data actually dumped (can be less than
+.IR size ).
+Note that
+.I dyn_size
+is omitted if
+.I size
+is 0.
+.TP
+.I weight
+If
+.B PERF_SAMPLE_WEIGHT
+or
+.B PERF_SAMPLE_WEIGHT_STRUCT
+is enabled, then a 64-bit value provided by the hardware
+is recorded that indicates how costly the event was.
+This allows expensive events to stand out more clearly
+in profiles.
+.TP
+.I data_src
+If
+.B PERF_SAMPLE_DATA_SRC
+is enabled, then a 64-bit value is recorded that is made up of
+the following fields:
+.RS
+.TP 4
+.I mem_op
+Type of opcode, a bitwise combination of:
+.IP
+.PD 0
+.RS
+.TP 24
+.B PERF_MEM_OP_NA
+Not available
+.TP
+.B PERF_MEM_OP_LOAD
+Load instruction
+.TP
+.B PERF_MEM_OP_STORE
+Store instruction
+.TP
+.B PERF_MEM_OP_PFETCH
+Prefetch
+.TP
+.B PERF_MEM_OP_EXEC
+Executable code
+.RE
+.PD
+.TP
+.I mem_lvl
+Memory hierarchy level hit or miss, a bitwise combination of
+the following, shifted left by
+.BR PERF_MEM_LVL_SHIFT :
+.IP
+.PD 0
+.RS
+.TP 24
+.B PERF_MEM_LVL_NA
+Not available
+.TP
+.B PERF_MEM_LVL_HIT
+Hit
+.TP
+.B PERF_MEM_LVL_MISS
+Miss
+.TP
+.B PERF_MEM_LVL_L1
+Level 1 cache
+.TP
+.B PERF_MEM_LVL_LFB
+Line fill buffer
+.TP
+.B PERF_MEM_LVL_L2
+Level 2 cache
+.TP
+.B PERF_MEM_LVL_L3
+Level 3 cache
+.TP
+.B PERF_MEM_LVL_LOC_RAM
+Local DRAM
+.TP
+.B PERF_MEM_LVL_REM_RAM1
+Remote DRAM 1 hop
+.TP
+.B PERF_MEM_LVL_REM_RAM2
+Remote DRAM 2 hops
+.TP
+.B PERF_MEM_LVL_REM_CCE1
+Remote cache 1 hop
+.TP
+.B PERF_MEM_LVL_REM_CCE2
+Remote cache 2 hops
+.TP
+.B PERF_MEM_LVL_IO
+I/O memory
+.TP
+.B PERF_MEM_LVL_UNC
+Uncached memory
+.RE
+.PD
+.TP
+.I mem_snoop
+Snoop mode, a bitwise combination of the following, shifted left by
+.BR PERF_MEM_SNOOP_SHIFT :
+.IP
+.PD 0
+.RS
+.TP 24
+.B PERF_MEM_SNOOP_NA
+Not available
+.TP
+.B PERF_MEM_SNOOP_NONE
+No snoop
+.TP
+.B PERF_MEM_SNOOP_HIT
+Snoop hit
+.TP
+.B PERF_MEM_SNOOP_MISS
+Snoop miss
+.TP
+.B PERF_MEM_SNOOP_HITM
+Snoop hit modified
+.RE
+.PD
+.TP
+.I mem_lock
+Lock instruction, a bitwise combination of the following, shifted left by
+.BR PERF_MEM_LOCK_SHIFT :
+.IP
+.PD 0
+.RS
+.TP 24
+.B PERF_MEM_LOCK_NA
+Not available
+.TP
+.B PERF_MEM_LOCK_LOCKED
+Locked transaction
+.RE
+.PD
+.TP
+.I mem_dtlb
+TLB access hit or miss, a bitwise combination of the following, shifted
+left by
+.BR PERF_MEM_TLB_SHIFT :
+.IP
+.PD 0
+.RS
+.TP 24
+.B PERF_MEM_TLB_NA
+Not available
+.TP
+.B PERF_MEM_TLB_HIT
+Hit
+.TP
+.B PERF_MEM_TLB_MISS
+Miss
+.TP
+.B PERF_MEM_TLB_L1
+Level 1 TLB
+.TP
+.B PERF_MEM_TLB_L2
+Level 2 TLB
+.TP
+.B PERF_MEM_TLB_WK
+Hardware walker
+.TP
+.B PERF_MEM_TLB_OS
+OS fault handler
+.RE
+.PD
+.RE
+.TP
+.I transaction
+If the
+.B PERF_SAMPLE_TRANSACTION
+flag is set, then a 64-bit field is recorded describing
+the sources of any transactional memory aborts.
+.IP
+The field is a bitwise combination of the following values:
+.RS
+.TP
+.B PERF_TXN_ELISION
+Abort from an elision type transaction (Intel-CPU-specific).
+.TP
+.B PERF_TXN_TRANSACTION
+Abort from a generic transaction.
+.TP
+.B PERF_TXN_SYNC
+Synchronous abort (related to the reported instruction).
+.TP
+.B PERF_TXN_ASYNC
+Asynchronous abort (not related to the reported instruction).
+.TP
+.B PERF_TXN_RETRY
+Retryable abort (retrying the transaction may have succeeded).
+.TP
+.B PERF_TXN_CONFLICT
+Abort due to memory conflicts with other threads.
+.TP
+.B PERF_TXN_CAPACITY_WRITE
+Abort due to write capacity overflow.
+.TP
+.B PERF_TXN_CAPACITY_READ
+Abort due to read capacity overflow.
+.RE
+.IP
+In addition, a user-specified abort code can be obtained from
+the high 32 bits of the field by shifting right by
+.B PERF_TXN_ABORT_SHIFT
+and masking with the value
+.BR PERF_TXN_ABORT_MASK .
+.TP
+.IR abi ", " regs[weight(mask)]
+If
+.B PERF_SAMPLE_REGS_INTR
+is enabled, then the CPU registers at the time of the sampling
+interrupt are recorded.
+.IP
+The
+.I abi
+field is one of
+.BR PERF_SAMPLE_REGS_ABI_NONE ,
+.BR PERF_SAMPLE_REGS_ABI_32 ,
+or
+.BR PERF_SAMPLE_REGS_ABI_64 .
+.IP
+The
+.I regs
+field is an array of the CPU registers that were specified by
+the
+.I sample_regs_intr
+attr field.
+The number of values is the number of bits set in the
+.I sample_regs_intr
+bit mask.
+.TP
+.I phys_addr
+If the
+.B PERF_SAMPLE_PHYS_ADDR
+flag is set, then the 64-bit physical address is recorded.
+.TP
+.I cgroup
+If the
+.B PERF_SAMPLE_CGROUP
+flag is set,
+then the 64-bit cgroup ID (for the perf_event subsystem) is recorded.
+To get the pathname of the cgroup, the ID should match to one in a
+.BR PERF_RECORD_CGROUP .
+.TP
+.I data_page_size
+If the
+.B PERF_SAMPLE_DATA_PAGE_SIZE
+flag is set,
+then the 64-bit page size value of the
+.B data
+address is recorded.
+.TP
+.I code_page_size
+If the
+.B PERF_SAMPLE_CODE_PAGE_SIZE
+flag is set,
+then the 64-bit page size value of the
+.B ip
+address is recorded.
+.TP
+.I size
+.TQ
+.IR data [ size ]
+If
+.B PERF_SAMPLE_AUX
+is enabled,
+a snapshot of the aux buffer is recorded.
+.RE
+.TP
+.B PERF_RECORD_MMAP2
+This record includes extended information on
+.BR mmap (2)
+calls returning executable mappings.
+The format is similar to that of the
+.B PERF_RECORD_MMAP
+record, but includes extra values that allow uniquely identifying
+shared mappings.
+Depending on the
+.B PERF_RECORD_MISC_MMAP_BUILD_ID
+bit in the header,
+the extra values have different layout and meanings.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u32 pid;
+ u32 tid;
+ u64 addr;
+ u64 len;
+ u64 pgoff;
+ union {
+ struct {
+ u32 maj;
+ u32 min;
+ u64 ino;
+ u64 ino_generation;
+ };
+ struct { /* if PERF_RECORD_MISC_MMAP_BUILD_ID */
+ u8 build_id_size;
+ u8 __reserved_1;
+ u16 __reserved_2;
+ u8 build_id[20];
+ };
+ };
+ u32 prot;
+ u32 flags;
+ char filename[];
+ struct sample_id sample_id;
+};
+.EE
+.in
+.RS
+.TP
+.I pid
+is the process ID.
+.TP
+.I tid
+is the thread ID.
+.TP
+.I addr
+is the address of the allocated memory.
+.TP
+.I len
+is the length of the allocated memory.
+.TP
+.I pgoff
+is the page offset of the allocated memory.
+.TP
+.I maj
+is the major ID of the underlying device.
+.TP
+.I min
+is the minor ID of the underlying device.
+.TP
+.I ino
+is the inode number.
+.TP
+.I ino_generation
+is the inode generation.
+.TP
+.I build_id_size
+is the actual size of the
+.I build_id
+field (up to 20).
+.TP
+.I build_id
+is raw data that identifies a binary.
+.TP
+.I prot
+is the protection information.
+.TP
+.I flags
+is the flags information.
+.TP
+.I filename
+is a string describing the backing of the allocated memory.
+.RE
+.TP
+.BR PERF_RECORD_AUX " (since Linux 4.1)"
+.\" commit 68db7e98c3a6ebe7284b6cf14906ed7c55f3f7f0
+This record reports that new data is available in the separate
+AUX buffer region.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u64 aux_offset;
+ u64 aux_size;
+ u64 flags;
+ struct sample_id sample_id;
+};
+.EE
+.in
+.RS
+.TP
+.I aux_offset
+offset in the AUX mmap region where the new data begins.
+.TP
+.I aux_size
+size of the data made available.
+.TP
+.I flags
+describes the AUX update.
+.RS
+.TP
+.B PERF_AUX_FLAG_TRUNCATED
+if set, then the data returned was truncated to fit the available
+buffer size.
+.TP
+.B PERF_AUX_FLAG_OVERWRITE
+.\" commit 2023a0d2829e521fe6ad6b9907f3f90bfbf57142
+if set, then the data returned has overwritten previous data.
+.RE
+.RE
+.TP
+.BR PERF_RECORD_ITRACE_START " (since Linux 4.1)"
+.\" ec0d7729bbaed4b9d2d3fada693278e13a3d1368
+This record indicates which process has initiated an instruction
+trace event, allowing tools to properly correlate the instruction
+addresses in the AUX buffer with the proper executable.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u32 pid;
+ u32 tid;
+};
+.EE
+.in
+.RS
+.TP
+.I pid
+process ID of the thread starting an instruction trace.
+.TP
+.I tid
+thread ID of the thread starting an instruction trace.
+.RE
+.TP
+.BR PERF_RECORD_LOST_SAMPLES " (since Linux 4.2)"
+.\" f38b0dbb491a6987e198aa6b428db8692a6480f8
+When using hardware sampling (such as Intel PEBS) this record
+indicates some number of samples that may have been lost.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u64 lost;
+ struct sample_id sample_id;
+};
+.EE
+.in
+.RS
+.TP
+.I lost
+the number of potentially lost samples.
+.RE
+.TP
+.BR PERF_RECORD_SWITCH " (since Linux 4.3)"
+.\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4
+This record indicates a context switch has happened.
+The
+.B PERF_RECORD_MISC_SWITCH_OUT
+bit in the
+.I misc
+field indicates whether it was a context switch into
+or away from the current process.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ struct sample_id sample_id;
+};
+.EE
+.in
+.TP
+.BR PERF_RECORD_SWITCH_CPU_WIDE " (since Linux 4.3)"
+.\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4
+As with
+.B PERF_RECORD_SWITCH
+this record indicates a context switch has happened,
+but it only occurs when sampling in CPU-wide mode
+and provides additional information on the process
+being switched to/from.
+The
+.B PERF_RECORD_MISC_SWITCH_OUT
+bit in the
+.I misc
+field indicates whether it was a context switch into
+or away from the current process.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u32 next_prev_pid;
+ u32 next_prev_tid;
+ struct sample_id sample_id;
+};
+.EE
+.in
+.RS
+.TP
+.I next_prev_pid
+The process ID of the previous (if switching in)
+or next (if switching out) process on the CPU.
+.TP
+.I next_prev_tid
+The thread ID of the previous (if switching in)
+or next (if switching out) thread on the CPU.
+.RE
+.TP
+.BR PERF_RECORD_NAMESPACES " (since Linux 4.11)"
+.\" commit e422267322cd319e2695a535e47c5b1feeac45eb
+This record includes various namespace information of a process.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u32 pid;
+ u32 tid;
+ u64 nr_namespaces;
+ struct { u64 dev, inode } [nr_namespaces];
+ struct sample_id sample_id;
+};
+.EE
+.in
+.RS
+.TP
+.I pid
+is the process ID.
+.TP
+.I tid
+is the thread ID.
+.TP
+.I nr_namespaces
+is the number of namespaces in this record.
+.RE
+.IP
+Each namespace has
+.I dev
+and
+.I inode
+fields and is recorded at a
+fixed position, as shown below:
+.RS
+.TP
+.BR NET_NS_INDEX = 0
+Network namespace
+.TP
+.BR UTS_NS_INDEX = 1
+UTS namespace
+.TP
+.BR IPC_NS_INDEX = 2
+IPC namespace
+.TP
+.BR PID_NS_INDEX = 3
+PID namespace
+.TP
+.BR USER_NS_INDEX = 4
+User namespace
+.TP
+.BR MNT_NS_INDEX = 5
+Mount namespace
+.TP
+.BR CGROUP_NS_INDEX = 6
+Cgroup namespace
+.RE
+.TP
+.BR PERF_RECORD_KSYMBOL " (since Linux 5.0)"
+.\" commit 76193a94522f1d4edf2447a536f3f796ce56343b
+This record indicates kernel symbol register/unregister events.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u64 addr;
+ u32 len;
+ u16 ksym_type;
+ u16 flags;
+ char name[];
+ struct sample_id sample_id;
+};
+.EE
+.in
+.RS
+.TP
+.I addr
+is the address of the kernel symbol.
+.TP
+.I len
+is the length of the kernel symbol.
+.TP
+.I ksym_type
+is the type of the kernel symbol.
+Currently the following types are available:
+.RS
+.TP
+.B PERF_RECORD_KSYMBOL_TYPE_BPF
+The kernel symbol is a BPF function.
+.RE
+.TP
+.I flags
+If the
+.B PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER
+flag is set, then this event is for unregistering the kernel symbol.
+.RE
+.TP
+.BR PERF_RECORD_BPF_EVENT " (since Linux 5.0)"
+.\" commit 6ee52e2a3fe4ea35520720736e6791df1fb67106
+This record indicates that a BPF program has been loaded or unloaded.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u16 type;
+ u16 flags;
+ u32 id;
+ u8 tag[BPF_TAG_SIZE];
+ struct sample_id sample_id;
+};
+.EE
+.in
+.RS
+.TP
+.I type
+is one of the following values:
+.RS
+.TP
+.B PERF_BPF_EVENT_PROG_LOAD
+A BPF program is loaded.
+.TP
+.B PERF_BPF_EVENT_PROG_UNLOAD
+A BPF program is unloaded.
+.RE
+.TP
+.I id
+is the ID of the BPF program.
+.TP
+.I tag
+is the tag of the BPF program.
+Currently,
+.B BPF_TAG_SIZE
+is defined as 8.
+.RE
+.TP
+.BR PERF_RECORD_CGROUP " (since Linux 5.7)"
+.\" commit 96aaab686505c449e24d76e76507290dcc30e008
+This record indicates that a new cgroup has been created and activated.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u64 id;
+ char path[];
+ struct sample_id sample_id;
+};
+.EE
+.in
+.RS
+.TP
+.I id
+is the cgroup identifier.
+This can also be retrieved by
+.BR name_to_handle_at (2)
+on the cgroup path (as a file handle).
+.TP
+.I path
+is the path of the cgroup from the root.
+.RE
+.TP
+.BR PERF_RECORD_TEXT_POKE " (since Linux 5.8)"
+.\" commit e17d43b93e544f5016c0251d2074c15568d5d963
+This record indicates a change in the kernel text.
+This includes the addition and removal of text,
+in which case the corresponding length
+.RI ( old_len
+for an addition,
+.I new_len
+for a removal)
+is zero.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u64 addr;
+ u16 old_len;
+ u16 new_len;
+ u8 bytes[];
+ struct sample_id sample_id;
+};
+.EE
+.in
+.RS
+.TP
+.I addr
+is the address of the change.
+.TP
+.I old_len
+is the old length.
+.TP
+.I new_len
+is the new length.
+.TP
+.I bytes
+contains old bytes immediately followed by new bytes.
+.RE
+.RE
+.SS Overflow handling
+Events can be set to notify when a threshold is crossed,
+indicating an overflow.
+Overflow conditions can be captured by monitoring the
+event file descriptor with
+.BR poll (2),
+.BR select (2),
+or
+.BR epoll (7).
+Alternatively, the overflow events can be captured via a signal handler,
+by enabling I/O signaling on the file descriptor; see the discussion of the
+.B F_SETOWN
+and
+.B F_SETSIG
+operations in
+.BR fcntl (2).
+.PP
+Overflows are generated only by sampling events
+.RI ( sample_period
+must have a nonzero value).
+.PP
+There are two ways to generate overflow notifications.
+.PP
+The first is to set a
+.I wakeup_events
+or
+.I wakeup_watermark
+value that will trigger if a certain number of samples
+or bytes have been written to the mmap ring buffer.
+In this case,
+.B POLL_IN
+is indicated.
+.PP
+The other way is by use of the
+.B PERF_EVENT_IOC_REFRESH
+ioctl.
+This ioctl adds to a counter that decrements each time the event overflows.
+When nonzero,
+.B POLL_IN
+is indicated, but
+once the counter reaches 0
+.B POLL_HUP
+is indicated and
+the underlying event is disabled.
+.PP
+Refreshing an event group leader refreshes all siblings and
+refreshing with a parameter of 0 currently enables infinite
+refreshes;
+these behaviors are unsupported and should not be relied on.
+.\" See https://lkml.org/lkml/2011/5/24/337
+.PP
+Starting with Linux 3.18,
+.\" commit 179033b3e064d2cd3f5f9945e76b0a0f0fbf4883
+.B POLL_HUP
+is indicated if the event being monitored is attached to a different
+process and that process exits.
+.SS rdpmc instruction
+Starting with Linux 3.4 on x86, you can use the
+.\" commit c7206205d00ab375839bd6c7ddb247d600693c09
+.I rdpmc
+instruction to get low-latency reads without having to enter the kernel.
+Note that using
+.I rdpmc
+is not necessarily faster than other methods for reading event values.
+.PP
+Support for this can be detected with the
+.I cap_usr_rdpmc
+field in the mmap page; documentation on how
+to calculate event values can be found in that section.
+.PP
+Originally, when rdpmc support was enabled, any process (not just ones
+with an active perf event) could use the rdpmc instruction to access
+the counters.
+Starting with Linux 4.0,
+.\" 7911d3f7af14a614617e38245fedf98a724e46a9
+rdpmc support is only allowed if an event is currently enabled
+in a process's context.
+To restore the old behavior, write the value 2 to
+.IR /sys/devices/cpu/rdpmc .
+.SS perf_event ioctl calls
+Various ioctls act on
+.BR perf_event_open ()
+file descriptors:
+.TP
+.B PERF_EVENT_IOC_ENABLE
+This enables the individual event or event group specified by the
+file descriptor argument.
+.IP
+If the
+.B PERF_IOC_FLAG_GROUP
+bit is set in the ioctl argument, then all events in a group are
+enabled, even if the event specified is not the group leader
+(but see BUGS).
+.TP
+.B PERF_EVENT_IOC_DISABLE
+This disables the individual counter or event group specified by the
+file descriptor argument.
+.IP
+Enabling or disabling the leader of a group enables or disables the
+entire group; that is, while the group leader is disabled, none of the
+counters in the group will count.
+Enabling or disabling a member of a group other than the leader
+affects only that counter; disabling a non-leader
+stops that counter from counting but doesn't affect any other counter.
+.IP
+If the
+.B PERF_IOC_FLAG_GROUP
+bit is set in the ioctl argument, then all events in a group are
+disabled, even if the event specified is not the group leader
+(but see BUGS).
+.TP
+.B PERF_EVENT_IOC_REFRESH
+Non-inherited overflow counters can use this
+to enable a counter for a number of overflows specified by the argument,
+after which it is disabled.
+Subsequent calls of this ioctl add the argument value to the current
+count.
+An overflow notification with
+.B POLL_IN
+set will happen on each overflow until the
+count reaches 0; when that happens a notification with
+.B POLL_HUP
+set is sent and the event is disabled.
+Using an argument of 0 is considered undefined behavior.
+.TP
+.B PERF_EVENT_IOC_RESET
+Reset the event count specified by the
+file descriptor argument to zero.
+This resets only the counts; there is no way to reset the
+multiplexing
+.I time_enabled
+or
+.I time_running
+values.
+.IP
+If the
+.B PERF_IOC_FLAG_GROUP
+bit is set in the ioctl argument, then all events in a group are
+reset, even if the event specified is not the group leader
+(but see BUGS).
+.TP
+.B PERF_EVENT_IOC_PERIOD
+This updates the overflow period for the event.
+.IP
+Since Linux 3.7 (on ARM)
+.\" commit 3581fe0ef37ce12ac7a4f74831168352ae848edc
+and Linux 3.14 (all other architectures),
+.\" commit bad7192b842c83e580747ca57104dd51fe08c223
+the new period takes effect immediately.
+On older kernels, the new period did not take effect until
+after the next overflow.
+.IP
+The argument is a pointer to a 64-bit value containing the
+desired new period.
+.IP
+Prior to Linux 2.6.36,
+.\" commit ad0cf3478de8677f720ee06393b3147819568d6a
+this ioctl always failed due to a bug
+in the kernel.
+.TP
+.B PERF_EVENT_IOC_SET_OUTPUT
+This tells the kernel to report event notifications to the specified
+file descriptor rather than the default one.
+The file descriptors must all be on the same CPU.
+.IP
+The argument specifies the desired file descriptor, or \-1 if
+output should be ignored.
+.TP
+.BR PERF_EVENT_IOC_SET_FILTER " (since Linux 2.6.33)"
+.\" commit 6fb2915df7f0747d9044da9dbff5b46dc2e20830
+This adds an ftrace filter to this event.
+.IP
+The argument is a pointer to the desired ftrace filter.
+.TP
+.BR PERF_EVENT_IOC_ID " (since Linux 3.12)"
+.\" commit cf4957f17f2a89984915ea808876d9c82225b862
+This returns the event ID value for the given event file descriptor.
+.IP
+The argument is a pointer to a 64-bit unsigned integer
+to hold the result.
+.TP
+.BR PERF_EVENT_IOC_SET_BPF " (since Linux 4.1)"
+.\" commit 2541517c32be2531e0da59dfd7efc1ce844644f5
+This allows attaching a Berkeley Packet Filter (BPF)
+program to an existing kprobe tracepoint event.
+You need
+.B CAP_PERFMON
+(since Linux 5.8) or
+.B CAP_SYS_ADMIN
+privileges to use this ioctl.
+.IP
+The argument is a BPF program file descriptor that was created by
+a previous
+.BR bpf (2)
+system call.
+.TP
+.BR PERF_EVENT_IOC_PAUSE_OUTPUT " (since Linux 4.7)"
+.\" commit 86e7972f690c1017fd086cdfe53d8524e68c661c
+This allows pausing and resuming the event's ring-buffer.
+A paused ring-buffer does not prevent generation of samples,
+but simply discards them.
+The discarded samples are considered lost, and cause a
+.B PERF_RECORD_LOST
+sample to be generated when possible.
+An overflow signal may still be triggered by the discarded sample
+even though the ring-buffer remains empty.
+.IP
+The argument is an unsigned 32-bit integer.
+A nonzero value pauses the ring-buffer, while a
+zero value resumes the ring-buffer.
+.TP
+.BR PERF_EVENT_IOC_MODIFY_ATTRIBUTES " (since Linux 4.17)"
+.\" commit 32ff77e8cc9e66cc4fb38098f64fd54cc8f54573
+This allows modifying an existing event without the overhead
+of closing and reopening a new event.
+Currently this is supported only for breakpoint events.
+.IP
+The argument is a pointer to a
+.I perf_event_attr
+structure containing the updated event settings.
+.TP
+.BR PERF_EVENT_IOC_QUERY_BPF " (since Linux 4.16)"
+.\" commit f371b304f12e31fe30207c41ca7754564e0ea4dc
+This allows querying which Berkeley Packet Filter (BPF)
+programs are attached to an existing kprobe tracepoint.
+You can only attach one BPF program per event, but you can
+have multiple events attached to a tracepoint.
+Querying this value on one tracepoint event returns the ID
+of all BPF programs in all events attached to the tracepoint.
+You need
+.B CAP_PERFMON
+(since Linux 5.8) or
+.B CAP_SYS_ADMIN
+privileges to use this ioctl.
+.IP
+The argument is a pointer to a structure of the following form:
+.IP
+.in +4n
+.EX
+struct perf_event_query_bpf {
+ __u32 ids_len;
+ __u32 prog_cnt;
+ __u32 ids[0];
+};
+.EE
+.in
+.IP
+The
+.I ids_len
+field indicates the number of ids that can fit in the provided
+.I ids
+array.
+The
+.I prog_cnt
+value is filled in by the kernel with the number of attached
+BPF programs.
+The
+.I ids
+array is filled with the ID of each attached BPF program.
+If there are more programs than will fit in the array, then the
+kernel will return
+.B ENOSPC
+and
+.I ids_len
+will indicate the number of program IDs that were successfully copied.
+.\"
+.SS Using prctl(2)
+A process can enable or disable all currently open event groups
+using the
+.BR prctl (2)
+.B PR_TASK_PERF_EVENTS_ENABLE
+and
+.B PR_TASK_PERF_EVENTS_DISABLE
+operations.
+This applies only to events created locally by the calling process.
+This does not apply to events created by other processes attached
+to the calling process or inherited events from a parent process.
+Only group leaders are enabled and disabled,
+not any other members of the groups.
+.SS perf_event related configuration files
+Files in
+.I /proc/sys/kernel/
+.RS 4
+.TP
+.I /proc/sys/kernel/perf_event_paranoid
+The
+.I perf_event_paranoid
+file can be set to restrict access to the performance counters.
+.IP
+.PD 0
+.RS
+.TP
+.B 2
+allow only user-space measurements (default since Linux 4.6).
+.\" default changed in commit 0161028b7c8aebef64194d3d73e43bc3b53b5c66
+.TP
+.B 1
+allow both kernel and user measurements (default before Linux 4.6).
+.TP
+.B 0
+allow access to CPU-specific data but not raw tracepoint samples.
+.TP
+.B \-1
+no restrictions.
+.RE
+.PD
+.IP
+The existence of the
+.I perf_event_paranoid
+file is the official method for determining if a kernel supports
+.BR perf_event_open ().
+.TP
+.I /proc/sys/kernel/perf_event_max_sample_rate
+This sets the maximum sample rate.
+Setting this too high can allow
+users to sample at a rate that impacts overall machine performance
+and potentially lock up the machine.
+The default value is
+100000 (samples per second).
+.TP
+.I /proc/sys/kernel/perf_event_max_stack
+.\" Introduced in c5dfd78eb79851e278b7973031b9ca363da87a7e
+This file sets the maximum depth of stack frame entries reported
+when generating a call trace.
+.TP
+.I /proc/sys/kernel/perf_event_mlock_kb
+Maximum amount of memory (in kilobytes) that an unprivileged user can
+.BR mlock (2).
+The default is 516 (kB).
+.RE
+.PP
+Files in
+.I /sys/bus/event_source/devices/
+.PP
+.RS 4
+Since Linux 2.6.34, the kernel supports having multiple PMUs
+available for monitoring.
+Information on how to program these PMUs can be found under
+.IR /sys/bus/event_source/devices/ .
+Each subdirectory corresponds to a different PMU.
+.TP
+.IR /sys/bus/event_source/devices/*/type " (since Linux 2.6.38)"
+.\" commit abe43400579d5de0078c2d3a760e6598e183f871
+This contains an integer that can be used in the
+.I type
+field of
+.I perf_event_attr
+to indicate that you wish to use this PMU.
+.TP
+.IR /sys/bus/event_source/devices/cpu/rdpmc " (since Linux 3.4)"
+.\" commit 0c9d42ed4cee2aa1dfc3a260b741baae8615744f
+If this file is 1, then direct user-space access to the
+performance counter registers is allowed via the rdpmc instruction.
+This can be disabled by echoing 0 to the file.
+.IP
+As of Linux 4.0
+.\" a66734297f78707ce39d756b656bfae861d53f62
+.\" 7911d3f7af14a614617e38245fedf98a724e46a9
+the behavior has changed, so that 1 now means only allow access
+to processes with active perf events, with 2 indicating the old
+allow-anyone-access behavior.
+.TP
+.IR /sys/bus/event_source/devices/*/format/ " (since Linux 3.4)"
+.\" commit 641cc938815dfd09f8fa1ec72deb814f0938ac33
+This subdirectory contains information on the architecture-specific
+subfields available for programming the various
+.I config
+fields in the
+.I perf_event_attr
+struct.
+.IP
+The content of each file is the name of the config field, followed
+by a colon, followed by a series of integer bit ranges separated by
+commas.
+For example, the file
+.I event
+may contain the value
+.I config1:1,6\-10,44
+which indicates that event is an attribute that occupies bits 1, 6\[en]10, and 44
+of
+.IR perf_event_attr::config1 .
+.TP
+.IR /sys/bus/event_source/devices/*/events/ " (since Linux 3.4)"
+.\" commit 641cc938815dfd09f8fa1ec72deb814f0938ac33
+This subdirectory contains files with predefined events.
+The contents are strings describing the event settings
+expressed in terms of the fields found in the previously mentioned
+.I ./format/
+directory.
+These are not necessarily complete lists of all events supported by
+a PMU, but usually a subset of events deemed useful or interesting.
+.IP
+The content of each file is a list of attribute names
+separated by commas.
+Each entry has an optional value (either hex or decimal).
+If no value is specified, then it is assumed to be a single-bit
+field with a value of 1.
+An example entry may look like this:
+.IR event=0x2,inv,ldlat=3 .
+.TP
+.I /sys/bus/event_source/devices/*/uevent
+This file is the standard kernel device interface
+for injecting hotplug events.
+.TP
+.IR /sys/bus/event_source/devices/*/cpumask " (since Linux 3.7)"
+.\" commit 314d9f63f385096580e9e2a06eaa0745d92fe4ac
+The
+.I cpumask
+file contains a comma-separated list of integers that
+indicate a representative CPU number for each socket (package)
+on the motherboard.
+This is needed when setting up uncore or northbridge events, as
+those PMUs present socket-wide events.
+.RE
+.SH RETURN VALUE
+On success,
+.BR perf_event_open ()
+returns the new file descriptor.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+The errors returned by
+.BR perf_event_open ()
+can be inconsistent, and may
+vary across processor architectures and performance monitoring units.
+.TP
+.B E2BIG
+Returned if the
+.I perf_event_attr
+.I size
+value is too small
+(smaller than
+.BR PERF_ATTR_SIZE_VER0 ),
+too big (larger than the page size),
+or larger than the kernel supports and the extra bytes are not zero.
+When
+.B E2BIG
+is returned, the
+.I perf_event_attr
+.I size
+field is overwritten by the kernel to be the size of the structure
+it was expecting.
+.TP
+.B EACCES
+Returned when the requested event requires
+.B CAP_PERFMON
+(since Linux 5.8) or
+.B CAP_SYS_ADMIN
+permissions (or a more permissive perf_event paranoid setting).
+Some common cases where an unprivileged process
+may encounter this error:
+attaching to a process owned by a different user;
+monitoring all processes on a given CPU (i.e., specifying the
+.I pid
+argument as \-1);
+and not setting
+.I exclude_kernel
+when the paranoid setting requires it.
+.TP
+.B EBADF
+Returned if the
+.I group_fd
+file descriptor is not valid, or, if
+.B PERF_FLAG_PID_CGROUP
+is set,
+the cgroup file descriptor in
+.I pid
+is not valid.
+.TP
+.BR EBUSY " (since Linux 4.1)"
+.\" bed5b25ad9c8a2f5d735ef0bc746ec870c01c1b0
+Returned if another event already has exclusive
+access to the PMU.
+.TP
+.B EFAULT
+Returned if the
+.I attr
+pointer points at an invalid memory address.
+.TP
+.B EINTR
+Returned when trying to mix perf and ftrace handling
+for a uprobe.
+.TP
+.B EINVAL
+Returned if the specified event is invalid.
+There are many possible reasons for this.
+A non-exhaustive list:
+.I sample_freq
+is higher than the maximum setting;
+the
+.I cpu
+to monitor does not exist;
+.I read_format
+is out of range;
+.I sample_type
+is out of range;
+the
+.I flags
+value is out of range;
+.I exclusive
+or
+.I pinned
+set and the event is not a group leader;
+the event
+.I config
+values are out of range or set reserved bits;
+the generic event selected is not supported; or
+there is not enough room to add the selected event.
+.TP
+.B EMFILE
+Each opened event uses one file descriptor.
+If a large number of events are opened,
+the per-process limit on the number of open file descriptors will be reached,
+and no more events can be created.
+.TP
+.B ENODEV
+Returned when the event involves a feature not supported
+by the current CPU.
+.TP
+.B ENOENT
+Returned if the
+.I type
+setting is not valid.
+This error is also returned for
+some unsupported generic events.
+.TP
+.B ENOSPC
+Prior to Linux 3.3, if there was not enough room for the event,
+.\" commit aa2bc1ade59003a379ffc485d6da2d92ea3370a6
+.B ENOSPC
+was returned.
+In Linux 3.3, this was changed to
+.BR EINVAL .
+.B ENOSPC
+is still returned if you try to add more breakpoint events
+than supported by the hardware.
+.TP
+.B ENOSYS
+Returned if
+.B PERF_SAMPLE_STACK_USER
+is set in
+.I sample_type
+and it is not supported by hardware.
+.TP
+.B EOPNOTSUPP
+Returned if an event requiring a specific hardware feature is
+requested but there is no hardware support.
+This includes requesting low-skid events if not supported,
+branch tracing if it is not available, sampling if no PMU
+interrupt is available, and branch stacks for software events.
+.TP
+.BR EOVERFLOW " (since Linux 4.8)"
+.\" 97c79a38cd454602645f0470ffb444b3b75ce574
+Returned if
+.B PERF_SAMPLE_CALLCHAIN
+is requested and
+.I sample_max_stack
+is larger than the maximum specified in
+.IR /proc/sys/kernel/perf_event_max_stack .
+.TP
+.B EPERM
+Returned on many (but not all) architectures when an unsupported
+.IR exclude_hv ", " exclude_idle ", " exclude_user ", or " exclude_kernel
+setting is specified.
+.IP
+It can also happen, as with
+.BR EACCES ,
+when the requested event requires
+.B CAP_PERFMON
+(since Linux 5.8) or
+.B CAP_SYS_ADMIN
+permissions (or a more permissive perf_event paranoid setting).
+This includes setting a breakpoint on a kernel address,
+and (since Linux 3.13) setting a kernel function-trace tracepoint.
+.\" commit a4e95fc2cbb31d70a65beffeaf8773f881328c34
+.TP
+.B ESRCH
+Returned if attempting to attach to a process that does not exist.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+.BR perf_event_open ()
+was introduced in Linux 2.6.31 but was called
+.\" commit 0793a61d4df8daeac6492dbf8d2f3e5713caae5e
+.BR perf_counter_open ().
+It was renamed in Linux 2.6.32.
+.\" commit cdd6c482c9ff9c55475ee7392ec8f672eddb7be6
+.SH NOTES
+The official way of knowing if
+.BR perf_event_open ()
+support is enabled is checking
+for the existence of the file
+.IR /proc/sys/kernel/perf_event_paranoid .
+.PP
+The
+.B CAP_PERFMON
+capability (since Linux 5.8) provides a secure approach to
+performance monitoring and observability operations in a system
+according to the principle of least privilege (POSIX IEEE 1003.1e).
+Accessing system performance monitoring and observability operations
+using
+.B CAP_PERFMON
+rather than the much more powerful
+.B CAP_SYS_ADMIN
+removes opportunities to misuse credentials and makes operations more secure.
+.B CAP_SYS_ADMIN
+usage for secure system performance monitoring and observability
+is discouraged in favor of the
+.B CAP_PERFMON
+capability.
+.SH BUGS
+The
+.B F_SETOWN_EX
+option to
+.BR fcntl (2)
+is needed to properly get overflow signals in threads.
+This was introduced in Linux 2.6.32.
+.\" commit ba0a6c9f6fceed11c6a99e8326f0477fe383e6b5
+.PP
+Prior to Linux 2.6.33 (at least for x86),
+.\" commit b690081d4d3f6a23541493f1682835c3cd5c54a1
+the kernel did not check
+if events could be scheduled together until read time.
+The same happens on all known kernels if the NMI watchdog is enabled.
+This means that, to see if a given set of events works, you have to call
+.BR perf_event_open (),
+start the events, and then read them before you know for sure that you
+can get valid measurements.
+.PP
+Prior to Linux 2.6.34,
+.\" FIXME . cannot find a kernel commit for this one
+event constraints were not enforced by the kernel.
+In that case, some events would silently return "0" if the kernel
+scheduled them in an improper counter slot.
+.PP
+Prior to Linux 2.6.34, there was a bug when multiplexing where the
+wrong results could be returned.
+.\" commit 45e16a6834b6af098702e5ea6c9a40de42ff77d8
+.PP
+From Linux 2.6.35 to Linux 2.6.39, the kernel can quickly crash if
+"inherit" is enabled and many threads are started.
+.\" commit 38b435b16c36b0d863efcf3f07b34a6fac9873fd
+.PP
+Prior to Linux 2.6.35,
+.\" commit 050735b08ca8a016bbace4445fa025b88fee770b
+.B PERF_FORMAT_GROUP
+did not work with attached processes.
+.PP
+There is a bug in the kernel code between
+Linux 2.6.36 and Linux 3.0 that ignores the
+"watermark" field and acts as if a wakeup_event
+was chosen if the union has a
+nonzero value in it.
+.\" commit 4ec8363dfc1451f8c8f86825731fe712798ada02
+.PP
+From Linux 2.6.31 to Linux 3.4, the
+.B PERF_IOC_FLAG_GROUP
+ioctl argument was broken and would repeatedly operate
+on the event specified rather than iterating across
+all sibling events in a group.
+.\" commit 724b6daa13e100067c30cfc4d1ad06629609dc4e
+.PP
+From Linux 3.4 to Linux 3.11, the mmap
+.\" commit fa7315871046b9a4c48627905691dbde57e51033
+.I cap_usr_rdpmc
+and
+.I cap_usr_time
+bits mapped to the same location.
+Code should migrate to the new
+.I cap_user_rdpmc
+and
+.I cap_user_time
+fields instead.
+.PP
+Always double-check your results!
+Various generalized events have had wrong values.
+For example, retired branches measured
+the wrong thing on AMD machines until Linux 2.6.35.
+.\" commit f287d332ce835f77a4f5077d2c0ef1e3f9ea42d2
+.SH EXAMPLES
+The following is a short example that measures the total
+instruction count of a call to
+.BR printf (3).
+.PP
+.\" SRC BEGIN (perf_event_open.c)
+.EX
+#include <linux/perf_event.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+\&
+static long
+perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
+ int cpu, int group_fd, unsigned long flags)
+{
+ int ret;
+\&
+ ret = syscall(SYS_perf_event_open, hw_event, pid, cpu,
+ group_fd, flags);
+ return ret;
+}
+\&
+int
+main(void)
+{
+ int fd;
+ long long count;
+ struct perf_event_attr pe;
+\&
+ memset(&pe, 0, sizeof(pe));
+ pe.type = PERF_TYPE_HARDWARE;
+ pe.size = sizeof(pe);
+ pe.config = PERF_COUNT_HW_INSTRUCTIONS;
+ pe.disabled = 1;
+ pe.exclude_kernel = 1;
+ pe.exclude_hv = 1;
+\&
+ fd = perf_event_open(&pe, 0, \-1, \-1, 0);
+ if (fd == \-1) {
+ fprintf(stderr, "Error opening leader %llx\en", pe.config);
+ exit(EXIT_FAILURE);
+ }
+\&
+ ioctl(fd, PERF_EVENT_IOC_RESET, 0);
+ ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
+\&
+ printf("Measuring instruction count for this printf\en");
+\&
+ ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
+ read(fd, &count, sizeof(count));
+\&
+ printf("Used %lld instructions\en", count);
+\&
+ close(fd);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR perf (1),
+.BR fcntl (2),
+.BR mmap (2),
+.BR open (2),
+.BR prctl (2),
+.BR read (2)
+.PP
+.I Documentation/admin\-guide/perf\-security.rst
+in the kernel source tree