Adding upstream version 6.1.76.upstream/6.1.76 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 18:49:45 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 18:49:45 +0000
commit: 2c3c1048746a4622d8c89a29670120dc8fab93c4 (patch)
tree: 848558de17fb3008cdf4d861b01ac7781903ce39 /tools/perf/Documentation
parent: Initial commit. (diff)
download: linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.tar.xz
linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.zip
70 files changed, 14110 insertions, 0 deletions
diff --git a/tools/perf/Documentation/Build.txt b/tools/perf/Documentation/Build.txt
new file mode 100644
index 000000000..3766886c4
--- /dev/null
+++ b/tools/perf/Documentation/Build.txt
@@ -0,0 +1,73 @@
+
+1) perf build
+=============
+The perf build process consists of several separated building blocks,
+which are linked together to form the perf binary:
+  - libperf library (static)
+  - perf builtin commands
+  - traceevent library (static)
+  - GTK ui library
+
+Several makefiles govern the perf build:
+
+  - Makefile
+    top level Makefile working as a wrapper that calls the main
+    Makefile.perf with a -j option to do parallel builds.
+
+  - Makefile.perf
+    main makefile that triggers build of all perf objects including
+    installation and documentation processing.
+
+  - tools/build/Makefile.build
+    main makefile of the build framework
+
+  - tools/build/Build.include
+    build framework generic definitions
+
+  - Build makefiles
+    makefiles that defines build objects
+
+Please refer to tools/build/Documentation/Build.txt for more
+information about build framework.
+
+
+2) perf build
+=============
+The Makefile.perf triggers the build framework for build objects:
+   perf, libperf, gtk
+
+resulting in following objects:
+  $ ls  *-in.o
+  gtk-in.o  libperf-in.o  perf-in.o
+
+Those objects are then used in final linking:
+  libperf-gtk.so <- gtk-in.o  libperf-in.o
+  perf           <- perf-in.o libperf-in.o
+
+
+NOTE this description is omitting other libraries involved, only
+     focusing on build framework outcomes
+
+3) Build with ASan or UBSan
+==========================
+  $ cd tools/perf
+  $ make DESTDIR=/usr
+  $ make DESTDIR=/usr install
+
+AddressSanitizer (or ASan) is a GCC feature that detects memory corruption bugs
+such as buffer overflows and memory leaks.
+
+  $ cd tools/perf
+  $ make DEBUG=1 EXTRA_CFLAGS='-fno-omit-frame-pointer -fsanitize=address'
+  $ ASAN_OPTIONS=log_path=asan.log ./perf record -a
+
+ASan outputs all detected issues into a log file named 'asan.log.<pid>'.
+
+UndefinedBehaviorSanitizer (or UBSan) is a fast undefined behavior detector
+supported by GCC. UBSan detects undefined behaviors of programs at runtime.
+
+  $ cd tools/perf
+  $ make DEBUG=1 EXTRA_CFLAGS='-fno-omit-frame-pointer -fsanitize=undefined'
+  $ UBSAN_OPTIONS=print_stacktrace=1 ./perf record -a
+
+If UBSan detects any problem at runtime, it outputs a “runtime error:” message.
diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile
new file mode 100644
index 000000000..6e7b88917
--- /dev/null
+++ b/tools/perf/Documentation/Makefile
@@ -0,0 +1,308 @@
+# SPDX-License-Identifier: GPL-2.0-only
+include ../../scripts/Makefile.include
+include ../../scripts/utilities.mak
+
+ARTICLES =
+# with their own formatting rules.
+SP_ARTICLES =
+
+MAN1_TXT= \
+	$(filter-out $(addsuffix .txt, $(ARTICLES) $(SP_ARTICLES)), \
+		$(wildcard perf-*.txt)) \
+	perf.txt
+MAN5_TXT=
+MAN7_TXT=
+
+MAN_TXT = $(MAN1_TXT) $(MAN5_TXT) $(MAN7_TXT)
+_MAN_XML=$(patsubst %.txt,%.xml,$(MAN_TXT))
+_MAN_HTML=$(patsubst %.txt,%.html,$(MAN_TXT))
+
+MAN_XML=$(addprefix $(OUTPUT),$(_MAN_XML))
+MAN_HTML=$(addprefix $(OUTPUT),$(_MAN_HTML))
+
+_DOC_HTML = $(_MAN_HTML)
+_DOC_HTML+=$(patsubst %,%.html,$(ARTICLES) $(SP_ARTICLES))
+DOC_HTML=$(addprefix $(OUTPUT),$(_DOC_HTML))
+
+_DOC_MAN1=$(patsubst %.txt,%.1,$(MAN1_TXT))
+_DOC_MAN5=$(patsubst %.txt,%.5,$(MAN5_TXT))
+_DOC_MAN7=$(patsubst %.txt,%.7,$(MAN7_TXT))
+
+DOC_MAN1=$(addprefix $(OUTPUT),$(_DOC_MAN1))
+DOC_MAN5=$(addprefix $(OUTPUT),$(_DOC_MAN5))
+DOC_MAN7=$(addprefix $(OUTPUT),$(_DOC_MAN7))
+
+# Make the path relative to DESTDIR, not prefix
+ifndef DESTDIR
+prefix?=$(HOME)
+endif
+bindir?=$(prefix)/bin
+htmldir?=$(prefix)/share/doc/perf-doc
+pdfdir?=$(prefix)/share/doc/perf-doc
+mandir?=$(prefix)/share/man
+man1dir=$(mandir)/man1
+man5dir=$(mandir)/man5
+man7dir=$(mandir)/man7
+
+ASCIIDOC=asciidoc
+ASCIIDOC_EXTRA += --unsafe -f asciidoc.conf
+ASCIIDOC_HTML = xhtml11
+MANPAGE_XSL = manpage-normal.xsl
+XMLTO_EXTRA =
+INSTALL?=install
+RM ?= rm -f
+DOC_REF = origin/man
+HTML_REF = origin/html
+
+ifdef USE_ASCIIDOCTOR
+ASCIIDOC = asciidoctor
+ASCIIDOC_EXTRA += -a compat-mode
+ASCIIDOC_EXTRA += -I. -rasciidoctor-extensions
+ASCIIDOC_EXTRA += -a mansource="perf" -a manmanual="perf Manual"
+ASCIIDOC_HTML = xhtml5
+endif
+
+infodir?=$(prefix)/share/info
+MAKEINFO=makeinfo
+INSTALL_INFO=install-info
+DOCBOOK2X_TEXI=docbook2x-texi
+DBLATEX=dblatex
+XMLTO=xmlto
+ifndef PERL_PATH
+	PERL_PATH = /usr/bin/perl
+endif
+
+-include ../config.mak.autogen
+-include ../config.mak
+
+_tmp_tool_path := $(call get-executable,$(ASCIIDOC))
+ifeq ($(_tmp_tool_path),)
+	missing_tools = $(ASCIIDOC)
+endif
+
+ifndef USE_ASCIIDOCTOR
+_tmp_tool_path := $(call get-executable,$(XMLTO))
+ifeq ($(_tmp_tool_path),)
+	missing_tools += $(XMLTO)
+endif
+endif
+
+#
+# For asciidoc ...
+#	-7.1.2,	no extra settings are needed.
+#	8.0-,	set ASCIIDOC8.
+#
+
+#
+# For docbook-xsl ...
+#	-1.68.1,	set ASCIIDOC_NO_ROFF? (based on changelog from 1.73.0)
+#	1.69.0,		no extra settings are needed?
+#	1.69.1-1.71.0,	set DOCBOOK_SUPPRESS_SP?
+#	1.71.1,		no extra settings are needed?
+#	1.72.0,		set DOCBOOK_XSL_172.
+#	1.73.0-,	set ASCIIDOC_NO_ROFF
+#
+
+#
+# If you had been using DOCBOOK_XSL_172 in an attempt to get rid
+# of 'the ".ft C" problem' in your generated manpages, and you
+# instead ended up with weird characters around callouts, try
+# using ASCIIDOC_NO_ROFF instead (it works fine with ASCIIDOC8).
+#
+
+ifdef ASCIIDOC8
+ASCIIDOC_EXTRA += -a asciidoc7compatible
+endif
+ifdef DOCBOOK_XSL_172
+ASCIIDOC_EXTRA += -a perf-asciidoc-no-roff
+MANPAGE_XSL = manpage-1.72.xsl
+else
+	ifdef ASCIIDOC_NO_ROFF
+	# docbook-xsl after 1.72 needs the regular XSL, but will not
+	# pass-thru raw roff codes from asciidoc.conf, so turn them off.
+	ASCIIDOC_EXTRA += -a perf-asciidoc-no-roff
+	endif
+endif
+ifdef MAN_BOLD_LITERAL
+XMLTO_EXTRA += -m manpage-bold-literal.xsl
+endif
+ifdef DOCBOOK_SUPPRESS_SP
+XMLTO_EXTRA += -m manpage-suppress-sp.xsl
+endif
+
+SHELL_PATH ?= $(SHELL)
+# Shell quote;
+SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH))
+
+#
+# Please note that there is a minor bug in asciidoc.
+# The version after 6.0.3 _will_ include the patch found here:
+#   http://marc.theaimsgroup.com/?l=perf&m=111558757202243&w=2
+#
+# Until that version is released you may have to apply the patch
+# yourself - yes, all 6 characters of it!
+#
+
+QUIET_SUBDIR0  = +$(MAKE) -C # space to separate -C and subdir
+QUIET_SUBDIR1  =
+
+ifneq ($(findstring $(MAKEFLAGS),w),w)
+PRINT_DIR = --no-print-directory
+else # "make -w"
+NO_SUBDIR = :
+endif
+
+ifneq ($(findstring $(MAKEFLAGS),s),s)
+ifneq ($(V),1)
+	QUIET_ASCIIDOC	= @echo '  ASCIIDOC '$@;
+	QUIET_XMLTO	= @echo '  XMLTO    '$@;
+	QUIET_DB2TEXI	= @echo '  DB2TEXI  '$@;
+	QUIET_MAKEINFO	= @echo '  MAKEINFO '$@;
+	QUIET_DBLATEX	= @echo '  DBLATEX  '$@;
+	QUIET_XSLTPROC	= @echo '  XSLTPROC '$@;
+	QUIET_GEN	= @echo '  GEN      '$@;
+	QUIET_STDERR	= 2> /dev/null
+	QUIET_SUBDIR0	= +@subdir=
+	QUIET_SUBDIR1	= ;$(NO_SUBDIR) \
+			   echo '  SUBDIR   ' $$subdir; \
+			  $(MAKE) $(PRINT_DIR) -C $$subdir
+	export V
+endif
+endif
+
+all: html man info
+
+html: $(DOC_HTML)
+
+$(DOC_HTML) $(DOC_MAN1) $(DOC_MAN5) $(DOC_MAN7): asciidoc.conf
+
+man: man1 man5 man7
+man1: $(DOC_MAN1)
+man5: $(DOC_MAN5)
+man7: $(DOC_MAN7)
+
+info: $(OUTPUT)perf.info $(OUTPUT)perfman.info
+
+install: install-man
+
+check-man-tools:
+ifdef missing_tools
+	$(error "You need to install $(missing_tools) for man pages")
+endif
+
+do-install-man: man
+	$(call QUIET_INSTALL, Documentation-man) \
+		$(INSTALL) -d -m 755 $(DESTDIR)$(man1dir); \
+#		$(INSTALL) -d -m 755 $(DESTDIR)$(man5dir); \
+#		$(INSTALL) -d -m 755 $(DESTDIR)$(man7dir); \
+		$(INSTALL) -m 644 $(DOC_MAN1) $(DESTDIR)$(man1dir); \
+#		$(INSTALL) -m 644 $(DOC_MAN5) $(DESTDIR)$(man5dir); \
+#		$(INSTALL) -m 644 $(DOC_MAN7) $(DESTDIR)$(man7dir)
+
+install-man: check-man-tools man do-install-man
+
+ifdef missing_tools
+  DO_INSTALL_MAN = $(warning Please install $(missing_tools) to have the man pages installed)
+else
+  DO_INSTALL_MAN = do-install-man
+endif
+
+try-install-man: $(DO_INSTALL_MAN)
+
+install-info: info
+	$(call QUIET_INSTALL, Documentation-info) \
+		$(INSTALL) -d -m 755 $(DESTDIR)$(infodir); \
+		$(INSTALL) -m 644 $(OUTPUT)perf.info $(OUTPUT)perfman.info $(DESTDIR)$(infodir); \
+	if test -r $(DESTDIR)$(infodir)/dir; then \
+		$(INSTALL_INFO) --info-dir=$(DESTDIR)$(infodir) perf.info ;\
+		$(INSTALL_INFO) --info-dir=$(DESTDIR)$(infodir) perfman.info ;\
+	else \
+	  echo "No directory found in $(DESTDIR)$(infodir)" >&2 ; \
+	fi
+
+#install-html: html
+#	'$(SHELL_PATH_SQ)' ./install-webdoc.sh $(DESTDIR)$(htmldir)
+
+
+#
+# Determine "include::" file references in asciidoc files.
+#
+$(OUTPUT)doc.dep : $(wildcard *.txt) build-docdep.perl
+	$(QUIET_GEN)$(RM) $@+ $@ && \
+	$(PERL_PATH) ./build-docdep.perl >$@+ $(QUIET_STDERR) && \
+	mv $@+ $@
+
+-include $(OUTPUT)doc.dep
+
+CLEAN_FILES =									\
+	$(MAN_XML) $(addsuffix +,$(MAN_XML))					\
+	$(MAN_HTML) $(addsuffix +,$(MAN_HTML))					\
+	$(DOC_HTML) $(DOC_MAN1) $(DOC_MAN5) $(DOC_MAN7)				\
+	$(OUTPUT)*.texi $(OUTPUT)*.texi+ $(OUTPUT)*.texi++			\
+	$(OUTPUT)perf.info $(OUTPUT)perfman.info $(OUTPUT)doc.dep		\
+	$(OUTPUT)technical/api-*.html $(OUTPUT)technical/api-index.txt
+clean:
+	$(call QUIET_CLEAN, Documentation) $(RM) $(CLEAN_FILES)
+
+$(MAN_HTML): $(OUTPUT)%.html : %.txt
+	$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
+	$(ASCIIDOC) -b $(ASCIIDOC_HTML) -d manpage \
+		$(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \
+	mv $@+ $@
+
+ifdef USE_ASCIIDOCTOR
+$(OUTPUT)%.1 $(OUTPUT)%.5 $(OUTPUT)%.7 : %.txt
+	$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
+	$(ASCIIDOC) -b manpage -d manpage \
+		$(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \
+	mv $@+ $@
+endif
+
+$(OUTPUT)%.1 $(OUTPUT)%.5 $(OUTPUT)%.7 : $(OUTPUT)%.xml
+	$(QUIET_XMLTO)$(RM) $@ && \
+	$(XMLTO) -o $(OUTPUT). -m $(MANPAGE_XSL) $(XMLTO_EXTRA) man $<
+
+$(OUTPUT)%.xml : %.txt
+	$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
+	$(ASCIIDOC) -b docbook -d manpage \
+		$(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) \
+		-aperf_date=$(shell git log -1 --pretty="format:%cd" \
+				--date=short $<) \
+		-o $@+ $< && \
+	mv $@+ $@
+
+XSLT = docbook.xsl
+XSLTOPTS = --xinclude --stringparam html.stylesheet docbook-xsl.css
+
+$(OUTPUT)perfman.texi: $(MAN_XML) cat-texi.perl
+	$(QUIET_DB2TEXI)$(RM) $@+ $@ && \
+	($(foreach xml,$(MAN_XML),$(DOCBOOK2X_TEXI) --encoding=UTF-8 \
+		--to-stdout $(xml) &&) true) > $@++ && \
+	$(PERL_PATH) cat-texi.perl $@ <$@++ >$@+ && \
+	rm $@++ && \
+	mv $@+ $@
+
+$(OUTPUT)perfman.info: $(OUTPUT)perfman.texi
+	$(QUIET_MAKEINFO)$(MAKEINFO) --no-split --no-validate -o $@ $*.texi
+
+$(patsubst %.txt,%.texi,$(MAN_TXT)): %.texi : %.xml
+	$(QUIET_DB2TEXI)$(RM) $@+ $@ && \
+	$(DOCBOOK2X_TEXI) --to-stdout $*.xml >$@+ && \
+	mv $@+ $@
+
+$(patsubst %,%.html,$(ARTICLES)) : %.html : %.txt
+	$(QUIET_ASCIIDOC)$(ASCIIDOC) -b $(ASCIIDOC_HTML) $*.txt
+
+WEBDOC_DEST = /pub/software/tools/perf/docs
+
+# UNIMPLEMENTED
+#install-webdoc : html
+#	'$(SHELL_PATH_SQ)' ./install-webdoc.sh $(WEBDOC_DEST)
+
+# quick-install: quick-install-man
+
+# quick-install-man:
+#	'$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(DOC_REF) $(DESTDIR)$(mandir)
+
+#quick-install-html:
+#	'$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(HTML_REF) $(DESTDIR)$(htmldir)
diff --git a/tools/perf/Documentation/android.txt b/tools/perf/Documentation/android.txt
new file mode 100644
index 000000000..24a59998f
--- /dev/null
+++ b/tools/perf/Documentation/android.txt
@@ -0,0 +1,78 @@
+How to compile perf for Android
+=========================================
+
+I. Set the Android NDK environment
+------------------------------------------------
+
+(a). Use the Android NDK
+------------------------------------------------
+1. You need to download and install the Android Native Development Kit (NDK).
+Set the NDK variable to point to the path where you installed the NDK:
+  export NDK=/path/to/android-ndk
+
+2. Set cross-compiling environment variables for NDK toolchain and sysroot.
+For arm:
+  export NDK_TOOLCHAIN=${NDK}/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-
+  export NDK_SYSROOT=${NDK}/platforms/android-24/arch-arm
+For x86:
+  export NDK_TOOLCHAIN=${NDK}/toolchains/x86-4.9/prebuilt/linux-x86_64/bin/i686-linux-android-
+  export NDK_SYSROOT=${NDK}/platforms/android-24/arch-x86
+
+This method is only tested for Android NDK versions Revision 11b and later.
+perf uses some bionic enhancements that are not included in prior NDK versions.
+You can use method (b) described below instead.
+
+(b). Use the Android source tree
+-----------------------------------------------
+1. Download the master branch of the Android source tree.
+Set the environment for the target you want using:
+  source build/envsetup.sh
+  lunch
+
+2. Build your own NDK sysroot to contain latest bionic changes and set the
+NDK sysroot environment variable.
+  cd ${ANDROID_BUILD_TOP}/ndk
+For arm:
+  ./build/tools/build-ndk-sysroot.sh --abi=arm
+  export NDK_SYSROOT=${ANDROID_BUILD_TOP}/ndk/build/platforms/android-3/arch-arm
+For x86:
+  ./build/tools/build-ndk-sysroot.sh --abi=x86
+  export NDK_SYSROOT=${ANDROID_BUILD_TOP}/ndk/build/platforms/android-3/arch-x86
+
+3. Set the NDK toolchain environment variable.
+For arm:
+  export NDK_TOOLCHAIN=${ANDROID_TOOLCHAIN}/arm-linux-androideabi-
+For x86:
+  export NDK_TOOLCHAIN=${ANDROID_TOOLCHAIN}/i686-linux-android-
+
+II. Compile perf for Android
+------------------------------------------------
+You need to run make with the NDK toolchain and sysroot defined above:
+For arm:
+  make WERROR=0 ARCH=arm CROSS_COMPILE=${NDK_TOOLCHAIN} EXTRA_CFLAGS="-pie --sysroot=${NDK_SYSROOT}"
+For x86:
+  make WERROR=0 ARCH=x86 CROSS_COMPILE=${NDK_TOOLCHAIN} EXTRA_CFLAGS="-pie --sysroot=${NDK_SYSROOT}"
+
+III. Install perf
+-----------------------------------------------
+You need to connect to your Android device/emulator using adb.
+Install perf using:
+  adb push perf /data/perf
+
+If you also want to use perf-archive you need busybox tools for Android.
+For installing perf-archive, you first need to replace #!/bin/bash with #!/system/bin/sh:
+  sed 's/#!\/bin\/bash/#!\/system\/bin\/sh/g' perf-archive >> /tmp/perf-archive
+  chmod +x /tmp/perf-archive
+  adb push /tmp/perf-archive /data/perf-archive
+
+IV. Environment settings for running perf
+------------------------------------------------
+Some perf features need environment variables to run properly.
+You need to set these before running perf on the target:
+  adb shell
+  # PERF_PAGER=cat
+
+IV. Run perf
+------------------------------------------------
+Run perf on your device/emulator to which you previously connected using adb:
+  # ./data/perf
diff --git a/tools/perf/Documentation/arm-coresight.txt b/tools/perf/Documentation/arm-coresight.txt
new file mode 100644
index 000000000..c117fc50a
--- /dev/null
+++ b/tools/perf/Documentation/arm-coresight.txt
@@ -0,0 +1,5 @@
+Arm CoreSight Support
+=====================
+
+For full documentation, see Documentation/trace/coresight/coresight-perf.rst
+in the kernel tree.
diff --git a/tools/perf/Documentation/asciidoc.conf b/tools/perf/Documentation/asciidoc.conf
new file mode 100644
index 000000000..2b62ba1e7
--- /dev/null
+++ b/tools/perf/Documentation/asciidoc.conf
@@ -0,0 +1,94 @@
+## linkperf: macro
+#
+# Usage: linkperf:command[manpage-section]
+#
+# Note, {0} is the manpage section, while {target} is the command.
+#
+# Show PERF link as: <command>(<section>); if section is defined, else just show
+# the command.
+
+[macros]
+(?su)[\\]?(?P<name>linkperf):(?P<target>\S*?)\[(?P<attrlist>.*?)\]=
+
+[attributes]
+asterisk=&#42;
+plus=&#43;
+caret=&#94;
+startsb=&#91;
+endsb=&#93;
+tilde=&#126;
+
+ifdef::backend-docbook[]
+[linkperf-inlinemacro]
+{0%{target}}
+{0#<citerefentry>}
+{0#<refentrytitle>{target}</refentrytitle><manvolnum>{0}</manvolnum>}
+{0#</citerefentry>}
+endif::backend-docbook[]
+
+ifdef::backend-docbook[]
+ifndef::perf-asciidoc-no-roff[]
+# "unbreak" docbook-xsl v1.68 for manpages. v1.69 works with or without this.
+# v1.72 breaks with this because it replaces dots not in roff requests.
+[listingblock]
+<example><title>{title}</title>
+<literallayout>
+ifdef::doctype-manpage[]
+&#10;.ft C&#10;
+endif::doctype-manpage[]
+|
+ifdef::doctype-manpage[]
+&#10;.ft&#10;
+endif::doctype-manpage[]
+</literallayout>
+{title#}</example>
+endif::perf-asciidoc-no-roff[]
+
+ifdef::perf-asciidoc-no-roff[]
+ifdef::doctype-manpage[]
+# The following two small workarounds insert a simple paragraph after screen
+[listingblock]
+<example><title>{title}</title>
+<literallayout>
+|
+</literallayout><simpara></simpara>
+{title#}</example>
+
+[verseblock]
+<formalpara{id? id="{id}"}><title>{title}</title><para>
+{title%}<literallayout{id? id="{id}"}>
+{title#}<literallayout>
+|
+</literallayout>
+{title#}</para></formalpara>
+{title%}<simpara></simpara>
+endif::doctype-manpage[]
+endif::perf-asciidoc-no-roff[]
+endif::backend-docbook[]
+
+ifdef::doctype-manpage[]
+ifdef::backend-docbook[]
+[header]
+template::[header-declarations]
+<refentry>
+ifdef::perf_date[]
+<refentryinfo><date>{perf_date}</date></refentryinfo>
+endif::perf_date[]
+<refmeta>
+<refentrytitle>{mantitle}</refentrytitle>
+<manvolnum>{manvolnum}</manvolnum>
+<refmiscinfo class="source">perf</refmiscinfo>
+<refmiscinfo class="version">{perf_version}</refmiscinfo>
+<refmiscinfo class="manual">perf Manual</refmiscinfo>
+</refmeta>
+<refnamediv>
+  <refname>{manname}</refname>
+  <refpurpose>{manpurpose}</refpurpose>
+</refnamediv>
+endif::backend-docbook[]
+endif::doctype-manpage[]
+
+ifdef::backend-xhtml11[]
+[linkperf-inlinemacro]
+<a href="{target}.html">{target}{0?({0})}</a>
+endif::backend-xhtml11[]
diff --git a/tools/perf/Documentation/asciidoctor-extensions.rb b/tools/perf/Documentation/asciidoctor-extensions.rb
new file mode 100644
index 000000000..d148fe95c
--- /dev/null
+++ b/tools/perf/Documentation/asciidoctor-extensions.rb
@@ -0,0 +1,29 @@
+require 'asciidoctor'
+require 'asciidoctor/extensions'
+
+module Perf
+  module Documentation
+    class LinkPerfProcessor < Asciidoctor::Extensions::InlineMacroProcessor
+      use_dsl
+
+      named :chrome
+
+      def process(parent, target, attrs)
+        if parent.document.basebackend? 'html'
+          %(<a href="#{target}.html">#{target}(#{attrs[1]})</a>\n)
+        elsif parent.document.basebackend? 'manpage'
+          "#{target}(#{attrs[1]})"
+        elsif parent.document.basebackend? 'docbook'
+          "<citerefentry>\n" \
+            "<refentrytitle>#{target}</refentrytitle>" \
+            "<manvolnum>#{attrs[1]}</manvolnum>\n" \
+          "</citerefentry>\n"
+        end
+      end
+    end
+  end
+end
+
+Asciidoctor::Extensions.register do
+  inline_macro Perf::Documentation::LinkPerfProcessor, :linkperf
+end
diff --git a/tools/perf/Documentation/build-docdep.perl b/tools/perf/Documentation/build-docdep.perl
new file mode 100755
index 000000000..ba4205e03
--- /dev/null
+++ b/tools/perf/Documentation/build-docdep.perl
@@ -0,0 +1,46 @@
+#!/usr/bin/perl
+
+my %include = ();
+my %included = ();
+
+for my $text (<*.txt>) {
+    open I, '<', $text || die "cannot read: $text";
+    while (<I>) {
+	if (/^include::/) {
+	    chomp;
+	    s/^include::\s*//;
+	    s/\[\]//;
+	    $include{$text}{$_} = 1;
+	    $included{$_} = 1;
+	}
+    }
+    close I;
+}
+
+# Do we care about chained includes???
+my $changed = 1;
+while ($changed) {
+    $changed = 0;
+    while (my ($text, $included) = each %include) {
+	for my $i (keys %$included) {
+	    # $text has include::$i; if $i includes $j
+	    # $text indirectly includes $j.
+	    if (exists $include{$i}) {
+		for my $j (keys %{$include{$i}}) {
+		    if (!exists $include{$text}{$j}) {
+			$include{$text}{$j} = 1;
+			$included{$j} = 1;
+			$changed = 1;
+		    }
+		}
+	    }
+	}
+    }
+}
+
+while (my ($text, $included) = each %include) {
+    if (! exists $included{$text} &&
+	(my $base = $text) =~ s/\.txt$//) {
+	print "$base.html $base.xml : ", join(" ", keys %$included), "\n";
+    }
+}
diff --git a/tools/perf/Documentation/build-xed.txt b/tools/perf/Documentation/build-xed.txt
new file mode 100644
index 000000000..6222c1e72
--- /dev/null
+++ b/tools/perf/Documentation/build-xed.txt
@@ -0,0 +1,19 @@
+
+For --xed the xed tool is needed. Here is how to install it:
+
+  $ git clone https://github.com/intelxed/mbuild.git mbuild
+  $ git clone https://github.com/intelxed/xed
+  $ cd xed
+  $ ./mfile.py --share
+  $ ./mfile.py examples
+  $ sudo ./mfile.py --prefix=/usr/local install
+  $ sudo ldconfig
+  $ sudo cp obj/examples/xed /usr/local/bin
+
+Basic xed testing:
+
+  $ xed | head -3
+  ERROR: required argument(s) were missing
+  Copyright (C) 2017, Intel Corporation. All rights reserved.
+  XED version: [v10.0-328-g7d62c8c49b7b]
+  $
diff --git a/tools/perf/Documentation/callchain-overhead-calculation.txt b/tools/perf/Documentation/callchain-overhead-calculation.txt
new file mode 100644
index 000000000..1a7579271
--- /dev/null
+++ b/tools/perf/Documentation/callchain-overhead-calculation.txt
@@ -0,0 +1,108 @@
+Overhead calculation
+--------------------
+The overhead can be shown in two columns as 'Children' and 'Self' when
+perf collects callchains.  The 'self' overhead is simply calculated by
+adding all period values of the entry - usually a function (symbol).
+This is the value that perf shows traditionally and sum of all the
+'self' overhead values should be 100%.
+
+The 'children' overhead is calculated by adding all period values of
+the child functions so that it can show the total overhead of the
+higher level functions even if they don't directly execute much.
+'Children' here means functions that are called from another (parent)
+function.
+
+It might be confusing that the sum of all the 'children' overhead
+values exceeds 100% since each of them is already an accumulation of
+'self' overhead of its child functions.  But with this enabled, users
+can find which function has the most overhead even if samples are
+spread over the children.
+
+Consider the following example; there are three functions like below.
+
+-----------------------
+void foo(void) {
+    /* do something */
+}
+
+void bar(void) {
+    /* do something */
+    foo();
+}
+
+int main(void) {
+    bar()
+    return 0;
+}
+-----------------------
+
+In this case 'foo' is a child of 'bar', and 'bar' is an immediate
+child of 'main' so 'foo' also is a child of 'main'.  In other words,
+'main' is a parent of 'foo' and 'bar', and 'bar' is a parent of 'foo'.
+
+Suppose all samples are recorded in 'foo' and 'bar' only.  When it's
+recorded with callchains the output will show something like below
+in the usual (self-overhead-only) output of perf report:
+
+----------------------------------
+Overhead  Symbol
+........  .....................
+  60.00%  foo
+          |
+          --- foo
+              bar
+              main
+              __libc_start_main
+
+  40.00%  bar
+          |
+          --- bar
+              main
+              __libc_start_main
+----------------------------------
+
+When the --children option is enabled, the 'self' overhead values of
+child functions (i.e. 'foo' and 'bar') are added to the parents to
+calculate the 'children' overhead.  In this case the report could be
+displayed as:
+
+-------------------------------------------
+Children      Self  Symbol
+........  ........  ....................
+ 100.00%     0.00%  __libc_start_main
+          |
+          --- __libc_start_main
+
+ 100.00%     0.00%  main
+          |
+          --- main
+              __libc_start_main
+
+ 100.00%    40.00%  bar
+          |
+          --- bar
+              main
+              __libc_start_main
+
+  60.00%    60.00%  foo
+          |
+          --- foo
+              bar
+              main
+              __libc_start_main
+-------------------------------------------
+
+In the above output, the 'self' overhead of 'foo' (60%) was add to the
+'children' overhead of 'bar', 'main' and '\_\_libc_start_main'.
+Likewise, the 'self' overhead of 'bar' (40%) was added to the
+'children' overhead of 'main' and '\_\_libc_start_main'.
+
+So '\_\_libc_start_main' and 'main' are shown first since they have
+same (100%) 'children' overhead (even though they have zero 'self'
+overhead) and they are the parents of 'foo' and 'bar'.
+
+Since v3.16 the 'children' overhead is shown by default and the output
+is sorted by its values. The 'children' overhead is disabled by
+specifying --no-children option on the command line or by adding
+'report.children = false' or 'top.children = false' in the perf config
+file.
diff --git a/tools/perf/Documentation/cat-texi.perl b/tools/perf/Documentation/cat-texi.perl
new file mode 100755
index 000000000..14d2f8341
--- /dev/null
+++ b/tools/perf/Documentation/cat-texi.perl
@@ -0,0 +1,46 @@
+#!/usr/bin/perl -w
+
+use strict;
+use warnings;
+
+my @menu = ();
+my $output = $ARGV[0];
+
+open my $tmp, '>', "$output.tmp";
+
+while (<STDIN>) {
+	next if (/^\\input texinfo/../\@node Top/);
+	next if (/^\@bye/ || /^\.ft/);
+	if (s/^\@top (.*)/\@node $1,,,Top/) {
+		push @menu, $1;
+	}
+	s/\(\@pxref\{\[(URLS|REMOTES)\]}\)//;
+	s/\@anchor\{[^{}]*\}//g;
+	print $tmp $_;
+}
+close $tmp;
+
+print '\input texinfo
+@setfilename gitman.info
+@documentencoding UTF-8
+@dircategory Development
+@direntry
+* Git Man Pages: (gitman).  Manual pages for Git revision control system
+@end direntry
+@node Top,,, (dir)
+@top Git Manual Pages
+@documentlanguage en
+@menu
+';
+
+for (@menu) {
+	print "* ${_}::\n";
+}
+print "\@end menu\n";
+open $tmp, '<', "$output.tmp";
+while (<$tmp>) {
+	print;
+}
+close $tmp;
+print "\@bye\n";
+unlink "$output.tmp";
diff --git a/tools/perf/Documentation/db-export.txt b/tools/perf/Documentation/db-export.txt
new file mode 100644
index 000000000..52ffccb02
--- /dev/null
+++ b/tools/perf/Documentation/db-export.txt
@@ -0,0 +1,41 @@
+Database Export
+===============
+
+perf tool's python scripting engine:
+
+	tools/perf/util/scripting-engines/trace-event-python.c
+
+supports scripts:
+
+	tools/perf/scripts/python/export-to-sqlite.py
+	tools/perf/scripts/python/export-to-postgresql.py
+
+which export data to a SQLite3 or PostgreSQL database.
+
+The export process provides records with unique sequential ids which allows the
+data to be imported directly to a database and provides the relationships
+between tables.
+
+Over time it is possible to continue to expand the export while maintaining
+backward and forward compatibility, by following some simple rules:
+
+1. Because of the nature of SQL, existing tables and columns can continue to be
+used so long as the names and meanings (and to some extent data types) remain
+the same.
+
+2. New tables and columns can be added, without affecting existing SQL queries,
+so long as the new names are unique.
+
+3. Scripts that use a database (e.g. exported-sql-viewer.py) can maintain
+backward compatibility by testing for the presence of new tables and columns
+before using them. e.g. function IsSelectable() in exported-sql-viewer.py
+
+4. The export scripts themselves maintain forward compatibility (i.e. an existing
+script will continue to work with new versions of perf) by accepting a variable
+number of arguments (e.g. def call_return_table(*x)) i.e. perf can pass more
+arguments which old scripts will ignore.
+
+5. The scripting engine tests for the existence of script handler functions
+before calling them.  The scripting engine can also test for the support of new
+or optional features by checking for the existence and value of script global
+variables.
diff --git a/tools/perf/Documentation/examples.txt b/tools/perf/Documentation/examples.txt
new file mode 100644
index 000000000..c0d22fbe9
--- /dev/null
+++ b/tools/perf/Documentation/examples.txt
@@ -0,0 +1,225 @@
+
+		------------------------------
+		****** perf by examples ******
+		------------------------------
+
+[ From an e-mail by Ingo Molnar, https://lore.kernel.org/lkml/20090804195717.GA5998@elte.hu ]
+
+
+First, discovery/enumeration of available counters can be done via
+'perf list':
+
+titan:~> perf list
+  [...]
+  kmem:kmalloc                             [Tracepoint event]
+  kmem:kmem_cache_alloc                    [Tracepoint event]
+  kmem:kmalloc_node                        [Tracepoint event]
+  kmem:kmem_cache_alloc_node               [Tracepoint event]
+  kmem:kfree                               [Tracepoint event]
+  kmem:kmem_cache_free                     [Tracepoint event]
+  kmem:mm_page_free                        [Tracepoint event]
+  kmem:mm_page_free_batched                [Tracepoint event]
+  kmem:mm_page_alloc                       [Tracepoint event]
+  kmem:mm_page_alloc_zone_locked           [Tracepoint event]
+  kmem:mm_page_pcpu_drain                  [Tracepoint event]
+  kmem:mm_page_alloc_extfrag               [Tracepoint event]
+
+Then any (or all) of the above event sources can be activated and
+measured. For example the page alloc/free properties of a 'hackbench
+run' are:
+
+ titan:~> perf stat -e kmem:mm_page_pcpu_drain -e kmem:mm_page_alloc
+ -e kmem:mm_page_free_batched -e kmem:mm_page_free ./hackbench 10
+ Time: 0.575
+
+ Performance counter stats for './hackbench 10':
+
+          13857  kmem:mm_page_pcpu_drain
+          27576  kmem:mm_page_alloc
+           6025  kmem:mm_page_free_batched
+          20934  kmem:mm_page_free
+
+    0.613972165  seconds time elapsed
+
+You can observe the statistical properties as well, by using the
+'repeat the workload N times' feature of perf stat:
+
+ titan:~> perf stat --repeat 5 -e kmem:mm_page_pcpu_drain -e
+   kmem:mm_page_alloc -e kmem:mm_page_free_batched -e
+   kmem:mm_page_free ./hackbench 10
+ Time: 0.627
+ Time: 0.644
+ Time: 0.564
+ Time: 0.559
+ Time: 0.626
+
+ Performance counter stats for './hackbench 10' (5 runs):
+
+          12920  kmem:mm_page_pcpu_drain    ( +-   3.359% )
+          25035  kmem:mm_page_alloc         ( +-   3.783% )
+           6104  kmem:mm_page_free_batched  ( +-   0.934% )
+          18376  kmem:mm_page_free	    ( +-   4.941% )
+
+    0.643954516  seconds time elapsed   ( +-   2.363% )
+
+Furthermore, these tracepoints can be used to sample the workload as
+well. For example the page allocations done by a 'git gc' can be
+captured the following way:
+
+ titan:~/git> perf record -e kmem:mm_page_alloc -c 1 ./git gc
+ Counting objects: 1148, done.
+ Delta compression using up to 2 threads.
+ Compressing objects: 100% (450/450), done.
+ Writing objects: 100% (1148/1148), done.
+ Total 1148 (delta 690), reused 1148 (delta 690)
+ [ perf record: Captured and wrote 0.267 MB perf.data (~11679 samples) ]
+
+To check which functions generated page allocations:
+
+ titan:~/git> perf report
+ # Samples: 10646
+ #
+ # Overhead          Command               Shared Object
+ # ........  ...............  ..........................
+ #
+    23.57%       git-repack  /lib64/libc-2.5.so
+    21.81%              git  /lib64/libc-2.5.so
+    14.59%              git  ./git
+    11.79%       git-repack  ./git
+     7.12%              git  /lib64/ld-2.5.so
+     3.16%       git-repack  /lib64/libpthread-2.5.so
+     2.09%       git-repack  /bin/bash
+     1.97%               rm  /lib64/libc-2.5.so
+     1.39%               mv  /lib64/ld-2.5.so
+     1.37%               mv  /lib64/libc-2.5.so
+     1.12%       git-repack  /lib64/ld-2.5.so
+     0.95%               rm  /lib64/ld-2.5.so
+     0.90%  git-update-serv  /lib64/libc-2.5.so
+     0.73%  git-update-serv  /lib64/ld-2.5.so
+     0.68%             perf  /lib64/libpthread-2.5.so
+     0.64%       git-repack  /usr/lib64/libz.so.1.2.3
+
+Or to see it on a more finegrained level:
+
+titan:~/git> perf report --sort comm,dso,symbol
+# Samples: 10646
+#
+# Overhead          Command               Shared Object  Symbol
+# ........  ...............  ..........................  ......
+#
+     9.35%       git-repack  ./git                       [.] insert_obj_hash
+     9.12%              git  ./git                       [.] insert_obj_hash
+     7.31%              git  /lib64/libc-2.5.so          [.] memcpy
+     6.34%       git-repack  /lib64/libc-2.5.so          [.] _int_malloc
+     6.24%       git-repack  /lib64/libc-2.5.so          [.] memcpy
+     5.82%       git-repack  /lib64/libc-2.5.so          [.] __GI___fork
+     5.47%              git  /lib64/libc-2.5.so          [.] _int_malloc
+     2.99%              git  /lib64/libc-2.5.so          [.] memset
+
+Furthermore, call-graph sampling can be done too, of page
+allocations - to see precisely what kind of page allocations there
+are:
+
+ titan:~/git> perf record -g -e kmem:mm_page_alloc -c 1 ./git gc
+ Counting objects: 1148, done.
+ Delta compression using up to 2 threads.
+ Compressing objects: 100% (450/450), done.
+ Writing objects: 100% (1148/1148), done.
+ Total 1148 (delta 690), reused 1148 (delta 690)
+ [ perf record: Captured and wrote 0.963 MB perf.data (~42069 samples) ]
+
+ titan:~/git> perf report -g
+ # Samples: 10686
+ #
+ # Overhead          Command               Shared Object
+ # ........  ...............  ..........................
+ #
+    23.25%       git-repack  /lib64/libc-2.5.so
+                |
+                |--50.00%-- _int_free
+                |
+                |--37.50%-- __GI___fork
+                |          make_child
+                |
+                |--12.50%-- ptmalloc_unlock_all2
+                |          make_child
+                |
+                 --6.25%-- __GI_strcpy
+    21.61%              git  /lib64/libc-2.5.so
+                |
+                |--30.00%-- __GI_read
+                |          |
+                |           --83.33%-- git_config_from_file
+                |                     git_config
+                |                     |
+   [...]
+
+Or you can observe the whole system's page allocations for 10
+seconds:
+
+titan:~/git> perf stat -a -e kmem:mm_page_pcpu_drain -e
+kmem:mm_page_alloc -e kmem:mm_page_free_batched -e
+kmem:mm_page_free sleep 10
+
+ Performance counter stats for 'sleep 10':
+
+         171585  kmem:mm_page_pcpu_drain
+         322114  kmem:mm_page_alloc
+          73623  kmem:mm_page_free_batched
+         254115  kmem:mm_page_free
+
+   10.000591410  seconds time elapsed
+
+Or observe how fluctuating the page allocations are, via statistical
+analysis done over ten 1-second intervals:
+
+ titan:~/git> perf stat --repeat 10 -a -e kmem:mm_page_pcpu_drain -e
+   kmem:mm_page_alloc -e kmem:mm_page_free_batched -e
+   kmem:mm_page_free sleep 1
+
+ Performance counter stats for 'sleep 1' (10 runs):
+
+          17254  kmem:mm_page_pcpu_drain    ( +-   3.709% )
+          34394  kmem:mm_page_alloc         ( +-   4.617% )
+           7509  kmem:mm_page_free_batched  ( +-   4.820% )
+          25653  kmem:mm_page_free	    ( +-   3.672% )
+
+    1.058135029  seconds time elapsed   ( +-   3.089% )
+
+Or you can annotate the recorded 'git gc' run on a per symbol basis
+and check which instructions/source-code generated page allocations:
+
+ titan:~/git> perf annotate __GI___fork
+ ------------------------------------------------
+  Percent |      Source code & Disassembly of libc-2.5.so
+ ------------------------------------------------
+          :
+          :
+          :      Disassembly of section .plt:
+          :      Disassembly of section .text:
+          :
+          :      00000031a2e95560 <__fork>:
+ [...]
+     0.00 :        31a2e95602:   b8 38 00 00 00          mov    $0x38,%eax
+     0.00 :        31a2e95607:   0f 05                   syscall
+    83.42 :        31a2e95609:   48 3d 00 f0 ff ff       cmp    $0xfffffffffffff000,%rax
+     0.00 :        31a2e9560f:   0f 87 4d 01 00 00       ja     31a2e95762 <__fork+0x202>
+     0.00 :        31a2e95615:   85 c0                   test   %eax,%eax
+
+( this shows that 83.42% of __GI___fork's page allocations come from
+  the 0x38 system call it performs. )
+
+etc. etc. - a lot more is possible. I could list a dozen of
+other different usecases straight away - neither of which is
+possible via /proc/vmstat.
+
+/proc/vmstat is not in the same league really, in terms of
+expressive power of system analysis and performance
+analysis.
+
+All that the above results needed were those new tracepoints
+in include/tracing/events/kmem.h.
+
+	Ingo
+
+
diff --git a/tools/perf/Documentation/guest-files.txt b/tools/perf/Documentation/guest-files.txt
new file mode 100644
index 000000000..8cc0b092f
--- /dev/null
+++ b/tools/perf/Documentation/guest-files.txt
@@ -0,0 +1,16 @@
+include::guestmount.txt[]
+
+--guestkallsyms=<path>::
+	Guest OS /proc/kallsyms file copy. perf reads it to get guest
+	kernel symbols. Users copy it out from guest OS.
+
+--guestmodules=<path>::
+	Guest OS /proc/modules file copy. perf reads it to get guest
+	kernel module information. Users copy it out from guest OS.
+
+--guestvmlinux=<path>::
+	Guest OS kernel vmlinux.
+
+--guest-code::
+	Indicate that guest code can be found in the hypervisor process,
+	which is a common case for KVM test programs.
diff --git a/tools/perf/Documentation/guestmount.txt b/tools/perf/Documentation/guestmount.txt
new file mode 100644
index 000000000..6edf12363
--- /dev/null
+++ b/tools/perf/Documentation/guestmount.txt
@@ -0,0 +1,11 @@
+--guestmount=<path>::
+	Guest OS root file system mount directory. Users mount guest OS
+	root directories under <path> by a specific filesystem access method,
+	typically, sshfs.
+	For example, start 2 guest OS, one's pid is 8888 and the other's is 9999:
+[verse]
+	$ mkdir \~/guestmount
+	$ cd \~/guestmount
+	$ sshfs -o allow_other,direct_io -p 5551 localhost:/ 8888/
+	$ sshfs -o allow_other,direct_io -p 5552 localhost:/ 9999/
+	$ perf {GMEXAMPLECMD} --guestmount=~/guestmount {GMEXAMPLESUBCMD}
diff --git a/tools/perf/Documentation/intel-bts.txt b/tools/perf/Documentation/intel-bts.txt
new file mode 100644
index 000000000..8bdc93bd7
--- /dev/null
+++ b/tools/perf/Documentation/intel-bts.txt
@@ -0,0 +1,86 @@
+Intel Branch Trace Store
+========================
+
+Overview
+========
+
+Intel BTS could be regarded as a predecessor to Intel PT and has some
+similarities because it can also identify every branch a program takes.  A
+notable difference is that Intel BTS has no timing information and as a
+consequence the present implementation is limited to per-thread recording.
+
+While decoding Intel BTS does not require walking the object code, the object
+code is still needed to pair up calls and returns correctly, consequently much
+of the Intel PT documentation applies also to Intel BTS.  Refer to the Intel PT
+documentation and consider that the PMU 'intel_bts' can usually be used in
+place of 'intel_pt' in the examples provided, with the proviso that per-thread
+recording must also be stipulated i.e. the --per-thread option for
+'perf record'.
+
+
+perf record
+===========
+
+new event
+---------
+
+The Intel BTS kernel driver creates a new PMU for Intel BTS.  The perf record
+option is:
+
+	-e intel_bts//
+
+Currently Intel BTS is limited to per-thread tracing so the --per-thread option
+is also needed.
+
+
+snapshot option
+---------------
+
+The snapshot option is the same as Intel PT (refer Intel PT documentation).
+
+
+auxtrace mmap size option
+-----------------------
+
+The mmap size option is the same as Intel PT (refer Intel PT documentation).
+
+
+perf script
+===========
+
+By default, perf script will decode trace data found in the perf.data file.
+This can be further controlled by option --itrace.  The --itrace option is
+the same as Intel PT (refer Intel PT documentation) except that neither
+"instructions" events nor "transactions" events (and consequently call
+chains) are supported.
+
+To disable trace decoding entirely, use the option --no-itrace.
+
+
+dump option
+-----------
+
+perf script has an option (-D) to "dump" the events i.e. display the binary
+data.
+
+When -D is used, Intel BTS packets are displayed.
+
+To disable the display of Intel BTS packets, combine the -D option with
+--no-itrace.
+
+
+perf report
+===========
+
+By default, perf report will decode trace data found in the perf.data file.
+This can be further controlled by new option --itrace exactly the same as
+perf script.
+
+
+perf inject
+===========
+
+perf inject also accepts the --itrace option in which case tracing data is
+removed and replaced with the synthesized events. e.g.
+
+	perf inject --itrace -i perf.data -o perf.data.new
diff --git a/tools/perf/Documentation/intel-hybrid.txt b/tools/perf/Documentation/intel-hybrid.txt
new file mode 100644
index 000000000..e7a776ad2
--- /dev/null
+++ b/tools/perf/Documentation/intel-hybrid.txt
@@ -0,0 +1,204 @@
+Intel hybrid support
+--------------------
+Support for Intel hybrid events within perf tools.
+
+For some Intel platforms, such as AlderLake, which is hybrid platform and
+it consists of atom cpu and core cpu. Each cpu has dedicated event list.
+Part of events are available on core cpu, part of events are available
+on atom cpu and even part of events are available on both.
+
+Kernel exports two new cpu pmus via sysfs:
+/sys/devices/cpu_core
+/sys/devices/cpu_atom
+
+The 'cpus' files are created under the directories. For example,
+
+cat /sys/devices/cpu_core/cpus
+0-15
+
+cat /sys/devices/cpu_atom/cpus
+16-23
+
+It indicates cpu0-cpu15 are core cpus and cpu16-cpu23 are atom cpus.
+
+As before, use perf-list to list the symbolic event.
+
+perf list
+
+inst_retired.any
+	[Fixed Counter: Counts the number of instructions retired. Unit: cpu_atom]
+inst_retired.any
+	[Number of instructions retired. Fixed Counter - architectural event. Unit: cpu_core]
+
+The 'Unit: xxx' is added to brief description to indicate which pmu
+the event is belong to. Same event name but with different pmu can
+be supported.
+
+Enable hybrid event with a specific pmu
+
+To enable a core only event or atom only event, following syntax is supported:
+
+	cpu_core/<event name>/
+or
+	cpu_atom/<event name>/
+
+For example, count the 'cycles' event on core cpus.
+
+	perf stat -e cpu_core/cycles/
+
+Create two events for one hardware event automatically
+
+When creating one event and the event is available on both atom and core,
+two events are created automatically. One is for atom, the other is for
+core. Most of hardware events and cache events are available on both
+cpu_core and cpu_atom.
+
+For hardware events, they have pre-defined configs (e.g. 0 for cycles).
+But on hybrid platform, kernel needs to know where the event comes from
+(from atom or from core). The original perf event type PERF_TYPE_HARDWARE
+can't carry pmu information. So now this type is extended to be PMU aware
+type. The PMU type ID is stored at attr.config[63:32].
+
+PMU type ID is retrieved from sysfs.
+/sys/devices/cpu_atom/type
+/sys/devices/cpu_core/type
+
+The new attr.config layout for PERF_TYPE_HARDWARE:
+
+PERF_TYPE_HARDWARE:                 0xEEEEEEEE000000AA
+                                    AA: hardware event ID
+                                    EEEEEEEE: PMU type ID
+
+Cache event is similar. The type PERF_TYPE_HW_CACHE is extended to be
+PMU aware type. The PMU type ID is stored at attr.config[63:32].
+
+The new attr.config layout for PERF_TYPE_HW_CACHE:
+
+PERF_TYPE_HW_CACHE:                 0xEEEEEEEE00DDCCBB
+                                    BB: hardware cache ID
+                                    CC: hardware cache op ID
+                                    DD: hardware cache op result ID
+                                    EEEEEEEE: PMU type ID
+
+When enabling a hardware event without specified pmu, such as,
+perf stat -e cycles -a (use system-wide in this example), two events
+are created automatically.
+
+  ------------------------------------------------------------
+  perf_event_attr:
+    size                             120
+    config                           0x400000000
+    sample_type                      IDENTIFIER
+    read_format                      TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING
+    disabled                         1
+    inherit                          1
+    exclude_guest                    1
+  ------------------------------------------------------------
+
+and
+
+  ------------------------------------------------------------
+  perf_event_attr:
+    size                             120
+    config                           0x800000000
+    sample_type                      IDENTIFIER
+    read_format                      TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING
+    disabled                         1
+    inherit                          1
+    exclude_guest                    1
+  ------------------------------------------------------------
+
+type 0 is PERF_TYPE_HARDWARE.
+0x4 in 0x400000000 indicates it's cpu_core pmu.
+0x8 in 0x800000000 indicates it's cpu_atom pmu (atom pmu type id is random).
+
+The kernel creates 'cycles' (0x400000000) on cpu0-cpu15 (core cpus),
+and create 'cycles' (0x800000000) on cpu16-cpu23 (atom cpus).
+
+For perf-stat result, it displays two events:
+
+ Performance counter stats for 'system wide':
+
+           6,744,979      cpu_core/cycles/
+           1,965,552      cpu_atom/cycles/
+
+The first 'cycles' is core event, the second 'cycles' is atom event.
+
+Thread mode example:
+
+perf-stat reports the scaled counts for hybrid event and with a percentage
+displayed. The percentage is the event's running time/enabling time.
+
+One example, 'triad_loop' runs on cpu16 (atom core), while we can see the
+scaled value for core cycles is 160,444,092 and the percentage is 0.47%.
+
+perf stat -e cycles \-- taskset -c 16 ./triad_loop
+
+As previous, two events are created.
+
+------------------------------------------------------------
+perf_event_attr:
+  size                             120
+  config                           0x400000000
+  sample_type                      IDENTIFIER
+  read_format                      TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING
+  disabled                         1
+  inherit                          1
+  enable_on_exec                   1
+  exclude_guest                    1
+------------------------------------------------------------
+
+and
+
+------------------------------------------------------------
+perf_event_attr:
+  size                             120
+  config                           0x800000000
+  sample_type                      IDENTIFIER
+  read_format                      TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING
+  disabled                         1
+  inherit                          1
+  enable_on_exec                   1
+  exclude_guest                    1
+------------------------------------------------------------
+
+ Performance counter stats for 'taskset -c 16 ./triad_loop':
+
+       233,066,666      cpu_core/cycles/                                              (0.43%)
+       604,097,080      cpu_atom/cycles/                                              (99.57%)
+
+perf-record:
+
+If there is no '-e' specified in perf record, on hybrid platform,
+it creates two default 'cycles' and adds them to event list. One
+is for core, the other is for atom.
+
+perf-stat:
+
+If there is no '-e' specified in perf stat, on hybrid platform,
+besides of software events, following events are created and
+added to event list in order.
+
+cpu_core/cycles/,
+cpu_atom/cycles/,
+cpu_core/instructions/,
+cpu_atom/instructions/,
+cpu_core/branches/,
+cpu_atom/branches/,
+cpu_core/branch-misses/,
+cpu_atom/branch-misses/
+
+Of course, both perf-stat and perf-record support to enable
+hybrid event with a specific pmu.
+
+e.g.
+perf stat -e cpu_core/cycles/
+perf stat -e cpu_atom/cycles/
+perf stat -e cpu_core/r1a/
+perf stat -e cpu_atom/L1-icache-loads/
+perf stat -e cpu_core/cycles/,cpu_atom/instructions/
+perf stat -e '{cpu_core/cycles/,cpu_core/instructions/}'
+
+But '{cpu_core/cycles/,cpu_atom/instructions/}' will return
+warning and disable grouping, because the pmus in group are
+not matched (cpu_core vs. cpu_atom).
diff --git a/tools/perf/Documentation/intel-pt.txt b/tools/perf/Documentation/intel-pt.txt
new file mode 100644
index 000000000..fd9241a1b
--- /dev/null
+++ b/tools/perf/Documentation/intel-pt.txt
@@ -0,0 +1 @@
+Documentation for support for Intel Processor Trace within perf tools' has moved to file perf-intel-pt.txt
diff --git a/tools/perf/Documentation/itrace.txt b/tools/perf/Documentation/itrace.txt
new file mode 100644
index 000000000..0916bbfe6
--- /dev/null
+++ b/tools/perf/Documentation/itrace.txt
@@ -0,0 +1,70 @@
+		i	synthesize instructions events
+		b	synthesize branches events (branch misses for Arm SPE)
+		c	synthesize branches events (calls only)
+		r	synthesize branches events (returns only)
+		x	synthesize transactions events
+		w	synthesize ptwrite events
+		p	synthesize power events (incl. PSB events for Intel PT)
+		o	synthesize other events recorded due to the use
+			of aux-output (refer to perf record)
+		I	synthesize interrupt or similar (asynchronous) events
+			(e.g. Intel PT Event Trace)
+		e	synthesize error events
+		d	create a debug log
+		f	synthesize first level cache events
+		m	synthesize last level cache events
+		M	synthesize memory events
+		t	synthesize TLB events
+		a	synthesize remote access events
+		g	synthesize a call chain (use with i or x)
+		G	synthesize a call chain on existing event records
+		l	synthesize last branch entries (use with i or x)
+		L	synthesize last branch entries on existing event records
+		s       skip initial number of events
+		q	quicker (less detailed) decoding
+		A	approximate IPC
+		Z	prefer to ignore timestamps (so-called "timeless" decoding)
+
+	The default is all events i.e. the same as --itrace=ibxwpe,
+	except for perf script where it is --itrace=ce
+
+	In addition, the period (default 100000, except for perf script where it is 1)
+	for instructions events can be specified in units of:
+
+		i	instructions
+		t	ticks
+		ms	milliseconds
+		us	microseconds
+		ns	nanoseconds (default)
+
+	Also the call chain size (default 16, max. 1024) for instructions or
+	transactions events can be specified.
+
+	Also the number of last branch entries (default 64, max. 1024) for
+	instructions or transactions events can be specified.
+
+	Similar to options g and l, size may also be specified for options G and L.
+	On x86, note that G and L work poorly when data has been recorded with
+	large PEBS. Refer linkperf:perf-intel-pt[1] man page for details.
+
+	It is also possible to skip events generated (instructions, branches, transactions,
+	ptwrite, power) at the beginning. This is useful to ignore initialization code.
+
+	--itrace=i0nss1000000
+
+	skips the first million instructions.
+
+	The 'e' option may be followed by flags which affect what errors will or
+	will not be reported. Each flag must be preceded by either '+' or '-'.
+	The flags are:
+		o	overflow
+		l	trace data lost
+
+	If supported, the 'd' option may be followed by flags which affect what
+	debug messages will or will not be logged. Each flag must be preceded
+	by either '+' or '-'. The flags are:
+		a	all perf events
+		e	output only on errors (size configurable - see linkperf:perf-config[1])
+		o	output to stdout
+
+	If supported, the 'q' option may be repeated to increase the effect.
diff --git a/tools/perf/Documentation/jit-interface.txt b/tools/perf/Documentation/jit-interface.txt
new file mode 100644
index 000000000..a8656f564
--- /dev/null
+++ b/tools/perf/Documentation/jit-interface.txt
@@ -0,0 +1,15 @@
+perf supports a simple JIT interface to resolve symbols for dynamic code generated
+by a JIT.
+
+The JIT has to write a /tmp/perf-%d.map  (%d = pid of process) file
+
+This is a text file.
+
+Each line has the following format, fields separated with spaces:
+
+START SIZE symbolname
+
+START and SIZE are hex numbers without 0x.
+symbolname is the rest of the line, so it could contain special characters.
+
+The ownership of the file has to match the process.
diff --git a/tools/perf/Documentation/jitdump-specification.txt b/tools/perf/Documentation/jitdump-specification.txt
new file mode 100644
index 000000000..79936355d
--- /dev/null
+++ b/tools/perf/Documentation/jitdump-specification.txt
@@ -0,0 +1,170 @@
+JITDUMP specification version 2
+Last Revised: 09/15/2016
+Author: Stephane Eranian <eranian@gmail.com>
+
+--------------------------------------------------------
+| Revision  |    Date    | Description                 |
+--------------------------------------------------------
+|   1       | 09/07/2016 | Initial revision            |
+--------------------------------------------------------
+|   2       | 09/15/2016 | Add JIT_CODE_UNWINDING_INFO |
+--------------------------------------------------------
+
+
+I/ Introduction
+
+
+This document describes the jitdump file format. The file is generated by Just-In-time compiler runtimes to save meta-data information about the generated code, such as address, size, and name of generated functions, the native code generated, the source line information. The data may then be used by performance tools, such as Linux perf to generate function and assembly level profiles.
+
+The format is not specific to any particular programming language. It can be extended as need be.
+
+The format of the file is binary. It is self-describing in terms of endianness and is portable across multiple processor architectures.
+
+
+II/ Overview of the format
+
+
+The format requires only sequential accesses, i.e., append only mode. The file starts with a fixed size file header describing the version of the specification, the endianness.
+
+The header is followed by a series of records, each starting with a fixed size header describing the type of record and its size. It is, itself, followed by the payload for the record. Records can have a variable size even for a given type.
+
+Each entry in the file is timestamped. All timestamps must use the same clock source. The CLOCK_MONOTONIC clock source is recommended.
+
+
+III/ Jitdump file header format
+
+Each jitdump file starts with a fixed size header containing the following fields in order:
+
+
+* uint32_t magic     : a magic number tagging the file type. The value is 4-byte long and represents the string "JiTD" in ASCII form. It written is as 0x4A695444. The reader will detect an endian mismatch when it reads 0x4454694a.
+* uint32_t version   : a 4-byte value representing the format version. It is currently set to 1
+* uint32_t total_size: size in bytes of file header
+* uint32_t elf_mach  : ELF architecture encoding (ELF e_machine value as specified in /usr/include/elf.h)
+* uint32_t pad1      : padding. Reserved for future use
+* uint32_t pid       : JIT runtime process identification (OS specific)
+* uint64_t timestamp : timestamp of when the file was created
+* uint64_t flags     : a bitmask of flags
+
+The flags currently defined are as follows:
+ * bit 0: JITDUMP_FLAGS_ARCH_TIMESTAMP : set if the jitdump file is using an architecture-specific timestamp clock source. For instance, on x86, one could use TSC directly
+
+IV/ Record header
+
+The file header is immediately followed by records. Each record starts with a fixed size header describing the record that follows.
+
+The record header is specified in order as follows:
+* uint32_t id        : a value identifying the record type (see below)
+* uint32_t total_size: the size in bytes of the record including the header.
+* uint64_t timestamp : a timestamp of when the record was created.
+
+The following record types are defined:
+ * Value 0 : JIT_CODE_LOAD      : record describing a jitted function
+ * Value 1 : JIT_CODE_MOVE      : record describing an already jitted function which is moved
+ * Value 2 : JIT_CODE_DEBUG_INFO: record describing the debug information for a jitted function
+ * Value 3 : JIT_CODE_CLOSE     : record marking the end of the jit runtime (optional)
+ * Value 4 : JIT_CODE_UNWINDING_INFO: record describing a function unwinding information
+
+ The payload of the record must immediately follow the record header without padding.
+
+V/ JIT_CODE_LOAD record
+
+
+  The record has the following fields following the fixed-size record header in order:
+  * uint32_t pid: OS process id of the runtime generating the jitted code
+  * uint32_t tid: OS thread identification of the runtime thread generating the jitted code
+  * uint64_t vma: virtual address of jitted code start
+  * uint64_t code_addr: code start address for the jitted code. By default vma = code_addr
+  * uint64_t code_size: size in bytes of the generated jitted code
+  * uint64_t code_index: unique identifier for the jitted code (see below)
+  * char[n]: function name in ASCII including the null termination
+  * native code: raw byte encoding of the jitted code
+
+  The record header total_size field is inclusive of all components:
+  * record header
+  * fixed-sized fields
+  * function name string, including termination
+  * native code length
+  * record specific variable data (e.g., array of data entries)
+
+The code_index is used to uniquely identify each jitted function. The index can be a monotonically increasing 64-bit value. Each time a function is jitted it gets a new number. This value is used in case the code for a function is moved and avoids having to issue another JIT_CODE_LOAD record.
+
+The format supports empty functions with no native code.
+
+
+VI/ JIT_CODE_MOVE record
+
+  The record type is optional.
+
+  The record has the following fields following the fixed-size record header in order:
+  * uint32_t pid          : OS process id of the runtime generating the jitted code
+  * uint32_t tid          : OS thread identification of the runtime thread generating the jitted code
+  * uint64_t vma          : new virtual address of jitted code start
+  * uint64_t old_code_addr: previous code address for the same function
+  * uint64_t new_code_addr: alternate new code started address for the jitted code. By default it should be equal to the vma address.
+  * uint64_t code_size    : size in bytes of the jitted code
+  * uint64_t code_index   : index referring to the JIT_CODE_LOAD code_index record of when the function was initially jitted
+
+
+The MOVE record can be used in case an already jitted function is simply moved by the runtime inside the code cache.
+
+The JIT_CODE_MOVE record cannot come before the JIT_CODE_LOAD record for the same function name. The function cannot have changed name, otherwise a new JIT_CODE_LOAD record must be emitted.
+
+The code size of the function cannot change.
+
+
+VII/ JIT_DEBUG_INFO record
+
+The record type is optional.
+
+The record contains source lines debug information, i.e., a way to map a code address back to a source line. This information may be used by the performance tool.
+
+The record has the following fields following the fixed-size record header in order:
+  * uint64_t code_addr: address of function for which the debug information is generated
+  * uint64_t nr_entry : number of debug entries for the function
+  * debug_entry[n]: array of nr_entry debug entries for the function
+
+The debug_entry describes the source line information. It is defined as follows in order:
+* uint64_t code_addr: address of function for which the debug information is generated
+* uint32_t line     : source file line number (starting at 1)
+* uint32_t discrim  : column discriminator, 0 is default
+* char name[n]      : source file name in ASCII, including null termination
+
+The debug_entry entries are saved in sequence but given that they have variable sizes due to the file name string, they cannot be indexed directly.
+They need to be walked sequentially. The next debug_entry is found at sizeof(debug_entry) + strlen(name) + 1.
+
+IMPORTANT:
+  The JIT_CODE_DEBUG for a given function must always be generated BEFORE the JIT_CODE_LOAD for the function. This facilitates greatly the parser for the jitdump file.
+
+
+VIII/ JIT_CODE_CLOSE record
+
+
+The record type is optional.
+
+The record is used as a marker for the end of the jitted runtime. It can be replaced by the end of the file.
+
+The JIT_CODE_CLOSE record does not have any specific fields, the record header contains all the information needed.
+
+
+IX/ JIT_CODE_UNWINDING_INFO
+
+
+The record type is optional.
+
+The record is used to describe the unwinding information for a jitted function.
+
+The record has the following fields following the fixed-size record header in order:
+
+uint64_t unwind_data_size   : the size in bytes of the unwinding data table at the end of the record
+uint64_t eh_frame_hdr_size  : the size in bytes of the DWARF EH Frame Header at the start of the unwinding data table at the end of the record
+uint64_t mapped_size        : the size of the unwinding data mapped in memory
+const char unwinding_data[n]: an array of unwinding data, consisting of the EH Frame Header, followed by the actual EH Frame
+
+
+The EH Frame header follows the Linux Standard Base (LSB) specification as described in the document at https://refspecs.linuxfoundation.org/LSB_1.3.0/gLSB/gLSB/ehframehdr.html
+
+
+The EH Frame follows the LSB specification as described in the document at https://refspecs.linuxbase.org/LSB_3.0.0/LSB-PDA/LSB-PDA/ehframechpt.html
+
+
+NOTE: The mapped_size is generally either the same as unwind_data_size (if the unwinding data was mapped in memory by the running process) or zero (if the unwinding data is not mapped by the process). If the unwinding data was not mapped, then only the EH Frame Header will be read, which can be used to specify FP based unwinding for a function which does not have unwinding information.
diff --git a/tools/perf/Documentation/manpage-1.72.xsl b/tools/perf/Documentation/manpage-1.72.xsl
new file mode 100644
index 000000000..b4d315cb8
--- /dev/null
+++ b/tools/perf/Documentation/manpage-1.72.xsl
@@ -0,0 +1,14 @@
+<!-- manpage-1.72.xsl:
+     special settings for manpages rendered from asciidoc+docbook
+     handles peculiarities in docbook-xsl 1.72.0 -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+		version="1.0">
+
+<xsl:import href="manpage-base.xsl"/>
+
+<!-- these are the special values for the roff control characters
+     needed for docbook-xsl 1.72.0 -->
+<xsl:param name="git.docbook.backslash">&#x2593;</xsl:param>
+<xsl:param name="git.docbook.dot"      >&#x2302;</xsl:param>
+
+</xsl:stylesheet>
diff --git a/tools/perf/Documentation/manpage-base.xsl b/tools/perf/Documentation/manpage-base.xsl
new file mode 100644
index 000000000..a264fa616
--- /dev/null
+++ b/tools/perf/Documentation/manpage-base.xsl
@@ -0,0 +1,35 @@
+<!-- manpage-base.xsl:
+     special formatting for manpages rendered from asciidoc+docbook -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+		version="1.0">
+
+<!-- these params silence some output from xmlto -->
+<xsl:param name="man.output.quietly" select="1"/>
+<xsl:param name="refentry.meta.get.quietly" select="1"/>
+
+<!-- convert asciidoc callouts to man page format;
+     git.docbook.backslash and git.docbook.dot params
+     must be supplied by another XSL file or other means -->
+<xsl:template match="co">
+	<xsl:value-of select="concat(
+			      $git.docbook.backslash,'fB(',
+			      substring-after(@id,'-'),')',
+			      $git.docbook.backslash,'fR')"/>
+</xsl:template>
+<xsl:template match="calloutlist">
+	<xsl:value-of select="$git.docbook.dot"/>
+	<xsl:text>sp&#10;</xsl:text>
+	<xsl:apply-templates/>
+	<xsl:text>&#10;</xsl:text>
+</xsl:template>
+<xsl:template match="callout">
+	<xsl:value-of select="concat(
+			      $git.docbook.backslash,'fB',
+			      substring-after(@arearefs,'-'),
+			      '. ',$git.docbook.backslash,'fR')"/>
+	<xsl:apply-templates/>
+	<xsl:value-of select="$git.docbook.dot"/>
+	<xsl:text>br&#10;</xsl:text>
+</xsl:template>
+
+</xsl:stylesheet>
diff --git a/tools/perf/Documentation/manpage-bold-literal.xsl b/tools/perf/Documentation/manpage-bold-literal.xsl
new file mode 100644
index 000000000..608eb5df6
--- /dev/null
+++ b/tools/perf/Documentation/manpage-bold-literal.xsl
@@ -0,0 +1,17 @@
+<!-- manpage-bold-literal.xsl:
+     special formatting for manpages rendered from asciidoc+docbook -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+		version="1.0">
+
+<!-- render literal text as bold (instead of plain or monospace);
+     this makes literal text easier to distinguish in manpages
+     viewed on a tty -->
+<xsl:template match="literal">
+	<xsl:value-of select="$git.docbook.backslash"/>
+	<xsl:text>fB</xsl:text>
+	<xsl:apply-templates/>
+	<xsl:value-of select="$git.docbook.backslash"/>
+	<xsl:text>fR</xsl:text>
+</xsl:template>
+
+</xsl:stylesheet>
diff --git a/tools/perf/Documentation/manpage-normal.xsl b/tools/perf/Documentation/manpage-normal.xsl
new file mode 100644
index 000000000..a48f5b11f
--- /dev/null
+++ b/tools/perf/Documentation/manpage-normal.xsl
@@ -0,0 +1,13 @@
+<!-- manpage-normal.xsl:
+     special settings for manpages rendered from asciidoc+docbook
+     handles anything we want to keep away from docbook-xsl 1.72.0 -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+		version="1.0">
+
+<xsl:import href="manpage-base.xsl"/>
+
+<!-- these are the normal values for the roff control characters -->
+<xsl:param name="git.docbook.backslash">\</xsl:param>
+<xsl:param name="git.docbook.dot"	>.</xsl:param>
+
+</xsl:stylesheet>
diff --git a/tools/perf/Documentation/manpage-suppress-sp.xsl b/tools/perf/Documentation/manpage-suppress-sp.xsl
new file mode 100644
index 000000000..a63c7632a
--- /dev/null
+++ b/tools/perf/Documentation/manpage-suppress-sp.xsl
@@ -0,0 +1,21 @@
+<!-- manpage-suppress-sp.xsl:
+     special settings for manpages rendered from asciidoc+docbook
+     handles erroneous, inline .sp in manpage output of some
+     versions of docbook-xsl -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+		version="1.0">
+
+<!-- attempt to work around spurious .sp at the tail of the line
+     that some versions of docbook stylesheets seem to add -->
+<xsl:template match="simpara">
+  <xsl:variable name="content">
+    <xsl:apply-templates/>
+  </xsl:variable>
+  <xsl:value-of select="normalize-space($content)"/>
+  <xsl:if test="not(ancestor::authorblurb) and
+                not(ancestor::personblurb)">
+    <xsl:text>&#10;&#10;</xsl:text>
+  </xsl:if>
+</xsl:template>
+
+</xsl:stylesheet>
diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt
new file mode 100644
index 000000000..980fe2c29
--- /dev/null
+++ b/tools/perf/Documentation/perf-annotate.txt
@@ -0,0 +1,157 @@
+perf-annotate(1)
+================
+
+NAME
+----
+perf-annotate - Read perf.data (created by perf record) and display annotated code
+
+SYNOPSIS
+--------
+[verse]
+'perf annotate' [-i <file> | --input=file] [symbol_name]
+
+DESCRIPTION
+-----------
+This command reads the input file and displays an annotated version of the
+code. If the object file has debug symbols then the source code will be
+displayed alongside assembly code.
+
+If there is no debug info in the object, then annotated assembly is displayed.
+
+OPTIONS
+-------
+-i::
+--input=<file>::
+        Input file name. (default: perf.data unless stdin is a fifo)
+
+-d::
+--dsos=<dso[,dso...]>::
+        Only consider symbols in these dsos.
+-s::
+--symbol=<symbol>::
+        Symbol to annotate.
+
+-f::
+--force::
+        Don't do ownership validation.
+
+-v::
+--verbose::
+        Be more verbose. (Show symbol address, etc)
+
+-q::
+--quiet::
+	Do not show any warnings or messages.  (Suppress -v)
+
+-n::
+--show-nr-samples::
+	Show the number of samples for each symbol
+
+-D::
+--dump-raw-trace::
+        Dump raw trace in ASCII.
+
+-k::
+--vmlinux=<file>::
+        vmlinux pathname.
+
+--ignore-vmlinux::
+	Ignore vmlinux files.
+
+--itrace::
+	Options for decoding instruction tracing data. The options are:
+
+include::itrace.txt[]
+
+	To disable decoding entirely, use --no-itrace.
+
+-m::
+--modules::
+        Load module symbols. WARNING: use only with -k and LIVE kernel.
+
+-l::
+--print-line::
+        Print matching source lines (may be slow).
+
+-P::
+--full-paths::
+        Don't shorten the displayed pathnames.
+
+--stdio:: Use the stdio interface.
+
+--stdio2:: Use the stdio2 interface, non-interactive, uses the TUI formatting.
+
+--stdio-color=<mode>::
+	'always', 'never' or 'auto', allowing configuring color output
+	via the command line, in addition to via "color.ui" .perfconfig.
+	Use '--stdio-color always' to generate color even when redirecting
+	to a pipe or file. Using just '--stdio-color' is equivalent to
+	using 'always'.
+
+--tui:: Use the TUI interface. Use of --tui requires a tty, if one is not
+	present, as when piping to other commands, the stdio interface is
+	used. This interfaces starts by centering on the line with more
+	samples, TAB/UNTAB cycles through the lines with more samples.
+
+--gtk:: Use the GTK interface.
+
+-C::
+--cpu=<cpu>:: Only report samples for the list of CPUs provided. Multiple CPUs can
+	be provided as a comma-separated list with no space: 0,1. Ranges of
+	CPUs are specified with -: 0-2. Default is to report samples on all
+	CPUs.
+
+--asm-raw::
+	Show raw instruction encoding of assembly instructions.
+
+--show-total-period:: Show a column with the sum of periods.
+
+--source::
+	Interleave source code with assembly code. Enabled by default,
+	disable with --no-source.
+
+--symfs=<directory>::
+        Look for files with symbols relative to this directory.
+
+-M::
+--disassembler-style=:: Set disassembler style for objdump.
+
+--objdump=<path>::
+        Path to objdump binary.
+
+--prefix=PREFIX::
+--prefix-strip=N::
+	Remove first N entries from source file path names in executables
+	and add PREFIX. This allows to display source code compiled on systems
+	with different file system layout.
+
+--skip-missing::
+	Skip symbols that cannot be annotated.
+
+--group::
+	Show event group information together
+
+--demangle::
+	Demangle symbol names to human readable form. It's enabled by default,
+	disable with --no-demangle.
+
+--demangle-kernel::
+	Demangle kernel symbol names to human readable form (for C++ kernels).
+
+--percent-type::
+	Set annotation percent type from following choices:
+	  global-period, local-period, global-hits, local-hits
+
+	The local/global keywords set if the percentage is computed
+	in the scope of the function (local) or the whole data (global).
+	The period/hits keywords set the base the percentage is computed
+	on - the samples period or the number of samples (hits).
+
+--percent-limit::
+	Do not show functions which have an overhead under that percent on
+	stdio or stdio2 (Default: 0).  Note that this is about selection of
+	functions to display, not about lines within the function.
+
+SEE ALSO
+--------
+linkperf:perf-record[1], linkperf:perf-report[1]
diff --git a/tools/perf/Documentation/perf-archive.txt b/tools/perf/Documentation/perf-archive.txt
new file mode 100644
index 000000000..ac6ecbb3e
--- /dev/null
+++ b/tools/perf/Documentation/perf-archive.txt
@@ -0,0 +1,22 @@
+perf-archive(1)
+===============
+
+NAME
+----
+perf-archive - Create archive with object files with build-ids found in perf.data file
+
+SYNOPSIS
+--------
+[verse]
+'perf archive' [file]
+
+DESCRIPTION
+-----------
+This command runs perf-buildid-list --with-hits, and collects the files with the
+buildids found so that analysis of perf.data contents can be possible on another
+machine.
+
+
+SEE ALSO
+--------
+linkperf:perf-record[1], linkperf:perf-buildid-list[1], linkperf:perf-report[1]
diff --git a/tools/perf/Documentation/perf-arm-spe.txt b/tools/perf/Documentation/perf-arm-spe.txt
new file mode 100644
index 000000000..bf03222e9
--- /dev/null
+++ b/tools/perf/Documentation/perf-arm-spe.txt
@@ -0,0 +1,218 @@
+perf-arm-spe(1)
+================
+
+NAME
+----
+perf-arm-spe - Support for Arm Statistical Profiling Extension within Perf tools
+
+SYNOPSIS
+--------
+[verse]
+'perf record' -e arm_spe//
+
+DESCRIPTION
+-----------
+
+The SPE (Statistical Profiling Extension) feature provides accurate attribution of latencies and
+ events down to individual instructions. Rather than being interrupt-driven, it picks an
+instruction to sample and then captures data for it during execution. Data includes execution time
+in cycles. For loads and stores it also includes data address, cache miss events, and data origin.
+
+The sampling has 5 stages:
+
+  1. Choose an operation
+  2. Collect data about the operation
+  3. Optionally discard the record based on a filter
+  4. Write the record to memory
+  5. Interrupt when the buffer is full
+
+Choose an operation
+~~~~~~~~~~~~~~~~~~~
+
+This is chosen from a sample population, for SPE this is an IMPLEMENTATION DEFINED choice of all
+architectural instructions or all micro-ops. Sampling happens at a programmable interval. The
+architecture provides a mechanism for the SPE driver to infer the minimum interval at which it should
+sample. This minimum interval is used by the driver if no interval is specified. A pseudo-random
+perturbation is also added to the sampling interval by default.
+
+Collect data about the operation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Program counter, PMU events, timings and data addresses related to the operation are recorded.
+Sampling ensures there is only one sampled operation is in flight.
+
+Optionally discard the record based on a filter
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Based on programmable criteria, choose whether to keep the record or discard it. If the record is
+discarded then the flow stops here for this sample.
+
+Write the record to memory
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The record is appended to a memory buffer
+
+Interrupt when the buffer is full
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When the buffer fills, an interrupt is sent and the driver signals Perf to collect the records.
+Perf saves the raw data in the perf.data file.
+
+Opening the file
+----------------
+
+Up until this point no decoding of the SPE data was done by either the kernel or Perf. Only when the
+recorded file is opened with 'perf report' or 'perf script' does the decoding happen. When decoding
+the data, Perf generates "synthetic samples" as if these were generated at the time of the
+recording. These samples are the same as if normal sampling was done by Perf without using SPE,
+although they may have more attributes associated with them. For example a normal sample may have
+just the instruction pointer, but an SPE sample can have data addresses and latency attributes.
+
+Why Sampling?
+-------------
+
+ - Sampling, rather than tracing, cuts down the profiling problem to something more manageable for
+ hardware. Only one sampled operation is in flight at a time.
+
+ - Allows precise attribution data, including: Full PC of instruction, data virtual and physical
+ addresses.
+
+ - Allows correlation between an instruction and events, such as TLB and cache miss. (Data source
+ indicates which particular cache was hit, but the meaning is implementation defined because
+ different implementations can have different cache configurations.)
+
+However, SPE does not provide any call-graph information, and relies on statistical methods.
+
+Collisions
+----------
+
+When an operation is sampled while a previous sampled operation has not finished, a collision
+occurs. The new sample is dropped. Collisions affect the integrity of the data, so the sample rate
+should be set to avoid collisions.
+
+The 'sample_collision' PMU event can be used to determine the number of lost samples. Although this
+count is based on collisions _before_ filtering occurs. Therefore this can not be used as an exact
+number for samples dropped that would have made it through the filter, but can be a rough
+guide.
+
+The effect of microarchitectural sampling
+-----------------------------------------
+
+If an implementation samples micro-operations instead of instructions, the results of sampling must
+be weighted accordingly.
+
+For example, if a given instruction A is always converted into two micro-operations, A0 and A1, it
+becomes twice as likely to appear in the sample population.
+
+The coarse effect of conversions, and, if applicable, sampling of speculative operations, can be
+estimated from the 'sample_pop' and 'inst_retired' PMU events.
+
+Kernel Requirements
+-------------------
+
+The ARM_SPE_PMU config must be set to build as either a module or statically.
+
+Depending on CPU model, the kernel may need to be booted with page table isolation disabled
+(kpti=off). If KPTI needs to be disabled, this will fail with a console message "profiling buffer
+inaccessible. Try passing 'kpti=off' on the kernel command line".
+
+Capturing SPE with perf command-line tools
+------------------------------------------
+
+You can record a session with SPE samples:
+
+  perf record -e arm_spe// -- ./mybench
+
+The sample period is set from the -c option, and because the minimum interval is used by default
+it's recommended to set this to a higher value. The value is written to PMSIRR.INTERVAL.
+
+Config parameters
+~~~~~~~~~~~~~~~~~
+
+These are placed between the // in the event and comma separated. For example '-e
+arm_spe/load_filter=1,min_latency=10/'
+
+  branch_filter=1     - collect branches only (PMSFCR.B)
+  event_filter=<mask> - filter on specific events (PMSEVFR) - see bitfield description below
+  jitter=1            - use jitter to avoid resonance when sampling (PMSIRR.RND)
+  load_filter=1       - collect loads only (PMSFCR.LD)
+  min_latency=<n>     - collect only samples with this latency or higher* (PMSLATFR)
+  pa_enable=1         - collect physical address (as well as VA) of loads/stores (PMSCR.PA) - requires privilege
+  pct_enable=1        - collect physical timestamp instead of virtual timestamp (PMSCR.PCT) - requires privilege
+  store_filter=1      - collect stores only (PMSFCR.ST)
+  ts_enable=1         - enable timestamping with value of generic timer (PMSCR.TS)
+
++++*+++ Latency is the total latency from the point at which sampling started on that instruction, rather
+than only the execution latency.
+
+Only some events can be filtered on; these include:
+
+  bit 1     - instruction retired (i.e. omit speculative instructions)
+  bit 3     - L1D refill
+  bit 5     - TLB refill
+  bit 7     - mispredict
+  bit 11    - misaligned access
+
+So to sample just retired instructions:
+
+  perf record -e arm_spe/event_filter=2/ -- ./mybench
+
+or just mispredicted branches:
+
+  perf record -e arm_spe/event_filter=0x80/ -- ./mybench
+
+Viewing the data
+~~~~~~~~~~~~~~~~~
+
+By default perf report and perf script will assign samples to separate groups depending on the
+attributes/events of the SPE record. Because instructions can have multiple events associated with
+them, the samples in these groups are not necessarily unique. For example perf report shows these
+groups:
+
+  Available samples
+  0 arm_spe//
+  0 dummy:u
+  21 l1d-miss
+  897 l1d-access
+  5 llc-miss
+  7 llc-access
+  2 tlb-miss
+  1K tlb-access
+  36 branch-miss
+  0 remote-access
+  900 memory
+
+The arm_spe// and dummy:u events are implementation details and are expected to be empty.
+
+To get a full list of unique samples that are not sorted into groups, set the itrace option to
+generate 'instruction' samples. The period option is also taken into account, so set it to 1
+instruction unless you want to further downsample the already sampled SPE data:
+
+  perf report --itrace=i1i
+
+Memory access details are also stored on the samples and this can be viewed with:
+
+  perf report --mem-mode
+
+Common errors
+~~~~~~~~~~~~~
+
+ - "Cannot find PMU `arm_spe'. Missing kernel support?"
+
+   Module not built or loaded, KPTI not disabled (see above), or running on a VM
+
+ - "Arm SPE CONTEXT packets not found in the traces."
+
+   Root privilege is required to collect context packets. But these only increase the accuracy of
+   assigning PIDs to kernel samples. For userspace sampling this can be ignored.
+
+ - Excessively large perf.data file size
+
+   Increase sampling interval (see above)
+
+
+SEE ALSO
+--------
+
+linkperf:perf-record[1], linkperf:perf-script[1], linkperf:perf-report[1],
+linkperf:perf-inject[1]
diff --git a/tools/perf/Documentation/perf-bench.txt b/tools/perf/Documentation/perf-bench.txt
new file mode 100644
index 000000000..a0529c7fa
--- /dev/null
+++ b/tools/perf/Documentation/perf-bench.txt
@@ -0,0 +1,238 @@
+perf-bench(1)
+=============
+
+NAME
+----
+perf-bench - General framework for benchmark suites
+
+SYNOPSIS
+--------
+[verse]
+'perf bench' [<common options>] <subsystem> <suite> [<options>]
+
+DESCRIPTION
+-----------
+This 'perf bench' command is a general framework for benchmark suites.
+
+COMMON OPTIONS
+--------------
+-r::
+--repeat=::
+Specify amount of times to repeat the run (default 10).
+
+-f::
+--format=::
+Specify format style.
+Current available format styles are:
+
+'default'::
+Default style. This is mainly for human reading.
+---------------------
+% perf bench sched pipe                      # with no style specified
+(executing 1000000 pipe operations between two tasks)
+        Total time:5.855 sec
+                5.855061 usecs/op
+		170792 ops/sec
+---------------------
+
+'simple'::
+This simple style is friendly for automated
+processing by scripts.
+---------------------
+% perf bench --format=simple sched pipe      # specified simple
+5.988
+---------------------
+
+SUBSYSTEM
+---------
+
+'sched'::
+	Scheduler and IPC mechanisms.
+
+'syscall'::
+	System call performance (throughput).
+
+'mem'::
+	Memory access performance.
+
+'numa'::
+	NUMA scheduling and MM benchmarks.
+
+'futex'::
+	Futex stressing benchmarks.
+
+'epoll'::
+	Eventpoll (epoll) stressing benchmarks.
+
+'internals'::
+	Benchmark internal perf functionality.
+
+'all'::
+	All benchmark subsystems.
+
+SUITES FOR 'sched'
+~~~~~~~~~~~~~~~~~~
+*messaging*::
+Suite for evaluating performance of scheduler and IPC mechanisms.
+Based on hackbench by Rusty Russell.
+
+Options of *messaging*
+^^^^^^^^^^^^^^^^^^^^^^
+-p::
+--pipe::
+Use pipe() instead of socketpair()
+
+-t::
+--thread::
+Be multi thread instead of multi process
+
+-g::
+--group=::
+Specify number of groups
+
+-l::
+--nr_loops=::
+Specify number of loops
+
+Example of *messaging*
+^^^^^^^^^^^^^^^^^^^^^^
+
+---------------------
+% perf bench sched messaging                 # run with default
+options (20 sender and receiver processes per group)
+(10 groups == 400 processes run)
+
+      Total time:0.308 sec
+
+% perf bench sched messaging -t -g 20        # be multi-thread, with 20 groups
+(20 sender and receiver threads per group)
+(20 groups == 800 threads run)
+
+      Total time:0.582 sec
+---------------------
+
+*pipe*::
+Suite for pipe() system call.
+Based on pipe-test-1m.c by Ingo Molnar.
+
+Options of *pipe*
+^^^^^^^^^^^^^^^^^
+-l::
+--loop=::
+Specify number of loops.
+
+Example of *pipe*
+^^^^^^^^^^^^^^^^^
+
+---------------------
+% perf bench sched pipe
+(executing 1000000 pipe operations between two tasks)
+
+        Total time:8.091 sec
+                8.091833 usecs/op
+                123581 ops/sec
+
+% perf bench sched pipe -l 1000              # loop 1000
+(executing 1000 pipe operations between two tasks)
+
+        Total time:0.016 sec
+                16.948000 usecs/op
+                59004 ops/sec
+---------------------
+
+SUITES FOR 'syscall'
+~~~~~~~~~~~~~~~~~~
+*basic*::
+Suite for evaluating performance of core system call throughput (both usecs/op and ops/sec metrics).
+This uses a single thread simply doing getppid(2), which is a simple syscall where the result is not
+cached by glibc.
+
+
+SUITES FOR 'mem'
+~~~~~~~~~~~~~~~~
+*memcpy*::
+Suite for evaluating performance of simple memory copy in various ways.
+
+Options of *memcpy*
+^^^^^^^^^^^^^^^^^^^
+-l::
+--size::
+Specify size of memory to copy (default: 1MB).
+Available units are B, KB, MB, GB and TB (case insensitive).
+
+-f::
+--function::
+Specify function to copy (default: default).
+Available functions are depend on the architecture.
+On x86-64, x86-64-unrolled, x86-64-movsq and x86-64-movsb are supported.
+
+-l::
+--nr_loops::
+Repeat memcpy invocation this number of times.
+
+-c::
+--cycles::
+Use perf's cpu-cycles event instead of gettimeofday syscall.
+
+*memset*::
+Suite for evaluating performance of simple memory set in various ways.
+
+Options of *memset*
+^^^^^^^^^^^^^^^^^^^
+-l::
+--size::
+Specify size of memory to set (default: 1MB).
+Available units are B, KB, MB, GB and TB (case insensitive).
+
+-f::
+--function::
+Specify function to set (default: default).
+Available functions are depend on the architecture.
+On x86-64, x86-64-unrolled, x86-64-stosq and x86-64-stosb are supported.
+
+-l::
+--nr_loops::
+Repeat memset invocation this number of times.
+
+-c::
+--cycles::
+Use perf's cpu-cycles event instead of gettimeofday syscall.
+
+SUITES FOR 'numa'
+~~~~~~~~~~~~~~~~~
+*mem*::
+Suite for evaluating NUMA workloads.
+
+SUITES FOR 'futex'
+~~~~~~~~~~~~~~~~~~
+*hash*::
+Suite for evaluating hash tables.
+
+*wake*::
+Suite for evaluating wake calls.
+
+*wake-parallel*::
+Suite for evaluating parallel wake calls.
+
+*requeue*::
+Suite for evaluating requeue calls.
+
+*lock-pi*::
+Suite for evaluating futex lock_pi calls.
+
+SUITES FOR 'epoll'
+~~~~~~~~~~~~~~~~~~
+*wait*::
+Suite for evaluating concurrent epoll_wait calls.
+
+*ctl*::
+Suite for evaluating multiple epoll_ctl calls.
+
+SUITES FOR 'internals'
+~~~~~~~~~~~~~~~~~~~~~~
+*synthesize*::
+Suite for evaluating perf's event synthesis performance.
+
+SEE ALSO
+--------
+linkperf:perf[1]
diff --git a/tools/perf/Documentation/perf-buildid-cache.txt b/tools/perf/Documentation/perf-buildid-cache.txt
new file mode 100644
index 000000000..7e44b419d
--- /dev/null
+++ b/tools/perf/Documentation/perf-buildid-cache.txt
@@ -0,0 +1,88 @@
+perf-buildid-cache(1)
+=====================
+
+NAME
+----
+perf-buildid-cache - Manage build-id cache.
+
+SYNOPSIS
+--------
+[verse]
+'perf buildid-cache <options>'
+
+DESCRIPTION
+-----------
+This command manages the build-id cache. It can add, remove, update and purge
+files to/from the cache. In the future it should as well set upper limits for
+the space used by the cache, etc.
+This also scans the target binary for SDT (Statically Defined Tracing) and
+record it along with the buildid-cache, which will be used by perf-probe.
+For more details, see linkperf:perf-probe[1].
+
+OPTIONS
+-------
+-a::
+--add=::
+        Add specified file to the cache.
+-f::
+--force::
+	Don't complain, do it.
+-k::
+--kcore::
+        Add specified kcore file to the cache. For the current host that is
+        /proc/kcore which requires root permissions to read. Be aware that
+        running 'perf buildid-cache' as root may update root's build-id cache
+        not the user's. Use the -v option to see where the file is created.
+        Note that the copied file contains only code sections not the whole core
+        image. Note also that files "kallsyms" and "modules" must also be in the
+        same directory and are also copied.  All 3 files are created with read
+        permissions for root only. kcore will not be added if there is already a
+        kcore in the cache (with the same build-id) that has the same modules at
+        the same addresses. Use the -v option to see if a copy of kcore is
+        actually made.
+-r::
+--remove=::
+        Remove a cached binary which has same build-id of specified file
+        from the cache.
+-p::
+--purge=::
+        Purge all cached binaries including older caches which have specified
+	path from the cache.
+-P::
+--purge-all::
+	Purge all cached binaries. This will flush out entire cache.
+-M::
+--missing=::
+	List missing build ids in the cache for the specified file.
+-u::
+--update=::
+	Update specified file of the cache. Note that this doesn't remove
+	older entries since those may be still needed for annotating old
+	(or remote) perf.data. Only if there is already a cache which has
+	exactly same build-id, that is replaced by new one. It can be used
+	to update kallsyms and kernel dso to vmlinux in order to support
+	annotation.
+-l::
+--list::
+	List all valid binaries from cache.
+-v::
+--verbose::
+	Be more verbose.
+
+--target-ns=PID:
+	Obtain mount namespace information from the target pid.  This is
+	used when creating a uprobe for a process that resides in a
+	different mount namespace from the perf(1) utility.
+
+--debuginfod[=URLs]::
+	Specify debuginfod URL to be used when retrieving perf.data binaries,
+	it follows the same syntax as the DEBUGINFOD_URLS variable, like:
+
+	  buildid-cache.debuginfod=http://192.168.122.174:8002
+
+	If the URLs is not specified, the value of DEBUGINFOD_URLS
+	system environment variable is used.
+
+SEE ALSO
+--------
+linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-buildid-list[1]
diff --git a/tools/perf/Documentation/perf-buildid-list.txt b/tools/perf/Documentation/perf-buildid-list.txt
new file mode 100644
index 000000000..e1e8fdbe0
--- /dev/null
+++ b/tools/perf/Documentation/perf-buildid-list.txt
@@ -0,0 +1,47 @@
+perf-buildid-list(1)
+====================
+
+NAME
+----
+perf-buildid-list - List the buildids in a perf.data file
+
+SYNOPSIS
+--------
+[verse]
+'perf buildid-list <options>'
+
+DESCRIPTION
+-----------
+This command displays the buildids found in a perf.data file, so that other
+tools can be used to fetch packages with matching symbol tables for use by
+perf report.
+
+It can also be used to show the build id of the running kernel or in an ELF
+file using -i/--input.
+
+OPTIONS
+-------
+-H::
+--with-hits::
+        Show only DSOs with hits.
+-i::
+--input=::
+        Input file name. (default: perf.data unless stdin is a fifo)
+-f::
+--force::
+	Don't do ownership validation.
+-k::
+--kernel::
+	Show running kernel build id.
+-m::
+--kernel-maps::
+	Show buildid, start/end text address, and path of running kernel and
+	its modules.
+-v::
+--verbose::
+	Be more verbose.
+
+SEE ALSO
+--------
+linkperf:perf-record[1], linkperf:perf-top[1],
+linkperf:perf-report[1]
diff --git a/tools/perf/Documentation/perf-c2c.txt b/tools/perf/Documentation/perf-c2c.txt
new file mode 100644
index 000000000..5c5eb2def
--- /dev/null
+++ b/tools/perf/Documentation/perf-c2c.txt
@@ -0,0 +1,336 @@
+perf-c2c(1)
+===========
+
+NAME
+----
+perf-c2c - Shared Data C2C/HITM Analyzer.
+
+SYNOPSIS
+--------
+[verse]
+'perf c2c record' [<options>] <command>
+'perf c2c record' [<options>] \-- [<record command options>] <command>
+'perf c2c report' [<options>]
+
+DESCRIPTION
+-----------
+C2C stands for Cache To Cache.
+
+The perf c2c tool provides means for Shared Data C2C/HITM analysis. It allows
+you to track down the cacheline contentions.
+
+On Intel, the tool is based on load latency and precise store facility events
+provided by Intel CPUs. On PowerPC, the tool uses random instruction sampling
+with thresholding feature. On AMD, the tool uses IBS op pmu (due to hardware
+limitations, perf c2c is not supported on Zen3 cpus).
+
+These events provide:
+  - memory address of the access
+  - type of the access (load and store details)
+  - latency (in cycles) of the load access
+
+The c2c tool provide means to record this data and report back access details
+for cachelines with highest contention - highest number of HITM accesses.
+
+The basic workflow with this tool follows the standard record/report phase.
+User uses the record command to record events data and report command to
+display it.
+
+
+RECORD OPTIONS
+--------------
+-e::
+--event=::
+	Select the PMU event. Use 'perf c2c record -e list'
+	to list available events.
+
+-v::
+--verbose::
+	Be more verbose (show counter open errors, etc).
+
+-l::
+--ldlat::
+	Configure mem-loads latency. Supported on Intel and Arm64 processors
+	only. Ignored on other archs.
+
+-k::
+--all-kernel::
+	Configure all used events to run in kernel space.
+
+-u::
+--all-user::
+	Configure all used events to run in user space.
+
+REPORT OPTIONS
+--------------
+-k::
+--vmlinux=<file>::
+	vmlinux pathname
+
+-v::
+--verbose::
+	Be more verbose (show counter open errors, etc).
+
+-i::
+--input::
+	Specify the input file to process.
+
+-N::
+--node-info::
+	Show extra node info in report (see NODE INFO section)
+
+-c::
+--coalesce::
+	Specify sorting fields for single cacheline display.
+	Following fields are available: tid,pid,iaddr,dso
+	(see COALESCE)
+
+-g::
+--call-graph::
+	Setup callchains parameters.
+	Please refer to perf-report man page for details.
+
+--stdio::
+	Force the stdio output (see STDIO OUTPUT)
+
+--stats::
+	Display only statistic tables and force stdio mode.
+
+--full-symbols::
+	Display full length of symbols.
+
+--no-source::
+	Do not display Source:Line column.
+
+--show-all::
+	Show all captured HITM lines, with no regard to HITM % 0.0005 limit.
+
+-f::
+--force::
+	Don't do ownership validation.
+
+-d::
+--display::
+	Switch to HITM type (rmt, lcl) or peer snooping type (peer) to display
+	and sort on. Total HITMs (tot) as default, except Arm64 uses peer mode
+	as default.
+
+--stitch-lbr::
+	Show callgraph with stitched LBRs, which may have more complete
+	callgraph. The perf.data file must have been obtained using
+	perf c2c record --call-graph lbr.
+	Disabled by default. In common cases with call stack overflows,
+	it can recreate better call stacks than the default lbr call stack
+	output. But this approach is not full proof. There can be cases
+	where it creates incorrect call stacks from incorrect matches.
+	The known limitations include exception handing such as
+	setjmp/longjmp will have calls/returns not match.
+
+C2C RECORD
+----------
+The perf c2c record command setup options related to HITM cacheline analysis
+and calls standard perf record command.
+
+Following perf record options are configured by default:
+(check perf record man page for details)
+
+  -W,-d,--phys-data,--sample-cpu
+
+Unless specified otherwise with '-e' option, following events are monitored by
+default on Intel:
+
+  cpu/mem-loads,ldlat=30/P
+  cpu/mem-stores/P
+
+following on AMD:
+
+  ibs_op//
+
+and following on PowerPC:
+
+  cpu/mem-loads/
+  cpu/mem-stores/
+
+User can pass any 'perf record' option behind '--' mark, like (to enable
+callchains and system wide monitoring):
+
+  $ perf c2c record -- -g -a
+
+Please check RECORD OPTIONS section for specific c2c record options.
+
+C2C REPORT
+----------
+The perf c2c report command displays shared data analysis.  It comes in two
+display modes: stdio and tui (default).
+
+The report command workflow is following:
+  - sort all the data based on the cacheline address
+  - store access details for each cacheline
+  - sort all cachelines based on user settings
+  - display data
+
+In general perf report output consist of 2 basic views:
+  1) most expensive cachelines list
+  2) offsets details for each cacheline
+
+For each cacheline in the 1) list we display following data:
+(Both stdio and TUI modes follow the same fields output)
+
+  Index
+  - zero based index to identify the cacheline
+
+  Cacheline
+  - cacheline address (hex number)
+
+  Rmt/Lcl Hitm (Display with HITM types)
+  - cacheline percentage of all Remote/Local HITM accesses
+
+  Peer Snoop (Display with peer type)
+  - cacheline percentage of all peer accesses
+
+  LLC Load Hitm - Total, LclHitm, RmtHitm (For display with HITM types)
+  - count of Total/Local/Remote load HITMs
+
+  Load Peer - Total, Local, Remote (For display with peer type)
+  - count of Total/Local/Remote load from peer cache or DRAM
+
+  Total records
+  - sum of all cachelines accesses
+
+  Total loads
+  - sum of all load accesses
+
+  Total stores
+  - sum of all store accesses
+
+  Store Reference - L1Hit, L1Miss, N/A
+    L1Hit - store accesses that hit L1
+    L1Miss - store accesses that missed L1
+    N/A - store accesses with memory level is not available
+
+  Core Load Hit - FB, L1, L2
+  - count of load hits in FB (Fill Buffer), L1 and L2 cache
+
+  LLC Load Hit - LlcHit, LclHitm
+  - count of LLC load accesses, includes LLC hits and LLC HITMs
+
+  RMT Load Hit - RmtHit, RmtHitm
+  - count of remote load accesses, includes remote hits and remote HITMs;
+    on Arm neoverse cores, RmtHit is used to account remote accesses,
+    includes remote DRAM or any upward cache level in remote node
+
+  Load Dram - Lcl, Rmt
+  - count of local and remote DRAM accesses
+
+For each offset in the 2) list we display following data:
+
+  HITM - Rmt, Lcl (Display with HITM types)
+  - % of Remote/Local HITM accesses for given offset within cacheline
+
+  Peer Snoop - Rmt, Lcl (Display with peer type)
+  - % of Remote/Local peer accesses for given offset within cacheline
+
+  Store Refs - L1 Hit, L1 Miss, N/A
+  - % of store accesses that hit L1, missed L1 and N/A (no available) memory
+    level for given offset within cacheline
+
+  Data address - Offset
+  - offset address
+
+  Pid
+  - pid of the process responsible for the accesses
+
+  Tid
+  - tid of the process responsible for the accesses
+
+  Code address
+  - code address responsible for the accesses
+
+  cycles - rmt hitm, lcl hitm, load (Display with HITM types)
+    - sum of cycles for given accesses - Remote/Local HITM and generic load
+
+  cycles - rmt peer, lcl peer, load (Display with peer type)
+    - sum of cycles for given accesses - Remote/Local peer load and generic load
+
+  cpu cnt
+    - number of cpus that participated on the access
+
+  Symbol
+    - code symbol related to the 'Code address' value
+
+  Shared Object
+    - shared object name related to the 'Code address' value
+
+  Source:Line
+    - source information related to the 'Code address' value
+
+  Node
+    - nodes participating on the access (see NODE INFO section)
+
+NODE INFO
+---------
+The 'Node' field displays nodes that accesses given cacheline
+offset. Its output comes in 3 flavors:
+  - node IDs separated by ','
+  - node IDs with stats for each ID, in following format:
+      Node{cpus %hitms %stores} (Display with HITM types)
+      Node{cpus %peers %stores} (Display with peer type)
+  - node IDs with list of affected CPUs in following format:
+      Node{cpu list}
+
+User can switch between above flavors with -N option or
+use 'n' key to interactively switch in TUI mode.
+
+COALESCE
+--------
+User can specify how to sort offsets for cacheline.
+
+Following fields are available and governs the final
+output fields set for cacheline offsets output:
+
+  tid   - coalesced by process TIDs
+  pid   - coalesced by process PIDs
+  iaddr - coalesced by code address, following fields are displayed:
+             Code address, Code symbol, Shared Object, Source line
+  dso   - coalesced by shared object
+
+By default the coalescing is setup with 'pid,iaddr'.
+
+STDIO OUTPUT
+------------
+The stdio output displays data on standard output.
+
+Following tables are displayed:
+  Trace Event Information
+  - overall statistics of memory accesses
+
+  Global Shared Cache Line Event Information
+  - overall statistics on shared cachelines
+
+  Shared Data Cache Line Table
+  - list of most expensive cachelines
+
+  Shared Cache Line Distribution Pareto
+  - list of all accessed offsets for each cacheline
+
+TUI OUTPUT
+----------
+The TUI output provides interactive interface to navigate
+through cachelines list and to display offset details.
+
+For details please refer to the help window by pressing '?' key.
+
+CREDITS
+-------
+Although Don Zickus, Dick Fowles and Joe Mario worked together
+to get this implemented, we got lots of early help from Arnaldo
+Carvalho de Melo, Stephane Eranian, Jiri Olsa and Andi Kleen.
+
+C2C BLOG
+--------
+Check Joe's blog on c2c tool for detailed use case explanation:
+  https://joemario.github.io/blog/2016/09/01/c2c-blog/
+
+SEE ALSO
+--------
+linkperf:perf-record[1], linkperf:perf-mem[1]
diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt
new file mode 100644
index 000000000..39c890ead
--- /dev/null
+++ b/tools/perf/Documentation/perf-config.txt
@@ -0,0 +1,755 @@
+perf-config(1)
+==============
+
+NAME
+----
+perf-config - Get and set variables in a configuration file.
+
+SYNOPSIS
+--------
+[verse]
+'perf config' [<file-option>] [section.name[=value] ...]
+or
+'perf config' [<file-option>] -l | --list
+
+DESCRIPTION
+-----------
+You can manage variables in a configuration file with this command.
+
+OPTIONS
+-------
+
+-l::
+--list::
+	Show current config variables, name and value, for all sections.
+
+--user::
+	For writing and reading options: write to user
+	'$HOME/.perfconfig' file or read it.
+
+--system::
+	For writing and reading options: write to system-wide
+	'$(sysconfdir)/perfconfig' or read it.
+
+CONFIGURATION FILE
+------------------
+
+The perf configuration file contains many variables to change various
+aspects of each of its tools, including output, disk usage, etc.
+The '$HOME/.perfconfig' file is used to store a per-user configuration.
+The file '$(sysconfdir)/perfconfig' can be used to
+store a system-wide default configuration.
+
+One an disable reading config files by setting the PERF_CONFIG environment
+variable to /dev/null, or provide an alternate config file by setting that
+variable.
+
+When reading or writing, the values are read from the system and user
+configuration files by default, and options '--system' and '--user'
+can be used to tell the command to read from or write to only that location.
+
+Syntax
+~~~~~~
+
+The file consist of sections. A section starts with its name
+surrounded by square brackets and continues till the next section
+begins. Each variable must be in a section, and have the form
+'name = value', for example:
+
+	[section]
+		name1 = value1
+		name2 = value2
+
+Section names are case sensitive and can contain any characters except
+newline (double quote `"` and backslash have to be escaped as `\"` and `\\`,
+respectively). Section headers can't span multiple lines.
+
+Example
+~~~~~~~
+
+Given a $HOME/.perfconfig like this:
+
+#
+# This is the config file, and
+# a '#' and ';' character indicates a comment
+#
+
+	[colors]
+		# Color variables
+		top = red, default
+		medium = green, default
+		normal = lightgray, default
+		selected = white, lightgray
+		jump_arrows = blue, default
+		addr = magenta, default
+		root = white, blue
+
+	[tui]
+		# Defaults if linked with libslang
+		report = on
+		annotate = on
+		top = on
+
+	[buildid]
+		# Default, disable using /dev/null
+		dir = ~/.debug
+
+	[annotate]
+		# Defaults
+		hide_src_code = false
+		use_offset = true
+		jump_arrows = true
+		show_nr_jumps = false
+
+	[help]
+		# Format can be man, info, web or html
+		format = man
+		autocorrect = 0
+
+	[ui]
+		show-headers = true
+
+	[call-graph]
+		# fp (framepointer), dwarf
+		record-mode = fp
+		print-type = graph
+		order = caller
+		sort-key = function
+
+	[report]
+		# Defaults
+		sort_order = comm,dso,symbol
+		percent-limit = 0
+		queue-size = 0
+		children = true
+		group = true
+		skip-empty = true
+
+	[llvm]
+		dump-obj = true
+		clang-opt = -g
+
+You can hide source code of annotate feature setting the config to false with
+
+	% perf config annotate.hide_src_code=true
+
+If you want to add or modify several config items, you can do like
+
+	% perf config ui.show-headers=false kmem.default=slab
+
+To modify the sort order of report functionality in user config file(i.e. `~/.perfconfig`), do
+
+	% perf config --user report.sort-order=srcline
+
+To change colors of selected line to other foreground and background colors
+in system config file (i.e. `$(sysconf)/perfconfig`), do
+
+	% perf config --system colors.selected=yellow,green
+
+To query the record mode of call graph, do
+
+	% perf config call-graph.record-mode
+
+If you want to know multiple config key/value pairs, you can do like
+
+	% perf config report.queue-size call-graph.order report.children
+
+To query the config value of sort order of call graph in user config file (i.e. `~/.perfconfig`), do
+
+	% perf config --user call-graph.sort-order
+
+To query the config value of buildid directory in system config file (i.e. `$(sysconf)/perfconfig`), do
+
+	% perf config --system buildid.dir
+
+Variables
+~~~~~~~~~
+
+colors.*::
+	The variables for customizing the colors used in the output for the
+	'report', 'top' and 'annotate' in the TUI. They should specify the
+	foreground and background colors, separated by a comma, for example:
+
+		medium = green, lightgray
+
+	If you want to use the color configured for you terminal, just leave it
+	as 'default', for example:
+
+		medium = default, lightgray
+
+	Available colors:
+	red, yellow, green, cyan, gray, black, blue,
+	white, default, magenta, lightgray
+
+	colors.top::
+		'top' means a overhead percentage which is more than 5%.
+		And values of this variable specify percentage colors.
+		Basic key values are foreground-color 'red' and
+		background-color 'default'.
+	colors.medium::
+		'medium' means a overhead percentage which has more than 0.5%.
+		Default values are 'green' and 'default'.
+	colors.normal::
+		'normal' means the rest of overhead percentages
+		except 'top', 'medium', 'selected'.
+		Default values are 'lightgray' and 'default'.
+	colors.selected::
+		This selects the colors for the current entry in a list of entries
+		from sub-commands (top, report, annotate).
+		Default values are 'black' and 'lightgray'.
+	colors.jump_arrows::
+		Colors for jump arrows on assembly code listings
+		such as 'jns', 'jmp', 'jane', etc.
+		Default values are 'blue', 'default'.
+	colors.addr::
+		This selects colors for addresses from 'annotate'.
+		Default values are 'magenta', 'default'.
+	colors.root::
+		Colors for headers in the output of a sub-commands (top, report).
+		Default values are 'white', 'blue'.
+
+core.*::
+	core.proc-map-timeout::
+		Sets a timeout (in milliseconds) for parsing /proc/<pid>/maps files.
+		Can be overridden by the --proc-map-timeout option on supported
+		subcommands. The default timeout is 500ms.
+
+tui.*, gtk.*::
+	Subcommands that can be configured here are 'top', 'report' and 'annotate'.
+	These values are booleans, for example:
+
+	[tui]
+		top = true
+
+	will make the TUI be the default for the 'top' subcommand. Those will be
+	available if the required libs were detected at tool build time.
+
+buildid.*::
+	buildid.dir::
+		Each executable and shared library in modern distributions comes with a
+		content based identifier that, if available, will be inserted in a
+		'perf.data' file header to, at analysis time find what is needed to do
+		symbol resolution, code annotation, etc.
+
+		The recording tools also stores a hard link or copy in a per-user
+		directory, $HOME/.debug/, of binaries, shared libraries, /proc/kallsyms
+		and /proc/kcore files to be used at analysis time.
+
+		The buildid.dir variable can be used to either change this directory
+		cache location, or to disable it altogether. If you want to disable it,
+		set buildid.dir to /dev/null. The default is $HOME/.debug
+
+buildid-cache.*::
+	buildid-cache.debuginfod=URLs
+		Specify debuginfod URLs to be used when retrieving perf.data binaries,
+		it follows the same syntax as the DEBUGINFOD_URLS variable, like:
+
+		  buildid-cache.debuginfod=http://192.168.122.174:8002
+
+annotate.*::
+	These are in control of addresses, jump function, source code
+	in lines of assembly code from a specific program.
+
+	annotate.disassembler_style:
+		Use this to change the default disassembler style to some other value
+		supported by binutils, such as "intel", see the '-M' option help in the
+		'objdump' man page.
+
+	annotate.hide_src_code::
+		If a program which is analyzed has source code,
+		this option lets 'annotate' print a list of assembly code with the source code.
+		For example, let's see a part of a program. There're four lines.
+		If this option is 'true', they can be printed
+		without source code from a program as below.
+
+		│        push   %rbp
+		│        mov    %rsp,%rbp
+		│        sub    $0x10,%rsp
+		│        mov    (%rdi),%rdx
+
+		But if this option is 'false', source code of the part
+		can be also printed as below. Default is 'false'.
+
+		│      struct rb_node *rb_next(const struct rb_node *node)
+		│      {
+		│        push   %rbp
+		│        mov    %rsp,%rbp
+		│        sub    $0x10,%rsp
+		│              struct rb_node *parent;
+		│
+		│              if (RB_EMPTY_NODE(node))
+		│        mov    (%rdi),%rdx
+		│              return n;
+
+		This option works with tui, stdio2 browsers.
+
+        annotate.use_offset::
+		Basing on a first address of a loaded function, offset can be used.
+		Instead of using original addresses of assembly code,
+		addresses subtracted from a base address can be printed.
+		Let's illustrate an example.
+		If a base address is 0XFFFFFFFF81624d50 as below,
+
+		ffffffff81624d50 <load0>
+
+		an address on assembly code has a specific absolute address as below
+
+		ffffffff816250b8:│  mov    0x8(%r14),%rdi
+
+		but if use_offset is 'true', an address subtracted from a base address is printed.
+		Default is true. This option is only applied to TUI.
+
+		             368:│  mov    0x8(%r14),%rdi
+
+		This option works with tui, stdio2 browsers.
+
+	annotate.jump_arrows::
+		There can be jump instruction among assembly code.
+		Depending on a boolean value of jump_arrows,
+		arrows can be printed or not which represent
+		where do the instruction jump into as below.
+
+		│     ┌──jmp    1333
+		│     │  xchg   %ax,%ax
+		│1330:│  mov    %r15,%r10
+		│1333:└─→cmp    %r15,%r14
+
+		If jump_arrow is 'false', the arrows isn't printed as below.
+		Default is 'false'.
+
+		│      ↓ jmp    1333
+		│        xchg   %ax,%ax
+		│1330:   mov    %r15,%r10
+		│1333:   cmp    %r15,%r14
+
+		This option works with tui browser.
+
+        annotate.show_linenr::
+		When showing source code if this option is 'true',
+		line numbers are printed as below.
+
+		│1628         if (type & PERF_SAMPLE_IDENTIFIER) {
+		│     ↓ jne    508
+		│1628                 data->id = *array;
+		│1629                 array++;
+		│1630         }
+
+		However if this option is 'false', they aren't printed as below.
+		Default is 'false'.
+
+		│             if (type & PERF_SAMPLE_IDENTIFIER) {
+		│     ↓ jne    508
+		│                     data->id = *array;
+		│                     array++;
+		│             }
+
+		This option works with tui, stdio2 browsers.
+
+        annotate.show_nr_jumps::
+		Let's see a part of assembly code.
+
+		│1382:   movb   $0x1,-0x270(%rbp)
+
+		If use this, the number of branches jumping to that address can be printed as below.
+		Default is 'false'.
+
+		│1 1382:   movb   $0x1,-0x270(%rbp)
+
+		This option works with tui, stdio2 browsers.
+
+        annotate.show_total_period::
+		To compare two records on an instruction base, with this option
+		provided, display total number of samples that belong to a line
+		in assembly code. If this option is 'true', total periods are printed
+		instead of percent values as below.
+
+		  302 │      mov    %eax,%eax
+
+		But if this option is 'false', percent values for overhead are printed i.e.
+		Default is 'false'.
+
+		99.93 │      mov    %eax,%eax
+
+		This option works with tui, stdio2, stdio browsers.
+
+	annotate.show_nr_samples::
+		By default perf annotate shows percentage of samples. This option
+		can be used to print absolute number of samples. Ex, when set as
+		false:
+
+		Percent│
+		 74.03 │      mov    %fs:0x28,%rax
+
+		When set as true:
+
+		Samples│
+		     6 │      mov    %fs:0x28,%rax
+
+		This option works with tui, stdio2, stdio browsers.
+
+	annotate.offset_level::
+		Default is '1', meaning just jump targets will have offsets show right beside
+		the instruction. When set to '2' 'call' instructions will also have its offsets
+		shown, 3 or higher will show offsets for all instructions.
+
+		This option works with tui, stdio2 browsers.
+
+	annotate.demangle::
+		Demangle symbol names to human readable form. Default is 'true'.
+
+	annotate.demangle_kernel::
+		Demangle kernel symbol names to human readable form. Default is 'true'.
+
+hist.*::
+	hist.percentage::
+		This option control the way to calculate overhead of filtered entries -
+		that means the value of this option is effective only if there's a
+		filter (by comm, dso or symbol name). Suppose a following example:
+
+		       Overhead  Symbols
+		       ........  .......
+		        33.33%     foo
+		        33.33%     bar
+		        33.33%     baz
+
+	       This is an original overhead and we'll filter out the first 'foo'
+	       entry. The value of 'relative' would increase the overhead of 'bar'
+	       and 'baz' to 50.00% for each, while 'absolute' would show their
+	       current overhead (33.33%).
+
+ui.*::
+	ui.show-headers::
+		This option controls display of column headers (like 'Overhead' and 'Symbol')
+		in 'report' and 'top'. If this option is false, they are hidden.
+		This option is only applied to TUI.
+
+call-graph.*::
+	The following controls the handling of call-graphs (obtained via the
+	-g/--call-graph options).
+
+	call-graph.record-mode::
+		The mode for user space can be 'fp' (frame pointer), 'dwarf'
+		and 'lbr'.  The value 'dwarf' is effective only if libunwind
+		(or a recent version of libdw) is present on the system;
+		the value 'lbr' only works for certain cpus. The method for
+		kernel space is controlled not by this option but by the
+		kernel config (CONFIG_UNWINDER_*).
+
+	call-graph.dump-size::
+		The size of stack to dump in order to do post-unwinding. Default is 8192 (byte).
+		When using dwarf into record-mode, the default size will be used if omitted.
+
+	call-graph.print-type::
+		The print-types can be graph (graph absolute), fractal (graph relative),
+		flat and folded. This option controls a way to show overhead for each callchain
+		entry. Suppose a following example.
+
+                Overhead  Symbols
+                ........  .......
+                  40.00%  foo
+                          |
+                          ---foo
+                             |
+                             |--50.00%--bar
+                             |          main
+                             |
+                              --50.00%--baz
+                                        main
+
+		This output is a 'fractal' format. The 'foo' came from 'bar' and 'baz' exactly
+		half and half so 'fractal' shows 50.00% for each
+		(meaning that it assumes 100% total overhead of 'foo').
+
+		The 'graph' uses absolute overhead value of 'foo' as total so each of
+		'bar' and 'baz' callchain will have 20.00% of overhead.
+		If 'flat' is used, single column and linear exposure of call chains.
+		'folded' mean call chains are displayed in a line, separated by semicolons.
+
+	call-graph.order::
+		This option controls print order of callchains. The default is
+		'callee' which means callee is printed at top and then followed by its
+		caller and so on. The 'caller' prints it in reverse order.
+
+		If this option is not set and report.children or top.children is
+		set to true (or the equivalent command line option is given),
+		the default value of this option is changed to 'caller' for the
+		execution of 'perf report' or 'perf top'. Other commands will
+		still default to 'callee'.
+
+	call-graph.sort-key::
+		The callchains are merged if they contain same information.
+		The sort-key option determines a way to compare the callchains.
+		A value of 'sort-key' can be 'function' or 'address'.
+		The default is 'function'.
+
+	call-graph.threshold::
+		When there're many callchains it'd print tons of lines. So perf omits
+		small callchains under a certain overhead (threshold) and this option
+		control the threshold. Default is 0.5 (%). The overhead is calculated
+		by value depends on call-graph.print-type.
+
+	call-graph.print-limit::
+		This is a maximum number of lines of callchain printed for a single
+		histogram entry. Default is 0 which means no limitation.
+
+report.*::
+	report.sort_order::
+		Allows changing the default sort order from "comm,dso,symbol" to
+		some other default, for instance "sym,dso" may be more fitting for
+		kernel developers.
+	report.percent-limit::
+		This one is mostly the same as call-graph.threshold but works for
+		histogram entries. Entries having an overhead lower than this
+		percentage will not be printed. Default is '0'. If percent-limit
+		is '10', only entries which have more than 10% of overhead will be
+		printed.
+
+	report.queue-size::
+		This option sets up the maximum allocation size of the internal
+		event queue for ordering events. Default is 0, meaning no limit.
+
+	report.children::
+		'Children' means functions called from another function.
+		If this option is true, 'perf report' cumulates callchains of children
+		and show (accumulated) total overhead as well as 'Self' overhead.
+		Please refer to the 'perf report' manual. The default is 'true'.
+
+	report.group::
+		This option is to show event group information together.
+		Example output with this turned on, notice that there is one column
+		per event in the group, ref-cycles and cycles:
+
+		# group: {ref-cycles,cycles}
+		# ========
+		#
+		# Samples: 7K of event 'anon group { ref-cycles, cycles }'
+		# Event count (approx.): 6876107743
+		#
+		#         Overhead  Command      Shared Object               Symbol
+		# ................  .......  .................  ...................
+		#
+		    99.84%  99.76%  noploop  noploop            [.] main
+		     0.07%   0.00%  noploop  ld-2.15.so         [.] strcmp
+		     0.03%   0.00%  noploop  [kernel.kallsyms]  [k] timerqueue_del
+
+	report.skip-empty::
+		This option can change default stat behavior with empty results.
+		If it's set true, 'perf report --stat' will not show 0 stats.
+
+top.*::
+	top.children::
+		Same as 'report.children'. So if it is enabled, the output of 'top'
+		command will have 'Children' overhead column as well as 'Self' overhead
+		column by default.
+		The default is 'true'.
+
+	top.call-graph::
+		This is identical to 'call-graph.record-mode', except it is
+		applicable only for 'top' subcommand. This option ONLY setup
+		the unwind method. To enable 'perf top' to actually use it,
+		the command line option -g must be specified.
+
+man.*::
+	man.viewer::
+		This option can assign a tool to view manual pages when 'help'
+		subcommand was invoked. Supported tools are 'man', 'woman'
+		(with emacs client) and 'konqueror'. Default is 'man'.
+
+		New man viewer tool can be also added using 'man.<tool>.cmd'
+		or use different path using 'man.<tool>.path' config option.
+
+pager.*::
+	pager.<subcommand>::
+		When the subcommand is run on stdio, determine whether it uses
+		pager or not based on this value. Default is 'unspecified'.
+
+kmem.*::
+	kmem.default::
+		This option decides which allocator is to be analyzed if neither
+		'--slab' nor '--page' option is used. Default is 'slab'.
+
+record.*::
+	record.build-id::
+		This option can be 'cache', 'no-cache', 'skip' or 'mmap'.
+		'cache' is to post-process data and save/update the binaries into
+		the build-id cache (in ~/.debug). This is the default.
+		But if this option is 'no-cache', it will not update the build-id cache.
+		'skip' skips post-processing and does not update the cache.
+		'mmap' skips post-processing and reads build-ids from MMAP events.
+
+	record.call-graph::
+		This is identical to 'call-graph.record-mode', except it is
+		applicable only for 'record' subcommand. This option ONLY setup
+		the unwind method. To enable 'perf record' to actually use it,
+		the command line option -g must be specified.
+
+	record.aio::
+		Use 'n' control blocks in asynchronous (Posix AIO) trace writing
+		mode ('n' default: 1, max: 4).
+
+	record.debuginfod::
+		Specify debuginfod URL to be used when cacheing perf.data binaries,
+		it follows the same syntax as the DEBUGINFOD_URLS variable, like:
+
+		  http://192.168.122.174:8002
+
+		If the URLs is 'system', the value of DEBUGINFOD_URLS system environment
+		variable is used.
+
+diff.*::
+	diff.order::
+		This option sets the number of columns to sort the result.
+		The default is 0, which means sorting by baseline.
+		Setting it to 1 will sort the result by delta (or other
+		compute method selected).
+
+	diff.compute::
+		This options sets the method for computing the diff result.
+		Possible values are 'delta', 'delta-abs', 'ratio' and
+		'wdiff'.  Default is 'delta'.
+
+trace.*::
+	trace.add_events::
+		Allows adding a set of events to add to the ones specified
+		by the user, or use as a default one if none was specified.
+		The initial use case is to add augmented_raw_syscalls.o to
+		activate the 'perf trace' logic that looks for syscall
+		pointer contents after the normal tracepoint payload.
+
+	trace.args_alignment::
+		Number of columns to align the argument list, default is 70,
+		use 40 for the strace default, zero to no alignment.
+
+	trace.no_inherit::
+		Do not follow children threads.
+
+	trace.show_arg_names::
+		Should syscall argument names be printed? If not then trace.show_zeros
+		will be set.
+
+	trace.show_duration::
+		Show syscall duration.
+
+	trace.show_prefix::
+		If set to 'yes' will show common string prefixes in tables. The default
+		is to remove the common prefix in things like "MAP_SHARED", showing just "SHARED".
+
+	trace.show_timestamp::
+		Show syscall start timestamp.
+
+	trace.show_zeros::
+		Do not suppress syscall arguments that are equal to zero.
+
+	trace.tracepoint_beautifiers::
+		Use "libtraceevent" to use that library to augment the tracepoint arguments,
+		"libbeauty", the default, to use the same argument beautifiers used in the
+		strace-like sys_enter+sys_exit lines.
+
+ftrace.*::
+	ftrace.tracer::
+		Can be used to select the default tracer when neither -G nor
+		-F option is not specified. Possible values are 'function' and
+		'function_graph'.
+
+llvm.*::
+	llvm.clang-path::
+		Path to clang. If omit, search it from $PATH.
+
+	llvm.clang-bpf-cmd-template::
+		Cmdline template. Below lines show its default value. Environment
+		variable is used to pass options.
+		"$CLANG_EXEC -D__KERNEL__ -D__NR_CPUS__=$NR_CPUS "\
+		"-DLINUX_VERSION_CODE=$LINUX_VERSION_CODE "	\
+		"$CLANG_OPTIONS $PERF_BPF_INC_OPTIONS $KERNEL_INC_OPTIONS " \
+		"-Wno-unused-value -Wno-pointer-sign "		\
+		"-working-directory $WORKING_DIR "		\
+		"-c \"$CLANG_SOURCE\" -target bpf $CLANG_EMIT_LLVM -O2 -o - $LLVM_OPTIONS_PIPE"
+
+	llvm.clang-opt::
+		Options passed to clang.
+
+	llvm.kbuild-dir::
+		kbuild directory. If not set, use /lib/modules/`uname -r`/build.
+		If set to "" deliberately, skip kernel header auto-detector.
+
+	llvm.kbuild-opts::
+		Options passed to 'make' when detecting kernel header options.
+
+	llvm.dump-obj::
+		Enable perf dump BPF object files compiled by LLVM.
+
+	llvm.opts::
+		Options passed to llc.
+
+samples.*::
+
+	samples.context::
+		Define how many ns worth of time to show
+		around samples in perf report sample context browser.
+
+scripts.*::
+
+	Any option defines a script that is added to the scripts menu
+	in the interactive perf browser and whose output is displayed.
+	The name of the option is the name, the value is a script command line.
+	The script gets the same options passed as a full perf script,
+	in particular -i perfdata file, --cpu, --tid
+
+convert.*::
+
+	convert.queue-size::
+		Limit the size of ordered_events queue, so we could control
+		allocation size of perf data files without proper finished
+		round events.
+stat.*::
+
+	stat.big-num::
+		(boolean) Change the default for "--big-num". To make
+		"--no-big-num" the default, set "stat.big-num=false".
+
+intel-pt.*::
+
+	intel-pt.cache-divisor::
+
+	intel-pt.mispred-all::
+		If set, Intel PT decoder will set the mispred flag on all
+		branches.
+
+	intel-pt.max-loops::
+		If set and non-zero, the maximum number of unconditional
+		branches decoded without consuming any trace packets. If
+		the maximum is exceeded there will be a "Never-ending loop"
+		error. The default is 100000.
+
+auxtrace.*::
+
+	auxtrace.dumpdir::
+		s390 only. The directory to save the auxiliary trace buffer
+		can be changed using this option. Ex, auxtrace.dumpdir=/tmp.
+		If the directory does not exist or has the wrong file type,
+		the current directory is used.
+
+itrace.*::
+
+	debug-log-buffer-size::
+		Log size in bytes to output when using the option --itrace=d+e
+		Refer 'itrace' option of linkperf:perf-script[1] or
+		linkperf:perf-report[1]. The default is 16384.
+
+daemon.*::
+
+	daemon.base::
+		Base path for daemon data. All sessions data are stored under
+		this path.
+
+session-<NAME>.*::
+
+	session-<NAME>.run::
+
+		Defines new record session for daemon. The value is record's
+		command line without the 'record' keyword.
+
+
+SEE ALSO
+--------
+linkperf:perf[1]
diff --git a/tools/perf/Documentation/perf-daemon.txt b/tools/perf/Documentation/perf-daemon.txt
new file mode 100644
index 000000000..f558f8e4b
--- /dev/null
+++ b/tools/perf/Documentation/perf-daemon.txt
@@ -0,0 +1,208 @@
+perf-daemon(1)
+==============
+
+
+NAME
+----
+perf-daemon - Run record sessions on background
+
+
+SYNOPSIS
+--------
+[verse]
+'perf daemon'
+'perf daemon' [<options>]
+'perf daemon start'  [<options>]
+'perf daemon stop'   [<options>]
+'perf daemon signal' [<options>]
+'perf daemon ping'   [<options>]
+
+
+DESCRIPTION
+-----------
+This command allows to run simple daemon process that starts and
+monitors configured record sessions.
+
+You can imagine 'perf daemon' of background process with several
+'perf record' child tasks, like:
+
+  # ps axjf
+  ...
+       1  916507 ... perf daemon start
+  916507  916508 ...  \_ perf record --control=fifo:control,ack -m 10M -e cycles --overwrite --switch-output -a
+  916507  916509 ...  \_ perf record --control=fifo:control,ack -m 20M -e sched:* --overwrite --switch-output -a
+
+Not every 'perf record' session is suitable for running under daemon.
+User need perf session that either produces data on query, like the
+flight recorder sessions in above example or session that is configured
+to produce data periodically, like with --switch-output configuration
+for time and size.
+
+Each session is started with control setup (with perf record --control
+options).
+
+Sessions are configured through config file, see CONFIG FILE section
+with EXAMPLES.
+
+
+OPTIONS
+-------
+-v::
+--verbose::
+	Be more verbose.
+
+--config=<PATH>::
+	Config file path. If not provided, perf will check system and default
+	locations (/etc/perfconfig, $HOME/.perfconfig).
+
+--base=<PATH>::
+	Base directory path. Each daemon instance is running on top
+	of base directory. Only one instance of server can run on
+	top of one directory at the time.
+
+All generic options are available also under commands.
+
+
+START COMMAND
+-------------
+The start command creates the daemon process.
+
+-f::
+--foreground::
+	Do not put the process in background.
+
+
+STOP COMMAND
+------------
+The stop command stops all the session and the daemon process.
+
+
+SIGNAL COMMAND
+--------------
+The signal command sends signal to configured sessions.
+
+--session::
+	Send signal to specific session.
+
+
+PING COMMAND
+------------
+The ping command sends control ping to configured sessions.
+
+--session::
+	Send ping to specific session.
+
+
+CONFIG FILE
+-----------
+The daemon is configured within standard perf config file by
+following new variables:
+
+daemon.base:
+	Base path for daemon data. All sessions data are
+	stored under this path.
+
+session-<NAME>.run:
+	Defines new record session. The value is record's command
+	line without the 'record' keyword.
+
+Each perf record session is run in daemon.base/<NAME> directory.
+
+
+EXAMPLES
+--------
+Example with 2 record sessions:
+
+  # cat ~/.perfconfig
+  [daemon]
+  base=/opt/perfdata
+
+  [session-cycles]
+  run = -m 10M -e cycles --overwrite --switch-output -a
+
+  [session-sched]
+  run = -m 20M -e sched:* --overwrite --switch-output -a
+
+
+Starting the daemon:
+
+  # perf daemon start
+
+
+Check sessions:
+
+  # perf daemon
+  [603349:daemon] base: /opt/perfdata
+  [603350:cycles] perf record -m 10M -e cycles --overwrite --switch-output -a
+  [603351:sched] perf record -m 20M -e sched:* --overwrite --switch-output -a
+
+First line is daemon process info with configured daemon base.
+
+
+Check sessions with more info:
+
+  # perf daemon -v
+  [603349:daemon] base: /opt/perfdata
+    output:  /opt/perfdata/output
+    lock:    /opt/perfdata/lock
+    up:      1 minutes
+  [603350:cycles] perf record -m 10M -e cycles --overwrite --switch-output -a
+    base:    /opt/perfdata/session-cycles
+    output:  /opt/perfdata/session-cycles/output
+    control: /opt/perfdata/session-cycles/control
+    ack:     /opt/perfdata/session-cycles/ack
+    up:      1 minutes
+  [603351:sched] perf record -m 20M -e sched:* --overwrite --switch-output -a
+    base:    /opt/perfdata/session-sched
+    output:  /opt/perfdata/session-sched/output
+    control: /opt/perfdata/session-sched/control
+    ack:     /opt/perfdata/session-sched/ack
+    up:      1 minutes
+
+The 'base' path is daemon/session base.
+The 'lock' file is daemon's lock file guarding that no other
+daemon is running on top of the base.
+The 'output' file is perf record output for specific session.
+The 'control' and 'ack' files are perf control files.
+The 'up' number shows minutes daemon/session is running.
+
+
+Make sure control session is online:
+
+  # perf daemon ping
+  OK   cycles
+  OK   sched
+
+
+Send USR2 signal to session 'cycles' to generate perf.data file:
+
+  # perf daemon signal --session cycles
+  signal 12 sent to session 'cycles [603452]'
+
+  # tail -2  /opt/perfdata/session-cycles/output
+  [ perf record: dump data: Woken up 1 times ]
+  [ perf record: Dump perf.data.2020123017013149 ]
+
+
+Send USR2 signal to all sessions:
+
+  # perf daemon signal
+  signal 12 sent to session 'cycles [603452]'
+  signal 12 sent to session 'sched [603453]'
+
+  # tail -2  /opt/perfdata/session-cycles/output
+  [ perf record: dump data: Woken up 1 times ]
+  [ perf record: Dump perf.data.2020123017024689 ]
+  # tail -2  /opt/perfdata/session-sched/output
+  [ perf record: dump data: Woken up 1 times ]
+  [ perf record: Dump perf.data.2020123017024713 ]
+
+
+Stop daemon:
+
+  # perf daemon stop
+
+
+SEE ALSO
+--------
+linkperf:perf-record[1], linkperf:perf-config[1]
diff --git a/tools/perf/Documentation/perf-data.txt b/tools/perf/Documentation/perf-data.txt
new file mode 100644
index 000000000..417bf17e2
--- /dev/null
+++ b/tools/perf/Documentation/perf-data.txt
@@ -0,0 +1,54 @@
+perf-data(1)
+============
+
+NAME
+----
+perf-data - Data file related processing
+
+SYNOPSIS
+--------
+[verse]
+'perf data' [<common options>] <command> [<options>]",
+
+DESCRIPTION
+-----------
+Data file related processing.
+
+COMMANDS
+--------
+convert::
+	Converts perf data file into another format.
+	It's possible to set data-convert debug variable to get debug messages from conversion,
+	like:
+	  perf --debug data-convert data convert ...
+
+OPTIONS for 'convert'
+---------------------
+--to-ctf::
+	Triggers the CTF conversion, specify the path of CTF data directory.
+
+--to-json::
+	Triggers JSON conversion. Specify the JSON filename to output.
+
+--tod::
+	Convert time to wall clock time.
+
+-i::
+	Specify input perf data file path.
+
+-f::
+--force::
+	Don't complain, do it.
+
+-v::
+--verbose::
+        Be more verbose (show counter open errors, etc).
+
+--all::
+	Convert all events, including non-sample events (comm, fork, ...), to output.
+	Default is off, only convert samples.
+
+SEE ALSO
+--------
+linkperf:perf[1]
+[1] Common Trace Format - http://www.efficios.com/ctf
diff --git a/tools/perf/Documentation/perf-diff.txt b/tools/perf/Documentation/perf-diff.txt
new file mode 100644
index 000000000..f3067a4af
--- /dev/null
+++ b/tools/perf/Documentation/perf-diff.txt
@@ -0,0 +1,305 @@
+perf-diff(1)
+============
+
+NAME
+----
+perf-diff - Read perf.data files and display the differential profile
+
+SYNOPSIS
+--------
+[verse]
+'perf diff' [baseline file] [data file1] [[data file2] ... ]
+
+DESCRIPTION
+-----------
+This command displays the performance difference amongst two or more perf.data
+files captured via perf record.
+
+If no parameters are passed it will assume perf.data.old and perf.data.
+
+The differential profile is displayed only for events matching both
+specified perf.data files.
+
+If no parameters are passed the samples will be sorted by dso and symbol.
+As the perf.data files could come from different binaries, the symbols addresses
+could vary. So perf diff is based on the comparison of the files and
+symbols name.
+
+OPTIONS
+-------
+-D::
+--dump-raw-trace::
+        Dump raw trace in ASCII.
+
+--kallsyms=<file>::
+        kallsyms pathname
+
+-m::
+--modules::
+        Load module symbols. WARNING: use only with -k and LIVE kernel
+
+-d::
+--dsos=::
+	Only consider symbols in these dsos. CSV that understands
+	file://filename entries.  This option will affect the percentage
+	of the Baseline/Delta column.  See --percentage for more info.
+
+-C::
+--comms=::
+	Only consider symbols in these comms. CSV that understands
+	file://filename entries.  This option will affect the percentage
+	of the Baseline/Delta column.  See --percentage for more info.
+
+-S::
+--symbols=::
+	Only consider these symbols. CSV that understands
+	file://filename entries.  This option will affect the percentage
+	of the Baseline/Delta column.  See --percentage for more info.
+
+-s::
+--sort=::
+	Sort by key(s): pid, comm, dso, symbol, cpu, parent, srcline.
+	Please see description of --sort in the perf-report man page.
+
+-t::
+--field-separator=::
+
+	Use a special separator character and don't pad with spaces, replacing
+	all occurrences of this separator in symbol names (and other output)
+	with a '.' character, that thus it's the only non valid separator.
+
+-v::
+--verbose::
+	Be verbose, for instance, show the raw counts in addition to the
+	diff.
+
+-q::
+--quiet::
+	Do not show any warnings or messages.  (Suppress -v)
+
+-f::
+--force::
+        Don't do ownership validation.
+
+--symfs=<directory>::
+        Look for files with symbols relative to this directory.
+
+-b::
+--baseline-only::
+        Show only items with match in baseline.
+
+-c::
+--compute::
+        Differential computation selection - delta, ratio, wdiff, cycles,
+        delta-abs (default is delta-abs).  Default can be changed using
+        diff.compute config option.  See COMPARISON METHODS section for
+        more info.
+
+--cycles-hist::
+	Report a histogram and the standard deviation for cycles data.
+	It can help us to judge if the reported cycles data is noisy or
+	not. This option should be used with '-c cycles'.
+
+-p::
+--period::
+        Show period values for both compared hist entries.
+
+-F::
+--formula::
+        Show formula for given computation.
+
+-o::
+--order::
+       Specify compute sorting column number.  0 means sorting by baseline
+       overhead and 1 (default) means sorting by computed value of column 1
+       (data from the first file other base baseline).  Values more than 1
+       can be used only if enough data files are provided.
+       The default value can be set using the diff.order config option.
+
+--percentage::
+	Determine how to display the overhead percentage of filtered entries.
+	Filters can be applied by --comms, --dsos and/or --symbols options.
+
+	"relative" means it's relative to filtered entries only so that the
+	sum of shown entries will be always 100%.  "absolute" means it retains
+	the original value before and after the filter is applied.
+
+--time::
+	Analyze samples within given time window. It supports time
+	percent with multiple time ranges. Time string is 'a%/n,b%/m,...'
+	or 'a%-b%,c%-%d,...'.
+
+	For example:
+
+	Select the second 10% time slice to diff:
+
+	  perf diff --time 10%/2
+
+	Select from 0% to 10% time slice to diff:
+
+	  perf diff --time 0%-10%
+
+	Select the first and the second 10% time slices to diff:
+
+	  perf diff --time 10%/1,10%/2
+
+	Select from 0% to 10% and 30% to 40% slices to diff:
+
+	  perf diff --time 0%-10%,30%-40%
+
+	It also supports analyzing samples within a given time window
+	<start>,<stop>. Times have the format seconds.nanoseconds. If 'start'
+	is not given (i.e. time string is ',x.y') then analysis starts at
+	the beginning of the file. If stop time is not given (i.e. time
+	string is 'x.y,') then analysis goes to the end of the file.
+	Multiple ranges can be separated by spaces, which requires the argument
+	to be quoted e.g. --time "1234.567,1234.789 1235,"
+	Time string is'a1.b1,c1.d1:a2.b2,c2.d2'. Use ':' to separate timestamps
+	for different perf.data files.
+
+	For example, we get the timestamp information from 'perf script'.
+
+	  perf script -i perf.data.old
+	    mgen 13940 [000]  3946.361400: ...
+
+	  perf script -i perf.data
+	    mgen 13940 [000]  3971.150589 ...
+
+	  perf diff --time 3946.361400,:3971.150589,
+
+	It analyzes the perf.data.old from the timestamp 3946.361400 to
+	the end of perf.data.old and analyzes the perf.data from the
+	timestamp 3971.150589 to the end of perf.data.
+
+--cpu:: Only diff samples for the list of CPUs provided. Multiple CPUs can
+	be provided as a comma-separated list with no space: 0,1. Ranges of
+	CPUs are specified with -: 0-2. Default is to report samples on all
+	CPUs.
+
+--pid=::
+	Only diff samples for given process ID (comma separated list).
+
+--tid=::
+	Only diff samples for given thread ID (comma separated list).
+
+--stream::
+	Enable hot streams comparison. Stream can be a callchain which is
+	aggregated by the branch records from samples.
+
+COMPARISON
+----------
+The comparison is governed by the baseline file. The baseline perf.data
+file is iterated for samples. All other perf.data files specified on
+the command line are searched for the baseline sample pair. If the pair
+is found, specified computation is made and result is displayed.
+
+All samples from non-baseline perf.data files, that do not match any
+baseline entry, are displayed with empty space within baseline column
+and possible computation results (delta) in their related column.
+
+Example files samples:
+- file A with samples f1, f2, f3, f4,    f6
+- file B with samples     f2,     f4, f5
+- file C with samples f1, f2,         f5
+
+Example output:
+  x - computation takes place for pair
+  b - baseline sample percentage
+
+- perf diff A B C
+
+  baseline/A compute/B compute/C  samples
+  ---------------------------------------
+  b                    x          f1
+  b          x         x          f2
+  b                               f3
+  b          x                    f4
+  b                               f6
+             x         x          f5
+
+- perf diff B A C
+
+  baseline/B compute/A compute/C  samples
+  ---------------------------------------
+  b          x         x          f2
+  b          x                    f4
+  b                    x          f5
+             x         x          f1
+             x                    f3
+             x                    f6
+
+- perf diff C B A
+
+  baseline/C compute/B compute/A  samples
+  ---------------------------------------
+  b                    x          f1
+  b          x         x          f2
+  b          x                    f5
+                       x          f3
+             x         x          f4
+                       x          f6
+
+COMPARISON METHODS
+------------------
+delta
+~~~~~
+If specified the 'Delta' column is displayed with value 'd' computed as:
+
+  d = A->period_percent - B->period_percent
+
+with:
+  - A/B being matching hist entry from data/baseline file specified
+    (or perf.data/perf.data.old) respectively.
+
+  - period_percent being the % of the hist entry period value within
+    single data file
+
+  - with filtering by -C, -d and/or -S, period_percent might be changed
+    relative to how entries are filtered.  Use --percentage=absolute to
+    prevent such fluctuation.
+
+delta-abs
+~~~~~~~~~
+Same as 'delta` method, but sort the result with the absolute values.
+
+ratio
+~~~~~
+If specified the 'Ratio' column is displayed with value 'r' computed as:
+
+  r = A->period / B->period
+
+with:
+  - A/B being matching hist entry from data/baseline file specified
+    (or perf.data/perf.data.old) respectively.
+
+  - period being the hist entry period value
+
+wdiff:WEIGHT-B,WEIGHT-A
+~~~~~~~~~~~~~~~~~~~~~~~
+If specified the 'Weighted diff' column is displayed with value 'd' computed as:
+
+   d = B->period * WEIGHT-A - A->period * WEIGHT-B
+
+  - A/B being matching hist entry from data/baseline file specified
+    (or perf.data/perf.data.old) respectively.
+
+  - period being the hist entry period value
+
+  - WEIGHT-A/WEIGHT-B being user supplied weights in the the '-c' option
+    behind ':' separator like '-c wdiff:1,2'.
+    - WEIGHT-A being the weight of the data file
+    - WEIGHT-B being the weight of the baseline data file
+
+cycles
+~~~~~~
+If specified the '[Program Block Range] Cycles Diff' column is displayed.
+It displays the cycles difference of same program basic block amongst
+two perf.data. The program basic block is the code between two branches.
+
+'[Program Block Range]' indicates the range of a program basic block.
+Source line is reported if it can be found otherwise uses symbol+offset
+instead.
+
+SEE ALSO
+--------
+linkperf:perf-record[1], linkperf:perf-report[1]
diff --git a/tools/perf/Documentation/perf-dlfilter.txt b/tools/perf/Documentation/perf-dlfilter.txt
new file mode 100644
index 000000000..fb22e3b31
--- /dev/null
+++ b/tools/perf/Documentation/perf-dlfilter.txt
@@ -0,0 +1,281 @@
+perf-dlfilter(1)
+================
+
+NAME
+----
+perf-dlfilter - Filter sample events using a dynamically loaded shared
+object file
+
+SYNOPSIS
+--------
+[verse]
+'perf script' [--dlfilter file.so ] [ --dlarg arg ]...
+
+DESCRIPTION
+-----------
+
+This option is used to process data through a custom filter provided by a
+dynamically loaded shared object file. Arguments can be passed using --dlarg
+and retrieved using perf_dlfilter_fns.args().
+
+If 'file.so' does not contain "/", then it will be found either in the current
+directory, or perf tools exec path which is ~/libexec/perf-core/dlfilters for
+a local build and install (refer perf --exec-path), or the dynamic linker
+paths.
+
+API
+---
+
+The API for filtering consists of the following:
+
+[source,c]
+----
+#include <perf/perf_dlfilter.h>
+
+struct perf_dlfilter_fns perf_dlfilter_fns;
+
+int start(void **data, void *ctx);
+int stop(void *data, void *ctx);
+int filter_event(void *data, const struct perf_dlfilter_sample *sample, void *ctx);
+int filter_event_early(void *data, const struct perf_dlfilter_sample *sample, void *ctx);
+const char *filter_description(const char **long_description);
+----
+
+If implemented, 'start' will be called at the beginning, before any
+calls to 'filter_event' or 'filter_event_early'. Return 0 to indicate success,
+or return a negative error code. '*data' can be assigned for use by other
+functions. 'ctx' is needed for calls to perf_dlfilter_fns, but most
+perf_dlfilter_fns are not valid when called from 'start'.
+
+If implemented, 'stop' will be called at the end, after any calls to
+'filter_event' or 'filter_event_early'. Return 0 to indicate success, or
+return a negative error code. 'data' is set by 'start'. 'ctx' is needed
+for calls to perf_dlfilter_fns, but most perf_dlfilter_fns are not valid
+when called from 'stop'.
+
+If implemented, 'filter_event' will be called for each sample event.
+Return 0 to keep the sample event, 1 to filter it out, or return a negative
+error code. 'data' is set by 'start'. 'ctx' is needed for calls to
+'perf_dlfilter_fns'.
+
+'filter_event_early' is the same as 'filter_event' except it is called before
+internal filtering.
+
+If implemented, 'filter_description' should return a one-line description
+of the filter, and optionally a longer description.
+
+The perf_dlfilter_sample structure
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+'filter_event' and 'filter_event_early' are passed a perf_dlfilter_sample
+structure, which contains the following fields:
+[source,c]
+----
+/*
+ * perf sample event information (as per perf script and <linux/perf_event.h>)
+ */
+struct perf_dlfilter_sample {
+	__u32 size; /* Size of this structure (for compatibility checking) */
+	__u16 ins_lat;		/* Refer PERF_SAMPLE_WEIGHT_TYPE in <linux/perf_event.h> */
+	__u16 p_stage_cyc;	/* Refer PERF_SAMPLE_WEIGHT_TYPE in <linux/perf_event.h> */
+	__u64 ip;
+	__s32 pid;
+	__s32 tid;
+	__u64 time;
+	__u64 addr;
+	__u64 id;
+	__u64 stream_id;
+	__u64 period;
+	__u64 weight;		/* Refer PERF_SAMPLE_WEIGHT_TYPE in <linux/perf_event.h> */
+	__u64 transaction;	/* Refer PERF_SAMPLE_TRANSACTION in <linux/perf_event.h> */
+	__u64 insn_cnt;	/* For instructions-per-cycle (IPC) */
+	__u64 cyc_cnt;		/* For instructions-per-cycle (IPC) */
+	__s32 cpu;
+	__u32 flags;		/* Refer PERF_DLFILTER_FLAG_* above */
+	__u64 data_src;		/* Refer PERF_SAMPLE_DATA_SRC in <linux/perf_event.h> */
+	__u64 phys_addr;	/* Refer PERF_SAMPLE_PHYS_ADDR in <linux/perf_event.h> */
+	__u64 data_page_size;	/* Refer PERF_SAMPLE_DATA_PAGE_SIZE in <linux/perf_event.h> */
+	__u64 code_page_size;	/* Refer PERF_SAMPLE_CODE_PAGE_SIZE in <linux/perf_event.h> */
+	__u64 cgroup;		/* Refer PERF_SAMPLE_CGROUP in <linux/perf_event.h> */
+	__u8  cpumode;		/* Refer CPUMODE_MASK etc in <linux/perf_event.h> */
+	__u8  addr_correlates_sym; /* True => resolve_addr() can be called */
+	__u16 misc;		/* Refer perf_event_header in <linux/perf_event.h> */
+	__u32 raw_size;		/* Refer PERF_SAMPLE_RAW in <linux/perf_event.h> */
+	const void *raw_data;	/* Refer PERF_SAMPLE_RAW in <linux/perf_event.h> */
+	__u64 brstack_nr;	/* Number of brstack entries */
+	const struct perf_branch_entry *brstack; /* Refer <linux/perf_event.h> */
+	__u64 raw_callchain_nr;	/* Number of raw_callchain entries */
+	const __u64 *raw_callchain; /* Refer <linux/perf_event.h> */
+	const char *event;
+	__s32 machine_pid;
+	__s32 vcpu;
+};
+----
+
+Note: 'machine_pid' and 'vcpu' are not original members, but were added together later.
+'size' can be used to determine their presence at run time.
+PERF_DLFILTER_HAS_MACHINE_PID will be defined if they are present at compile time.
+For example:
+[source,c]
+----
+#include <perf/perf_dlfilter.h>
+#include <stddef.h>
+#include <stdbool.h>
+
+static inline bool have_machine_pid(const struct perf_dlfilter_sample *sample)
+{
+#ifdef PERF_DLFILTER_HAS_MACHINE_PID
+	return sample->size >= offsetof(struct perf_dlfilter_sample, vcpu) + sizeof(sample->vcpu);
+#else
+	return false;
+#endif
+}
+----
+
+The perf_dlfilter_fns structure
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The 'perf_dlfilter_fns' structure is populated with function pointers when the
+file is loaded. The functions can be called by 'filter_event' or
+'filter_event_early'.
+
+[source,c]
+----
+struct perf_dlfilter_fns {
+	const struct perf_dlfilter_al *(*resolve_ip)(void *ctx);
+	const struct perf_dlfilter_al *(*resolve_addr)(void *ctx);
+	char **(*args)(void *ctx, int *dlargc);
+	__s32 (*resolve_address)(void *ctx, __u64 address, struct perf_dlfilter_al *al);
+	const __u8 *(*insn)(void *ctx, __u32 *length);
+	const char *(*srcline)(void *ctx, __u32 *line_number);
+	struct perf_event_attr *(*attr)(void *ctx);
+	__s32 (*object_code)(void *ctx, __u64 ip, void *buf, __u32 len);
+	void *(*reserved[120])(void *);
+};
+----
+
+'resolve_ip' returns information about ip.
+
+'resolve_addr' returns information about addr (if addr_correlates_sym).
+
+'args' returns arguments from --dlarg options.
+
+'resolve_address' provides information about 'address'. al->size must be set
+before calling. Returns 0 on success, -1 otherwise.
+
+'insn' returns instruction bytes and length.
+
+'srcline' return source file name and line number.
+
+'attr' returns perf_event_attr, refer <linux/perf_event.h>.
+
+'object_code' reads object code and returns the number of bytes read.
+
+The perf_dlfilter_al structure
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The 'perf_dlfilter_al' structure contains information about an address.
+
+[source,c]
+----
+/*
+ * Address location (as per perf script)
+ */
+struct perf_dlfilter_al {
+	__u32 size; /* Size of this structure (for compatibility checking) */
+	__u32 symoff;
+	const char *sym;
+	__u64 addr; /* Mapped address (from dso) */
+	__u64 sym_start;
+	__u64 sym_end;
+	const char *dso;
+	__u8  sym_binding; /* STB_LOCAL, STB_GLOBAL or STB_WEAK, refer <elf.h> */
+	__u8  is_64_bit; /* Only valid if dso is not NULL */
+	__u8  is_kernel_ip; /* True if in kernel space */
+	__u32 buildid_size;
+	__u8 *buildid;
+	/* Below members are only populated by resolve_ip() */
+	__u8 filtered; /* true if this sample event will be filtered out */
+	const char *comm;
+};
+----
+
+perf_dlfilter_sample flags
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The 'flags' member of 'perf_dlfilter_sample' corresponds with the flags field
+of perf script. The bits of the flags are as follows:
+
+[source,c]
+----
+/* Definitions for perf_dlfilter_sample flags */
+enum {
+	PERF_DLFILTER_FLAG_BRANCH	= 1ULL << 0,
+	PERF_DLFILTER_FLAG_CALL		= 1ULL << 1,
+	PERF_DLFILTER_FLAG_RETURN	= 1ULL << 2,
+	PERF_DLFILTER_FLAG_CONDITIONAL	= 1ULL << 3,
+	PERF_DLFILTER_FLAG_SYSCALLRET	= 1ULL << 4,
+	PERF_DLFILTER_FLAG_ASYNC	= 1ULL << 5,
+	PERF_DLFILTER_FLAG_INTERRUPT	= 1ULL << 6,
+	PERF_DLFILTER_FLAG_TX_ABORT	= 1ULL << 7,
+	PERF_DLFILTER_FLAG_TRACE_BEGIN	= 1ULL << 8,
+	PERF_DLFILTER_FLAG_TRACE_END	= 1ULL << 9,
+	PERF_DLFILTER_FLAG_IN_TX	= 1ULL << 10,
+	PERF_DLFILTER_FLAG_VMENTRY	= 1ULL << 11,
+	PERF_DLFILTER_FLAG_VMEXIT	= 1ULL << 12,
+};
+----
+
+EXAMPLE
+-------
+
+Filter out everything except branches from "foo" to "bar":
+
+[source,c]
+----
+#include <perf/perf_dlfilter.h>
+#include <string.h>
+
+struct perf_dlfilter_fns perf_dlfilter_fns;
+
+int filter_event(void *data, const struct perf_dlfilter_sample *sample, void *ctx)
+{
+	const struct perf_dlfilter_al *al;
+	const struct perf_dlfilter_al *addr_al;
+
+	if (!sample->ip || !sample->addr_correlates_sym)
+		return 1;
+
+	al = perf_dlfilter_fns.resolve_ip(ctx);
+	if (!al || !al->sym || strcmp(al->sym, "foo"))
+		return 1;
+
+	addr_al = perf_dlfilter_fns.resolve_addr(ctx);
+	if (!addr_al || !addr_al->sym || strcmp(addr_al->sym, "bar"))
+		return 1;
+
+	return 0;
+}
+----
+
+To build the shared object, assuming perf has been installed for the local user
+i.e. perf_dlfilter.h is in ~/include/perf :
+
+	gcc -c -I ~/include -fpic dlfilter-example.c
+	gcc -shared -o dlfilter-example.so dlfilter-example.o
+
+To use the filter with perf script:
+
+	perf script --dlfilter dlfilter-example.so
+
+NOTES
+-----
+
+The dlfilter .so file will be dependent on shared libraries. If those change,
+it may be necessary to rebuild the .so. Also there may be unexpected results
+if the .so uses different versions of the shared libraries that perf uses.
+Versions can be checked using the ldd command.
+
+SEE ALSO
+--------
+linkperf:perf-script[1]
diff --git a/tools/perf/Documentation/perf-evlist.txt b/tools/perf/Documentation/perf-evlist.txt
new file mode 100644
index 000000000..9af8b8dfb
--- /dev/null
+++ b/tools/perf/Documentation/perf-evlist.txt
@@ -0,0 +1,45 @@
+perf-evlist(1)
+==============
+
+NAME
+----
+perf-evlist - List the event names in a perf.data file
+
+SYNOPSIS
+--------
+[verse]
+'perf evlist <options>'
+
+DESCRIPTION
+-----------
+This command displays the names of events sampled in a perf.data file.
+
+OPTIONS
+-------
+-i::
+--input=::
+        Input file name. (default: perf.data unless stdin is a fifo)
+
+-f::
+--force::
+	Don't complain, do it.
+
+-F::
+--freq=::
+	Show just the sample frequency used for each event.
+
+-v::
+--verbose::
+	Show all fields.
+
+-g::
+--group::
+	Show event group information.
+
+--trace-fields::
+	Show tracepoint field names.
+
+SEE ALSO
+--------
+linkperf:perf-record[1], linkperf:perf-list[1],
+linkperf:perf-report[1]
diff --git a/tools/perf/Documentation/perf-ftrace.txt b/tools/perf/Documentation/perf-ftrace.txt
new file mode 100644
index 000000000..df4595563
--- /dev/null
+++ b/tools/perf/Documentation/perf-ftrace.txt
@@ -0,0 +1,148 @@
+perf-ftrace(1)
+==============
+
+NAME
+----
+perf-ftrace - simple wrapper for kernel's ftrace functionality
+
+
+SYNOPSIS
+--------
+[verse]
+'perf ftrace' {trace|latency} <command>
+
+DESCRIPTION
+-----------
+The 'perf ftrace' command provides a collection of subcommands which use
+kernel's ftrace infrastructure.
+
+  'perf ftrace trace' is a simple wrapper of the ftrace.  It only supports
+  single thread tracing currently and just reads trace_pipe in text and then
+  write it to stdout.
+
+  'perf ftrace latency' calculates execution latency of a given function
+  (optionally with BPF) and display it as a histogram.
+
+The following options apply to perf ftrace.
+
+COMMON OPTIONS
+--------------
+
+-p::
+--pid=::
+	Trace on existing process id (comma separated list).
+
+--tid=::
+	Trace on existing thread id (comma separated list).
+
+-a::
+--all-cpus::
+	Force system-wide collection.  Scripts run without a <command>
+	normally use -a by default, while scripts run with a <command>
+	normally don't - this option allows the latter to be run in
+	system-wide mode.
+
+-C::
+--cpu=::
+	Only trace for the list of CPUs provided.  Multiple CPUs can
+	be provided as a comma separated list with no space like: 0,1.
+	Ranges of CPUs are specified with -: 0-2.
+	Default is to trace on all online CPUs.
+
+-v::
+--verbose::
+        Increase the verbosity level.
+
+
+OPTIONS for 'perf ftrace trace'
+-------------------------------
+
+-t::
+--tracer=::
+	Tracer to use when neither -G nor -F option is not
+	specified: function_graph or function.
+
+-F::
+--funcs::
+        List available functions to trace. It accepts a pattern to
+        only list interested functions.
+
+-D::
+--delay::
+	Time (ms) to wait before starting tracing after program start.
+
+-m::
+--buffer-size::
+	Set the size of per-cpu tracing buffer, <size> is expected to
+	be a number with appended unit character - B/K/M/G.
+
+--inherit::
+	Trace children processes spawned by our target.
+
+-T::
+--trace-funcs=::
+	Select function tracer and set function filter on the given
+	function (or a glob pattern). Multiple functions can be given
+	by using this option more than once. The function argument also
+	can be a glob pattern. It will be passed to 'set_ftrace_filter'
+	in tracefs.
+
+-N::
+--notrace-funcs=::
+	Select function tracer and do not trace functions given by the
+	argument.  Like -T option, this can be used more than once to
+	specify multiple functions (or glob patterns).  It will be
+	passed to 'set_ftrace_notrace' in tracefs.
+
+--func-opts::
+	List of options allowed to set:
+	  call-graph - Display kernel stack trace for function tracer.
+	  irq-info   - Display irq context info for function tracer.
+
+-G::
+--graph-funcs=::
+	Select function_graph tracer and set graph filter on the given
+	function (or a glob pattern). This is useful to trace for
+	functions executed from the given function. This can be used more
+	than once to specify multiple functions. It will be passed to
+	'set_graph_function' in tracefs.
+
+-g::
+--nograph-funcs=::
+	Select function_graph tracer and set graph notrace filter on the
+	given function (or a glob pattern). Like -G option, this is useful
+	for the function_graph tracer only and disables tracing for function
+	executed from the given function. This can be used more than once to
+	specify multiple functions. It will be passed to 'set_graph_notrace'
+	in tracefs.
+
+--graph-opts::
+	List of options allowed to set:
+	  nosleep-time - Measure on-CPU time only for function_graph tracer.
+	  noirqs       - Ignore functions that happen inside interrupt.
+	  verbose      - Show process names, PIDs, timestamps, etc.
+	  thresh=<n>   - Setup trace duration threshold in microseconds.
+	  depth=<n>    - Set max depth for function graph tracer to follow.
+
+
+OPTIONS for 'perf ftrace latency'
+---------------------------------
+
+-T::
+--trace-funcs=::
+	Set the function name to get the histogram.  Unlike perf ftrace trace,
+	it only allows single function to calculate the histogram.
+
+-b::
+--use-bpf::
+	Use BPF to measure function latency instead of using the ftrace (it
+	uses function_graph tracer internally).
+
+-n::
+--use-nsec::
+	Use nano-second instead of micro-second as a base unit of the histogram.
+
+
+SEE ALSO
+--------
+linkperf:perf-record[1], linkperf:perf-trace[1]
diff --git a/tools/perf/Documentation/perf-help.txt b/tools/perf/Documentation/perf-help.txt
new file mode 100644
index 000000000..514391818
--- /dev/null
+++ b/tools/perf/Documentation/perf-help.txt
@@ -0,0 +1,38 @@
+perf-help(1)
+============
+
+NAME
+----
+perf-help - display help information about perf
+
+SYNOPSIS
+--------
+'perf help' [-a|--all] [COMMAND]
+
+DESCRIPTION
+-----------
+
+With no options and no COMMAND given, the synopsis of the 'perf'
+command and a list of the most commonly used perf commands are printed
+on the standard output.
+
+If the option '--all' or '-a' is given, then all available commands are
+printed on the standard output.
+
+If a perf command is named, a manual page for that command is brought
+up. The 'man' program is used by default for this purpose, but this
+can be overridden by other options or configuration variables.
+
+Note that `perf --help ...` is identical to `perf help ...` because the
+former is internally converted into the latter.
+
+OPTIONS
+-------
+-a::
+--all::
+	Prints all the available commands on the standard output. This
+	option supersedes any other option.
+
+PERF
+----
+Part of the linkperf:perf[1] suite
diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt
new file mode 100644
index 000000000..c972032f4
--- /dev/null
+++ b/tools/perf/Documentation/perf-inject.txt
@@ -0,0 +1,119 @@
+perf-inject(1)
+==============
+
+NAME
+----
+perf-inject - Filter to augment the events stream with additional information
+
+SYNOPSIS
+--------
+[verse]
+'perf inject <options>'
+
+DESCRIPTION
+-----------
+perf-inject reads a perf-record event stream and repipes it to stdout.  At any
+point the processing code can inject other events into the event stream - in
+this case build-ids (-b option) are read and injected as needed into the event
+stream.
+
+Build-ids are just the first user of perf-inject - potentially anything that
+needs userspace processing to augment the events stream with additional
+information could make use of this facility.
+
+OPTIONS
+-------
+-b::
+--build-ids::
+	Inject build-ids of DSOs hit by samples into the output stream.
+	This means it needs to process all SAMPLE records to find the DSOs.
+
+--buildid-all::
+	Inject build-ids of all DSOs into the output stream regardless of hits
+	and skip SAMPLE processing.
+
+--known-build-ids=::
+	Override build-ids to inject using these comma-separated pairs of
+	build-id and path. Understands file://filename to read these pairs
+	from a file, which can be generated with perf buildid-list.
+
+-v::
+--verbose::
+	Be more verbose.
+-i::
+--input=::
+	Input file name. (default: stdin)
+-o::
+--output=::
+	Output file name. (default: stdout)
+-s::
+--sched-stat::
+	Merge sched_stat and sched_switch for getting events where and how long
+	tasks slept. sched_switch contains a callchain where a task slept and
+	sched_stat contains a timeslice how long a task slept.
+
+-k::
+--vmlinux=<file>::
+        vmlinux pathname
+
+--ignore-vmlinux::
+	Ignore vmlinux files.
+
+--kallsyms=<file>::
+	kallsyms pathname
+
+--itrace::
+	Decode Instruction Tracing data, replacing it with synthesized events.
+	Options are:
+
+include::itrace.txt[]
+
+--strip::
+	Use with --itrace to strip out non-synthesized events.
+
+-j::
+--jit::
+	Process jitdump files by injecting the mmap records corresponding to jitted
+	functions. This option also generates the ELF images for each jitted function
+	found in the jitdumps files captured in the input perf.data file. Use this option
+	if you are monitoring environment using JIT runtimes, such as Java, DART or V8.
+
+-f::
+--force::
+	Don't complain, do it.
+
+--vm-time-correlation[=OPTIONS]::
+	Some architectures may capture AUX area data which contains timestamps
+	affected by virtualization. This option will update those timestamps
+	in place, to correlate with host timestamps. The in-place update means
+	that an output file is not specified, and instead the input file is
+	modified.  The options are architecture specific, except that they may
+	start with "dry-run" which will cause the file to be processed but
+	without updating it. Currently this option is supported only by
+	Intel PT, refer linkperf:perf-intel-pt[1]
+
+--guest-data=<path>,<pid>[,<time offset>[,<time scale>]]::
+	Insert events from a perf.data file recorded in a virtual machine at
+	the same time as the input perf.data file was recorded on the host.
+	The Process ID (PID) of the QEMU hypervisor process must be provided,
+	and the time offset and time scale (multiplier) will likely be needed
+	to convert guest time stamps into host time stamps. For example, for
+	x86 the TSC Offset and Multiplier could be provided for a virtual machine
+	using Linux command line option no-kvmclock.
+	Currently only mmap, mmap2, comm, task, context_switch, ksymbol,
+	and text_poke events are inserted, as well as build ID information.
+	The QEMU option -name debug-threads=on is needed so that thread names
+	can be used to determine which thread is running which VCPU. Note
+	libvirt seems to use this by default.
+	When using perf record in the guest, option --sample-identifier
+	should be used, and also --buildid-all and --switch-events may be
+	useful.
+
+:GMEXAMPLECMD: inject
+:GMEXAMPLESUBCMD:
+include::guestmount.txt[]
+
+SEE ALSO
+--------
+linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-archive[1],
+linkperf:perf-intel-pt[1]
diff --git a/tools/perf/Documentation/perf-intel-pt.txt b/tools/perf/Documentation/perf-intel-pt.txt
new file mode 100644
index 000000000..a764367fc
--- /dev/null
+++ b/tools/perf/Documentation/perf-intel-pt.txt
@@ -0,0 +1,1858 @@
+perf-intel-pt(1)
+================
+
+NAME
+----
+perf-intel-pt - Support for Intel Processor Trace within perf tools
+
+SYNOPSIS
+--------
+[verse]
+'perf record' -e intel_pt//
+
+DESCRIPTION
+-----------
+
+Intel Processor Trace (Intel PT) is an extension of Intel Architecture that
+collects information about software execution such as control flow, execution
+modes and timings and formats it into highly compressed binary packets.
+Technical details are documented in the Intel 64 and IA-32 Architectures
+Software Developer Manuals, Chapter 36 Intel Processor Trace.
+
+Intel PT is first supported in Intel Core M and 5th generation Intel Core
+processors that are based on the Intel micro-architecture code name Broadwell.
+
+Trace data is collected by 'perf record' and stored within the perf.data file.
+See below for options to 'perf record'.
+
+Trace data must be 'decoded' which involves walking the object code and matching
+the trace data packets. For example a TNT packet only tells whether a
+conditional branch was taken or not taken, so to make use of that packet the
+decoder must know precisely which instruction was being executed.
+
+Decoding is done on-the-fly.  The decoder outputs samples in the same format as
+samples output by perf hardware events, for example as though the "instructions"
+or "branches" events had been recorded.  Presently 3 tools support this:
+'perf script', 'perf report' and 'perf inject'.  See below for more information
+on using those tools.
+
+The main distinguishing feature of Intel PT is that the decoder can determine
+the exact flow of software execution.  Intel PT can be used to understand why
+and how did software get to a certain point, or behave a certain way.  The
+software does not have to be recompiled, so Intel PT works with debug or release
+builds, however the executed images are needed - which makes use in JIT-compiled
+environments, or with self-modified code, a challenge.  Also symbols need to be
+provided to make sense of addresses.
+
+A limitation of Intel PT is that it produces huge amounts of trace data
+(hundreds of megabytes per second per core) which takes a long time to decode,
+for example two or three orders of magnitude longer than it took to collect.
+Another limitation is the performance impact of tracing, something that will
+vary depending on the use-case and architecture.
+
+
+Quickstart
+----------
+
+It is important to start small.  That is because it is easy to capture vastly
+more data than can possibly be processed.
+
+The simplest thing to do with Intel PT is userspace profiling of small programs.
+Data is captured with 'perf record' e.g. to trace 'ls' userspace-only:
+
+	perf record -e intel_pt//u ls
+
+And profiled with 'perf report' e.g.
+
+	perf report
+
+To also trace kernel space presents a problem, namely kernel self-modifying
+code.  A fairly good kernel image is available in /proc/kcore but to get an
+accurate image a copy of /proc/kcore needs to be made under the same conditions
+as the data capture. 'perf record' can make a copy of /proc/kcore if the option
+--kcore is used, but access to /proc/kcore is restricted e.g.
+
+	sudo perf record -o pt_ls --kcore -e intel_pt// -- ls
+
+which will create a directory named 'pt_ls' and put the perf.data file (named
+simply 'data') and copies of /proc/kcore, /proc/kallsyms and /proc/modules into
+it.  The other tools understand the directory format, so to use 'perf report'
+becomes:
+
+	sudo perf report -i pt_ls
+
+Because samples are synthesized after-the-fact, the sampling period can be
+selected for reporting. e.g. sample every microsecond
+
+	sudo perf report pt_ls --itrace=i1usge
+
+See the sections below for more information about the --itrace option.
+
+Beware the smaller the period, the more samples that are produced, and the
+longer it takes to process them.
+
+Also note that the coarseness of Intel PT timing information will start to
+distort the statistical value of the sampling as the sampling period becomes
+smaller.
+
+To represent software control flow, "branches" samples are produced.  By default
+a branch sample is synthesized for every single branch.  To get an idea what
+data is available you can use the 'perf script' tool with all itrace sampling
+options, which will list all the samples.
+
+	perf record -e intel_pt//u ls
+	perf script --itrace=ibxwpe
+
+An interesting field that is not printed by default is 'flags' which can be
+displayed as follows:
+
+	perf script --itrace=ibxwpe -F+flags
+
+The flags are "bcrosyiABExghDt" which stand for branch, call, return, conditional,
+system, asynchronous, interrupt, transaction abort, trace begin, trace end,
+in transaction, VM-entry, VM-exit, interrupt disabled, and interrupt disable
+toggle respectively.
+
+perf script also supports higher level ways to dump instruction traces:
+
+	perf script --insn-trace --xed
+
+Dump all instructions. This requires installing the xed tool (see XED below)
+Dumping all instructions in a long trace can be fairly slow. It is usually better
+to start with higher level decoding, like
+
+	perf script --call-trace
+
+or
+
+	perf script --call-ret-trace
+
+and then select a time range of interest. The time range can then be examined
+in detail with
+
+	perf script --time starttime,stoptime --insn-trace --xed
+
+While examining the trace it's also useful to filter on specific CPUs using
+the -C option
+
+	perf script --time starttime,stoptime --insn-trace --xed -C 1
+
+Dump all instructions in time range on CPU 1.
+
+Another interesting field that is not printed by default is 'ipc' which can be
+displayed as follows:
+
+	perf script --itrace=be -F+ipc
+
+There are two ways that instructions-per-cycle (IPC) can be calculated depending
+on the recording.
+
+If the 'cyc' config term (see config terms section below) was used, then IPC is
+calculated using the cycle count from CYC packets, otherwise MTC packets are
+used - refer to the 'mtc' config term.  When MTC is used, however, the values
+are less accurate because the timing is less accurate.
+
+Because Intel PT does not update the cycle count on every branch or instruction,
+the values will often be zero.  When there are values, they will be the number
+of instructions and number of cycles since the last update, and thus represent
+the average IPC since the last IPC for that event type.  Note IPC for "branches"
+events is calculated separately from IPC for "instructions" events.
+
+Even with the 'cyc' config term, it is possible to produce IPC information for
+every change of timestamp, but at the expense of accuracy.  That is selected by
+specifying the itrace 'A' option.  Due to the granularity of timestamps, the
+actual number of cycles increases even though the cycles reported does not.
+The number of instructions is known, but if IPC is reported, cycles can be too
+low and so IPC is too high.  Note that inaccuracy decreases as the period of
+sampling increases i.e. if the number of cycles is too low by a small amount,
+that becomes less significant if the number of cycles is large.  It may also be
+useful to use the 'A' option in conjunction with dlfilter-show-cycles.so to
+provide higher granularity cycle information.
+
+Also note that the IPC instruction count may or may not include the current
+instruction.  If the cycle count is associated with an asynchronous branch
+(e.g. page fault or interrupt), then the instruction count does not include the
+current instruction, otherwise it does.  That is consistent with whether or not
+that instruction has retired when the cycle count is updated.
+
+Another note, in the case of "branches" events, non-taken branches are not
+presently sampled, so IPC values for them do not appear e.g. a CYC packet with a
+TNT packet that starts with a non-taken branch.  To see every possible IPC
+value, "instructions" events can be used e.g. --itrace=i0ns
+
+While it is possible to create scripts to analyze the data, an alternative
+approach is available to export the data to a sqlite or postgresql database.
+Refer to script export-to-sqlite.py or export-to-postgresql.py for more details,
+and to script exported-sql-viewer.py for an example of using the database.
+
+There is also script intel-pt-events.py which provides an example of how to
+unpack the raw data for power events and PTWRITE. The script also displays
+branches, and supports 2 additional modes selected by option:
+
+ --insn-trace - instruction trace
+ --src-trace - source trace
+
+As mentioned above, it is easy to capture too much data.  One way to limit the
+data captured is to use 'snapshot' mode which is explained further below.
+Refer to 'new snapshot option' and 'Intel PT modes of operation' further below.
+
+Another problem that will be experienced is decoder errors.  They can be caused
+by inability to access the executed image, self-modified or JIT-ed code, or the
+inability to match side-band information (such as context switches and mmaps)
+which results in the decoder not knowing what code was executed.
+
+There is also the problem of perf not being able to copy the data fast enough,
+resulting in data lost because the buffer was full.  See 'Buffer handling' below
+for more details.
+
+
+perf record
+-----------
+
+new event
+~~~~~~~~~
+
+The Intel PT kernel driver creates a new PMU for Intel PT.  PMU events are
+selected by providing the PMU name followed by the "config" separated by slashes.
+An enhancement has been made to allow default "config" e.g. the option
+
+	-e intel_pt//
+
+will use a default config value.  Currently that is the same as
+
+	-e intel_pt/tsc,noretcomp=0/
+
+which is the same as
+
+	-e intel_pt/tsc=1,noretcomp=0/
+
+Note there are now new config terms - see section 'config terms' further below.
+
+The config terms are listed in /sys/devices/intel_pt/format.  They are bit
+fields within the config member of the struct perf_event_attr which is
+passed to the kernel by the perf_event_open system call.  They correspond to bit
+fields in the IA32_RTIT_CTL MSR.  Here is a list of them and their definitions:
+
+	$ grep -H . /sys/bus/event_source/devices/intel_pt/format/*
+	/sys/bus/event_source/devices/intel_pt/format/cyc:config:1
+	/sys/bus/event_source/devices/intel_pt/format/cyc_thresh:config:19-22
+	/sys/bus/event_source/devices/intel_pt/format/mtc:config:9
+	/sys/bus/event_source/devices/intel_pt/format/mtc_period:config:14-17
+	/sys/bus/event_source/devices/intel_pt/format/noretcomp:config:11
+	/sys/bus/event_source/devices/intel_pt/format/psb_period:config:24-27
+	/sys/bus/event_source/devices/intel_pt/format/tsc:config:10
+
+Note that the default config must be overridden for each term i.e.
+
+	-e intel_pt/noretcomp=0/
+
+is the same as:
+
+	-e intel_pt/tsc=1,noretcomp=0/
+
+So, to disable TSC packets use:
+
+	-e intel_pt/tsc=0/
+
+It is also possible to specify the config value explicitly:
+
+	-e intel_pt/config=0x400/
+
+Note that, as with all events, the event is suffixed with event modifiers:
+
+	u	userspace
+	k	kernel
+	h	hypervisor
+	G	guest
+	H	host
+	p	precise ip
+
+'h', 'G' and 'H' are for virtualization which are not used by Intel PT.
+'p' is also not relevant to Intel PT.  So only options 'u' and 'k' are
+meaningful for Intel PT.
+
+perf_event_attr is displayed if the -vv option is used e.g.
+
+	------------------------------------------------------------
+	perf_event_attr:
+	type                             6
+	size                             112
+	config                           0x400
+	{ sample_period, sample_freq }   1
+	sample_type                      IP|TID|TIME|CPU|IDENTIFIER
+	read_format                      ID
+	disabled                         1
+	inherit                          1
+	exclude_kernel                   1
+	exclude_hv                       1
+	enable_on_exec                   1
+	sample_id_all                    1
+	------------------------------------------------------------
+	sys_perf_event_open: pid 31104  cpu 0  group_fd -1  flags 0x8
+	sys_perf_event_open: pid 31104  cpu 1  group_fd -1  flags 0x8
+	sys_perf_event_open: pid 31104  cpu 2  group_fd -1  flags 0x8
+	sys_perf_event_open: pid 31104  cpu 3  group_fd -1  flags 0x8
+	------------------------------------------------------------
+
+
+config terms
+~~~~~~~~~~~~
+
+The June 2015 version of Intel 64 and IA-32 Architectures Software Developer
+Manuals, Chapter 36 Intel Processor Trace, defined new Intel PT features.
+Some of the features are reflect in new config terms.  All the config terms are
+described below.
+
+tsc		Always supported.  Produces TSC timestamp packets to provide
+		timing information.  In some cases it is possible to decode
+		without timing information, for example a per-thread context
+		that does not overlap executable memory maps.
+
+		The default config selects tsc (i.e. tsc=1).
+
+noretcomp	Always supported.  Disables "return compression" so a TIP packet
+		is produced when a function returns.  Causes more packets to be
+		produced but might make decoding more reliable.
+
+		The default config does not select noretcomp (i.e. noretcomp=0).
+
+psb_period	Allows the frequency of PSB packets to be specified.
+
+		The PSB packet is a synchronization packet that provides a
+		starting point for decoding or recovery from errors.
+
+		Support for psb_period is indicated by:
+
+			/sys/bus/event_source/devices/intel_pt/caps/psb_cyc
+
+		which contains "1" if the feature is supported and "0"
+		otherwise.
+
+		Valid values are given by:
+
+			/sys/bus/event_source/devices/intel_pt/caps/psb_periods
+
+		which contains a hexadecimal value, the bits of which represent
+		valid values e.g. bit 2 set means value 2 is valid.
+
+		The psb_period value is converted to the approximate number of
+		trace bytes between PSB packets as:
+
+			2 ^ (value + 11)
+
+		e.g. value 3 means 16KiB bytes between PSBs
+
+		If an invalid value is entered, the error message
+		will give a list of valid values e.g.
+
+			$ perf record -e intel_pt/psb_period=15/u uname
+			Invalid psb_period for intel_pt. Valid values are: 0-5
+
+		If MTC packets are selected, the default config selects a value
+		of 3 (i.e. psb_period=3) or the nearest lower value that is
+		supported (0 is always supported).  Otherwise the default is 0.
+
+		If decoding is expected to be reliable and the buffer is large
+		then a large PSB period can be used.
+
+		Because a TSC packet is produced with PSB, the PSB period can
+		also affect the granularity to timing information in the absence
+		of MTC or CYC.
+
+mtc		Produces MTC timing packets.
+
+		MTC packets provide finer grain timestamp information than TSC
+		packets.  MTC packets record time using the hardware crystal
+		clock (CTC) which is related to TSC packets using a TMA packet.
+
+		Support for this feature is indicated by:
+
+			/sys/bus/event_source/devices/intel_pt/caps/mtc
+
+		which contains "1" if the feature is supported and
+		"0" otherwise.
+
+		The frequency of MTC packets can also be specified - see
+		mtc_period below.
+
+mtc_period	Specifies how frequently MTC packets are produced - see mtc
+		above for how to determine if MTC packets are supported.
+
+		Valid values are given by:
+
+			/sys/bus/event_source/devices/intel_pt/caps/mtc_periods
+
+		which contains a hexadecimal value, the bits of which represent
+		valid values e.g. bit 2 set means value 2 is valid.
+
+		The mtc_period value is converted to the MTC frequency as:
+
+			CTC-frequency / (2 ^ value)
+
+		e.g. value 3 means one eighth of CTC-frequency
+
+		Where CTC is the hardware crystal clock, the frequency of which
+		can be related to TSC via values provided in cpuid leaf 0x15.
+
+		If an invalid value is entered, the error message
+		will give a list of valid values e.g.
+
+			$ perf record -e intel_pt/mtc_period=15/u uname
+			Invalid mtc_period for intel_pt. Valid values are: 0,3,6,9
+
+		The default value is 3 or the nearest lower value
+		that is supported (0 is always supported).
+
+cyc		Produces CYC timing packets.
+
+		CYC packets provide even finer grain timestamp information than
+		MTC and TSC packets.  A CYC packet contains the number of CPU
+		cycles since the last CYC packet. Unlike MTC and TSC packets,
+		CYC packets are only sent when another packet is also sent.
+
+		Support for this feature is indicated by:
+
+			/sys/bus/event_source/devices/intel_pt/caps/psb_cyc
+
+		which contains "1" if the feature is supported and
+		"0" otherwise.
+
+		The number of CYC packets produced can be reduced by specifying
+		a threshold - see cyc_thresh below.
+
+cyc_thresh	Specifies how frequently CYC packets are produced - see cyc
+		above for how to determine if CYC packets are supported.
+
+		Valid cyc_thresh values are given by:
+
+			/sys/bus/event_source/devices/intel_pt/caps/cycle_thresholds
+
+		which contains a hexadecimal value, the bits of which represent
+		valid values e.g. bit 2 set means value 2 is valid.
+
+		The cyc_thresh value represents the minimum number of CPU cycles
+		that must have passed before a CYC packet can be sent.  The
+		number of CPU cycles is:
+
+			2 ^ (value - 1)
+
+		e.g. value 4 means 8 CPU cycles must pass before a CYC packet
+		can be sent.  Note a CYC packet is still only sent when another
+		packet is sent, not at, e.g. every 8 CPU cycles.
+
+		If an invalid value is entered, the error message
+		will give a list of valid values e.g.
+
+			$ perf record -e intel_pt/cyc,cyc_thresh=15/u uname
+			Invalid cyc_thresh for intel_pt. Valid values are: 0-12
+
+		CYC packets are not requested by default.
+
+pt		Specifies pass-through which enables the 'branch' config term.
+
+		The default config selects 'pt' if it is available, so a user will
+		never need to specify this term.
+
+branch		Enable branch tracing.  Branch tracing is enabled by default so to
+		disable branch tracing use 'branch=0'.
+
+		The default config selects 'branch' if it is available.
+
+ptw		Enable PTWRITE packets which are produced when a ptwrite instruction
+		is executed.
+
+		Support for this feature is indicated by:
+
+			/sys/bus/event_source/devices/intel_pt/caps/ptwrite
+
+		which contains "1" if the feature is supported and
+		"0" otherwise.
+
+		As an alternative, refer to "Emulated PTWRITE" further below.
+
+fup_on_ptw	Enable a FUP packet to follow the PTWRITE packet.  The FUP packet
+		provides the address of the ptwrite instruction.  In the absence of
+		fup_on_ptw, the decoder will use the address of the previous branch
+		if branch tracing is enabled, otherwise the address will be zero.
+		Note that fup_on_ptw will work even when branch tracing is disabled.
+
+pwr_evt		Enable power events.  The power events provide information about
+		changes to the CPU C-state.
+
+		Support for this feature is indicated by:
+
+			/sys/bus/event_source/devices/intel_pt/caps/power_event_trace
+
+		which contains "1" if the feature is supported and
+		"0" otherwise.
+
+event		Enable Event Trace.  The events provide information about asynchronous
+		events.
+
+		Support for this feature is indicated by:
+
+			/sys/bus/event_source/devices/intel_pt/caps/event_trace
+
+		which contains "1" if the feature is supported and
+		"0" otherwise.
+
+notnt		Disable TNT packets.  Without TNT packets, it is not possible to walk
+		executable code to reconstruct control flow, however FUP, TIP, TIP.PGE
+		and TIP.PGD packets still indicate asynchronous control flow, and (if
+		return compression is disabled - see noretcomp) return statements.
+		The advantage of eliminating TNT packets is reducing the size of the
+		trace and corresponding tracing overhead.
+
+		Support for this feature is indicated by:
+
+			/sys/bus/event_source/devices/intel_pt/caps/tnt_disable
+
+		which contains "1" if the feature is supported and
+		"0" otherwise.
+
+
+AUX area sampling option
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+To select Intel PT "sampling" the AUX area sampling option can be used:
+
+	--aux-sample
+
+Optionally it can be followed by the sample size in bytes e.g.
+
+	--aux-sample=8192
+
+In addition, the Intel PT event to sample must be defined e.g.
+
+	-e intel_pt//u
+
+Samples on other events will be created containing Intel PT data e.g. the
+following will create Intel PT samples on the branch-misses event, note the
+events must be grouped using {}:
+
+	perf record --aux-sample -e '{intel_pt//u,branch-misses:u}'
+
+An alternative to '--aux-sample' is to add the config term 'aux-sample-size' to
+events.  In this case, the grouping is implied e.g.
+
+	perf record -e intel_pt//u -e branch-misses/aux-sample-size=8192/u
+
+is the same as:
+
+	perf record -e '{intel_pt//u,branch-misses/aux-sample-size=8192/u}'
+
+but allows for also using an address filter e.g.:
+
+	perf record -e intel_pt//u --filter 'filter * @/bin/ls' -e branch-misses/aux-sample-size=8192/u -- ls
+
+It is important to select a sample size that is big enough to contain at least
+one PSB packet.  If not a warning will be displayed:
+
+	Intel PT sample size (%zu) may be too small for PSB period (%zu)
+
+The calculation used for that is: if sample_size <= psb_period + 256 display the
+warning.  When sampling is used, psb_period defaults to 0 (2KiB).
+
+The default sample size is 4KiB.
+
+The sample size is passed in aux_sample_size in struct perf_event_attr.  The
+sample size is limited by the maximum event size which is 64KiB.  It is
+difficult to know how big the event might be without the trace sample attached,
+but the tool validates that the sample size is not greater than 60KiB.
+
+
+new snapshot option
+~~~~~~~~~~~~~~~~~~~
+
+The difference between full trace and snapshot from the kernel's perspective is
+that in full trace we don't overwrite trace data that the user hasn't collected
+yet (and indicated that by advancing aux_tail), whereas in snapshot mode we let
+the trace run and overwrite older data in the buffer so that whenever something
+interesting happens, we can stop it and grab a snapshot of what was going on
+around that interesting moment.
+
+To select snapshot mode a new option has been added:
+
+	-S
+
+Optionally it can be followed by the snapshot size e.g.
+
+	-S0x100000
+
+The default snapshot size is the auxtrace mmap size.  If neither auxtrace mmap size
+nor snapshot size is specified, then the default is 4MiB for privileged users
+(or if /proc/sys/kernel/perf_event_paranoid < 0), 128KiB for unprivileged users.
+If an unprivileged user does not specify mmap pages, the mmap pages will be
+reduced as described in the 'new auxtrace mmap size option' section below.
+
+The snapshot size is displayed if the option -vv is used e.g.
+
+	Intel PT snapshot size: %zu
+
+
+new auxtrace mmap size option
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Intel PT buffer size is specified by an addition to the -m option e.g.
+
+	-m,16
+
+selects a buffer size of 16 pages i.e. 64KiB.
+
+Note that the existing functionality of -m is unchanged.  The auxtrace mmap size
+is specified by the optional addition of a comma and the value.
+
+The default auxtrace mmap size for Intel PT is 4MiB/page_size for privileged users
+(or if /proc/sys/kernel/perf_event_paranoid < 0), 128KiB for unprivileged users.
+If an unprivileged user does not specify mmap pages, the mmap pages will be
+reduced from the default 512KiB/page_size to 256KiB/page_size, otherwise the
+user is likely to get an error as they exceed their mlock limit (Max locked
+memory as shown in /proc/self/limits).  Note that perf does not count the first
+512KiB (actually /proc/sys/kernel/perf_event_mlock_kb minus 1 page) per cpu
+against the mlock limit so an unprivileged user is allowed 512KiB per cpu plus
+their mlock limit (which defaults to 64KiB but is not multiplied by the number
+of cpus).
+
+In full-trace mode, powers of two are allowed for buffer size, with a minimum
+size of 2 pages.  In snapshot mode or sampling mode, it is the same but the
+minimum size is 1 page.
+
+The mmap size and auxtrace mmap size are displayed if the -vv option is used e.g.
+
+	mmap length 528384
+	auxtrace mmap length 4198400
+
+
+Intel PT modes of operation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Intel PT can be used in 3 modes:
+	full-trace mode
+	sample mode
+	snapshot mode
+
+Full-trace mode traces continuously e.g.
+
+	perf record -e intel_pt//u uname
+
+Sample mode attaches a Intel PT sample to other events e.g.
+
+	perf record --aux-sample -e intel_pt//u -e branch-misses:u
+
+Snapshot mode captures the available data when a signal is sent or "snapshot"
+control command is issued. e.g. using a signal
+
+	perf record -v -e intel_pt//u -S ./loopy 1000000000 &
+	[1] 11435
+	kill -USR2 11435
+	Recording AUX area tracing snapshot
+
+Note that the signal sent is SIGUSR2.
+Note that "Recording AUX area tracing snapshot" is displayed because the -v
+option is used.
+
+The advantage of using "snapshot" control command is that the access is
+controlled by access to a FIFO e.g.
+
+	$ mkfifo perf.control
+	$ mkfifo perf.ack
+	$ cat perf.ack &
+	[1] 15235
+	$ sudo ~/bin/perf record --control fifo:perf.control,perf.ack -S -e intel_pt//u -- sleep 60 &
+	[2] 15243
+	$ ps -e | grep perf
+	15244 pts/1    00:00:00 perf
+	$ kill -USR2 15244
+	bash: kill: (15244) - Operation not permitted
+	$ echo snapshot > perf.control
+	ack
+
+The 3 Intel PT modes of operation cannot be used together.
+
+
+Buffer handling
+~~~~~~~~~~~~~~~
+
+There may be buffer limitations (i.e. single ToPa entry) which means that actual
+buffer sizes are limited to powers of 2 up to 4MiB (MAX_ORDER).  In order to
+provide other sizes, and in particular an arbitrarily large size, multiple
+buffers are logically concatenated.  However an interrupt must be used to switch
+between buffers.  That has two potential problems:
+	a) the interrupt may not be handled in time so that the current buffer
+	becomes full and some trace data is lost.
+	b) the interrupts may slow the system and affect the performance
+	results.
+
+If trace data is lost, the driver sets 'truncated' in the PERF_RECORD_AUX event
+which the tools report as an error.
+
+In full-trace mode, the driver waits for data to be copied out before allowing
+the (logical) buffer to wrap-around.  If data is not copied out quickly enough,
+again 'truncated' is set in the PERF_RECORD_AUX event.  If the driver has to
+wait, the intel_pt event gets disabled.  Because it is difficult to know when
+that happens, perf tools always re-enable the intel_pt event after copying out
+data.
+
+
+Intel PT and build ids
+~~~~~~~~~~~~~~~~~~~~~~
+
+By default "perf record" post-processes the event stream to find all build ids
+for executables for all addresses sampled.  Deliberately, Intel PT is not
+decoded for that purpose (it would take too long).  Instead the build ids for
+all executables encountered (due to mmap, comm or task events) are included
+in the perf.data file.
+
+To see buildids included in the perf.data file use the command:
+
+	perf buildid-list
+
+If the perf.data file contains Intel PT data, that is the same as:
+
+	perf buildid-list --with-hits
+
+
+Snapshot mode and event disabling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In order to make a snapshot, the intel_pt event is disabled using an IOCTL,
+namely PERF_EVENT_IOC_DISABLE.  However doing that can also disable the
+collection of side-band information.  In order to prevent that,  a dummy
+software event has been introduced that permits tracking events (like mmaps) to
+continue to be recorded while intel_pt is disabled.  That is important to ensure
+there is complete side-band information to allow the decoding of subsequent
+snapshots.
+
+A test has been created for that.  To find the test:
+
+	perf test list
+	...
+	23: Test using a dummy software event to keep tracking
+
+To run the test:
+
+	perf test 23
+	23: Test using a dummy software event to keep tracking     : Ok
+
+
+perf record modes (nothing new here)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+perf record essentially operates in one of three modes:
+	per thread
+	per cpu
+	workload only
+
+"per thread" mode is selected by -t or by --per-thread (with -p or -u or just a
+workload).
+"per cpu" is selected by -C or -a.
+"workload only" mode is selected by not using the other options but providing a
+command to run (i.e. the workload).
+
+In per-thread mode an exact list of threads is traced.  There is no inheritance.
+Each thread has its own event buffer.
+
+In per-cpu mode all processes (or processes from the selected cgroup i.e. -G
+option, or processes selected with -p or -u) are traced.  Each cpu has its own
+buffer. Inheritance is allowed.
+
+In workload-only mode, the workload is traced but with per-cpu buffers.
+Inheritance is allowed.  Note that you can now trace a workload in per-thread
+mode by using the --per-thread option.
+
+
+Privileged vs non-privileged users
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Unless /proc/sys/kernel/perf_event_paranoid is set to -1, unprivileged users
+have memory limits imposed upon them.  That affects what buffer sizes they can
+have as outlined above.
+
+The v4.2 kernel introduced support for a context switch metadata event,
+PERF_RECORD_SWITCH, which allows unprivileged users to see when their processes
+are scheduled out and in, just not by whom, which is left for the
+PERF_RECORD_SWITCH_CPU_WIDE, that is only accessible in system wide context,
+which in turn requires CAP_PERFMON or CAP_SYS_ADMIN.
+
+Please see the 45ac1403f564 ("perf: Add PERF_RECORD_SWITCH to indicate context
+switches") commit, that introduces these metadata events for further info.
+
+When working with kernels < v4.2, the following considerations must be taken,
+as the sched:sched_switch tracepoints will be used to receive such information:
+
+Unless /proc/sys/kernel/perf_event_paranoid is set to -1, unprivileged users are
+not permitted to use tracepoints which means there is insufficient side-band
+information to decode Intel PT in per-cpu mode, and potentially workload-only
+mode too if the workload creates new processes.
+
+Note also, that to use tracepoints, read-access to debugfs is required.  So if
+debugfs is not mounted or the user does not have read-access, it will again not
+be possible to decode Intel PT in per-cpu mode.
+
+
+sched_switch tracepoint
+~~~~~~~~~~~~~~~~~~~~~~~
+
+The sched_switch tracepoint is used to provide side-band data for Intel PT
+decoding in kernels where the PERF_RECORD_SWITCH metadata event isn't
+available.
+
+The sched_switch events are automatically added. e.g. the second event shown
+below:
+
+	$ perf record -vv -e intel_pt//u uname
+	------------------------------------------------------------
+	perf_event_attr:
+	type                             6
+	size                             112
+	config                           0x400
+	{ sample_period, sample_freq }   1
+	sample_type                      IP|TID|TIME|CPU|IDENTIFIER
+	read_format                      ID
+	disabled                         1
+	inherit                          1
+	exclude_kernel                   1
+	exclude_hv                       1
+	enable_on_exec                   1
+	sample_id_all                    1
+	------------------------------------------------------------
+	sys_perf_event_open: pid 31104  cpu 0  group_fd -1  flags 0x8
+	sys_perf_event_open: pid 31104  cpu 1  group_fd -1  flags 0x8
+	sys_perf_event_open: pid 31104  cpu 2  group_fd -1  flags 0x8
+	sys_perf_event_open: pid 31104  cpu 3  group_fd -1  flags 0x8
+	------------------------------------------------------------
+	perf_event_attr:
+	type                             2
+	size                             112
+	config                           0x108
+	{ sample_period, sample_freq }   1
+	sample_type                      IP|TID|TIME|CPU|PERIOD|RAW|IDENTIFIER
+	read_format                      ID
+	inherit                          1
+	sample_id_all                    1
+	exclude_guest                    1
+	------------------------------------------------------------
+	sys_perf_event_open: pid -1  cpu 0  group_fd -1  flags 0x8
+	sys_perf_event_open: pid -1  cpu 1  group_fd -1  flags 0x8
+	sys_perf_event_open: pid -1  cpu 2  group_fd -1  flags 0x8
+	sys_perf_event_open: pid -1  cpu 3  group_fd -1  flags 0x8
+	------------------------------------------------------------
+	perf_event_attr:
+	type                             1
+	size                             112
+	config                           0x9
+	{ sample_period, sample_freq }   1
+	sample_type                      IP|TID|TIME|IDENTIFIER
+	read_format                      ID
+	disabled                         1
+	inherit                          1
+	exclude_kernel                   1
+	exclude_hv                       1
+	mmap                             1
+	comm                             1
+	enable_on_exec                   1
+	task                             1
+	sample_id_all                    1
+	mmap2                            1
+	comm_exec                        1
+	------------------------------------------------------------
+	sys_perf_event_open: pid 31104  cpu 0  group_fd -1  flags 0x8
+	sys_perf_event_open: pid 31104  cpu 1  group_fd -1  flags 0x8
+	sys_perf_event_open: pid 31104  cpu 2  group_fd -1  flags 0x8
+	sys_perf_event_open: pid 31104  cpu 3  group_fd -1  flags 0x8
+	mmap size 528384B
+	AUX area mmap length 4194304
+	perf event ring buffer mmapped per cpu
+	Synthesizing auxtrace information
+	Linux
+	[ perf record: Woken up 1 times to write data ]
+	[ perf record: Captured and wrote 0.042 MB perf.data ]
+
+Note, the sched_switch event is only added if the user is permitted to use it
+and only in per-cpu mode.
+
+Note also, the sched_switch event is only added if TSC packets are requested.
+That is because, in the absence of timing information, the sched_switch events
+cannot be matched against the Intel PT trace.
+
+
+perf script
+-----------
+
+By default, perf script will decode trace data found in the perf.data file.
+This can be further controlled by new option --itrace.
+
+
+New --itrace option
+~~~~~~~~~~~~~~~~~~~
+
+Having no option is the same as
+
+	--itrace
+
+which, in turn, is the same as
+
+	--itrace=cepwx
+
+The letters are:
+
+	i	synthesize "instructions" events
+	b	synthesize "branches" events
+	x	synthesize "transactions" events
+	w	synthesize "ptwrite" events
+	p	synthesize "power" events (incl. PSB events)
+	c	synthesize branches events (calls only)
+	r	synthesize branches events (returns only)
+	o	synthesize PEBS-via-PT events
+	I	synthesize Event Trace events
+	e	synthesize tracing error events
+	d	create a debug log
+	g	synthesize a call chain (use with i or x)
+	G	synthesize a call chain on existing event records
+	l	synthesize last branch entries (use with i or x)
+	L	synthesize last branch entries on existing event records
+	s	skip initial number of events
+	q	quicker (less detailed) decoding
+	A	approximate IPC
+	Z	prefer to ignore timestamps (so-called "timeless" decoding)
+
+"Instructions" events look like they were recorded by "perf record -e
+instructions".
+
+"Branches" events look like they were recorded by "perf record -e branches". "c"
+and "r" can be combined to get calls and returns.
+
+"Transactions" events correspond to the start or end of transactions. The
+'flags' field can be used in perf script to determine whether the event is a
+transaction start, commit or abort.
+
+Note that "instructions", "branches" and "transactions" events depend on code
+flow packets which can be disabled by using the config term "branch=0".  Refer
+to the config terms section above.
+
+"ptwrite" events record the payload of the ptwrite instruction and whether
+"fup_on_ptw" was used.  "ptwrite" events depend on PTWRITE packets which are
+recorded only if the "ptw" config term was used.  Refer to the config terms
+section above.  perf script "synth" field displays "ptwrite" information like
+this: "ip: 0 payload: 0x123456789abcdef0"  where "ip" is 1 if "fup_on_ptw" was
+used.
+
+"Power" events correspond to power event packets and CBR (core-to-bus ratio)
+packets.  While CBR packets are always recorded when tracing is enabled, power
+event packets are recorded only if the "pwr_evt" config term was used.  Refer to
+the config terms section above.  The power events record information about
+C-state changes, whereas CBR is indicative of CPU frequency.  perf script
+"event,synth" fields display information like this:
+
+	cbr:  cbr: 22 freq: 2189 MHz (200%)
+	mwait:  hints: 0x60 extensions: 0x1
+	pwre:  hw: 0 cstate: 2 sub-cstate: 0
+	exstop:  ip: 1
+	pwrx:  deepest cstate: 2 last cstate: 2 wake reason: 0x4
+
+Where:
+
+	"cbr" includes the frequency and the percentage of maximum non-turbo
+	"mwait" shows mwait hints and extensions
+	"pwre" shows C-state transitions (to a C-state deeper than C0) and
+	whether	initiated by hardware
+	"exstop" indicates execution stopped and whether the IP was recorded
+	exactly,
+	"pwrx" indicates return to C0
+
+For more details refer to the Intel 64 and IA-32 Architectures Software
+Developer Manuals.
+
+PSB events show when a PSB+ occurred and also the byte-offset in the trace.
+Emitting a PSB+ can cause a CPU a slight delay. When doing timing analysis
+of code with Intel PT, it is useful to know if a timing bubble was caused
+by Intel PT or not.
+
+Error events show where the decoder lost the trace.  Error events
+are quite important.  Users must know if what they are seeing is a complete
+picture or not. The "e" option may be followed by flags which affect what errors
+will or will not be reported.  Each flag must be preceded by either '+' or '-'.
+The flags supported by Intel PT are:
+
+		-o	Suppress overflow errors
+		-l	Suppress trace data lost errors
+
+For example, for errors but not overflow or data lost errors:
+
+	--itrace=e-o-l
+
+The "d" option will cause the creation of a file "intel_pt.log" containing all
+decoded packets and instructions.  Note that this option slows down the decoder
+and that the resulting file may be very large.  The "d" option may be followed
+by flags which affect what debug messages will or will not be logged. Each flag
+must be preceded by either '+' or '-'. The flags support by Intel PT are:
+
+		-a	Suppress logging of perf events
+		+a	Log all perf events
+		+e	Output only on decoding errors (size configurable)
+		+o	Output to stdout instead of "intel_pt.log"
+
+By default, logged perf events are filtered by any specified time ranges, but
+flag +a overrides that.  The +e flag can be useful for analyzing errors.  By
+default, the log size in that case is 16384 bytes, but can be altered by
+linkperf:perf-config[1] e.g. perf config itrace.debug-log-buffer-size=30000
+
+In addition, the period of the "instructions" event can be specified. e.g.
+
+	--itrace=i10us
+
+sets the period to 10us i.e. one  instruction sample is synthesized for each 10
+microseconds of trace.  Alternatives to "us" are "ms" (milliseconds),
+"ns" (nanoseconds), "t" (TSC ticks) or "i" (instructions).
+
+"ms", "us" and "ns" are converted to TSC ticks.
+
+The timing information included with Intel PT does not give the time of every
+instruction.  Consequently, for the purpose of sampling, the decoder estimates
+the time since the last timing packet based on 1 tick per instruction.  The time
+on the sample is *not* adjusted and reflects the last known value of TSC.
+
+For Intel PT, the default period is 100us.
+
+Setting it to a zero period means "as often as possible".
+
+In the case of Intel PT that is the same as a period of 1 and a unit of
+'instructions' (i.e. --itrace=i1i).
+
+Also the call chain size (default 16, max. 1024) for instructions or
+transactions events can be specified. e.g.
+
+	--itrace=ig32
+	--itrace=xg32
+
+Also the number of last branch entries (default 64, max. 1024) for instructions or
+transactions events can be specified. e.g.
+
+       --itrace=il10
+       --itrace=xl10
+
+Note that last branch entries are cleared for each sample, so there is no overlap
+from one sample to the next.
+
+The G and L options are designed in particular for sample mode, and work much
+like g and l but add call chain and branch stack to the other selected events
+instead of synthesized events. For example, to record branch-misses events for
+'ls' and then add a call chain derived from the Intel PT trace:
+
+	perf record --aux-sample -e '{intel_pt//u,branch-misses:u}' -- ls
+	perf report --itrace=Ge
+
+Although in fact G is a default for perf report, so that is the same as just:
+
+	perf report
+
+One caveat with the G and L options is that they work poorly with "Large PEBS".
+Large PEBS means PEBS records will be accumulated by hardware and the written
+into the event buffer in one go.  That reduces interrupts, but can give very
+late timestamps.  Because the Intel PT trace is synchronized by timestamps,
+the PEBS events do not match the trace.  Currently, Large PEBS is used only in
+certain circumstances:
+	- hardware supports it
+	- PEBS is used
+	- event period is specified, instead of frequency
+	- the sample type is limited to the following flags:
+		PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR |
+		PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID |
+		PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER |
+		PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR |
+		PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER |
+		PERF_SAMPLE_PERIOD (and sometimes) | PERF_SAMPLE_TIME
+Because Intel PT sample mode uses a different sample type to the list above,
+Large PEBS is not used with Intel PT sample mode. To avoid Large PEBS in other
+cases, avoid specifying the event period i.e. avoid the 'perf record' -c option,
+--count option, or 'period' config term.
+
+To disable trace decoding entirely, use the option --no-itrace.
+
+It is also possible to skip events generated (instructions, branches, transactions)
+at the beginning. This is useful to ignore initialization code.
+
+	--itrace=i0nss1000000
+
+skips the first million instructions.
+
+The q option changes the way the trace is decoded.  The decoding is much faster
+but much less detailed.  Specifically, with the q option, the decoder does not
+decode TNT packets, and does not walk object code, but gets the ip from FUP and
+TIP packets.  The q option can be used with the b and i options but the period
+is not used.  The q option decodes more quickly, but is useful only if the
+control flow of interest is represented or indicated by FUP, TIP, TIP.PGE, or
+TIP.PGD packets (refer below).  However the q option could be used to find time
+ranges that could then be decoded fully using the --time option.
+
+What will *not* be decoded with the (single) q option:
+
+	- direct calls and jmps
+	- conditional branches
+	- non-branch instructions
+
+What *will* be decoded with the (single) q option:
+
+	- asynchronous branches such as interrupts
+	- indirect branches
+	- function return target address *if* the noretcomp config term (refer
+	config terms section) was used
+	- start of (control-flow) tracing
+	- end of (control-flow) tracing, if it is not out of context
+	- power events, ptwrite, transaction start and abort
+	- instruction pointer associated with PSB packets
+
+Note the q option does not specify what events will be synthesized e.g. the p
+option must be used also to show power events.
+
+Repeating the q option (double-q i.e. qq) results in even faster decoding and even
+less detail.  The decoder decodes only extended PSB (PSB+) packets, getting the
+instruction pointer if there is a FUP packet within PSB+ (i.e. between PSB and
+PSBEND).  Note PSB packets occur regularly in the trace based on the psb_period
+config term (refer config terms section).  There will be a FUP packet if the
+PSB+ occurs while control flow is being traced.
+
+What will *not* be decoded with the qq option:
+
+	- everything except instruction pointer associated with PSB packets
+
+What *will* be decoded with the qq option:
+
+	- instruction pointer associated with PSB packets
+
+The Z option is equivalent to having recorded a trace without TSC
+(i.e. config term tsc=0). It can be useful to avoid timestamp issues when
+decoding a trace of a virtual machine.
+
+
+dlfilter-show-cycles.so
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Cycles can be displayed using dlfilter-show-cycles.so in which case the itrace A
+option can be useful to provide higher granularity cycle information:
+
+	perf script --itrace=A --call-trace --dlfilter dlfilter-show-cycles.so
+
+To see a list of dlfilters:
+
+	perf script -v --list-dlfilters
+
+See also linkperf:perf-dlfilters[1]
+
+
+dump option
+~~~~~~~~~~~
+
+perf script has an option (-D) to "dump" the events i.e. display the binary
+data.
+
+When -D is used, Intel PT packets are displayed.  The packet decoder does not
+pay attention to PSB packets, but just decodes the bytes - so the packets seen
+by the actual decoder may not be identical in places where the data is corrupt.
+One example of that would be when the buffer-switching interrupt has been too
+slow, and the buffer has been filled completely.  In that case, the last packet
+in the buffer might be truncated and immediately followed by a PSB as the trace
+continues in the next buffer.
+
+To disable the display of Intel PT packets, combine the -D option with
+--no-itrace.
+
+
+perf report
+-----------
+
+By default, perf report will decode trace data found in the perf.data file.
+This can be further controlled by new option --itrace exactly the same as
+perf script, with the exception that the default is --itrace=igxe.
+
+
+perf inject
+-----------
+
+perf inject also accepts the --itrace option in which case tracing data is
+removed and replaced with the synthesized events. e.g.
+
+	perf inject --itrace -i perf.data -o perf.data.new
+
+Below is an example of using Intel PT with autofdo.  It requires autofdo
+(https://github.com/google/autofdo) and gcc version 5.  The bubble
+sort example is from the AutoFDO tutorial (https://gcc.gnu.org/wiki/AutoFDO/Tutorial)
+amended to take the number of elements as a parameter.
+
+	$ gcc-5 -O3 sort.c -o sort_optimized
+	$ ./sort_optimized 30000
+	Bubble sorting array of 30000 elements
+	2254 ms
+
+	$ cat ~/.perfconfig
+	[intel-pt]
+		mispred-all = on
+
+	$ perf record -e intel_pt//u ./sort 3000
+	Bubble sorting array of 3000 elements
+	58 ms
+	[ perf record: Woken up 2 times to write data ]
+	[ perf record: Captured and wrote 3.939 MB perf.data ]
+	$ perf inject -i perf.data -o inj --itrace=i100usle --strip
+	$ ./create_gcov --binary=./sort --profile=inj --gcov=sort.gcov -gcov_version=1
+	$ gcc-5 -O3 -fauto-profile=sort.gcov sort.c -o sort_autofdo
+	$ ./sort_autofdo 30000
+	Bubble sorting array of 30000 elements
+	2155 ms
+
+Note there is currently no advantage to using Intel PT instead of LBR, but
+that may change in the future if greater use is made of the data.
+
+
+PEBS via Intel PT
+-----------------
+
+Some hardware has the feature to redirect PEBS records to the Intel PT trace.
+Recording is selected by using the aux-output config term e.g.
+
+	perf record -c 10000 -e '{intel_pt/branch=0/,cycles/aux-output/ppp}' uname
+
+Originally, software only supported redirecting at most one PEBS event because it
+was not able to differentiate one event from another. To overcome that, more recent
+kernels and perf tools add support for the PERF_RECORD_AUX_OUTPUT_HW_ID side-band event.
+To check for the presence of that event in a PEBS-via-PT trace:
+
+	perf script -D --no-itrace | grep PERF_RECORD_AUX_OUTPUT_HW_ID
+
+To display PEBS events from the Intel PT trace, use the itrace 'o' option e.g.
+
+	perf script --itrace=oe
+
+XED
+---
+
+include::build-xed.txt[]
+
+
+Tracing Virtual Machines (kernel only)
+--------------------------------------
+
+Currently, kernel tracing is supported with either "timeless" decoding
+(i.e. no TSC timestamps) or VM Time Correlation. VM Time Correlation is an extra step
+using 'perf inject' and requires unchanging VMX TSC Offset and no VMX TSC Scaling.
+
+Other limitations and caveats
+
+ VMX controls may suppress packets needed for decoding resulting in decoding errors
+ VMX controls may block the perf NMI to the host potentially resulting in lost trace data
+ Guest kernel self-modifying code (e.g. jump labels or JIT-compiled eBPF) will result in decoding errors
+ Guest thread information is unknown
+ Guest VCPU is unknown but may be able to be inferred from the host thread
+ Callchains are not supported
+
+Example using "timeless" decoding
+
+Start VM
+
+ $ sudo virsh start kubuntu20.04
+ Domain kubuntu20.04 started
+
+Mount the guest file system.  Note sshfs needs -o direct_io to enable reading of proc files.  root access is needed to read /proc/kcore.
+
+ $ mkdir vm0
+ $ sshfs -o direct_io root@vm0:/ vm0
+
+Copy the guest /proc/kallsyms, /proc/modules and /proc/kcore
+
+ $ perf buildid-cache -v --kcore vm0/proc/kcore
+ kcore added to build-id cache directory /home/user/.debug/[kernel.kcore]/9600f316a53a0f54278885e8d9710538ec5f6a08/2021021807494306
+ $ KALLSYMS=/home/user/.debug/[kernel.kcore]/9600f316a53a0f54278885e8d9710538ec5f6a08/2021021807494306/kallsyms
+
+Find the VM process
+
+ $ ps -eLl | grep 'KVM\|PID'
+ F S   UID     PID    PPID     LWP  C PRI  NI ADDR SZ WCHAN  TTY          TIME CMD
+ 3 S 64055    1430       1    1440  1  80   0 - 1921718 -    ?        00:02:47 CPU 0/KVM
+ 3 S 64055    1430       1    1441  1  80   0 - 1921718 -    ?        00:02:41 CPU 1/KVM
+ 3 S 64055    1430       1    1442  1  80   0 - 1921718 -    ?        00:02:38 CPU 2/KVM
+ 3 S 64055    1430       1    1443  2  80   0 - 1921718 -    ?        00:03:18 CPU 3/KVM
+
+Start an open-ended perf record, tracing the VM process, do something on the VM, and then ctrl-C to stop.
+TSC is not supported and tsc=0 must be specified.  That means mtc is useless, so add mtc=0.
+However, IPC can still be determined, hence cyc=1 can be added.
+Only kernel decoding is supported, so 'k' must be specified.
+Intel PT traces both the host and the guest so --guest and --host need to be specified.
+Without timestamps, --per-thread must be specified to distinguish threads.
+
+ $ sudo perf kvm --guest --host --guestkallsyms $KALLSYMS record --kcore -e intel_pt/tsc=0,mtc=0,cyc=1/k -p 1430 --per-thread
+ ^C
+ [ perf record: Woken up 1 times to write data ]
+ [ perf record: Captured and wrote 5.829 MB ]
+
+perf script can be used to provide an instruction trace
+
+ $ perf script --guestkallsyms $KALLSYMS --insn-trace --xed -F+ipc | grep -C10 vmresume | head -21
+       CPU 0/KVM  1440  ffffffff82133cdd __vmx_vcpu_run+0x3d ([kernel.kallsyms])                movq  0x48(%rax), %r9
+       CPU 0/KVM  1440  ffffffff82133ce1 __vmx_vcpu_run+0x41 ([kernel.kallsyms])                movq  0x50(%rax), %r10
+       CPU 0/KVM  1440  ffffffff82133ce5 __vmx_vcpu_run+0x45 ([kernel.kallsyms])                movq  0x58(%rax), %r11
+       CPU 0/KVM  1440  ffffffff82133ce9 __vmx_vcpu_run+0x49 ([kernel.kallsyms])                movq  0x60(%rax), %r12
+       CPU 0/KVM  1440  ffffffff82133ced __vmx_vcpu_run+0x4d ([kernel.kallsyms])                movq  0x68(%rax), %r13
+       CPU 0/KVM  1440  ffffffff82133cf1 __vmx_vcpu_run+0x51 ([kernel.kallsyms])                movq  0x70(%rax), %r14
+       CPU 0/KVM  1440  ffffffff82133cf5 __vmx_vcpu_run+0x55 ([kernel.kallsyms])                movq  0x78(%rax), %r15
+       CPU 0/KVM  1440  ffffffff82133cf9 __vmx_vcpu_run+0x59 ([kernel.kallsyms])                movq  (%rax), %rax
+       CPU 0/KVM  1440  ffffffff82133cfc __vmx_vcpu_run+0x5c ([kernel.kallsyms])                callq  0xffffffff82133c40
+       CPU 0/KVM  1440  ffffffff82133c40 vmx_vmenter+0x0 ([kernel.kallsyms])            jz 0xffffffff82133c46
+       CPU 0/KVM  1440  ffffffff82133c42 vmx_vmenter+0x2 ([kernel.kallsyms])            vmresume         IPC: 0.11 (50/445)
+           :1440  1440  ffffffffbb678b06 native_write_msr+0x6 ([guest.kernel.kallsyms])                 nopl  %eax, (%rax,%rax,1)
+           :1440  1440  ffffffffbb678b0b native_write_msr+0xb ([guest.kernel.kallsyms])                 retq     IPC: 0.04 (2/41)
+           :1440  1440  ffffffffbb666646 lapic_next_deadline+0x26 ([guest.kernel.kallsyms])             data16 nop
+           :1440  1440  ffffffffbb666648 lapic_next_deadline+0x28 ([guest.kernel.kallsyms])             xor %eax, %eax
+           :1440  1440  ffffffffbb66664a lapic_next_deadline+0x2a ([guest.kernel.kallsyms])             popq  %rbp
+           :1440  1440  ffffffffbb66664b lapic_next_deadline+0x2b ([guest.kernel.kallsyms])             retq     IPC: 0.16 (4/25)
+           :1440  1440  ffffffffbb74607f clockevents_program_event+0x8f ([guest.kernel.kallsyms])               test %eax, %eax
+           :1440  1440  ffffffffbb746081 clockevents_program_event+0x91 ([guest.kernel.kallsyms])               jz 0xffffffffbb74603c    IPC: 0.06 (2/30)
+           :1440  1440  ffffffffbb74603c clockevents_program_event+0x4c ([guest.kernel.kallsyms])               popq  %rbx
+           :1440  1440  ffffffffbb74603d clockevents_program_event+0x4d ([guest.kernel.kallsyms])               popq  %r12
+
+Example using VM Time Correlation
+
+Start VM
+
+ $ sudo virsh start kubuntu20.04
+ Domain kubuntu20.04 started
+
+Mount the guest file system.  Note sshfs needs -o direct_io to enable reading of proc files.  root access is needed to read /proc/kcore.
+
+ $ mkdir -p vm0
+ $ sshfs -o direct_io root@vm0:/ vm0
+
+Copy the guest /proc/kallsyms, /proc/modules and /proc/kcore
+
+ $ perf buildid-cache -v --kcore vm0/proc/kcore
+ same kcore found in /home/user/.debug/[kernel.kcore]/cc9c55a98c5e4ec0aeda69302554aabed5cd6491/2021021312450777
+ $ KALLSYMS=/home/user/.debug/\[kernel.kcore\]/cc9c55a98c5e4ec0aeda69302554aabed5cd6491/2021021312450777/kallsyms
+
+Find the VM process
+
+ $ ps -eLl | grep 'KVM\|PID'
+ F S   UID     PID    PPID     LWP  C PRI  NI ADDR SZ WCHAN  TTY          TIME CMD
+ 3 S 64055   16998       1   17005 13  80   0 - 1818189 -    ?        00:00:16 CPU 0/KVM
+ 3 S 64055   16998       1   17006  4  80   0 - 1818189 -    ?        00:00:05 CPU 1/KVM
+ 3 S 64055   16998       1   17007  3  80   0 - 1818189 -    ?        00:00:04 CPU 2/KVM
+ 3 S 64055   16998       1   17008  4  80   0 - 1818189 -    ?        00:00:05 CPU 3/KVM
+
+Start an open-ended perf record, tracing the VM process, do something on the VM, and then ctrl-C to stop.
+IPC can be determined, hence cyc=1 can be added.
+Only kernel decoding is supported, so 'k' must be specified.
+Intel PT traces both the host and the guest so --guest and --host need to be specified.
+
+ $ sudo perf kvm --guest --host --guestkallsyms $KALLSYMS record --kcore -e intel_pt/cyc=1/k -p 16998
+ ^C[ perf record: Woken up 1 times to write data ]
+ [ perf record: Captured and wrote 9.041 MB perf.data.kvm ]
+
+Now 'perf inject' can be used to determine the VMX TCS Offset. Note, Intel PT TSC packets are
+only 7-bytes, so the TSC Offset might differ from the actual value in the 8th byte. That will
+have no effect i.e. the resulting timestamps will be correct anyway.
+
+ $ perf inject -i perf.data.kvm --vm-time-correlation=dry-run
+ ERROR: Unknown TSC Offset for VMCS 0x1bff6a
+ VMCS: 0x1bff6a  TSC Offset 0xffffe42722c64c41
+ ERROR: Unknown TSC Offset for VMCS 0x1cbc08
+ VMCS: 0x1cbc08  TSC Offset 0xffffe42722c64c41
+ ERROR: Unknown TSC Offset for VMCS 0x1c3ce8
+ VMCS: 0x1c3ce8  TSC Offset 0xffffe42722c64c41
+ ERROR: Unknown TSC Offset for VMCS 0x1cbce9
+ VMCS: 0x1cbce9  TSC Offset 0xffffe42722c64c41
+
+Each virtual CPU has a different Virtual Machine Control Structure (VMCS)
+shown above with the calculated TSC Offset. For an unchanging TSC Offset
+they should all be the same for the same virtual machine.
+
+Now that the TSC Offset is known, it can be provided to 'perf inject'
+
+ $ perf inject -i perf.data.kvm --vm-time-correlation="dry-run 0xffffe42722c64c41"
+
+Note the options for 'perf inject' --vm-time-correlation are:
+
+ [ dry-run ] [ <TSC Offset> [ : <VMCS> [ , <VMCS> ]... ]  ]...
+
+So it is possible to specify different TSC Offsets for different VMCS.
+The option "dry-run" will cause the file to be processed but without updating it.
+Note it is also possible to get a intel_pt.log file by adding option --itrace=d
+
+There were no errors so, do it for real
+
+ $ perf inject -i perf.data.kvm --vm-time-correlation=0xffffe42722c64c41 --force
+
+'perf script' can be used to see if there are any decoder errors
+
+ $ perf script -i perf.data.kvm --guestkallsyms $KALLSYMS --itrace=e-o
+
+There were none.
+
+'perf script' can be used to provide an instruction trace showing timestamps
+
+ $ perf script -i perf.data.kvm --guestkallsyms $KALLSYMS --insn-trace --xed -F+ipc | grep -C10 vmresume | head -21
+       CPU 1/KVM 17006 [001] 11500.262865593:  ffffffff82133cdd __vmx_vcpu_run+0x3d ([kernel.kallsyms])                 movq  0x48(%rax), %r9
+       CPU 1/KVM 17006 [001] 11500.262865593:  ffffffff82133ce1 __vmx_vcpu_run+0x41 ([kernel.kallsyms])                 movq  0x50(%rax), %r10
+       CPU 1/KVM 17006 [001] 11500.262865593:  ffffffff82133ce5 __vmx_vcpu_run+0x45 ([kernel.kallsyms])                 movq  0x58(%rax), %r11
+       CPU 1/KVM 17006 [001] 11500.262865593:  ffffffff82133ce9 __vmx_vcpu_run+0x49 ([kernel.kallsyms])                 movq  0x60(%rax), %r12
+       CPU 1/KVM 17006 [001] 11500.262865593:  ffffffff82133ced __vmx_vcpu_run+0x4d ([kernel.kallsyms])                 movq  0x68(%rax), %r13
+       CPU 1/KVM 17006 [001] 11500.262865593:  ffffffff82133cf1 __vmx_vcpu_run+0x51 ([kernel.kallsyms])                 movq  0x70(%rax), %r14
+       CPU 1/KVM 17006 [001] 11500.262865593:  ffffffff82133cf5 __vmx_vcpu_run+0x55 ([kernel.kallsyms])                 movq  0x78(%rax), %r15
+       CPU 1/KVM 17006 [001] 11500.262865593:  ffffffff82133cf9 __vmx_vcpu_run+0x59 ([kernel.kallsyms])                 movq  (%rax), %rax
+       CPU 1/KVM 17006 [001] 11500.262865593:  ffffffff82133cfc __vmx_vcpu_run+0x5c ([kernel.kallsyms])                 callq  0xffffffff82133c40
+       CPU 1/KVM 17006 [001] 11500.262865593:  ffffffff82133c40 vmx_vmenter+0x0 ([kernel.kallsyms])             jz 0xffffffff82133c46
+       CPU 1/KVM 17006 [001] 11500.262866075:  ffffffff82133c42 vmx_vmenter+0x2 ([kernel.kallsyms])             vmresume         IPC: 0.05 (40/769)
+          :17006 17006 [001] 11500.262869216:  ffffffff82200cb0 asm_sysvec_apic_timer_interrupt+0x0 ([guest.kernel.kallsyms])           clac
+          :17006 17006 [001] 11500.262869216:  ffffffff82200cb3 asm_sysvec_apic_timer_interrupt+0x3 ([guest.kernel.kallsyms])           pushq  $0xffffffffffffffff
+          :17006 17006 [001] 11500.262869216:  ffffffff82200cb5 asm_sysvec_apic_timer_interrupt+0x5 ([guest.kernel.kallsyms])           callq  0xffffffff82201160
+          :17006 17006 [001] 11500.262869216:  ffffffff82201160 error_entry+0x0 ([guest.kernel.kallsyms])               cld
+          :17006 17006 [001] 11500.262869216:  ffffffff82201161 error_entry+0x1 ([guest.kernel.kallsyms])               pushq  %rsi
+          :17006 17006 [001] 11500.262869216:  ffffffff82201162 error_entry+0x2 ([guest.kernel.kallsyms])               movq  0x8(%rsp), %rsi
+          :17006 17006 [001] 11500.262869216:  ffffffff82201167 error_entry+0x7 ([guest.kernel.kallsyms])               movq  %rdi, 0x8(%rsp)
+          :17006 17006 [001] 11500.262869216:  ffffffff8220116c error_entry+0xc ([guest.kernel.kallsyms])               pushq  %rdx
+          :17006 17006 [001] 11500.262869216:  ffffffff8220116d error_entry+0xd ([guest.kernel.kallsyms])               pushq  %rcx
+          :17006 17006 [001] 11500.262869216:  ffffffff8220116e error_entry+0xe ([guest.kernel.kallsyms])               pushq  %rax
+
+
+Tracing Virtual Machines (including user space)
+-----------------------------------------------
+
+It is possible to use perf record to record sideband events within a virtual machine, so that an Intel PT trace on the host can be decoded.
+Sideband events from the guest perf.data file can be injected into the host perf.data file using perf inject.
+
+Here is an example of the steps needed:
+
+On the guest machine:
+
+Check that no-kvmclock kernel command line option was used to boot:
+
+Note, this is essential to enable time correlation between host and guest machines.
+
+ $ cat /proc/cmdline
+ BOOT_IMAGE=/boot/vmlinuz-5.10.0-16-amd64 root=UUID=cb49c910-e573-47e0-bce7-79e293df8e1d ro no-kvmclock
+
+There is no BPF support at present so, if possible, disable JIT compiling:
+
+ $ echo 0 | sudo tee /proc/sys/net/core/bpf_jit_enable
+ 0
+
+Start perf record to collect sideband events:
+
+ $ sudo perf record -o guest-sideband-testing-guest-perf.data --sample-identifier --buildid-all --switch-events --kcore -a -e dummy
+
+On the host machine:
+
+Start perf record to collect Intel PT trace:
+
+Note, the host trace will get very big, very fast, so the steps from starting to stopping the host trace really need to be done so that they happen in the shortest time possible.
+
+ $ sudo perf record -o guest-sideband-testing-host-perf.data -m,64M --kcore -a -e intel_pt/cyc/
+
+On the guest machine:
+
+Run a small test case, just 'uname' in this example:
+
+ $ uname
+ Linux
+
+On the host machine:
+
+Stop the Intel PT trace:
+
+ ^C
+ [ perf record: Woken up 1 times to write data ]
+ [ perf record: Captured and wrote 76.122 MB guest-sideband-testing-host-perf.data ]
+
+On the guest machine:
+
+Stop the Intel PT trace:
+
+ ^C
+ [ perf record: Woken up 1 times to write data ]
+ [ perf record: Captured and wrote 1.247 MB guest-sideband-testing-guest-perf.data ]
+
+And then copy guest-sideband-testing-guest-perf.data to the host (not shown here).
+
+On the host machine:
+
+With the 2 perf.data recordings, and with their ownership changed to the user.
+
+Identify the TSC Offset:
+
+ $ perf inject -i guest-sideband-testing-host-perf.data --vm-time-correlation=dry-run
+ VMCS: 0x103fc6  TSC Offset 0xfffffa6ae070cb20
+ VMCS: 0x103ff2  TSC Offset 0xfffffa6ae070cb20
+ VMCS: 0x10fdaa  TSC Offset 0xfffffa6ae070cb20
+ VMCS: 0x24d57c  TSC Offset 0xfffffa6ae070cb20
+
+Correct Intel PT TSC timestamps for the guest machine:
+
+ $ perf inject -i guest-sideband-testing-host-perf.data --vm-time-correlation=0xfffffa6ae070cb20 --force
+
+Identify the guest machine PID:
+
+ $ perf script -i guest-sideband-testing-host-perf.data --no-itrace --show-task-events | grep KVM
+       CPU 0/KVM     0 [000]     0.000000: PERF_RECORD_COMM: CPU 0/KVM:13376/13381
+       CPU 1/KVM     0 [000]     0.000000: PERF_RECORD_COMM: CPU 1/KVM:13376/13382
+       CPU 2/KVM     0 [000]     0.000000: PERF_RECORD_COMM: CPU 2/KVM:13376/13383
+       CPU 3/KVM     0 [000]     0.000000: PERF_RECORD_COMM: CPU 3/KVM:13376/13384
+
+Note, the QEMU option -name debug-threads=on is needed so that thread names
+can be used to determine which thread is running which VCPU as above. libvirt seems to use this by default.
+
+Create a guestmount, assuming the guest machine is 'vm_to_test':
+
+ $ mkdir -p ~/guestmount/13376
+ $ sshfs -o direct_io vm_to_test:/ ~/guestmount/13376
+
+Inject the guest perf.data file into the host perf.data file:
+
+Note, due to the guestmount option, guest object files and debug files will be copied into the build ID cache from the guest machine, with the notable exception of VDSO.
+If needed, VDSO can be copied manually in a fashion similar to that used by the perf-archive script.
+
+ $ perf inject -i guest-sideband-testing-host-perf.data -o inj --guestmount ~/guestmount --guest-data=guest-sideband-testing-guest-perf.data,13376,0xfffffa6ae070cb20
+
+Show an excerpt from the result.  In this case the CPU and time range have been to chosen to show interaction between guest and host when 'uname' is starting to run on the guest machine:
+
+Notes:
+
+	- the CPU displayed, [002] in this case, is always the host CPU
+	- events happening in the virtual machine start with VM:13376 VCPU:003, which shows the hypervisor PID 13376 and the VCPU number
+	- only calls and errors are displayed i.e. --itrace=ce
+	- branches entering and exiting the virtual machine are split, and show as 2 branches to/from "0 [unknown] ([unknown])"
+
+ $ perf script -i inj --itrace=ce -F+machine_pid,+vcpu,+addr,+pid,+tid,-period --ns --time 7919.408803365,7919.408804631 -C 2
+       CPU 3/KVM 13376/13384 [002]  7919.408803365:      branches:  ffffffffc0f8ebe0 vmx_vcpu_enter_exit+0xc0 ([kernel.kallsyms]) => ffffffffc0f8edc0 __vmx_vcpu_run+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803365:      branches:  ffffffffc0f8edd5 __vmx_vcpu_run+0x15 ([kernel.kallsyms]) => ffffffffc0f8eca0 vmx_update_host_rsp+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803365:      branches:  ffffffffc0f8ee1b __vmx_vcpu_run+0x5b ([kernel.kallsyms]) => ffffffffc0f8ed60 vmx_vmenter+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803461:      branches:  ffffffffc0f8ed62 vmx_vmenter+0x2 ([kernel.kallsyms]) =>                0 [unknown] ([unknown])
+ VM:13376 VCPU:003            uname  3404/3404  [002]  7919.408803461:      branches:                 0 [unknown] ([unknown]) =>     7f851c9b5a5c init_cacheinfo+0x3ac (/usr/lib/x86_64-linux-gnu/libc-2.31.so)
+ VM:13376 VCPU:003            uname  3404/3404  [002]  7919.408803567:      branches:      7f851c9b5a5a init_cacheinfo+0x3aa (/usr/lib/x86_64-linux-gnu/libc-2.31.so) =>                0 [unknown] ([unknown])
+       CPU 3/KVM 13376/13384 [002]  7919.408803567:      branches:                 0 [unknown] ([unknown]) => ffffffffc0f8ed80 vmx_vmexit+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803596:      branches:  ffffffffc0f6619a vmx_vcpu_run+0x26a ([kernel.kallsyms]) => ffffffffb2255c60 x86_virt_spec_ctrl+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803801:      branches:  ffffffffc0f66445 vmx_vcpu_run+0x515 ([kernel.kallsyms]) => ffffffffb2290b30 native_write_msr+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803850:      branches:  ffffffffc0f661f8 vmx_vcpu_run+0x2c8 ([kernel.kallsyms]) => ffffffffc1092300 kvm_load_host_xsave_state+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803850:      branches:  ffffffffc1092327 kvm_load_host_xsave_state+0x27 ([kernel.kallsyms]) => ffffffffc1092220 kvm_load_host_xsave_state.part.0+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803862:      branches:  ffffffffc0f662cf vmx_vcpu_run+0x39f ([kernel.kallsyms]) => ffffffffc0f63f90 vmx_recover_nmi_blocking+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803862:      branches:  ffffffffc0f662e9 vmx_vcpu_run+0x3b9 ([kernel.kallsyms]) => ffffffffc0f619a0 __vmx_complete_interrupts+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803872:      branches:  ffffffffc109cfb2 vcpu_enter_guest+0x752 ([kernel.kallsyms]) => ffffffffc0f5f570 vmx_handle_exit_irqoff+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803881:      branches:  ffffffffc109d028 vcpu_enter_guest+0x7c8 ([kernel.kallsyms]) => ffffffffb234f900 __srcu_read_lock+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803897:      branches:  ffffffffc109d06f vcpu_enter_guest+0x80f ([kernel.kallsyms]) => ffffffffc0f72e30 vmx_handle_exit+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803897:      branches:  ffffffffc0f72e3d vmx_handle_exit+0xd ([kernel.kallsyms]) => ffffffffc0f727c0 __vmx_handle_exit+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803897:      branches:  ffffffffc0f72b15 __vmx_handle_exit+0x355 ([kernel.kallsyms]) => ffffffffc0f60ae0 vmx_flush_pml_buffer+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803903:      branches:  ffffffffc0f72994 __vmx_handle_exit+0x1d4 ([kernel.kallsyms]) => ffffffffc10b7090 kvm_emulate_cpuid+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803903:      branches:  ffffffffc10b70f1 kvm_emulate_cpuid+0x61 ([kernel.kallsyms]) => ffffffffc10b6e10 kvm_cpuid+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803941:      branches:  ffffffffc10b7125 kvm_emulate_cpuid+0x95 ([kernel.kallsyms]) => ffffffffc1093110 kvm_skip_emulated_instruction+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803941:      branches:  ffffffffc109311f kvm_skip_emulated_instruction+0xf ([kernel.kallsyms]) => ffffffffc0f5e180 vmx_get_rflags+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803951:      branches:  ffffffffc109312a kvm_skip_emulated_instruction+0x1a ([kernel.kallsyms]) => ffffffffc0f5fd30 vmx_skip_emulated_instruction+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803951:      branches:  ffffffffc0f5fd79 vmx_skip_emulated_instruction+0x49 ([kernel.kallsyms]) => ffffffffc0f5fb50 skip_emulated_instruction+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803956:      branches:  ffffffffc0f5fc68 skip_emulated_instruction+0x118 ([kernel.kallsyms]) => ffffffffc0f6a940 vmx_cache_reg+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803964:      branches:  ffffffffc0f5fc11 skip_emulated_instruction+0xc1 ([kernel.kallsyms]) => ffffffffc0f5f9e0 vmx_set_interrupt_shadow+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803980:      branches:  ffffffffc109f8b1 vcpu_run+0x71 ([kernel.kallsyms]) => ffffffffc10ad2f0 kvm_cpu_has_pending_timer+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803980:      branches:  ffffffffc10ad2fb kvm_cpu_has_pending_timer+0xb ([kernel.kallsyms]) => ffffffffc10b0490 apic_has_pending_timer+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803991:      branches:  ffffffffc109f899 vcpu_run+0x59 ([kernel.kallsyms]) => ffffffffc109c860 vcpu_enter_guest+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803993:      branches:  ffffffffc109cd4c vcpu_enter_guest+0x4ec ([kernel.kallsyms]) => ffffffffc0f69140 vmx_prepare_switch_to_guest+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803996:      branches:  ffffffffc109cd7d vcpu_enter_guest+0x51d ([kernel.kallsyms]) => ffffffffb234f930 __srcu_read_unlock+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803996:      branches:  ffffffffc109cd9c vcpu_enter_guest+0x53c ([kernel.kallsyms]) => ffffffffc0f609b0 vmx_sync_pir_to_irr+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408803996:      branches:  ffffffffc0f60a6d vmx_sync_pir_to_irr+0xbd ([kernel.kallsyms]) => ffffffffc10adc20 kvm_lapic_find_highest_irr+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408804010:      branches:  ffffffffc0f60abd vmx_sync_pir_to_irr+0x10d ([kernel.kallsyms]) => ffffffffc0f60820 vmx_set_rvi+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408804019:      branches:  ffffffffc109ceca vcpu_enter_guest+0x66a ([kernel.kallsyms]) => ffffffffb2249840 fpregs_assert_state_consistent+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408804021:      branches:  ffffffffc109cf10 vcpu_enter_guest+0x6b0 ([kernel.kallsyms]) => ffffffffc0f65f30 vmx_vcpu_run+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408804024:      branches:  ffffffffc0f6603b vmx_vcpu_run+0x10b ([kernel.kallsyms]) => ffffffffb229bed0 __get_current_cr3_fast+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408804024:      branches:  ffffffffc0f66055 vmx_vcpu_run+0x125 ([kernel.kallsyms]) => ffffffffb2253050 cr4_read_shadow+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408804030:      branches:  ffffffffc0f6608d vmx_vcpu_run+0x15d ([kernel.kallsyms]) => ffffffffc10921e0 kvm_load_guest_xsave_state+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408804030:      branches:  ffffffffc1092207 kvm_load_guest_xsave_state+0x27 ([kernel.kallsyms]) => ffffffffc1092110 kvm_load_guest_xsave_state.part.0+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408804032:      branches:  ffffffffc0f660c6 vmx_vcpu_run+0x196 ([kernel.kallsyms]) => ffffffffb22061a0 perf_guest_get_msrs+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408804032:      branches:  ffffffffb22061a9 perf_guest_get_msrs+0x9 ([kernel.kallsyms]) => ffffffffb220cda0 intel_guest_get_msrs+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408804039:      branches:  ffffffffc0f66109 vmx_vcpu_run+0x1d9 ([kernel.kallsyms]) => ffffffffc0f652c0 clear_atomic_switch_msr+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408804040:      branches:  ffffffffc0f66119 vmx_vcpu_run+0x1e9 ([kernel.kallsyms]) => ffffffffc0f73f60 intel_pmu_lbr_is_enabled+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408804042:      branches:  ffffffffc0f73f81 intel_pmu_lbr_is_enabled+0x21 ([kernel.kallsyms]) => ffffffffc10b68e0 kvm_find_cpuid_entry+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408804045:      branches:  ffffffffc0f66454 vmx_vcpu_run+0x524 ([kernel.kallsyms]) => ffffffffc0f61ff0 vmx_update_hv_timer+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408804057:      branches:  ffffffffc0f66142 vmx_vcpu_run+0x212 ([kernel.kallsyms]) => ffffffffc10af100 kvm_wait_lapic_expire+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408804057:      branches:  ffffffffc0f66156 vmx_vcpu_run+0x226 ([kernel.kallsyms]) => ffffffffb2255c60 x86_virt_spec_ctrl+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408804057:      branches:  ffffffffc0f66161 vmx_vcpu_run+0x231 ([kernel.kallsyms]) => ffffffffc0f8eb20 vmx_vcpu_enter_exit+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408804057:      branches:  ffffffffc0f8eb44 vmx_vcpu_enter_exit+0x24 ([kernel.kallsyms]) => ffffffffb2353e10 rcu_note_context_switch+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408804057:      branches:  ffffffffb2353e1c rcu_note_context_switch+0xc ([kernel.kallsyms]) => ffffffffb2353db0 rcu_qs+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408804066:      branches:  ffffffffc0f8ebe0 vmx_vcpu_enter_exit+0xc0 ([kernel.kallsyms]) => ffffffffc0f8edc0 __vmx_vcpu_run+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408804066:      branches:  ffffffffc0f8edd5 __vmx_vcpu_run+0x15 ([kernel.kallsyms]) => ffffffffc0f8eca0 vmx_update_host_rsp+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408804066:      branches:  ffffffffc0f8ee1b __vmx_vcpu_run+0x5b ([kernel.kallsyms]) => ffffffffc0f8ed60 vmx_vmenter+0x0 ([kernel.kallsyms])
+       CPU 3/KVM 13376/13384 [002]  7919.408804162:      branches:  ffffffffc0f8ed62 vmx_vmenter+0x2 ([kernel.kallsyms]) =>                0 [unknown] ([unknown])
+ VM:13376 VCPU:003            uname  3404/3404  [002]  7919.408804162:      branches:                 0 [unknown] ([unknown]) =>     7f851c9b5a5c init_cacheinfo+0x3ac (/usr/lib/x86_64-linux-gnu/libc-2.31.so)
+ VM:13376 VCPU:003            uname  3404/3404  [002]  7919.408804273:      branches:      7f851cb7c0e4 _dl_init+0x74 (/usr/lib/x86_64-linux-gnu/ld-2.31.so) =>     7f851cb7bf50 call_init.part.0+0x0 (/usr/lib/x86_64-linux-gnu/ld-2.31.so)
+ VM:13376 VCPU:003            uname  3404/3404  [002]  7919.408804526:      branches:      55e0c00136f0 _start+0x0 (/usr/bin/uname) => ffffffff83200ac0 asm_exc_page_fault+0x0 ([kernel.kallsyms])
+ VM:13376 VCPU:003            uname  3404/3404  [002]  7919.408804526:      branches:  ffffffff83200ac3 asm_exc_page_fault+0x3 ([kernel.kallsyms]) => ffffffff83201290 error_entry+0x0 ([kernel.kallsyms])
+ VM:13376 VCPU:003            uname  3404/3404  [002]  7919.408804534:      branches:  ffffffff832012fa error_entry+0x6a ([kernel.kallsyms]) => ffffffff830b59a0 sync_regs+0x0 ([kernel.kallsyms])
+ VM:13376 VCPU:003            uname  3404/3404  [002]  7919.408804631:      branches:  ffffffff83200ad9 asm_exc_page_fault+0x19 ([kernel.kallsyms]) => ffffffff830b8210 exc_page_fault+0x0 ([kernel.kallsyms])
+ VM:13376 VCPU:003            uname  3404/3404  [002]  7919.408804631:      branches:  ffffffff830b82a4 exc_page_fault+0x94 ([kernel.kallsyms]) => ffffffff830b80e0 __kvm_handle_async_pf+0x0 ([kernel.kallsyms])
+ VM:13376 VCPU:003            uname  3404/3404  [002]  7919.408804631:      branches:  ffffffff830b80ed __kvm_handle_async_pf+0xd ([kernel.kallsyms]) => ffffffff830b80c0 kvm_read_and_reset_apf_flags+0x0 ([kernel.kallsyms])
+
+
+Tracing Virtual Machines - Guest Code
+-------------------------------------
+
+A common case for KVM test programs is that the test program acts as the
+hypervisor, creating, running and destroying the virtual machine, and
+providing the guest object code from its own object code. In this case,
+the VM is not running an OS, but only the functions loaded into it by the
+hypervisor test program, and conveniently, loaded at the same virtual
+addresses. To support that, option "--guest-code" has been added to perf script
+and perf kvm report.
+
+Here is an example tracing a test program from the kernel's KVM selftests:
+
+ # perf record --kcore -e intel_pt/cyc/ -- tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test
+ [ perf record: Woken up 1 times to write data ]
+ [ perf record: Captured and wrote 0.280 MB perf.data ]
+ # perf script --guest-code --itrace=bep --ns -F-period,+addr,+flags
+ [SNIP]
+   tsc_msrs_test 18436 [007] 10897.962087733:      branches:   call                   ffffffffc13b2ff5 __vmx_vcpu_run+0x15 (vmlinux) => ffffffffc13b2f50 vmx_update_host_rsp+0x0 (vmlinux)
+   tsc_msrs_test 18436 [007] 10897.962087733:      branches:   return                 ffffffffc13b2f5d vmx_update_host_rsp+0xd (vmlinux) => ffffffffc13b2ffa __vmx_vcpu_run+0x1a (vmlinux)
+   tsc_msrs_test 18436 [007] 10897.962087733:      branches:   call                   ffffffffc13b303b __vmx_vcpu_run+0x5b (vmlinux) => ffffffffc13b2f80 vmx_vmenter+0x0 (vmlinux)
+   tsc_msrs_test 18436 [007] 10897.962087836:      branches:   vmentry                ffffffffc13b2f82 vmx_vmenter+0x2 (vmlinux) =>                0 [unknown] ([unknown])
+   [guest/18436] 18436 [007] 10897.962087836:      branches:   vmentry                               0 [unknown] ([unknown]) =>           402c81 guest_code+0x131 (/home/user/git/work/tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test)
+   [guest/18436] 18436 [007] 10897.962087836:      branches:   call                             402c81 guest_code+0x131 (/home/user/git/work/tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test) =>           40dba0 ucall+0x0 (/home/user/git/work/tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test)
+   [guest/18436] 18436 [007] 10897.962088248:      branches:   vmexit                           40dba0 ucall+0x0 (/home/user/git/work/tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test) =>                0 [unknown] ([unknown])
+   tsc_msrs_test 18436 [007] 10897.962088248:      branches:   vmexit                                0 [unknown] ([unknown]) => ffffffffc13b2fa0 vmx_vmexit+0x0 (vmlinux)
+   tsc_msrs_test 18436 [007] 10897.962088248:      branches:   jmp                    ffffffffc13b2fa0 vmx_vmexit+0x0 (vmlinux) => ffffffffc13b2fd2 vmx_vmexit+0x32 (vmlinux)
+   tsc_msrs_test 18436 [007] 10897.962088256:      branches:   return                 ffffffffc13b2fd2 vmx_vmexit+0x32 (vmlinux) => ffffffffc13b3040 __vmx_vcpu_run+0x60 (vmlinux)
+   tsc_msrs_test 18436 [007] 10897.962088270:      branches:   return                 ffffffffc13b30b6 __vmx_vcpu_run+0xd6 (vmlinux) => ffffffffc13b2f2e vmx_vcpu_enter_exit+0x4e (vmlinux)
+ [SNIP]
+   tsc_msrs_test 18436 [007] 10897.962089321:      branches:   call                   ffffffffc13b2ff5 __vmx_vcpu_run+0x15 (vmlinux) => ffffffffc13b2f50 vmx_update_host_rsp+0x0 (vmlinux)
+   tsc_msrs_test 18436 [007] 10897.962089321:      branches:   return                 ffffffffc13b2f5d vmx_update_host_rsp+0xd (vmlinux) => ffffffffc13b2ffa __vmx_vcpu_run+0x1a (vmlinux)
+   tsc_msrs_test 18436 [007] 10897.962089321:      branches:   call                   ffffffffc13b303b __vmx_vcpu_run+0x5b (vmlinux) => ffffffffc13b2f80 vmx_vmenter+0x0 (vmlinux)
+   tsc_msrs_test 18436 [007] 10897.962089424:      branches:   vmentry                ffffffffc13b2f82 vmx_vmenter+0x2 (vmlinux) =>                0 [unknown] ([unknown])
+   [guest/18436] 18436 [007] 10897.962089424:      branches:   vmentry                               0 [unknown] ([unknown]) =>           40dba0 ucall+0x0 (/home/user/git/work/tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test)
+   [guest/18436] 18436 [007] 10897.962089701:      branches:   jmp                              40dc1b ucall+0x7b (/home/user/git/work/tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test) =>           40dc39 ucall+0x99 (/home/user/git/work/tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test)
+   [guest/18436] 18436 [007] 10897.962089701:      branches:   jcc                              40dc3c ucall+0x9c (/home/user/git/work/tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test) =>           40dc20 ucall+0x80 (/home/user/git/work/tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test)
+   [guest/18436] 18436 [007] 10897.962089701:      branches:   jcc                              40dc3c ucall+0x9c (/home/user/git/work/tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test) =>           40dc20 ucall+0x80 (/home/user/git/work/tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test)
+   [guest/18436] 18436 [007] 10897.962089701:      branches:   jcc                              40dc37 ucall+0x97 (/home/user/git/work/tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test) =>           40dc50 ucall+0xb0 (/home/user/git/work/tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test)
+   [guest/18436] 18436 [007] 10897.962089878:      branches:   vmexit                           40dc55 ucall+0xb5 (/home/user/git/work/tools/testing/selftests/kselftest_install/kvm/tsc_msrs_test) =>                0 [unknown] ([unknown])
+   tsc_msrs_test 18436 [007] 10897.962089878:      branches:   vmexit                                0 [unknown] ([unknown]) => ffffffffc13b2fa0 vmx_vmexit+0x0 (vmlinux)
+   tsc_msrs_test 18436 [007] 10897.962089878:      branches:   jmp                    ffffffffc13b2fa0 vmx_vmexit+0x0 (vmlinux) => ffffffffc13b2fd2 vmx_vmexit+0x32 (vmlinux)
+   tsc_msrs_test 18436 [007] 10897.962089887:      branches:   return                 ffffffffc13b2fd2 vmx_vmexit+0x32 (vmlinux) => ffffffffc13b3040 __vmx_vcpu_run+0x60 (vmlinux)
+   tsc_msrs_test 18436 [007] 10897.962089901:      branches:   return                 ffffffffc13b30b6 __vmx_vcpu_run+0xd6 (vmlinux) => ffffffffc13b2f2e vmx_vcpu_enter_exit+0x4e (vmlinux)
+ [SNIP]
+
+ # perf kvm --guest-code --guest --host report -i perf.data --stdio | head -20
+
+ # To display the perf.data header info, please use --header/--header-only options.
+ #
+ #
+ # Total Lost Samples: 0
+ #
+ # Samples: 12  of event 'instructions'
+ # Event count (approx.): 2274583
+ #
+ # Children      Self  Command        Shared Object         Symbol
+ # ........  ........  .............  ....................  ...........................................
+ #
+    54.70%     0.00%  tsc_msrs_test  [kernel.vmlinux]      [k] entry_SYSCALL_64_after_hwframe
+            |
+            ---entry_SYSCALL_64_after_hwframe
+               do_syscall_64
+               |
+               |--29.44%--syscall_exit_to_user_mode
+               |          exit_to_user_mode_prepare
+               |          task_work_run
+               |          __fput
+
+
+Event Trace
+-----------
+
+Event Trace records information about asynchronous events, for example interrupts,
+faults, VM exits and entries.  The information is recorded in CFE and EVD packets,
+and also the Interrupt Flag is recorded on the MODE.Exec packet.  The CFE packet
+contains a type field to identify one of the following:
+
+	 1	INTR		interrupt, fault, exception, NMI
+	 2	IRET		interrupt return
+	 3	SMI		system management interrupt
+	 4	RSM		resume from system management mode
+	 5	SIPI		startup interprocessor interrupt
+	 6	INIT		INIT signal
+	 7	VMENTRY		VM-Entry
+	 8	VMEXIT		VM-Entry
+	 9	VMEXIT_INTR	VM-Exit due to interrupt
+	10	SHUTDOWN	Shutdown
+
+For more details, refer to the Intel 64 and IA-32 Architectures Software
+Developer Manuals (version 076 or later).
+
+The capability to do Event Trace is indicated by the
+/sys/bus/event_source/devices/intel_pt/caps/event_trace file.
+
+Event trace is selected for recording using the "event" config term. e.g.
+
+	perf record -e intel_pt/event/u uname
+
+Event trace events are output using the --itrace I option. e.g.
+
+	perf script --itrace=Ie
+
+perf script displays events containing CFE type, vector and event data,
+in the form:
+
+	  evt:   hw int            (t)  cfe: INTR IP: 1 vector: 3 PFA: 0x8877665544332211
+
+The IP flag indicates if the event binds to an IP, which includes any case where
+flow control packet generation is enabled, as well as when CFE packet IP bit is
+set.
+
+perf script displays events containing changes to the Interrupt Flag in the form:
+
+	iflag:   t                      IFLAG: 1->0 via branch
+
+where "via branch" indicates a branch (interrupt or return from interrupt) and
+"non branch" indicates an instruction such as CFI, STI or POPF).
+
+In addition, the current state of the interrupt flag is indicated by the presence
+or absence of the "D" (interrupt disabled) perf script flag.  If the interrupt
+flag is changed, then the "t" flag is also included i.e.
+
+		no flag, interrupts enabled IF=1
+	t	interrupts become disabled IF=1 -> IF=0
+	D	interrupts are disabled IF=0
+	Dt	interrupts become enabled  IF=0 -> IF=1
+
+The intel-pt-events.py script illustrates how to access Event Trace information
+using a Python script.
+
+
+TNT Disable
+-----------
+
+TNT packets are disabled using the "notnt" config term. e.g.
+
+	perf record -e intel_pt/notnt/u uname
+
+In that case the --itrace q option is forced because walking executable code
+to reconstruct the control flow is not possible.
+
+
+Emulated PTWRITE
+----------------
+
+Later perf tools support a method to emulate the ptwrite instruction, which
+can be useful if hardware does not support the ptwrite instruction.
+
+Instead of using the ptwrite instruction, a function is used which produces
+a trace that encodes the payload data into TNT packets.  Here is an example
+of the function:
+
+ #include <stdint.h>
+
+ void perf_emulate_ptwrite(uint64_t x)
+ __attribute__((externally_visible, noipa, no_instrument_function, naked));
+
+ #define PERF_EMULATE_PTWRITE_8_BITS \
+                 "1: shl %rax\n"     \
+                 "   jc 1f\n"        \
+                 "1: shl %rax\n"     \
+                 "   jc 1f\n"        \
+                 "1: shl %rax\n"     \
+                 "   jc 1f\n"        \
+                 "1: shl %rax\n"     \
+                 "   jc 1f\n"        \
+                 "1: shl %rax\n"     \
+                 "   jc 1f\n"        \
+                 "1: shl %rax\n"     \
+                 "   jc 1f\n"        \
+                 "1: shl %rax\n"     \
+                 "   jc 1f\n"        \
+                 "1: shl %rax\n"     \
+                 "   jc 1f\n"
+
+ /* Undefined instruction */
+ #define PERF_EMULATE_PTWRITE_UD2        ".byte 0x0f, 0x0b\n"
+
+ #define PERF_EMULATE_PTWRITE_MAGIC        PERF_EMULATE_PTWRITE_UD2 ".ascii \"perf,ptwrite  \"\n"
+
+ void perf_emulate_ptwrite(uint64_t x __attribute__ ((__unused__)))
+ {
+          /* Assumes SysV ABI : x passed in rdi */
+         __asm__ volatile (
+                 "jmp 1f\n"
+                 PERF_EMULATE_PTWRITE_MAGIC
+                 "1: mov %rdi, %rax\n"
+                 PERF_EMULATE_PTWRITE_8_BITS
+                 PERF_EMULATE_PTWRITE_8_BITS
+                 PERF_EMULATE_PTWRITE_8_BITS
+                 PERF_EMULATE_PTWRITE_8_BITS
+                 PERF_EMULATE_PTWRITE_8_BITS
+                 PERF_EMULATE_PTWRITE_8_BITS
+                 PERF_EMULATE_PTWRITE_8_BITS
+                 PERF_EMULATE_PTWRITE_8_BITS
+                 "1: ret\n"
+         );
+ }
+
+For example, a test program with the function above:
+
+ #include <stdio.h>
+ #include <stdint.h>
+ #include <stdlib.h>
+
+ #include "perf_emulate_ptwrite.h"
+
+ int main(int argc, char *argv[])
+ {
+         uint64_t x = 0;
+
+         if (argc > 1)
+                 x = strtoull(argv[1], NULL, 0);
+         perf_emulate_ptwrite(x);
+         return 0;
+ }
+
+Can be compiled and traced:
+
+ $ gcc -Wall -Wextra -O3 -g -o eg_ptw eg_ptw.c
+ $ perf record -e intel_pt//u ./eg_ptw 0x1234567890abcdef
+ [ perf record: Woken up 1 times to write data ]
+ [ perf record: Captured and wrote 0.017 MB perf.data ]
+ $ perf script --itrace=ew
+           eg_ptw 19875 [007]  8061.235912:     ptwrite:  IP: 0 payload: 0x1234567890abcdef      55701249a196 perf_emulate_ptwrite+0x16 (/home/user/eg_ptw)
+ $
+
+
+Pipe mode
+---------
+Pipe mode is a problem for Intel PT and possibly other auxtrace users.
+It's not recommended to use a pipe as data output with Intel PT because
+of the following reason.
+
+Essentially the auxtrace buffers do not behave like the regular perf
+event buffers.  That is because the head and tail are updated by
+software, but in the auxtrace case the data is written by hardware.
+So the head and tail do not get updated as data is written.
+
+In the Intel PT case, the head and tail are updated only when the trace
+is disabled by software, for example:
+    - full-trace, system wide : when buffer passes watermark
+    - full-trace, not system-wide : when buffer passes watermark or
+                                    context switches
+    - snapshot mode : as above but also when a snapshot is made
+    - sample mode : as above but also when a sample is made
+
+That means finished-round ordering doesn't work.  An auxtrace buffer
+can turn up that has data that extends back in time, possibly to the
+very beginning of tracing.
+
+For a perf.data file, that problem is solved by going through the trace
+and queuing up the auxtrace buffers in advance.
+
+For pipe mode, the order of events and timestamps can presumably
+be messed up.
+
+
+EXAMPLE
+-------
+
+Examples can be found on perf wiki page "Perf tools support for Intel® Processor Trace":
+
+https://perf.wiki.kernel.org/index.php/Perf_tools_support_for_Intel%C2%AE_Processor_Trace
+
+
+SEE ALSO
+--------
+
+linkperf:perf-record[1], linkperf:perf-script[1], linkperf:perf-report[1],
+linkperf:perf-inject[1]
diff --git a/tools/perf/Documentation/perf-iostat.txt b/tools/perf/Documentation/perf-iostat.txt
new file mode 100644
index 000000000..04d510364
--- /dev/null
+++ b/tools/perf/Documentation/perf-iostat.txt
@@ -0,0 +1,88 @@
+perf-iostat(1)
+===============
+
+NAME
+----
+perf-iostat - Show I/O performance metrics
+
+SYNOPSIS
+--------
+[verse]
+'perf iostat' list
+'perf iostat' <ports> \-- <command> [<options>]
+
+DESCRIPTION
+-----------
+Mode is intended to provide four I/O performance metrics per each PCIe root port:
+
+- Inbound Read   - I/O devices below root port read from the host memory, in MB
+
+- Inbound Write  - I/O devices below root port write to the host memory, in MB
+
+- Outbound Read  - CPU reads from I/O devices below root port, in MB
+
+- Outbound Write - CPU writes to I/O devices below root port, in MB
+
+OPTIONS
+-------
+<command>...::
+	Any command you can specify in a shell.
+
+list::
+	List all PCIe root ports.
+
+<ports>::
+	Select the root ports for monitoring. Comma-separated list is supported.
+
+EXAMPLES
+--------
+
+1. List all PCIe root ports (example for 2-S platform):
+
+   $ perf iostat list
+   S0-uncore_iio_0<0000:00>
+   S1-uncore_iio_0<0000:80>
+   S0-uncore_iio_1<0000:17>
+   S1-uncore_iio_1<0000:85>
+   S0-uncore_iio_2<0000:3a>
+   S1-uncore_iio_2<0000:ae>
+   S0-uncore_iio_3<0000:5d>
+   S1-uncore_iio_3<0000:d7>
+
+2. Collect metrics for all PCIe root ports:
+
+   $ perf iostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct
+   357708+0 records in
+   357707+0 records out
+   375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s
+
+    Performance counter stats for 'system wide':
+
+      port             Inbound Read(MB)    Inbound Write(MB)    Outbound Read(MB)   Outbound Write(MB)
+   0000:00                    1                    0                    2                    3
+   0000:80                    0                    0                    0                    0
+   0000:17               352552                   43                    0                   21
+   0000:85                    0                    0                    0                    0
+   0000:3a                    3                    0                    0                    0
+   0000:ae                    0                    0                    0                    0
+   0000:5d                    0                    0                    0                    0
+   0000:d7                    0                    0                    0                    0
+
+3. Collect metrics for comma-separated list of PCIe root ports:
+
+   $ perf iostat 0000:17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct
+   357708+0 records in
+   357707+0 records out
+   375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s
+
+    Performance counter stats for 'system wide':
+
+      port             Inbound Read(MB)    Inbound Write(MB)    Outbound Read(MB)   Outbound Write(MB)
+   0000:17               358559                   44                    0                   22
+   0000:3a                    3                    2                    0                    0
+
+        197.081983474 seconds time elapsed
+
+SEE ALSO
+--------
+linkperf:perf-stat[1]
diff --git a/tools/perf/Documentation/perf-kallsyms.txt b/tools/perf/Documentation/perf-kallsyms.txt
new file mode 100644
index 000000000..c97527df8
--- /dev/null
+++ b/tools/perf/Documentation/perf-kallsyms.txt
@@ -0,0 +1,24 @@
+perf-kallsyms(1)
+================
+
+NAME
+----
+perf-kallsyms - Searches running kernel for symbols
+
+SYNOPSIS
+--------
+[verse]
+'perf kallsyms' [<options>] symbol_name[,symbol_name...]
+
+DESCRIPTION
+-----------
+This command searches the running kernel kallsyms file for the given symbol(s)
+and prints information about it, including the DSO, the kallsyms begin/end
+addresses and the addresses in the ELF kallsyms symbol table (for symbols in
+modules).
+
+OPTIONS
+-------
+-v::
+--verbose::
+	Increase verbosity level, showing details about symbol table loading, etc.
diff --git a/tools/perf/Documentation/perf-kmem.txt b/tools/perf/Documentation/perf-kmem.txt
new file mode 100644
index 000000000..f378ac593
--- /dev/null
+++ b/tools/perf/Documentation/perf-kmem.txt
@@ -0,0 +1,80 @@
+perf-kmem(1)
+============
+
+NAME
+----
+perf-kmem - Tool to trace/measure kernel memory properties
+
+SYNOPSIS
+--------
+[verse]
+'perf kmem' [<options>] {record|stat}
+
+DESCRIPTION
+-----------
+There are two variants of perf kmem:
+
+  'perf kmem [<options>] record [<perf-record-options>] <command>' to
+  record the kmem events of an arbitrary workload. Additional 'perf
+  record' options may be specified after record, such as '-o' to
+  change the output file name.
+
+  'perf kmem [<options>] stat' to report kernel memory statistics.
+
+OPTIONS
+-------
+-i <file>::
+--input=<file>::
+	For stat, select the input file (default: perf.data unless stdin is a
+	fifo)
+
+-f::
+--force::
+	Don't do ownership validation
+
+-v::
+--verbose::
+        Be more verbose. (show symbol address, etc)
+
+--caller::
+	Show per-callsite statistics
+
+--alloc::
+	Show per-allocation statistics
+
+-s <key[,key2...]>::
+--sort=<key[,key2...]>::
+	Sort the output (default: 'frag,hit,bytes' for slab and 'bytes,hit'
+	for page).  Available sort keys are 'ptr, callsite, bytes, hit,
+	pingpong, frag' for slab and 'page, callsite, bytes, hit, order,
+	migtype, gfp' for page.  This option should be preceded by one of the
+	mode selection options - i.e. --slab, --page, --alloc and/or --caller.
+
+-l <num>::
+--line=<num>::
+	Print n lines only
+
+--raw-ip::
+	Print raw ip instead of symbol
+
+--slab::
+	Analyze SLAB allocator events.
+
+--page::
+	Analyze page allocator events
+
+--live::
+	Show live page stat.  The perf kmem shows total allocation stat by
+	default, but this option shows live (currently allocated) pages
+	instead.  (This option works with --page option only)
+
+--time=<start>,<stop>::
+	Only analyze samples within given time window: <start>,<stop>. Times
+	have the format seconds.microseconds. If start is not given (i.e., time
+	string is ',x.y') then analysis starts at the beginning of the file. If
+	stop time is not given (i.e, time string is 'x.y,') then analysis goes
+	to end of file.
+
+SEE ALSO
+--------
+linkperf:perf-record[1]
diff --git a/tools/perf/Documentation/perf-kvm.txt b/tools/perf/Documentation/perf-kvm.txt
new file mode 100644
index 000000000..2ad3f5d9f
--- /dev/null
+++ b/tools/perf/Documentation/perf-kvm.txt
@@ -0,0 +1,153 @@
+perf-kvm(1)
+===========
+
+NAME
+----
+perf-kvm - Tool to trace/measure kvm guest os
+
+SYNOPSIS
+--------
+[verse]
+'perf kvm' [--host] [--guest] [--guestmount=<path>
+	[--guestkallsyms=<path> --guestmodules=<path> | --guestvmlinux=<path>]]
+	{top|record|report|diff|buildid-list} [<options>]
+'perf kvm' [--host] [--guest] [--guestkallsyms=<path> --guestmodules=<path>
+	| --guestvmlinux=<path>] {top|record|report|diff|buildid-list|stat} [<options>]
+'perf kvm stat [record|report|live] [<options>]
+
+DESCRIPTION
+-----------
+There are a couple of variants of perf kvm:
+
+  'perf kvm [options] top <command>' to generates and displays
+  a performance counter profile of guest os in realtime
+  of an arbitrary workload.
+
+  'perf kvm record <command>' to record the performance counter profile
+  of an arbitrary workload and save it into a perf data file. We set the
+  default behavior of perf kvm as --guest, so if neither --host nor --guest
+  is input, the perf data file name is perf.data.guest. If --host is input,
+  the perf data file name is perf.data.kvm. If you want to record data into
+  perf.data.host, please input --host --no-guest. The behaviors are shown as
+  following:
+    Default('')         ->  perf.data.guest
+    --host              ->  perf.data.kvm
+    --guest             ->  perf.data.guest
+    --host --guest      ->  perf.data.kvm
+    --host --no-guest   ->  perf.data.host
+
+  'perf kvm report' to display the performance counter profile information
+  recorded via perf kvm record.
+
+  'perf kvm diff' to displays the performance difference amongst two perf.data
+  files captured via perf record.
+
+  'perf kvm buildid-list' to  display the buildids found in a perf data file,
+  so that other tools can be used to fetch packages with matching symbol tables
+  for use by perf report. As buildid is read from /sys/kernel/notes in os, then
+  if you want to list the buildid for guest, please make sure your perf data file
+  was captured with --guestmount in perf kvm record.
+
+  'perf kvm stat <command>' to run a command and gather performance counter
+  statistics.
+  Especially, perf 'kvm stat record/report' generates a statistical analysis
+  of KVM events. Currently, vmexit, mmio (x86 only) and ioport (x86 only)
+  events are supported. 'perf kvm stat record <command>' records kvm events
+  and the events between start and end <command>.
+  And this command produces a file which contains tracing results of kvm
+  events.
+
+  'perf kvm stat report' reports statistical data which includes events
+  handled time, samples, and so on.
+
+  'perf kvm stat live' reports statistical data in a live mode (similar to
+  record + report but with statistical data updated live at a given display
+  rate).
+
+OPTIONS
+-------
+-i::
+--input=<path>::
+        Input file name, for the 'report', 'diff' and 'buildid-list' subcommands.
+-o::
+--output=<path>::
+        Output file name, for the 'record' subcommand. Doesn't work with 'report',
+        just redirect the output to a file when using 'report'.
+--host::
+        Collect host side performance profile.
+--guest::
+        Collect guest side performance profile.
+
+:GMEXAMPLECMD: kvm --host --guest
+:GMEXAMPLESUBCMD: top
+include::guest-files.txt[]
+
+-v::
+--verbose::
+	Be more verbose (show counter open errors, etc).
+
+STAT REPORT OPTIONS
+-------------------
+--vcpu=<value>::
+       analyze events which occur on this vcpu. (default: all vcpus)
+
+--event=<value>::
+       event to be analyzed. Possible values: vmexit, mmio (x86 only),
+       ioport (x86 only). (default: vmexit)
+-k::
+--key=<value>::
+       Sorting key. Possible values: sample (default, sort by samples
+       number), time (sort by average time).
+-p::
+--pid=::
+    Analyze events only for given process ID(s) (comma separated list).
+
+STAT LIVE OPTIONS
+-----------------
+-d::
+--display::
+        Time in seconds between display updates
+
+-m::
+--mmap-pages=::
+    Number of mmap data pages (must be a power of two) or size
+    specification with appended unit character - B/K/M/G. The
+    size is rounded up to have nearest pages power of two value.
+
+-a::
+--all-cpus::
+        System-wide collection from all CPUs.
+
+-p::
+--pid=::
+    Analyze events only for given process ID(s) (comma separated list).
+
+--vcpu=<value>::
+       analyze events which occur on this vcpu. (default: all vcpus)
+
+
+--event=<value>::
+       event to be analyzed. Possible values: vmexit,
+       mmio (x86 only), ioport (x86 only).
+       (default: vmexit)
+
+-k::
+--key=<value>::
+       Sorting key. Possible values: sample (default, sort by samples
+       number), time (sort by average time).
+
+--duration=<value>::
+       Show events other than HLT (x86 only) or Wait state (s390 only)
+       that take longer than duration usecs.
+
+--proc-map-timeout::
+	When processing pre-existing threads /proc/XXX/mmap, it may take
+	a long time, because the file may be huge. A time out is needed
+	in such cases.
+	This option sets the time out limit. The default value is 500 ms.
+
+SEE ALSO
+--------
+linkperf:perf-top[1], linkperf:perf-record[1], linkperf:perf-report[1],
+linkperf:perf-diff[1], linkperf:perf-buildid-list[1],
+linkperf:perf-stat[1]
diff --git a/tools/perf/Documentation/perf-kwork.txt b/tools/perf/Documentation/perf-kwork.txt
new file mode 100644
index 000000000..482d6c52e
--- /dev/null
+++ b/tools/perf/Documentation/perf-kwork.txt
@@ -0,0 +1,180 @@
+perf-kowrk(1)
+=============
+
+NAME
+----
+perf-kwork - Tool to trace/measure kernel work properties (latencies)
+
+SYNOPSIS
+--------
+[verse]
+'perf kwork' {record|report|latency|timehist}
+
+DESCRIPTION
+-----------
+There are several variants of 'perf kwork':
+
+  'perf kwork record <command>' to record the kernel work
+  of an arbitrary workload.
+
+  'perf kwork report' to report the per kwork runtime.
+
+  'perf kwork latency' to report the per kwork latencies.
+
+  'perf kwork timehist' provides an analysis of kernel work events.
+
+    Example usage:
+        perf kwork record -- sleep 1
+        perf kwork report
+        perf kwork report -b
+        perf kwork latency
+        perf kwork latency -b
+        perf kwork timehist
+
+   By default it shows the individual work events such as irq, workqeueu,
+   including the run time and delay (time between raise and actually entry):
+
+      Runtime start      Runtime end        Cpu     Kwork name                 Runtime     Delaytime
+                                                    (TYPE)NAME:NUM             (msec)      (msec)
+   -----------------  -----------------  ------  -------------------------  ----------  ----------
+      1811186.976062     1811186.976327  [0000]  (s)RCU:9                        0.266       0.114
+      1811186.978452     1811186.978547  [0000]  (s)SCHED:7                      0.095       0.171
+      1811186.980327     1811186.980490  [0000]  (s)SCHED:7                      0.162       0.083
+      1811186.981221     1811186.981271  [0000]  (s)SCHED:7                      0.050       0.077
+      1811186.984267     1811186.984318  [0000]  (s)SCHED:7                      0.051       0.075
+      1811186.987252     1811186.987315  [0000]  (s)SCHED:7                      0.063       0.081
+      1811186.987785     1811186.987843  [0006]  (s)RCU:9                        0.058       0.645
+      1811186.988319     1811186.988383  [0000]  (s)SCHED:7                      0.064       0.143
+      1811186.989404     1811186.989607  [0002]  (s)TIMER:1                      0.203       0.111
+      1811186.989660     1811186.989732  [0002]  (s)SCHED:7                      0.072       0.310
+      1811186.991295     1811186.991407  [0002]  eth0:10                         0.112
+      1811186.991639     1811186.991734  [0002]  (s)NET_RX:3                     0.095       0.277
+      1811186.989860     1811186.991826  [0002]  (w)vmstat_shepherd              1.966       0.345
+    ...
+
+   Times are in msec.usec.
+
+OPTIONS
+-------
+-D::
+--dump-raw-trace=::
+	Display verbose dump of the sched data.
+
+-f::
+--force::
+	Don't complain, do it.
+
+-k::
+--kwork::
+	List of kwork to profile (irq, softirq, workqueue, etc)
+
+-v::
+--verbose::
+	Be more verbose. (show symbol address, etc)
+
+OPTIONS for 'perf kwork report'
+----------------------------
+
+-b::
+--use-bpf::
+	Use BPF to measure kwork runtime
+
+-C::
+--cpu::
+	Only show events for the given CPU(s) (comma separated list).
+
+-i::
+--input::
+	Input file name. (default: perf.data unless stdin is a fifo)
+
+-n::
+--name::
+	Only show events for the given name.
+
+-s::
+--sort::
+	Sort by key(s): runtime, max, count
+
+-S::
+--with-summary::
+	Show summary with statistics
+
+--time::
+	Only analyze samples within given time window: <start>,<stop>. Times
+	have the format seconds.microseconds. If start is not given (i.e., time
+	string is ',x.y') then analysis starts at the beginning of the file. If
+	stop time is not given (i.e, time string is 'x.y,') then analysis goes
+	to end of file.
+
+OPTIONS for 'perf kwork latency'
+----------------------------
+
+-b::
+--use-bpf::
+	Use BPF to measure kwork latency
+
+-C::
+--cpu::
+	Only show events for the given CPU(s) (comma separated list).
+
+-i::
+--input::
+	Input file name. (default: perf.data unless stdin is a fifo)
+
+-n::
+--name::
+	Only show events for the given name.
+
+-s::
+--sort::
+	Sort by key(s): avg, max, count
+
+--time::
+	Only analyze samples within given time window: <start>,<stop>. Times
+	have the format seconds.microseconds. If start is not given (i.e., time
+	string is ',x.y') then analysis starts at the beginning of the file. If
+	stop time is not given (i.e, time string is 'x.y,') then analysis goes
+	to end of file.
+
+OPTIONS for 'perf kwork timehist'
+---------------------------------
+
+-C::
+--cpu::
+	Only show events for the given CPU(s) (comma separated list).
+
+-g::
+--call-graph::
+	Display call chains if present (default off).
+
+-i::
+--input::
+	Input file name. (default: perf.data unless stdin is a fifo)
+
+-k::
+--vmlinux=<file>::
+	Vmlinux pathname
+
+-n::
+--name::
+	Only show events for the given name.
+
+--kallsyms=<file>::
+	Kallsyms pathname
+
+--max-stack::
+	Maximum number of functions to display in backtrace, default 5.
+
+--symfs=<directory>::
+    Look for files with symbols relative to this directory.
+
+--time::
+	Only analyze samples within given time window: <start>,<stop>. Times
+	have the format seconds.microseconds. If start is not given (i.e., time
+	string is ',x.y') then analysis starts at the beginning of the file. If
+	stop time is not given (i.e, time string is 'x.y,') then analysis goes
+	to end of file.
+
+SEE ALSO
+--------
+linkperf:perf-record[1]
diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt
new file mode 100644
index 000000000..57384a97c
--- /dev/null
+++ b/tools/perf/Documentation/perf-list.txt
@@ -0,0 +1,357 @@
+perf-list(1)
+============
+
+NAME
+----
+perf-list - List all symbolic event types
+
+SYNOPSIS
+--------
+[verse]
+'perf list' [--no-desc] [--long-desc]
+            [hw|sw|cache|tracepoint|pmu|sdt|metric|metricgroup|event_glob]
+
+DESCRIPTION
+-----------
+This command displays the symbolic event types which can be selected in the
+various perf commands with the -e option.
+
+OPTIONS
+-------
+-d::
+--desc::
+Print extra event descriptions. (default)
+
+--no-desc::
+Don't print descriptions.
+
+-v::
+--long-desc::
+Print longer event descriptions.
+
+--debug::
+Enable debugging output.
+
+--details::
+Print how named events are resolved internally into perf events, and also
+any extra expressions computed by perf stat.
+
+--deprecated::
+Print deprecated events. By default the deprecated events are hidden.
+
+--cputype::
+Print events applying cpu with this type for hybrid platform
+(e.g. --cputype core or --cputype atom)
+
+[[EVENT_MODIFIERS]]
+EVENT MODIFIERS
+---------------
+
+Events can optionally have a modifier by appending a colon and one or
+more modifiers. Modifiers allow the user to restrict the events to be
+counted. The following modifiers exist:
+
+ u - user-space counting
+ k - kernel counting
+ h - hypervisor counting
+ I - non idle counting
+ G - guest counting (in KVM guests)
+ H - host counting (not in KVM guests)
+ p - precise level
+ P - use maximum detected precise level
+ S - read sample value (PERF_SAMPLE_READ)
+ D - pin the event to the PMU
+ W - group is weak and will fallback to non-group if not schedulable,
+ e - group or event are exclusive and do not share the PMU
+
+The 'p' modifier can be used for specifying how precise the instruction
+address should be. The 'p' modifier can be specified multiple times:
+
+ 0 - SAMPLE_IP can have arbitrary skid
+ 1 - SAMPLE_IP must have constant skid
+ 2 - SAMPLE_IP requested to have 0 skid
+ 3 - SAMPLE_IP must have 0 skid, or uses randomization to avoid
+     sample shadowing effects.
+
+For Intel systems precise event sampling is implemented with PEBS
+which supports up to precise-level 2, and precise level 3 for
+some special cases
+
+On AMD systems it is implemented using IBS (up to precise-level 2).
+The precise modifier works with event types 0x76 (cpu-cycles, CPU
+clocks not halted) and 0xC1 (micro-ops retired). Both events map to
+IBS execution sampling (IBS op) with the IBS Op Counter Control bit
+(IbsOpCntCtl) set respectively (see the
+Core Complex (CCX) -> Processor x86 Core -> Instruction Based Sampling (IBS)
+section of the [AMD Processor Programming Reference (PPR)] relevant to the
+family, model and stepping of the processor being used).
+
+Manual Volume 2: System Programming, 13.3 Instruction-Based
+Sampling). Examples to use IBS:
+
+ perf record -a -e cpu-cycles:p ...    # use ibs op counting cycles
+ perf record -a -e r076:p ...          # same as -e cpu-cycles:p
+ perf record -a -e r0C1:p ...          # use ibs op counting micro-ops
+
+RAW HARDWARE EVENT DESCRIPTOR
+-----------------------------
+Even when an event is not available in a symbolic form within perf right now,
+it can be encoded in a per processor specific way.
+
+For instance on x86 CPUs, N is a hexadecimal value that represents the raw register encoding with the
+layout of IA32_PERFEVTSELx MSRs (see [Intel® 64 and IA-32 Architectures Software Developer's Manual Volume 3B: System Programming Guide] Figure 30-1 Layout
+of IA32_PERFEVTSELx MSRs) or AMD's PERF_CTL MSRs (see the
+Core Complex (CCX) -> Processor x86 Core -> MSR Registers section of the
+[AMD Processor Programming Reference (PPR)] relevant to the family, model
+and stepping of the processor being used).
+
+Note: Only the following bit fields can be set in x86 counter
+registers: event, umask, edge, inv, cmask. Esp. guest/host only and
+OS/user mode flags must be setup using <<EVENT_MODIFIERS, EVENT
+MODIFIERS>>.
+
+Example:
+
+If the Intel docs for a QM720 Core i7 describe an event as:
+
+  Event  Umask  Event Mask
+  Num.   Value  Mnemonic    Description                        Comment
+
+  A8H      01H  LSD.UOPS    Counts the number of micro-ops     Use cmask=1 and
+                            delivered by loop stream detector  invert to count
+                                                               cycles
+
+raw encoding of 0x1A8 can be used:
+
+ perf stat -e r1a8 -a sleep 1
+ perf record -e r1a8 ...
+
+It's also possible to use pmu syntax:
+
+ perf record -e r1a8 -a sleep 1
+ perf record -e cpu/r1a8/ ...
+ perf record -e cpu/r0x1a8/ ...
+
+Some processors, like those from AMD, support event codes and unit masks
+larger than a byte. In such cases, the bits corresponding to the event
+configuration parameters can be seen with:
+
+  cat /sys/bus/event_source/devices/<pmu>/format/<config>
+
+Example:
+
+If the AMD docs for an EPYC 7713 processor describe an event as:
+
+  Event  Umask  Event Mask
+  Num.   Value  Mnemonic                        Description
+
+  28FH     03H  op_cache_hit_miss.op_cache_hit  Counts Op Cache micro-tag
+                                                hit events.
+
+raw encoding of 0x0328F cannot be used since the upper nibble of the
+EventSelect bits have to be specified via bits 32-35 as can be seen with:
+
+  cat /sys/bus/event_source/devices/cpu/format/event
+
+raw encoding of 0x20000038F should be used instead:
+
+ perf stat -e r20000038f -a sleep 1
+ perf record -e r20000038f ...
+
+It's also possible to use pmu syntax:
+
+ perf record -e r20000038f -a sleep 1
+ perf record -e cpu/r20000038f/ ...
+ perf record -e cpu/r0x20000038f/ ...
+
+You should refer to the processor specific documentation for getting these
+details. Some of them are referenced in the SEE ALSO section below.
+
+ARBITRARY PMUS
+--------------
+
+perf also supports an extended syntax for specifying raw parameters
+to PMUs. Using this typically requires looking up the specific event
+in the CPU vendor specific documentation.
+
+The available PMUs and their raw parameters can be listed with
+
+  ls /sys/devices/*/format
+
+For example the raw event "LSD.UOPS" core pmu event above could
+be specified as
+
+  perf stat -e cpu/event=0xa8,umask=0x1,name=LSD.UOPS_CYCLES,cmask=0x1/ ...
+
+  or using extended name syntax
+
+  perf stat -e cpu/event=0xa8,umask=0x1,cmask=0x1,name=\'LSD.UOPS_CYCLES:cmask=0x1\'/ ...
+
+PER SOCKET PMUS
+---------------
+
+Some PMUs are not associated with a core, but with a whole CPU socket.
+Events on these PMUs generally cannot be sampled, but only counted globally
+with perf stat -a. They can be bound to one logical CPU, but will measure
+all the CPUs in the same socket.
+
+This example measures memory bandwidth every second
+on the first memory controller on socket 0 of a Intel Xeon system
+
+  perf stat -C 0 -a uncore_imc_0/cas_count_read/,uncore_imc_0/cas_count_write/ -I 1000 ...
+
+Each memory controller has its own PMU.  Measuring the complete system
+bandwidth would require specifying all imc PMUs (see perf list output),
+and adding the values together. To simplify creation of multiple events,
+prefix and glob matching is supported in the PMU name, and the prefix
+'uncore_' is also ignored when performing the match. So the command above
+can be expanded to all memory controllers by using the syntaxes:
+
+  perf stat -C 0 -a imc/cas_count_read/,imc/cas_count_write/ -I 1000 ...
+  perf stat -C 0 -a *imc*/cas_count_read/,*imc*/cas_count_write/ -I 1000 ...
+
+This example measures the combined core power every second
+
+  perf stat -I 1000 -e power/energy-cores/  -a
+
+ACCESS RESTRICTIONS
+-------------------
+
+For non root users generally only context switched PMU events are available.
+This is normally only the events in the cpu PMU, the predefined events
+like cycles and instructions and some software events.
+
+Other PMUs and global measurements are normally root only.
+Some event qualifiers, such as "any", are also root only.
+
+This can be overridden by setting the kernel.perf_event_paranoid
+sysctl to -1, which allows non root to use these events.
+
+For accessing trace point events perf needs to have read access to
+/sys/kernel/debug/tracing, even when perf_event_paranoid is in a relaxed
+setting.
+
+TRACING
+-------
+
+Some PMUs control advanced hardware tracing capabilities, such as Intel PT,
+that allows low overhead execution tracing.  These are described in a separate
+intel-pt.txt document.
+
+PARAMETERIZED EVENTS
+--------------------
+
+Some pmu events listed by 'perf-list' will be displayed with '?' in them. For
+example:
+
+  hv_gpci/dtbp_ptitc,phys_processor_idx=?/
+
+This means that when provided as an event, a value for '?' must
+also be supplied. For example:
+
+  perf stat -C 0 -e 'hv_gpci/dtbp_ptitc,phys_processor_idx=0x2/' ...
+
+EVENT QUALIFIERS:
+
+It is also possible to add extra qualifiers to an event:
+
+percore:
+
+Sums up the event counts for all hardware threads in a core, e.g.:
+
+
+  perf stat -e cpu/event=0,umask=0x3,percore=1/
+
+
+EVENT GROUPS
+------------
+
+Perf supports time based multiplexing of events, when the number of events
+active exceeds the number of hardware performance counters. Multiplexing
+can cause measurement errors when the workload changes its execution
+profile.
+
+When metrics are computed using formulas from event counts, it is useful to
+ensure some events are always measured together as a group to minimize multiplexing
+errors. Event groups can be specified using { }.
+
+  perf stat -e '{instructions,cycles}' ...
+
+The number of available performance counters depend on the CPU. A group
+cannot contain more events than available counters.
+For example Intel Core CPUs typically have four generic performance counters
+for the core, plus three fixed counters for instructions, cycles and
+ref-cycles. Some special events have restrictions on which counter they
+can schedule, and may not support multiple instances in a single group.
+When too many events are specified in the group some of them will not
+be measured.
+
+Globally pinned events can limit the number of counters available for
+other groups. On x86 systems, the NMI watchdog pins a counter by default.
+The nmi watchdog can be disabled as root with
+
+	echo 0 > /proc/sys/kernel/nmi_watchdog
+
+Events from multiple different PMUs cannot be mixed in a group, with
+some exceptions for software events.
+
+LEADER SAMPLING
+---------------
+
+perf also supports group leader sampling using the :S specifier.
+
+  perf record -e '{cycles,instructions}:S' ...
+  perf report --group
+
+Normally all events in an event group sample, but with :S only
+the first event (the leader) samples, and it only reads the values of the
+other events in the group.
+
+However, in the case AUX area events (e.g. Intel PT or CoreSight), the AUX
+area event must be the leader, so then the second event samples, not the first.
+
+OPTIONS
+-------
+
+Without options all known events will be listed.
+
+To limit the list use:
+
+. 'hw' or 'hardware' to list hardware events such as cache-misses, etc.
+
+. 'sw' or 'software' to list software events such as context switches, etc.
+
+. 'cache' or 'hwcache' to list hardware cache events such as L1-dcache-loads, etc.
+
+. 'tracepoint' to list all tracepoint events, alternatively use
+  'subsys_glob:event_glob' to filter by tracepoint subsystems such as sched,
+  block, etc.
+
+. 'pmu' to print the kernel supplied PMU events.
+
+. 'sdt' to list all Statically Defined Tracepoint events.
+
+. 'metric' to list metrics
+
+. 'metricgroup' to list metricgroups with metrics.
+
+. If none of the above is matched, it will apply the supplied glob to all
+  events, printing the ones that match.
+
+. As a last resort, it will do a substring search in all event names.
+
+One or more types can be used at the same time, listing the events for the
+types specified.
+
+Support raw format:
+
+. '--raw-dump', shows the raw-dump of all the events.
+. '--raw-dump [hw|sw|cache|tracepoint|pmu|event_glob]', shows the raw-dump of
+  a certain kind of events.
+
+SEE ALSO
+--------
+linkperf:perf-stat[1], linkperf:perf-top[1],
+linkperf:perf-record[1],
+http://www.intel.com/sdm/[Intel® 64 and IA-32 Architectures Software Developer's Manual Volume 3B: System Programming Guide],
+https://bugzilla.kernel.org/show_bug.cgi?id=206537[AMD Processor Programming Reference (PPR)]
diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt
new file mode 100644
index 000000000..4958a1ffa
--- /dev/null
+++ b/tools/perf/Documentation/perf-lock.txt
@@ -0,0 +1,174 @@
+perf-lock(1)
+============
+
+NAME
+----
+perf-lock - Analyze lock events
+
+SYNOPSIS
+--------
+[verse]
+'perf lock' {record|report|script|info|contention}
+
+DESCRIPTION
+-----------
+You can analyze various lock behaviours
+and statistics with this 'perf lock' command.
+
+  'perf lock record <command>' records lock events
+  between start and end <command>. And this command
+  produces the file "perf.data" which contains tracing
+  results of lock events.
+
+  'perf lock report' reports statistical data.
+
+  'perf lock script' shows raw lock events.
+
+  'perf lock info' shows metadata like threads or addresses
+  of lock instances.
+
+  'perf lock contention' shows contention statistics.
+
+COMMON OPTIONS
+--------------
+
+-i::
+--input=<file>::
+        Input file name. (default: perf.data unless stdin is a fifo)
+
+-v::
+--verbose::
+        Be more verbose (show symbol address, etc).
+
+-q::
+--quiet::
+	Do not show any warnings or messages. (Suppress -v)
+
+-D::
+--dump-raw-trace::
+        Dump raw trace in ASCII.
+
+-f::
+--force::
+	Don't complain, do it.
+
+--vmlinux=<file>::
+        vmlinux pathname
+
+--kallsyms=<file>::
+        kallsyms pathname
+
+
+REPORT OPTIONS
+--------------
+
+-k::
+--key=<value>::
+        Sorting key. Possible values: acquired (default), contended,
+	avg_wait, wait_total, wait_max, wait_min.
+
+-F::
+--field=<value>::
+        Output fields. By default it shows all the fields but users can
+	customize that using this.  Possible values: acquired, contended,
+	avg_wait, wait_total, wait_max, wait_min.
+
+-c::
+--combine-locks::
+	Merge lock instances in the same class (based on name).
+
+-t::
+--threads::
+    The -t option is to show per-thread lock stat like below:
+
+      $ perf lock report -t -F acquired,contended,avg_wait
+
+                    Name   acquired  contended   avg wait (ns)
+
+                    perf     240569          9            5784
+                 swapper     106610         19             543
+                  :15789      17370          2           14538
+            ContainerMgr       8981          6             874
+                   sleep       5275          1           11281
+         ContainerThread       4416          4             944
+         RootPressureThr       3215          5            1215
+             rcu_preempt       2954          0               0
+            ContainerMgr       2560          0               0
+                 unnamed       1873          0               0
+         EventManager_De       1845          1             636
+         futex-default-S       1609          0               0
+
+-E::
+--entries=<value>::
+	Display this many entries.
+
+
+INFO OPTIONS
+------------
+
+-t::
+--threads::
+	dump thread list in perf.data
+
+-m::
+--map::
+	dump map of lock instances (address:name table)
+
+
+CONTENTION OPTIONS
+--------------
+
+-k::
+--key=<value>::
+	Sorting key. Possible values: contended, wait_total (default),
+	wait_max, wait_min, avg_wait.
+
+-F::
+--field=<value>::
+	Output fields. By default it shows all but the wait_min fields
+	and users can customize that using this.  Possible values:
+	contended, wait_total, wait_max, wait_min, avg_wait.
+
+-t::
+--threads::
+	Show per-thread lock contention stat
+
+-b::
+--use-bpf::
+	Use BPF program to collect lock contention stats instead of
+	using the input data.
+
+-a::
+--all-cpus::
+        System-wide collection from all CPUs.
+
+-C::
+--cpu::
+	Collect samples only on the list of CPUs provided. Multiple CPUs can be
+	provided as a comma-separated list with no space: 0,1. Ranges of CPUs
+	are specified with -: 0-2.  Default is to monitor all CPUs.
+
+-p::
+--pid=::
+	Record events on existing process ID (comma separated list).
+
+--tid=::
+        Record events on existing thread ID (comma separated list).
+
+--map-nr-entries::
+	Maximum number of BPF map entries (default: 10240).
+
+--max-stack::
+	Maximum stack depth when collecting lock contention (default: 8).
+
+--stack-skip
+	Number of stack depth to skip when finding a lock caller (default: 3).
+
+-E::
+--entries=<value>::
+	Display this many entries.
+
+
+SEE ALSO
+--------
+linkperf:perf[1]
diff --git a/tools/perf/Documentation/perf-mem.txt b/tools/perf/Documentation/perf-mem.txt
new file mode 100644
index 000000000..005c95580
--- /dev/null
+++ b/tools/perf/Documentation/perf-mem.txt
@@ -0,0 +1,96 @@
+perf-mem(1)
+===========
+
+NAME
+----
+perf-mem - Profile memory accesses
+
+SYNOPSIS
+--------
+[verse]
+'perf mem' [<options>] (record [<command>] | report)
+
+DESCRIPTION
+-----------
+"perf mem record" runs a command and gathers memory operation data
+from it, into perf.data. Perf record options are accepted and are passed through.
+
+"perf mem report" displays the result. It invokes perf report with the
+right set of options to display a memory access profile. By default, loads
+and stores are sampled. Use the -t option to limit to loads or stores.
+
+Note that on Intel systems the memory latency reported is the use-latency,
+not the pure load (or store latency). Use latency includes any pipeline
+queueing delays in addition to the memory subsystem latency.
+
+OPTIONS
+-------
+<command>...::
+	Any command you can specify in a shell.
+
+-i::
+--input=<file>::
+	Input file name.
+
+-f::
+--force::
+	Don't do ownership validation
+
+-t::
+--type=<type>::
+	Select the memory operation type: load or store (default: load,store)
+
+-D::
+--dump-raw-samples::
+	Dump the raw decoded samples on the screen in a format that is easy to parse with
+	one sample per line.
+
+-x::
+--field-separator=<separator>::
+	Specify the field separator used when dump raw samples (-D option). By default,
+	The separator is the space character.
+
+-C::
+--cpu=<cpu>::
+	Monitor only on the list of CPUs provided. Multiple CPUs can be provided as a
+        comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2. Default
+        is to monitor all CPUS.
+-U::
+--hide-unresolved::
+	Only display entries resolved to a symbol.
+
+-p::
+--phys-data::
+	Record/Report sample physical addresses
+
+--data-page-size::
+	Record/Report sample data address page size
+
+RECORD OPTIONS
+--------------
+-e::
+--event <event>::
+	Event selector. Use 'perf mem record -e list' to list available events.
+
+-K::
+--all-kernel::
+	Configure all used events to run in kernel space.
+
+-U::
+--all-user::
+	Configure all used events to run in user space.
+
+-v::
+--verbose::
+	Be more verbose (show counter open errors, etc)
+
+--ldlat <n>::
+	Specify desired latency for loads event. Supported on Intel and Arm64
+	processors only. Ignored on other archs.
+
+In addition, for report all perf report options are valid, and for record
+all perf record options.
+
+SEE ALSO
+--------
+linkperf:perf-record[1], linkperf:perf-report[1]
diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt
new file mode 100644
index 000000000..7f8e8ba3a
--- /dev/null
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -0,0 +1,313 @@
+perf-probe(1)
+=============
+
+NAME
+----
+perf-probe - Define new dynamic tracepoints
+
+SYNOPSIS
+--------
+[verse]
+'perf probe' [options] --add='PROBE' [...]
+or
+'perf probe' [options] PROBE
+or
+'perf probe' [options] --del='[GROUP:]EVENT' [...]
+or
+'perf probe' --list[=[GROUP:]EVENT]
+or
+'perf probe' [options] --line='LINE'
+or
+'perf probe' [options] --vars='PROBEPOINT'
+or
+'perf probe' [options] --funcs
+or
+'perf probe' [options] --definition='PROBE' [...]
+
+DESCRIPTION
+-----------
+This command defines dynamic tracepoint events, by symbol and registers
+without debuginfo, or by C expressions (C line numbers, C function names,
+and C local variables) with debuginfo.
+
+
+OPTIONS
+-------
+-k::
+--vmlinux=PATH::
+	Specify vmlinux path which has debuginfo (Dwarf binary).
+	Only when using this with --definition, you can give an offline
+	vmlinux file.
+
+-m::
+--module=MODNAME|PATH::
+	Specify module name in which perf-probe searches probe points
+	or lines. If a path of module file is passed, perf-probe
+	treat it as an offline module (this means you can add a probe on
+        a module which has not been loaded yet).
+
+-s::
+--source=PATH::
+	Specify path to kernel source.
+
+-v::
+--verbose::
+        Be more verbose (show parsed arguments, etc).
+	Can not use with -q.
+
+-q::
+--quiet::
+	Do not show any warnings or messages.
+	Can not use with -v.
+
+-a::
+--add=::
+	Define a probe event (see PROBE SYNTAX for detail).
+
+-d::
+--del=::
+	Delete probe events. This accepts glob wildcards('*', '?') and character
+	classes(e.g. [a-z], [!A-Z]).
+
+-l::
+--list[=[GROUP:]EVENT]::
+	List up current probe events. This can also accept filtering patterns of
+	event names.
+	When this is used with --cache, perf shows all cached probes instead of
+	the live probes.
+
+-L::
+--line=::
+	Show source code lines which can be probed. This needs an argument
+	which specifies a range of the source code. (see LINE SYNTAX for detail)
+
+-V::
+--vars=::
+	Show available local variables at given probe point. The argument
+	syntax is same as PROBE SYNTAX, but NO ARGs.
+
+--externs::
+	(Only for --vars) Show external defined variables in addition to local
+	variables.
+
+--no-inlines::
+	(Only for --add) Search only for non-inlined functions. The functions
+	which do not have instances are ignored.
+
+-F::
+--funcs[=FILTER]::
+	Show available functions in given module or kernel. With -x/--exec,
+	can also list functions in a user space executable / shared library.
+	This also can accept a FILTER rule argument.
+
+-D::
+--definition=::
+	Show trace-event definition converted from given probe-event instead
+	of write it into tracing/[k,u]probe_events.
+
+--filter=FILTER::
+	(Only for --vars and --funcs) Set filter. FILTER is a combination of glob
+	pattern, see FILTER PATTERN for detail.
+	Default FILTER is "!__k???tab_* & !__crc_*" for --vars, and "!_*"
+	for --funcs.
+	If several filters are specified, only the last filter is used.
+
+-f::
+--force::
+	Forcibly add events with existing name.
+
+-n::
+--dry-run::
+	Dry run. With this option, --add and --del doesn't execute actual
+	adding and removal operations.
+
+--cache::
+	(With --add) Cache the probes. Any events which successfully added
+	are also stored in the cache file.
+	(With --list) Show cached probes.
+	(With --del) Remove cached probes.
+
+--max-probes=NUM::
+	Set the maximum number of probe points for an event. Default is 128.
+
+--target-ns=PID:
+	Obtain mount namespace information from the target pid.  This is
+	used when creating a uprobe for a process that resides in a
+	different mount namespace from the perf(1) utility.
+
+-x::
+--exec=PATH::
+	Specify path to the executable or shared library file for user
+	space tracing. Can also be used with --funcs option.
+
+--demangle::
+	Demangle application symbols. --no-demangle is also available
+	for disabling demangling.
+
+--demangle-kernel::
+	Demangle kernel symbols. --no-demangle-kernel is also available
+	for disabling kernel demangling.
+
+In absence of -m/-x options, perf probe checks if the first argument after
+the options is an absolute path name. If its an absolute path, perf probe
+uses it as a target module/target user space binary to probe.
+
+PROBE SYNTAX
+------------
+Probe points are defined by following syntax.
+
+    1) Define event based on function name
+     [[GROUP:]EVENT=]FUNC[@SRC][:RLN|+OFFS|%return|;PTN] [ARG ...]
+
+    2) Define event based on source file with line number
+     [[GROUP:]EVENT=]SRC:ALN [ARG ...]
+
+    3) Define event based on source file with lazy pattern
+     [[GROUP:]EVENT=]SRC;PTN [ARG ...]
+
+    4) Pre-defined SDT events or cached event with name
+     %[sdt_PROVIDER:]SDTEVENT
+     or,
+     sdt_PROVIDER:SDTEVENT
+
+'EVENT' specifies the name of new event, if omitted, it will be set the name of the probed function, and for return probes, a "\_\_return" suffix is automatically added to the function name. You can also specify a group name by 'GROUP', if omitted, set 'probe' is used for kprobe and 'probe_<bin>' is used for uprobe.
+Note that using existing group name can conflict with other events. Especially, using the group name reserved for kernel modules can hide embedded events in the
+modules.
+'FUNC' specifies a probed function name, and it may have one of the following options; '+OFFS' is the offset from function entry address in bytes, ':RLN' is the relative-line number from function entry line, and '%return' means that it probes function return. And ';PTN' means lazy matching pattern (see LAZY MATCHING). Note that ';PTN' must be the end of the probe point definition.  In addition, '@SRC' specifies a source file which has that function.
+It is also possible to specify a probe point by the source line number or lazy matching by using 'SRC:ALN' or 'SRC;PTN' syntax, where 'SRC' is the source file path, ':ALN' is the line number and ';PTN' is the lazy matching pattern.
+'ARG' specifies the arguments of this probe point, (see PROBE ARGUMENT).
+'SDTEVENT' and 'PROVIDER' is the pre-defined event name which is defined by user SDT (Statically Defined Tracing) or the pre-cached probes with event name.
+Note that before using the SDT event, the target binary (on which SDT events are defined) must be scanned by linkperf:perf-buildid-cache[1] to make SDT events as cached events.
+
+For details of the SDT, see below.
+https://sourceware.org/gdb/onlinedocs/gdb/Static-Probe-Points.html
+
+ESCAPED CHARACTER
+-----------------
+
+In the probe syntax, '=', '@', '+', ':' and ';' are treated as a special character. You can use a backslash ('\') to escape the special characters.
+This is useful if you need to probe on a specific versioned symbols, like @GLIBC_... suffixes, or also you need to specify a source file which includes the special characters.
+Note that usually single backslash is consumed by shell, so you might need to pass double backslash (\\) or wrapping with single quotes (\'AAA\@BBB').
+See EXAMPLES how it is used.
+
+PROBE ARGUMENT
+--------------
+Each probe argument follows below syntax.
+
+ [NAME=]LOCALVAR|$retval|%REG|@SYMBOL[:TYPE][@user]
+
+'NAME' specifies the name of this argument (optional). You can use the name of local variable, local data structure member (e.g. var->field, var.field2), local array with fixed index (e.g. array[1], var->array[0], var->pointer[2]), or kprobe-tracer argument format (e.g. $retval, %ax, etc). Note that the name of this argument will be set as the last member name if you specify a local data structure member (e.g. field2 for 'var->field1.field2'.)
+'$vars' and '$params' special arguments are also available for NAME, '$vars' is expanded to the local variables (including function parameters) which can access at given probe point. '$params' is expanded to only the function parameters.
+'TYPE' casts the type of this argument (optional). If omitted, perf probe automatically set the type based on debuginfo (*). Currently, basic types (u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal integers (x/x8/x16/x32/x64), signedness casting (u/s), "string" and bitfield are supported. (see TYPES for detail)
+On x86 systems %REG is always the short form of the register: for example %AX. %RAX or %EAX is not valid.
+"@user" is a special attribute which means the LOCALVAR will be treated as a user-space memory. This is only valid for kprobe event.
+
+TYPES
+-----
+Basic types (u8/u16/u32/u64/s8/s16/s32/s64) and hexadecimal integers (x8/x16/x32/x64) are integer types. Prefix 's' and 'u' means those types are signed and unsigned respectively, and 'x' means that is shown in hexadecimal format. Traced arguments are shown in decimal (sNN/uNN) or hex (xNN). You can also use 's' or 'u' to specify only signedness and leave its size auto-detected by perf probe. Moreover, you can use 'x' to explicitly specify to be shown in hexadecimal (the size is also auto-detected).
+String type is a special type, which fetches a "null-terminated" string from kernel space. This means it will fail and store NULL if the string container has been paged out. You can specify 'string' type only for the local variable or structure member which is an array of or a pointer to 'char' or 'unsigned char' type.
+Bitfield is another special type, which takes 3 parameters, bit-width, bit-offset, and container-size (usually 32). The syntax is;
+
+ b<bit-width>@<bit-offset>/<container-size>
+
+LINE SYNTAX
+-----------
+Line range is described by following syntax.
+
+ "FUNC[@SRC][:RLN[+NUM|-RLN2]]|SRC[:ALN[+NUM|-ALN2]]"
+
+FUNC specifies the function name of showing lines. 'RLN' is the start line
+number from function entry line, and 'RLN2' is the end line number. As same as
+probe syntax, 'SRC' means the source file path, 'ALN' is start line number,
+and 'ALN2' is end line number in the file. It is also possible to specify how
+many lines to show by using 'NUM'. Moreover, 'FUNC@SRC' combination is good
+for searching a specific function when several functions share same name.
+So, "source.c:100-120" shows lines between 100th to l20th in source.c file. And "func:10+20" shows 20 lines from 10th line of func function.
+
+LAZY MATCHING
+-------------
+The lazy line matching is similar to glob matching but ignoring spaces in both of pattern and target. So this accepts wildcards('*', '?') and character classes(e.g. [a-z], [!A-Z]).
+
+e.g.
+ 'a=*' can matches 'a=b', 'a = b', 'a == b' and so on.
+
+This provides some sort of flexibility and robustness to probe point definitions against minor code changes. For example, actual 10th line of schedule() can be moved easily by modifying schedule(), but the same line matching 'rq=cpu_rq*' may still exist in the function.)
+
+FILTER PATTERN
+--------------
+The filter pattern is a glob matching pattern(s) to filter variables.
+In addition, you can use "!" for specifying filter-out rule. You also can give several rules combined with "&" or "|", and fold those rules as one rule by using "(" ")".
+
+e.g.
+ With --filter "foo* | bar*", perf probe -V shows variables which start with "foo" or "bar".
+ With --filter "!foo* & *bar", perf probe -V shows variables which don't start with "foo" and end with "bar", like "fizzbar". But "foobar" is filtered out.
+
+EXAMPLES
+--------
+Display which lines in schedule() can be probed:
+
+ ./perf probe --line schedule
+
+Add a probe on schedule() function 12th line with recording cpu local variable:
+
+ ./perf probe schedule:12 cpu
+ or
+ ./perf probe --add='schedule:12 cpu'
+
+Add one or more probes which has the name start with "schedule".
+
+ ./perf probe schedule*
+ or
+ ./perf probe --add='schedule*'
+
+Add probes on lines in schedule() function which calls update_rq_clock().
+
+ ./perf probe 'schedule;update_rq_clock*'
+ or
+ ./perf probe --add='schedule;update_rq_clock*'
+
+Delete all probes on schedule().
+
+ ./perf probe --del='schedule*'
+
+Add probes at zfree() function on /bin/zsh
+
+ ./perf probe -x /bin/zsh zfree or ./perf probe /bin/zsh zfree
+
+Add probes at malloc() function on libc
+
+ ./perf probe -x /lib/libc.so.6 malloc or ./perf probe /lib/libc.so.6 malloc
+
+Add a uprobe to a target process running in a different mount namespace
+
+ ./perf probe --target-ns <target pid> -x /lib64/libc.so.6 malloc
+
+Add a USDT probe to a target process running in a different mount namespace
+
+ ./perf probe --target-ns <target pid> -x /usr/lib/jvm/java-1.8.0-openjdk-1.8.0.121-0.b13.el7_3.x86_64/jre/lib/amd64/server/libjvm.so %sdt_hotspot:thread__sleep__end
+
+Add a probe on specific versioned symbol by backslash escape
+
+ ./perf probe -x /lib64/libc-2.25.so 'malloc_get_state\@GLIBC_2.2.5'
+
+Add a probe in a source file using special characters by backslash escape
+
+ ./perf probe -x /opt/test/a.out 'foo\+bar.c:4'
+
+
+PERMISSIONS AND SYSCTL
+----------------------
+Since perf probe depends on ftrace (tracefs) and kallsyms (/proc/kallsyms), you have to care about the permission and some sysctl knobs.
+
+ - Since tracefs and kallsyms requires root or privileged user to access it, the following perf probe commands also require it; --add, --del, --list (except for --cache option)
+
+ - The system admin can remount the tracefs with 755 (`sudo mount -o remount,mode=755 /sys/kernel/tracing/`) to allow unprivileged user to run the perf probe --list command.
+
+ - /proc/sys/kernel/kptr_restrict = 2 (restrict all users) also prevents perf probe to retrieve the important information from kallsyms. You also need to set to 1 (restrict non CAP_SYSLOG users) for the above commands. Since the user-space probe doesn't need to access kallsyms, this is only for probing the kernel function (kprobes).
+
+ - Since the perf probe commands read the vmlinux (for kernel) and/or the debuginfo file (including user-space application), you need to ensure that you can read those files.
+
+
+SEE ALSO
+--------
+linkperf:perf-trace[1], linkperf:perf-record[1], linkperf:perf-buildid-cache[1]
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
new file mode 100644
index 000000000..9ea6d44ac
--- /dev/null
+++ b/tools/perf/Documentation/perf-record.txt
@@ -0,0 +1,790 @@
+perf-record(1)
+==============
+
+NAME
+----
+perf-record - Run a command and record its profile into perf.data
+
+SYNOPSIS
+--------
+[verse]
+'perf record' [-e <EVENT> | --event=EVENT] [-a] <command>
+'perf record' [-e <EVENT> | --event=EVENT] [-a] \-- <command> [<options>]
+
+DESCRIPTION
+-----------
+This command runs a command and gathers a performance counter profile
+from it, into perf.data - without displaying anything.
+
+This file can then be inspected later on, using 'perf report'.
+
+
+OPTIONS
+-------
+<command>...::
+	Any command you can specify in a shell.
+
+-e::
+--event=::
+	Select the PMU event. Selection can be:
+
+        - a symbolic event name	(use 'perf list' to list all events)
+
+        - a raw PMU event in the form of rN where N is a hexadecimal value
+          that represents the raw register encoding with the layout of the
+          event control registers as described by entries in
+          /sys/bus/event_source/devices/cpu/format/*.
+
+        - a symbolic or raw PMU event followed by an optional colon
+	  and a list of event modifiers, e.g., cpu-cycles:p.  See the
+	  linkperf:perf-list[1] man page for details on event modifiers.
+
+	- a symbolically formed PMU event like 'pmu/param1=0x3,param2/' where
+	  'param1', 'param2', etc are defined as formats for the PMU in
+	  /sys/bus/event_source/devices/<pmu>/format/*.
+
+	- a symbolically formed event like 'pmu/config=M,config1=N,config3=K/'
+
+          where M, N, K are numbers (in decimal, hex, octal format). Acceptable
+          values for each of 'config', 'config1' and 'config2' are defined by
+          corresponding entries in /sys/bus/event_source/devices/<pmu>/format/*
+          param1 and param2 are defined as formats for the PMU in:
+          /sys/bus/event_source/devices/<pmu>/format/*
+
+	  There are also some parameters which are not defined in .../<pmu>/format/*.
+	  These params can be used to overload default config values per event.
+	  Here are some common parameters:
+	  - 'period': Set event sampling period
+	  - 'freq': Set event sampling frequency
+	  - 'time': Disable/enable time stamping. Acceptable values are 1 for
+		    enabling time stamping. 0 for disabling time stamping.
+		    The default is 1.
+	  - 'call-graph': Disable/enable callgraph. Acceptable str are "fp" for
+			 FP mode, "dwarf" for DWARF mode, "lbr" for LBR mode and
+			 "no" for disable callgraph.
+	  - 'stack-size': user stack size for dwarf mode
+	  - 'name' : User defined event name. Single quotes (') may be used to
+		    escape symbols in the name from parsing by shell and tool
+		    like this: name=\'CPU_CLK_UNHALTED.THREAD:cmask=0x1\'.
+	  - 'aux-output': Generate AUX records instead of events. This requires
+			  that an AUX area event is also provided.
+	  - 'aux-sample-size': Set sample size for AUX area sampling. If the
+	  '--aux-sample' option has been used, set aux-sample-size=0 to disable
+	  AUX area sampling for the event.
+
+          See the linkperf:perf-list[1] man page for more parameters.
+
+	  Note: If user explicitly sets options which conflict with the params,
+	  the value set by the parameters will be overridden.
+
+	  Also not defined in .../<pmu>/format/* are PMU driver specific
+	  configuration parameters.  Any configuration parameter preceded by
+	  the letter '@' is not interpreted in user space and sent down directly
+	  to the PMU driver.  For example:
+
+	  perf record -e some_event/@cfg1,@cfg2=config/ ...
+
+	  will see 'cfg1' and 'cfg2=config' pushed to the PMU driver associated
+	  with the event for further processing.  There is no restriction on
+	  what the configuration parameters are, as long as their semantic is
+	  understood and supported by the PMU driver.
+
+        - a hardware breakpoint event in the form of '\mem:addr[/len][:access]'
+          where addr is the address in memory you want to break in.
+          Access is the memory access type (read, write, execute) it can
+          be passed as follows: '\mem:addr[:[r][w][x]]'. len is the range,
+          number of bytes from specified addr, which the breakpoint will cover.
+          If you want to profile read-write accesses in 0x1000, just set
+          'mem:0x1000:rw'.
+          If you want to profile write accesses in [0x1000~1008), just set
+          'mem:0x1000/8:w'.
+
+        - a BPF source file (ending in .c) or a precompiled object file (ending
+          in .o) selects one or more BPF events.
+          The BPF program can attach to various perf events based on the ELF section
+          names.
+
+          When processing a '.c' file, perf searches an installed LLVM to compile it
+          into an object file first. Optional clang options can be passed via the
+          '--clang-opt' command line option, e.g.:
+
+            perf record --clang-opt "-DLINUX_VERSION_CODE=0x50000" \
+                        -e tests/bpf-script-example.c
+
+          Note: '--clang-opt' must be placed before '--event/-e'.
+
+	- a group of events surrounded by a pair of brace ("{event1,event2,...}").
+	  Each event is separated by commas and the group should be quoted to
+	  prevent the shell interpretation.  You also need to use --group on
+	  "perf report" to view group events together.
+
+--filter=<filter>::
+        Event filter. This option should follow an event selector (-e) which
+	selects either tracepoint event(s) or a hardware trace PMU
+	(e.g. Intel PT or CoreSight).
+
+	- tracepoint filters
+
+	In the case of tracepoints, multiple '--filter' options are combined
+	using '&&'.
+
+	- address filters
+
+	A hardware trace PMU advertises its ability to accept a number of
+	address filters	by specifying a non-zero value in
+	/sys/bus/event_source/devices/<pmu>/nr_addr_filters.
+
+	Address filters have the format:
+
+	filter|start|stop|tracestop <start> [/ <size>] [@<file name>]
+
+	Where:
+	- 'filter': defines a region that will be traced.
+	- 'start': defines an address at which tracing will begin.
+	- 'stop': defines an address at which tracing will stop.
+	- 'tracestop': defines a region in which tracing will stop.
+
+	<file name> is the name of the object file, <start> is the offset to the
+	code to trace in that file, and <size> is the size of the region to
+	trace. 'start' and 'stop' filters need not specify a <size>.
+
+	If no object file is specified then the kernel is assumed, in which case
+	the start address must be a current kernel memory address.
+
+	<start> can also be specified by providing the name of a symbol. If the
+	symbol name is not unique, it can be disambiguated by inserting #n where
+	'n' selects the n'th symbol in address order. Alternately #0, #g or #G
+	select only a global symbol. <size> can also be specified by providing
+	the name of a symbol, in which case the size is calculated to the end
+	of that symbol. For 'filter' and 'tracestop' filters, if <size> is
+	omitted and <start> is a symbol, then the size is calculated to the end
+	of that symbol.
+
+	If <size> is omitted and <start> is '*', then the start and size will
+	be calculated from the first and last symbols, i.e. to trace the whole
+	file.
+
+	If symbol names (or '*') are provided, they must be surrounded by white
+	space.
+
+	The filter passed to the kernel is not necessarily the same as entered.
+	To see the filter that is passed, use the -v option.
+
+	The kernel may not be able to configure a trace region if it is not
+	within a single mapping.  MMAP events (or /proc/<pid>/maps) can be
+	examined to determine if that is a possibility.
+
+	Multiple filters can be separated with space or comma.
+
+--exclude-perf::
+	Don't record events issued by perf itself. This option should follow
+	an event selector (-e) which selects tracepoint event(s). It adds a
+	filter expression 'common_pid != $PERFPID' to filters. If other
+	'--filter' exists, the new filter expression will be combined with
+	them by '&&'.
+
+-a::
+--all-cpus::
+        System-wide collection from all CPUs (default if no target is specified).
+
+-p::
+--pid=::
+	Record events on existing process ID (comma separated list).
+
+-t::
+--tid=::
+        Record events on existing thread ID (comma separated list).
+        This option also disables inheritance by default.  Enable it by adding
+        --inherit.
+
+-u::
+--uid=::
+        Record events in threads owned by uid. Name or number.
+
+-r::
+--realtime=::
+	Collect data with this RT SCHED_FIFO priority.
+
+--no-buffering::
+	Collect data without buffering.
+
+-c::
+--count=::
+	Event period to sample.
+
+-o::
+--output=::
+	Output file name.
+
+-i::
+--no-inherit::
+	Child tasks do not inherit counters.
+
+-F::
+--freq=::
+	Profile at this frequency. Use 'max' to use the currently maximum
+	allowed frequency, i.e. the value in the kernel.perf_event_max_sample_rate
+	sysctl. Will throttle down to the currently maximum allowed frequency.
+	See --strict-freq.
+
+--strict-freq::
+	Fail if the specified frequency can't be used.
+
+-m::
+--mmap-pages=::
+	Number of mmap data pages (must be a power of two) or size
+	specification with appended unit character - B/K/M/G. The
+	size is rounded up to have nearest pages power of two value.
+	Also, by adding a comma, the number of mmap pages for AUX
+	area tracing can be specified.
+
+--group::
+	Put all events in a single event group.  This precedes the --event
+	option and remains only for backward compatibility.  See --event.
+
+-g::
+	Enables call-graph (stack chain/backtrace) recording for both
+	kernel space and user space.
+
+--call-graph::
+	Setup and enable call-graph (stack chain/backtrace) recording,
+	implies -g.  Default is "fp" (for user space).
+
+	The unwinding method used for kernel space is dependent on the
+	unwinder used by the active kernel configuration, i.e
+	CONFIG_UNWINDER_FRAME_POINTER (fp) or CONFIG_UNWINDER_ORC (orc)
+
+	Any option specified here controls the method used for user space.
+
+	Valid options are "fp" (frame pointer), "dwarf" (DWARF's CFI -
+	Call Frame Information) or "lbr" (Hardware Last Branch Record
+	facility).
+
+	In some systems, where binaries are build with gcc
+	--fomit-frame-pointer, using the "fp" method will produce bogus
+	call graphs, using "dwarf", if available (perf tools linked to
+	the libunwind or libdw library) should be used instead.
+	Using the "lbr" method doesn't require any compiler options. It
+	will produce call graphs from the hardware LBR registers. The
+	main limitation is that it is only available on new Intel
+	platforms, such as Haswell. It can only get user call chain. It
+	doesn't work with branch stack sampling at the same time.
+
+	When "dwarf" recording is used, perf also records (user) stack dump
+	when sampled.  Default size of the stack dump is 8192 (bytes).
+	User can change the size by passing the size after comma like
+	"--call-graph dwarf,4096".
+
+	When "fp" recording is used, perf tries to save stack enties
+	up to the number specified in sysctl.kernel.perf_event_max_stack
+	by default.  User can change the number by passing it after comma
+	like "--call-graph fp,32".
+
+-q::
+--quiet::
+	Don't print any warnings or messages, useful for scripting.
+
+-v::
+--verbose::
+	Be more verbose (show counter open errors, etc).
+
+-s::
+--stat::
+	Record per-thread event counts.  Use it with 'perf report -T' to see
+	the values.
+
+-d::
+--data::
+	Record the sample virtual addresses.
+
+--phys-data::
+	Record the sample physical addresses.
+
+--data-page-size::
+	Record the sampled data address data page size.
+
+--code-page-size::
+	Record the sampled code address (ip) page size
+
+-T::
+--timestamp::
+	Record the sample timestamps. Use it with 'perf report -D' to see the
+	timestamps, for instance.
+
+-P::
+--period::
+	Record the sample period.
+
+--sample-cpu::
+	Record the sample cpu.
+
+--sample-identifier::
+	Record the sample identifier i.e. PERF_SAMPLE_IDENTIFIER bit set in
+	the sample_type member of the struct perf_event_attr argument to the
+	perf_event_open system call.
+
+-n::
+--no-samples::
+	Don't sample.
+
+-R::
+--raw-samples::
+Collect raw sample records from all opened counters (default for tracepoint counters).
+
+-C::
+--cpu::
+Collect samples only on the list of CPUs provided. Multiple CPUs can be provided as a
+comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
+In per-thread mode with inheritance mode on (default), samples are captured only when
+the thread executes on the designated CPUs. Default is to monitor all CPUs.
+
+-B::
+--no-buildid::
+Do not save the build ids of binaries in the perf.data files. This skips
+post processing after recording, which sometimes makes the final step in
+the recording process to take a long time, as it needs to process all
+events looking for mmap records. The downside is that it can misresolve
+symbols if the workload binaries used when recording get locally rebuilt
+or upgraded, because the only key available in this case is the
+pathname. You can also set the "record.build-id" config variable to
+'skip to have this behaviour permanently.
+
+-N::
+--no-buildid-cache::
+Do not update the buildid cache. This saves some overhead in situations
+where the information in the perf.data file (which includes buildids)
+is sufficient.  You can also set the "record.build-id" config variable to
+'no-cache' to have the same effect.
+
+-G name,...::
+--cgroup name,...::
+monitor only in the container (cgroup) called "name". This option is available only
+in per-cpu mode. The cgroup filesystem must be mounted. All threads belonging to
+container "name" are monitored when they run on the monitored CPUs. Multiple cgroups
+can be provided. Each cgroup is applied to the corresponding event, i.e., first cgroup
+to first event, second cgroup to second event and so on. It is possible to provide
+an empty cgroup (monitor all the time) using, e.g., -G foo,,bar. Cgroups must have
+corresponding events, i.e., they always refer to events defined earlier on the command
+line. If the user wants to track multiple events for a specific cgroup, the user can
+use '-e e1 -e e2 -G foo,foo' or just use '-e e1 -e e2 -G foo'.
+
+If wanting to monitor, say, 'cycles' for a cgroup and also for system wide, this
+command line can be used: 'perf stat -e cycles -G cgroup_name -a -e cycles'.
+
+-b::
+--branch-any::
+Enable taken branch stack sampling. Any type of taken branch may be sampled.
+This is a shortcut for --branch-filter any. See --branch-filter for more infos.
+
+-j::
+--branch-filter::
+Enable taken branch stack sampling. Each sample captures a series of consecutive
+taken branches. The number of branches captured with each sample depends on the
+underlying hardware, the type of branches of interest, and the executed code.
+It is possible to select the types of branches captured by enabling filters. The
+following filters are defined:
+
+        - any:  any type of branches
+        - any_call: any function call or system call
+        - any_ret: any function return or system call return
+        - ind_call: any indirect branch
+        - call: direct calls, including far (to/from kernel) calls
+        - u:  only when the branch target is at the user level
+        - k: only when the branch target is in the kernel
+        - hv: only when the target is at the hypervisor level
+	- in_tx: only when the target is in a hardware transaction
+	- no_tx: only when the target is not in a hardware transaction
+	- abort_tx: only when the target is a hardware transaction abort
+	- cond: conditional branches
+	- save_type: save branch type during sampling in case binary is not available later
+		     For the platforms with Intel Arch LBR support (12th-Gen+ client or
+		     4th-Gen Xeon+ server), the save branch type is unconditionally enabled
+		     when the taken branch stack sampling is enabled.
+	- priv: save privilege state during sampling in case binary is not available later
+
++
+The option requires at least one branch type among any, any_call, any_ret, ind_call, cond.
+The privilege levels may be omitted, in which case, the privilege levels of the associated
+event are applied to the branch filter. Both kernel (k) and hypervisor (hv) privilege
+levels are subject to permissions.  When sampling on multiple events, branch stack sampling
+is enabled for all the sampling events. The sampled branch type is the same for all events.
+The various filters must be specified as a comma separated list: --branch-filter any_ret,u,k
+Note that this feature may not be available on all processors.
+
+-W::
+--weight::
+Enable weightened sampling. An additional weight is recorded per sample and can be
+displayed with the weight and local_weight sort keys.  This currently works for TSX
+abort events and some memory events in precise mode on modern Intel CPUs.
+
+--namespaces::
+Record events of type PERF_RECORD_NAMESPACES.  This enables 'cgroup_id' sort key.
+
+--all-cgroups::
+Record events of type PERF_RECORD_CGROUP.  This enables 'cgroup' sort key.
+
+--transaction::
+Record transaction flags for transaction related events.
+
+--per-thread::
+Use per-thread mmaps.  By default per-cpu mmaps are created.  This option
+overrides that and uses per-thread mmaps.  A side-effect of that is that
+inheritance is automatically disabled.  --per-thread is ignored with a warning
+if combined with -a or -C options.
+
+-D::
+--delay=::
+After starting the program, wait msecs before measuring (-1: start with events
+disabled), or enable events only for specified ranges of msecs (e.g.
+-D 10-20,30-40 means wait 10 msecs, enable for 10 msecs, wait 10 msecs, enable
+for 10 msecs, then stop). Note, delaying enabling of events is useful to filter
+out the startup phase of the program, which is often very different.
+
+-I::
+--intr-regs::
+Capture machine state (registers) at interrupt, i.e., on counter overflows for
+each sample. List of captured registers depends on the architecture. This option
+is off by default. It is possible to select the registers to sample using their
+symbolic names, e.g. on x86, ax, si. To list the available registers use
+--intr-regs=\?. To name registers, pass a comma separated list such as
+--intr-regs=ax,bx. The list of register is architecture dependent.
+
+--user-regs::
+Similar to -I, but capture user registers at sample time. To list the available
+user registers use --user-regs=\?.
+
+--running-time::
+Record running and enabled time for read events (:S)
+
+-k::
+--clockid::
+Sets the clock id to use for the various time fields in the perf_event_type
+records. See clock_gettime(). In particular CLOCK_MONOTONIC and
+CLOCK_MONOTONIC_RAW are supported, some events might also allow
+CLOCK_BOOTTIME, CLOCK_REALTIME and CLOCK_TAI.
+
+-S::
+--snapshot::
+Select AUX area tracing Snapshot Mode. This option is valid only with an
+AUX area tracing event. Optionally, certain snapshot capturing parameters
+can be specified in a string that follows this option:
+  'e': take one last snapshot on exit; guarantees that there is at least one
+       snapshot in the output file;
+  <size>: if the PMU supports this, specify the desired snapshot size.
+
+In Snapshot Mode trace data is captured only when signal SIGUSR2 is received
+and on exit if the above 'e' option is given.
+
+--aux-sample[=OPTIONS]::
+Select AUX area sampling. At least one of the events selected by the -e option
+must be an AUX area event. Samples on other events will be created containing
+data from the AUX area. Optionally sample size may be specified, otherwise it
+defaults to 4KiB.
+
+--proc-map-timeout::
+When processing pre-existing threads /proc/XXX/mmap, it may take a long time,
+because the file may be huge. A time out is needed in such cases.
+This option sets the time out limit. The default value is 500 ms.
+
+--switch-events::
+Record context switch events i.e. events of type PERF_RECORD_SWITCH or
+PERF_RECORD_SWITCH_CPU_WIDE. In some cases (e.g. Intel PT, CoreSight or Arm SPE)
+switch events will be enabled automatically, which can be suppressed by
+by the option --no-switch-events.
+
+--clang-path=PATH::
+Path to clang binary to use for compiling BPF scriptlets.
+(enabled when BPF support is on)
+
+--clang-opt=OPTIONS::
+Options passed to clang when compiling BPF scriptlets.
+(enabled when BPF support is on)
+
+--vmlinux=PATH::
+Specify vmlinux path which has debuginfo.
+(enabled when BPF prologue is on)
+
+--buildid-all::
+Record build-id of all DSOs regardless whether it's actually hit or not.
+
+--buildid-mmap::
+Record build ids in mmap2 events, disables build id cache (implies --no-buildid).
+
+--aio[=n]::
+Use <n> control blocks in asynchronous (Posix AIO) trace writing mode (default: 1, max: 4).
+Asynchronous mode is supported only when linking Perf tool with libc library
+providing implementation for Posix AIO API.
+
+--affinity=mode::
+Set affinity mask of trace reading thread according to the policy defined by 'mode' value:
+  node - thread affinity mask is set to NUMA node cpu mask of the processed mmap buffer
+  cpu  - thread affinity mask is set to cpu of the processed mmap buffer
+
+--mmap-flush=number::
+
+Specify minimal number of bytes that is extracted from mmap data pages and
+processed for output. One can specify the number using B/K/M/G suffixes.
+
+The maximal allowed value is a quarter of the size of mmaped data pages.
+
+The default option value is 1 byte which means that every time that the output
+writing thread finds some new data in the mmaped buffer the data is extracted,
+possibly compressed (-z) and written to the output, perf.data or pipe.
+
+Larger data chunks are compressed more effectively in comparison to smaller
+chunks so extraction of larger chunks from the mmap data pages is preferable
+from the perspective of output size reduction.
+
+Also at some cases executing less output write syscalls with bigger data size
+can take less time than executing more output write syscalls with smaller data
+size thus lowering runtime profiling overhead.
+
+-z::
+--compression-level[=n]::
+Produce compressed trace using specified level n (default: 1 - fastest compression,
+22 - smallest trace)
+
+--all-kernel::
+Configure all used events to run in kernel space.
+
+--all-user::
+Configure all used events to run in user space.
+
+--kernel-callchains::
+Collect callchains only from kernel space. I.e. this option sets
+perf_event_attr.exclude_callchain_user to 1.
+
+--user-callchains::
+Collect callchains only from user space. I.e. this option sets
+perf_event_attr.exclude_callchain_kernel to 1.
+
+Don't use both --kernel-callchains and --user-callchains at the same time or no
+callchains will be collected.
+
+--timestamp-filename
+Append timestamp to output file name.
+
+--timestamp-boundary::
+Record timestamp boundary (time of first/last samples).
+
+--switch-output[=mode]::
+Generate multiple perf.data files, timestamp prefixed, switching to a new one
+based on 'mode' value:
+  "signal" - when receiving a SIGUSR2 (default value) or
+  <size>   - when reaching the size threshold, size is expected to
+             be a number with appended unit character - B/K/M/G
+  <time>   - when reaching the time threshold, size is expected to
+             be a number with appended unit character - s/m/h/d
+
+             Note: the precision of  the size  threshold  hugely depends
+             on your configuration  - the number and size of  your  ring
+             buffers (-m). It is generally more precise for higher sizes
+             (like >5M), for lower values expect different sizes.
+
+A possible use case is to, given an external event, slice the perf.data file
+that gets then processed, possibly via a perf script, to decide if that
+particular perf.data snapshot should be kept or not.
+
+Implies --timestamp-filename, --no-buildid and --no-buildid-cache.
+The reason for the latter two is to reduce the data file switching
+overhead. You can still switch them on with:
+
+  --switch-output --no-no-buildid  --no-no-buildid-cache
+
+--switch-output-event::
+Events that will cause the switch of the perf.data file, auto-selecting
+--switch-output=signal, the results are similar as internally the side band
+thread will also send a SIGUSR2 to the main one.
+
+Uses the same syntax as --event, it will just not be recorded, serving only to
+switch the perf.data file as soon as the --switch-output event is processed by
+a separate sideband thread.
+
+This sideband thread is also used to other purposes, like processing the
+PERF_RECORD_BPF_EVENT records as they happen, asking the kernel for extra BPF
+information, etc.
+
+--switch-max-files=N::
+
+When rotating perf.data with --switch-output, only keep N files.
+
+--dry-run::
+Parse options then exit. --dry-run can be used to detect errors in cmdline
+options.
+
+'perf record --dry-run -e' can act as a BPF script compiler if llvm.dump-obj
+in config file is set to true.
+
+--synth=TYPE::
+Collect and synthesize given type of events (comma separated).  Note that
+this option controls the synthesis from the /proc filesystem which represent
+task status for pre-existing threads.
+
+Kernel (and some other) events are recorded regardless of the
+choice in this option.  For example, --synth=no would have MMAP events for
+kernel and modules.
+
+Available types are:
+  'task'    - synthesize FORK and COMM events for each task
+  'mmap'    - synthesize MMAP events for each process (implies 'task')
+  'cgroup'  - synthesize CGROUP events for each cgroup
+  'all'     - synthesize all events (default)
+  'no'      - do not synthesize any of the above events
+
+--tail-synthesize::
+Instead of collecting non-sample events (for example, fork, comm, mmap) at
+the beginning of record, collect them during finalizing an output file.
+The collected non-sample events reflects the status of the system when
+record is finished.
+
+--overwrite::
+Makes all events use an overwritable ring buffer. An overwritable ring
+buffer works like a flight recorder: when it gets full, the kernel will
+overwrite the oldest records, that thus will never make it to the
+perf.data file.
+
+When '--overwrite' and '--switch-output' are used perf records and drops
+events until it receives a signal, meaning that something unusual was
+detected that warrants taking a snapshot of the most current events,
+those fitting in the ring buffer at that moment.
+
+'overwrite' attribute can also be set or canceled for an event using
+config terms. For example: 'cycles/overwrite/' and 'instructions/no-overwrite/'.
+
+Implies --tail-synthesize.
+
+--kcore::
+Make a copy of /proc/kcore and place it into a directory with the perf data file.
+
+--max-size=<size>::
+Limit the sample data max size, <size> is expected to be a number with
+appended unit character - B/K/M/G
+
+--num-thread-synthesize::
+	The number of threads to run when synthesizing events for existing processes.
+	By default, the number of threads equals 1.
+
+ifdef::HAVE_LIBPFM[]
+--pfm-events events::
+Select a PMU event using libpfm4 syntax (see http://perfmon2.sf.net)
+including support for event filters. For example '--pfm-events
+inst_retired:any_p:u:c=1:i'. More than one event can be passed to the
+option using the comma separator. Hardware events and generic hardware
+events cannot be mixed together. The latter must be used with the -e
+option. The -e option and this one can be mixed and matched.  Events
+can be grouped using the {} notation.
+endif::HAVE_LIBPFM[]
+
+--control=fifo:ctl-fifo[,ack-fifo]::
+--control=fd:ctl-fd[,ack-fd]::
+ctl-fifo / ack-fifo are opened and used as ctl-fd / ack-fd as follows.
+Listen on ctl-fd descriptor for command to control measurement.
+
+Available commands:
+  'enable'           : enable events
+  'disable'          : disable events
+  'enable name'      : enable event 'name'
+  'disable name'     : disable event 'name'
+  'snapshot'         : AUX area tracing snapshot).
+  'stop'             : stop perf record
+  'ping'             : ping
+
+  'evlist [-v|-g|-F] : display all events
+                       -F  Show just the sample frequency used for each event.
+                       -v  Show all fields.
+                       -g  Show event group information.
+
+Measurements can be started with events disabled using --delay=-1 option. Optionally
+send control command completion ('ack\n') to ack-fd descriptor to synchronize with the
+controlling process.  Example of bash shell script to enable and disable events during
+measurements:
+
+ #!/bin/bash
+
+ ctl_dir=/tmp/
+
+ ctl_fifo=${ctl_dir}perf_ctl.fifo
+ test -p ${ctl_fifo} && unlink ${ctl_fifo}
+ mkfifo ${ctl_fifo}
+ exec {ctl_fd}<>${ctl_fifo}
+
+ ctl_ack_fifo=${ctl_dir}perf_ctl_ack.fifo
+ test -p ${ctl_ack_fifo} && unlink ${ctl_ack_fifo}
+ mkfifo ${ctl_ack_fifo}
+ exec {ctl_fd_ack}<>${ctl_ack_fifo}
+
+ perf record -D -1 -e cpu-cycles -a               \
+             --control fd:${ctl_fd},${ctl_fd_ack} \
+             -- sleep 30 &
+ perf_pid=$!
+
+ sleep 5  && echo 'enable' >&${ctl_fd} && read -u ${ctl_fd_ack} e1 && echo "enabled(${e1})"
+ sleep 10 && echo 'disable' >&${ctl_fd} && read -u ${ctl_fd_ack} d1 && echo "disabled(${d1})"
+
+ exec {ctl_fd_ack}>&-
+ unlink ${ctl_ack_fifo}
+
+ exec {ctl_fd}>&-
+ unlink ${ctl_fifo}
+
+ wait -n ${perf_pid}
+ exit $?
+
+--threads=<spec>::
+Write collected trace data into several data files using parallel threads.
+<spec> value can be user defined list of masks. Masks separated by colon
+define CPUs to be monitored by a thread and affinity mask of that thread
+is separated by slash:
+
+    <cpus mask 1>/<affinity mask 1>:<cpus mask 2>/<affinity mask 2>:...
+
+CPUs or affinity masks must not overlap with other corresponding masks.
+Invalid CPUs are ignored, but masks containing only invalid CPUs are not
+allowed.
+
+For example user specification like the following:
+
+    0,2-4/2-4:1,5-7/5-7
+
+specifies parallel threads layout that consists of two threads,
+the first thread monitors CPUs 0 and 2-4 with the affinity mask 2-4,
+the second monitors CPUs 1 and 5-7 with the affinity mask 5-7.
+
+<spec> value can also be a string meaning predefined parallel threads
+layout:
+
+    cpu    - create new data streaming thread for every monitored cpu
+    core   - create new thread to monitor CPUs grouped by a core
+    package - create new thread to monitor CPUs grouped by a package
+    numa   - create new threed to monitor CPUs grouped by a NUMA domain
+
+Predefined layouts can be used on systems with large number of CPUs in
+order not to spawn multiple per-cpu streaming threads but still avoid LOST
+events in data directory files. Option specified with no or empty value
+defaults to CPU layout. Masks defined or provided by the option value are
+filtered through the mask provided by -C option.
+
+--debuginfod[=URLs]::
+	Specify debuginfod URL to be used when cacheing perf.data binaries,
+	it follows the same syntax as the DEBUGINFOD_URLS variable, like:
+
+	  http://192.168.122.174:8002
+
+	If the URLs is not specified, the value of DEBUGINFOD_URLS
+	system environment variable is used.
+
+--off-cpu::
+	Enable off-cpu profiling with BPF.  The BPF program will collect
+	task scheduling information with (user) stacktrace and save them
+	as sample data of a software event named "offcpu-time".  The
+	sample period will have the time the task slept in nanoseconds.
+
+	Note that BPF can collect stack traces using frame pointer ("fp")
+	only, as of now.  So the applications built without the frame
+	pointer might see bogus addresses.
+
+include::intel-hybrid.txt[]
+
+SEE ALSO
+--------
+linkperf:perf-stat[1], linkperf:perf-list[1], linkperf:perf-intel-pt[1]
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
new file mode 100644
index 000000000..4fa509b15
--- /dev/null
+++ b/tools/perf/Documentation/perf-report.txt
@@ -0,0 +1,583 @@
+perf-report(1)
+==============
+
+NAME
+----
+perf-report - Read perf.data (created by perf record) and display the profile
+
+SYNOPSIS
+--------
+[verse]
+'perf report' [-i <file> | --input=file]
+
+DESCRIPTION
+-----------
+This command displays the performance counter profile information recorded
+via perf record.
+
+OPTIONS
+-------
+-i::
+--input=::
+        Input file name. (default: perf.data unless stdin is a fifo)
+
+-v::
+--verbose::
+        Be more verbose. (show symbol address, etc)
+
+-q::
+--quiet::
+	Do not show any warnings or messages.  (Suppress -v)
+
+-n::
+--show-nr-samples::
+	Show the number of samples for each symbol
+
+--show-cpu-utilization::
+        Show sample percentage for different cpu modes.
+
+-T::
+--threads::
+	Show per-thread event counters.  The input data file should be recorded
+	with -s option.
+-c::
+--comms=::
+	Only consider symbols in these comms. CSV that understands
+	file://filename entries.  This option will affect the percentage of
+	the overhead column.  See --percentage for more info.
+--pid=::
+        Only show events for given process ID (comma separated list).
+
+--tid=::
+        Only show events for given thread ID (comma separated list).
+-d::
+--dsos=::
+	Only consider symbols in these dsos. CSV that understands
+	file://filename entries.  This option will affect the percentage of
+	the overhead column.  See --percentage for more info.
+-S::
+--symbols=::
+	Only consider these symbols. CSV that understands
+	file://filename entries.  This option will affect the percentage of
+	the overhead column.  See --percentage for more info.
+
+--symbol-filter=::
+	Only show symbols that match (partially) with this filter.
+
+-U::
+--hide-unresolved::
+        Only display entries resolved to a symbol.
+
+-s::
+--sort=::
+	Sort histogram entries by given key(s) - multiple keys can be specified
+	in CSV format.  Following sort keys are available:
+	pid, comm, dso, symbol, parent, cpu, socket, srcline, weight,
+	local_weight, cgroup_id, addr.
+
+	Each key has following meaning:
+
+	- comm: command (name) of the task which can be read via /proc/<pid>/comm
+	- pid: command and tid of the task
+	- dso: name of library or module executed at the time of sample
+	- dso_size: size of library or module executed at the time of sample
+	- symbol: name of function executed at the time of sample
+	- symbol_size: size of function executed at the time of sample
+	- parent: name of function matched to the parent regex filter. Unmatched
+	entries are displayed as "[other]".
+	- cpu: cpu number the task ran at the time of sample
+	- socket: processor socket number the task ran at the time of sample
+	- srcline: filename and line number executed at the time of sample.  The
+	DWARF debugging info must be provided.
+	- srcfile: file name of the source file of the samples. Requires dwarf
+	information.
+	- weight: Event specific weight, e.g. memory latency or transaction
+	abort cost. This is the global weight.
+	- local_weight: Local weight version of the weight above.
+	- cgroup_id: ID derived from cgroup namespace device and inode numbers.
+	- cgroup: cgroup pathname in the cgroupfs.
+	- transaction: Transaction abort flags.
+	- overhead: Overhead percentage of sample
+	- overhead_sys: Overhead percentage of sample running in system mode
+	- overhead_us: Overhead percentage of sample running in user mode
+	- overhead_guest_sys: Overhead percentage of sample running in system mode
+	on guest machine
+	- overhead_guest_us: Overhead percentage of sample running in user mode on
+	guest machine
+	- sample: Number of sample
+	- period: Raw number of event count of sample
+	- time: Separate the samples by time stamp with the resolution specified by
+	--time-quantum (default 100ms). Specify with overhead and before it.
+	- code_page_size: the code page size of sampled code address (ip)
+	- ins_lat: Instruction latency in core cycles. This is the global instruction
+	  latency
+	- local_ins_lat: Local instruction latency version
+	- p_stage_cyc: On powerpc, this presents the number of cycles spent in a
+	  pipeline stage. And currently supported only on powerpc.
+	- addr: (Full) virtual address of the sampled instruction
+
+	By default, comm, dso and symbol keys are used.
+	(i.e. --sort comm,dso,symbol)
+
+	If --branch-stack option is used, following sort keys are also
+	available:
+
+	- dso_from: name of library or module branched from
+	- dso_to: name of library or module branched to
+	- symbol_from: name of function branched from
+	- symbol_to: name of function branched to
+	- srcline_from: source file and line branched from
+	- srcline_to: source file and line branched to
+	- mispredict: "N" for predicted branch, "Y" for mispredicted branch
+	- in_tx: branch in TSX transaction
+	- abort: TSX transaction abort.
+	- cycles: Cycles in basic block
+
+	And default sort keys are changed to comm, dso_from, symbol_from, dso_to
+	and symbol_to, see '--branch-stack'.
+
+	When the sort key symbol is specified, columns "IPC" and "IPC Coverage"
+	are enabled automatically. Column "IPC" reports the average IPC per function
+	and column "IPC coverage" reports the percentage of instructions with
+	sampled IPC in this function. IPC means Instruction Per Cycle. If it's low,
+	it indicates there may be a performance bottleneck when the function is
+	executed, such as a memory access bottleneck. If a function has high overhead
+	and low IPC, it's worth further analyzing it to optimize its performance.
+
+	If the --mem-mode option is used, the following sort keys are also available
+	(incompatible with --branch-stack):
+	symbol_daddr, dso_daddr, locked, tlb, mem, snoop, dcacheline, blocked.
+
+	- symbol_daddr: name of data symbol being executed on at the time of sample
+	- dso_daddr: name of library or module containing the data being executed
+	on at the time of the sample
+	- locked: whether the bus was locked at the time of the sample
+	- tlb: type of tlb access for the data at the time of the sample
+	- mem: type of memory access for the data at the time of the sample
+	- snoop: type of snoop (if any) for the data at the time of the sample
+	- dcacheline: the cacheline the data address is on at the time of the sample
+	- phys_daddr: physical address of data being executed on at the time of sample
+	- data_page_size: the data page size of data being executed on at the time of sample
+	- blocked: reason of blocked load access for the data at the time of the sample
+
+	And the default sort keys are changed to local_weight, mem, sym, dso,
+	symbol_daddr, dso_daddr, snoop, tlb, locked, blocked, local_ins_lat,
+	see '--mem-mode'.
+
+	If the data file has tracepoint event(s), following (dynamic) sort keys
+	are also available:
+	trace, trace_fields, [<event>.]<field>[/raw]
+
+	- trace: pretty printed trace output in a single column
+	- trace_fields: fields in tracepoints in separate columns
+	- <field name>: optional event and field name for a specific field
+
+	The last form consists of event and field names.  If event name is
+	omitted, it searches all events for matching field name.  The matched
+	field will be shown only for the event has the field.  The event name
+	supports substring match so user doesn't need to specify full subsystem
+	and event name everytime.  For example, 'sched:sched_switch' event can
+	be shortened to 'switch' as long as it's not ambiguous.  Also event can
+	be specified by its index (starting from 1) preceded by the '%'.
+	So '%1' is the first event, '%2' is the second, and so on.
+
+	The field name can have '/raw' suffix which disables pretty printing
+	and shows raw field value like hex numbers.  The --raw-trace option
+	has the same effect for all dynamic sort keys.
+
+	The default sort keys are changed to 'trace' if all events in the data
+	file are tracepoint.
+
+-F::
+--fields=::
+	Specify output field - multiple keys can be specified in CSV format.
+	Following fields are available:
+	overhead, overhead_sys, overhead_us, overhead_children, sample and period.
+	Also it can contain any sort key(s).
+
+	By default, every sort keys not specified in -F will be appended
+	automatically.
+
+	If the keys starts with a prefix '+', then it will append the specified
+        field(s) to the default field order. For example: perf report -F +period,sample.
+
+-p::
+--parent=<regex>::
+        A regex filter to identify parent. The parent is a caller of this
+	function and searched through the callchain, thus it requires callchain
+	information recorded. The pattern is in the extended regex format and
+	defaults to "\^sys_|^do_page_fault", see '--sort parent'.
+
+-x::
+--exclude-other::
+        Only display entries with parent-match.
+
+-w::
+--column-widths=<width[,width...]>::
+	Force each column width to the provided list, for large terminal
+	readability.  0 means no limit (default behavior).
+
+-t::
+--field-separator=::
+	Use a special separator character and don't pad with spaces, replacing
+	all occurrences of this separator in symbol names (and other output)
+	with a '.' character, that thus it's the only non valid separator.
+
+-D::
+--dump-raw-trace::
+        Dump raw trace in ASCII.
+
+--disable-order::
+	Disable raw trace ordering.
+
+-g::
+--call-graph=<print_type,threshold[,print_limit],order,sort_key[,branch],value>::
+        Display call chains using type, min percent threshold, print limit,
+	call order, sort key, optional branch and value.  Note that ordering
+	is not fixed so any parameter can be given in an arbitrary order.
+	One exception is the print_limit which should be preceded by threshold.
+
+	print_type can be either:
+	- flat: single column, linear exposure of call chains.
+	- graph: use a graph tree, displaying absolute overhead rates. (default)
+	- fractal: like graph, but displays relative rates. Each branch of
+		 the tree is considered as a new profiled object.
+	- folded: call chains are displayed in a line, separated by semicolons
+	- none: disable call chain display.
+
+	threshold is a percentage value which specifies a minimum percent to be
+	included in the output call graph.  Default is 0.5 (%).
+
+	print_limit is only applied when stdio interface is used.  It's to limit
+	number of call graph entries in a single hist entry.  Note that it needs
+	to be given after threshold (but not necessarily consecutive).
+	Default is 0 (unlimited).
+
+	order can be either:
+	- callee: callee based call graph.
+	- caller: inverted caller based call graph.
+	Default is 'caller' when --children is used, otherwise 'callee'.
+
+	sort_key can be:
+	- function: compare on functions (default)
+	- address: compare on individual code addresses
+	- srcline: compare on source filename and line number
+
+	branch can be:
+	- branch: include last branch information in callgraph when available.
+	          Usually more convenient to use --branch-history for this.
+
+	value can be:
+	- percent: display overhead percent (default)
+	- period: display event period
+	- count: display event count
+
+--children::
+	Accumulate callchain of children to parent entry so that then can
+	show up in the output.  The output will have a new "Children" column
+	and will be sorted on the data.  It requires callchains are recorded.
+	See the `overhead calculation' section for more details. Enabled by
+	default, disable with --no-children.
+
+--max-stack::
+	Set the stack depth limit when parsing the callchain, anything
+	beyond the specified depth will be ignored. This is a trade-off
+	between information loss and faster processing especially for
+	workloads that can have a very long callchain stack.
+	Note that when using the --itrace option the synthesized callchain size
+	will override this value if the synthesized callchain size is bigger.
+
+	Default: 127
+
+-G::
+--inverted::
+        alias for inverted caller based call graph.
+
+--ignore-callees=<regex>::
+        Ignore callees of the function(s) matching the given regex.
+        This has the effect of collecting the callers of each such
+        function into one place in the call-graph tree.
+
+--pretty=<key>::
+        Pretty printing style.  key: normal, raw
+
+--stdio:: Use the stdio interface.
+
+--stdio-color::
+	'always', 'never' or 'auto', allowing configuring color output
+	via the command line, in addition to via "color.ui" .perfconfig.
+	Use '--stdio-color always' to generate color even when redirecting
+	to a pipe or file. Using just '--stdio-color' is equivalent to
+	using 'always'.
+
+--tui:: Use the TUI interface, that is integrated with annotate and allows
+        zooming into DSOs or threads, among other features. Use of --tui
+	requires a tty, if one is not present, as when piping to other
+	commands, the stdio interface is used.
+
+--gtk:: Use the GTK2 interface.
+
+-k::
+--vmlinux=<file>::
+        vmlinux pathname
+
+--ignore-vmlinux::
+	Ignore vmlinux files.
+
+--kallsyms=<file>::
+        kallsyms pathname
+
+-m::
+--modules::
+        Load module symbols. WARNING: This should only be used with -k and
+        a LIVE kernel.
+
+-f::
+--force::
+        Don't do ownership validation.
+
+--symfs=<directory>::
+        Look for files with symbols relative to this directory.
+
+-C::
+--cpu:: Only report samples for the list of CPUs provided. Multiple CPUs can
+	be provided as a comma-separated list with no space: 0,1. Ranges of
+	CPUs are specified with -: 0-2. Default is to report samples on all
+	CPUs.
+
+-M::
+--disassembler-style=:: Set disassembler style for objdump.
+
+--source::
+	Interleave source code with assembly code. Enabled by default,
+	disable with --no-source.
+
+--asm-raw::
+	Show raw instruction encoding of assembly instructions.
+
+--show-total-period:: Show a column with the sum of periods.
+
+-I::
+--show-info::
+	Display extended information about the perf.data file. This adds
+	information which may be very large and thus may clutter the display.
+	It currently includes: cpu and numa topology of the host system.
+
+-b::
+--branch-stack::
+	Use the addresses of sampled taken branches instead of the instruction
+	address to build the histograms. To generate meaningful output, the
+	perf.data file must have been obtained using perf record -b or
+	perf record --branch-filter xxx where xxx is a branch filter option.
+	perf report is able to auto-detect whether a perf.data file contains
+	branch stacks and it will automatically switch to the branch view mode,
+	unless --no-branch-stack is used.
+
+--branch-history::
+	Add the addresses of sampled taken branches to the callstack.
+	This allows to examine the path the program took to each sample.
+	The data collection must have used -b (or -j) and -g.
+
+--objdump=<path>::
+        Path to objdump binary.
+
+--prefix=PREFIX::
+--prefix-strip=N::
+	Remove first N entries from source file path names in executables
+	and add PREFIX. This allows to display source code compiled on systems
+	with different file system layout.
+
+--group::
+	Show event group information together. It forces group output also
+	if there are no groups defined in data file.
+
+--group-sort-idx::
+	Sort the output by the event at the index n in group. If n is invalid,
+	sort by the first event. It can support multiple groups with different
+	amount of events. WARNING: This should be used on grouped events.
+
+--demangle::
+	Demangle symbol names to human readable form. It's enabled by default,
+	disable with --no-demangle.
+
+--demangle-kernel::
+	Demangle kernel symbol names to human readable form (for C++ kernels).
+
+--mem-mode::
+	Use the data addresses of samples in addition to instruction addresses
+	to build the histograms.  To generate meaningful output, the perf.data
+	file must have been obtained using perf record -d -W and using a
+	special event -e cpu/mem-loads/p or -e cpu/mem-stores/p. See
+	'perf mem' for simpler access.
+
+--percent-limit::
+	Do not show entries which have an overhead under that percent.
+	(Default: 0).  Note that this option also sets the percent limit (threshold)
+	of callchains.  However the default value of callchain threshold is
+	different than the default value of hist entries.  Please see the
+	--call-graph option for details.
+
+--percentage::
+	Determine how to display the overhead percentage of filtered entries.
+	Filters can be applied by --comms, --dsos and/or --symbols options and
+	Zoom operations on the TUI (thread, dso, etc).
+
+	"relative" means it's relative to filtered entries only so that the
+	sum of shown entries will be always 100%.  "absolute" means it retains
+	the original value before and after the filter is applied.
+
+--header::
+	Show header information in the perf.data file.  This includes
+	various information like hostname, OS and perf version, cpu/mem
+	info, perf command line, event list and so on.  Currently only
+	--stdio output supports this feature.
+
+--header-only::
+	Show only perf.data header (forces --stdio).
+
+--time::
+	Only analyze samples within given time window: <start>,<stop>. Times
+	have the format seconds.nanoseconds. If start is not given (i.e. time
+	string is ',x.y') then analysis starts at the beginning of the file. If
+	stop time is not given (i.e. time string is 'x.y,') then analysis goes
+	to end of file. Multiple ranges can be separated by spaces, which
+	requires the argument to be quoted e.g. --time "1234.567,1234.789 1235,"
+
+	Also support time percent with multiple time ranges. Time string is
+	'a%/n,b%/m,...' or 'a%-b%,c%-%d,...'.
+
+	For example:
+	Select the second 10% time slice:
+
+	  perf report --time 10%/2
+
+	Select from 0% to 10% time slice:
+
+	  perf report --time 0%-10%
+
+	Select the first and second 10% time slices:
+
+	  perf report --time 10%/1,10%/2
+
+	Select from 0% to 10% and 30% to 40% slices:
+
+	  perf report --time 0%-10%,30%-40%
+
+--switch-on EVENT_NAME::
+	Only consider events after this event is found.
+
+	This may be interesting to measure a workload only after some initialization
+	phase is over, i.e. insert a perf probe at that point and then using this
+	option with that probe.
+
+--switch-off EVENT_NAME::
+	Stop considering events after this event is found.
+
+--show-on-off-events::
+	Show the --switch-on/off events too. This has no effect in 'perf report' now
+	but probably we'll make the default not to show the switch-on/off events
+        on the --group mode and if there is only one event besides the off/on ones,
+	go straight to the histogram browser, just like 'perf report' with no events
+	explicitly specified does.
+
+--itrace::
+	Options for decoding instruction tracing data. The options are:
+
+include::itrace.txt[]
+
+	To disable decoding entirely, use --no-itrace.
+
+--full-source-path::
+	Show the full path for source files for srcline output.
+
+--show-ref-call-graph::
+	When multiple events are sampled, it may not be needed to collect
+	callgraphs for all of them. The sample sites are usually nearby,
+	and it's enough to collect the callgraphs on a reference event.
+	So user can use "call-graph=no" event modifier to disable callgraph
+	for other events to reduce the overhead.
+	However, perf report cannot show callgraphs for the event which
+	disable the callgraph.
+	This option extends the perf report to show reference callgraphs,
+	which collected by reference event, in no callgraph event.
+
+--stitch-lbr::
+	Show callgraph with stitched LBRs, which may have more complete
+	callgraph. The perf.data file must have been obtained using
+	perf record --call-graph lbr.
+	Disabled by default. In common cases with call stack overflows,
+	it can recreate better call stacks than the default lbr call stack
+	output. But this approach is not full proof. There can be cases
+	where it creates incorrect call stacks from incorrect matches.
+	The known limitations include exception handing such as
+	setjmp/longjmp will have calls/returns not match.
+
+--socket-filter::
+	Only report the samples on the processor socket that match with this filter
+
+--samples=N::
+	Save N individual samples for each histogram entry to show context in perf
+	report tui browser.
+
+--raw-trace::
+	When displaying traceevent output, do not use print fmt or plugins.
+
+--hierarchy::
+	Enable hierarchical output.
+
+--inline::
+	If a callgraph address belongs to an inlined function, the inline stack
+	will be printed. Each entry is function name or file/line. Enabled by
+	default, disable with --no-inline.
+
+--mmaps::
+	Show --tasks output plus mmap information in a format similar to
+	/proc/<PID>/maps.
+
+	Please note that not all mmaps are stored, options affecting which ones
+	are include 'perf record --data', for instance.
+
+--ns::
+	Show time stamps in nanoseconds.
+
+--stats::
+	Display overall events statistics without any further processing.
+	(like the one at the end of the perf report -D command)
+
+--tasks::
+	Display monitored tasks stored in perf data. Displaying pid/tid/ppid
+	plus the command string aligned to distinguish parent and child tasks.
+
+--percent-type::
+	Set annotation percent type from following choices:
+	  global-period, local-period, global-hits, local-hits
+
+	The local/global keywords set if the percentage is computed
+	in the scope of the function (local) or the whole data (global).
+	The period/hits keywords set the base the percentage is computed
+	on - the samples period or the number of samples (hits).
+
+--time-quantum::
+	Configure time quantum for time sort key. Default 100ms.
+	Accepts s, us, ms, ns units.
+
+--total-cycles::
+	When --total-cycles is specified, it supports sorting for all blocks by
+	'Sampled Cycles%'. This is useful to concentrate on the globally hottest
+	blocks. In output, there are some new columns:
+
+	'Sampled Cycles%' - block sampled cycles aggregation / total sampled cycles
+	'Sampled Cycles'  - block sampled cycles aggregation
+	'Avg Cycles%'     - block average sampled cycles / sum of total block average
+			    sampled cycles
+	'Avg Cycles'      - block average sampled cycles
+
+--skip-empty::
+	Do not print 0 results in the --stat output.
+
+include::callchain-overhead-calculation.txt[]
+
+SEE ALSO
+--------
+linkperf:perf-stat[1], linkperf:perf-annotate[1], linkperf:perf-record[1],
+linkperf:perf-intel-pt[1]
diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
new file mode 100644
index 000000000..5fbe42bd5
--- /dev/null
+++ b/tools/perf/Documentation/perf-sched.txt
@@ -0,0 +1,171 @@
+perf-sched(1)
+=============
+
+NAME
+----
+perf-sched - Tool to trace/measure scheduler properties (latencies)
+
+SYNOPSIS
+--------
+[verse]
+'perf sched' {record|latency|map|replay|script|timehist}
+
+DESCRIPTION
+-----------
+There are several variants of 'perf sched':
+
+  'perf sched record <command>' to record the scheduling events
+  of an arbitrary workload.
+
+  'perf sched latency' to report the per task scheduling latencies
+  and other scheduling properties of the workload.
+
+  'perf sched script' to see a detailed trace of the workload that
+   was recorded (aliased to 'perf script' for now).
+
+  'perf sched replay' to simulate the workload that was recorded
+  via perf sched record. (this is done by starting up mockup threads
+  that mimic the workload based on the events in the trace. These
+  threads can then replay the timings (CPU runtime and sleep patterns)
+  of the workload as it occurred when it was recorded - and can repeat
+  it a number of times, measuring its performance.)
+
+  'perf sched map' to print a textual context-switching outline of
+  workload captured via perf sched record.  Columns stand for
+  individual CPUs, and the two-letter shortcuts stand for tasks that
+  are running on a CPU. A '*' denotes the CPU that had the event, and
+  a dot signals an idle CPU.
+
+  'perf sched timehist' provides an analysis of scheduling events.
+    
+    Example usage:
+        perf sched record -- sleep 1
+        perf sched timehist
+    
+   By default it shows the individual schedule events, including the wait
+   time (time between sched-out and next sched-in events for the task), the
+   task scheduling delay (time between wakeup and actually running) and run
+   time for the task:
+    
+                time    cpu  task name             wait time  sch delay   run time
+                             [tid/pid]                (msec)     (msec)     (msec)
+      -------------- ------  --------------------  ---------  ---------  ---------
+        79371.874569 [0011]  gcc[31949]                0.014      0.000      1.148
+        79371.874591 [0010]  gcc[31951]                0.000      0.000      0.024
+        79371.874603 [0010]  migration/10[59]          3.350      0.004      0.011
+        79371.874604 [0011]  <idle>                    1.148      0.000      0.035
+        79371.874723 [0005]  <idle>                    0.016      0.000      1.383
+        79371.874746 [0005]  gcc[31949]                0.153      0.078      0.022
+    ...
+    
+   Times are in msec.usec.
+
+OPTIONS
+-------
+-i::
+--input=<file>::
+        Input file name. (default: perf.data unless stdin is a fifo)
+
+-v::
+--verbose::
+        Be more verbose. (show symbol address, etc)
+
+-D::
+--dump-raw-trace=::
+        Display verbose dump of the sched data.
+
+-f::
+--force::
+	Don't complain, do it.
+
+OPTIONS for 'perf sched map'
+----------------------------
+
+--compact::
+	Show only CPUs with activity. Helps visualizing on high core
+	count systems.
+
+--cpus::
+	Show just entries with activities for the given CPUs.
+
+--color-cpus::
+	Highlight the given cpus.
+
+--color-pids::
+	Highlight the given pids.
+
+OPTIONS for 'perf sched timehist'
+---------------------------------
+-k::
+--vmlinux=<file>::
+    vmlinux pathname
+
+--kallsyms=<file>::
+    kallsyms pathname
+
+-g::
+--call-graph::
+	Display call chains if present (default on).
+
+--max-stack::
+	Maximum number of functions to display in backtrace, default 5.
+
+-C=::
+--cpu=::
+	Only show events for the given CPU(s) (comma separated list).
+
+-p=::
+--pid=::
+	Only show events for given process ID (comma separated list).
+
+-t=::
+--tid=::
+	Only show events for given thread ID (comma separated list).
+
+-s::
+--summary::
+    Show only a summary of scheduling by thread with min, max, and average
+    run times (in sec) and relative stddev.
+
+-S::
+--with-summary::
+    Show all scheduling events followed by a summary by thread with min,
+    max, and average run times (in sec) and relative stddev.
+
+--symfs=<directory>::
+    Look for files with symbols relative to this directory.
+
+-V::
+--cpu-visual::
+	Show visual aid for sched switches by CPU: 'i' marks idle time,
+	's' are scheduler events.
+
+-w::
+--wakeups::
+	Show wakeup events.
+
+-M::
+--migrations::
+	Show migration events.
+
+-n::
+--next::
+	Show next task.
+
+-I::
+--idle-hist::
+	Show idle-related events only.
+
+--time::
+	Only analyze samples within given time window: <start>,<stop>. Times
+	have the format seconds.microseconds. If start is not given (i.e., time
+	string is ',x.y') then analysis starts at the beginning of the file. If
+	stop time is not given (i.e, time string is 'x.y,') then analysis goes
+	to end of file.
+
+--state::
+	Show task state when it switched out.
+
+SEE ALSO
+--------
+linkperf:perf-record[1]
diff --git a/tools/perf/Documentation/perf-script-perl.txt b/tools/perf/Documentation/perf-script-perl.txt
new file mode 100644
index 000000000..fa4f39d30
--- /dev/null
+++ b/tools/perf/Documentation/perf-script-perl.txt
@@ -0,0 +1,216 @@
+perf-script-perl(1)
+===================
+
+NAME
+----
+perf-script-perl - Process trace data with a Perl script
+
+SYNOPSIS
+--------
+[verse]
+'perf script' [-s [Perl]:script[.pl] ]
+
+DESCRIPTION
+-----------
+
+This perf script option is used to process perf script data using perf's
+built-in Perl interpreter.  It reads and processes the input file and
+displays the results of the trace analysis implemented in the given
+Perl script, if any.
+
+STARTER SCRIPTS
+---------------
+
+You can avoid reading the rest of this document by running 'perf script
+-g perl' in the same directory as an existing perf.data trace file.
+That will generate a starter script containing a handler for each of
+the event types in the trace file; it simply prints every available
+field for each event in the trace file.
+
+You can also look at the existing scripts in
+~/libexec/perf-core/scripts/perl for typical examples showing how to
+do basic things like aggregate event data, print results, etc.  Also,
+the check-perf-script.pl script, while not interesting for its results,
+attempts to exercise all of the main scripting features.
+
+EVENT HANDLERS
+--------------
+
+When perf script is invoked using a trace script, a user-defined
+'handler function' is called for each event in the trace.  If there's
+no handler function defined for a given event type, the event is
+ignored (or passed to a 'trace_unhandled' function, see below) and the
+next event is processed.
+
+Most of the event's field values are passed as arguments to the
+handler function; some of the less common ones aren't - those are
+available as calls back into the perf executable (see below).
+
+As an example, the following perf record command can be used to record
+all sched_wakeup events in the system:
+
+ # perf record -a -e sched:sched_wakeup
+
+Traces meant to be processed using a script should be recorded with
+the above option: -a to enable system-wide collection.
+
+The format file for the sched_wakeup event defines the following fields
+(see /sys/kernel/debug/tracing/events/sched/sched_wakeup/format):
+
+----
+ format:
+        field:unsigned short common_type;
+        field:unsigned char common_flags;
+        field:unsigned char common_preempt_count;
+        field:int common_pid;
+
+        field:char comm[TASK_COMM_LEN];
+        field:pid_t pid;
+        field:int prio;
+        field:int success;
+        field:int target_cpu;
+----
+
+The handler function for this event would be defined as:
+
+----
+sub sched::sched_wakeup
+{
+   my ($event_name, $context, $common_cpu, $common_secs,
+       $common_nsecs, $common_pid, $common_comm,
+       $comm, $pid, $prio, $success, $target_cpu) = @_;
+}
+----
+
+The handler function takes the form subsystem::event_name.
+
+The $common_* arguments in the handler's argument list are the set of
+arguments passed to all event handlers; some of the fields correspond
+to the common_* fields in the format file, but some are synthesized,
+and some of the common_* fields aren't common enough to to be passed
+to every event as arguments but are available as library functions.
+
+Here's a brief description of each of the invariant event args:
+
+ $event_name 	  	    the name of the event as text
+ $context		    an opaque 'cookie' used in calls back into perf
+ $common_cpu		    the cpu the event occurred on
+ $common_secs		    the secs portion of the event timestamp
+ $common_nsecs		    the nsecs portion of the event timestamp
+ $common_pid		    the pid of the current task
+ $common_comm		    the name of the current process
+
+All of the remaining fields in the event's format file have
+counterparts as handler function arguments of the same name, as can be
+seen in the example above.
+
+The above provides the basics needed to directly access every field of
+every event in a trace, which covers 90% of what you need to know to
+write a useful trace script.  The sections below cover the rest.
+
+SCRIPT LAYOUT
+-------------
+
+Every perf script Perl script should start by setting up a Perl module
+search path and 'use'ing a few support modules (see module
+descriptions below):
+
+----
+ use lib "$ENV{'PERF_EXEC_PATH'}/scripts/perl/Perf-Trace-Util/lib";
+ use lib "./Perf-Trace-Util/lib";
+ use Perf::Trace::Core;
+ use Perf::Trace::Context;
+ use Perf::Trace::Util;
+----
+
+The rest of the script can contain handler functions and support
+functions in any order.
+
+Aside from the event handler functions discussed above, every script
+can implement a set of optional functions:
+
+*trace_begin*, if defined, is called before any event is processed and
+gives scripts a chance to do setup tasks:
+
+----
+ sub trace_begin
+ {
+ }
+----
+
+*trace_end*, if defined, is called after all events have been
+ processed and gives scripts a chance to do end-of-script tasks, such
+ as display results:
+
+----
+sub trace_end
+{
+}
+----
+
+*trace_unhandled*, if defined, is called after for any event that
+ doesn't have a handler explicitly defined for it.  The standard set
+ of common arguments are passed into it:
+
+----
+sub trace_unhandled
+{
+    my ($event_name, $context, $common_cpu, $common_secs,
+        $common_nsecs, $common_pid, $common_comm) = @_;
+}
+----
+
+The remaining sections provide descriptions of each of the available
+built-in perf script Perl modules and their associated functions.
+
+AVAILABLE MODULES AND FUNCTIONS
+-------------------------------
+
+The following sections describe the functions and variables available
+via the various Perf::Trace::* Perl modules.  To use the functions and
+variables from the given module, add the corresponding 'use
+Perf::Trace::XXX' line to your perf script script.
+
+Perf::Trace::Core Module
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+These functions provide some essential functions to user scripts.
+
+The *flag_str* and *symbol_str* functions provide human-readable
+strings for flag and symbolic fields.  These correspond to the strings
+and values parsed from the 'print fmt' fields of the event format
+files:
+
+  flag_str($event_name, $field_name, $field_value) - returns the string representation corresponding to $field_value for the flag field $field_name of event $event_name
+  symbol_str($event_name, $field_name, $field_value) - returns the string representation corresponding to $field_value for the symbolic field $field_name of event $event_name
+
+Perf::Trace::Context Module
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some of the 'common' fields in the event format file aren't all that
+common, but need to be made accessible to user scripts nonetheless.
+
+Perf::Trace::Context defines a set of functions that can be used to
+access this data in the context of the current event.  Each of these
+functions expects a $context variable, which is the same as the
+$context variable passed into every event handler as the second
+argument.
+
+ common_pc($context) - returns common_preempt count for the current event
+ common_flags($context) - returns common_flags for the current event
+ common_lock_depth($context) - returns common_lock_depth for the current event
+
+Perf::Trace::Util Module
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Various utility functions for use with perf script:
+
+  nsecs($secs, $nsecs) - returns total nsecs given secs/nsecs pair
+  nsecs_secs($nsecs) - returns whole secs portion given nsecs
+  nsecs_nsecs($nsecs) - returns nsecs remainder given nsecs
+  nsecs_str($nsecs) - returns printable string in the form secs.nsecs
+  avg($total, $n) - returns average given a sum and a total number of values
+
+SEE ALSO
+--------
+linkperf:perf-script[1]
diff --git a/tools/perf/Documentation/perf-script-python.txt b/tools/perf/Documentation/perf-script-python.txt
new file mode 100644
index 000000000..cf4b7f4b6
--- /dev/null
+++ b/tools/perf/Documentation/perf-script-python.txt
@@ -0,0 +1,679 @@
+perf-script-python(1)
+====================
+
+NAME
+----
+perf-script-python - Process trace data with a Python script
+
+SYNOPSIS
+--------
+[verse]
+'perf script' [-s [Python]:script[.py] ]
+
+DESCRIPTION
+-----------
+
+This perf script option is used to process perf script data using perf's
+built-in Python interpreter.  It reads and processes the input file and
+displays the results of the trace analysis implemented in the given
+Python script, if any.
+
+A QUICK EXAMPLE
+---------------
+
+This section shows the process, start to finish, of creating a working
+Python script that aggregates and extracts useful information from a
+raw perf script stream.  You can avoid reading the rest of this
+document if an example is enough for you; the rest of the document
+provides more details on each step and lists the library functions
+available to script writers.
+
+This example actually details the steps that were used to create the
+'syscall-counts' script you see when you list the available perf script
+scripts via 'perf script -l'.  As such, this script also shows how to
+integrate your script into the list of general-purpose 'perf script'
+scripts listed by that command.
+
+The syscall-counts script is a simple script, but demonstrates all the
+basic ideas necessary to create a useful script.  Here's an example
+of its output (syscall names are not yet supported, they will appear
+as numbers):
+
+----
+syscall events:
+
+event                                          count
+----------------------------------------  -----------
+sys_write                                     455067
+sys_getdents                                    4072
+sys_close                                       3037
+sys_swapoff                                     1769
+sys_read                                         923
+sys_sched_setparam                               826
+sys_open                                         331
+sys_newfstat                                     326
+sys_mmap                                         217
+sys_munmap                                       216
+sys_futex                                        141
+sys_select                                       102
+sys_poll                                          84
+sys_setitimer                                     12
+sys_writev                                         8
+15                                                 8
+sys_lseek                                          7
+sys_rt_sigprocmask                                 6
+sys_wait4                                          3
+sys_ioctl                                          3
+sys_set_robust_list                                1
+sys_exit                                           1
+56                                                 1
+sys_access                                         1
+----
+
+Basically our task is to keep a per-syscall tally that gets updated
+every time a system call occurs in the system.  Our script will do
+that, but first we need to record the data that will be processed by
+that script.  Theoretically, there are a couple of ways we could do
+that:
+
+- we could enable every event under the tracing/events/syscalls
+  directory, but this is over 600 syscalls, well beyond the number
+  allowable by perf.  These individual syscall events will however be
+  useful if we want to later use the guidance we get from the
+  general-purpose scripts to drill down and get more detail about
+  individual syscalls of interest.
+
+- we can enable the sys_enter and/or sys_exit syscalls found under
+  tracing/events/raw_syscalls.  These are called for all syscalls; the
+  'id' field can be used to distinguish between individual syscall
+  numbers.
+
+For this script, we only need to know that a syscall was entered; we
+don't care how it exited, so we'll use 'perf record' to record only
+the sys_enter events:
+
+----
+# perf record -a -e raw_syscalls:sys_enter
+
+^C[ perf record: Woken up 1 times to write data ]
+[ perf record: Captured and wrote 56.545 MB perf.data (~2470503 samples) ]
+----
+
+The options basically say to collect data for every syscall event
+system-wide and multiplex the per-cpu output into a single stream.
+That single stream will be recorded in a file in the current directory
+called perf.data.
+
+Once we have a perf.data file containing our data, we can use the -g
+'perf script' option to generate a Python script that will contain a
+callback handler for each event type found in the perf.data trace
+stream (for more details, see the STARTER SCRIPTS section).
+
+----
+# perf script -g python
+generated Python script: perf-script.py
+
+The output file created also in the current directory is named
+perf-script.py.  Here's the file in its entirety:
+
+# perf script event handlers, generated by perf script -g python
+# Licensed under the terms of the GNU GPL License version 2
+
+# The common_* event handler fields are the most useful fields common to
+# all events.  They don't necessarily correspond to the 'common_*' fields
+# in the format files.  Those fields not available as handler params can
+# be retrieved using Python functions of the form common_*(context).
+# See the perf-script-python Documentation for the list of available functions.
+
+import os
+import sys
+
+sys.path.append(os.environ['PERF_EXEC_PATH'] + \
+	'/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
+
+from perf_trace_context import *
+from Core import *
+
+def trace_begin():
+	print "in trace_begin"
+
+def trace_end():
+	print "in trace_end"
+
+def raw_syscalls__sys_enter(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	id, args):
+		print_header(event_name, common_cpu, common_secs, common_nsecs,
+			common_pid, common_comm)
+
+		print "id=%d, args=%s\n" % \
+		(id, args),
+
+def trace_unhandled(event_name, context, event_fields_dict):
+		print ' '.join(['%s=%s'%(k,str(v))for k,v in sorted(event_fields_dict.items())])
+
+def print_header(event_name, cpu, secs, nsecs, pid, comm):
+	print "%-20s %5u %05u.%09u %8u %-20s " % \
+	(event_name, cpu, secs, nsecs, pid, comm),
+----
+
+At the top is a comment block followed by some import statements and a
+path append which every perf script script should include.
+
+Following that are a couple generated functions, trace_begin() and
+trace_end(), which are called at the beginning and the end of the
+script respectively (for more details, see the SCRIPT_LAYOUT section
+below).
+
+Following those are the 'event handler' functions generated one for
+every event in the 'perf record' output.  The handler functions take
+the form subsystem\__event_name, and contain named parameters, one for
+each field in the event; in this case, there's only one event,
+raw_syscalls__sys_enter().  (see the EVENT HANDLERS section below for
+more info on event handlers).
+
+The final couple of functions are, like the begin and end functions,
+generated for every script.  The first, trace_unhandled(), is called
+every time the script finds an event in the perf.data file that
+doesn't correspond to any event handler in the script.  This could
+mean either that the record step recorded event types that it wasn't
+really interested in, or the script was run against a trace file that
+doesn't correspond to the script.
+
+The script generated by -g option simply prints a line for each
+event found in the trace stream i.e. it basically just dumps the event
+and its parameter values to stdout.  The print_header() function is
+simply a utility function used for that purpose.  Let's rename the
+script and run it to see the default output:
+
+----
+# mv perf-script.py syscall-counts.py
+# perf script -s syscall-counts.py
+
+raw_syscalls__sys_enter     1 00840.847582083     7506 perf                  id=1, args=
+raw_syscalls__sys_enter     1 00840.847595764     7506 perf                  id=1, args=
+raw_syscalls__sys_enter     1 00840.847620860     7506 perf                  id=1, args=
+raw_syscalls__sys_enter     1 00840.847710478     6533 npviewer.bin          id=78, args=
+raw_syscalls__sys_enter     1 00840.847719204     6533 npviewer.bin          id=142, args=
+raw_syscalls__sys_enter     1 00840.847755445     6533 npviewer.bin          id=3, args=
+raw_syscalls__sys_enter     1 00840.847775601     6533 npviewer.bin          id=3, args=
+raw_syscalls__sys_enter     1 00840.847781820     6533 npviewer.bin          id=3, args=
+.
+.
+.
+----
+
+Of course, for this script, we're not interested in printing every
+trace event, but rather aggregating it in a useful way.  So we'll get
+rid of everything to do with printing as well as the trace_begin() and
+trace_unhandled() functions, which we won't be using.  That leaves us
+with this minimalistic skeleton:
+
+----
+import os
+import sys
+
+sys.path.append(os.environ['PERF_EXEC_PATH'] + \
+	'/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
+
+from perf_trace_context import *
+from Core import *
+
+def trace_end():
+	print "in trace_end"
+
+def raw_syscalls__sys_enter(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	id, args):
+----
+
+In trace_end(), we'll simply print the results, but first we need to
+generate some results to print.  To do that we need to have our
+sys_enter() handler do the necessary tallying until all events have
+been counted.  A hash table indexed by syscall id is a good way to
+store that information; every time the sys_enter() handler is called,
+we simply increment a count associated with that hash entry indexed by
+that syscall id:
+
+----
+  syscalls = autodict()
+
+  try:
+    syscalls[id] += 1
+  except TypeError:
+    syscalls[id] = 1
+----
+
+The syscalls 'autodict' object is a special kind of Python dictionary
+(implemented in Core.py) that implements Perl's 'autovivifying' hashes
+in Python i.e. with autovivifying hashes, you can assign nested hash
+values without having to go to the trouble of creating intermediate
+levels if they don't exist e.g syscalls[comm][pid][id] = 1 will create
+the intermediate hash levels and finally assign the value 1 to the
+hash entry for 'id' (because the value being assigned isn't a hash
+object itself, the initial value is assigned in the TypeError
+exception.  Well, there may be a better way to do this in Python but
+that's what works for now).
+
+Putting that code into the raw_syscalls__sys_enter() handler, we
+effectively end up with a single-level dictionary keyed on syscall id
+and having the counts we've tallied as values.
+
+The print_syscall_totals() function iterates over the entries in the
+dictionary and displays a line for each entry containing the syscall
+name (the dictionary keys contain the syscall ids, which are passed to
+the Util function syscall_name(), which translates the raw syscall
+numbers to the corresponding syscall name strings).  The output is
+displayed after all the events in the trace have been processed, by
+calling the print_syscall_totals() function from the trace_end()
+handler called at the end of script processing.
+
+The final script producing the output shown above is shown in its
+entirety below (syscall_name() helper is not yet available, you can
+only deal with id's for now):
+
+----
+import os
+import sys
+
+sys.path.append(os.environ['PERF_EXEC_PATH'] + \
+	'/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
+
+from perf_trace_context import *
+from Core import *
+from Util import *
+
+syscalls = autodict()
+
+def trace_end():
+	print_syscall_totals()
+
+def raw_syscalls__sys_enter(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	id, args):
+	try:
+		syscalls[id] += 1
+	except TypeError:
+		syscalls[id] = 1
+
+def print_syscall_totals():
+    if for_comm is not None:
+	    print "\nsyscall events for %s:\n\n" % (for_comm),
+    else:
+	    print "\nsyscall events:\n\n",
+
+    print "%-40s  %10s\n" % ("event", "count"),
+    print "%-40s  %10s\n" % ("----------------------------------------", \
+                                 "-----------"),
+
+    for id, val in sorted(syscalls.iteritems(), key = lambda(k, v): (v, k), \
+				  reverse = True):
+	    print "%-40s  %10d\n" % (syscall_name(id), val),
+----
+
+The script can be run just as before:
+
+  # perf script -s syscall-counts.py
+
+So those are the essential steps in writing and running a script.  The
+process can be generalized to any tracepoint or set of tracepoints
+you're interested in - basically find the tracepoint(s) you're
+interested in by looking at the list of available events shown by
+'perf list' and/or look in /sys/kernel/debug/tracing/events/ for
+detailed event and field info, record the corresponding trace data
+using 'perf record', passing it the list of interesting events,
+generate a skeleton script using 'perf script -g python' and modify the
+code to aggregate and display it for your particular needs.
+
+After you've done that you may end up with a general-purpose script
+that you want to keep around and have available for future use.  By
+writing a couple of very simple shell scripts and putting them in the
+right place, you can have your script listed alongside the other
+scripts listed by the 'perf script -l' command e.g.:
+
+----
+# perf script -l
+List of available trace scripts:
+  wakeup-latency                       system-wide min/max/avg wakeup latency
+  rw-by-file <comm>                    r/w activity for a program, by file
+  rw-by-pid                            system-wide r/w activity
+----
+
+A nice side effect of doing this is that you also then capture the
+probably lengthy 'perf record' command needed to record the events for
+the script.
+
+To have the script appear as a 'built-in' script, you write two simple
+scripts, one for recording and one for 'reporting'.
+
+The 'record' script is a shell script with the same base name as your
+script, but with -record appended.  The shell script should be put
+into the perf/scripts/python/bin directory in the kernel source tree.
+In that script, you write the 'perf record' command-line needed for
+your script:
+
+----
+# cat kernel-source/tools/perf/scripts/python/bin/syscall-counts-record
+
+#!/bin/bash
+perf record -a -e raw_syscalls:sys_enter
+----
+
+The 'report' script is also a shell script with the same base name as
+your script, but with -report appended.  It should also be located in
+the perf/scripts/python/bin directory.  In that script, you write the
+'perf script -s' command-line needed for running your script:
+
+----
+# cat kernel-source/tools/perf/scripts/python/bin/syscall-counts-report
+
+#!/bin/bash
+# description: system-wide syscall counts
+perf script -s ~/libexec/perf-core/scripts/python/syscall-counts.py
+----
+
+Note that the location of the Python script given in the shell script
+is in the libexec/perf-core/scripts/python directory - this is where
+the script will be copied by 'make install' when you install perf.
+For the installation to install your script there, your script needs
+to be located in the perf/scripts/python directory in the kernel
+source tree:
+
+----
+# ls -al kernel-source/tools/perf/scripts/python
+total 32
+drwxr-xr-x 4 trz trz 4096 2010-01-26 22:30 .
+drwxr-xr-x 4 trz trz 4096 2010-01-26 22:29 ..
+drwxr-xr-x 2 trz trz 4096 2010-01-26 22:29 bin
+-rw-r--r-- 1 trz trz 2548 2010-01-26 22:29 check-perf-script.py
+drwxr-xr-x 3 trz trz 4096 2010-01-26 22:49 Perf-Trace-Util
+-rw-r--r-- 1 trz trz 1462 2010-01-26 22:30 syscall-counts.py
+----
+
+Once you've done that (don't forget to do a new 'make install',
+otherwise your script won't show up at run-time), 'perf script -l'
+should show a new entry for your script:
+
+----
+# perf script -l
+List of available trace scripts:
+  wakeup-latency                       system-wide min/max/avg wakeup latency
+  rw-by-file <comm>                    r/w activity for a program, by file
+  rw-by-pid                            system-wide r/w activity
+  syscall-counts                       system-wide syscall counts
+----
+
+You can now perform the record step via 'perf script record':
+
+  # perf script record syscall-counts
+
+and display the output using 'perf script report':
+
+  # perf script report syscall-counts
+
+STARTER SCRIPTS
+---------------
+
+You can quickly get started writing a script for a particular set of
+trace data by generating a skeleton script using 'perf script -g
+python' in the same directory as an existing perf.data trace file.
+That will generate a starter script containing a handler for each of
+the event types in the trace file; it simply prints every available
+field for each event in the trace file.
+
+You can also look at the existing scripts in
+~/libexec/perf-core/scripts/python for typical examples showing how to
+do basic things like aggregate event data, print results, etc.  Also,
+the check-perf-script.py script, while not interesting for its results,
+attempts to exercise all of the main scripting features.
+
+EVENT HANDLERS
+--------------
+
+When perf script is invoked using a trace script, a user-defined
+'handler function' is called for each event in the trace.  If there's
+no handler function defined for a given event type, the event is
+ignored (or passed to a 'trace_unhandled' function, see below) and the
+next event is processed.
+
+Most of the event's field values are passed as arguments to the
+handler function; some of the less common ones aren't - those are
+available as calls back into the perf executable (see below).
+
+As an example, the following perf record command can be used to record
+all sched_wakeup events in the system:
+
+ # perf record -a -e sched:sched_wakeup
+
+Traces meant to be processed using a script should be recorded with
+the above option: -a to enable system-wide collection.
+
+The format file for the sched_wakeup event defines the following fields
+(see /sys/kernel/debug/tracing/events/sched/sched_wakeup/format):
+
+----
+ format:
+        field:unsigned short common_type;
+        field:unsigned char common_flags;
+        field:unsigned char common_preempt_count;
+        field:int common_pid;
+
+        field:char comm[TASK_COMM_LEN];
+        field:pid_t pid;
+        field:int prio;
+        field:int success;
+        field:int target_cpu;
+----
+
+The handler function for this event would be defined as:
+
+----
+def sched__sched_wakeup(event_name, context, common_cpu, common_secs,
+       common_nsecs, common_pid, common_comm,
+       comm, pid, prio, success, target_cpu):
+       pass
+----
+
+The handler function takes the form subsystem__event_name.
+
+The common_* arguments in the handler's argument list are the set of
+arguments passed to all event handlers; some of the fields correspond
+to the common_* fields in the format file, but some are synthesized,
+and some of the common_* fields aren't common enough to to be passed
+to every event as arguments but are available as library functions.
+
+Here's a brief description of each of the invariant event args:
+
+ event_name 	  	    the name of the event as text
+ context		    an opaque 'cookie' used in calls back into perf
+ common_cpu		    the cpu the event occurred on
+ common_secs		    the secs portion of the event timestamp
+ common_nsecs		    the nsecs portion of the event timestamp
+ common_pid		    the pid of the current task
+ common_comm		    the name of the current process
+
+All of the remaining fields in the event's format file have
+counterparts as handler function arguments of the same name, as can be
+seen in the example above.
+
+The above provides the basics needed to directly access every field of
+every event in a trace, which covers 90% of what you need to know to
+write a useful trace script.  The sections below cover the rest.
+
+SCRIPT LAYOUT
+-------------
+
+Every perf script Python script should start by setting up a Python
+module search path and 'import'ing a few support modules (see module
+descriptions below):
+
+----
+ import os
+ import sys
+
+ sys.path.append(os.environ['PERF_EXEC_PATH'] + \
+	      '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
+
+ from perf_trace_context import *
+ from Core import *
+----
+
+The rest of the script can contain handler functions and support
+functions in any order.
+
+Aside from the event handler functions discussed above, every script
+can implement a set of optional functions:
+
+*trace_begin*, if defined, is called before any event is processed and
+gives scripts a chance to do setup tasks:
+
+----
+def trace_begin():
+    pass
+----
+
+*trace_end*, if defined, is called after all events have been
+ processed and gives scripts a chance to do end-of-script tasks, such
+ as display results:
+
+----
+def trace_end():
+    pass
+----
+
+*trace_unhandled*, if defined, is called after for any event that
+ doesn't have a handler explicitly defined for it.  The standard set
+ of common arguments are passed into it:
+
+----
+def trace_unhandled(event_name, context, event_fields_dict):
+    pass
+----
+
+*process_event*, if defined, is called for any non-tracepoint event
+
+----
+def process_event(param_dict):
+    pass
+----
+
+*context_switch*, if defined, is called for any context switch
+
+----
+def context_switch(ts, cpu, pid, tid, np_pid, np_tid, machine_pid, out, out_preempt, *x):
+    pass
+----
+
+*auxtrace_error*, if defined, is called for any AUX area tracing error
+
+----
+def auxtrace_error(typ, code, cpu, pid, tid, ip, ts, msg, cpumode, *x):
+    pass
+----
+
+The remaining sections provide descriptions of each of the available
+built-in perf script Python modules and their associated functions.
+
+AVAILABLE MODULES AND FUNCTIONS
+-------------------------------
+
+The following sections describe the functions and variables available
+via the various perf script Python modules.  To use the functions and
+variables from the given module, add the corresponding 'from XXXX
+import' line to your perf script script.
+
+Core.py Module
+~~~~~~~~~~~~~~
+
+These functions provide some essential functions to user scripts.
+
+The *flag_str* and *symbol_str* functions provide human-readable
+strings for flag and symbolic fields.  These correspond to the strings
+and values parsed from the 'print fmt' fields of the event format
+files:
+
+  flag_str(event_name, field_name, field_value) - returns the string representation corresponding to field_value for the flag field field_name of event event_name
+  symbol_str(event_name, field_name, field_value) - returns the string representation corresponding to field_value for the symbolic field field_name of event event_name
+
+The *autodict* function returns a special kind of Python
+dictionary that implements Perl's 'autovivifying' hashes in Python
+i.e. with autovivifying hashes, you can assign nested hash values
+without having to go to the trouble of creating intermediate levels if
+they don't exist.
+
+  autodict() - returns an autovivifying dictionary instance
+
+
+perf_trace_context Module
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some of the 'common' fields in the event format file aren't all that
+common, but need to be made accessible to user scripts nonetheless.
+
+perf_trace_context defines a set of functions that can be used to
+access this data in the context of the current event.  Each of these
+functions expects a context variable, which is the same as the
+context variable passed into every tracepoint event handler as the second
+argument. For non-tracepoint events, the context variable is also present
+as perf_trace_context.perf_script_context .
+
+ common_pc(context) - returns common_preempt count for the current event
+ common_flags(context) - returns common_flags for the current event
+ common_lock_depth(context) - returns common_lock_depth for the current event
+ perf_sample_insn(context) - returns the machine code instruction
+ perf_set_itrace_options(context, itrace_options) - set --itrace options if they have not been set already
+ perf_sample_srcline(context) - returns source_file_name, line_number
+ perf_sample_srccode(context) - returns source_file_name, line_number, source_line
+
+
+Util.py Module
+~~~~~~~~~~~~~~
+
+Various utility functions for use with perf script:
+
+  nsecs(secs, nsecs) - returns total nsecs given secs/nsecs pair
+  nsecs_secs(nsecs) - returns whole secs portion given nsecs
+  nsecs_nsecs(nsecs) - returns nsecs remainder given nsecs
+  nsecs_str(nsecs) - returns printable string in the form secs.nsecs
+  avg(total, n) - returns average given a sum and a total number of values
+
+SUPPORTED FIELDS
+----------------
+
+Currently supported fields:
+
+ev_name, comm, pid, tid, cpu, ip, time, period, phys_addr, addr,
+symbol, symoff, dso, time_enabled, time_running, values, callchain,
+brstack, brstacksym, datasrc, datasrc_decode, iregs, uregs,
+weight, transaction, raw_buf, attr, cpumode.
+
+Fields that may also be present:
+
+ flags - sample flags
+ flags_disp - sample flags display
+ insn_cnt - instruction count for determining instructions-per-cycle (IPC)
+ cyc_cnt - cycle count for determining IPC
+ addr_correlates_sym - addr can correlate to a symbol
+ addr_dso - addr dso
+ addr_symbol - addr symbol
+ addr_symoff - addr symbol offset
+
+Some fields have sub items:
+
+brstack:
+    from, to, from_dsoname, to_dsoname, mispred,
+    predicted, in_tx, abort, cycles.
+
+brstacksym:
+    items: from, to, pred, in_tx, abort (converted string)
+
+For example,
+We can use this code to print brstack "from", "to", "cycles".
+
+if 'brstack' in dict:
+	for entry in dict['brstack']:
+		print "from %s, to %s, cycles %s" % (entry["from"], entry["to"], entry["cycles"])
+
+SEE ALSO
+--------
+linkperf:perf-script[1]
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
new file mode 100644
index 000000000..68e37de5f
--- /dev/null
+++ b/tools/perf/Documentation/perf-script.txt
@@ -0,0 +1,518 @@
+perf-script(1)
+=============
+
+NAME
+----
+perf-script - Read perf.data (created by perf record) and display trace output
+
+SYNOPSIS
+--------
+[verse]
+'perf script' [<options>]
+'perf script' [<options>] record <script> [<record-options>] <command>
+'perf script' [<options>] report <script> [script-args]
+'perf script' [<options>] <script> <required-script-args> [<record-options>] <command>
+'perf script' [<options>] <top-script> [script-args]
+
+DESCRIPTION
+-----------
+This command reads the input file and displays the trace recorded.
+
+There are several variants of perf script:
+
+  'perf script' to see a detailed trace of the workload that was
+  recorded.
+
+  You can also run a set of pre-canned scripts that aggregate and
+  summarize the raw trace data in various ways (the list of scripts is
+  available via 'perf script -l').  The following variants allow you to
+  record and run those scripts:
+
+  'perf script record <script> <command>' to record the events required
+  for 'perf script report'.  <script> is the name displayed in the
+  output of 'perf script --list' i.e. the actual script name minus any
+  language extension.  If <command> is not specified, the events are
+  recorded using the -a (system-wide) 'perf record' option.
+
+  'perf script report <script> [args]' to run and display the results
+  of <script>.  <script> is the name displayed in the output of 'perf
+  script --list' i.e. the actual script name minus any language
+  extension.  The perf.data output from a previous run of 'perf script
+  record <script>' is used and should be present for this command to
+  succeed.  [args] refers to the (mainly optional) args expected by
+  the script.
+
+  'perf script <script> <required-script-args> <command>' to both
+  record the events required for <script> and to run the <script>
+  using 'live-mode' i.e. without writing anything to disk.  <script>
+  is the name displayed in the output of 'perf script --list' i.e. the
+  actual script name minus any language extension.  If <command> is
+  not specified, the events are recorded using the -a (system-wide)
+  'perf record' option.  If <script> has any required args, they
+  should be specified before <command>.  This mode doesn't allow for
+  optional script args to be specified; if optional script args are
+  desired, they can be specified using separate 'perf script record'
+  and 'perf script report' commands, with the stdout of the record step
+  piped to the stdin of the report script, using the '-o -' and '-i -'
+  options of the corresponding commands.
+
+  'perf script <top-script>' to both record the events required for
+  <top-script> and to run the <top-script> using 'live-mode'
+  i.e. without writing anything to disk.  <top-script> is the name
+  displayed in the output of 'perf script --list' i.e. the actual
+  script name minus any language extension; a <top-script> is defined
+  as any script name ending with the string 'top'.
+
+  [<record-options>] can be passed to the record steps of 'perf script
+  record' and 'live-mode' variants; this isn't possible however for
+  <top-script> 'live-mode' or 'perf script report' variants.
+
+  See the 'SEE ALSO' section for links to language-specific
+  information on how to write and run your own trace scripts.
+
+OPTIONS
+-------
+<command>...::
+	Any command you can specify in a shell.
+
+-D::
+--dump-raw-trace=::
+        Display verbose dump of the trace data.
+
+--dump-unsorted-raw-trace=::
+        Same as --dump-raw-trace but not sorted in time order.
+
+-L::
+--Latency=::
+        Show latency attributes (irqs/preemption disabled, etc).
+
+-l::
+--list=::
+        Display a list of available trace scripts.
+
+-s ['lang']::
+--script=::
+        Process trace data with the given script ([lang]:script[.ext]).
+	If the string 'lang' is specified in place of a script name, a
+        list of supported languages will be displayed instead.
+
+-g::
+--gen-script=::
+        Generate perf-script.[ext] starter script for given language,
+        using current perf.data.
+
+--dlfilter=<file>::
+	Filter sample events using the given shared object file.
+	Refer linkperf:perf-dlfilter[1]
+
+--dlarg=<arg>::
+	Pass 'arg' as an argument to the dlfilter. --dlarg may be repeated
+	to add more arguments.
+
+--list-dlfilters::
+        Display a list of available dlfilters. Use with option -v (must come
+        before option --list-dlfilters) to show long descriptions.
+
+-a::
+        Force system-wide collection.  Scripts run without a <command>
+        normally use -a by default, while scripts run with a <command>
+        normally don't - this option allows the latter to be run in
+        system-wide mode.
+
+-i::
+--input=::
+        Input file name. (default: perf.data unless stdin is a fifo)
+
+-d::
+--debug-mode::
+        Do various checks like samples ordering and lost events.
+
+-F::
+--fields::
+        Comma separated list of fields to print. Options are:
+        comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff,
+        srcline, period, iregs, uregs, brstack, brstacksym, flags, bpf-output,
+        brstackinsn, brstackinsnlen, brstackoff, callindent, insn, insnlen, synth,
+        phys_addr, metric, misc, srccode, ipc, data_page_size, code_page_size, ins_lat,
+        machine_pid, vcpu.
+        Field list can be prepended with the type, trace, sw or hw,
+        to indicate to which event type the field list applies.
+        e.g., -F sw:comm,tid,time,ip,sym  and -F trace:time,cpu,trace
+
+		perf script -F <fields>
+
+	is equivalent to:
+
+		perf script -F trace:<fields> -F sw:<fields> -F hw:<fields>
+
+	i.e., the specified fields apply to all event types if the type string
+	is not given.
+
+	In addition to overriding fields, it is also possible to add or remove
+	fields from the defaults. For example
+
+		-F -cpu,+insn
+
+	removes the cpu field and adds the insn field. Adding/removing fields
+	cannot be mixed with normal overriding.
+
+	The arguments are processed in the order received. A later usage can
+	reset a prior request. e.g.:
+
+		-F trace: -F comm,tid,time,ip,sym
+
+	The first -F suppresses trace events (field list is ""), but then the
+	second invocation sets the fields to comm,tid,time,ip,sym. In this case a
+	warning is given to the user:
+
+		"Overriding previous field request for all events."
+
+	Alternatively, consider the order:
+
+		-F comm,tid,time,ip,sym -F trace:
+
+	The first -F sets the fields for all events and the second -F
+	suppresses trace events. The user is given a warning message about
+	the override, and the result of the above is that only S/W and H/W
+	events are displayed with the given fields.
+
+	It's possible tp add/remove fields only for specific event type:
+
+		-Fsw:-cpu,-period
+
+	removes cpu and period from software events.
+
+	For the 'wildcard' option if a user selected field is invalid for an
+	event type, a message is displayed to the user that the option is
+	ignored for that type. For example:
+
+		$ perf script -F comm,tid,trace
+		'trace' not valid for hardware events. Ignoring.
+		'trace' not valid for software events. Ignoring.
+
+	Alternatively, if the type is given an invalid field is specified it
+	is an error. For example:
+
+        perf script -v -F sw:comm,tid,trace
+        'trace' not valid for software events.
+
+	At this point usage is displayed, and perf-script exits.
+
+	The flags field is synthesized and may have a value when Instruction
+	Trace decoding. The flags are "bcrosyiABExghDt" which stand for branch,
+	call, return, conditional, system, asynchronous, interrupt,
+	transaction abort, trace begin, trace end, in transaction, VM-Entry,
+	VM-Exit, interrupt disabled and interrupt disable toggle respectively.
+	Known combinations of flags are printed more nicely e.g.
+	"call" for "bc", "return" for "br", "jcc" for "bo", "jmp" for "b",
+	"int" for "bci", "iret" for "bri", "syscall" for "bcs", "sysret" for "brs",
+	"async" for "by", "hw int" for "bcyi", "tx abrt" for "bA", "tr strt" for "bB",
+	"tr end" for "bE", "vmentry" for "bcg", "vmexit" for "bch".
+	However the "x", "D" and "t" flags will be displayed separately in those
+	cases e.g. "jcc     (xD)" for a condition branch within a transaction
+	with interrupts disabled. Note, interrupts becoming disabled is "t",
+	whereas interrupts becoming enabled is "Dt".
+
+	The callindent field is synthesized and may have a value when
+	Instruction Trace decoding. For calls and returns, it will display the
+	name of the symbol indented with spaces to reflect the stack depth.
+
+	When doing instruction trace decoding insn and insnlen give the
+	instruction bytes and the instruction length of the current
+	instruction.
+
+	The synth field is used by synthesized events which may be created when
+	Instruction Trace decoding.
+
+	The ipc (instructions per cycle) field is synthesized and may have a value when
+	Instruction Trace decoding.
+
+	The machine_pid and vcpu fields are derived from data resulting from using
+	perf inject to insert a perf.data file recorded inside a virtual machine into
+	a perf.data file recorded on the host at the same time.
+
+	Finally, a user may not set fields to none for all event types.
+	i.e., -F "" is not allowed.
+
+	The brstack output includes branch related information with raw addresses using the
+	/v/v/v/v/cycles syntax in the following order:
+	FROM: branch source instruction
+	TO  : branch target instruction
+        M/P/-: M=branch target mispredicted or branch direction was mispredicted, P=target predicted or direction predicted, -=not supported
+	X/- : X=branch inside a transactional region, -=not in transaction region or not supported
+	A/- : A=TSX abort entry, -=not aborted region or not supported
+	cycles
+
+	The brstacksym is identical to brstack, except that the FROM and TO addresses are printed in a symbolic form if possible.
+
+	When brstackinsn is specified the full assembler sequences of branch sequences for each sample
+	is printed. This is the full execution path leading to the sample. This is only supported when the
+	sample was recorded with perf record -b or -j any.
+
+	Use brstackinsnlen to print the brstackinsn lenght. For example, you
+	can’t know the next sequential instruction after an unconditional branch unless
+	you calculate that based on its length.
+
+	The brstackoff field will print an offset into a specific dso/binary.
+
+	With the metric option perf script can compute metrics for
+	sampling periods, similar to perf stat. This requires
+	specifying a group with multiple events defining metrics with the :S option
+	for perf record. perf will sample on the first event, and
+	print computed metrics for all the events in the group. Please note
+	that the metric computed is averaged over the whole sampling
+	period (since the last sample), not just for the sample point.
+
+	For sample events it's possible to display misc field with -F +misc option,
+	following letters are displayed for each bit:
+
+	  PERF_RECORD_MISC_KERNEL               K
+	  PERF_RECORD_MISC_USER                 U
+	  PERF_RECORD_MISC_HYPERVISOR           H
+	  PERF_RECORD_MISC_GUEST_KERNEL         G
+	  PERF_RECORD_MISC_GUEST_USER           g
+	  PERF_RECORD_MISC_MMAP_DATA*           M
+	  PERF_RECORD_MISC_COMM_EXEC            E
+	  PERF_RECORD_MISC_SWITCH_OUT           S
+	  PERF_RECORD_MISC_SWITCH_OUT_PREEMPT   Sp
+
+	  $ perf script -F +misc ...
+	   sched-messaging  1414 K     28690.636582:       4590 cycles ...
+	   sched-messaging  1407 U     28690.636600:     325620 cycles ...
+	   sched-messaging  1414 K     28690.636608:      19473 cycles ...
+	  misc field ___________/
+
+-k::
+--vmlinux=<file>::
+        vmlinux pathname
+
+--kallsyms=<file>::
+        kallsyms pathname
+
+--symfs=<directory>::
+        Look for files with symbols relative to this directory.
+
+-G::
+--hide-call-graph::
+        When printing symbols do not display call chain.
+
+--stop-bt::
+        Stop display of callgraph at these symbols
+
+-C::
+--cpu:: Only report samples for the list of CPUs provided. Multiple CPUs can
+	be provided as a comma-separated list with no space: 0,1. Ranges of
+	CPUs are specified with -: 0-2. Default is to report samples on all
+	CPUs.
+
+-c::
+--comms=::
+	Only display events for these comms. CSV that understands
+	file://filename entries.
+
+--pid=::
+	Only show events for given process ID (comma separated list).
+
+--tid=::
+	Only show events for given thread ID (comma separated list).
+
+-I::
+--show-info::
+	Display extended information about the perf.data file. This adds
+	information which may be very large and thus may clutter the display.
+	It currently includes: cpu and numa topology of the host system.
+	It can only be used with the perf script report mode.
+
+--show-kernel-path::
+	Try to resolve the path of [kernel.kallsyms]
+
+--show-task-events
+	Display task related events (e.g. FORK, COMM, EXIT).
+
+--show-mmap-events
+	Display mmap related events (e.g. MMAP, MMAP2).
+
+--show-namespace-events
+	Display namespace events i.e. events of type PERF_RECORD_NAMESPACES.
+
+--show-switch-events
+	Display context switch events i.e. events of type PERF_RECORD_SWITCH or
+	PERF_RECORD_SWITCH_CPU_WIDE.
+
+--show-lost-events
+	Display lost events i.e. events of type PERF_RECORD_LOST.
+
+--show-round-events
+	Display finished round events i.e. events of type PERF_RECORD_FINISHED_ROUND.
+
+--show-bpf-events
+	Display bpf events i.e. events of type PERF_RECORD_KSYMBOL and PERF_RECORD_BPF_EVENT.
+
+--show-cgroup-events
+	Display cgroup events i.e. events of type PERF_RECORD_CGROUP.
+
+--show-text-poke-events
+	Display text poke events i.e. events of type PERF_RECORD_TEXT_POKE and
+	PERF_RECORD_KSYMBOL.
+
+--demangle::
+	Demangle symbol names to human readable form. It's enabled by default,
+	disable with --no-demangle.
+
+--demangle-kernel::
+	Demangle kernel symbol names to human readable form (for C++ kernels).
+
+--header
+	Show perf.data header.
+
+--header-only
+	Show only perf.data header.
+
+--itrace::
+	Options for decoding instruction tracing data. The options are:
+
+include::itrace.txt[]
+
+	To disable decoding entirely, use --no-itrace.
+
+--full-source-path::
+	Show the full path for source files for srcline output.
+
+--max-stack::
+        Set the stack depth limit when parsing the callchain, anything
+        beyond the specified depth will be ignored. This is a trade-off
+        between information loss and faster processing especially for
+        workloads that can have a very long callchain stack.
+        Note that when using the --itrace option the synthesized callchain size
+        will override this value if the synthesized callchain size is bigger.
+
+        Default: 127
+
+--ns::
+	Use 9 decimal places when displaying time (i.e. show the nanoseconds)
+
+-f::
+--force::
+	Don't do ownership validation.
+
+--time::
+	Only analyze samples within given time window: <start>,<stop>. Times
+	have the format seconds.nanoseconds. If start is not given (i.e. time
+	string is ',x.y') then analysis starts at the beginning of the file. If
+	stop time is not given (i.e. time string is 'x.y,') then analysis goes
+	to end of file. Multiple ranges can be separated by spaces, which
+	requires the argument to be quoted e.g. --time "1234.567,1234.789 1235,"
+
+	Also support time percent with multiple time ranges. Time string is
+	'a%/n,b%/m,...' or 'a%-b%,c%-%d,...'.
+
+	For example:
+	Select the second 10% time slice:
+	perf script --time 10%/2
+
+	Select from 0% to 10% time slice:
+	perf script --time 0%-10%
+
+	Select the first and second 10% time slices:
+	perf script --time 10%/1,10%/2
+
+	Select from 0% to 10% and 30% to 40% slices:
+	perf script --time 0%-10%,30%-40%
+
+--max-blocks::
+	Set the maximum number of program blocks to print with brstackinsn for
+	each sample.
+
+--reltime::
+	Print time stamps relative to trace start.
+
+--deltatime::
+	Print time stamps relative to previous event.
+
+--per-event-dump::
+	Create per event files with a "perf.data.EVENT.dump" name instead of
+        printing to stdout, useful, for instance, for generating flamegraphs.
+
+--inline::
+	If a callgraph address belongs to an inlined function, the inline stack
+	will be printed. Each entry has function name and file/line. Enabled by
+	default, disable with --no-inline.
+
+--insn-trace::
+	Show instruction stream for intel_pt traces. Combine with --xed to
+	show disassembly.
+
+--xed::
+	Run xed disassembler on output. Requires installing the xed disassembler.
+
+-S::
+--symbols=symbol[,symbol...]::
+	Only consider the listed symbols. Symbols are typically a name
+	but they may also be hexadecimal address.
+
+	The hexadecimal address may be the start address of a symbol or
+	any other address to filter the trace records
+
+	For example, to select the symbol noploop or the address 0x4007a0:
+	perf script --symbols=noploop,0x4007a0
+
+	Support filtering trace records by symbol name, start address of
+	symbol, any hexadecimal address and address range.
+
+	The comparison order is:
+
+	1. symbol name comparison
+	2. symbol start address comparison.
+	3. any hexadecimal address comparison.
+	4. address range comparison (see --addr-range).
+
+--addr-range::
+       Use with -S or --symbols to list traced records within address range.
+
+       For example, to list the traced records within the address range
+       [0x4007a0, 0x0x4007a9]:
+       perf script -S 0x4007a0 --addr-range 10
+
+--dsos=::
+	Only consider symbols in these DSOs.
+
+--call-trace::
+	Show call stream for intel_pt traces. The CPUs are interleaved, but
+	can be filtered with -C.
+
+--call-ret-trace::
+	Show call and return stream for intel_pt traces.
+
+--graph-function::
+	For itrace only show specified functions and their callees for
+	itrace. Multiple functions can be separated by comma.
+
+--switch-on EVENT_NAME::
+	Only consider events after this event is found.
+
+--switch-off EVENT_NAME::
+	Stop considering events after this event is found.
+
+--show-on-off-events::
+	Show the --switch-on/off events too.
+
+--stitch-lbr::
+	Show callgraph with stitched LBRs, which may have more complete
+	callgraph. The perf.data file must have been obtained using
+	perf record --call-graph lbr.
+	Disabled by default. In common cases with call stack overflows,
+	it can recreate better call stacks than the default lbr call stack
+	output. But this approach is not full proof. There can be cases
+	where it creates incorrect call stacks from incorrect matches.
+	The known limitations include exception handing such as
+	setjmp/longjmp will have calls/returns not match.
+
+:GMEXAMPLECMD: script
+:GMEXAMPLESUBCMD:
+include::guest-files.txt[]
+
+SEE ALSO
+--------
+linkperf:perf-record[1], linkperf:perf-script-perl[1],
+linkperf:perf-script-python[1], linkperf:perf-intel-pt[1],
+linkperf:perf-dlfilter[1]
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
new file mode 100644
index 000000000..18abdc1dc
--- /dev/null
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -0,0 +1,596 @@
+perf-stat(1)
+============
+
+NAME
+----
+perf-stat - Run a command and gather performance counter statistics
+
+SYNOPSIS
+--------
+[verse]
+'perf stat' [-e <EVENT> | --event=EVENT] [-a] <command>
+'perf stat' [-e <EVENT> | --event=EVENT] [-a] \-- <command> [<options>]
+'perf stat' [-e <EVENT> | --event=EVENT] [-a] record [-o file] \-- <command> [<options>]
+'perf stat' report [-i file]
+
+DESCRIPTION
+-----------
+This command runs a command and gathers performance counter statistics
+from it.
+
+
+OPTIONS
+-------
+<command>...::
+	Any command you can specify in a shell.
+
+record::
+	See STAT RECORD.
+
+report::
+	See STAT REPORT.
+
+-e::
+--event=::
+	Select the PMU event. Selection can be:
+
+	- a symbolic event name (use 'perf list' to list all events)
+
+	- a raw PMU event in the form of rN where N is a hexadecimal value
+	  that represents the raw register encoding with the layout of the
+	  event control registers as described by entries in
+	  /sys/bus/event_source/devices/cpu/format/*.
+
+        - a symbolic or raw PMU event followed by an optional colon
+	  and a list of event modifiers, e.g., cpu-cycles:p.  See the
+	  linkperf:perf-list[1] man page for details on event modifiers.
+
+	- a symbolically formed event like 'pmu/param1=0x3,param2/' where
+	  param1 and param2 are defined as formats for the PMU in
+	  /sys/bus/event_source/devices/<pmu>/format/*
+
+	  'percore' is a event qualifier that sums up the event counts for both
+	  hardware threads in a core. For example:
+	  perf stat -A -a -e cpu/event,percore=1/,otherevent ...
+
+	- a symbolically formed event like 'pmu/config=M,config1=N,config2=K/'
+	  where M, N, K are numbers (in decimal, hex, octal format).
+	  Acceptable values for each of 'config', 'config1' and 'config2'
+	  parameters are defined by corresponding entries in
+	  /sys/bus/event_source/devices/<pmu>/format/*
+
+	Note that the last two syntaxes support prefix and glob matching in
+	the PMU name to simplify creation of events across multiple instances
+	of the same type of PMU in large systems (e.g. memory controller PMUs).
+	Multiple PMU instances are typical for uncore PMUs, so the prefix
+	'uncore_' is also ignored when performing this match.
+
+
+-i::
+--no-inherit::
+        child tasks do not inherit counters
+-p::
+--pid=<pid>::
+        stat events on existing process id (comma separated list)
+
+-t::
+--tid=<tid>::
+        stat events on existing thread id (comma separated list)
+
+-b::
+--bpf-prog::
+        stat events on existing bpf program id (comma separated list),
+        requiring root rights. bpftool-prog could be used to find program
+        id all bpf programs in the system. For example:
+
+  # bpftool prog | head -n 1
+  17247: tracepoint  name sys_enter  tag 192d548b9d754067  gpl
+
+  # perf stat -e cycles,instructions --bpf-prog 17247 --timeout 1000
+
+   Performance counter stats for 'BPF program(s) 17247':
+
+             85,967      cycles
+             28,982      instructions              #    0.34  insn per cycle
+
+        1.102235068 seconds time elapsed
+
+--bpf-counters::
+	Use BPF programs to aggregate readings from perf_events.  This
+	allows multiple perf-stat sessions that are counting the same metric (cycles,
+	instructions, etc.) to share hardware counters.
+	To use BPF programs on common events by default, use
+	"perf config stat.bpf-counter-events=<list_of_events>".
+
+--bpf-attr-map::
+	With option "--bpf-counters", different perf-stat sessions share
+	information about shared BPF programs and maps via a pinned hashmap.
+	Use "--bpf-attr-map" to specify the path of this pinned hashmap.
+	The default path is /sys/fs/bpf/perf_attr_map.
+
+ifdef::HAVE_LIBPFM[]
+--pfm-events events::
+Select a PMU event using libpfm4 syntax (see http://perfmon2.sf.net)
+including support for event filters. For example '--pfm-events
+inst_retired:any_p:u:c=1:i'. More than one event can be passed to the
+option using the comma separator. Hardware events and generic hardware
+events cannot be mixed together. The latter must be used with the -e
+option. The -e option and this one can be mixed and matched.  Events
+can be grouped using the {} notation.
+endif::HAVE_LIBPFM[]
+
+-a::
+--all-cpus::
+        system-wide collection from all CPUs (default if no target is specified)
+
+--no-scale::
+	Don't scale/normalize counter values
+
+-d::
+--detailed::
+	print more detailed statistics, can be specified up to 3 times
+
+	   -d:          detailed events, L1 and LLC data cache
+        -d -d:     more detailed events, dTLB and iTLB events
+     -d -d -d:     very detailed events, adding prefetch events
+
+-r::
+--repeat=<n>::
+	repeat command and print average + stddev (max: 100). 0 means forever.
+
+-B::
+--big-num::
+        print large numbers with thousands' separators according to locale.
+	Enabled by default. Use "--no-big-num" to disable.
+	Default setting can be changed with "perf config stat.big-num=false".
+
+-C::
+--cpu=::
+Count only on the list of CPUs provided. Multiple CPUs can be provided as a
+comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
+In per-thread mode, this option is ignored. The -a option is still necessary
+to activate system-wide monitoring. Default is to count on all CPUs.
+
+-A::
+--no-aggr::
+Do not aggregate counts across all monitored CPUs.
+
+-n::
+--null::
+null run - Don't start any counters.
+
+This can be useful to measure just elapsed wall-clock time - or to assess the
+raw overhead of perf stat itself, without running any counters.
+
+-v::
+--verbose::
+        be more verbose (show counter open errors, etc)
+
+-x SEP::
+--field-separator SEP::
+print counts using a CSV-style output to make it easy to import directly into
+spreadsheets. Columns are separated by the string specified in SEP.
+
+--table:: Display time for each run (-r option), in a table format, e.g.:
+
+  $ perf stat --null -r 5 --table perf bench sched pipe
+
+   Performance counter stats for 'perf bench sched pipe' (5 runs):
+
+             # Table of individual measurements:
+             5.189 (-0.293) #
+             5.189 (-0.294) #
+             5.186 (-0.296) #
+             5.663 (+0.181) ##
+             6.186 (+0.703) ####
+
+             # Final result:
+             5.483 +- 0.198 seconds time elapsed  ( +-  3.62% )
+
+-G name::
+--cgroup name::
+monitor only in the container (cgroup) called "name". This option is available only
+in per-cpu mode. The cgroup filesystem must be mounted. All threads belonging to
+container "name" are monitored when they run on the monitored CPUs. Multiple cgroups
+can be provided. Each cgroup is applied to the corresponding event, i.e., first cgroup
+to first event, second cgroup to second event and so on. It is possible to provide
+an empty cgroup (monitor all the time) using, e.g., -G foo,,bar. Cgroups must have
+corresponding events, i.e., they always refer to events defined earlier on the command
+line. If the user wants to track multiple events for a specific cgroup, the user can
+use '-e e1 -e e2 -G foo,foo' or just use '-e e1 -e e2 -G foo'.
+
+If wanting to monitor, say, 'cycles' for a cgroup and also for system wide, this
+command line can be used: 'perf stat -e cycles -G cgroup_name -a -e cycles'.
+
+--for-each-cgroup name::
+Expand event list for each cgroup in "name" (allow multiple cgroups separated
+by comma).  It also support regex patterns to match multiple groups.  This has same
+effect that repeating -e option and -G option for each event x name.  This option
+cannot be used with -G/--cgroup option.
+
+-o file::
+--output file::
+Print the output into the designated file.
+
+--append::
+Append to the output file designated with the -o option. Ignored if -o is not specified.
+
+--log-fd::
+
+Log output to fd, instead of stderr.  Complementary to --output, and mutually exclusive
+with it.  --append may be used here.  Examples:
+     3>results  perf stat --log-fd 3          \-- $cmd
+     3>>results perf stat --log-fd 3 --append \-- $cmd
+
+--control=fifo:ctl-fifo[,ack-fifo]::
+--control=fd:ctl-fd[,ack-fd]::
+ctl-fifo / ack-fifo are opened and used as ctl-fd / ack-fd as follows.
+Listen on ctl-fd descriptor for command to control measurement ('enable': enable events,
+'disable': disable events). Measurements can be started with events disabled using
+--delay=-1 option. Optionally send control command completion ('ack\n') to ack-fd descriptor
+to synchronize with the controlling process. Example of bash shell script to enable and
+disable events during measurements:
+
+ #!/bin/bash
+
+ ctl_dir=/tmp/
+
+ ctl_fifo=${ctl_dir}perf_ctl.fifo
+ test -p ${ctl_fifo} && unlink ${ctl_fifo}
+ mkfifo ${ctl_fifo}
+ exec {ctl_fd}<>${ctl_fifo}
+
+ ctl_ack_fifo=${ctl_dir}perf_ctl_ack.fifo
+ test -p ${ctl_ack_fifo} && unlink ${ctl_ack_fifo}
+ mkfifo ${ctl_ack_fifo}
+ exec {ctl_fd_ack}<>${ctl_ack_fifo}
+
+ perf stat -D -1 -e cpu-cycles -a -I 1000       \
+           --control fd:${ctl_fd},${ctl_fd_ack} \
+           \-- sleep 30 &
+ perf_pid=$!
+
+ sleep 5  && echo 'enable' >&${ctl_fd} && read -u ${ctl_fd_ack} e1 && echo "enabled(${e1})"
+ sleep 10 && echo 'disable' >&${ctl_fd} && read -u ${ctl_fd_ack} d1 && echo "disabled(${d1})"
+
+ exec {ctl_fd_ack}>&-
+ unlink ${ctl_ack_fifo}
+
+ exec {ctl_fd}>&-
+ unlink ${ctl_fifo}
+
+ wait -n ${perf_pid}
+ exit $?
+
+
+--pre::
+--post::
+	Pre and post measurement hooks, e.g.:
+
+perf stat --repeat 10 --null --sync --pre 'make -s O=defconfig-build/clean' \-- make -s -j64 O=defconfig-build/ bzImage
+
+-I msecs::
+--interval-print msecs::
+Print count deltas every N milliseconds (minimum: 1ms)
+The overhead percentage could be high in some cases, for instance with small, sub 100ms intervals.  Use with caution.
+	example: 'perf stat -I 1000 -e cycles -a sleep 5'
+
+If the metric exists, it is calculated by the counts generated in this interval and the metric is printed after #.
+
+--interval-count times::
+Print count deltas for fixed number of times.
+This option should be used together with "-I" option.
+	example: 'perf stat -I 1000 --interval-count 2 -e cycles -a'
+
+--interval-clear::
+Clear the screen before next interval.
+
+--timeout msecs::
+Stop the 'perf stat' session and print count deltas after N milliseconds (minimum: 10 ms).
+This option is not supported with the "-I" option.
+	example: 'perf stat --time 2000 -e cycles -a'
+
+--metric-only::
+Only print computed metrics. Print them in a single line.
+Don't show any raw values. Not supported with --per-thread.
+
+--per-socket::
+Aggregate counts per processor socket for system-wide mode measurements.  This
+is a useful mode to detect imbalance between sockets.  To enable this mode,
+use --per-socket in addition to -a. (system-wide).  The output includes the
+socket number and the number of online processors on that socket. This is
+useful to gauge the amount of aggregation.
+
+--per-die::
+Aggregate counts per processor die for system-wide mode measurements.  This
+is a useful mode to detect imbalance between dies.  To enable this mode,
+use --per-die in addition to -a. (system-wide).  The output includes the
+die number and the number of online processors on that die. This is
+useful to gauge the amount of aggregation.
+
+--per-core::
+Aggregate counts per physical processor for system-wide mode measurements.  This
+is a useful mode to detect imbalance between physical cores.  To enable this mode,
+use --per-core in addition to -a. (system-wide).  The output includes the
+core number and the number of online logical processors on that physical processor.
+
+--per-thread::
+Aggregate counts per monitored threads, when monitoring threads (-t option)
+or processes (-p option).
+
+--per-node::
+Aggregate counts per NUMA nodes for system-wide mode measurements. This
+is a useful mode to detect imbalance between NUMA nodes. To enable this
+mode, use --per-node in addition to -a. (system-wide).
+
+-D msecs::
+--delay msecs::
+After starting the program, wait msecs before measuring (-1: start with events
+disabled). This is useful to filter out the startup phase of the program,
+which is often very different.
+
+-T::
+--transaction::
+
+Print statistics of transactional execution if supported.
+
+--metric-no-group::
+By default, events to compute a metric are placed in weak groups. The
+group tries to enforce scheduling all or none of the events. The
+--metric-no-group option places events outside of groups and may
+increase the chance of the event being scheduled - leading to more
+accuracy. However, as events may not be scheduled together accuracy
+for metrics like instructions per cycle can be lower - as both metrics
+may no longer be being measured at the same time.
+
+--metric-no-merge::
+By default metric events in different weak groups can be shared if one
+group contains all the events needed by another. In such cases one
+group will be eliminated reducing event multiplexing and making it so
+that certain groups of metrics sum to 100%. A downside to sharing a
+group is that the group may require multiplexing and so accuracy for a
+small group that need not have multiplexing is lowered. This option
+forbids the event merging logic from sharing events between groups and
+may be used to increase accuracy in this case.
+
+--quiet::
+Don't print output, warnings or messages. This is useful with perf stat
+record below to only write data to the perf.data file.
+
+STAT RECORD
+-----------
+Stores stat data into perf data file.
+
+-o file::
+--output file::
+Output file name.
+
+STAT REPORT
+-----------
+Reads and reports stat data from perf data file.
+
+-i file::
+--input file::
+Input file name.
+
+--per-socket::
+Aggregate counts per processor socket for system-wide mode measurements.
+
+--per-die::
+Aggregate counts per processor die for system-wide mode measurements.
+
+--per-core::
+Aggregate counts per physical processor for system-wide mode measurements.
+
+-M::
+--metrics::
+Print metrics or metricgroups specified in a comma separated list.
+For a group all metrics from the group are added.
+The events from the metrics are automatically measured.
+See perf list output for the possible metrics and metricgroups.
+
+-A::
+--no-aggr::
+Do not aggregate counts across all monitored CPUs.
+
+--topdown::
+Print complete top-down metrics supported by the CPU. This allows to
+determine bottle necks in the CPU pipeline for CPU bound workloads,
+by breaking the cycles consumed down into frontend bound, backend bound,
+bad speculation and retiring.
+
+Frontend bound means that the CPU cannot fetch and decode instructions fast
+enough. Backend bound means that computation or memory access is the bottle
+neck. Bad Speculation means that the CPU wasted cycles due to branch
+mispredictions and similar issues. Retiring means that the CPU computed without
+an apparently bottleneck. The bottleneck is only the real bottleneck
+if the workload is actually bound by the CPU and not by something else.
+
+For best results it is usually a good idea to use it with interval
+mode like -I 1000, as the bottleneck of workloads can change often.
+
+This enables --metric-only, unless overridden with --no-metric-only.
+
+The following restrictions only apply to older Intel CPUs and Atom,
+on newer CPUs (IceLake and later) TopDown can be collected for any thread:
+
+The top down metrics are collected per core instead of per
+CPU thread. Per core mode is automatically enabled
+and -a (global monitoring) is needed, requiring root rights or
+perf.perf_event_paranoid=-1.
+
+Topdown uses the full Performance Monitoring Unit, and needs
+disabling of the NMI watchdog (as root):
+echo 0 > /proc/sys/kernel/nmi_watchdog
+for best results. Otherwise the bottlenecks may be inconsistent
+on workload with changing phases.
+
+To interpret the results it is usually needed to know on which
+CPUs the workload runs on. If needed the CPUs can be forced using
+taskset.
+
+--td-level::
+Print the top-down statistics that equal to or lower than the input level.
+It allows users to print the interested top-down metrics level instead of
+the complete top-down metrics.
+
+The availability of the top-down metrics level depends on the hardware. For
+example, Ice Lake only supports L1 top-down metrics. The Sapphire Rapids
+supports both L1 and L2 top-down metrics.
+
+Default: 0 means the max level that the current hardware support.
+Error out if the input is higher than the supported max level.
+
+--no-merge::
+Do not merge results from same PMUs.
+
+When multiple events are created from a single event specification,
+stat will, by default, aggregate the event counts and show the result
+in a single row. This option disables that behavior and shows
+the individual events and counts.
+
+Multiple events are created from a single event specification when:
+1. Prefix or glob matching is used for the PMU name.
+2. Aliases, which are listed immediately after the Kernel PMU events
+   by perf list, are used.
+
+--hybrid-merge::
+Merge the hybrid event counts from all PMUs.
+
+For hybrid events, by default, the stat aggregates and reports the event
+counts per PMU. But sometimes, it's also useful to aggregate event counts
+from all PMUs. This option enables that behavior and reports the counts
+without PMUs.
+
+For non-hybrid events, it should be no effect.
+
+--smi-cost::
+Measure SMI cost if msr/aperf/ and msr/smi/ events are supported.
+
+During the measurement, the /sys/device/cpu/freeze_on_smi will be set to
+freeze core counters on SMI.
+The aperf counter will not be effected by the setting.
+The cost of SMI can be measured by (aperf - unhalted core cycles).
+
+In practice, the percentages of SMI cycles is very useful for performance
+oriented analysis. --metric_only will be applied by default.
+The output is SMI cycles%, equals to (aperf - unhalted core cycles) / aperf
+
+Users who wants to get the actual value can apply --no-metric-only.
+
+--all-kernel::
+Configure all used events to run in kernel space.
+
+--all-user::
+Configure all used events to run in user space.
+
+--percore-show-thread::
+The event modifier "percore" has supported to sum up the event counts
+for all hardware threads in a core and show the counts per core.
+
+This option with event modifier "percore" enabled also sums up the event
+counts for all hardware threads in a core but show the sum counts per
+hardware thread. This is essentially a replacement for the any bit and
+convenient for post processing.
+
+--summary::
+Print summary for interval mode (-I).
+
+--no-csv-summary::
+Don't print 'summary' at the first column for CVS summary output.
+This option must be used with -x and --summary.
+
+This option can be enabled in perf config by setting the variable
+'stat.no-csv-summary'.
+
+$ perf config stat.no-csv-summary=true
+
+--cputype::
+Only enable events on applying cpu with this type for hybrid platform
+(e.g. core or atom)"
+
+EXAMPLES
+--------
+
+$ perf stat \-- make
+
+   Performance counter stats for 'make':
+
+        83723.452481      task-clock:u (msec)       #    1.004 CPUs utilized
+                   0      context-switches:u        #    0.000 K/sec
+                   0      cpu-migrations:u          #    0.000 K/sec
+           3,228,188      page-faults:u             #    0.039 M/sec
+     229,570,665,834      cycles:u                  #    2.742 GHz
+     313,163,853,778      instructions:u            #    1.36  insn per cycle
+      69,704,684,856      branches:u                #  832.559 M/sec
+       2,078,861,393      branch-misses:u           #    2.98% of all branches
+
+        83.409183620 seconds time elapsed
+
+        74.684747000 seconds user
+         8.739217000 seconds sys
+
+TIMINGS
+-------
+As displayed in the example above we can display 3 types of timings.
+We always display the time the counters were enabled/alive:
+
+        83.409183620 seconds time elapsed
+
+For workload sessions we also display time the workloads spent in
+user/system lands:
+
+        74.684747000 seconds user
+         8.739217000 seconds sys
+
+Those times are the very same as displayed by the 'time' tool.
+
+CSV FORMAT
+----------
+
+With -x, perf stat is able to output a not-quite-CSV format output
+Commas in the output are not put into "". To make it easy to parse
+it is recommended to use a different character like -x \;
+
+The fields are in this order:
+
+	- optional usec time stamp in fractions of second (with -I xxx)
+	- optional CPU, core, or socket identifier
+	- optional number of logical CPUs aggregated
+	- counter value
+	- unit of the counter value or empty
+	- event name
+	- run time of counter
+	- percentage of measurement time the counter was running
+	- optional variance if multiple values are collected with -r
+	- optional metric value
+	- optional unit of metric
+
+Additional metrics may be printed with all earlier fields being empty.
+
+include::intel-hybrid.txt[]
+
+JSON FORMAT
+-----------
+
+With -j, perf stat is able to print out a JSON format output
+that can be used for parsing.
+
+- timestamp : optional usec time stamp in fractions of second (with -I)
+- optional aggregate options:
+		- core : core identifier (with --per-core)
+		- die : die identifier (with --per-die)
+		- socket : socket identifier (with --per-socket)
+		- node : node identifier (with --per-node)
+		- thread : thread identifier (with --per-thread)
+- counter-value : counter value
+- unit : unit of the counter value or empty
+- event : event name
+- variance : optional variance if multiple values are collected (with -r)
+- runtime : run time of counter
+- metric-value : optional metric value
+- metric-unit : optional unit of metric
+
+SEE ALSO
+--------
+linkperf:perf-top[1], linkperf:perf-list[1]
diff --git a/tools/perf/Documentation/perf-test.txt b/tools/perf/Documentation/perf-test.txt
new file mode 100644
index 000000000..b329c65d7
--- /dev/null
+++ b/tools/perf/Documentation/perf-test.txt
@@ -0,0 +1,36 @@
+perf-test(1)
+============
+
+NAME
+----
+perf-test - Runs sanity tests.
+
+SYNOPSIS
+--------
+[verse]
+'perf test [<options>] [{list <test-name-fragment>|[<test-name-fragments>|<test-numbers>]}]'
+
+DESCRIPTION
+-----------
+This command does assorted sanity tests, initially through linked routines but
+also will look for a directory with more tests in the form of scripts.
+
+To get a list of available tests use 'perf test list', specifying a test name
+fragment will show all tests that have it.
+
+To run just specific tests, inform test name fragments or the numbers obtained
+from 'perf test list'.
+
+OPTIONS
+-------
+-s::
+--skip::
+	Tests to skip (comma separated numeric list).
+
+-v::
+--verbose::
+	Be more verbose.
+
+-F::
+--dont-fork::
+	Do not fork child for each test, run all tests within single process.
diff --git a/tools/perf/Documentation/perf-timechart.txt b/tools/perf/Documentation/perf-timechart.txt
new file mode 100644
index 000000000..ef0c7565b
--- /dev/null
+++ b/tools/perf/Documentation/perf-timechart.txt
@@ -0,0 +1,128 @@
+perf-timechart(1)
+=================
+
+NAME
+----
+perf-timechart - Tool to visualize total system behavior during a workload
+
+SYNOPSIS
+--------
+[verse]
+'perf timechart' [<timechart options>] {record} [<record options>]
+
+DESCRIPTION
+-----------
+There are two variants of perf timechart:
+
+  'perf timechart record <command>' to record the system level events
+  of an arbitrary workload. By default timechart records only scheduler
+  and CPU events (task switches, running times, CPU power states, etc),
+  but it's possible to record IO (disk, network) activity using -I argument.
+
+  'perf timechart' to turn a trace into a Scalable Vector Graphics file,
+  that can be viewed with popular SVG viewers such as 'Inkscape'. Depending
+  on the events in the perf.data file, timechart will contain scheduler/cpu
+  events or IO events.
+
+  In IO mode, every bar has two charts: upper and lower.
+  Upper bar shows incoming events (disk reads, ingress network packets).
+  Lower bar shows outgoing events (disk writes, egress network packets).
+  There are also poll bars which show how much time application spent
+  in poll/epoll/select syscalls.
+
+TIMECHART OPTIONS
+-----------------
+-o::
+--output=::
+        Select the output file (default: output.svg)
+-i::
+--input=::
+        Select the input file (default: perf.data unless stdin is a fifo)
+-w::
+--width=::
+        Select the width of the SVG file (default: 1000)
+-P::
+--power-only::
+        Only output the CPU power section of the diagram
+-T::
+--tasks-only::
+        Don't output processor state transitions
+-p::
+--process::
+        Select the processes to display, by name or PID
+-f::
+--force::
+	Don't complain, do it.
+--symfs=<directory>::
+        Look for files with symbols relative to this directory.
+-n::
+--proc-num::
+        Print task info for at least given number of tasks.
+-t::
+--topology::
+        Sort CPUs according to topology.
+--highlight=<duration_nsecs|task_name>::
+	Highlight tasks (using different color) that run more than given
+	duration or tasks with given name. If number is given it's interpreted
+	as number of nanoseconds. If non-numeric string is given it's
+	interpreted as task name.
+--io-skip-eagain::
+	Don't draw EAGAIN IO events.
+--io-min-time=<nsecs>::
+	Draw small events as if they lasted min-time. Useful when you need
+	to see very small and fast IO. It's possible to specify ms or us
+	suffix to specify time in milliseconds or microseconds.
+	Default value is 1ms.
+--io-merge-dist=<nsecs>::
+	Merge events that are merge-dist nanoseconds apart.
+	Reduces number of figures on the SVG and makes it more render-friendly.
+	It's possible to specify ms or us suffix to specify time in
+	milliseconds or microseconds.
+	Default value is 1us.
+
+RECORD OPTIONS
+--------------
+-P::
+--power-only::
+        Record only power-related events
+-T::
+--tasks-only::
+        Record only tasks-related events
+-I::
+--io-only::
+        Record only io-related events
+-g::
+--callchain::
+        Do call-graph (stack chain/backtrace) recording
+
+EXAMPLES
+--------
+
+$ perf timechart record git pull
+
+  [ perf record: Woken up 13 times to write data ]
+  [ perf record: Captured and wrote 4.253 MB perf.data (~185801 samples) ]
+
+$ perf timechart
+
+  Written 10.2 seconds of trace to output.svg.
+
+Record system-wide timechart:
+
+  $ perf timechart record
+
+  then generate timechart and highlight 'gcc' tasks:
+
+  $ perf timechart --highlight gcc
+
+Record system-wide IO events:
+
+  $ perf timechart record -I
+
+  then generate timechart:
+
+  $ perf timechart
+
+SEE ALSO
+--------
+linkperf:perf-record[1]
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
new file mode 100644
index 000000000..c1fdba26b
--- /dev/null
+++ b/tools/perf/Documentation/perf-top.txt
@@ -0,0 +1,398 @@
+perf-top(1)
+===========
+
+NAME
+----
+perf-top - System profiling tool.
+
+SYNOPSIS
+--------
+[verse]
+'perf top' [-e <EVENT> | --event=EVENT] [<options>]
+
+DESCRIPTION
+-----------
+This command generates and displays a performance counter profile in real time.
+
+
+OPTIONS
+-------
+-a::
+--all-cpus::
+        System-wide collection.  (default)
+
+-c <count>::
+--count=<count>::
+	Event period to sample.
+
+-C <cpu-list>::
+--cpu=<cpu>::
+Monitor only on the list of CPUs provided. Multiple CPUs can be provided as a
+comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
+Default is to monitor all CPUS.
+
+-d <seconds>::
+--delay=<seconds>::
+	Number of seconds to delay between refreshes.
+
+-e <event>::
+--event=<event>::
+	Select the PMU event. Selection can be a symbolic event name
+	(use 'perf list' to list all events) or a raw PMU event in the form
+	of rN where N is a hexadecimal value that represents the raw register
+	encoding with the layout of the event control registers as described
+	by entries in /sys/bus/event_source/devices/cpu/format/*.
+
+-E <entries>::
+--entries=<entries>::
+	Display this many functions.
+
+-f <count>::
+--count-filter=<count>::
+	Only display functions with more events than this.
+
+--group::
+        Put the counters into a counter group.
+
+--group-sort-idx::
+	Sort the output by the event at the index n in group. If n is invalid,
+	sort by the first event. It can support multiple groups with different
+	amount of events. WARNING: This should be used on grouped events.
+
+-F <freq>::
+--freq=<freq>::
+	Profile at this frequency. Use 'max' to use the currently maximum
+	allowed frequency, i.e. the value in the kernel.perf_event_max_sample_rate
+	sysctl.
+
+-i::
+--inherit::
+	Child tasks do not inherit counters.
+
+-k <path>::
+--vmlinux=<path>::
+	Path to vmlinux.  Required for annotation functionality.
+
+--ignore-vmlinux::
+	Ignore vmlinux files.
+
+--kallsyms=<file>::
+	kallsyms pathname
+
+-m <pages>::
+--mmap-pages=<pages>::
+	Number of mmap data pages (must be a power of two) or size
+	specification with appended unit character - B/K/M/G. The
+	size is rounded up to have nearest pages power of two value.
+
+-p <pid>::
+--pid=<pid>::
+	Profile events on existing Process ID (comma separated list).
+
+-t <tid>::
+--tid=<tid>::
+        Profile events on existing thread ID (comma separated list).
+
+-u::
+--uid=::
+        Record events in threads owned by uid. Name or number.
+
+-r <priority>::
+--realtime=<priority>::
+	Collect data with this RT SCHED_FIFO priority.
+
+--sym-annotate=<symbol>::
+        Annotate this symbol.
+
+-K::
+--hide_kernel_symbols::
+        Hide kernel symbols.
+
+-U::
+--hide_user_symbols::
+        Hide user symbols.
+
+--demangle-kernel::
+        Demangle kernel symbols.
+
+-D::
+--dump-symtab::
+        Dump the symbol table used for profiling.
+
+-v::
+--verbose::
+	Be more verbose (show counter open errors, etc).
+
+-z::
+--zero::
+	Zero history across display updates.
+
+-s::
+--sort::
+	Sort by key(s): pid, comm, dso, symbol, parent, srcline, weight,
+	local_weight, abort, in_tx, transaction, overhead, sample, period.
+	Please see description of --sort in the perf-report man page.
+
+--fields=::
+	Specify output field - multiple keys can be specified in CSV format.
+	Following fields are available:
+	overhead, overhead_sys, overhead_us, overhead_children, sample and period.
+	Also it can contain any sort key(s).
+
+	By default, every sort keys not specified in --field will be appended
+	automatically.
+
+-n::
+--show-nr-samples::
+	Show a column with the number of samples.
+
+--show-total-period::
+	Show a column with the sum of periods.
+
+--dsos::
+	Only consider symbols in these dsos.  This option will affect the
+	percentage of the overhead column.  See --percentage for more info.
+
+--comms::
+	Only consider symbols in these comms.  This option will affect the
+	percentage of the overhead column.  See --percentage for more info.
+
+--symbols::
+	Only consider these symbols.  This option will affect the
+	percentage of the overhead column.  See --percentage for more info.
+
+-M::
+--disassembler-style=:: Set disassembler style for objdump.
+
+--prefix=PREFIX::
+--prefix-strip=N::
+        Remove first N entries from source file path names in executables
+        and add PREFIX. This allows to display source code compiled on systems
+        with different file system layout.
+
+--source::
+	Interleave source code with assembly code. Enabled by default,
+	disable with --no-source.
+
+--asm-raw::
+	Show raw instruction encoding of assembly instructions.
+
+-g::
+	Enables call-graph (stack chain/backtrace) recording.
+
+--call-graph [mode,type,min[,limit],order[,key][,branch]]::
+	Setup and enable call-graph (stack chain/backtrace) recording,
+	implies -g.  See `--call-graph` section in perf-record and
+	perf-report man pages for details.
+
+--children::
+	Accumulate callchain of children to parent entry so that then can
+	show up in the output.  The output will have a new "Children" column
+	and will be sorted on the data.  It requires -g/--call-graph option
+	enabled.  See the `overhead calculation' section for more details.
+	Enabled by default, disable with --no-children.
+
+--max-stack::
+	Set the stack depth limit when parsing the callchain, anything
+	beyond the specified depth will be ignored. This is a trade-off
+	between information loss and faster processing especially for
+	workloads that can have a very long callchain stack.
+
+	Default: /proc/sys/kernel/perf_event_max_stack when present, 127 otherwise.
+
+--ignore-callees=<regex>::
+        Ignore callees of the function(s) matching the given regex.
+        This has the effect of collecting the callers of each such
+        function into one place in the call-graph tree.
+
+--percent-limit::
+	Do not show entries which have an overhead under that percent.
+	(Default: 0).
+
+--percentage::
+	Determine how to display the overhead percentage of filtered entries.
+	Filters can be applied by --comms, --dsos and/or --symbols options and
+	Zoom operations on the TUI (thread, dso, etc).
+
+	"relative" means it's relative to filtered entries only so that the
+	sum of shown entries will be always 100%. "absolute" means it retains
+	the original value before and after the filter is applied.
+
+-w::
+--column-widths=<width[,width...]>::
+	Force each column width to the provided list, for large terminal
+	readability.  0 means no limit (default behavior).
+
+--proc-map-timeout::
+	When processing pre-existing threads /proc/XXX/mmap, it may take
+	a long time, because the file may be huge. A time out is needed
+	in such cases.
+	This option sets the time out limit. The default value is 500 ms.
+
+
+-b::
+--branch-any::
+	Enable taken branch stack sampling. Any type of taken branch may be sampled.
+	This is a shortcut for --branch-filter any. See --branch-filter for more infos.
+
+-j::
+--branch-filter::
+	Enable taken branch stack sampling. Each sample captures a series of consecutive
+	taken branches. The number of branches captured with each sample depends on the
+	underlying hardware, the type of branches of interest, and the executed code.
+	It is possible to select the types of branches captured by enabling filters.
+	For a full list of modifiers please see the perf record manpage.
+
+	The option requires at least one branch type among any, any_call, any_ret, ind_call, cond.
+	The privilege levels may be omitted, in which case, the privilege levels of the associated
+	event are applied to the branch filter. Both kernel (k) and hypervisor (hv) privilege
+	levels are subject to permissions.  When sampling on multiple events, branch stack sampling
+	is enabled for all the sampling events. The sampled branch type is the same for all events.
+	The various filters must be specified as a comma separated list: --branch-filter any_ret,u,k
+	Note that this feature may not be available on all processors.
+
+--raw-trace::
+	When displaying traceevent output, do not use print fmt or plugins.
+
+--hierarchy::
+	Enable hierarchy output.
+
+--overwrite::
+	Enable this to use just the most recent records, which helps in high core count
+	machines such as Knights Landing/Mill, but right now is disabled by default as
+	the pausing used in this technique is leading to loss of metadata events such
+	as PERF_RECORD_MMAP which makes 'perf top' unable to resolve samples, leading
+	to lots of unknown samples appearing on the UI. Enable this if you are in such
+	machines and profiling a workload that doesn't creates short lived threads and/or
+	doesn't uses many executable mmap operations. Work is being planed to solve
+	this situation, till then, this will remain disabled by default.
+
+--force::
+	Don't do ownership validation.
+
+--num-thread-synthesize::
+	The number of threads to run when synthesizing events for existing processes.
+	By default, the number of threads equals to the number of online CPUs.
+
+--namespaces::
+	Record events of type PERF_RECORD_NAMESPACES and display it with the
+	'cgroup_id' sort key.
+
+-G name::
+--cgroup name::
+monitor only in the container (cgroup) called "name". This option is available only
+in per-cpu mode. The cgroup filesystem must be mounted. All threads belonging to
+container "name" are monitored when they run on the monitored CPUs. Multiple cgroups
+can be provided. Each cgroup is applied to the corresponding event, i.e., first cgroup
+to first event, second cgroup to second event and so on. It is possible to provide
+an empty cgroup (monitor all the time) using, e.g., -G foo,,bar. Cgroups must have
+corresponding events, i.e., they always refer to events defined earlier on the command
+line. If the user wants to track multiple events for a specific cgroup, the user can
+use '-e e1 -e e2 -G foo,foo' or just use '-e e1 -e e2 -G foo'.
+
+--all-cgroups::
+	Record events of type PERF_RECORD_CGROUP and display it with the
+	'cgroup' sort key.
+
+--switch-on EVENT_NAME::
+	Only consider events after this event is found.
+
+	E.g.:
+
+           Find out where broadcast packets are handled
+
+		perf probe -L icmp_rcv
+
+	   Insert a probe there:
+
+		perf probe icmp_rcv:59
+
+	   Start perf top and ask it to only consider the cycles events when a
+           broadcast packet arrives This will show a menu with two entries and
+           will start counting when a broadcast packet arrives:
+
+		perf top -e cycles,probe:icmp_rcv --switch-on=probe:icmp_rcv
+
+	   Alternatively one can ask for --group and then two overhead columns
+           will appear, the first for cycles and the second for the switch-on event.
+
+		perf top --group -e cycles,probe:icmp_rcv --switch-on=probe:icmp_rcv
+
+	This may be interesting to measure a workload only after some initialization
+	phase is over, i.e. insert a perf probe at that point and use the above
+	examples replacing probe:icmp_rcv with the just-after-init probe.
+
+--switch-off EVENT_NAME::
+	Stop considering events after this event is found.
+
+--show-on-off-events::
+	Show the --switch-on/off events too. This has no effect in 'perf top' now
+	but probably we'll make the default not to show the switch-on/off events
+        on the --group mode and if there is only one event besides the off/on ones,
+	go straight to the histogram browser, just like 'perf top' with no events
+	explicitly specified does.
+
+--stitch-lbr::
+	Show callgraph with stitched LBRs, which may have more complete
+	callgraph. The option must be used with --call-graph lbr recording.
+	Disabled by default. In common cases with call stack overflows,
+	it can recreate better call stacks than the default lbr call stack
+	output. But this approach is not full proof. There can be cases
+	where it creates incorrect call stacks from incorrect matches.
+	The known limitations include exception handing such as
+	setjmp/longjmp will have calls/returns not match.
+
+ifdef::HAVE_LIBPFM[]
+--pfm-events events::
+Select a PMU event using libpfm4 syntax (see http://perfmon2.sf.net)
+including support for event filters. For example '--pfm-events
+inst_retired:any_p:u:c=1:i'. More than one event can be passed to the
+option using the comma separator. Hardware events and generic hardware
+events cannot be mixed together. The latter must be used with the -e
+option. The -e option and this one can be mixed and matched.  Events
+can be grouped using the {} notation.
+endif::HAVE_LIBPFM[]
+
+INTERACTIVE PROMPTING KEYS
+--------------------------
+
+[d]::
+	Display refresh delay.
+
+[e]::
+	Number of entries to display.
+
+[E]::
+	Event to display when multiple counters are active.
+
+[f]::
+	Profile display filter (>= hit count).
+
+[F]::
+	Annotation display filter (>= % of total).
+
+[s]::
+	Annotate symbol.
+
+[S]::
+	Stop annotation, return to full profile display.
+
+[K]::
+	Hide kernel symbols.
+
+[U]::
+	Hide user symbols.
+
+[z]::
+	Toggle event count zeroing across display updates.
+
+[qQ]::
+	Quit.
+
+Pressing any unmapped key displays a menu, and prompts for input.
+
+include::callchain-overhead-calculation.txt[]
+
+SEE ALSO
+--------
+linkperf:perf-stat[1], linkperf:perf-list[1], linkperf:perf-report[1]
diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
new file mode 100644
index 000000000..f0da8cf63
--- /dev/null
+++ b/tools/perf/Documentation/perf-trace.txt
@@ -0,0 +1,347 @@
+perf-trace(1)
+=============
+
+NAME
+----
+perf-trace - strace inspired tool
+
+SYNOPSIS
+--------
+[verse]
+'perf trace'
+'perf trace record'
+
+DESCRIPTION
+-----------
+This command will show the events associated with the target, initially
+syscalls, but other system events like pagefaults, task lifetime events,
+scheduling events, etc.
+
+This is a live mode tool in addition to working with perf.data files like
+the other perf tools. Files can be generated using the 'perf record' command
+but the session needs to include the raw_syscalls events (-e 'raw_syscalls:*').
+Alternatively, 'perf trace record' can be used as a shortcut to
+automatically include the raw_syscalls events when writing events to a file.
+
+The following options apply to perf trace; options to perf trace record are
+found in the perf record man page.
+
+OPTIONS
+-------
+
+-a::
+--all-cpus::
+        System-wide collection from all CPUs.
+
+-e::
+--expr::
+--event::
+	List of syscalls and other perf events (tracepoints, HW cache events,
+	etc) to show. Globbing is supported, e.g.: "epoll_*", "*msg*", etc.
+	See 'perf list' for a complete list of events.
+	Prefixing with ! shows all syscalls but the ones specified.  You may
+	need to escape it.
+
+--filter=<filter>::
+        Event filter. This option should follow an event selector (-e) which
+	selects tracepoint event(s).
+
+
+-D msecs::
+--delay msecs::
+After starting the program, wait msecs before measuring. This is useful to
+filter out the startup phase of the program, which is often very different.
+
+-o::
+--output=::
+	Output file name.
+
+-p::
+--pid=::
+	Record events on existing process ID (comma separated list).
+
+-t::
+--tid=::
+        Record events on existing thread ID (comma separated list).
+
+-u::
+--uid=::
+        Record events in threads owned by uid. Name or number.
+
+-G::
+--cgroup::
+	Record events in threads in a cgroup.
+
+	Look for cgroups to set at the /sys/fs/cgroup/perf_event directory, then
+	remove the /sys/fs/cgroup/perf_event/ part and try:
+
+		perf trace -G A -e sched:*switch
+
+	Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
+	_and_ sched:sched_switch to the 'A' cgroup, while:
+
+		perf trace -e sched:*switch -G A
+
+	will only set the sched:sched_switch event to the 'A' cgroup, all the
+	other events (raw_syscalls:sys_{enter,exit}, etc are left "without"
+	a cgroup (on the root cgroup, sys wide, etc).
+
+	Multiple cgroups:
+
+		perf trace -G A -e sched:*switch -G B
+
+	the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
+	to the 'B' cgroup.
+
+--filter-pids=::
+	Filter out events for these pids and for 'trace' itself (comma separated list).
+
+-v::
+--verbose::
+        Increase the verbosity level.
+
+--no-inherit::
+	Child tasks do not inherit counters.
+
+-m::
+--mmap-pages=::
+	Number of mmap data pages (must be a power of two) or size
+	specification with appended unit character - B/K/M/G. The
+	size is rounded up to have nearest pages power of two value.
+
+-C::
+--cpu::
+Collect samples only on the list of CPUs provided. Multiple CPUs can be provided as a
+comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
+In per-thread mode with inheritance mode on (default), Events are captured only when
+the thread executes on the designated CPUs. Default is to monitor all CPUs.
+
+--duration::
+	Show only events that had a duration greater than N.M ms.
+
+--sched::
+	Accrue thread runtime and provide a summary at the end of the session.
+
+--failure::
+	Show only syscalls that failed, i.e. that returned < 0.
+
+-i::
+--input::
+	Process events from a given perf data file.
+
+-T::
+--time::
+	Print full timestamp rather time relative to first sample.
+
+--comm::
+        Show process COMM right beside its ID, on by default, disable with --no-comm.
+
+-s::
+--summary::
+	Show only a summary of syscalls by thread with min, max, and average times
+    (in msec) and relative stddev.
+
+-S::
+--with-summary::
+	Show all syscalls followed by a summary by thread with min, max, and
+    average times (in msec) and relative stddev.
+
+--errno-summary::
+	To be used with -s or -S, to show stats for the errnos experienced by
+	syscalls, using only this option will trigger --summary.
+
+--tool_stats::
+	Show tool stats such as number of times fd->pathname was discovered thru
+	hooking the open syscall return + vfs_getname or via reading /proc/pid/fd, etc.
+
+-f::
+--force::
+	Don't complain, do it.
+
+-F=[all|min|maj]::
+--pf=[all|min|maj]::
+	Trace pagefaults. Optionally, you can specify whether you want minor,
+	major or all pagefaults. Default value is maj.
+
+--syscalls::
+	Trace system calls. This options is enabled by default, disable with
+	--no-syscalls.
+
+--call-graph [mode,type,min[,limit],order[,key][,branch]]::
+        Setup and enable call-graph (stack chain/backtrace) recording.
+        See `--call-graph` section in perf-record and perf-report
+        man pages for details. The ones that are most useful in 'perf trace'
+        are 'dwarf' and 'lbr', where available, try: 'perf trace --call-graph dwarf'.
+
+        Using this will, for the root user, bump the value of --mmap-pages to 4
+        times the maximum for non-root users, based on the kernel.perf_event_mlock_kb
+        sysctl. This is done only if the user doesn't specify a --mmap-pages value.
+
+--kernel-syscall-graph::
+	 Show the kernel callchains on the syscall exit path.
+
+--max-events=N::
+	Stop after processing N events. Note that strace-like events are considered
+	only at exit time or when a syscall is interrupted, i.e. in those cases this
+	option is equivalent to the number of lines printed.
+
+--switch-on EVENT_NAME::
+	Only consider events after this event is found.
+
+--switch-off EVENT_NAME::
+	Stop considering events after this event is found.
+
+--show-on-off-events::
+	Show the --switch-on/off events too.
+
+--max-stack::
+        Set the stack depth limit when parsing the callchain, anything
+        beyond the specified depth will be ignored. Note that at this point
+        this is just about the presentation part, i.e. the kernel is still
+        not limiting, the overhead of callchains needs to be set via the
+        knobs in --call-graph dwarf.
+
+        Implies '--call-graph dwarf' when --call-graph not present on the
+        command line, on systems where DWARF unwinding was built in.
+
+        Default: /proc/sys/kernel/perf_event_max_stack when present for
+                 live sessions (without --input/-i), 127 otherwise.
+
+--min-stack::
+        Set the stack depth limit when parsing the callchain, anything
+        below the specified depth will be ignored. Disabled by default.
+
+        Implies '--call-graph dwarf' when --call-graph not present on the
+        command line, on systems where DWARF unwinding was built in.
+
+--print-sample::
+	Print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info for the
+	raw_syscalls:sys_{enter,exit} tracepoints, for debugging.
+
+--proc-map-timeout::
+	When processing pre-existing threads /proc/XXX/mmap, it may take a long time,
+	because the file may be huge. A time out is needed in such cases.
+	This option sets the time out limit. The default value is 500 ms.
+
+--sort-events::
+	Do sorting on batches of events, use when noticing out of order events that
+	may happen, for instance, when a thread gets migrated to a different CPU
+	while processing a syscall.
+
+--libtraceevent_print::
+	Use libtraceevent to print tracepoint arguments. By default 'perf trace' uses
+	the same beautifiers used in the strace-like enter+exit lines to augment the
+	tracepoint arguments.
+
+--map-dump::
+	Dump BPF maps setup by events passed via -e, for instance the augmented_raw_syscalls
+	living in tools/perf/examples/bpf/augmented_raw_syscalls.c. For now this
+	dumps just boolean map values and integer keys, in time this will print in hex
+	by default and use BTF when available, as well as use functions to do pretty
+	printing using the existing 'perf trace' syscall arg beautifiers to map integer
+	arguments to strings (pid to comm, syscall id to syscall name, etc).
+
+
+PAGEFAULTS
+----------
+
+When tracing pagefaults, the format of the trace is as follows:
+
+<min|maj>fault [<ip.symbol>+<ip.offset>] => <addr.dso@addr.offset> (<map type><addr level>).
+
+- min/maj indicates whether fault event is minor or major;
+- ip.symbol shows symbol for instruction pointer (the code that generated the
+  fault); if no debug symbols available, perf trace will print raw IP;
+- addr.dso shows DSO for the faulted address;
+- map type is either 'd' for non-executable maps or 'x' for executable maps;
+- addr level is either 'k' for kernel dso or '.' for user dso.
+
+For symbols resolution you may need to install debugging symbols.
+
+Please be aware that duration is currently always 0 and doesn't reflect actual
+time it took for fault to be handled!
+
+When --verbose specified, perf trace tries to print all available information
+for both IP and fault address in the form of dso@symbol+offset.
+
+EXAMPLES
+--------
+
+Trace only major pagefaults:
+
+ $ perf trace --no-syscalls -F
+
+Trace syscalls, major and minor pagefaults:
+
+ $ perf trace -F all
+
+  1416.547 ( 0.000 ms): python/20235 majfault [CRYPTO_push_info_+0x0] => /lib/x86_64-linux-gnu/libcrypto.so.1.0.0@0x61be0 (x.)
+
+  As you can see, there was major pagefault in python process, from
+  CRYPTO_push_info_ routine which faulted somewhere in libcrypto.so.
+
+Trace the first 4 open, openat or open_by_handle_at syscalls (in the future more syscalls may match here):
+
+  $ perf trace -e open* --max-events 4
+  [root@jouet perf]# trace -e open* --max-events 4
+  2272.992 ( 0.037 ms): gnome-shell/1370 openat(dfd: CWD, filename: /proc/self/stat) = 31
+  2277.481 ( 0.139 ms): gnome-shell/3039 openat(dfd: CWD, filename: /proc/self/stat) = 65
+  3026.398 ( 0.076 ms): gnome-shell/3039 openat(dfd: CWD, filename: /proc/self/stat) = 65
+  4294.665 ( 0.015 ms): sed/15879 openat(dfd: CWD, filename: /etc/ld.so.cache, flags: CLOEXEC) = 3
+  $
+
+Trace the first minor page fault when running a workload:
+
+  # perf trace -F min --max-stack=7 --max-events 1 sleep 1
+     0.000 ( 0.000 ms): sleep/18006 minfault [__clear_user+0x1a] => 0x5626efa56080 (?k)
+                                       __clear_user ([kernel.kallsyms])
+                                       load_elf_binary ([kernel.kallsyms])
+                                       search_binary_handler ([kernel.kallsyms])
+                                       __do_execve_file.isra.33 ([kernel.kallsyms])
+                                       __x64_sys_execve ([kernel.kallsyms])
+                                       do_syscall_64 ([kernel.kallsyms])
+                                       entry_SYSCALL_64 ([kernel.kallsyms])
+  #
+
+Trace the next min page page fault to take place on the first CPU:
+
+  # perf trace -F min --call-graph=dwarf --max-events 1 --cpu 0
+     0.000 ( 0.000 ms): Web Content/17136 minfault [js::gc::Chunk::fetchNextDecommittedArena+0x4b] => 0x7fbe6181b000 (?.)
+                                       js::gc::FreeSpan::initAsEmpty (inlined)
+                                       js::gc::Arena::setAsNotAllocated (inlined)
+                                       js::gc::Chunk::fetchNextDecommittedArena (/usr/lib64/firefox/libxul.so)
+                                       js::gc::Chunk::allocateArena (/usr/lib64/firefox/libxul.so)
+                                       js::gc::GCRuntime::allocateArena (/usr/lib64/firefox/libxul.so)
+                                       js::gc::ArenaLists::allocateFromArena (/usr/lib64/firefox/libxul.so)
+                                       js::gc::GCRuntime::tryNewTenuredThing<JSString, (js::AllowGC)1> (inlined)
+                                       js::AllocateString<JSString, (js::AllowGC)1> (/usr/lib64/firefox/libxul.so)
+                                       js::Allocate<JSThinInlineString, (js::AllowGC)1> (inlined)
+                                       JSThinInlineString::new_<(js::AllowGC)1> (inlined)
+                                       AllocateInlineString<(js::AllowGC)1, unsigned char> (inlined)
+                                       js::ConcatStrings<(js::AllowGC)1> (/usr/lib64/firefox/libxul.so)
+                                       [0x18b26e6bc2bd] (/tmp/perf-17136.map)
+  #
+
+Trace the next two sched:sched_switch events, four block:*_plug events, the
+next block:*_unplug and the next three net:*dev_queue events, this last one
+with a backtrace of at most 16 entries, system wide:
+
+  # perf trace -e sched:*switch/nr=2/,block:*_plug/nr=4/,block:*_unplug/nr=1/,net:*dev_queue/nr=3,max-stack=16/
+     0.000 :0/0 sched:sched_switch:swapper/2:0 [120] S ==> rcu_sched:10 [120]
+     0.015 rcu_sched/10 sched:sched_switch:rcu_sched:10 [120] R ==> swapper/2:0 [120]
+   254.198 irq/50-iwlwifi/680 net:net_dev_queue:dev=wlp3s0 skbaddr=0xffff93498051f600 len=66
+                                       __dev_queue_xmit ([kernel.kallsyms])
+   273.977 :0/0 net:net_dev_queue:dev=wlp3s0 skbaddr=0xffff93498051f600 len=78
+                                       __dev_queue_xmit ([kernel.kallsyms])
+   274.007 :0/0 net:net_dev_queue:dev=wlp3s0 skbaddr=0xffff93498051ff00 len=78
+                                       __dev_queue_xmit ([kernel.kallsyms])
+  2930.140 kworker/u16:58/2722 block:block_plug:[kworker/u16:58]
+  2930.162 kworker/u16:58/2722 block:block_unplug:[kworker/u16:58] 1
+  4466.094 jbd2/dm-2-8/748 block:block_plug:[jbd2/dm-2-8]
+  8050.123 kworker/u16:30/2694 block:block_plug:[kworker/u16:30]
+  8050.271 kworker/u16:30/2694 block:block_plug:[kworker/u16:30]
+  #
+
+SEE ALSO
+--------
+linkperf:perf-record[1], linkperf:perf-script[1]
diff --git a/tools/perf/Documentation/perf-version.txt b/tools/perf/Documentation/perf-version.txt
new file mode 100644
index 000000000..e207b7cfc
--- /dev/null
+++ b/tools/perf/Documentation/perf-version.txt
@@ -0,0 +1,24 @@
+perf-version(1)
+===============
+
+NAME
+----
+perf-version - display the version of perf binary
+
+SYNOPSIS
+--------
+'perf version' [--build-options]
+
+DESCRIPTION
+-----------
+With no options given, the 'perf version' prints the perf version
+on the standard output.
+
+If the option '--build-options' is given, then the status of
+compiled-in libraries are printed on the standard output.
+
+OPTIONS
+-------
+--build-options::
+        Prints the status of compiled-in libraries on the
+        standard output.
diff --git a/tools/perf/Documentation/perf.data-directory-format.txt b/tools/perf/Documentation/perf.data-directory-format.txt
new file mode 100644
index 000000000..f37fbd291
--- /dev/null
+++ b/tools/perf/Documentation/perf.data-directory-format.txt
@@ -0,0 +1,63 @@
+perf.data directory format
+
+DISCLAIMER This is not ABI yet and is subject to possible change
+           in following versions of perf. We will remove this
+           disclaimer once the directory format soaks in.
+
+
+This document describes the on-disk perf.data directory format.
+
+The layout is described by HEADER_DIR_FORMAT feature.
+Currently it holds only version number (0):
+
+  HEADER_DIR_FORMAT = 24
+
+  struct {
+     uint64_t version;
+  }
+
+The current only version value 0 means that:
+  - there is a single perf.data file named 'data' within the directory.
+  e.g.
+
+    $ tree -ps perf.data
+    perf.data
+    └── [-rw-------       25912]  data
+
+Future versions are expected to describe different data files
+layout according to special needs.
+
+Currently the only 'perf record' option to output to a directory is
+the --kcore option which puts a copy of /proc/kcore into the directory.
+e.g.
+
+  $ sudo perf record --kcore uname
+  Linux
+  [ perf record: Woken up 1 times to write data ]
+  [ perf record: Captured and wrote 0.015 MB perf.data (9 samples) ]
+  $ sudo tree -ps perf.data
+  perf.data
+  ├── [-rw-------       23744]  data
+  └── [drwx------        4096]  kcore_dir
+      ├── [-r--------     6731125]  kallsyms
+      ├── [-r--------    40230912]  kcore
+      └── [-r--------        5419]  modules
+
+  1 directory, 4 files
+  $ sudo perf script -v
+  build id event received for vmlinux: 1eaa285996affce2d74d8e66dcea09a80c9941de
+  build id event received for [vdso]: 8bbaf5dc62a9b644b4d4e4539737e104e4a84541
+  build id event received for /lib/x86_64-linux-gnu/libc-2.28.so: 5b157f49586a3ca84d55837f97ff466767dd3445
+  Samples for 'cycles' event do not have CPU attribute set. Skipping 'cpu' field.
+  Using CPUID GenuineIntel-6-8E-A
+  Using perf.data/kcore_dir/kcore for kernel data
+  Using perf.data/kcore_dir/kallsyms for symbols
+              perf 15316 2060795.480902:          1 cycles:  ffffffffa2caa548 native_write_msr+0x8 (vmlinux)
+              perf 15316 2060795.480906:          1 cycles:  ffffffffa2caa548 native_write_msr+0x8 (vmlinux)
+              perf 15316 2060795.480908:          7 cycles:  ffffffffa2caa548 native_write_msr+0x8 (vmlinux)
+              perf 15316 2060795.480910:        119 cycles:  ffffffffa2caa54a native_write_msr+0xa (vmlinux)
+              perf 15316 2060795.480912:       2109 cycles:  ffffffffa2c9b7b0 native_apic_msr_write+0x0 (vmlinux)
+              perf 15316 2060795.480914:      37606 cycles:  ffffffffa2f121fe perf_event_addr_filters_exec+0x2e (vmlinux)
+             uname 15316 2060795.480924:     588287 cycles:  ffffffffa303a56d page_counter_try_charge+0x6d (vmlinux)
+             uname 15316 2060795.481067:    2261945 cycles:  ffffffffa301438f kmem_cache_free+0x4f (vmlinux)
+             uname 15316 2060795.481643:    2172167 cycles:      7f1a48c393c0 _IO_un_link+0x0 (/lib/x86_64-linux-gnu/libc-2.28.so)
diff --git a/tools/perf/Documentation/perf.data-file-format.txt b/tools/perf/Documentation/perf.data-file-format.txt
new file mode 100644
index 000000000..635ba043f
--- /dev/null
+++ b/tools/perf/Documentation/perf.data-file-format.txt
@@ -0,0 +1,682 @@
+perf.data format
+
+Uptodate as of v4.7
+
+This document describes the on-disk perf.data format, generated by perf record
+or perf inject and consumed by the other perf tools.
+
+On a high level perf.data contains the events generated by the PMUs, plus metadata.
+
+All fields are in native-endian of the machine that generated the perf.data.
+
+When perf is writing to a pipe it uses a special version of the file
+format that does not rely on seeking to adjust data offsets.  This
+format is described in "Pipe-mode data" section. The pipe data version can be
+augmented with additional events using perf inject.
+
+The file starts with a perf_header:
+
+struct perf_header {
+	char magic[8];		/* PERFILE2 */
+	uint64_t size;		/* size of the header */
+	uint64_t attr_size;	/* size of an attribute in attrs */
+	struct perf_file_section attrs;
+	struct perf_file_section data;
+	struct perf_file_section event_types;
+	uint64_t flags;
+	uint64_t flags1[3];
+};
+
+The magic number identifies the perf file and the version. Current perf versions
+use PERFILE2. Old perf versions generated a version 1 format (PERFFILE). Version 1
+is not described here. The magic number also identifies the endian. When the
+magic value is 64bit byte swapped compared the file is in non-native
+endian.
+
+A perf_file_section contains a pointer to another section of the perf file.
+The header contains three such pointers: for attributes, data and event types.
+
+struct perf_file_section {
+	uint64_t offset;	/* offset from start of file */
+	uint64_t size;		/* size of the section */
+};
+
+Flags section:
+
+For each of the optional features a perf_file_section it placed after the data
+section if the feature bit is set in the perf_header flags bitset. The
+respective perf_file_section points to the data of the additional header and
+defines its size.
+
+Some headers consist of strings, which are defined like this:
+
+struct perf_header_string {
+       uint32_t len;
+       char string[len]; /* zero terminated */
+};
+
+Some headers consist of a sequence of strings, which start with a
+
+struct perf_header_string_list {
+     uint32_t nr;
+     struct perf_header_string strings[nr]; /* variable length records */
+};
+
+The bits are the flags bits in a 256 bit bitmap starting with
+flags. These define the valid bits:
+
+	HEADER_RESERVED		= 0,	/* always cleared */
+	HEADER_FIRST_FEATURE	= 1,
+	HEADER_TRACING_DATA	= 1,
+
+Describe me.
+
+	HEADER_BUILD_ID = 2,
+
+The header consists of an sequence of build_id_event. The size of each record
+is defined by header.size (see perf_event.h). Each event defines a ELF build id
+for a executable file name for a pid. An ELF build id is a unique identifier
+assigned by the linker to an executable.
+
+struct build_id_event {
+	struct perf_event_header header;
+	pid_t			 pid;
+	uint8_t			 build_id[24];
+	char			 filename[header.size - offsetof(struct build_id_event, filename)];
+};
+
+	HEADER_HOSTNAME = 3,
+
+A perf_header_string with the hostname where the data was collected
+(uname -n)
+
+	HEADER_OSRELEASE = 4,
+
+A perf_header_string with the os release where the data was collected
+(uname -r)
+
+	HEADER_VERSION = 5,
+
+A perf_header_string with the perf user tool version where the
+data was collected. This is the same as the version of the source tree
+the perf tool was built from.
+
+	HEADER_ARCH = 6,
+
+A perf_header_string with the CPU architecture (uname -m)
+
+	HEADER_NRCPUS = 7,
+
+A structure defining the number of CPUs.
+
+struct nr_cpus {
+       uint32_t nr_cpus_available; /* CPUs not yet onlined */
+       uint32_t nr_cpus_online;
+};
+
+	HEADER_CPUDESC = 8,
+
+A perf_header_string with description of the CPU. On x86 this is the model name
+in /proc/cpuinfo
+
+	HEADER_CPUID = 9,
+
+A perf_header_string with the exact CPU type. On x86 this is
+vendor,family,model,stepping. For example: GenuineIntel,6,69,1
+
+	HEADER_TOTAL_MEM = 10,
+
+An uint64_t with the total memory in kilobytes.
+
+	HEADER_CMDLINE = 11,
+
+A perf_header_string_list with the perf arg-vector used to collect the data.
+
+	HEADER_EVENT_DESC = 12,
+
+Another description of the perf_event_attrs, more detailed than header.attrs
+including IDs and names. See perf_event.h or the man page for a description
+of a struct perf_event_attr.
+
+struct {
+       uint32_t nr; /* number of events */
+       uint32_t attr_size; /* size of each perf_event_attr */
+       struct {
+	      struct perf_event_attr attr;  /* size of attr_size */
+	      uint32_t nr_ids;
+	      struct perf_header_string event_string;
+	      uint64_t ids[nr_ids];
+       } events[nr]; /* Variable length records */
+};
+
+	HEADER_CPU_TOPOLOGY = 13,
+
+struct {
+	/*
+	 * First revision of HEADER_CPU_TOPOLOGY
+	 *
+	 * See 'struct perf_header_string_list' definition earlier
+	 * in this file.
+	 */
+
+       struct perf_header_string_list cores; /* Variable length */
+       struct perf_header_string_list threads; /* Variable length */
+
+       /*
+        * Second revision of HEADER_CPU_TOPOLOGY, older tools
+        * will not consider what comes next
+        */
+
+       struct {
+	      uint32_t core_id;
+	      uint32_t socket_id;
+       } cpus[nr]; /* Variable length records */
+       /* 'nr' comes from previously processed HEADER_NRCPUS's nr_cpu_avail */
+
+        /*
+	 * Third revision of HEADER_CPU_TOPOLOGY, older tools
+	 * will not consider what comes next
+	 */
+
+	struct perf_header_string_list dies; /* Variable length */
+	uint32_t die_id[nr_cpus_avail]; /* from previously processed HEADER_NR_CPUS, VLA */
+};
+
+Example:
+	sibling sockets : 0-8
+	sibling dies	: 0-3
+	sibling dies	: 4-7
+	sibling threads : 0-1
+	sibling threads : 2-3
+	sibling threads : 4-5
+	sibling threads : 6-7
+
+	HEADER_NUMA_TOPOLOGY = 14,
+
+	A list of NUMA node descriptions
+
+struct {
+       uint32_t nr;
+       struct {
+	      uint32_t nodenr;
+	      uint64_t mem_total;
+	      uint64_t mem_free;
+	      struct perf_header_string cpus;
+       } nodes[nr]; /* Variable length records */
+};
+
+	HEADER_BRANCH_STACK = 15,
+
+Not implemented in perf.
+
+	HEADER_PMU_MAPPINGS = 16,
+
+	A list of PMU structures, defining the different PMUs supported by perf.
+
+struct {
+       uint32_t nr;
+       struct pmu {
+	      uint32_t pmu_type;
+	      struct perf_header_string pmu_name;
+       } [nr]; /* Variable length records */
+};
+
+	HEADER_GROUP_DESC = 17,
+
+	Description of counter groups ({...} in perf syntax)
+
+struct {
+         uint32_t nr;
+         struct {
+		struct perf_header_string string;
+		uint32_t leader_idx;
+		uint32_t nr_members;
+	 } [nr]; /* Variable length records */
+};
+
+	HEADER_AUXTRACE = 18,
+
+Define additional auxtrace areas in the perf.data. auxtrace is used to store
+undecoded hardware tracing information, such as Intel Processor Trace data.
+
+/**
+ * struct auxtrace_index_entry - indexes a AUX area tracing event within a
+ *                               perf.data file.
+ * @file_offset: offset within the perf.data file
+ * @sz: size of the event
+ */
+struct auxtrace_index_entry {
+	u64			file_offset;
+	u64			sz;
+};
+
+#define PERF_AUXTRACE_INDEX_ENTRY_COUNT 256
+
+/**
+ * struct auxtrace_index - index of AUX area tracing events within a perf.data
+ *                         file.
+ * @list: linking a number of arrays of entries
+ * @nr: number of entries
+ * @entries: array of entries
+ */
+struct auxtrace_index {
+	struct list_head	list;
+	size_t			nr;
+	struct auxtrace_index_entry entries[PERF_AUXTRACE_INDEX_ENTRY_COUNT];
+};
+
+	HEADER_STAT = 19,
+
+This is merely a flag signifying that the data section contains data
+recorded from perf stat record.
+
+	HEADER_CACHE = 20,
+
+Description of the cache hierarchy. Based on the Linux sysfs format
+in /sys/devices/system/cpu/cpu*/cache/
+
+	u32 version	Currently always 1
+	u32 number_of_cache_levels
+
+struct {
+	u32	level;
+	u32	line_size;
+	u32	sets;
+	u32	ways;
+	struct perf_header_string type;
+	struct perf_header_string size;
+	struct perf_header_string map;
+}[number_of_cache_levels];
+
+	HEADER_SAMPLE_TIME = 21,
+
+Two uint64_t for the time of first sample and the time of last sample.
+
+	HEADER_SAMPLE_TOPOLOGY = 22,
+
+Physical memory map and its node assignments.
+
+The format of data in MEM_TOPOLOGY is as follows:
+
+	u64 version;            // Currently 1
+	u64 block_size_bytes;   // /sys/devices/system/memory/block_size_bytes
+	u64 count;              // number of nodes
+
+struct memory_node {
+        u64 node_id;            // node index
+        u64 size;               // size of bitmap
+        struct bitmap {
+		/* size of bitmap again */
+                u64 bitmapsize;
+		/* bitmap of memory indexes that belongs to node     */
+		/* /sys/devices/system/node/node<NODE>/memory<INDEX> */
+                u64 entries[(bitmapsize/64)+1];
+        }
+}[count];
+
+The MEM_TOPOLOGY can be displayed with following command:
+
+$ perf report --header-only -I
+...
+# memory nodes (nr 1, block size 0x8000000):
+#    0 [7G]: 0-23,32-69
+
+	HEADER_CLOCKID = 23,
+
+One uint64_t for the clockid frequency, specified, for instance, via 'perf
+record -k' (see clock_gettime()), to enable timestamps derived metrics
+conversion into wall clock time on the reporting stage.
+
+	HEADER_DIR_FORMAT = 24,
+
+The data files layout is described by HEADER_DIR_FORMAT feature.  Currently it
+holds only version number (1):
+
+  uint64_t version;
+
+The current version holds only version value (1) means that data files:
+
+- Follow the 'data.*' name format.
+
+- Contain raw events data in standard perf format as read from kernel (and need
+  to be sorted)
+
+Future versions are expected to describe different data files layout according
+to special needs.
+
+        HEADER_BPF_PROG_INFO = 25,
+
+struct perf_bpil, which contains detailed information about
+a BPF program, including type, id, tag, jited/xlated instructions, etc.
+
+        HEADER_BPF_BTF = 26,
+
+Contains BPF Type Format (BTF). For more information about BTF, please
+refer to Documentation/bpf/btf.rst.
+
+struct {
+	u32	id;
+	u32	data_size;
+	char	data[];
+};
+
+        HEADER_COMPRESSED = 27,
+
+struct {
+	u32	version;
+	u32	type;
+	u32	level;
+	u32	ratio;
+	u32	mmap_len;
+};
+
+Indicates that trace contains records of PERF_RECORD_COMPRESSED type
+that have perf_events records in compressed form.
+
+	HEADER_CPU_PMU_CAPS = 28,
+
+	A list of cpu PMU capabilities. The format of data is as below.
+
+struct {
+	u32 nr_cpu_pmu_caps;
+	{
+		char	name[];
+		char	value[];
+	} [nr_cpu_pmu_caps]
+};
+
+
+Example:
+ cpu pmu capabilities: branches=32, max_precise=3, pmu_name=icelake
+
+	HEADER_CLOCK_DATA = 29,
+
+	Contains clock id and its reference time together with wall clock
+	time taken at the 'same time', both values are in nanoseconds.
+	The format of data is as below.
+
+struct {
+	u32 version;  /* version = 1 */
+	u32 clockid;
+	u64 wall_clock_ns;
+	u64 clockid_time_ns;
+};
+
+	HEADER_HYBRID_TOPOLOGY = 30,
+
+Indicate the hybrid CPUs. The format of data is as below.
+
+struct {
+	u32 nr;
+	struct {
+		char pmu_name[];
+		char cpus[];
+	} [nr]; /* Variable length records */
+};
+
+Example:
+  hybrid cpu system:
+  cpu_core cpu list : 0-15
+  cpu_atom cpu list : 16-23
+
+	HEADER_PMU_CAPS = 31,
+
+	List of pmu capabilities (except cpu pmu which is already
+	covered by HEADER_CPU_PMU_CAPS). Note that hybrid cpu pmu
+	capabilities are also stored here.
+
+struct {
+	u32 nr_pmu;
+	struct {
+		u32 nr_caps;
+		{
+			char	name[];
+			char	value[];
+		} [nr_caps];
+		char pmu_name[];
+	} [nr_pmu];
+};
+
+	other bits are reserved and should ignored for now
+	HEADER_FEAT_BITS	= 256,
+
+Attributes
+
+This is an array of perf_event_attrs, each attr_size bytes long, which defines
+each event collected. See perf_event.h or the man page for a detailed
+description.
+
+Data
+
+This section is the bulk of the file. It consist of a stream of perf_events
+describing events. This matches the format generated by the kernel.
+See perf_event.h or the manpage for a detailed description.
+
+Some notes on parsing:
+
+Ordering
+
+The events are not necessarily in time stamp order, as they can be
+collected in parallel on different CPUs. If the events should be
+processed in time order they need to be sorted first. It is possible
+to only do a partial sort using the FINISHED_ROUND event header (see
+below). perf record guarantees that there is no reordering over a
+FINISHED_ROUND.
+
+ID vs IDENTIFIER
+
+When the event stream contains multiple events each event is identified
+by an ID. This can be either through the PERF_SAMPLE_ID or the
+PERF_SAMPLE_IDENTIFIER header. The PERF_SAMPLE_IDENTIFIER header is
+at a fixed offset from the event header, which allows reliable
+parsing of the header. Relying on ID may be ambiguous.
+IDENTIFIER is only supported by newer Linux kernels.
+
+Perf record specific events:
+
+In addition to the kernel generated event types perf record adds its
+own event types (in addition it also synthesizes some kernel events,
+for example MMAP events)
+
+	PERF_RECORD_USER_TYPE_START		= 64,
+	PERF_RECORD_HEADER_ATTR			= 64,
+
+struct attr_event {
+	struct perf_event_header header;
+	struct perf_event_attr attr;
+	uint64_t id[];
+};
+
+	PERF_RECORD_HEADER_EVENT_TYPE		= 65, /* deprecated */
+
+#define MAX_EVENT_NAME 64
+
+struct perf_trace_event_type {
+	uint64_t	event_id;
+	char	name[MAX_EVENT_NAME];
+};
+
+struct event_type_event {
+	struct perf_event_header header;
+	struct perf_trace_event_type event_type;
+};
+
+
+	PERF_RECORD_HEADER_TRACING_DATA		= 66,
+
+Describe me
+
+struct tracing_data_event {
+	struct perf_event_header header;
+	uint32_t size;
+};
+
+	PERF_RECORD_HEADER_BUILD_ID		= 67,
+
+Define a ELF build ID for a referenced executable.
+
+       struct build_id_event;   /* See above */
+
+	PERF_RECORD_FINISHED_ROUND		= 68,
+
+No event reordering over this header. No payload.
+
+	PERF_RECORD_ID_INDEX			= 69,
+
+Map event ids to CPUs and TIDs.
+
+struct id_index_entry {
+	uint64_t id;
+	uint64_t idx;
+	uint64_t cpu;
+	uint64_t tid;
+};
+
+struct id_index_event {
+	struct perf_event_header header;
+	uint64_t nr;
+	struct id_index_entry entries[nr];
+};
+
+	PERF_RECORD_AUXTRACE_INFO		= 70,
+
+Auxtrace type specific information. Describe me
+
+struct auxtrace_info_event {
+	struct perf_event_header header;
+	uint32_t type;
+	uint32_t reserved__; /* For alignment */
+	uint64_t priv[];
+};
+
+	PERF_RECORD_AUXTRACE			= 71,
+
+Defines auxtrace data. Followed by the actual data. The contents of
+the auxtrace data is dependent on the event and the CPU. For example
+for Intel Processor Trace it contains Processor Trace data generated
+by the CPU.
+
+struct auxtrace_event {
+	struct perf_event_header header;
+	uint64_t size;
+	uint64_t offset;
+	uint64_t reference;
+	uint32_t idx;
+	uint32_t tid;
+	uint32_t cpu;
+	uint32_t reserved__; /* For alignment */
+};
+
+struct aux_event {
+	struct perf_event_header header;
+	uint64_t	aux_offset;
+	uint64_t	aux_size;
+	uint64_t	flags;
+};
+
+	PERF_RECORD_AUXTRACE_ERROR		= 72,
+
+Describes an error in hardware tracing
+
+enum auxtrace_error_type {
+	PERF_AUXTRACE_ERROR_ITRACE  = 1,
+	PERF_AUXTRACE_ERROR_MAX
+};
+
+#define MAX_AUXTRACE_ERROR_MSG 64
+
+struct auxtrace_error_event {
+	struct perf_event_header header;
+	uint32_t type;
+	uint32_t code;
+	uint32_t cpu;
+	uint32_t pid;
+	uint32_t tid;
+	uint32_t reserved__; /* For alignment */
+	uint64_t ip;
+	char msg[MAX_AUXTRACE_ERROR_MSG];
+};
+
+	PERF_RECORD_HEADER_FEATURE		= 80,
+
+Describes a header feature. These are records used in pipe-mode that
+contain information that otherwise would be in perf.data file's header.
+
+	PERF_RECORD_COMPRESSED 			= 81,
+
+struct compressed_event {
+	struct perf_event_header	header;
+	char				data[];
+};
+
+	PERF_RECORD_FINISHED_INIT			= 82,
+
+Marks the end of records for the system, pre-existing threads in system wide
+sessions, etc. Those are the ones prefixed PERF_RECORD_USER_*.
+
+This is used, for instance, to 'perf inject' events after init and before
+regular events, those emitted by the kernel, to support combining guest and
+host records.
+
+
+The header is followed by compressed data frame that can be decompressed
+into array of perf trace records. The size of the entire compressed event
+record including the header is limited by the max value of header.size.
+
+Event types
+
+Define the event attributes with their IDs.
+
+An array bound by the perf_file_section size.
+
+	struct {
+		struct perf_event_attr attr;   /* Size defined by header.attr_size */
+		struct perf_file_section ids;
+	}
+
+ids points to a array of uint64_t defining the ids for event attr attr.
+
+Pipe-mode data
+
+Pipe-mode avoid seeks in the file by removing the perf_file_section and flags
+from the struct perf_header. The trimmed header is:
+
+struct perf_pipe_file_header {
+	u64				magic;
+	u64				size;
+};
+
+The information about attrs, data, and event_types is instead in the
+synthesized events PERF_RECORD_ATTR, PERF_RECORD_HEADER_TRACING_DATA,
+PERF_RECORD_HEADER_EVENT_TYPE, and PERF_RECORD_HEADER_FEATURE
+that are generated by perf record in pipe-mode.
+
+
+References:
+
+include/uapi/linux/perf_event.h
+
+This is the canonical description of the kernel generated perf_events
+and the perf_event_attrs.
+
+perf_events manpage
+
+A manpage describing perf_event and perf_event_attr is here:
+http://web.eece.maine.edu/~vweaver/projects/perf_events/programming.html
+This tends to be slightly behind the kernel include, but has better
+descriptions.  An (typically older) version of the man page may be
+included with the standard Linux man pages, available with "man
+perf_events"
+
+pmu-tools
+
+https://github.com/andikleen/pmu-tools/tree/master/parser
+
+A definition of the perf.data format in python "construct" format is available
+in pmu-tools parser. This allows to read perf.data from python and dump it.
+
+quipper
+
+The quipper C++ parser is available at
+http://github.com/google/perf_data_converter/tree/master/src/quipper
+
diff --git a/tools/perf/Documentation/perf.txt b/tools/perf/Documentation/perf.txt
new file mode 100644
index 000000000..ba3df49c1
--- /dev/null
+++ b/tools/perf/Documentation/perf.txt
@@ -0,0 +1,90 @@
+perf(1)
+=======
+
+NAME
+----
+perf - Performance analysis tools for Linux
+
+SYNOPSIS
+--------
+[verse]
+'perf' [--version] [--help] [OPTIONS] COMMAND [ARGS]
+
+OPTIONS
+-------
+-h::
+--help::
+        Run perf help command.
+
+-v::
+--version::
+        Display perf version.
+
+-vv::
+        Print the compiled-in status of libraries.
+
+--exec-path::
+        Display or set exec path.
+
+--html-path::
+        Display html documentation path.
+
+-p::
+--paginate::
+        Set up pager.
+
+--no-pager::
+        Do not set pager.
+
+--buildid-dir::
+        Setup buildid cache directory. It has higher priority
+        than buildid.dir config file option.
+
+--list-cmds::
+        List the most commonly used perf commands.
+
+--list-opts::
+        List available perf options.
+
+--debugfs-dir::
+        Set debugfs directory or set environment variable PERF_DEBUGFS_DIR.
+
+--debug::
+        Setup debug variable (see list below) in value
+        range (0, 10). Use like:
+          --debug verbose   # sets verbose = 1
+          --debug verbose=2 # sets verbose = 2
+
+        List of debug variables allowed to set:
+          verbose          - general debug messages
+          ordered-events   - ordered events object debug messages
+          data-convert     - data convert command debug messages
+          stderr           - write debug output (option -v) to stderr
+                             in browser mode
+          perf-event-open  - Print perf_event_open() arguments and
+                             return value
+
+DESCRIPTION
+-----------
+Performance counters for Linux are a new kernel-based subsystem
+that provide a framework for all things performance analysis. It
+covers hardware level (CPU/PMU, Performance Monitoring Unit) features
+and software features (software counters, tracepoints) as well.
+
+SEE ALSO
+--------
+linkperf:perf-stat[1], linkperf:perf-top[1],
+linkperf:perf-record[1], linkperf:perf-report[1],
+linkperf:perf-list[1]
+
+linkperf:perf-annotate[1],linkperf:perf-archive[1],linkperf:perf-arm-spe[1],
+linkperf:perf-bench[1], linkperf:perf-buildid-cache[1],
+linkperf:perf-buildid-list[1], linkperf:perf-c2c[1],
+linkperf:perf-config[1], linkperf:perf-data[1], linkperf:perf-diff[1],
+linkperf:perf-evlist[1], linkperf:perf-ftrace[1],
+linkperf:perf-help[1], linkperf:perf-inject[1],
+linkperf:perf-intel-pt[1], linkperf:perf-iostat[1], linkperf:perf-kallsyms[1],
+linkperf:perf-kmem[1], linkperf:perf-kvm[1], linkperf:perf-lock[1],
+linkperf:perf-mem[1], linkperf:perf-probe[1], linkperf:perf-sched[1],
+linkperf:perf-script[1], linkperf:perf-test[1],
+linkperf:perf-trace[1], linkperf:perf-version[1]
diff --git a/tools/perf/Documentation/perfconfig.example b/tools/perf/Documentation/perfconfig.example
new file mode 100644
index 000000000..2b477c1d1
--- /dev/null
+++ b/tools/perf/Documentation/perfconfig.example
@@ -0,0 +1,38 @@
+[colors]
+
+	# These were the old defaults
+	top = red, lightgray
+	medium = green, lightgray
+	normal = black, lightgray
+	selected = lightgray, magenta
+	jump_arrows = blue, lightgray
+	addr = magenta, lightgray
+
+[tui]
+
+	# Defaults if linked with libslang
+	report = on
+	annotate = on
+	top = on
+
+[buildid]
+
+	# Default, disable using /dev/null
+	dir = /root/.debug
+
+[annotate]
+
+	# Defaults
+	hide_src_code = false
+	use_offset = true
+	jump_arrows = true
+	show_nr_jumps = false
+
+[report]
+
+	# Defaults
+	sort-order = comm,dso,symbol
+	percent-limit = 0
+	queue-size = 0
+	children = true
+	group = true
diff --git a/tools/perf/Documentation/security.txt b/tools/perf/Documentation/security.txt
new file mode 100644
index 000000000..4fe3b8b19
--- /dev/null
+++ b/tools/perf/Documentation/security.txt
@@ -0,0 +1,237 @@
+Overview
+========
+
+For general security related questions of perf_event_open() syscall usage,
+performance monitoring and observability operations by Perf see here:
+https://www.kernel.org/doc/html/latest/admin-guide/perf-security.html
+
+Enabling LSM based mandatory access control (MAC) to perf_event_open() syscall
+==============================================================================
+
+LSM hooks for mandatory access control for perf_event_open() syscall can be
+used starting from Linux v5.3. Below are the steps to extend Fedora (v31) with
+Targeted policy with perf_event_open() access control capabilities:
+
+1. Download selinux-policy SRPM package (e.g. selinux-policy-3.14.4-48.fc31.src.rpm on FC31)
+   and install it so rpmbuild directory would exist in the current working directory:
+
+   # rpm -Uhv selinux-policy-3.14.4-48.fc31.src.rpm
+
+2. Get into rpmbuild/SPECS directory and unpack the source code:
+
+   # rpmbuild -bp selinux-policy.spec
+
+3. Place patch below at rpmbuild/BUILD/selinux-policy-b86eaaf4dbcf2d51dd4432df7185c0eaf3cbcc02
+   directory and apply it:
+
+   # patch -p1 < selinux-policy-perf-events-perfmon.patch
+   patching file policy/flask/access_vectors
+   patching file policy/flask/security_classes
+   # cat selinux-policy-perf-events-perfmon.patch
+diff -Nura a/policy/flask/access_vectors b/policy/flask/access_vectors
+--- a/policy/flask/access_vectors	2020-02-04 18:19:53.000000000 +0300
++++ b/policy/flask/access_vectors	2020-02-28 23:37:25.000000000 +0300
+@@ -174,6 +174,7 @@
+ 	wake_alarm
+ 	block_suspend
+ 	audit_read
++	perfmon
+ }
+ 
+ #
+@@ -1099,3 +1100,15 @@
+ 
+ class xdp_socket
+ inherits socket
++
++class perf_event
++{
++	open
++	cpu
++	kernel
++	tracepoint
++	read
++	write
++}
++
++
+diff -Nura a/policy/flask/security_classes b/policy/flask/security_classes
+--- a/policy/flask/security_classes	2020-02-04 18:19:53.000000000 +0300
++++ b/policy/flask/security_classes	2020-02-28 21:35:17.000000000 +0300
+@@ -200,4 +200,6 @@
+ 
+ class xdp_socket
+ 
++class perf_event
++
+ # FLASK
+
+4. Get into rpmbuild/SPECS directory and build policy packages from patched sources:
+
+   # rpmbuild --noclean --noprep -ba selinux-policy.spec
+
+   so you have this:
+
+   # ls -alh rpmbuild/RPMS/noarch/
+   total 33M
+   drwxr-xr-x. 2 root root 4.0K Mar 20 12:16 .
+   drwxr-xr-x. 3 root root 4.0K Mar 20 12:16 ..
+   -rw-r--r--. 1 root root 112K Mar 20 12:16 selinux-policy-3.14.4-48.fc31.noarch.rpm
+   -rw-r--r--. 1 root root 1.2M Mar 20 12:17 selinux-policy-devel-3.14.4-48.fc31.noarch.rpm
+   -rw-r--r--. 1 root root 2.3M Mar 20 12:17 selinux-policy-doc-3.14.4-48.fc31.noarch.rpm
+   -rw-r--r--. 1 root root  12M Mar 20 12:17 selinux-policy-minimum-3.14.4-48.fc31.noarch.rpm
+   -rw-r--r--. 1 root root 4.5M Mar 20 12:16 selinux-policy-mls-3.14.4-48.fc31.noarch.rpm
+   -rw-r--r--. 1 root root 111K Mar 20 12:16 selinux-policy-sandbox-3.14.4-48.fc31.noarch.rpm
+   -rw-r--r--. 1 root root  14M Mar 20 12:17 selinux-policy-targeted-3.14.4-48.fc31.noarch.rpm
+
+5. Install SELinux packages from Fedora repo, if not already done so, and
+   update with the patched rpms above:
+
+   # rpm -Uhv rpmbuild/RPMS/noarch/selinux-policy-*
+
+6. Enable SELinux Permissive mode for Targeted policy, if not already done so:
+
+   # cat /etc/selinux/config
+
+   # This file controls the state of SELinux on the system.
+   # SELINUX= can take one of these three values:
+   #     enforcing - SELinux security policy is enforced.
+   #     permissive - SELinux prints warnings instead of enforcing.
+   #     disabled - No SELinux policy is loaded.
+   SELINUX=permissive
+   # SELINUXTYPE= can take one of these three values:
+   #     targeted - Targeted processes are protected,
+   #     minimum - Modification of targeted policy. Only selected processes are protected.
+   #     mls - Multi Level Security protection.
+   SELINUXTYPE=targeted
+
+7. Enable filesystem SELinux labeling at the next reboot:
+
+   # touch /.autorelabel
+
+8. Reboot machine and it will label filesystems and load Targeted policy into the kernel;
+
+9. Login and check that dmesg output doesn't mention that perf_event class is unknown to SELinux subsystem;
+
+10. Check that SELinux is enabled and in Permissive mode
+
+    # getenforce
+    Permissive
+
+11. Turn SELinux into Enforcing mode:
+
+    # setenforce 1
+    # getenforce
+    Enforcing
+
+Opening access to perf_event_open() syscall on Fedora with SELinux
+==================================================================
+
+Access to performance monitoring and observability operations by Perf
+can be limited for superuser or CAP_PERFMON or CAP_SYS_ADMIN privileged
+processes. MAC policy settings (e.g. SELinux) can be loaded into the kernel
+and prevent unauthorized access to perf_event_open() syscall. In such case
+Perf tool provides a message similar to the one below:
+
+   # perf stat
+   Error:
+   Access to performance monitoring and observability operations is limited.
+   Enforced MAC policy settings (SELinux) can limit access to performance
+   monitoring and observability operations. Inspect system audit records for
+   more perf_event access control information and adjusting the policy.
+   Consider adjusting /proc/sys/kernel/perf_event_paranoid setting to open
+   access to performance monitoring and observability operations for users
+   without CAP_PERFMON or CAP_SYS_ADMIN Linux capability.
+   perf_event_paranoid setting is -1:
+     -1: Allow use of (almost) all events by all users
+         Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK
+   >= 0: Disallow raw and ftrace function tracepoint access
+   >= 1: Disallow CPU event access
+   >= 2: Disallow kernel profiling
+   To make the adjusted perf_event_paranoid setting permanent preserve it
+   in /etc/sysctl.conf (e.g. kernel.perf_event_paranoid = <setting>)
+
+To make sure that access is limited by MAC policy settings inspect system
+audit records using journalctl command or /var/log/audit/audit.log so the
+output would contain AVC denied records related to perf_event:
+
+   # journalctl --reverse --no-pager | grep perf_event
+
+   python3[1318099]: SELinux is preventing perf from open access on the perf_event labeled unconfined_t.
+                                         If you believe that perf should be allowed open access on perf_event labeled unconfined_t by default.
+   setroubleshoot[1318099]: SELinux is preventing perf from open access on the perf_event labeled unconfined_t. For complete SELinux messages run: sealert -l 4595ce5b-e58f-462c-9d86-3bc2074935de
+   audit[1318098]: AVC avc:  denied  { open } for  pid=1318098 comm="perf" scontext=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 tcontext=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 tclass=perf_event permissive=0
+
+In order to open access to perf_event_open() syscall MAC policy settings can
+require to be extended. On SELinux system this can be done by loading a special
+policy module extending base policy settings. Perf related policy module can
+be generated using the system audit records about blocking perf_event access.
+Run the command below to generate my-perf.te policy extension file with
+perf_event related rules:
+
+   # ausearch -c 'perf' --raw | audit2allow -M my-perf && cat my-perf.te
+
+   module my-perf 1.0;
+
+   require {
+        type unconfined_t;
+        class perf_event { cpu kernel open read tracepoint write };
+   }
+
+   #============= unconfined_t ==============
+   allow unconfined_t self:perf_event { cpu kernel open read tracepoint write };
+
+Now compile, pack and load my-perf.pp extension module into the kernel:
+
+   # checkmodule -M -m -o my-perf.mod my-perf.te
+   # semodule_package -o my-perf.pp -m my-perf.mod
+   # semodule -X 300 -i my-perf.pp
+
+After all those taken steps above access to perf_event_open() syscall should
+now be allowed by the policy settings. Check access running Perf like this:
+
+   # perf stat
+   ^C
+   Performance counter stats for 'system wide':
+
+         36,387.41 msec cpu-clock                 #    7.999 CPUs utilized
+             2,629      context-switches          #    0.072 K/sec
+                57      cpu-migrations            #    0.002 K/sec
+                 1      page-faults               #    0.000 K/sec
+       263,721,559      cycles                    #    0.007 GHz
+       175,746,713      instructions              #    0.67  insn per cycle
+        19,628,798      branches                  #    0.539 M/sec
+         1,259,201      branch-misses             #    6.42% of all branches
+
+       4.549061439 seconds time elapsed
+
+The generated perf-event.pp related policy extension module can be removed
+from the kernel using this command:
+
+   # semodule -X 300 -r my-perf
+
+Alternatively the module can be temporarily disabled and enabled back using
+these two commands:
+
+   # semodule -d my-perf
+   # semodule -e my-perf
+
+If something went wrong
+=======================
+
+To turn SELinux into Permissive mode:
+   # setenforce 0
+
+To fully disable SELinux during kernel boot [3] set kernel command line parameter selinux=0
+
+To remove SELinux labeling from local filesystems:
+   # find / -mount -print0 | xargs -0 setfattr -h -x security.selinux
+
+To fully turn SELinux off a machine set SELINUX=disabled at /etc/selinux/config file and reboot;
+
+Links
+=====
+
+[1] https://download-ib01.fedoraproject.org/pub/fedora/linux/updates/31/Everything/SRPMS/Packages/s/selinux-policy-3.14.4-49.fc31.src.rpm
+[2] https://docs.fedoraproject.org/en-US/Fedora/11/html/Security-Enhanced_Linux/sect-Security-Enhanced_Linux-Working_with_SELinux-Enabling_and_Disabling_SELinux.html
+[3] https://danwalsh.livejournal.com/10972.html
diff --git a/tools/perf/Documentation/tips.txt b/tools/perf/Documentation/tips.txt
new file mode 100644
index 000000000..825745a64
--- /dev/null
+++ b/tools/perf/Documentation/tips.txt
@@ -0,0 +1,43 @@
+For a higher level overview, try: perf report --sort comm,dso
+Sample related events with: perf record -e '{cycles,instructions}:S'
+Compare performance results with: perf diff [<old file> <new file>]
+Boolean options have negative forms, e.g.: perf report --no-children
+Customize output of perf script with: perf script -F event,ip,sym
+Generate a script for your data: perf script -g <lang>
+Save output of perf stat using: perf stat record <target workload>
+Create an archive with symtabs to analyse on other machine: perf archive
+Search options using a keyword: perf report -h <keyword>
+Use parent filter to see specific call path: perf report -p <regex>
+List events using substring match: perf list <keyword>
+To see list of saved events and attributes: perf evlist -v
+Use --symfs <dir> if your symbol files are in non-standard locations
+To see callchains in a more compact form: perf report -g folded
+Show individual samples with: perf script
+Limit to show entries above 5% only: perf report --percent-limit 5
+Profiling branch (mis)predictions with: perf record -b / perf report
+To show assembler sample contexts use perf record -b / perf script -F +brstackinsn --xed
+Treat branches as callchains: perf report --branch-history
+To count events in every 1000 msec: perf stat -I 1000
+Print event counts in CSV format with: perf stat -x,
+If you have debuginfo enabled, try: perf report -s sym,srcline
+For memory address profiling, try: perf mem record / perf mem report
+For tracepoint events, try: perf report -s trace_fields
+To record callchains for each sample: perf record -g
+To record every process run by a user: perf record -u <user>
+Skip collecting build-id when recording: perf record -B
+To change sampling frequency to 100 Hz: perf record -F 100
+See assembly instructions with percentage: perf annotate <symbol>
+If you prefer Intel style assembly, try: perf annotate -M intel
+For hierarchical output, try: perf report --hierarchy
+Order by the overhead of source file name and line number: perf report -s srcline
+System-wide collection from all CPUs: perf record -a
+Show current config key-value pairs: perf config --list
+Show user configuration overrides: perf config --user --list
+To add Node.js USDT(User-Level Statically Defined Tracing): perf buildid-cache --add `which node`
+To report cacheline events from previous recording: perf c2c report
+To browse sample contexts use perf report --sample 10 and select in context menu
+To separate samples by time use perf report --sort time,overhead,sym
+To set sample time separation other than 100ms with --sort time use --time-quantum
+Add -I to perf record to sample register values, which will be visible in perf report sample context.
+To show IPC for sampling periods use perf record -e '{cycles,instructions}:S' and then browse context
+To show context switches in perf report sample context add --switch-events to perf record.
diff --git a/tools/perf/Documentation/topdown.txt b/tools/perf/Documentation/topdown.txt
new file mode 100644
index 000000000..a15b93fdc
--- /dev/null
+++ b/tools/perf/Documentation/topdown.txt
@@ -0,0 +1,344 @@
+Using TopDown metrics in user space
+-----------------------------------
+
+Intel CPUs (since Sandy Bridge and Silvermont) support a TopDown
+methodology to break down CPU pipeline execution into 4 bottlenecks:
+frontend bound, backend bound, bad speculation, retiring.
+
+For more details on Topdown see [1][5]
+
+Traditionally this was implemented by events in generic counters
+and specific formulas to compute the bottlenecks.
+
+perf stat --topdown implements this.
+
+Full Top Down includes more levels that can break down the
+bottlenecks further. This is not directly implemented in perf,
+but available in other tools that can run on top of perf,
+such as toplev[2] or vtune[3]
+
+New Topdown features in Ice Lake
+===============================
+
+With Ice Lake CPUs the TopDown metrics are directly available as
+fixed counters and do not require generic counters. This allows
+to collect TopDown always in addition to other events.
+
+% perf stat -a --topdown -I1000
+#           time             retiring      bad speculation       frontend bound        backend bound
+     1.001281330                23.0%                15.3%                29.6%                32.1%
+     2.003009005                 5.0%                 6.8%                46.6%                41.6%
+     3.004646182                 6.7%                 6.7%                46.0%                40.6%
+     4.006326375                 5.0%                 6.4%                47.6%                41.0%
+     5.007991804                 5.1%                 6.3%                46.3%                42.3%
+     6.009626773                 6.2%                 7.1%                47.3%                39.3%
+     7.011296356                 4.7%                 6.7%                46.2%                42.4%
+     8.012951831                 4.7%                 6.7%                47.5%                41.1%
+...
+
+This also enables measuring TopDown per thread/process instead
+of only per core.
+
+Using TopDown through RDPMC in applications on Ice Lake
+======================================================
+
+For more fine grained measurements it can be useful to
+access the new  directly from user space. This is more complicated,
+but drastically lowers overhead.
+
+On Ice Lake, there is a new fixed counter 3: SLOTS, which reports
+"pipeline SLOTS" (cycles multiplied by core issue width) and a
+metric register that reports slots ratios for the different bottleneck
+categories.
+
+The metrics counter is CPU model specific and is not available on older
+CPUs.
+
+Example code
+============
+
+Library functions to do the functionality described below
+is also available in libjevents [4]
+
+The application opens a group with fixed counter 3 (SLOTS) and any
+metric event, and allow user programs to read the performance counters.
+
+Fixed counter 3 is mapped to a pseudo event event=0x00, umask=04,
+so the perf_event_attr structure should be initialized with
+{ .config = 0x0400, .type = PERF_TYPE_RAW }
+The metric events are mapped to the pseudo event event=0x00, umask=0x8X.
+For example, the perf_event_attr structure can be initialized with
+{ .config = 0x8000, .type = PERF_TYPE_RAW } for Retiring metric event
+The Fixed counter 3 must be the leader of the group.
+
+#include <linux/perf_event.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+/* Provide own perf_event_open stub because glibc doesn't */
+__attribute__((weak))
+int perf_event_open(struct perf_event_attr *attr, pid_t pid,
+		    int cpu, int group_fd, unsigned long flags)
+{
+	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
+}
+
+/* Open slots counter file descriptor for current task. */
+struct perf_event_attr slots = {
+	.type = PERF_TYPE_RAW,
+	.size = sizeof(struct perf_event_attr),
+	.config = 0x400,
+	.exclude_kernel = 1,
+};
+
+int slots_fd = perf_event_open(&slots, 0, -1, -1, 0);
+if (slots_fd < 0)
+	... error ...
+
+/* Memory mapping the fd permits _rdpmc calls from userspace */
+void *slots_p = mmap(0, getpagesize(), PROT_READ, MAP_SHARED, slots_fd, 0);
+if (!slot_p)
+	.... error ...
+
+/*
+ * Open metrics event file descriptor for current task.
+ * Set slots event as the leader of the group.
+ */
+struct perf_event_attr metrics = {
+	.type = PERF_TYPE_RAW,
+	.size = sizeof(struct perf_event_attr),
+	.config = 0x8000,
+	.exclude_kernel = 1,
+};
+
+int metrics_fd = perf_event_open(&metrics, 0, -1, slots_fd, 0);
+if (metrics_fd < 0)
+	... error ...
+
+/* Memory mapping the fd permits _rdpmc calls from userspace */
+void *metrics_p = mmap(0, getpagesize(), PROT_READ, MAP_SHARED, metrics_fd, 0);
+if (!metrics_p)
+	... error ...
+
+Note: the file descriptors returned by the perf_event_open calls must be memory
+mapped to permit calls to the _rdpmd instruction. Permission may also be granted
+by writing the /sys/devices/cpu/rdpmc sysfs node.
+
+The RDPMC instruction (or _rdpmc compiler intrinsic) can now be used
+to read slots and the topdown metrics at different points of the program:
+
+#include <stdint.h>
+#include <x86intrin.h>
+
+#define RDPMC_FIXED	(1 << 30)	/* return fixed counters */
+#define RDPMC_METRIC	(1 << 29)	/* return metric counters */
+
+#define FIXED_COUNTER_SLOTS		3
+#define METRIC_COUNTER_TOPDOWN_L1_L2	0
+
+static inline uint64_t read_slots(void)
+{
+	return _rdpmc(RDPMC_FIXED | FIXED_COUNTER_SLOTS);
+}
+
+static inline uint64_t read_metrics(void)
+{
+	return _rdpmc(RDPMC_METRIC | METRIC_COUNTER_TOPDOWN_L1_L2);
+}
+
+Then the program can be instrumented to read these metrics at different
+points.
+
+It's not a good idea to do this with too short code regions,
+as the parallelism and overlap in the CPU program execution will
+cause too much measurement inaccuracy. For example instrumenting
+individual basic blocks is definitely too fine grained.
+
+_rdpmc calls should not be mixed with reading the metrics and slots counters
+through system calls, as the kernel will reset these counters after each system
+call.
+
+Decoding metrics values
+=======================
+
+The value reported by read_metrics() contains four 8 bit fields
+that represent a scaled ratio that represent the Level 1 bottleneck.
+All four fields add up to 0xff (= 100%)
+
+The binary ratios in the metric value can be converted to float ratios:
+
+#define GET_METRIC(m, i) (((m) >> (i*8)) & 0xff)
+
+/* L1 Topdown metric events */
+#define TOPDOWN_RETIRING(val)	((float)GET_METRIC(val, 0) / 0xff)
+#define TOPDOWN_BAD_SPEC(val)	((float)GET_METRIC(val, 1) / 0xff)
+#define TOPDOWN_FE_BOUND(val)	((float)GET_METRIC(val, 2) / 0xff)
+#define TOPDOWN_BE_BOUND(val)	((float)GET_METRIC(val, 3) / 0xff)
+
+/*
+ * L2 Topdown metric events.
+ * Available on Sapphire Rapids and later platforms.
+ */
+#define TOPDOWN_HEAVY_OPS(val)		((float)GET_METRIC(val, 4) / 0xff)
+#define TOPDOWN_BR_MISPREDICT(val)	((float)GET_METRIC(val, 5) / 0xff)
+#define TOPDOWN_FETCH_LAT(val)		((float)GET_METRIC(val, 6) / 0xff)
+#define TOPDOWN_MEM_BOUND(val)		((float)GET_METRIC(val, 7) / 0xff)
+
+and then converted to percent for printing.
+
+The ratios in the metric accumulate for the time when the counter
+is enabled. For measuring programs it is often useful to measure
+specific sections. For this it is needed to deltas on metrics.
+
+This can be done by scaling the metrics with the slots counter
+read at the same time.
+
+Then it's possible to take deltas of these slots counts
+measured at different points, and determine the metrics
+for that time period.
+
+	slots_a = read_slots();
+	metric_a = read_metrics();
+
+	... larger code region ...
+
+	slots_b = read_slots()
+	metric_b = read_metrics()
+
+	# compute scaled metrics for measurement a
+	retiring_slots_a = GET_METRIC(metric_a, 0) * slots_a
+	bad_spec_slots_a = GET_METRIC(metric_a, 1) * slots_a
+	fe_bound_slots_a = GET_METRIC(metric_a, 2) * slots_a
+	be_bound_slots_a = GET_METRIC(metric_a, 3) * slots_a
+
+	# compute delta scaled metrics between b and a
+	retiring_slots = GET_METRIC(metric_b, 0) * slots_b - retiring_slots_a
+	bad_spec_slots = GET_METRIC(metric_b, 1) * slots_b - bad_spec_slots_a
+	fe_bound_slots = GET_METRIC(metric_b, 2) * slots_b - fe_bound_slots_a
+	be_bound_slots = GET_METRIC(metric_b, 3) * slots_b - be_bound_slots_a
+
+Later the individual ratios of L1 metric events for the measurement period can
+be recreated from these counts.
+
+	slots_delta = slots_b - slots_a
+	retiring_ratio = (float)retiring_slots / slots_delta
+	bad_spec_ratio = (float)bad_spec_slots / slots_delta
+	fe_bound_ratio = (float)fe_bound_slots / slots_delta
+	be_bound_ratio = (float)be_bound_slots / slota_delta
+
+	printf("Retiring %.2f%% Bad Speculation %.2f%% FE Bound %.2f%% BE Bound %.2f%%\n",
+		retiring_ratio * 100.,
+		bad_spec_ratio * 100.,
+		fe_bound_ratio * 100.,
+		be_bound_ratio * 100.);
+
+The individual ratios of L2 metric events for the measurement period can be
+recreated from L1 and L2 metric counters. (Available on Sapphire Rapids and
+later platforms)
+
+	# compute scaled metrics for measurement a
+	heavy_ops_slots_a = GET_METRIC(metric_a, 4) * slots_a
+	br_mispredict_slots_a = GET_METRIC(metric_a, 5) * slots_a
+	fetch_lat_slots_a = GET_METRIC(metric_a, 6) * slots_a
+	mem_bound_slots_a = GET_METRIC(metric_a, 7) * slots_a
+
+	# compute delta scaled metrics between b and a
+	heavy_ops_slots = GET_METRIC(metric_b, 4) * slots_b - heavy_ops_slots_a
+	br_mispredict_slots = GET_METRIC(metric_b, 5) * slots_b - br_mispredict_slots_a
+	fetch_lat_slots = GET_METRIC(metric_b, 6) * slots_b - fetch_lat_slots_a
+	mem_bound_slots = GET_METRIC(metric_b, 7) * slots_b - mem_bound_slots_a
+
+	slots_delta = slots_b - slots_a
+	heavy_ops_ratio = (float)heavy_ops_slots / slots_delta
+	light_ops_ratio = retiring_ratio - heavy_ops_ratio;
+
+	br_mispredict_ratio = (float)br_mispredict_slots / slots_delta
+	machine_clears_ratio = bad_spec_ratio - br_mispredict_ratio;
+
+	fetch_lat_ratio = (float)fetch_lat_slots / slots_delta
+	fetch_bw_ratio = fe_bound_ratio - fetch_lat_ratio;
+
+	mem_bound_ratio = (float)mem_bound_slots / slota_delta
+	core_bound_ratio = be_bound_ratio - mem_bound_ratio;
+
+	printf("Heavy Operations %.2f%% Light Operations %.2f%% "
+	       "Branch Mispredict %.2f%% Machine Clears %.2f%% "
+	       "Fetch Latency %.2f%% Fetch Bandwidth %.2f%% "
+	       "Mem Bound %.2f%% Core Bound %.2f%%\n",
+		heavy_ops_ratio * 100.,
+		light_ops_ratio * 100.,
+		br_mispredict_ratio * 100.,
+		machine_clears_ratio * 100.,
+		fetch_lat_ratio * 100.,
+		fetch_bw_ratio * 100.,
+		mem_bound_ratio * 100.,
+		core_bound_ratio * 100.);
+
+Resetting metrics counters
+==========================
+
+Since the individual metrics are only 8bit they lose precision for
+short regions over time because the number of cycles covered by each
+fraction bit shrinks. So the counters need to be reset regularly.
+
+When using the kernel perf API the kernel resets on every read.
+So as long as the reading is at reasonable intervals (every few
+seconds) the precision is good.
+
+When using perf stat it is recommended to always use the -I option,
+with no longer interval than a few seconds
+
+	perf stat -I 1000 --topdown ...
+
+For user programs using RDPMC directly the counter can
+be reset explicitly using ioctl:
+
+	ioctl(perf_fd, PERF_EVENT_IOC_RESET, 0);
+
+This "opens" a new measurement period.
+
+A program using RDPMC for TopDown should schedule such a reset
+regularly, as in every few seconds.
+
+Limits on Ice Lake
+==================
+
+Four pseudo TopDown metric events are exposed for the end-users,
+topdown-retiring, topdown-bad-spec, topdown-fe-bound and topdown-be-bound.
+They can be used to collect the TopDown value under the following
+rules:
+- All the TopDown metric events must be in a group with the SLOTS event.
+- The SLOTS event must be the leader of the group.
+- The PERF_FORMAT_GROUP flag must be applied for each TopDown metric
+  events
+
+The SLOTS event and the TopDown metric events can be counting members of
+a sampling read group. Since the SLOTS event must be the leader of a TopDown
+group, the second event of the group is the sampling event.
+For example, perf record -e '{slots, $sampling_event, topdown-retiring}:S'
+
+Extension on Sapphire Rapids Server
+===================================
+The metrics counter is extended to support TMA method level 2 metrics.
+The lower half of the register is the TMA level 1 metrics (legacy).
+The upper half is also divided into four 8-bit fields for the new level 2
+metrics. Four more TopDown metric events are exposed for the end-users,
+topdown-heavy-ops, topdown-br-mispredict, topdown-fetch-lat and
+topdown-mem-bound.
+
+Each of the new level 2 metrics in the upper half is a subset of the
+corresponding level 1 metric in the lower half. Software can deduce the
+other four level 2 metrics by subtracting corresponding metrics as below.
+
+    Light_Operations = Retiring - Heavy_Operations
+    Machine_Clears = Bad_Speculation - Branch_Mispredicts
+    Fetch_Bandwidth = Frontend_Bound - Fetch_Latency
+    Core_Bound = Backend_Bound - Memory_Bound
+
+
+[1] https://software.intel.com/en-us/top-down-microarchitecture-analysis-method-win
+[2] https://github.com/andikleen/pmu-tools/wiki/toplev-manual
+[3] https://software.intel.com/en-us/intel-vtune-amplifier-xe
+[4] https://github.com/andikleen/pmu-tools/tree/master/jevents
+[5] https://sites.google.com/site/analysismethods/yasin-pubs
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 18:49:45 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 18:49:45 +0000
commit	2c3c1048746a4622d8c89a29670120dc8fab93c4 (patch)
tree	848558de17fb3008cdf4d861b01ac7781903ce39 /tools/perf/Documentation
parent	Initial commit. (diff)
download	linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.tar.xz linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.zip