summaryrefslogtreecommitdiffstats
path: root/samples
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--samples/Kconfig156
-rw-r--r--samples/Makefile6
-rw-r--r--samples/auxdisplay/.gitignore1
-rw-r--r--samples/auxdisplay/Makefile10
-rw-r--r--samples/auxdisplay/cfag12864b-example.c267
-rw-r--r--samples/bpf/.gitignore49
-rw-r--r--samples/bpf/Makefile279
-rw-r--r--samples/bpf/README.rst76
-rw-r--r--samples/bpf/bpf_insn.h197
-rw-r--r--samples/bpf/bpf_load.c688
-rw-r--r--samples/bpf/bpf_load.h58
-rw-r--r--samples/bpf/cookie_uid_helper_example.c323
-rw-r--r--samples/bpf/cpustat_kern.c281
-rw-r--r--samples/bpf/cpustat_user.c219
-rw-r--r--samples/bpf/fds_example.c188
-rw-r--r--samples/bpf/hash_func01.h55
-rw-r--r--samples/bpf/lathist_kern.c99
-rw-r--r--samples/bpf/lathist_user.c103
-rw-r--r--samples/bpf/load_sock_ops.c97
-rwxr-xr-xsamples/bpf/lwt_len_hist.sh40
-rw-r--r--samples/bpf/lwt_len_hist_kern.c82
-rw-r--r--samples/bpf/lwt_len_hist_user.c77
-rw-r--r--samples/bpf/map_perf_test_kern.c283
-rw-r--r--samples/bpf/map_perf_test_user.c464
-rw-r--r--samples/bpf/offwaketime_kern.c151
-rw-r--r--samples/bpf/offwaketime_user.c122
-rw-r--r--samples/bpf/parse_ldabs.c42
-rw-r--r--samples/bpf/parse_simple.c49
-rw-r--r--samples/bpf/parse_varlen.c150
-rwxr-xr-xsamples/bpf/run_cookie_uid_helper_example.sh15
-rw-r--r--samples/bpf/sampleip_kern.c38
-rw-r--r--samples/bpf/sampleip_user.c199
-rw-r--r--samples/bpf/sock_example.c106
-rw-r--r--samples/bpf/sock_example.h35
-rw-r--r--samples/bpf/sock_flags_kern.c49
-rw-r--r--samples/bpf/sockex1_kern.c29
-rw-r--r--samples/bpf/sockex1_user.c51
-rw-r--r--samples/bpf/sockex2_kern.c222
-rw-r--r--samples/bpf/sockex2_user.c54
-rw-r--r--samples/bpf/sockex3_kern.c290
-rw-r--r--samples/bpf/sockex3_user.c84
-rw-r--r--samples/bpf/spintest_kern.c68
-rw-r--r--samples/bpf/spintest_user.c52
-rw-r--r--samples/bpf/syscall_nrs.c13
-rw-r--r--samples/bpf/syscall_tp_kern.c76
-rw-r--r--samples/bpf/syscall_tp_user.c111
-rw-r--r--samples/bpf/task_fd_query_kern.c19
-rw-r--r--samples/bpf/task_fd_query_user.c382
-rwxr-xr-xsamples/bpf/tc_l2_redirect.sh174
-rw-r--r--samples/bpf/tc_l2_redirect_kern.c237
-rw-r--r--samples/bpf/tc_l2_redirect_user.c73
-rw-r--r--samples/bpf/tcbpf1_kern.c90
-rw-r--r--samples/bpf/tcp_basertt_kern.c78
-rw-r--r--samples/bpf/tcp_bpf.readme26
-rw-r--r--samples/bpf/tcp_bufs_kern.c88
-rw-r--r--samples/bpf/tcp_clamp_kern.c104
-rw-r--r--samples/bpf/tcp_cong_kern.c85
-rw-r--r--samples/bpf/tcp_iw_kern.c90
-rw-r--r--samples/bpf/tcp_rwnd_kern.c71
-rw-r--r--samples/bpf/tcp_synrto_kern.c71
-rw-r--r--samples/bpf/test_cgrp2_array_pin.c109
-rw-r--r--samples/bpf/test_cgrp2_attach.c172
-rw-r--r--samples/bpf/test_cgrp2_attach2.c442
-rw-r--r--samples/bpf/test_cgrp2_sock.c290
-rwxr-xr-xsamples/bpf/test_cgrp2_sock.sh135
-rw-r--r--samples/bpf/test_cgrp2_sock2.c68
-rwxr-xr-xsamples/bpf/test_cgrp2_sock2.sh85
-rwxr-xr-xsamples/bpf/test_cgrp2_tc.sh185
-rw-r--r--samples/bpf/test_cgrp2_tc_kern.c70
-rwxr-xr-xsamples/bpf/test_cls_bpf.sh38
-rw-r--r--samples/bpf/test_current_task_under_cgroup_kern.c43
-rw-r--r--samples/bpf/test_current_task_under_cgroup_user.c85
-rwxr-xr-xsamples/bpf/test_ipip.sh179
-rw-r--r--samples/bpf/test_lru_dist.c543
-rw-r--r--samples/bpf/test_lwt_bpf.c253
-rwxr-xr-xsamples/bpf/test_lwt_bpf.sh400
-rw-r--r--samples/bpf/test_map_in_map_kern.c173
-rw-r--r--samples/bpf/test_map_in_map_user.c133
-rw-r--r--samples/bpf/test_overhead_kprobe_kern.c41
-rw-r--r--samples/bpf/test_overhead_raw_tp_kern.c17
-rw-r--r--samples/bpf/test_overhead_tp_kern.c36
-rw-r--r--samples/bpf/test_overhead_user.c185
-rwxr-xr-xsamples/bpf/test_override_return.sh16
-rw-r--r--samples/bpf/test_probe_write_user_kern.c52
-rw-r--r--samples/bpf/test_probe_write_user_user.c79
-rw-r--r--samples/bpf/trace_event_kern.c79
-rw-r--r--samples/bpf/trace_event_user.c308
-rw-r--r--samples/bpf/trace_output_kern.c30
-rw-r--r--samples/bpf/trace_output_user.c106
-rw-r--r--samples/bpf/tracex1_kern.c48
-rw-r--r--samples/bpf/tracex1_user.c26
-rw-r--r--samples/bpf/tracex2_kern.c100
-rw-r--r--samples/bpf/tracex2_user.c160
-rw-r--r--samples/bpf/tracex3_kern.c89
-rw-r--r--samples/bpf/tracex3_user.c166
-rw-r--r--samples/bpf/tracex4_kern.c54
-rw-r--r--samples/bpf/tracex4_user.c77
-rw-r--r--samples/bpf/tracex5_kern.c79
-rw-r--r--samples/bpf/tracex5_user.c50
-rw-r--r--samples/bpf/tracex6_kern.c67
-rw-r--r--samples/bpf/tracex6_user.c189
-rw-r--r--samples/bpf/tracex7_kern.c16
-rw-r--r--samples/bpf/tracex7_user.c33
-rw-r--r--samples/bpf/xdp1_kern.c93
-rw-r--r--samples/bpf/xdp1_user.c138
-rw-r--r--samples/bpf/xdp2_kern.c114
-rwxr-xr-xsamples/bpf/xdp2skb_meta.sh220
-rw-r--r--samples/bpf/xdp2skb_meta_kern.c105
-rw-r--r--samples/bpf/xdp_adjust_tail_kern.c152
-rw-r--r--samples/bpf/xdp_adjust_tail_user.c150
-rw-r--r--samples/bpf/xdp_fwd_kern.c138
-rw-r--r--samples/bpf/xdp_fwd_user.c148
-rw-r--r--samples/bpf/xdp_monitor_kern.c259
-rw-r--r--samples/bpf/xdp_monitor_user.c715
-rw-r--r--samples/bpf/xdp_redirect_cpu_kern.c721
-rw-r--r--samples/bpf/xdp_redirect_cpu_user.c698
-rw-r--r--samples/bpf/xdp_redirect_kern.c90
-rw-r--r--samples/bpf/xdp_redirect_map_kern.c92
-rw-r--r--samples/bpf/xdp_redirect_map_user.c152
-rw-r--r--samples/bpf/xdp_redirect_user.c150
-rw-r--r--samples/bpf/xdp_router_ipv4_kern.c186
-rw-r--r--samples/bpf/xdp_router_ipv4_user.c660
-rw-r--r--samples/bpf/xdp_rxq_info_kern.c139
-rw-r--r--samples/bpf/xdp_rxq_info_user.c581
-rw-r--r--samples/bpf/xdp_sample_pkts_kern.c66
-rw-r--r--samples/bpf/xdp_sample_pkts_user.c169
-rw-r--r--samples/bpf/xdp_tx_iptunnel_common.h37
-rw-r--r--samples/bpf/xdp_tx_iptunnel_kern.c237
-rw-r--r--samples/bpf/xdp_tx_iptunnel_user.c267
-rw-r--r--samples/bpf/xdpsock.h11
-rw-r--r--samples/bpf/xdpsock_kern.c56
-rw-r--r--samples/bpf/xdpsock_user.c987
-rw-r--r--samples/configfs/Makefile2
-rw-r--r--samples/configfs/configfs_sample.c404
-rw-r--r--samples/connector/.gitignore1
-rw-r--r--samples/connector/Makefile17
-rw-r--r--samples/connector/cn_test.c201
-rw-r--r--samples/connector/ucon.c250
-rw-r--r--samples/hidraw/.gitignore1
-rw-r--r--samples/hidraw/Makefile10
-rw-r--r--samples/hidraw/hid-example.c182
-rw-r--r--samples/hw_breakpoint/Makefile1
-rw-r--r--samples/hw_breakpoint/data_breakpoint.c90
-rw-r--r--samples/kdb/Makefile1
-rw-r--r--samples/kdb/kdb_hello.c60
-rw-r--r--samples/kfifo/Makefile1
-rw-r--r--samples/kfifo/bytestream-example.c198
-rw-r--r--samples/kfifo/dma-example.c143
-rw-r--r--samples/kfifo/inttype-example.c189
-rw-r--r--samples/kfifo/record-example.c205
-rw-r--r--samples/kobject/Makefile1
-rw-r--r--samples/kobject/kobject-example.c144
-rw-r--r--samples/kobject/kset-example.c287
-rw-r--r--samples/kprobes/Makefile5
-rw-r--r--samples/kprobes/kprobe_example.c117
-rw-r--r--samples/kprobes/kretprobe_example.c105
-rw-r--r--samples/livepatch/Makefile7
-rw-r--r--samples/livepatch/livepatch-callbacks-busymod.c72
-rw-r--r--samples/livepatch/livepatch-callbacks-demo.c219
-rw-r--r--samples/livepatch/livepatch-callbacks-mod.c53
-rw-r--r--samples/livepatch/livepatch-sample.c93
-rw-r--r--samples/livepatch/livepatch-shadow-fix1.c184
-rw-r--r--samples/livepatch/livepatch-shadow-fix2.c156
-rw-r--r--samples/livepatch/livepatch-shadow-mod.c228
-rw-r--r--samples/mei/.gitignore1
-rw-r--r--samples/mei/Makefile10
-rw-r--r--samples/mei/mei-amt-version.c479
-rw-r--r--samples/mic/mpssd/.gitignore1
-rw-r--r--samples/mic/mpssd/Makefile28
-rwxr-xr-xsamples/mic/mpssd/micctrl173
-rwxr-xr-xsamples/mic/mpssd/mpss200
-rw-r--r--samples/mic/mpssd/mpssd.c1826
-rw-r--r--samples/mic/mpssd/mpssd.h103
-rw-r--r--samples/mic/mpssd/sysfs.c102
-rw-r--r--samples/pktgen/README.rst45
-rw-r--r--samples/pktgen/functions.sh169
-rw-r--r--samples/pktgen/parameters.sh116
-rwxr-xr-xsamples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh89
-rwxr-xr-xsamples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh69
-rwxr-xr-xsamples/pktgen/pktgen_sample01_simple.sh74
-rwxr-xr-xsamples/pktgen/pktgen_sample02_multiqueue.sh79
-rwxr-xr-xsamples/pktgen/pktgen_sample03_burst_single_flow.sh85
-rwxr-xr-xsamples/pktgen/pktgen_sample04_many_flows.sh94
-rwxr-xr-xsamples/pktgen/pktgen_sample05_flow_per_thread.sh82
-rwxr-xr-xsamples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh97
-rw-r--r--samples/qmi/Makefile1
-rw-r--r--samples/qmi/qmi_sample_client.c622
-rw-r--r--samples/rpmsg/Makefile1
-rw-r--r--samples/rpmsg/rpmsg_client_sample.c102
-rw-r--r--samples/seccomp/.gitignore3
-rw-r--r--samples/seccomp/Makefile38
-rw-r--r--samples/seccomp/bpf-direct.c191
-rw-r--r--samples/seccomp/bpf-fancy.c105
-rw-r--r--samples/seccomp/bpf-helper.c96
-rw-r--r--samples/seccomp/bpf-helper.h263
-rw-r--r--samples/seccomp/dropper.c72
-rw-r--r--samples/statx/Makefile7
-rw-r--r--samples/statx/test-statx.c258
-rw-r--r--samples/timers/.gitignore1
-rw-r--r--samples/timers/Makefile16
-rw-r--r--samples/timers/hpet_example.c295
-rw-r--r--samples/trace_events/Makefile14
-rw-r--r--samples/trace_events/trace-events-sample.c139
-rw-r--r--samples/trace_events/trace-events-sample.h524
-rw-r--r--samples/trace_printk/Makefile6
-rw-r--r--samples/trace_printk/trace-printk.c56
-rw-r--r--samples/uhid/Makefile7
-rw-r--r--samples/uhid/uhid-example.c465
-rw-r--r--samples/v4l/Makefile1
-rw-r--r--samples/v4l/v4l2-pci-skeleton.c913
-rw-r--r--samples/vfio-mdev/Makefile4
-rw-r--r--samples/vfio-mdev/mbochs.c1405
-rw-r--r--samples/vfio-mdev/mdpy-defs.h22
-rw-r--r--samples/vfio-mdev/mdpy-fb.c237
-rw-r--r--samples/vfio-mdev/mdpy.c807
-rw-r--r--samples/vfio-mdev/mtty.c1516
-rw-r--r--samples/watchdog/.gitignore1
-rw-r--r--samples/watchdog/Makefile9
-rw-r--r--samples/watchdog/watchdog-simple.c25
219 files changed, 36500 insertions, 0 deletions
diff --git a/samples/Kconfig b/samples/Kconfig
new file mode 100644
index 000000000..ad1ec7016
--- /dev/null
+++ b/samples/Kconfig
@@ -0,0 +1,156 @@
+menuconfig SAMPLES
+ bool "Sample kernel code"
+ depends on !UML
+ help
+ You can build and test sample kernel code here.
+
+if SAMPLES
+
+config SAMPLE_TRACE_EVENTS
+ tristate "Build trace_events examples -- loadable modules only"
+ depends on EVENT_TRACING && m
+ help
+ This build trace event example modules.
+
+config SAMPLE_TRACE_PRINTK
+ tristate "Build trace_printk module - tests various trace_printk formats"
+ depends on EVENT_TRACING && m
+ help
+ This builds a module that calls trace_printk() and can be used to
+ test various trace_printk() calls from a module.
+
+config SAMPLE_KOBJECT
+ tristate "Build kobject examples -- loadable modules only"
+ depends on m
+ help
+ This config option will allow you to build a number of
+ different kobject sample modules showing how to use kobjects,
+ ksets, and ktypes properly.
+
+ If in doubt, say "N" here.
+
+config SAMPLE_KPROBES
+ tristate "Build kprobes examples -- loadable modules only"
+ depends on KPROBES && m
+ help
+ This build several kprobes example modules.
+
+config SAMPLE_KRETPROBES
+ tristate "Build kretprobes example -- loadable modules only"
+ default m
+ depends on SAMPLE_KPROBES && KRETPROBES
+
+config SAMPLE_HW_BREAKPOINT
+ tristate "Build kernel hardware breakpoint examples -- loadable module only"
+ depends on HAVE_HW_BREAKPOINT && m
+ help
+ This builds kernel hardware breakpoint example modules.
+
+config SAMPLE_KFIFO
+ tristate "Build kfifo examples -- loadable modules only"
+ depends on m
+ help
+ This config option will allow you to build a number of
+ different kfifo sample modules showing how to use the
+ generic kfifo API.
+
+ If in doubt, say "N" here.
+
+config SAMPLE_KDB
+ tristate "Build kdb command example -- loadable modules only"
+ depends on KGDB_KDB && m
+ help
+ Build an example of how to dynamically add the hello
+ command to the kdb shell.
+
+config SAMPLE_QMI_CLIENT
+ tristate "Build qmi client sample -- loadable modules only"
+ depends on m
+ depends on ARCH_QCOM
+ depends on NET
+ select QCOM_QMI_HELPERS
+ help
+ Build an QMI client sample driver, which demonstrates how to
+ communicate with a remote QRTR service, using QMI encoded messages.
+
+config SAMPLE_RPMSG_CLIENT
+ tristate "Build rpmsg client sample -- loadable modules only"
+ depends on RPMSG && m
+ help
+ Build an rpmsg client sample driver, which demonstrates how
+ to communicate with an AMP-configured remote processor over
+ the rpmsg bus.
+
+config SAMPLE_LIVEPATCH
+ tristate "Build live patching samples -- loadable modules only"
+ depends on LIVEPATCH && m
+ help
+ Build sample live patch demonstrations.
+
+config SAMPLE_CONFIGFS
+ tristate "Build configfs patching sample -- loadable modules only"
+ depends on CONFIGFS_FS && m
+ help
+ Builds a sample configfs interface.
+
+config SAMPLE_CONNECTOR
+ tristate "Build connector sample -- loadable modules only"
+ depends on CONNECTOR && m
+ help
+ When enabled, this builds both a sample kernel module for
+ the connector interface and a user space tool to communicate
+ with it.
+ See also Documentation/connector/connector.txt
+
+config SAMPLE_SECCOMP
+ tristate "Build seccomp sample code -- loadable modules only"
+ depends on SECCOMP_FILTER && m
+ help
+ Build samples of seccomp filters using various methods of
+ BPF filter construction.
+
+config SAMPLE_VFIO_MDEV_MTTY
+ tristate "Build VFIO mtty example mediated device sample code -- loadable modules only"
+ depends on VFIO_MDEV_DEVICE && m
+ help
+ Build a virtual tty sample driver for use as a VFIO
+ mediated device
+
+config SAMPLE_VFIO_MDEV_MDPY
+ tristate "Build VFIO mdpy example mediated device sample code -- loadable modules only"
+ depends on VFIO_MDEV_DEVICE && m
+ help
+ Build a virtual display sample driver for use as a VFIO
+ mediated device. It is a simple framebuffer and supports
+ the region display interface (VFIO_GFX_PLANE_TYPE_REGION).
+
+config SAMPLE_VFIO_MDEV_MDPY_FB
+ tristate "Build VFIO mdpy example guest fbdev driver -- loadable module only"
+ depends on FB && m
+ select FB_CFB_FILLRECT
+ select FB_CFB_COPYAREA
+ select FB_CFB_IMAGEBLIT
+ help
+ Guest fbdev driver for the virtual display sample driver.
+
+config SAMPLE_VFIO_MDEV_MBOCHS
+ tristate "Build VFIO mdpy example mediated device sample code -- loadable modules only"
+ depends on VFIO_MDEV_DEVICE && m
+ select DMA_SHARED_BUFFER
+ help
+ Build a virtual display sample driver for use as a VFIO
+ mediated device. It supports the region display interface
+ (VFIO_GFX_PLANE_TYPE_DMABUF).
+ Emulate enough of qemu stdvga to make bochs-drm.ko happy.
+ That is basically the vram memory bar and the bochs dispi
+ interface vbe registers in the mmio register bar.
+ Specifically it does *not* include any legacy vga stuff.
+ Device looks a lot like "qemu -device secondary-vga".
+
+config SAMPLE_STATX
+ bool "Build example extended-stat using code"
+ depends on BROKEN
+ help
+ Build example userspace program to use the new extended-stat syscall.
+
+endif # SAMPLES
diff --git a/samples/Makefile b/samples/Makefile
new file mode 100644
index 000000000..bd601c038
--- /dev/null
+++ b/samples/Makefile
@@ -0,0 +1,6 @@
+# Makefile for Linux samples code
+
+obj-$(CONFIG_SAMPLES) += kobject/ kprobes/ trace_events/ livepatch/ \
+ hw_breakpoint/ kfifo/ kdb/ hidraw/ rpmsg/ seccomp/ \
+ configfs/ connector/ v4l/ trace_printk/ \
+ vfio-mdev/ statx/ qmi/
diff --git a/samples/auxdisplay/.gitignore b/samples/auxdisplay/.gitignore
new file mode 100644
index 000000000..7af222860
--- /dev/null
+++ b/samples/auxdisplay/.gitignore
@@ -0,0 +1 @@
+cfag12864b-example
diff --git a/samples/auxdisplay/Makefile b/samples/auxdisplay/Makefile
new file mode 100644
index 000000000..0273bab27
--- /dev/null
+++ b/samples/auxdisplay/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
+CC := $(CROSS_COMPILE)gcc
+CFLAGS := -I../../usr/include
+
+PROGS := cfag12864b-example
+
+all: $(PROGS)
+
+clean:
+ rm -fr $(PROGS)
diff --git a/samples/auxdisplay/cfag12864b-example.c b/samples/auxdisplay/cfag12864b-example.c
new file mode 100644
index 000000000..85571e901
--- /dev/null
+++ b/samples/auxdisplay/cfag12864b-example.c
@@ -0,0 +1,267 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Filename: cfag12864b-example.c
+ * Version: 0.1.0
+ * Description: cfag12864b LCD userspace example program
+ *
+ * Author: Copyright (C) Miguel Ojeda Sandonis
+ * Date: 2006-10-31
+ */
+
+/*
+ * ------------------------
+ * start of cfag12864b code
+ * ------------------------
+ */
+
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+#define CFAG12864B_WIDTH (128)
+#define CFAG12864B_HEIGHT (64)
+#define CFAG12864B_SIZE (128 * 64 / 8)
+#define CFAG12864B_BPB (8)
+#define CFAG12864B_ADDRESS(x, y) ((y) * CFAG12864B_WIDTH / \
+ CFAG12864B_BPB + (x) / CFAG12864B_BPB)
+#define CFAG12864B_BIT(n) (((unsigned char) 1) << (n))
+
+#undef CFAG12864B_DOCHECK
+#ifdef CFAG12864B_DOCHECK
+ #define CFAG12864B_CHECK(x, y) ((x) < CFAG12864B_WIDTH && \
+ (y) < CFAG12864B_HEIGHT)
+#else
+ #define CFAG12864B_CHECK(x, y) (1)
+#endif
+
+int cfag12864b_fd;
+unsigned char * cfag12864b_mem;
+unsigned char cfag12864b_buffer[CFAG12864B_SIZE];
+
+/*
+ * init a cfag12864b framebuffer device
+ *
+ * No error: return = 0
+ * Unable to open: return = -1
+ * Unable to mmap: return = -2
+ */
+static int cfag12864b_init(char *path)
+{
+ cfag12864b_fd = open(path, O_RDWR);
+ if (cfag12864b_fd == -1)
+ return -1;
+
+ cfag12864b_mem = mmap(0, CFAG12864B_SIZE, PROT_READ | PROT_WRITE,
+ MAP_SHARED, cfag12864b_fd, 0);
+ if (cfag12864b_mem == MAP_FAILED) {
+ close(cfag12864b_fd);
+ return -2;
+ }
+
+ return 0;
+}
+
+/*
+ * exit a cfag12864b framebuffer device
+ */
+static void cfag12864b_exit(void)
+{
+ munmap(cfag12864b_mem, CFAG12864B_SIZE);
+ close(cfag12864b_fd);
+}
+
+/*
+ * set (x, y) pixel
+ */
+static void cfag12864b_set(unsigned char x, unsigned char y)
+{
+ if (CFAG12864B_CHECK(x, y))
+ cfag12864b_buffer[CFAG12864B_ADDRESS(x, y)] |=
+ CFAG12864B_BIT(x % CFAG12864B_BPB);
+}
+
+/*
+ * unset (x, y) pixel
+ */
+static void cfag12864b_unset(unsigned char x, unsigned char y)
+{
+ if (CFAG12864B_CHECK(x, y))
+ cfag12864b_buffer[CFAG12864B_ADDRESS(x, y)] &=
+ ~CFAG12864B_BIT(x % CFAG12864B_BPB);
+}
+
+/*
+ * is set (x, y) pixel?
+ *
+ * Pixel off: return = 0
+ * Pixel on: return = 1
+ */
+static unsigned char cfag12864b_isset(unsigned char x, unsigned char y)
+{
+ if (CFAG12864B_CHECK(x, y))
+ if (cfag12864b_buffer[CFAG12864B_ADDRESS(x, y)] &
+ CFAG12864B_BIT(x % CFAG12864B_BPB))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * not (x, y) pixel
+ */
+static void cfag12864b_not(unsigned char x, unsigned char y)
+{
+ if (cfag12864b_isset(x, y))
+ cfag12864b_unset(x, y);
+ else
+ cfag12864b_set(x, y);
+}
+
+/*
+ * fill (set all pixels)
+ */
+static void cfag12864b_fill(void)
+{
+ unsigned short i;
+
+ for (i = 0; i < CFAG12864B_SIZE; i++)
+ cfag12864b_buffer[i] = 0xFF;
+}
+
+/*
+ * clear (unset all pixels)
+ */
+static void cfag12864b_clear(void)
+{
+ unsigned short i;
+
+ for (i = 0; i < CFAG12864B_SIZE; i++)
+ cfag12864b_buffer[i] = 0;
+}
+
+/*
+ * format a [128*64] matrix
+ *
+ * Pixel off: src[i] = 0
+ * Pixel on: src[i] > 0
+ */
+static void cfag12864b_format(unsigned char * matrix)
+{
+ unsigned char i, j, n;
+
+ for (i = 0; i < CFAG12864B_HEIGHT; i++)
+ for (j = 0; j < CFAG12864B_WIDTH / CFAG12864B_BPB; j++) {
+ cfag12864b_buffer[i * CFAG12864B_WIDTH / CFAG12864B_BPB +
+ j] = 0;
+ for (n = 0; n < CFAG12864B_BPB; n++)
+ if (matrix[i * CFAG12864B_WIDTH +
+ j * CFAG12864B_BPB + n])
+ cfag12864b_buffer[i * CFAG12864B_WIDTH /
+ CFAG12864B_BPB + j] |=
+ CFAG12864B_BIT(n);
+ }
+}
+
+/*
+ * blit buffer to lcd
+ */
+static void cfag12864b_blit(void)
+{
+ memcpy(cfag12864b_mem, cfag12864b_buffer, CFAG12864B_SIZE);
+}
+
+/*
+ * ----------------------
+ * end of cfag12864b code
+ * ----------------------
+ */
+
+#include <stdio.h>
+
+#define EXAMPLES 6
+
+static void example(unsigned char n)
+{
+ unsigned short i, j;
+ unsigned char matrix[CFAG12864B_WIDTH * CFAG12864B_HEIGHT];
+
+ if (n > EXAMPLES)
+ return;
+
+ printf("Example %i/%i - ", n, EXAMPLES);
+
+ switch (n) {
+ case 1:
+ printf("Draw points setting bits");
+ cfag12864b_clear();
+ for (i = 0; i < CFAG12864B_WIDTH; i += 2)
+ for (j = 0; j < CFAG12864B_HEIGHT; j += 2)
+ cfag12864b_set(i, j);
+ break;
+
+ case 2:
+ printf("Clear the LCD");
+ cfag12864b_clear();
+ break;
+
+ case 3:
+ printf("Draw rows formatting a [128*64] matrix");
+ memset(matrix, 0, CFAG12864B_WIDTH * CFAG12864B_HEIGHT);
+ for (i = 0; i < CFAG12864B_WIDTH; i++)
+ for (j = 0; j < CFAG12864B_HEIGHT; j += 2)
+ matrix[j * CFAG12864B_WIDTH + i] = 1;
+ cfag12864b_format(matrix);
+ break;
+
+ case 4:
+ printf("Fill the lcd");
+ cfag12864b_fill();
+ break;
+
+ case 5:
+ printf("Draw columns unsetting bits");
+ for (i = 0; i < CFAG12864B_WIDTH; i += 2)
+ for (j = 0; j < CFAG12864B_HEIGHT; j++)
+ cfag12864b_unset(i, j);
+ break;
+
+ case 6:
+ printf("Do negative not-ing all bits");
+ for (i = 0; i < CFAG12864B_WIDTH; i++)
+ for (j = 0; j < CFAG12864B_HEIGHT; j ++)
+ cfag12864b_not(i, j);
+ break;
+ }
+
+ puts(" - [Press Enter]");
+}
+
+int main(int argc, char *argv[])
+{
+ unsigned char n;
+
+ if (argc != 2) {
+ printf(
+ "Sintax: %s fbdev\n"
+ "Usually: /dev/fb0, /dev/fb1...\n", argv[0]);
+ return -1;
+ }
+
+ if (cfag12864b_init(argv[1])) {
+ printf("Can't init %s fbdev\n", argv[1]);
+ return -2;
+ }
+
+ for (n = 1; n <= EXAMPLES; n++) {
+ example(n);
+ cfag12864b_blit();
+ while (getchar() != '\n');
+ }
+
+ cfag12864b_exit();
+
+ return 0;
+}
diff --git a/samples/bpf/.gitignore b/samples/bpf/.gitignore
new file mode 100644
index 000000000..8ae494002
--- /dev/null
+++ b/samples/bpf/.gitignore
@@ -0,0 +1,49 @@
+cpustat
+fds_example
+lathist
+load_sock_ops
+lwt_len_hist
+map_perf_test
+offwaketime
+per_socket_stats_example
+sampleip
+sock_example
+sockex1
+sockex2
+sockex3
+spintest
+syscall_nrs.h
+syscall_tp
+task_fd_query
+tc_l2_redirect
+test_cgrp2_array_pin
+test_cgrp2_attach
+test_cgrp2_attach2
+test_cgrp2_sock
+test_cgrp2_sock2
+test_current_task_under_cgroup
+test_lru_dist
+test_map_in_map
+test_overhead
+test_probe_write_user
+trace_event
+trace_output
+tracex1
+tracex2
+tracex3
+tracex4
+tracex5
+tracex6
+tracex7
+xdp1
+xdp2
+xdp_adjust_tail
+xdp_fwd
+xdp_monitor
+xdp_redirect
+xdp_redirect_cpu
+xdp_redirect_map
+xdp_router_ipv4
+xdp_rxq_info
+xdp_tx_iptunnel
+xdpsock
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
new file mode 100644
index 000000000..85a6e8f5a
--- /dev/null
+++ b/samples/bpf/Makefile
@@ -0,0 +1,279 @@
+# SPDX-License-Identifier: GPL-2.0
+
+BPF_SAMPLES_PATH ?= $(abspath $(srctree)/$(src))
+TOOLS_PATH := $(BPF_SAMPLES_PATH)/../../tools
+
+# List of programs to build
+hostprogs-y := test_lru_dist
+hostprogs-y += sock_example
+hostprogs-y += fds_example
+hostprogs-y += sockex1
+hostprogs-y += sockex2
+hostprogs-y += sockex3
+hostprogs-y += tracex1
+hostprogs-y += tracex2
+hostprogs-y += tracex3
+hostprogs-y += tracex4
+hostprogs-y += tracex5
+hostprogs-y += tracex6
+hostprogs-y += tracex7
+hostprogs-y += test_probe_write_user
+hostprogs-y += trace_output
+hostprogs-y += lathist
+hostprogs-y += offwaketime
+hostprogs-y += spintest
+hostprogs-y += map_perf_test
+hostprogs-y += test_overhead
+hostprogs-y += test_cgrp2_array_pin
+hostprogs-y += test_cgrp2_attach
+hostprogs-y += test_cgrp2_attach2
+hostprogs-y += test_cgrp2_sock
+hostprogs-y += test_cgrp2_sock2
+hostprogs-y += xdp1
+hostprogs-y += xdp2
+hostprogs-y += xdp_router_ipv4
+hostprogs-y += test_current_task_under_cgroup
+hostprogs-y += trace_event
+hostprogs-y += sampleip
+hostprogs-y += tc_l2_redirect
+hostprogs-y += lwt_len_hist
+hostprogs-y += xdp_tx_iptunnel
+hostprogs-y += test_map_in_map
+hostprogs-y += per_socket_stats_example
+hostprogs-y += load_sock_ops
+hostprogs-y += xdp_redirect
+hostprogs-y += xdp_redirect_map
+hostprogs-y += xdp_redirect_cpu
+hostprogs-y += xdp_monitor
+hostprogs-y += xdp_rxq_info
+hostprogs-y += syscall_tp
+hostprogs-y += cpustat
+hostprogs-y += xdp_adjust_tail
+hostprogs-y += xdpsock
+hostprogs-y += xdp_fwd
+hostprogs-y += task_fd_query
+hostprogs-y += xdp_sample_pkts
+
+# Libbpf dependencies
+LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
+
+CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o
+TRACE_HELPERS := ../../tools/testing/selftests/bpf/trace_helpers.o
+
+fds_example-objs := bpf_load.o fds_example.o
+sockex1-objs := bpf_load.o sockex1_user.o
+sockex2-objs := bpf_load.o sockex2_user.o
+sockex3-objs := bpf_load.o sockex3_user.o
+tracex1-objs := bpf_load.o tracex1_user.o
+tracex2-objs := bpf_load.o tracex2_user.o
+tracex3-objs := bpf_load.o tracex3_user.o
+tracex4-objs := bpf_load.o tracex4_user.o
+tracex5-objs := bpf_load.o tracex5_user.o
+tracex6-objs := bpf_load.o tracex6_user.o
+tracex7-objs := bpf_load.o tracex7_user.o
+load_sock_ops-objs := bpf_load.o load_sock_ops.o
+test_probe_write_user-objs := bpf_load.o test_probe_write_user_user.o
+trace_output-objs := bpf_load.o trace_output_user.o $(TRACE_HELPERS)
+lathist-objs := bpf_load.o lathist_user.o
+offwaketime-objs := bpf_load.o offwaketime_user.o $(TRACE_HELPERS)
+spintest-objs := bpf_load.o spintest_user.o $(TRACE_HELPERS)
+map_perf_test-objs := bpf_load.o map_perf_test_user.o
+test_overhead-objs := bpf_load.o test_overhead_user.o
+test_cgrp2_array_pin-objs := test_cgrp2_array_pin.o
+test_cgrp2_attach-objs := test_cgrp2_attach.o
+test_cgrp2_attach2-objs := test_cgrp2_attach2.o $(CGROUP_HELPERS)
+test_cgrp2_sock-objs := test_cgrp2_sock.o
+test_cgrp2_sock2-objs := bpf_load.o test_cgrp2_sock2.o
+xdp1-objs := xdp1_user.o
+# reuse xdp1 source intentionally
+xdp2-objs := xdp1_user.o
+xdp_router_ipv4-objs := bpf_load.o xdp_router_ipv4_user.o
+test_current_task_under_cgroup-objs := bpf_load.o $(CGROUP_HELPERS) \
+ test_current_task_under_cgroup_user.o
+trace_event-objs := bpf_load.o trace_event_user.o $(TRACE_HELPERS)
+sampleip-objs := bpf_load.o sampleip_user.o $(TRACE_HELPERS)
+tc_l2_redirect-objs := bpf_load.o tc_l2_redirect_user.o
+lwt_len_hist-objs := bpf_load.o lwt_len_hist_user.o
+xdp_tx_iptunnel-objs := bpf_load.o xdp_tx_iptunnel_user.o
+test_map_in_map-objs := bpf_load.o test_map_in_map_user.o
+per_socket_stats_example-objs := cookie_uid_helper_example.o
+xdp_redirect-objs := bpf_load.o xdp_redirect_user.o
+xdp_redirect_map-objs := bpf_load.o xdp_redirect_map_user.o
+xdp_redirect_cpu-objs := bpf_load.o xdp_redirect_cpu_user.o
+xdp_monitor-objs := bpf_load.o xdp_monitor_user.o
+xdp_rxq_info-objs := xdp_rxq_info_user.o
+syscall_tp-objs := bpf_load.o syscall_tp_user.o
+cpustat-objs := bpf_load.o cpustat_user.o
+xdp_adjust_tail-objs := xdp_adjust_tail_user.o
+xdpsock-objs := xdpsock_user.o
+xdp_fwd-objs := xdp_fwd_user.o
+task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
+xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS)
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
+always += sockex1_kern.o
+always += sockex2_kern.o
+always += sockex3_kern.o
+always += tracex1_kern.o
+always += tracex2_kern.o
+always += tracex3_kern.o
+always += tracex4_kern.o
+always += tracex5_kern.o
+always += tracex6_kern.o
+always += tracex7_kern.o
+always += sock_flags_kern.o
+always += test_probe_write_user_kern.o
+always += trace_output_kern.o
+always += tcbpf1_kern.o
+always += tc_l2_redirect_kern.o
+always += lathist_kern.o
+always += offwaketime_kern.o
+always += spintest_kern.o
+always += map_perf_test_kern.o
+always += test_overhead_tp_kern.o
+always += test_overhead_raw_tp_kern.o
+always += test_overhead_kprobe_kern.o
+always += parse_varlen.o parse_simple.o parse_ldabs.o
+always += test_cgrp2_tc_kern.o
+always += xdp1_kern.o
+always += xdp2_kern.o
+always += xdp_router_ipv4_kern.o
+always += test_current_task_under_cgroup_kern.o
+always += trace_event_kern.o
+always += sampleip_kern.o
+always += lwt_len_hist_kern.o
+always += xdp_tx_iptunnel_kern.o
+always += test_map_in_map_kern.o
+always += cookie_uid_helper_example.o
+always += tcp_synrto_kern.o
+always += tcp_rwnd_kern.o
+always += tcp_bufs_kern.o
+always += tcp_cong_kern.o
+always += tcp_iw_kern.o
+always += tcp_clamp_kern.o
+always += tcp_basertt_kern.o
+always += xdp_redirect_kern.o
+always += xdp_redirect_map_kern.o
+always += xdp_redirect_cpu_kern.o
+always += xdp_monitor_kern.o
+always += xdp_rxq_info_kern.o
+always += xdp2skb_meta_kern.o
+always += syscall_tp_kern.o
+always += cpustat_kern.o
+always += xdp_adjust_tail_kern.o
+always += xdpsock_kern.o
+always += xdp_fwd_kern.o
+always += task_fd_query_kern.o
+always += xdp_sample_pkts_kern.o
+
+KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include
+KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/
+KBUILD_HOSTCFLAGS += -I$(srctree)/tools/testing/selftests/bpf/
+KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/ -I$(srctree)/tools/include
+KBUILD_HOSTCFLAGS += -I$(srctree)/tools/perf
+KBUILD_HOSTCFLAGS += -DHAVE_ATTR_TEST=0
+
+HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
+HOSTCFLAGS_trace_helpers.o += -I$(srctree)/tools/lib/bpf/
+
+HOSTCFLAGS_trace_output_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_offwaketime_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_spintest_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_trace_event_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_sampleip_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_task_fd_query_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_xdp_sample_pkts_user.o += -I$(srctree)/tools/lib/bpf/
+
+KBUILD_HOSTLDLIBS += $(LIBBPF) -lelf
+HOSTLDLIBS_tracex4 += -lrt
+HOSTLDLIBS_trace_output += -lrt
+HOSTLDLIBS_map_perf_test += -lrt
+HOSTLDLIBS_test_overhead += -lrt
+HOSTLDLIBS_xdpsock += -pthread
+
+# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
+# make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
+LLC ?= llc
+CLANG ?= clang
+LLVM_OBJCOPY ?= llvm-objcopy
+BTF_PAHOLE ?= pahole
+
+# Detect that we're cross compiling and use the cross compiler
+ifdef CROSS_COMPILE
+HOSTCC = $(CROSS_COMPILE)gcc
+CLANG_ARCH_ARGS = -target $(ARCH)
+endif
+
+BTF_LLC_PROBE := $(shell $(LLC) -march=bpf -mattr=help 2>&1 | grep dwarfris)
+BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help 2>&1 | grep BTF)
+BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --help 2>&1 | grep -i 'usage.*llvm')
+
+ifneq ($(and $(BTF_LLC_PROBE),$(BTF_PAHOLE_PROBE),$(BTF_OBJCOPY_PROBE)),)
+ EXTRA_CFLAGS += -g
+ LLC_FLAGS += -mattr=dwarfris
+ DWARF2BTF = y
+endif
+
+# Trick to allow make to be run from this directory
+all:
+ $(MAKE) -C ../../ $(CURDIR)/ BPF_SAMPLES_PATH=$(CURDIR)
+
+clean:
+ $(MAKE) -C ../../ M=$(CURDIR) clean
+ @find $(CURDIR) -type f -name '*~' -delete
+
+$(LIBBPF): FORCE
+# Fix up variables inherited from Kbuild that tools/ build system won't like
+ $(MAKE) -C $(dir $@) RM='rm -rf' LDFLAGS= srctree=$(BPF_SAMPLES_PATH)/../../ O=
+
+$(obj)/syscall_nrs.s: $(src)/syscall_nrs.c
+ $(call if_changed_dep,cc_s_c)
+
+$(obj)/syscall_nrs.h: $(obj)/syscall_nrs.s FORCE
+ $(call filechk,offsets,__SYSCALL_NRS_H__)
+
+clean-files += syscall_nrs.h
+
+FORCE:
+
+
+# Verify LLVM compiler tools are available and bpf target is supported by llc
+.PHONY: verify_cmds verify_target_bpf $(CLANG) $(LLC)
+
+verify_cmds: $(CLANG) $(LLC)
+ @for TOOL in $^ ; do \
+ if ! (which -- "$${TOOL}" > /dev/null 2>&1); then \
+ echo "*** ERROR: Cannot find LLVM tool $${TOOL}" ;\
+ exit 1; \
+ else true; fi; \
+ done
+
+verify_target_bpf: verify_cmds
+ @if ! (${LLC} -march=bpf -mattr=help > /dev/null 2>&1); then \
+ echo "*** ERROR: LLVM (${LLC}) does not support 'bpf' target" ;\
+ echo " NOTICE: LLVM version >= 3.7.1 required" ;\
+ exit 2; \
+ else true; fi
+
+$(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF)
+$(src)/*.c: verify_target_bpf $(LIBBPF)
+
+$(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h
+
+# asm/sysreg.h - inline assembly used by it is incompatible with llvm.
+# But, there is no easy way to fix it, so just exclude it since it is
+# useless for BPF samples.
+$(obj)/%.o: $(src)/%.c
+ @echo " CLANG-bpf " $@
+ $(Q)$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) -I$(obj) \
+ -I$(srctree)/tools/testing/selftests/bpf/ \
+ -D__KERNEL__ -D__BPF_TRACING__ -Wno-unused-value -Wno-pointer-sign \
+ -D__TARGET_ARCH_$(ARCH) -Wno-compare-distinct-pointer-types \
+ -Wno-gnu-variable-sized-type-not-at-end \
+ -Wno-address-of-packed-member -Wno-tautological-compare \
+ -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \
+ -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@
+ifeq ($(DWARF2BTF),y)
+ $(BTF_PAHOLE) -J $@
+endif
diff --git a/samples/bpf/README.rst b/samples/bpf/README.rst
new file mode 100644
index 000000000..5f27e4fac
--- /dev/null
+++ b/samples/bpf/README.rst
@@ -0,0 +1,76 @@
+eBPF sample programs
+====================
+
+This directory contains a test stubs, verifier test-suite and examples
+for using eBPF. The examples use libbpf from tools/lib/bpf.
+
+Build dependencies
+==================
+
+Compiling requires having installed:
+ * clang >= version 3.4.0
+ * llvm >= version 3.7.1
+
+Note that LLVM's tool 'llc' must support target 'bpf', list version
+and supported targets with command: ``llc --version``
+
+Kernel headers
+--------------
+
+There are usually dependencies to header files of the current kernel.
+To avoid installing devel kernel headers system wide, as a normal
+user, simply call::
+
+ make headers_install
+
+This will creates a local "usr/include" directory in the git/build top
+level directory, that the make system automatically pickup first.
+
+Compiling
+=========
+
+For building the BPF samples, issue the below command from the kernel
+top level directory::
+
+ make samples/bpf/
+
+Do notice the "/" slash after the directory name.
+
+It is also possible to call make from this directory. This will just
+hide the the invocation of make as above with the appended "/".
+
+Manually compiling LLVM with 'bpf' support
+------------------------------------------
+
+Since version 3.7.0, LLVM adds a proper LLVM backend target for the
+BPF bytecode architecture.
+
+By default llvm will build all non-experimental backends including bpf.
+To generate a smaller llc binary one can use::
+
+ -DLLVM_TARGETS_TO_BUILD="BPF"
+
+Quick sniplet for manually compiling LLVM and clang
+(build dependencies are cmake and gcc-c++)::
+
+ $ git clone http://llvm.org/git/llvm.git
+ $ cd llvm/tools
+ $ git clone --depth 1 http://llvm.org/git/clang.git
+ $ cd ..; mkdir build; cd build
+ $ cmake .. -DLLVM_TARGETS_TO_BUILD="BPF;X86"
+ $ make -j $(getconf _NPROCESSORS_ONLN)
+
+It is also possible to point make to the newly compiled 'llc' or
+'clang' command via redefining LLC or CLANG on the make command line::
+
+ make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
+
+Cross compiling samples
+-----------------------
+In order to cross-compile, say for arm64 targets, export CROSS_COMPILE and ARCH
+environment variables before calling make. This will direct make to build
+samples for the cross target.
+
+export ARCH=arm64
+export CROSS_COMPILE="aarch64-linux-gnu-"
+make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/bpf_insn.h b/samples/bpf/bpf_insn.h
new file mode 100644
index 000000000..20dc5cefe
--- /dev/null
+++ b/samples/bpf/bpf_insn.h
@@ -0,0 +1,197 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* eBPF instruction mini library */
+#ifndef __BPF_INSN_H
+#define __BPF_INSN_H
+
+struct bpf_insn;
+
+/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */
+
+#define BPF_ALU64_REG(OP, DST, SRC) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = 0 })
+
+#define BPF_ALU32_REG(OP, DST, SRC) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU | BPF_OP(OP) | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = 0 })
+
+/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */
+
+#define BPF_ALU64_IMM(OP, DST, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = IMM })
+
+#define BPF_ALU32_IMM(OP, DST, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU | BPF_OP(OP) | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = IMM })
+
+/* Short form of mov, dst_reg = src_reg */
+
+#define BPF_MOV64_REG(DST, SRC) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU64 | BPF_MOV | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = 0 })
+
+#define BPF_MOV32_REG(DST, SRC) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU | BPF_MOV | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = 0 })
+
+/* Short form of mov, dst_reg = imm32 */
+
+#define BPF_MOV64_IMM(DST, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU64 | BPF_MOV | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = IMM })
+
+#define BPF_MOV32_IMM(DST, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU | BPF_MOV | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = IMM })
+
+/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */
+#define BPF_LD_IMM64(DST, IMM) \
+ BPF_LD_IMM64_RAW(DST, 0, IMM)
+
+#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_LD | BPF_DW | BPF_IMM, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = (__u32) (IMM) }), \
+ ((struct bpf_insn) { \
+ .code = 0, /* zero is reserved opcode */ \
+ .dst_reg = 0, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = ((__u64) (IMM)) >> 32 })
+
+#ifndef BPF_PSEUDO_MAP_FD
+# define BPF_PSEUDO_MAP_FD 1
+#endif
+
+/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */
+#define BPF_LD_MAP_FD(DST, MAP_FD) \
+ BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)
+
+
+/* Direct packet access, R0 = *(uint *) (skb->data + imm32) */
+
+#define BPF_LD_ABS(SIZE, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \
+ .dst_reg = 0, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = IMM })
+
+/* Memory load, dst_reg = *(uint *) (src_reg + off16) */
+
+#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = 0 })
+
+/* Memory store, *(uint *) (dst_reg + off16) = src_reg */
+
+#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = 0 })
+
+/* Atomic memory add, *(uint *)(dst_reg + off16) += src_reg */
+
+#define BPF_STX_XADD(SIZE, DST, SRC, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = 0 })
+
+/* Memory store, *(uint *) (dst_reg + off16) = imm32 */
+
+#define BPF_ST_MEM(SIZE, DST, OFF, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = OFF, \
+ .imm = IMM })
+
+/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */
+
+#define BPF_JMP_REG(OP, DST, SRC, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_JMP | BPF_OP(OP) | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = 0 })
+
+/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */
+
+#define BPF_JMP_IMM(OP, DST, IMM, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_JMP | BPF_OP(OP) | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = OFF, \
+ .imm = IMM })
+
+/* Raw code statement block */
+
+#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \
+ ((struct bpf_insn) { \
+ .code = CODE, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = IMM })
+
+/* Program exit */
+
+#define BPF_EXIT_INSN() \
+ ((struct bpf_insn) { \
+ .code = BPF_JMP | BPF_EXIT, \
+ .dst_reg = 0, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = 0 })
+
+#endif
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
new file mode 100644
index 000000000..176c04a45
--- /dev/null
+++ b/samples/bpf/bpf_load.c
@@ -0,0 +1,688 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <libelf.h>
+#include <gelf.h>
+#include <errno.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/perf_event.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/types.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <poll.h>
+#include <ctype.h>
+#include <assert.h>
+#include <bpf/bpf.h>
+#include "bpf_load.h"
+#include "perf-sys.h"
+
+#define DEBUGFS "/sys/kernel/debug/tracing/"
+
+static char license[128];
+static int kern_version;
+static bool processed_sec[128];
+char bpf_log_buf[BPF_LOG_BUF_SIZE];
+int map_fd[MAX_MAPS];
+int prog_fd[MAX_PROGS];
+int event_fd[MAX_PROGS];
+int prog_cnt;
+int prog_array_fd = -1;
+
+struct bpf_map_data map_data[MAX_MAPS];
+int map_data_count = 0;
+
+static int populate_prog_array(const char *event, int prog_fd)
+{
+ int ind = atoi(event), err;
+
+ err = bpf_map_update_elem(prog_array_fd, &ind, &prog_fd, BPF_ANY);
+ if (err < 0) {
+ printf("failed to store prog_fd in prog_array\n");
+ return -1;
+ }
+ return 0;
+}
+
+static int write_kprobe_events(const char *val)
+{
+ int fd, ret, flags;
+
+ if (val == NULL)
+ return -1;
+ else if (val[0] == '\0')
+ flags = O_WRONLY | O_TRUNC;
+ else
+ flags = O_WRONLY | O_APPEND;
+
+ fd = open("/sys/kernel/debug/tracing/kprobe_events", flags);
+
+ ret = write(fd, val, strlen(val));
+ close(fd);
+
+ return ret;
+}
+
+static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
+{
+ bool is_socket = strncmp(event, "socket", 6) == 0;
+ bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
+ bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
+ bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0;
+ bool is_raw_tracepoint = strncmp(event, "raw_tracepoint/", 15) == 0;
+ bool is_xdp = strncmp(event, "xdp", 3) == 0;
+ bool is_perf_event = strncmp(event, "perf_event", 10) == 0;
+ bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0;
+ bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0;
+ bool is_sockops = strncmp(event, "sockops", 7) == 0;
+ bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0;
+ bool is_sk_msg = strncmp(event, "sk_msg", 6) == 0;
+ size_t insns_cnt = size / sizeof(struct bpf_insn);
+ enum bpf_prog_type prog_type;
+ char buf[256];
+ int fd, efd, err, id;
+ struct perf_event_attr attr = {};
+
+ attr.type = PERF_TYPE_TRACEPOINT;
+ attr.sample_type = PERF_SAMPLE_RAW;
+ attr.sample_period = 1;
+ attr.wakeup_events = 1;
+
+ if (is_socket) {
+ prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+ } else if (is_kprobe || is_kretprobe) {
+ prog_type = BPF_PROG_TYPE_KPROBE;
+ } else if (is_tracepoint) {
+ prog_type = BPF_PROG_TYPE_TRACEPOINT;
+ } else if (is_raw_tracepoint) {
+ prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT;
+ } else if (is_xdp) {
+ prog_type = BPF_PROG_TYPE_XDP;
+ } else if (is_perf_event) {
+ prog_type = BPF_PROG_TYPE_PERF_EVENT;
+ } else if (is_cgroup_skb) {
+ prog_type = BPF_PROG_TYPE_CGROUP_SKB;
+ } else if (is_cgroup_sk) {
+ prog_type = BPF_PROG_TYPE_CGROUP_SOCK;
+ } else if (is_sockops) {
+ prog_type = BPF_PROG_TYPE_SOCK_OPS;
+ } else if (is_sk_skb) {
+ prog_type = BPF_PROG_TYPE_SK_SKB;
+ } else if (is_sk_msg) {
+ prog_type = BPF_PROG_TYPE_SK_MSG;
+ } else {
+ printf("Unknown event '%s'\n", event);
+ return -1;
+ }
+
+ if (prog_cnt == MAX_PROGS)
+ return -1;
+
+ fd = bpf_load_program(prog_type, prog, insns_cnt, license, kern_version,
+ bpf_log_buf, BPF_LOG_BUF_SIZE);
+ if (fd < 0) {
+ printf("bpf_load_program() err=%d\n%s", errno, bpf_log_buf);
+ return -1;
+ }
+
+ prog_fd[prog_cnt++] = fd;
+
+ if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk)
+ return 0;
+
+ if (is_socket || is_sockops || is_sk_skb || is_sk_msg) {
+ if (is_socket)
+ event += 6;
+ else
+ event += 7;
+ if (*event != '/')
+ return 0;
+ event++;
+ if (!isdigit(*event)) {
+ printf("invalid prog number\n");
+ return -1;
+ }
+ return populate_prog_array(event, fd);
+ }
+
+ if (is_raw_tracepoint) {
+ efd = bpf_raw_tracepoint_open(event + 15, fd);
+ if (efd < 0) {
+ printf("tracepoint %s %s\n", event + 15, strerror(errno));
+ return -1;
+ }
+ event_fd[prog_cnt - 1] = efd;
+ return 0;
+ }
+
+ if (is_kprobe || is_kretprobe) {
+ bool need_normal_check = true;
+ const char *event_prefix = "";
+
+ if (is_kprobe)
+ event += 7;
+ else
+ event += 10;
+
+ if (*event == 0) {
+ printf("event name cannot be empty\n");
+ return -1;
+ }
+
+ if (isdigit(*event))
+ return populate_prog_array(event, fd);
+
+#ifdef __x86_64__
+ if (strncmp(event, "sys_", 4) == 0) {
+ snprintf(buf, sizeof(buf), "%c:__x64_%s __x64_%s",
+ is_kprobe ? 'p' : 'r', event, event);
+ err = write_kprobe_events(buf);
+ if (err >= 0) {
+ need_normal_check = false;
+ event_prefix = "__x64_";
+ }
+ }
+#endif
+ if (need_normal_check) {
+ snprintf(buf, sizeof(buf), "%c:%s %s",
+ is_kprobe ? 'p' : 'r', event, event);
+ err = write_kprobe_events(buf);
+ if (err < 0) {
+ printf("failed to create kprobe '%s' error '%s'\n",
+ event, strerror(errno));
+ return -1;
+ }
+ }
+
+ strcpy(buf, DEBUGFS);
+ strcat(buf, "events/kprobes/");
+ strcat(buf, event_prefix);
+ strcat(buf, event);
+ strcat(buf, "/id");
+ } else if (is_tracepoint) {
+ event += 11;
+
+ if (*event == 0) {
+ printf("event name cannot be empty\n");
+ return -1;
+ }
+ strcpy(buf, DEBUGFS);
+ strcat(buf, "events/");
+ strcat(buf, event);
+ strcat(buf, "/id");
+ }
+
+ efd = open(buf, O_RDONLY, 0);
+ if (efd < 0) {
+ printf("failed to open event %s\n", event);
+ return -1;
+ }
+
+ err = read(efd, buf, sizeof(buf));
+ if (err < 0 || err >= sizeof(buf)) {
+ printf("read from '%s' failed '%s'\n", event, strerror(errno));
+ return -1;
+ }
+
+ close(efd);
+
+ buf[err] = 0;
+ id = atoi(buf);
+ attr.config = id;
+
+ efd = sys_perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
+ if (efd < 0) {
+ printf("event %d fd %d err %s\n", id, efd, strerror(errno));
+ return -1;
+ }
+ event_fd[prog_cnt - 1] = efd;
+ err = ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
+ if (err < 0) {
+ printf("ioctl PERF_EVENT_IOC_ENABLE failed err %s\n",
+ strerror(errno));
+ return -1;
+ }
+ err = ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd);
+ if (err < 0) {
+ printf("ioctl PERF_EVENT_IOC_SET_BPF failed err %s\n",
+ strerror(errno));
+ return -1;
+ }
+
+ return 0;
+}
+
+static int load_maps(struct bpf_map_data *maps, int nr_maps,
+ fixup_map_cb fixup_map)
+{
+ int i, numa_node;
+
+ for (i = 0; i < nr_maps; i++) {
+ if (fixup_map) {
+ fixup_map(&maps[i], i);
+ /* Allow userspace to assign map FD prior to creation */
+ if (maps[i].fd != -1) {
+ map_fd[i] = maps[i].fd;
+ continue;
+ }
+ }
+
+ numa_node = maps[i].def.map_flags & BPF_F_NUMA_NODE ?
+ maps[i].def.numa_node : -1;
+
+ if (maps[i].def.type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
+ maps[i].def.type == BPF_MAP_TYPE_HASH_OF_MAPS) {
+ int inner_map_fd = map_fd[maps[i].def.inner_map_idx];
+
+ map_fd[i] = bpf_create_map_in_map_node(maps[i].def.type,
+ maps[i].name,
+ maps[i].def.key_size,
+ inner_map_fd,
+ maps[i].def.max_entries,
+ maps[i].def.map_flags,
+ numa_node);
+ } else {
+ map_fd[i] = bpf_create_map_node(maps[i].def.type,
+ maps[i].name,
+ maps[i].def.key_size,
+ maps[i].def.value_size,
+ maps[i].def.max_entries,
+ maps[i].def.map_flags,
+ numa_node);
+ }
+ if (map_fd[i] < 0) {
+ printf("failed to create a map: %d %s\n",
+ errno, strerror(errno));
+ return 1;
+ }
+ maps[i].fd = map_fd[i];
+
+ if (maps[i].def.type == BPF_MAP_TYPE_PROG_ARRAY)
+ prog_array_fd = map_fd[i];
+ }
+ return 0;
+}
+
+static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname,
+ GElf_Shdr *shdr, Elf_Data **data)
+{
+ Elf_Scn *scn;
+
+ scn = elf_getscn(elf, i);
+ if (!scn)
+ return 1;
+
+ if (gelf_getshdr(scn, shdr) != shdr)
+ return 2;
+
+ *shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
+ if (!*shname || !shdr->sh_size)
+ return 3;
+
+ *data = elf_getdata(scn, 0);
+ if (!*data || elf_getdata(scn, *data) != NULL)
+ return 4;
+
+ return 0;
+}
+
+static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols,
+ GElf_Shdr *shdr, struct bpf_insn *insn,
+ struct bpf_map_data *maps, int nr_maps)
+{
+ int i, nrels;
+
+ nrels = shdr->sh_size / shdr->sh_entsize;
+
+ for (i = 0; i < nrels; i++) {
+ GElf_Sym sym;
+ GElf_Rel rel;
+ unsigned int insn_idx;
+ bool match = false;
+ int j, map_idx;
+
+ gelf_getrel(data, i, &rel);
+
+ insn_idx = rel.r_offset / sizeof(struct bpf_insn);
+
+ gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym);
+
+ if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) {
+ printf("invalid relo for insn[%d].code 0x%x\n",
+ insn_idx, insn[insn_idx].code);
+ return 1;
+ }
+ insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD;
+
+ /* Match FD relocation against recorded map_data[] offset */
+ for (map_idx = 0; map_idx < nr_maps; map_idx++) {
+ if (maps[map_idx].elf_offset == sym.st_value) {
+ match = true;
+ break;
+ }
+ }
+ if (match) {
+ insn[insn_idx].imm = maps[map_idx].fd;
+ } else {
+ printf("invalid relo for insn[%d] no map_data match\n",
+ insn_idx);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static int cmp_symbols(const void *l, const void *r)
+{
+ const GElf_Sym *lsym = (const GElf_Sym *)l;
+ const GElf_Sym *rsym = (const GElf_Sym *)r;
+
+ if (lsym->st_value < rsym->st_value)
+ return -1;
+ else if (lsym->st_value > rsym->st_value)
+ return 1;
+ else
+ return 0;
+}
+
+static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx,
+ Elf *elf, Elf_Data *symbols, int strtabidx)
+{
+ int map_sz_elf, map_sz_copy;
+ bool validate_zero = false;
+ Elf_Data *data_maps;
+ int i, nr_maps;
+ GElf_Sym *sym;
+ Elf_Scn *scn;
+ int copy_sz;
+
+ if (maps_shndx < 0)
+ return -EINVAL;
+ if (!symbols)
+ return -EINVAL;
+
+ /* Get data for maps section via elf index */
+ scn = elf_getscn(elf, maps_shndx);
+ if (scn)
+ data_maps = elf_getdata(scn, NULL);
+ if (!scn || !data_maps) {
+ printf("Failed to get Elf_Data from maps section %d\n",
+ maps_shndx);
+ return -EINVAL;
+ }
+
+ /* For each map get corrosponding symbol table entry */
+ sym = calloc(MAX_MAPS+1, sizeof(GElf_Sym));
+ for (i = 0, nr_maps = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) {
+ assert(nr_maps < MAX_MAPS+1);
+ if (!gelf_getsym(symbols, i, &sym[nr_maps]))
+ continue;
+ if (sym[nr_maps].st_shndx != maps_shndx)
+ continue;
+ /* Only increment iif maps section */
+ nr_maps++;
+ }
+
+ /* Align to map_fd[] order, via sort on offset in sym.st_value */
+ qsort(sym, nr_maps, sizeof(GElf_Sym), cmp_symbols);
+
+ /* Keeping compatible with ELF maps section changes
+ * ------------------------------------------------
+ * The program size of struct bpf_load_map_def is known by loader
+ * code, but struct stored in ELF file can be different.
+ *
+ * Unfortunately sym[i].st_size is zero. To calculate the
+ * struct size stored in the ELF file, assume all struct have
+ * the same size, and simply divide with number of map
+ * symbols.
+ */
+ map_sz_elf = data_maps->d_size / nr_maps;
+ map_sz_copy = sizeof(struct bpf_load_map_def);
+ if (map_sz_elf < map_sz_copy) {
+ /*
+ * Backward compat, loading older ELF file with
+ * smaller struct, keeping remaining bytes zero.
+ */
+ map_sz_copy = map_sz_elf;
+ } else if (map_sz_elf > map_sz_copy) {
+ /*
+ * Forward compat, loading newer ELF file with larger
+ * struct with unknown features. Assume zero means
+ * feature not used. Thus, validate rest of struct
+ * data is zero.
+ */
+ validate_zero = true;
+ }
+
+ /* Memcpy relevant part of ELF maps data to loader maps */
+ for (i = 0; i < nr_maps; i++) {
+ struct bpf_load_map_def *def;
+ unsigned char *addr, *end;
+ const char *map_name;
+ size_t offset;
+
+ map_name = elf_strptr(elf, strtabidx, sym[i].st_name);
+ maps[i].name = strdup(map_name);
+ if (!maps[i].name) {
+ printf("strdup(%s): %s(%d)\n", map_name,
+ strerror(errno), errno);
+ free(sym);
+ return -errno;
+ }
+
+ /* Symbol value is offset into ELF maps section data area */
+ offset = sym[i].st_value;
+ def = (struct bpf_load_map_def *)(data_maps->d_buf + offset);
+ maps[i].elf_offset = offset;
+ memset(&maps[i].def, 0, sizeof(struct bpf_load_map_def));
+ memcpy(&maps[i].def, def, map_sz_copy);
+
+ /* Verify no newer features were requested */
+ if (validate_zero) {
+ addr = (unsigned char*) def + map_sz_copy;
+ end = (unsigned char*) def + map_sz_elf;
+ for (; addr < end; addr++) {
+ if (*addr != 0) {
+ free(sym);
+ return -EFBIG;
+ }
+ }
+ }
+ }
+
+ free(sym);
+ return nr_maps;
+}
+
+static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
+{
+ int fd, i, ret, maps_shndx = -1, strtabidx = -1;
+ Elf *elf;
+ GElf_Ehdr ehdr;
+ GElf_Shdr shdr, shdr_prog;
+ Elf_Data *data, *data_prog, *data_maps = NULL, *symbols = NULL;
+ char *shname, *shname_prog;
+ int nr_maps = 0;
+
+ /* reset global variables */
+ kern_version = 0;
+ memset(license, 0, sizeof(license));
+ memset(processed_sec, 0, sizeof(processed_sec));
+
+ if (elf_version(EV_CURRENT) == EV_NONE)
+ return 1;
+
+ fd = open(path, O_RDONLY, 0);
+ if (fd < 0)
+ return 1;
+
+ elf = elf_begin(fd, ELF_C_READ, NULL);
+
+ if (!elf)
+ return 1;
+
+ if (gelf_getehdr(elf, &ehdr) != &ehdr)
+ return 1;
+
+ /* clear all kprobes */
+ i = write_kprobe_events("");
+
+ /* scan over all elf sections to get license and map info */
+ for (i = 1; i < ehdr.e_shnum; i++) {
+
+ if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
+ continue;
+
+ if (0) /* helpful for llvm debugging */
+ printf("section %d:%s data %p size %zd link %d flags %d\n",
+ i, shname, data->d_buf, data->d_size,
+ shdr.sh_link, (int) shdr.sh_flags);
+
+ if (strcmp(shname, "license") == 0) {
+ processed_sec[i] = true;
+ memcpy(license, data->d_buf, data->d_size);
+ } else if (strcmp(shname, "version") == 0) {
+ processed_sec[i] = true;
+ if (data->d_size != sizeof(int)) {
+ printf("invalid size of version section %zd\n",
+ data->d_size);
+ return 1;
+ }
+ memcpy(&kern_version, data->d_buf, sizeof(int));
+ } else if (strcmp(shname, "maps") == 0) {
+ int j;
+
+ maps_shndx = i;
+ data_maps = data;
+ for (j = 0; j < MAX_MAPS; j++)
+ map_data[j].fd = -1;
+ } else if (shdr.sh_type == SHT_SYMTAB) {
+ strtabidx = shdr.sh_link;
+ symbols = data;
+ }
+ }
+
+ ret = 1;
+
+ if (!symbols) {
+ printf("missing SHT_SYMTAB section\n");
+ goto done;
+ }
+
+ if (data_maps) {
+ nr_maps = load_elf_maps_section(map_data, maps_shndx,
+ elf, symbols, strtabidx);
+ if (nr_maps < 0) {
+ printf("Error: Failed loading ELF maps (errno:%d):%s\n",
+ nr_maps, strerror(-nr_maps));
+ goto done;
+ }
+ if (load_maps(map_data, nr_maps, fixup_map))
+ goto done;
+ map_data_count = nr_maps;
+
+ processed_sec[maps_shndx] = true;
+ }
+
+ /* process all relo sections, and rewrite bpf insns for maps */
+ for (i = 1; i < ehdr.e_shnum; i++) {
+ if (processed_sec[i])
+ continue;
+
+ if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
+ continue;
+
+ if (shdr.sh_type == SHT_REL) {
+ struct bpf_insn *insns;
+
+ /* locate prog sec that need map fixup (relocations) */
+ if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog,
+ &shdr_prog, &data_prog))
+ continue;
+
+ if (shdr_prog.sh_type != SHT_PROGBITS ||
+ !(shdr_prog.sh_flags & SHF_EXECINSTR))
+ continue;
+
+ insns = (struct bpf_insn *) data_prog->d_buf;
+ processed_sec[i] = true; /* relo section */
+
+ if (parse_relo_and_apply(data, symbols, &shdr, insns,
+ map_data, nr_maps))
+ continue;
+ }
+ }
+
+ /* load programs */
+ for (i = 1; i < ehdr.e_shnum; i++) {
+
+ if (processed_sec[i])
+ continue;
+
+ if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
+ continue;
+
+ if (memcmp(shname, "kprobe/", 7) == 0 ||
+ memcmp(shname, "kretprobe/", 10) == 0 ||
+ memcmp(shname, "tracepoint/", 11) == 0 ||
+ memcmp(shname, "raw_tracepoint/", 15) == 0 ||
+ memcmp(shname, "xdp", 3) == 0 ||
+ memcmp(shname, "perf_event", 10) == 0 ||
+ memcmp(shname, "socket", 6) == 0 ||
+ memcmp(shname, "cgroup/", 7) == 0 ||
+ memcmp(shname, "sockops", 7) == 0 ||
+ memcmp(shname, "sk_skb", 6) == 0 ||
+ memcmp(shname, "sk_msg", 6) == 0) {
+ ret = load_and_attach(shname, data->d_buf,
+ data->d_size);
+ if (ret != 0)
+ goto done;
+ }
+ }
+
+done:
+ close(fd);
+ return ret;
+}
+
+int load_bpf_file(char *path)
+{
+ return do_load_bpf_file(path, NULL);
+}
+
+int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map)
+{
+ return do_load_bpf_file(path, fixup_map);
+}
+
+void read_trace_pipe(void)
+{
+ int trace_fd;
+
+ trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
+ if (trace_fd < 0)
+ return;
+
+ while (1) {
+ static char buf[4096];
+ ssize_t sz;
+
+ sz = read(trace_fd, buf, sizeof(buf) - 1);
+ if (sz > 0) {
+ buf[sz] = 0;
+ puts(buf);
+ }
+ }
+}
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
new file mode 100644
index 000000000..814894a12
--- /dev/null
+++ b/samples/bpf/bpf_load.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __BPF_LOAD_H
+#define __BPF_LOAD_H
+
+#include <bpf/bpf.h>
+
+#define MAX_MAPS 32
+#define MAX_PROGS 32
+
+struct bpf_load_map_def {
+ unsigned int type;
+ unsigned int key_size;
+ unsigned int value_size;
+ unsigned int max_entries;
+ unsigned int map_flags;
+ unsigned int inner_map_idx;
+ unsigned int numa_node;
+};
+
+struct bpf_map_data {
+ int fd;
+ char *name;
+ size_t elf_offset;
+ struct bpf_load_map_def def;
+};
+
+typedef void (*fixup_map_cb)(struct bpf_map_data *map, int idx);
+
+extern int prog_fd[MAX_PROGS];
+extern int event_fd[MAX_PROGS];
+extern char bpf_log_buf[BPF_LOG_BUF_SIZE];
+extern int prog_cnt;
+
+/* There is a one-to-one mapping between map_fd[] and map_data[].
+ * The map_data[] just contains more rich info on the given map.
+ */
+extern int map_fd[MAX_MAPS];
+extern struct bpf_map_data map_data[MAX_MAPS];
+extern int map_data_count;
+
+/* parses elf file compiled by llvm .c->.o
+ * . parses 'maps' section and creates maps via BPF syscall
+ * . parses 'license' section and passes it to syscall
+ * . parses elf relocations for BPF maps and adjusts BPF_LD_IMM64 insns by
+ * storing map_fd into insn->imm and marking such insns as BPF_PSEUDO_MAP_FD
+ * . loads eBPF programs via BPF syscall
+ *
+ * One ELF file can contain multiple BPF programs which will be loaded
+ * and their FDs stored stored in prog_fd array
+ *
+ * returns zero on success
+ */
+int load_bpf_file(char *path);
+int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map);
+
+void read_trace_pipe(void);
+int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags);
+#endif
diff --git a/samples/bpf/cookie_uid_helper_example.c b/samples/bpf/cookie_uid_helper_example.c
new file mode 100644
index 000000000..deb0e3e03
--- /dev/null
+++ b/samples/bpf/cookie_uid_helper_example.c
@@ -0,0 +1,323 @@
+/* This test is a demo of using get_socket_uid and get_socket_cookie
+ * helper function to do per socket based network traffic monitoring.
+ * It requires iptables version higher then 1.6.1. to load pinned eBPF
+ * program into the xt_bpf match.
+ *
+ * TEST:
+ * ./run_cookie_uid_helper_example.sh -option
+ * option:
+ * -t: do traffic monitoring test, the program will continuously
+ * print out network traffic happens after program started A sample
+ * output is shown below:
+ *
+ * cookie: 877, uid: 0x3e8, Pakcet Count: 20, Bytes Count: 11058
+ * cookie: 132, uid: 0x0, Pakcet Count: 2, Bytes Count: 286
+ * cookie: 812, uid: 0x3e8, Pakcet Count: 3, Bytes Count: 1726
+ * cookie: 802, uid: 0x3e8, Pakcet Count: 2, Bytes Count: 104
+ * cookie: 877, uid: 0x3e8, Pakcet Count: 20, Bytes Count: 11058
+ * cookie: 831, uid: 0x3e8, Pakcet Count: 2, Bytes Count: 104
+ * cookie: 0, uid: 0x0, Pakcet Count: 6, Bytes Count: 712
+ * cookie: 880, uid: 0xfffe, Pakcet Count: 1, Bytes Count: 70
+ *
+ * -s: do getsockopt SO_COOKIE test, the program will set up a pair of
+ * UDP sockets and send packets between them. And read out the traffic data
+ * directly from the ebpf map based on the socket cookie.
+ *
+ * Clean up: if using shell script, the script file will delete the iptables
+ * rule and unmount the bpf program when exit. Else the iptables rule need
+ * to be deleted by hand, see run_cookie_uid_helper_example.sh for detail.
+ */
+
+#define _GNU_SOURCE
+
+#define offsetof(type, member) __builtin_offsetof(type, member)
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <error.h>
+#include <limits.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <net/if.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <bpf/bpf.h>
+#include "bpf_insn.h"
+
+#define PORT 8888
+
+struct stats {
+ uint32_t uid;
+ uint64_t packets;
+ uint64_t bytes;
+};
+
+static int map_fd, prog_fd;
+
+static bool test_finish;
+
+static void maps_create(void)
+{
+ map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(uint32_t),
+ sizeof(struct stats), 100, 0);
+ if (map_fd < 0)
+ error(1, errno, "map create failed!\n");
+}
+
+static void prog_load(void)
+{
+ static char log_buf[1 << 16];
+
+ struct bpf_insn prog[] = {
+ /*
+ * Save sk_buff for future usage. value stored in R6 to R10 will
+ * not be reset after a bpf helper function call.
+ */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ /*
+ * pc1: BPF_FUNC_get_socket_cookie takes one parameter,
+ * R1: sk_buff
+ */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_get_socket_cookie),
+ /* pc2-4: save &socketCookie to r7 for future usage*/
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ /*
+ * pc5-8: set up the registers for BPF_FUNC_map_lookup_elem,
+ * it takes two parameters (R1: map_fd, R2: &socket_cookie)
+ */
+ BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_map_lookup_elem),
+ /*
+ * pc9. if r0 != 0x0, go to pc+14, since we have the cookie
+ * stored already
+ * Otherwise do pc10-22 to setup a new data entry.
+ */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 14),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_get_socket_uid),
+ /*
+ * Place a struct stats in the R10 stack and sequentially
+ * place the member value into the memory. Packets value
+ * is set by directly place a IMM value 1 into the stack.
+ */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0,
+ -32 + (__s16)offsetof(struct stats, uid)),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10,
+ -32 + (__s16)offsetof(struct stats, packets), 1),
+ /*
+ * __sk_buff is a special struct used for eBPF program to
+ * directly access some sk_buff field.
+ */
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6,
+ offsetof(struct __sk_buff, len)),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1,
+ -32 + (__s16)offsetof(struct stats, bytes)),
+ /*
+ * add new map entry using BPF_FUNC_map_update_elem, it takes
+ * 4 parameters (R1: map_fd, R2: &socket_cookie, R3: &stats,
+ * R4: flags)
+ */
+ BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -32),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_map_update_elem),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 5),
+ /*
+ * pc24-30 update the packet info to a exist data entry, it can
+ * be done by directly write to pointers instead of using
+ * BPF_FUNC_map_update_elem helper function
+ */
+ BPF_MOV64_REG(BPF_REG_9, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_1, 1),
+ BPF_STX_XADD(BPF_DW, BPF_REG_9, BPF_REG_1,
+ offsetof(struct stats, packets)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6,
+ offsetof(struct __sk_buff, len)),
+ BPF_STX_XADD(BPF_DW, BPF_REG_9, BPF_REG_1,
+ offsetof(struct stats, bytes)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6,
+ offsetof(struct __sk_buff, len)),
+ BPF_EXIT_INSN(),
+ };
+ prog_fd = bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, prog,
+ ARRAY_SIZE(prog), "GPL", 0,
+ log_buf, sizeof(log_buf));
+ if (prog_fd < 0)
+ error(1, errno, "failed to load prog\n%s\n", log_buf);
+}
+
+static void prog_attach_iptables(char *file)
+{
+ int ret;
+ char rules[100];
+
+ if (bpf_obj_pin(prog_fd, file))
+ error(1, errno, "bpf_obj_pin");
+ if (strlen(file) > 50) {
+ printf("file path too long: %s\n", file);
+ exit(1);
+ }
+ sprintf(rules, "iptables -A OUTPUT -m bpf --object-pinned %s -j ACCEPT",
+ file);
+ ret = system(rules);
+ if (ret < 0) {
+ printf("iptables rule update failed: %d/n", WEXITSTATUS(ret));
+ exit(1);
+ }
+}
+
+static void print_table(void)
+{
+ struct stats curEntry;
+ uint32_t curN = UINT32_MAX;
+ uint32_t nextN;
+ int res;
+
+ while (bpf_map_get_next_key(map_fd, &curN, &nextN) > -1) {
+ curN = nextN;
+ res = bpf_map_lookup_elem(map_fd, &curN, &curEntry);
+ if (res < 0) {
+ error(1, errno, "fail to get entry value of Key: %u\n",
+ curN);
+ } else {
+ printf("cookie: %u, uid: 0x%x, Packet Count: %lu,"
+ " Bytes Count: %lu\n", curN, curEntry.uid,
+ curEntry.packets, curEntry.bytes);
+ }
+ }
+}
+
+static void udp_client(void)
+{
+ struct sockaddr_in si_other = {0};
+ struct sockaddr_in si_me = {0};
+ struct stats dataEntry;
+ int s_rcv, s_send, i, recv_len;
+ char message = 'a';
+ char buf;
+ uint64_t cookie;
+ int res;
+ socklen_t cookie_len = sizeof(cookie);
+ socklen_t slen = sizeof(si_other);
+
+ s_rcv = socket(PF_INET, SOCK_DGRAM, 0);
+ if (s_rcv < 0)
+ error(1, errno, "rcv socket creat failed!\n");
+ si_other.sin_family = AF_INET;
+ si_other.sin_port = htons(PORT);
+ if (inet_aton("127.0.0.1", &si_other.sin_addr) == 0)
+ error(1, errno, "inet_aton\n");
+ if (bind(s_rcv, (struct sockaddr *)&si_other, sizeof(si_other)) == -1)
+ error(1, errno, "bind\n");
+ s_send = socket(PF_INET, SOCK_DGRAM, 0);
+ if (s_send < 0)
+ error(1, errno, "send socket creat failed!\n");
+ res = getsockopt(s_send, SOL_SOCKET, SO_COOKIE, &cookie, &cookie_len);
+ if (res < 0)
+ printf("get cookie failed: %s\n", strerror(errno));
+ res = bpf_map_lookup_elem(map_fd, &cookie, &dataEntry);
+ if (res != -1)
+ error(1, errno, "socket stat found while flow not active\n");
+ for (i = 0; i < 10; i++) {
+ res = sendto(s_send, &message, sizeof(message), 0,
+ (struct sockaddr *)&si_other, slen);
+ if (res == -1)
+ error(1, errno, "send\n");
+ if (res != sizeof(message))
+ error(1, 0, "%uB != %luB\n", res, sizeof(message));
+ recv_len = recvfrom(s_rcv, &buf, sizeof(buf), 0,
+ (struct sockaddr *)&si_me, &slen);
+ if (recv_len < 0)
+ error(1, errno, "receive\n");
+ res = memcmp(&(si_other.sin_addr), &(si_me.sin_addr),
+ sizeof(si_me.sin_addr));
+ if (res != 0)
+ error(1, EFAULT, "sender addr error: %d\n", res);
+ printf("Message received: %c\n", buf);
+ res = bpf_map_lookup_elem(map_fd, &cookie, &dataEntry);
+ if (res < 0)
+ error(1, errno, "lookup sk stat failed, cookie: %lu\n",
+ cookie);
+ printf("cookie: %lu, uid: 0x%x, Packet Count: %lu,"
+ " Bytes Count: %lu\n\n", cookie, dataEntry.uid,
+ dataEntry.packets, dataEntry.bytes);
+ }
+ close(s_send);
+ close(s_rcv);
+}
+
+static int usage(void)
+{
+ printf("Usage: ./run_cookie_uid_helper_example.sh"
+ " bpfObjName -option\n"
+ " -t traffic monitor test\n"
+ " -s getsockopt cookie test\n");
+ return 1;
+}
+
+static void finish(int ret)
+{
+ test_finish = true;
+}
+
+int main(int argc, char *argv[])
+{
+ int opt;
+ bool cfg_test_traffic = false;
+ bool cfg_test_cookie = false;
+
+ if (argc != 3)
+ return usage();
+ while ((opt = getopt(argc, argv, "ts")) != -1) {
+ switch (opt) {
+ case 't':
+ cfg_test_traffic = true;
+ break;
+ case 's':
+ cfg_test_cookie = true;
+ break;
+
+ default:
+ printf("unknown option %c\n", opt);
+ usage();
+ return -1;
+ }
+ }
+ maps_create();
+ prog_load();
+ prog_attach_iptables(argv[2]);
+ if (cfg_test_traffic) {
+ if (signal(SIGINT, finish) == SIG_ERR)
+ error(1, errno, "register SIGINT handler failed");
+ if (signal(SIGTERM, finish) == SIG_ERR)
+ error(1, errno, "register SIGTERM handler failed");
+ while (!test_finish) {
+ print_table();
+ printf("\n");
+ sleep(1);
+ };
+ } else if (cfg_test_cookie) {
+ udp_client();
+ }
+ close(prog_fd);
+ close(map_fd);
+ return 0;
+}
diff --git a/samples/bpf/cpustat_kern.c b/samples/bpf/cpustat_kern.c
new file mode 100644
index 000000000..68c84da06
--- /dev/null
+++ b/samples/bpf/cpustat_kern.c
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/version.h>
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+/*
+ * The CPU number, cstate number and pstate number are based
+ * on 96boards Hikey with octa CA53 CPUs.
+ *
+ * Every CPU have three idle states for cstate:
+ * WFI, CPU_OFF, CLUSTER_OFF
+ *
+ * Every CPU have 5 operating points:
+ * 208MHz, 432MHz, 729MHz, 960MHz, 1200MHz
+ *
+ * This code is based on these assumption and other platforms
+ * need to adjust these definitions.
+ */
+#define MAX_CPU 8
+#define MAX_PSTATE_ENTRIES 5
+#define MAX_CSTATE_ENTRIES 3
+
+static int cpu_opps[] = { 208000, 432000, 729000, 960000, 1200000 };
+
+/*
+ * my_map structure is used to record cstate and pstate index and
+ * timestamp (Idx, Ts), when new event incoming we need to update
+ * combination for new state index and timestamp (Idx`, Ts`).
+ *
+ * Based on (Idx, Ts) and (Idx`, Ts`) we can calculate the time
+ * interval for the previous state: Duration(Idx) = Ts` - Ts.
+ *
+ * Every CPU has one below array for recording state index and
+ * timestamp, and record for cstate and pstate saperately:
+ *
+ * +--------------------------+
+ * | cstate timestamp |
+ * +--------------------------+
+ * | cstate index |
+ * +--------------------------+
+ * | pstate timestamp |
+ * +--------------------------+
+ * | pstate index |
+ * +--------------------------+
+ */
+#define MAP_OFF_CSTATE_TIME 0
+#define MAP_OFF_CSTATE_IDX 1
+#define MAP_OFF_PSTATE_TIME 2
+#define MAP_OFF_PSTATE_IDX 3
+#define MAP_OFF_NUM 4
+
+struct bpf_map_def SEC("maps") my_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u64),
+ .max_entries = MAX_CPU * MAP_OFF_NUM,
+};
+
+/* cstate_duration records duration time for every idle state per CPU */
+struct bpf_map_def SEC("maps") cstate_duration = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u64),
+ .max_entries = MAX_CPU * MAX_CSTATE_ENTRIES,
+};
+
+/* pstate_duration records duration time for every operating point per CPU */
+struct bpf_map_def SEC("maps") pstate_duration = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u64),
+ .max_entries = MAX_CPU * MAX_PSTATE_ENTRIES,
+};
+
+/*
+ * The trace events for cpu_idle and cpu_frequency are taken from:
+ * /sys/kernel/debug/tracing/events/power/cpu_idle/format
+ * /sys/kernel/debug/tracing/events/power/cpu_frequency/format
+ *
+ * These two events have same format, so define one common structure.
+ */
+struct cpu_args {
+ u64 pad;
+ u32 state;
+ u32 cpu_id;
+};
+
+/* calculate pstate index, returns MAX_PSTATE_ENTRIES for failure */
+static u32 find_cpu_pstate_idx(u32 frequency)
+{
+ u32 i;
+
+ for (i = 0; i < sizeof(cpu_opps) / sizeof(u32); i++) {
+ if (frequency == cpu_opps[i])
+ return i;
+ }
+
+ return i;
+}
+
+SEC("tracepoint/power/cpu_idle")
+int bpf_prog1(struct cpu_args *ctx)
+{
+ u64 *cts, *pts, *cstate, *pstate, prev_state, cur_ts, delta;
+ u32 key, cpu, pstate_idx;
+ u64 *val;
+
+ if (ctx->cpu_id > MAX_CPU)
+ return 0;
+
+ cpu = ctx->cpu_id;
+
+ key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_TIME;
+ cts = bpf_map_lookup_elem(&my_map, &key);
+ if (!cts)
+ return 0;
+
+ key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
+ cstate = bpf_map_lookup_elem(&my_map, &key);
+ if (!cstate)
+ return 0;
+
+ key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
+ pts = bpf_map_lookup_elem(&my_map, &key);
+ if (!pts)
+ return 0;
+
+ key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
+ pstate = bpf_map_lookup_elem(&my_map, &key);
+ if (!pstate)
+ return 0;
+
+ prev_state = *cstate;
+ *cstate = ctx->state;
+
+ if (!*cts) {
+ *cts = bpf_ktime_get_ns();
+ return 0;
+ }
+
+ cur_ts = bpf_ktime_get_ns();
+ delta = cur_ts - *cts;
+ *cts = cur_ts;
+
+ /*
+ * When state doesn't equal to (u32)-1, the cpu will enter
+ * one idle state; for this case we need to record interval
+ * for the pstate.
+ *
+ * OPP2
+ * +---------------------+
+ * OPP1 | |
+ * ---------+ |
+ * | Idle state
+ * +---------------
+ *
+ * |<- pstate duration ->|
+ * ^ ^
+ * pts cur_ts
+ */
+ if (ctx->state != (u32)-1) {
+
+ /* record pstate after have first cpu_frequency event */
+ if (!*pts)
+ return 0;
+
+ delta = cur_ts - *pts;
+
+ pstate_idx = find_cpu_pstate_idx(*pstate);
+ if (pstate_idx >= MAX_PSTATE_ENTRIES)
+ return 0;
+
+ key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
+ val = bpf_map_lookup_elem(&pstate_duration, &key);
+ if (val)
+ __sync_fetch_and_add((long *)val, delta);
+
+ /*
+ * When state equal to (u32)-1, the cpu just exits from one
+ * specific idle state; for this case we need to record
+ * interval for the pstate.
+ *
+ * OPP2
+ * -----------+
+ * | OPP1
+ * | +-----------
+ * | Idle state |
+ * +---------------------+
+ *
+ * |<- cstate duration ->|
+ * ^ ^
+ * cts cur_ts
+ */
+ } else {
+
+ key = cpu * MAX_CSTATE_ENTRIES + prev_state;
+ val = bpf_map_lookup_elem(&cstate_duration, &key);
+ if (val)
+ __sync_fetch_and_add((long *)val, delta);
+ }
+
+ /* Update timestamp for pstate as new start time */
+ if (*pts)
+ *pts = cur_ts;
+
+ return 0;
+}
+
+SEC("tracepoint/power/cpu_frequency")
+int bpf_prog2(struct cpu_args *ctx)
+{
+ u64 *pts, *cstate, *pstate, prev_state, cur_ts, delta;
+ u32 key, cpu, pstate_idx;
+ u64 *val;
+
+ cpu = ctx->cpu_id;
+
+ key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
+ pts = bpf_map_lookup_elem(&my_map, &key);
+ if (!pts)
+ return 0;
+
+ key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
+ pstate = bpf_map_lookup_elem(&my_map, &key);
+ if (!pstate)
+ return 0;
+
+ key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
+ cstate = bpf_map_lookup_elem(&my_map, &key);
+ if (!cstate)
+ return 0;
+
+ prev_state = *pstate;
+ *pstate = ctx->state;
+
+ if (!*pts) {
+ *pts = bpf_ktime_get_ns();
+ return 0;
+ }
+
+ cur_ts = bpf_ktime_get_ns();
+ delta = cur_ts - *pts;
+ *pts = cur_ts;
+
+ /* When CPU is in idle, bail out to skip pstate statistics */
+ if (*cstate != (u32)(-1))
+ return 0;
+
+ /*
+ * The cpu changes to another different OPP (in below diagram
+ * change frequency from OPP3 to OPP1), need recording interval
+ * for previous frequency OPP3 and update timestamp as start
+ * time for new frequency OPP1.
+ *
+ * OPP3
+ * +---------------------+
+ * OPP2 | |
+ * ---------+ |
+ * | OPP1
+ * +---------------
+ *
+ * |<- pstate duration ->|
+ * ^ ^
+ * pts cur_ts
+ */
+ pstate_idx = find_cpu_pstate_idx(*pstate);
+ if (pstate_idx >= MAX_PSTATE_ENTRIES)
+ return 0;
+
+ key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
+ val = bpf_map_lookup_elem(&pstate_duration, &key);
+ if (val)
+ __sync_fetch_and_add((long *)val, delta);
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/cpustat_user.c b/samples/bpf/cpustat_user.c
new file mode 100644
index 000000000..869a99406
--- /dev/null
+++ b/samples/bpf/cpustat_user.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <sched.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <linux/bpf.h>
+#include <locale.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <sys/wait.h>
+
+#include <bpf/bpf.h>
+#include "bpf_load.h"
+
+#define MAX_CPU 8
+#define MAX_PSTATE_ENTRIES 5
+#define MAX_CSTATE_ENTRIES 3
+#define MAX_STARS 40
+
+#define CPUFREQ_MAX_SYSFS_PATH "/sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq"
+#define CPUFREQ_LOWEST_FREQ "208000"
+#define CPUFREQ_HIGHEST_FREQ "12000000"
+
+struct cpu_stat_data {
+ unsigned long cstate[MAX_CSTATE_ENTRIES];
+ unsigned long pstate[MAX_PSTATE_ENTRIES];
+};
+
+static struct cpu_stat_data stat_data[MAX_CPU];
+
+static void cpu_stat_print(void)
+{
+ int i, j;
+ char state_str[sizeof("cstate-9")];
+ struct cpu_stat_data *data;
+
+ /* Clear screen */
+ printf("\033[2J");
+
+ /* Header */
+ printf("\nCPU states statistics:\n");
+ printf("%-10s ", "state(ms)");
+
+ for (i = 0; i < MAX_CSTATE_ENTRIES; i++) {
+ sprintf(state_str, "cstate-%d", i);
+ printf("%-11s ", state_str);
+ }
+
+ for (i = 0; i < MAX_PSTATE_ENTRIES; i++) {
+ sprintf(state_str, "pstate-%d", i);
+ printf("%-11s ", state_str);
+ }
+
+ printf("\n");
+
+ for (j = 0; j < MAX_CPU; j++) {
+ data = &stat_data[j];
+
+ printf("CPU-%-6d ", j);
+ for (i = 0; i < MAX_CSTATE_ENTRIES; i++)
+ printf("%-11ld ", data->cstate[i] / 1000000);
+
+ for (i = 0; i < MAX_PSTATE_ENTRIES; i++)
+ printf("%-11ld ", data->pstate[i] / 1000000);
+
+ printf("\n");
+ }
+}
+
+static void cpu_stat_update(int cstate_fd, int pstate_fd)
+{
+ unsigned long key, value;
+ int c, i;
+
+ for (c = 0; c < MAX_CPU; c++) {
+ for (i = 0; i < MAX_CSTATE_ENTRIES; i++) {
+ key = c * MAX_CSTATE_ENTRIES + i;
+ bpf_map_lookup_elem(cstate_fd, &key, &value);
+ stat_data[c].cstate[i] = value;
+ }
+
+ for (i = 0; i < MAX_PSTATE_ENTRIES; i++) {
+ key = c * MAX_PSTATE_ENTRIES + i;
+ bpf_map_lookup_elem(pstate_fd, &key, &value);
+ stat_data[c].pstate[i] = value;
+ }
+ }
+}
+
+/*
+ * This function is copied from 'idlestat' tool function
+ * idlestat_wake_all() in idlestate.c.
+ *
+ * It sets the self running task affinity to cpus one by one so can wake up
+ * the specific CPU to handle scheduling; this results in all cpus can be
+ * waken up once and produce ftrace event 'trace_cpu_idle'.
+ */
+static int cpu_stat_inject_cpu_idle_event(void)
+{
+ int rcpu, i, ret;
+ cpu_set_t cpumask;
+ cpu_set_t original_cpumask;
+
+ ret = sysconf(_SC_NPROCESSORS_CONF);
+ if (ret < 0)
+ return -1;
+
+ rcpu = sched_getcpu();
+ if (rcpu < 0)
+ return -1;
+
+ /* Keep track of the CPUs we will run on */
+ sched_getaffinity(0, sizeof(original_cpumask), &original_cpumask);
+
+ for (i = 0; i < ret; i++) {
+
+ /* Pointless to wake up ourself */
+ if (i == rcpu)
+ continue;
+
+ /* Pointless to wake CPUs we will not run on */
+ if (!CPU_ISSET(i, &original_cpumask))
+ continue;
+
+ CPU_ZERO(&cpumask);
+ CPU_SET(i, &cpumask);
+
+ sched_setaffinity(0, sizeof(cpumask), &cpumask);
+ }
+
+ /* Enable all the CPUs of the original mask */
+ sched_setaffinity(0, sizeof(original_cpumask), &original_cpumask);
+ return 0;
+}
+
+/*
+ * It's possible to have no any frequency change for long time and cannot
+ * get ftrace event 'trace_cpu_frequency' for long period, this introduces
+ * big deviation for pstate statistics.
+ *
+ * To solve this issue, below code forces to set 'scaling_max_freq' to 208MHz
+ * for triggering ftrace event 'trace_cpu_frequency' and then recovery back to
+ * the maximum frequency value 1.2GHz.
+ */
+static int cpu_stat_inject_cpu_frequency_event(void)
+{
+ int len, fd;
+
+ fd = open(CPUFREQ_MAX_SYSFS_PATH, O_WRONLY);
+ if (fd < 0) {
+ printf("failed to open scaling_max_freq, errno=%d\n", errno);
+ return fd;
+ }
+
+ len = write(fd, CPUFREQ_LOWEST_FREQ, strlen(CPUFREQ_LOWEST_FREQ));
+ if (len < 0) {
+ printf("failed to open scaling_max_freq, errno=%d\n", errno);
+ goto err;
+ }
+
+ len = write(fd, CPUFREQ_HIGHEST_FREQ, strlen(CPUFREQ_HIGHEST_FREQ));
+ if (len < 0) {
+ printf("failed to open scaling_max_freq, errno=%d\n", errno);
+ goto err;
+ }
+
+err:
+ close(fd);
+ return len;
+}
+
+static void int_exit(int sig)
+{
+ cpu_stat_inject_cpu_idle_event();
+ cpu_stat_inject_cpu_frequency_event();
+ cpu_stat_update(map_fd[1], map_fd[2]);
+ cpu_stat_print();
+ exit(0);
+}
+
+int main(int argc, char **argv)
+{
+ char filename[256];
+ int ret;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ ret = cpu_stat_inject_cpu_idle_event();
+ if (ret < 0)
+ return 1;
+
+ ret = cpu_stat_inject_cpu_frequency_event();
+ if (ret < 0)
+ return 1;
+
+ signal(SIGINT, int_exit);
+ signal(SIGTERM, int_exit);
+
+ while (1) {
+ cpu_stat_update(map_fd[1], map_fd[2]);
+ cpu_stat_print();
+ sleep(5);
+ }
+
+ return 0;
+}
diff --git a/samples/bpf/fds_example.c b/samples/bpf/fds_example.c
new file mode 100644
index 000000000..9854854f0
--- /dev/null
+++ b/samples/bpf/fds_example.c
@@ -0,0 +1,188 @@
+#include <linux/unistd.h>
+#include <linux/bpf.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <string.h>
+#include <assert.h>
+#include <errno.h>
+
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#include <bpf/bpf.h>
+
+#include "bpf_insn.h"
+#include "bpf_load.h"
+#include "sock_example.h"
+
+#define BPF_F_PIN (1 << 0)
+#define BPF_F_GET (1 << 1)
+#define BPF_F_PIN_GET (BPF_F_PIN | BPF_F_GET)
+
+#define BPF_F_KEY (1 << 2)
+#define BPF_F_VAL (1 << 3)
+#define BPF_F_KEY_VAL (BPF_F_KEY | BPF_F_VAL)
+
+#define BPF_M_UNSPEC 0
+#define BPF_M_MAP 1
+#define BPF_M_PROG 2
+
+static void usage(void)
+{
+ printf("Usage: fds_example [...]\n");
+ printf(" -F <file> File to pin/get object\n");
+ printf(" -P |- pin object\n");
+ printf(" -G `- get object\n");
+ printf(" -m eBPF map mode\n");
+ printf(" -k <key> |- map key\n");
+ printf(" -v <value> `- map value\n");
+ printf(" -p eBPF prog mode\n");
+ printf(" -o <object> `- object file\n");
+ printf(" -h Display this help.\n");
+}
+
+static int bpf_map_create(void)
+{
+ return bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(uint32_t),
+ sizeof(uint32_t), 1024, 0);
+}
+
+static int bpf_prog_create(const char *object)
+{
+ static struct bpf_insn insns[] = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ };
+ size_t insns_cnt = sizeof(insns) / sizeof(struct bpf_insn);
+
+ if (object) {
+ assert(!load_bpf_file((char *)object));
+ return prog_fd[0];
+ } else {
+ return bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER,
+ insns, insns_cnt, "GPL", 0,
+ bpf_log_buf, BPF_LOG_BUF_SIZE);
+ }
+}
+
+static int bpf_do_map(const char *file, uint32_t flags, uint32_t key,
+ uint32_t value)
+{
+ int fd, ret;
+
+ if (flags & BPF_F_PIN) {
+ fd = bpf_map_create();
+ printf("bpf: map fd:%d (%s)\n", fd, strerror(errno));
+ assert(fd > 0);
+
+ ret = bpf_obj_pin(fd, file);
+ printf("bpf: pin ret:(%d,%s)\n", ret, strerror(errno));
+ assert(ret == 0);
+ } else {
+ fd = bpf_obj_get(file);
+ printf("bpf: get fd:%d (%s)\n", fd, strerror(errno));
+ assert(fd > 0);
+ }
+
+ if ((flags & BPF_F_KEY_VAL) == BPF_F_KEY_VAL) {
+ ret = bpf_map_update_elem(fd, &key, &value, 0);
+ printf("bpf: fd:%d u->(%u:%u) ret:(%d,%s)\n", fd, key, value,
+ ret, strerror(errno));
+ assert(ret == 0);
+ } else if (flags & BPF_F_KEY) {
+ ret = bpf_map_lookup_elem(fd, &key, &value);
+ printf("bpf: fd:%d l->(%u):%u ret:(%d,%s)\n", fd, key, value,
+ ret, strerror(errno));
+ assert(ret == 0);
+ }
+
+ return 0;
+}
+
+static int bpf_do_prog(const char *file, uint32_t flags, const char *object)
+{
+ int fd, sock, ret;
+
+ if (flags & BPF_F_PIN) {
+ fd = bpf_prog_create(object);
+ printf("bpf: prog fd:%d (%s)\n", fd, strerror(errno));
+ assert(fd > 0);
+
+ ret = bpf_obj_pin(fd, file);
+ printf("bpf: pin ret:(%d,%s)\n", ret, strerror(errno));
+ assert(ret == 0);
+ } else {
+ fd = bpf_obj_get(file);
+ printf("bpf: get fd:%d (%s)\n", fd, strerror(errno));
+ assert(fd > 0);
+ }
+
+ sock = open_raw_sock("lo");
+ assert(sock > 0);
+
+ ret = setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &fd, sizeof(fd));
+ printf("bpf: sock:%d <- fd:%d attached ret:(%d,%s)\n", sock, fd,
+ ret, strerror(errno));
+ assert(ret == 0);
+
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ const char *file = NULL, *object = NULL;
+ uint32_t key = 0, value = 0, flags = 0;
+ int opt, mode = BPF_M_UNSPEC;
+
+ while ((opt = getopt(argc, argv, "F:PGmk:v:po:")) != -1) {
+ switch (opt) {
+ /* General args */
+ case 'F':
+ file = optarg;
+ break;
+ case 'P':
+ flags |= BPF_F_PIN;
+ break;
+ case 'G':
+ flags |= BPF_F_GET;
+ break;
+ /* Map-related args */
+ case 'm':
+ mode = BPF_M_MAP;
+ break;
+ case 'k':
+ key = strtoul(optarg, NULL, 0);
+ flags |= BPF_F_KEY;
+ break;
+ case 'v':
+ value = strtoul(optarg, NULL, 0);
+ flags |= BPF_F_VAL;
+ break;
+ /* Prog-related args */
+ case 'p':
+ mode = BPF_M_PROG;
+ break;
+ case 'o':
+ object = optarg;
+ break;
+ default:
+ goto out;
+ }
+ }
+
+ if (!(flags & BPF_F_PIN_GET) || !file)
+ goto out;
+
+ switch (mode) {
+ case BPF_M_MAP:
+ return bpf_do_map(file, flags, key, value);
+ case BPF_M_PROG:
+ return bpf_do_prog(file, flags, object);
+ }
+out:
+ usage();
+ return -1;
+}
diff --git a/samples/bpf/hash_func01.h b/samples/bpf/hash_func01.h
new file mode 100644
index 000000000..38255812e
--- /dev/null
+++ b/samples/bpf/hash_func01.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: LGPL-2.1
+ *
+ * Based on Paul Hsieh's (LGPG 2.1) hash function
+ * From: http://www.azillionmonkeys.com/qed/hash.html
+ */
+
+#define get16bits(d) (*((const __u16 *) (d)))
+
+static __always_inline
+__u32 SuperFastHash (const char *data, int len, __u32 initval) {
+ __u32 hash = initval;
+ __u32 tmp;
+ int rem;
+
+ if (len <= 0 || data == NULL) return 0;
+
+ rem = len & 3;
+ len >>= 2;
+
+ /* Main loop */
+#pragma clang loop unroll(full)
+ for (;len > 0; len--) {
+ hash += get16bits (data);
+ tmp = (get16bits (data+2) << 11) ^ hash;
+ hash = (hash << 16) ^ tmp;
+ data += 2*sizeof (__u16);
+ hash += hash >> 11;
+ }
+
+ /* Handle end cases */
+ switch (rem) {
+ case 3: hash += get16bits (data);
+ hash ^= hash << 16;
+ hash ^= ((signed char)data[sizeof (__u16)]) << 18;
+ hash += hash >> 11;
+ break;
+ case 2: hash += get16bits (data);
+ hash ^= hash << 11;
+ hash += hash >> 17;
+ break;
+ case 1: hash += (signed char)*data;
+ hash ^= hash << 10;
+ hash += hash >> 1;
+ }
+
+ /* Force "avalanching" of final 127 bits */
+ hash ^= hash << 3;
+ hash += hash >> 5;
+ hash ^= hash << 4;
+ hash += hash >> 17;
+ hash ^= hash << 25;
+ hash += hash >> 6;
+
+ return hash;
+}
diff --git a/samples/bpf/lathist_kern.c b/samples/bpf/lathist_kern.c
new file mode 100644
index 000000000..18fa08847
--- /dev/null
+++ b/samples/bpf/lathist_kern.c
@@ -0,0 +1,99 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2015 BMW Car IT GmbH
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/version.h>
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+#define MAX_ENTRIES 20
+#define MAX_CPU 4
+
+/* We need to stick to static allocated memory (an array instead of
+ * hash table) because managing dynamic memory from the
+ * trace_preempt_[on|off] tracepoints hooks is not supported.
+ */
+
+struct bpf_map_def SEC("maps") my_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(u64),
+ .max_entries = MAX_CPU,
+};
+
+SEC("kprobe/trace_preempt_off")
+int bpf_prog1(struct pt_regs *ctx)
+{
+ int cpu = bpf_get_smp_processor_id();
+ u64 *ts = bpf_map_lookup_elem(&my_map, &cpu);
+
+ if (ts)
+ *ts = bpf_ktime_get_ns();
+
+ return 0;
+}
+
+static unsigned int log2(unsigned int v)
+{
+ unsigned int r;
+ unsigned int shift;
+
+ r = (v > 0xFFFF) << 4; v >>= r;
+ shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+ shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+ shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+ r |= (v >> 1);
+
+ return r;
+}
+
+static unsigned int log2l(unsigned long v)
+{
+ unsigned int hi = v >> 32;
+
+ if (hi)
+ return log2(hi) + 32;
+ else
+ return log2(v);
+}
+
+struct bpf_map_def SEC("maps") my_lat = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(long),
+ .max_entries = MAX_CPU * MAX_ENTRIES,
+};
+
+SEC("kprobe/trace_preempt_on")
+int bpf_prog2(struct pt_regs *ctx)
+{
+ u64 *ts, cur_ts, delta;
+ int key, cpu;
+ long *val;
+
+ cpu = bpf_get_smp_processor_id();
+ ts = bpf_map_lookup_elem(&my_map, &cpu);
+ if (!ts)
+ return 0;
+
+ cur_ts = bpf_ktime_get_ns();
+ delta = log2l(cur_ts - *ts);
+
+ if (delta > MAX_ENTRIES - 1)
+ delta = MAX_ENTRIES - 1;
+
+ key = cpu * MAX_ENTRIES + delta;
+ val = bpf_map_lookup_elem(&my_lat, &key);
+ if (val)
+ __sync_fetch_and_add((long *)val, 1);
+
+ return 0;
+
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/lathist_user.c b/samples/bpf/lathist_user.c
new file mode 100644
index 000000000..c8e88cc84
--- /dev/null
+++ b/samples/bpf/lathist_user.c
@@ -0,0 +1,103 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2015 BMW Car IT GmbH
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/bpf.h>
+#include <bpf/bpf.h>
+#include "bpf_load.h"
+
+#define MAX_ENTRIES 20
+#define MAX_CPU 4
+#define MAX_STARS 40
+
+struct cpu_hist {
+ long data[MAX_ENTRIES];
+ long max;
+};
+
+static struct cpu_hist cpu_hist[MAX_CPU];
+
+static void stars(char *str, long val, long max, int width)
+{
+ int i;
+
+ for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
+ str[i] = '*';
+ if (val > max)
+ str[i - 1] = '+';
+ str[i] = '\0';
+}
+
+static void print_hist(void)
+{
+ char starstr[MAX_STARS];
+ struct cpu_hist *hist;
+ int i, j;
+
+ /* clear screen */
+ printf("\033[2J");
+
+ for (j = 0; j < MAX_CPU; j++) {
+ hist = &cpu_hist[j];
+
+ /* ignore CPUs without data (maybe offline?) */
+ if (hist->max == 0)
+ continue;
+
+ printf("CPU %d\n", j);
+ printf(" latency : count distribution\n");
+ for (i = 1; i <= MAX_ENTRIES; i++) {
+ stars(starstr, hist->data[i - 1], hist->max, MAX_STARS);
+ printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
+ (1l << i) >> 1, (1l << i) - 1,
+ hist->data[i - 1], MAX_STARS, starstr);
+ }
+ }
+}
+
+static void get_data(int fd)
+{
+ long key, value;
+ int c, i;
+
+ for (i = 0; i < MAX_CPU; i++)
+ cpu_hist[i].max = 0;
+
+ for (c = 0; c < MAX_CPU; c++) {
+ for (i = 0; i < MAX_ENTRIES; i++) {
+ key = c * MAX_ENTRIES + i;
+ bpf_map_lookup_elem(fd, &key, &value);
+
+ cpu_hist[c].data[i] = value;
+ if (value > cpu_hist[c].max)
+ cpu_hist[c].max = value;
+ }
+ }
+}
+
+int main(int argc, char **argv)
+{
+ char filename[256];
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ while (1) {
+ get_data(map_fd[1]);
+ print_hist();
+ sleep(5);
+ }
+
+ return 0;
+}
diff --git a/samples/bpf/load_sock_ops.c b/samples/bpf/load_sock_ops.c
new file mode 100644
index 000000000..8ecb41ea0
--- /dev/null
+++ b/samples/bpf/load_sock_ops.c
@@ -0,0 +1,97 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <bpf/bpf.h>
+#include "bpf_load.h"
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/unistd.h>
+
+static void usage(char *pname)
+{
+ printf("USAGE:\n %s [-l] <cg-path> <prog filename>\n", pname);
+ printf("\tLoad and attach a sock_ops program to the specified "
+ "cgroup\n");
+ printf("\tIf \"-l\" is used, the program will continue to run\n");
+ printf("\tprinting the BPF log buffer\n");
+ printf("\tIf the specified filename does not end in \".o\", it\n");
+ printf("\tappends \"_kern.o\" to the name\n");
+ printf("\n");
+ printf(" %s -r <cg-path>\n", pname);
+ printf("\tDetaches the currently attached sock_ops program\n");
+ printf("\tfrom the specified cgroup\n");
+ printf("\n");
+ exit(1);
+}
+
+int main(int argc, char **argv)
+{
+ int logFlag = 0;
+ int error = 0;
+ char *cg_path;
+ char fn[500];
+ char *prog;
+ int cg_fd;
+
+ if (argc < 3)
+ usage(argv[0]);
+
+ if (!strcmp(argv[1], "-r")) {
+ cg_path = argv[2];
+ cg_fd = open(cg_path, O_DIRECTORY, O_RDONLY);
+ error = bpf_prog_detach(cg_fd, BPF_CGROUP_SOCK_OPS);
+ if (error) {
+ printf("ERROR: bpf_prog_detach: %d (%s)\n",
+ error, strerror(errno));
+ return 2;
+ }
+ return 0;
+ } else if (!strcmp(argv[1], "-h")) {
+ usage(argv[0]);
+ } else if (!strcmp(argv[1], "-l")) {
+ logFlag = 1;
+ if (argc < 4)
+ usage(argv[0]);
+ }
+
+ prog = argv[argc - 1];
+ cg_path = argv[argc - 2];
+ if (strlen(prog) > 480) {
+ fprintf(stderr, "ERROR: program name too long (> 480 chars)\n");
+ return 3;
+ }
+ cg_fd = open(cg_path, O_DIRECTORY, O_RDONLY);
+
+ if (!strcmp(prog + strlen(prog)-2, ".o"))
+ strcpy(fn, prog);
+ else
+ sprintf(fn, "%s_kern.o", prog);
+ if (logFlag)
+ printf("loading bpf file:%s\n", fn);
+ if (load_bpf_file(fn)) {
+ printf("ERROR: load_bpf_file failed for: %s\n", fn);
+ printf("%s", bpf_log_buf);
+ return 4;
+ }
+ if (logFlag)
+ printf("TCP BPF Loaded %s\n", fn);
+
+ error = bpf_prog_attach(prog_fd[0], cg_fd, BPF_CGROUP_SOCK_OPS, 0);
+ if (error) {
+ printf("ERROR: bpf_prog_attach: %d (%s)\n",
+ error, strerror(errno));
+ return 5;
+ } else if (logFlag) {
+ read_trace_pipe();
+ }
+
+ return error;
+}
diff --git a/samples/bpf/lwt_len_hist.sh b/samples/bpf/lwt_len_hist.sh
new file mode 100755
index 000000000..0eda9754f
--- /dev/null
+++ b/samples/bpf/lwt_len_hist.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+NS1=lwt_ns1
+VETH0=tst_lwt1a
+VETH1=tst_lwt1b
+
+TRACE_ROOT=/sys/kernel/debug/tracing
+
+function cleanup {
+ # To reset saved histogram, remove pinned map
+ rm /sys/fs/bpf/tc/globals/lwt_len_hist_map
+ ip route del 192.168.253.2/32 dev $VETH0 2> /dev/null
+ ip link del $VETH0 2> /dev/null
+ ip link del $VETH1 2> /dev/null
+ ip netns exec $NS1 killall netserver
+ ip netns delete $NS1 2> /dev/null
+}
+
+cleanup
+
+ip netns add $NS1
+ip link add $VETH0 type veth peer name $VETH1
+ip link set dev $VETH0 up
+ip addr add 192.168.253.1/24 dev $VETH0
+ip link set $VETH1 netns $NS1
+ip netns exec $NS1 ip link set dev $VETH1 up
+ip netns exec $NS1 ip addr add 192.168.253.2/24 dev $VETH1
+ip netns exec $NS1 netserver
+
+echo 1 > ${TRACE_ROOT}/tracing_on
+cp /dev/null ${TRACE_ROOT}/trace
+ip route add 192.168.253.2/32 encap bpf out obj lwt_len_hist_kern.o section len_hist dev $VETH0
+netperf -H 192.168.253.2 -t TCP_STREAM
+cat ${TRACE_ROOT}/trace | grep -v '^#'
+./lwt_len_hist
+cleanup
+echo 0 > ${TRACE_ROOT}/tracing_on
+
+exit 0
diff --git a/samples/bpf/lwt_len_hist_kern.c b/samples/bpf/lwt_len_hist_kern.c
new file mode 100644
index 000000000..df7538328
--- /dev/null
+++ b/samples/bpf/lwt_len_hist_kern.c
@@ -0,0 +1,82 @@
+/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/ip.h>
+#include <uapi/linux/in.h>
+#include "bpf_helpers.h"
+
+# define printk(fmt, ...) \
+ ({ \
+ char ____fmt[] = fmt; \
+ bpf_trace_printk(____fmt, sizeof(____fmt), \
+ ##__VA_ARGS__); \
+ })
+
+struct bpf_elf_map {
+ __u32 type;
+ __u32 size_key;
+ __u32 size_value;
+ __u32 max_elem;
+ __u32 flags;
+ __u32 id;
+ __u32 pinning;
+};
+
+struct bpf_elf_map SEC("maps") lwt_len_hist_map = {
+ .type = BPF_MAP_TYPE_PERCPU_HASH,
+ .size_key = sizeof(__u64),
+ .size_value = sizeof(__u64),
+ .pinning = 2,
+ .max_elem = 1024,
+};
+
+static unsigned int log2(unsigned int v)
+{
+ unsigned int r;
+ unsigned int shift;
+
+ r = (v > 0xFFFF) << 4; v >>= r;
+ shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+ shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+ shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+ r |= (v >> 1);
+ return r;
+}
+
+static unsigned int log2l(unsigned long v)
+{
+ unsigned int hi = v >> 32;
+ if (hi)
+ return log2(hi) + 32;
+ else
+ return log2(v);
+}
+
+SEC("len_hist")
+int do_len_hist(struct __sk_buff *skb)
+{
+ __u64 *value, key, init_val = 1;
+
+ key = log2l(skb->len);
+
+ value = bpf_map_lookup_elem(&lwt_len_hist_map, &key);
+ if (value)
+ __sync_fetch_and_add(value, 1);
+ else
+ bpf_map_update_elem(&lwt_len_hist_map, &key, &init_val, BPF_ANY);
+
+ return BPF_OK;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/lwt_len_hist_user.c b/samples/bpf/lwt_len_hist_user.c
new file mode 100644
index 000000000..430a4b7e3
--- /dev/null
+++ b/samples/bpf/lwt_len_hist_user.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/unistd.h>
+#include <linux/bpf.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <arpa/inet.h>
+
+#include <bpf/bpf.h>
+#include "bpf_util.h"
+
+#define MAX_INDEX 64
+#define MAX_STARS 38
+
+static void stars(char *str, long val, long max, int width)
+{
+ int i;
+
+ for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
+ str[i] = '*';
+ if (val > max)
+ str[i - 1] = '+';
+ str[i] = '\0';
+}
+
+int main(int argc, char **argv)
+{
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ const char *map_filename = "/sys/fs/bpf/tc/globals/lwt_len_hist_map";
+ uint64_t values[nr_cpus], sum, max_value = 0, data[MAX_INDEX] = {};
+ uint64_t key = 0, next_key, max_key = 0;
+ char starstr[MAX_STARS];
+ int i, map_fd;
+
+ map_fd = bpf_obj_get(map_filename);
+ if (map_fd < 0) {
+ fprintf(stderr, "bpf_obj_get(%s): %s(%d)\n",
+ map_filename, strerror(errno), errno);
+ return -1;
+ }
+
+ while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0) {
+ if (next_key >= MAX_INDEX) {
+ fprintf(stderr, "Key %lu out of bounds\n", next_key);
+ continue;
+ }
+
+ bpf_map_lookup_elem(map_fd, &next_key, values);
+
+ sum = 0;
+ for (i = 0; i < nr_cpus; i++)
+ sum += values[i];
+
+ data[next_key] = sum;
+ if (sum && next_key > max_key)
+ max_key = next_key;
+
+ if (sum > max_value)
+ max_value = sum;
+
+ key = next_key;
+ }
+
+ for (i = 1; i <= max_key + 1; i++) {
+ stars(starstr, data[i - 1], max_value, MAX_STARS);
+ printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
+ (1l << i) >> 1, (1l << i) - 1, data[i - 1],
+ MAX_STARS, starstr);
+ }
+
+ close(map_fd);
+
+ return 0;
+}
diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c
new file mode 100644
index 000000000..2b2ffb970
--- /dev/null
+++ b/samples/bpf/map_perf_test_kern.c
@@ -0,0 +1,283 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+#define MAX_ENTRIES 1000
+#define MAX_NR_CPUS 1024
+
+struct bpf_map_def SEC("maps") hash_map = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(long),
+ .max_entries = MAX_ENTRIES,
+};
+
+struct bpf_map_def SEC("maps") lru_hash_map = {
+ .type = BPF_MAP_TYPE_LRU_HASH,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(long),
+ .max_entries = 10000,
+};
+
+struct bpf_map_def SEC("maps") nocommon_lru_hash_map = {
+ .type = BPF_MAP_TYPE_LRU_HASH,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(long),
+ .max_entries = 10000,
+ .map_flags = BPF_F_NO_COMMON_LRU,
+};
+
+struct bpf_map_def SEC("maps") inner_lru_hash_map = {
+ .type = BPF_MAP_TYPE_LRU_HASH,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(long),
+ .max_entries = MAX_ENTRIES,
+ .map_flags = BPF_F_NUMA_NODE,
+ .numa_node = 0,
+};
+
+struct bpf_map_def SEC("maps") array_of_lru_hashs = {
+ .type = BPF_MAP_TYPE_ARRAY_OF_MAPS,
+ .key_size = sizeof(u32),
+ .max_entries = MAX_NR_CPUS,
+};
+
+struct bpf_map_def SEC("maps") percpu_hash_map = {
+ .type = BPF_MAP_TYPE_PERCPU_HASH,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(long),
+ .max_entries = MAX_ENTRIES,
+};
+
+struct bpf_map_def SEC("maps") hash_map_alloc = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(long),
+ .max_entries = MAX_ENTRIES,
+ .map_flags = BPF_F_NO_PREALLOC,
+};
+
+struct bpf_map_def SEC("maps") percpu_hash_map_alloc = {
+ .type = BPF_MAP_TYPE_PERCPU_HASH,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(long),
+ .max_entries = MAX_ENTRIES,
+ .map_flags = BPF_F_NO_PREALLOC,
+};
+
+struct bpf_map_def SEC("maps") lpm_trie_map_alloc = {
+ .type = BPF_MAP_TYPE_LPM_TRIE,
+ .key_size = 8,
+ .value_size = sizeof(long),
+ .max_entries = 10000,
+ .map_flags = BPF_F_NO_PREALLOC,
+};
+
+struct bpf_map_def SEC("maps") array_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(long),
+ .max_entries = MAX_ENTRIES,
+};
+
+struct bpf_map_def SEC("maps") lru_hash_lookup_map = {
+ .type = BPF_MAP_TYPE_LRU_HASH,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(long),
+ .max_entries = MAX_ENTRIES,
+};
+
+SEC("kprobe/sys_getuid")
+int stress_hmap(struct pt_regs *ctx)
+{
+ u32 key = bpf_get_current_pid_tgid();
+ long init_val = 1;
+ long *value;
+
+ bpf_map_update_elem(&hash_map, &key, &init_val, BPF_ANY);
+ value = bpf_map_lookup_elem(&hash_map, &key);
+ if (value)
+ bpf_map_delete_elem(&hash_map, &key);
+
+ return 0;
+}
+
+SEC("kprobe/sys_geteuid")
+int stress_percpu_hmap(struct pt_regs *ctx)
+{
+ u32 key = bpf_get_current_pid_tgid();
+ long init_val = 1;
+ long *value;
+
+ bpf_map_update_elem(&percpu_hash_map, &key, &init_val, BPF_ANY);
+ value = bpf_map_lookup_elem(&percpu_hash_map, &key);
+ if (value)
+ bpf_map_delete_elem(&percpu_hash_map, &key);
+ return 0;
+}
+
+SEC("kprobe/sys_getgid")
+int stress_hmap_alloc(struct pt_regs *ctx)
+{
+ u32 key = bpf_get_current_pid_tgid();
+ long init_val = 1;
+ long *value;
+
+ bpf_map_update_elem(&hash_map_alloc, &key, &init_val, BPF_ANY);
+ value = bpf_map_lookup_elem(&hash_map_alloc, &key);
+ if (value)
+ bpf_map_delete_elem(&hash_map_alloc, &key);
+ return 0;
+}
+
+SEC("kprobe/sys_getegid")
+int stress_percpu_hmap_alloc(struct pt_regs *ctx)
+{
+ u32 key = bpf_get_current_pid_tgid();
+ long init_val = 1;
+ long *value;
+
+ bpf_map_update_elem(&percpu_hash_map_alloc, &key, &init_val, BPF_ANY);
+ value = bpf_map_lookup_elem(&percpu_hash_map_alloc, &key);
+ if (value)
+ bpf_map_delete_elem(&percpu_hash_map_alloc, &key);
+ return 0;
+}
+
+SEC("kprobe/sys_connect")
+int stress_lru_hmap_alloc(struct pt_regs *ctx)
+{
+ char fmt[] = "Failed at stress_lru_hmap_alloc. ret:%dn";
+ union {
+ u16 dst6[8];
+ struct {
+ u16 magic0;
+ u16 magic1;
+ u16 tcase;
+ u16 unused16;
+ u32 unused32;
+ u32 key;
+ };
+ } test_params;
+ struct sockaddr_in6 *in6;
+ u16 test_case;
+ int addrlen, ret;
+ long val = 1;
+ u32 key = 0;
+
+ in6 = (struct sockaddr_in6 *)PT_REGS_PARM2(ctx);
+ addrlen = (int)PT_REGS_PARM3(ctx);
+
+ if (addrlen != sizeof(*in6))
+ return 0;
+
+ ret = bpf_probe_read(test_params.dst6, sizeof(test_params.dst6),
+ &in6->sin6_addr);
+ if (ret)
+ goto done;
+
+ if (test_params.magic0 != 0xdead ||
+ test_params.magic1 != 0xbeef)
+ return 0;
+
+ test_case = test_params.tcase;
+ if (test_case != 3)
+ key = bpf_get_prandom_u32();
+
+ if (test_case == 0) {
+ ret = bpf_map_update_elem(&lru_hash_map, &key, &val, BPF_ANY);
+ } else if (test_case == 1) {
+ ret = bpf_map_update_elem(&nocommon_lru_hash_map, &key, &val,
+ BPF_ANY);
+ } else if (test_case == 2) {
+ void *nolocal_lru_map;
+ int cpu = bpf_get_smp_processor_id();
+
+ nolocal_lru_map = bpf_map_lookup_elem(&array_of_lru_hashs,
+ &cpu);
+ if (!nolocal_lru_map) {
+ ret = -ENOENT;
+ goto done;
+ }
+
+ ret = bpf_map_update_elem(nolocal_lru_map, &key, &val,
+ BPF_ANY);
+ } else if (test_case == 3) {
+ u32 i;
+
+ key = test_params.key;
+
+#pragma clang loop unroll(full)
+ for (i = 0; i < 32; i++) {
+ bpf_map_lookup_elem(&lru_hash_lookup_map, &key);
+ key++;
+ }
+ } else {
+ ret = -EINVAL;
+ }
+
+done:
+ if (ret)
+ bpf_trace_printk(fmt, sizeof(fmt), ret);
+
+ return 0;
+}
+
+SEC("kprobe/sys_gettid")
+int stress_lpm_trie_map_alloc(struct pt_regs *ctx)
+{
+ union {
+ u32 b32[2];
+ u8 b8[8];
+ } key;
+ unsigned int i;
+
+ key.b32[0] = 32;
+ key.b8[4] = 192;
+ key.b8[5] = 168;
+ key.b8[6] = 0;
+ key.b8[7] = 1;
+
+#pragma clang loop unroll(full)
+ for (i = 0; i < 32; ++i)
+ bpf_map_lookup_elem(&lpm_trie_map_alloc, &key);
+
+ return 0;
+}
+
+SEC("kprobe/sys_getpgid")
+int stress_hash_map_lookup(struct pt_regs *ctx)
+{
+ u32 key = 1, i;
+ long *value;
+
+#pragma clang loop unroll(full)
+ for (i = 0; i < 64; ++i)
+ value = bpf_map_lookup_elem(&hash_map, &key);
+
+ return 0;
+}
+
+SEC("kprobe/sys_getppid")
+int stress_array_map_lookup(struct pt_regs *ctx)
+{
+ u32 key = 1, i;
+ long *value;
+
+#pragma clang loop unroll(full)
+ for (i = 0; i < 64; ++i)
+ value = bpf_map_lookup_elem(&array_map, &key);
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c
new file mode 100644
index 000000000..38b7b1a96
--- /dev/null
+++ b/samples/bpf/map_perf_test_user.c
@@ -0,0 +1,464 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#define _GNU_SOURCE
+#include <sched.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <asm/unistd.h>
+#include <unistd.h>
+#include <assert.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/bpf.h>
+#include <string.h>
+#include <time.h>
+#include <sys/resource.h>
+#include <arpa/inet.h>
+#include <errno.h>
+
+#include <bpf/bpf.h>
+#include "bpf_load.h"
+
+#define TEST_BIT(t) (1U << (t))
+#define MAX_NR_CPUS 1024
+
+static __u64 time_get_ns(void)
+{
+ struct timespec ts;
+
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ return ts.tv_sec * 1000000000ull + ts.tv_nsec;
+}
+
+enum test_type {
+ HASH_PREALLOC,
+ PERCPU_HASH_PREALLOC,
+ HASH_KMALLOC,
+ PERCPU_HASH_KMALLOC,
+ LRU_HASH_PREALLOC,
+ NOCOMMON_LRU_HASH_PREALLOC,
+ LPM_KMALLOC,
+ HASH_LOOKUP,
+ ARRAY_LOOKUP,
+ INNER_LRU_HASH_PREALLOC,
+ LRU_HASH_LOOKUP,
+ NR_TESTS,
+};
+
+const char *test_map_names[NR_TESTS] = {
+ [HASH_PREALLOC] = "hash_map",
+ [PERCPU_HASH_PREALLOC] = "percpu_hash_map",
+ [HASH_KMALLOC] = "hash_map_alloc",
+ [PERCPU_HASH_KMALLOC] = "percpu_hash_map_alloc",
+ [LRU_HASH_PREALLOC] = "lru_hash_map",
+ [NOCOMMON_LRU_HASH_PREALLOC] = "nocommon_lru_hash_map",
+ [LPM_KMALLOC] = "lpm_trie_map_alloc",
+ [HASH_LOOKUP] = "hash_map",
+ [ARRAY_LOOKUP] = "array_map",
+ [INNER_LRU_HASH_PREALLOC] = "inner_lru_hash_map",
+ [LRU_HASH_LOOKUP] = "lru_hash_lookup_map",
+};
+
+static int test_flags = ~0;
+static uint32_t num_map_entries;
+static uint32_t inner_lru_hash_size;
+static int inner_lru_hash_idx = -1;
+static int array_of_lru_hashs_idx = -1;
+static int lru_hash_lookup_idx = -1;
+static int lru_hash_lookup_test_entries = 32;
+static uint32_t max_cnt = 1000000;
+
+static int check_test_flags(enum test_type t)
+{
+ return test_flags & TEST_BIT(t);
+}
+
+static void test_hash_prealloc(int cpu)
+{
+ __u64 start_time;
+ int i;
+
+ start_time = time_get_ns();
+ for (i = 0; i < max_cnt; i++)
+ syscall(__NR_getuid);
+ printf("%d:hash_map_perf pre-alloc %lld events per sec\n",
+ cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time));
+}
+
+static int pre_test_lru_hash_lookup(int tasks)
+{
+ int fd = map_fd[lru_hash_lookup_idx];
+ uint32_t key;
+ long val = 1;
+ int ret;
+
+ if (num_map_entries > lru_hash_lookup_test_entries)
+ lru_hash_lookup_test_entries = num_map_entries;
+
+ /* Populate the lru_hash_map for LRU_HASH_LOOKUP perf test.
+ *
+ * It is fine that the user requests for a map with
+ * num_map_entries < 32 and some of the later lru hash lookup
+ * may return not found. For LRU map, we are not interested
+ * in such small map performance.
+ */
+ for (key = 0; key < lru_hash_lookup_test_entries; key++) {
+ ret = bpf_map_update_elem(fd, &key, &val, BPF_NOEXIST);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static void do_test_lru(enum test_type test, int cpu)
+{
+ static int inner_lru_map_fds[MAX_NR_CPUS];
+
+ struct sockaddr_in6 in6 = { .sin6_family = AF_INET6 };
+ const char *test_name;
+ __u64 start_time;
+ int i, ret;
+
+ if (test == INNER_LRU_HASH_PREALLOC) {
+ int outer_fd = map_fd[array_of_lru_hashs_idx];
+ unsigned int mycpu, mynode;
+
+ assert(cpu < MAX_NR_CPUS);
+
+ if (cpu) {
+ ret = syscall(__NR_getcpu, &mycpu, &mynode, NULL);
+ assert(!ret);
+
+ inner_lru_map_fds[cpu] =
+ bpf_create_map_node(BPF_MAP_TYPE_LRU_HASH,
+ test_map_names[INNER_LRU_HASH_PREALLOC],
+ sizeof(uint32_t),
+ sizeof(long),
+ inner_lru_hash_size, 0,
+ mynode);
+ if (inner_lru_map_fds[cpu] == -1) {
+ printf("cannot create BPF_MAP_TYPE_LRU_HASH %s(%d)\n",
+ strerror(errno), errno);
+ exit(1);
+ }
+ } else {
+ inner_lru_map_fds[cpu] = map_fd[inner_lru_hash_idx];
+ }
+
+ ret = bpf_map_update_elem(outer_fd, &cpu,
+ &inner_lru_map_fds[cpu],
+ BPF_ANY);
+ if (ret) {
+ printf("cannot update ARRAY_OF_LRU_HASHS with key:%u. %s(%d)\n",
+ cpu, strerror(errno), errno);
+ exit(1);
+ }
+ }
+
+ in6.sin6_addr.s6_addr16[0] = 0xdead;
+ in6.sin6_addr.s6_addr16[1] = 0xbeef;
+
+ if (test == LRU_HASH_PREALLOC) {
+ test_name = "lru_hash_map_perf";
+ in6.sin6_addr.s6_addr16[2] = 0;
+ } else if (test == NOCOMMON_LRU_HASH_PREALLOC) {
+ test_name = "nocommon_lru_hash_map_perf";
+ in6.sin6_addr.s6_addr16[2] = 1;
+ } else if (test == INNER_LRU_HASH_PREALLOC) {
+ test_name = "inner_lru_hash_map_perf";
+ in6.sin6_addr.s6_addr16[2] = 2;
+ } else if (test == LRU_HASH_LOOKUP) {
+ test_name = "lru_hash_lookup_perf";
+ in6.sin6_addr.s6_addr16[2] = 3;
+ in6.sin6_addr.s6_addr32[3] = 0;
+ } else {
+ assert(0);
+ }
+
+ start_time = time_get_ns();
+ for (i = 0; i < max_cnt; i++) {
+ ret = connect(-1, (const struct sockaddr *)&in6, sizeof(in6));
+ assert(ret == -1 && errno == EBADF);
+ if (in6.sin6_addr.s6_addr32[3] <
+ lru_hash_lookup_test_entries - 32)
+ in6.sin6_addr.s6_addr32[3] += 32;
+ else
+ in6.sin6_addr.s6_addr32[3] = 0;
+ }
+ printf("%d:%s pre-alloc %lld events per sec\n",
+ cpu, test_name,
+ max_cnt * 1000000000ll / (time_get_ns() - start_time));
+}
+
+static void test_lru_hash_prealloc(int cpu)
+{
+ do_test_lru(LRU_HASH_PREALLOC, cpu);
+}
+
+static void test_nocommon_lru_hash_prealloc(int cpu)
+{
+ do_test_lru(NOCOMMON_LRU_HASH_PREALLOC, cpu);
+}
+
+static void test_inner_lru_hash_prealloc(int cpu)
+{
+ do_test_lru(INNER_LRU_HASH_PREALLOC, cpu);
+}
+
+static void test_lru_hash_lookup(int cpu)
+{
+ do_test_lru(LRU_HASH_LOOKUP, cpu);
+}
+
+static void test_percpu_hash_prealloc(int cpu)
+{
+ __u64 start_time;
+ int i;
+
+ start_time = time_get_ns();
+ for (i = 0; i < max_cnt; i++)
+ syscall(__NR_geteuid);
+ printf("%d:percpu_hash_map_perf pre-alloc %lld events per sec\n",
+ cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time));
+}
+
+static void test_hash_kmalloc(int cpu)
+{
+ __u64 start_time;
+ int i;
+
+ start_time = time_get_ns();
+ for (i = 0; i < max_cnt; i++)
+ syscall(__NR_getgid);
+ printf("%d:hash_map_perf kmalloc %lld events per sec\n",
+ cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time));
+}
+
+static void test_percpu_hash_kmalloc(int cpu)
+{
+ __u64 start_time;
+ int i;
+
+ start_time = time_get_ns();
+ for (i = 0; i < max_cnt; i++)
+ syscall(__NR_getegid);
+ printf("%d:percpu_hash_map_perf kmalloc %lld events per sec\n",
+ cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time));
+}
+
+static void test_lpm_kmalloc(int cpu)
+{
+ __u64 start_time;
+ int i;
+
+ start_time = time_get_ns();
+ for (i = 0; i < max_cnt; i++)
+ syscall(__NR_gettid);
+ printf("%d:lpm_perf kmalloc %lld events per sec\n",
+ cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time));
+}
+
+static void test_hash_lookup(int cpu)
+{
+ __u64 start_time;
+ int i;
+
+ start_time = time_get_ns();
+ for (i = 0; i < max_cnt; i++)
+ syscall(__NR_getpgid, 0);
+ printf("%d:hash_lookup %lld lookups per sec\n",
+ cpu, max_cnt * 1000000000ll * 64 / (time_get_ns() - start_time));
+}
+
+static void test_array_lookup(int cpu)
+{
+ __u64 start_time;
+ int i;
+
+ start_time = time_get_ns();
+ for (i = 0; i < max_cnt; i++)
+ syscall(__NR_getppid, 0);
+ printf("%d:array_lookup %lld lookups per sec\n",
+ cpu, max_cnt * 1000000000ll * 64 / (time_get_ns() - start_time));
+}
+
+typedef int (*pre_test_func)(int tasks);
+const pre_test_func pre_test_funcs[] = {
+ [LRU_HASH_LOOKUP] = pre_test_lru_hash_lookup,
+};
+
+typedef void (*test_func)(int cpu);
+const test_func test_funcs[] = {
+ [HASH_PREALLOC] = test_hash_prealloc,
+ [PERCPU_HASH_PREALLOC] = test_percpu_hash_prealloc,
+ [HASH_KMALLOC] = test_hash_kmalloc,
+ [PERCPU_HASH_KMALLOC] = test_percpu_hash_kmalloc,
+ [LRU_HASH_PREALLOC] = test_lru_hash_prealloc,
+ [NOCOMMON_LRU_HASH_PREALLOC] = test_nocommon_lru_hash_prealloc,
+ [LPM_KMALLOC] = test_lpm_kmalloc,
+ [HASH_LOOKUP] = test_hash_lookup,
+ [ARRAY_LOOKUP] = test_array_lookup,
+ [INNER_LRU_HASH_PREALLOC] = test_inner_lru_hash_prealloc,
+ [LRU_HASH_LOOKUP] = test_lru_hash_lookup,
+};
+
+static int pre_test(int tasks)
+{
+ int i;
+
+ for (i = 0; i < NR_TESTS; i++) {
+ if (pre_test_funcs[i] && check_test_flags(i)) {
+ int ret = pre_test_funcs[i](tasks);
+
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static void loop(int cpu)
+{
+ cpu_set_t cpuset;
+ int i;
+
+ CPU_ZERO(&cpuset);
+ CPU_SET(cpu, &cpuset);
+ sched_setaffinity(0, sizeof(cpuset), &cpuset);
+
+ for (i = 0; i < NR_TESTS; i++) {
+ if (check_test_flags(i))
+ test_funcs[i](cpu);
+ }
+}
+
+static void run_perf_test(int tasks)
+{
+ pid_t pid[tasks];
+ int i;
+
+ assert(!pre_test(tasks));
+
+ for (i = 0; i < tasks; i++) {
+ pid[i] = fork();
+ if (pid[i] == 0) {
+ loop(i);
+ exit(0);
+ } else if (pid[i] == -1) {
+ printf("couldn't spawn #%d process\n", i);
+ exit(1);
+ }
+ }
+ for (i = 0; i < tasks; i++) {
+ int status;
+
+ assert(waitpid(pid[i], &status, 0) == pid[i]);
+ assert(status == 0);
+ }
+}
+
+static void fill_lpm_trie(void)
+{
+ struct bpf_lpm_trie_key *key;
+ unsigned long value = 0;
+ unsigned int i;
+ int r;
+
+ key = alloca(sizeof(*key) + 4);
+ key->prefixlen = 32;
+
+ for (i = 0; i < 512; ++i) {
+ key->prefixlen = rand() % 33;
+ key->data[0] = rand() & 0xff;
+ key->data[1] = rand() & 0xff;
+ key->data[2] = rand() & 0xff;
+ key->data[3] = rand() & 0xff;
+ r = bpf_map_update_elem(map_fd[6], key, &value, 0);
+ assert(!r);
+ }
+
+ key->prefixlen = 32;
+ key->data[0] = 192;
+ key->data[1] = 168;
+ key->data[2] = 0;
+ key->data[3] = 1;
+ value = 128;
+
+ r = bpf_map_update_elem(map_fd[6], key, &value, 0);
+ assert(!r);
+}
+
+static void fixup_map(struct bpf_map_data *map, int idx)
+{
+ int i;
+
+ if (!strcmp("inner_lru_hash_map", map->name)) {
+ inner_lru_hash_idx = idx;
+ inner_lru_hash_size = map->def.max_entries;
+ }
+
+ if (!strcmp("array_of_lru_hashs", map->name)) {
+ if (inner_lru_hash_idx == -1) {
+ printf("inner_lru_hash_map must be defined before array_of_lru_hashs\n");
+ exit(1);
+ }
+ map->def.inner_map_idx = inner_lru_hash_idx;
+ array_of_lru_hashs_idx = idx;
+ }
+
+ if (!strcmp("lru_hash_lookup_map", map->name))
+ lru_hash_lookup_idx = idx;
+
+ if (num_map_entries <= 0)
+ return;
+
+ inner_lru_hash_size = num_map_entries;
+
+ /* Only change the max_entries for the enabled test(s) */
+ for (i = 0; i < NR_TESTS; i++) {
+ if (!strcmp(test_map_names[i], map->name) &&
+ (check_test_flags(i))) {
+ map->def.max_entries = num_map_entries;
+ }
+ }
+}
+
+int main(int argc, char **argv)
+{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ char filename[256];
+ int num_cpu = 8;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ setrlimit(RLIMIT_MEMLOCK, &r);
+
+ if (argc > 1)
+ test_flags = atoi(argv[1]) ? : test_flags;
+
+ if (argc > 2)
+ num_cpu = atoi(argv[2]) ? : num_cpu;
+
+ if (argc > 3)
+ num_map_entries = atoi(argv[3]);
+
+ if (argc > 4)
+ max_cnt = atoi(argv[4]);
+
+ if (load_bpf_file_fixup_map(filename, fixup_map)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ fill_lpm_trie();
+
+ run_perf_test(num_cpu);
+
+ return 0;
+}
diff --git a/samples/bpf/offwaketime_kern.c b/samples/bpf/offwaketime_kern.c
new file mode 100644
index 000000000..e7d9a0a3d
--- /dev/null
+++ b/samples/bpf/offwaketime_kern.c
@@ -0,0 +1,151 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+#include <uapi/linux/ptrace.h>
+#include <uapi/linux/perf_event.h>
+#include <linux/version.h>
+#include <linux/sched.h>
+
+#define _(P) ({typeof(P) val; bpf_probe_read(&val, sizeof(val), &P); val;})
+
+#define MINBLOCK_US 1
+
+struct key_t {
+ char waker[TASK_COMM_LEN];
+ char target[TASK_COMM_LEN];
+ u32 wret;
+ u32 tret;
+};
+
+struct bpf_map_def SEC("maps") counts = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(struct key_t),
+ .value_size = sizeof(u64),
+ .max_entries = 10000,
+};
+
+struct bpf_map_def SEC("maps") start = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u64),
+ .max_entries = 10000,
+};
+
+struct wokeby_t {
+ char name[TASK_COMM_LEN];
+ u32 ret;
+};
+
+struct bpf_map_def SEC("maps") wokeby = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct wokeby_t),
+ .max_entries = 10000,
+};
+
+struct bpf_map_def SEC("maps") stackmap = {
+ .type = BPF_MAP_TYPE_STACK_TRACE,
+ .key_size = sizeof(u32),
+ .value_size = PERF_MAX_STACK_DEPTH * sizeof(u64),
+ .max_entries = 10000,
+};
+
+#define STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP)
+
+SEC("kprobe/try_to_wake_up")
+int waker(struct pt_regs *ctx)
+{
+ struct task_struct *p = (void *) PT_REGS_PARM1(ctx);
+ struct wokeby_t woke;
+ u32 pid;
+
+ pid = _(p->pid);
+
+ bpf_get_current_comm(&woke.name, sizeof(woke.name));
+ woke.ret = bpf_get_stackid(ctx, &stackmap, STACKID_FLAGS);
+
+ bpf_map_update_elem(&wokeby, &pid, &woke, BPF_ANY);
+ return 0;
+}
+
+static inline int update_counts(void *ctx, u32 pid, u64 delta)
+{
+ struct wokeby_t *woke;
+ u64 zero = 0, *val;
+ struct key_t key;
+
+ __builtin_memset(&key.waker, 0, sizeof(key.waker));
+ bpf_get_current_comm(&key.target, sizeof(key.target));
+ key.tret = bpf_get_stackid(ctx, &stackmap, STACKID_FLAGS);
+ key.wret = 0;
+
+ woke = bpf_map_lookup_elem(&wokeby, &pid);
+ if (woke) {
+ key.wret = woke->ret;
+ __builtin_memcpy(&key.waker, woke->name, sizeof(key.waker));
+ bpf_map_delete_elem(&wokeby, &pid);
+ }
+
+ val = bpf_map_lookup_elem(&counts, &key);
+ if (!val) {
+ bpf_map_update_elem(&counts, &key, &zero, BPF_NOEXIST);
+ val = bpf_map_lookup_elem(&counts, &key);
+ if (!val)
+ return 0;
+ }
+ (*val) += delta;
+ return 0;
+}
+
+#if 1
+/* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */
+struct sched_switch_args {
+ unsigned long long pad;
+ char prev_comm[16];
+ int prev_pid;
+ int prev_prio;
+ long long prev_state;
+ char next_comm[16];
+ int next_pid;
+ int next_prio;
+};
+SEC("tracepoint/sched/sched_switch")
+int oncpu(struct sched_switch_args *ctx)
+{
+ /* record previous thread sleep time */
+ u32 pid = ctx->prev_pid;
+#else
+SEC("kprobe/finish_task_switch")
+int oncpu(struct pt_regs *ctx)
+{
+ struct task_struct *p = (void *) PT_REGS_PARM1(ctx);
+ /* record previous thread sleep time */
+ u32 pid = _(p->pid);
+#endif
+ u64 delta, ts, *tsp;
+
+ ts = bpf_ktime_get_ns();
+ bpf_map_update_elem(&start, &pid, &ts, BPF_ANY);
+
+ /* calculate current thread's delta time */
+ pid = bpf_get_current_pid_tgid();
+ tsp = bpf_map_lookup_elem(&start, &pid);
+ if (!tsp)
+ /* missed start or filtered */
+ return 0;
+
+ delta = bpf_ktime_get_ns() - *tsp;
+ bpf_map_delete_elem(&start, &pid);
+ delta = delta / 1000;
+ if (delta < MINBLOCK_US)
+ return 0;
+
+ return update_counts(ctx, pid, delta);
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/offwaketime_user.c b/samples/bpf/offwaketime_user.c
new file mode 100644
index 000000000..f06063af9
--- /dev/null
+++ b/samples/bpf/offwaketime_user.c
@@ -0,0 +1,122 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/bpf.h>
+#include <string.h>
+#include <linux/perf_event.h>
+#include <errno.h>
+#include <assert.h>
+#include <stdbool.h>
+#include <sys/resource.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+#include "trace_helpers.h"
+
+#define PRINT_RAW_ADDR 0
+
+static void print_ksym(__u64 addr)
+{
+ struct ksym *sym;
+
+ if (!addr)
+ return;
+ sym = ksym_search(addr);
+ if (PRINT_RAW_ADDR)
+ printf("%s/%llx;", sym->name, addr);
+ else
+ printf("%s;", sym->name);
+}
+
+#define TASK_COMM_LEN 16
+
+struct key_t {
+ char waker[TASK_COMM_LEN];
+ char target[TASK_COMM_LEN];
+ __u32 wret;
+ __u32 tret;
+};
+
+static void print_stack(struct key_t *key, __u64 count)
+{
+ __u64 ip[PERF_MAX_STACK_DEPTH] = {};
+ static bool warned;
+ int i;
+
+ printf("%s;", key->target);
+ if (bpf_map_lookup_elem(map_fd[3], &key->tret, ip) != 0) {
+ printf("---;");
+ } else {
+ for (i = PERF_MAX_STACK_DEPTH - 1; i >= 0; i--)
+ print_ksym(ip[i]);
+ }
+ printf("-;");
+ if (bpf_map_lookup_elem(map_fd[3], &key->wret, ip) != 0) {
+ printf("---;");
+ } else {
+ for (i = 0; i < PERF_MAX_STACK_DEPTH; i++)
+ print_ksym(ip[i]);
+ }
+ printf(";%s %lld\n", key->waker, count);
+
+ if ((key->tret == -EEXIST || key->wret == -EEXIST) && !warned) {
+ printf("stackmap collisions seen. Consider increasing size\n");
+ warned = true;
+ } else if (((int)(key->tret) < 0 || (int)(key->wret) < 0)) {
+ printf("err stackid %d %d\n", key->tret, key->wret);
+ }
+}
+
+static void print_stacks(int fd)
+{
+ struct key_t key = {}, next_key;
+ __u64 value;
+
+ while (bpf_map_get_next_key(fd, &key, &next_key) == 0) {
+ bpf_map_lookup_elem(fd, &next_key, &value);
+ print_stack(&next_key, value);
+ key = next_key;
+ }
+}
+
+static void int_exit(int sig)
+{
+ print_stacks(map_fd[0]);
+ exit(0);
+}
+
+int main(int argc, char **argv)
+{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ char filename[256];
+ int delay = 1;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ setrlimit(RLIMIT_MEMLOCK, &r);
+
+ signal(SIGINT, int_exit);
+ signal(SIGTERM, int_exit);
+
+ if (load_kallsyms()) {
+ printf("failed to process /proc/kallsyms\n");
+ return 2;
+ }
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ if (argc > 1)
+ delay = atoi(argv[1]);
+ sleep(delay);
+ print_stacks(map_fd[0]);
+
+ return 0;
+}
diff --git a/samples/bpf/parse_ldabs.c b/samples/bpf/parse_ldabs.c
new file mode 100644
index 000000000..6db6b21fd
--- /dev/null
+++ b/samples/bpf/parse_ldabs.c
@@ -0,0 +1,42 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#define KBUILD_MODNAME "foo"
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+#define DEFAULT_PKTGEN_UDP_PORT 9
+#define IP_MF 0x2000
+#define IP_OFFSET 0x1FFF
+
+static inline int ip_is_fragment(struct __sk_buff *ctx, __u64 nhoff)
+{
+ return load_half(ctx, nhoff + offsetof(struct iphdr, frag_off))
+ & (IP_MF | IP_OFFSET);
+}
+
+SEC("ldabs")
+int handle_ingress(struct __sk_buff *skb)
+{
+ __u64 troff = ETH_HLEN + sizeof(struct iphdr);
+
+ if (load_half(skb, offsetof(struct ethhdr, h_proto)) != ETH_P_IP)
+ return 0;
+ if (load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)) != IPPROTO_UDP ||
+ load_byte(skb, ETH_HLEN) != 0x45)
+ return 0;
+ if (ip_is_fragment(skb, ETH_HLEN))
+ return 0;
+ if (load_half(skb, troff + offsetof(struct udphdr, dest)) == DEFAULT_PKTGEN_UDP_PORT)
+ return TC_ACT_SHOT;
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/parse_simple.c b/samples/bpf/parse_simple.c
new file mode 100644
index 000000000..10af53d33
--- /dev/null
+++ b/samples/bpf/parse_simple.c
@@ -0,0 +1,49 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#define KBUILD_MODNAME "foo"
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <uapi/linux/bpf.h>
+#include <net/ip.h>
+#include "bpf_helpers.h"
+
+#define DEFAULT_PKTGEN_UDP_PORT 9
+
+/* copy of 'struct ethhdr' without __packed */
+struct eth_hdr {
+ unsigned char h_dest[ETH_ALEN];
+ unsigned char h_source[ETH_ALEN];
+ unsigned short h_proto;
+};
+
+SEC("simple")
+int handle_ingress(struct __sk_buff *skb)
+{
+ void *data = (void *)(long)skb->data;
+ struct eth_hdr *eth = data;
+ struct iphdr *iph = data + sizeof(*eth);
+ struct udphdr *udp = data + sizeof(*eth) + sizeof(*iph);
+ void *data_end = (void *)(long)skb->data_end;
+
+ /* single length check */
+ if (data + sizeof(*eth) + sizeof(*iph) + sizeof(*udp) > data_end)
+ return 0;
+
+ if (eth->h_proto != htons(ETH_P_IP))
+ return 0;
+ if (iph->protocol != IPPROTO_UDP || iph->ihl != 5)
+ return 0;
+ if (ip_is_fragment(iph))
+ return 0;
+ if (udp->dest == htons(DEFAULT_PKTGEN_UDP_PORT))
+ return TC_ACT_SHOT;
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/parse_varlen.c b/samples/bpf/parse_varlen.c
new file mode 100644
index 000000000..0b6f22feb
--- /dev/null
+++ b/samples/bpf/parse_varlen.c
@@ -0,0 +1,150 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#define KBUILD_MODNAME "foo"
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <uapi/linux/bpf.h>
+#include <net/ip.h>
+#include "bpf_helpers.h"
+
+#define DEFAULT_PKTGEN_UDP_PORT 9
+#define DEBUG 0
+
+static int tcp(void *data, uint64_t tp_off, void *data_end)
+{
+ struct tcphdr *tcp = data + tp_off;
+
+ if (tcp + 1 > data_end)
+ return 0;
+ if (tcp->dest == htons(80) || tcp->source == htons(80))
+ return TC_ACT_SHOT;
+ return 0;
+}
+
+static int udp(void *data, uint64_t tp_off, void *data_end)
+{
+ struct udphdr *udp = data + tp_off;
+
+ if (udp + 1 > data_end)
+ return 0;
+ if (udp->dest == htons(DEFAULT_PKTGEN_UDP_PORT) ||
+ udp->source == htons(DEFAULT_PKTGEN_UDP_PORT)) {
+ if (DEBUG) {
+ char fmt[] = "udp port 9 indeed\n";
+
+ bpf_trace_printk(fmt, sizeof(fmt));
+ }
+ return TC_ACT_SHOT;
+ }
+ return 0;
+}
+
+static int parse_ipv4(void *data, uint64_t nh_off, void *data_end)
+{
+ struct iphdr *iph;
+ uint64_t ihl_len;
+
+ iph = data + nh_off;
+ if (iph + 1 > data_end)
+ return 0;
+
+ if (ip_is_fragment(iph))
+ return 0;
+ ihl_len = iph->ihl * 4;
+
+ if (iph->protocol == IPPROTO_IPIP) {
+ iph = data + nh_off + ihl_len;
+ if (iph + 1 > data_end)
+ return 0;
+ ihl_len += iph->ihl * 4;
+ }
+
+ if (iph->protocol == IPPROTO_TCP)
+ return tcp(data, nh_off + ihl_len, data_end);
+ else if (iph->protocol == IPPROTO_UDP)
+ return udp(data, nh_off + ihl_len, data_end);
+ return 0;
+}
+
+static int parse_ipv6(void *data, uint64_t nh_off, void *data_end)
+{
+ struct ipv6hdr *ip6h;
+ struct iphdr *iph;
+ uint64_t ihl_len = sizeof(struct ipv6hdr);
+ uint64_t nexthdr;
+
+ ip6h = data + nh_off;
+ if (ip6h + 1 > data_end)
+ return 0;
+
+ nexthdr = ip6h->nexthdr;
+
+ if (nexthdr == IPPROTO_IPIP) {
+ iph = data + nh_off + ihl_len;
+ if (iph + 1 > data_end)
+ return 0;
+ ihl_len += iph->ihl * 4;
+ nexthdr = iph->protocol;
+ } else if (nexthdr == IPPROTO_IPV6) {
+ ip6h = data + nh_off + ihl_len;
+ if (ip6h + 1 > data_end)
+ return 0;
+ ihl_len += sizeof(struct ipv6hdr);
+ nexthdr = ip6h->nexthdr;
+ }
+
+ if (nexthdr == IPPROTO_TCP)
+ return tcp(data, nh_off + ihl_len, data_end);
+ else if (nexthdr == IPPROTO_UDP)
+ return udp(data, nh_off + ihl_len, data_end);
+ return 0;
+}
+
+SEC("varlen")
+int handle_ingress(struct __sk_buff *skb)
+{
+ void *data = (void *)(long)skb->data;
+ struct ethhdr *eth = data;
+ void *data_end = (void *)(long)skb->data_end;
+ uint64_t h_proto, nh_off;
+
+ nh_off = sizeof(*eth);
+ if (data + nh_off > data_end)
+ return 0;
+
+ h_proto = eth->h_proto;
+
+ if (h_proto == ETH_P_8021Q || h_proto == ETH_P_8021AD) {
+ struct vlan_hdr *vhdr;
+
+ vhdr = data + nh_off;
+ nh_off += sizeof(struct vlan_hdr);
+ if (data + nh_off > data_end)
+ return 0;
+ h_proto = vhdr->h_vlan_encapsulated_proto;
+ }
+ if (h_proto == ETH_P_8021Q || h_proto == ETH_P_8021AD) {
+ struct vlan_hdr *vhdr;
+
+ vhdr = data + nh_off;
+ nh_off += sizeof(struct vlan_hdr);
+ if (data + nh_off > data_end)
+ return 0;
+ h_proto = vhdr->h_vlan_encapsulated_proto;
+ }
+ if (h_proto == htons(ETH_P_IP))
+ return parse_ipv4(data, nh_off, data_end);
+ else if (h_proto == htons(ETH_P_IPV6))
+ return parse_ipv6(data, nh_off, data_end);
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/run_cookie_uid_helper_example.sh b/samples/bpf/run_cookie_uid_helper_example.sh
new file mode 100755
index 000000000..fc6bc0451
--- /dev/null
+++ b/samples/bpf/run_cookie_uid_helper_example.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+local_dir="$(pwd)"
+root_dir=$local_dir/../..
+mnt_dir=$(mktemp -d --tmp)
+
+on_exit() {
+ iptables -D OUTPUT -m bpf --object-pinned ${mnt_dir}/bpf_prog -j ACCEPT
+ umount ${mnt_dir}
+ rm -r ${mnt_dir}
+}
+
+trap on_exit EXIT
+mount -t bpf bpf ${mnt_dir}
+./per_socket_stats_example ${mnt_dir}/bpf_prog $1
diff --git a/samples/bpf/sampleip_kern.c b/samples/bpf/sampleip_kern.c
new file mode 100644
index 000000000..ceabf3107
--- /dev/null
+++ b/samples/bpf/sampleip_kern.c
@@ -0,0 +1,38 @@
+/* Copyright 2016 Netflix, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/version.h>
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/bpf_perf_event.h>
+#include "bpf_helpers.h"
+
+#define MAX_IPS 8192
+
+struct bpf_map_def SEC("maps") ip_map = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(u64),
+ .value_size = sizeof(u32),
+ .max_entries = MAX_IPS,
+};
+
+SEC("perf_event")
+int do_sample(struct bpf_perf_event_data *ctx)
+{
+ u64 ip;
+ u32 *value, init_val = 1;
+
+ ip = PT_REGS_IP(&ctx->regs);
+ value = bpf_map_lookup_elem(&ip_map, &ip);
+ if (value)
+ *value += 1;
+ else
+ /* E2BIG not tested for this example only */
+ bpf_map_update_elem(&ip_map, &ip, &init_val, BPF_NOEXIST);
+
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/sampleip_user.c b/samples/bpf/sampleip_user.c
new file mode 100644
index 000000000..60c2b73d1
--- /dev/null
+++ b/samples/bpf/sampleip_user.c
@@ -0,0 +1,199 @@
+/*
+ * sampleip: sample instruction pointer and frequency count in a BPF map.
+ *
+ * Copyright 2016 Netflix, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+#include <signal.h>
+#include <string.h>
+#include <assert.h>
+#include <linux/perf_event.h>
+#include <linux/ptrace.h>
+#include <linux/bpf.h>
+#include <sys/ioctl.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+#include "perf-sys.h"
+#include "trace_helpers.h"
+
+#define DEFAULT_FREQ 99
+#define DEFAULT_SECS 5
+#define MAX_IPS 8192
+#define PAGE_OFFSET 0xffff880000000000
+
+static int nr_cpus;
+
+static void usage(void)
+{
+ printf("USAGE: sampleip [-F freq] [duration]\n");
+ printf(" -F freq # sample frequency (Hertz), default 99\n");
+ printf(" duration # sampling duration (seconds), default 5\n");
+}
+
+static int sampling_start(int *pmu_fd, int freq)
+{
+ int i;
+
+ struct perf_event_attr pe_sample_attr = {
+ .type = PERF_TYPE_SOFTWARE,
+ .freq = 1,
+ .sample_period = freq,
+ .config = PERF_COUNT_SW_CPU_CLOCK,
+ .inherit = 1,
+ };
+
+ for (i = 0; i < nr_cpus; i++) {
+ pmu_fd[i] = sys_perf_event_open(&pe_sample_attr, -1 /* pid */, i,
+ -1 /* group_fd */, 0 /* flags */);
+ if (pmu_fd[i] < 0) {
+ fprintf(stderr, "ERROR: Initializing perf sampling\n");
+ return 1;
+ }
+ assert(ioctl(pmu_fd[i], PERF_EVENT_IOC_SET_BPF,
+ prog_fd[0]) == 0);
+ assert(ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0) == 0);
+ }
+
+ return 0;
+}
+
+static void sampling_end(int *pmu_fd)
+{
+ int i;
+
+ for (i = 0; i < nr_cpus; i++)
+ close(pmu_fd[i]);
+}
+
+struct ipcount {
+ __u64 ip;
+ __u32 count;
+};
+
+/* used for sorting */
+struct ipcount counts[MAX_IPS];
+
+static int count_cmp(const void *p1, const void *p2)
+{
+ return ((struct ipcount *)p1)->count - ((struct ipcount *)p2)->count;
+}
+
+static void print_ip_map(int fd)
+{
+ struct ksym *sym;
+ __u64 key, next_key;
+ __u32 value;
+ int i, max;
+
+ printf("%-19s %-32s %s\n", "ADDR", "KSYM", "COUNT");
+
+ /* fetch IPs and counts */
+ key = 0, i = 0;
+ while (bpf_map_get_next_key(fd, &key, &next_key) == 0) {
+ bpf_map_lookup_elem(fd, &next_key, &value);
+ counts[i].ip = next_key;
+ counts[i++].count = value;
+ key = next_key;
+ }
+ max = i;
+
+ /* sort and print */
+ qsort(counts, max, sizeof(struct ipcount), count_cmp);
+ for (i = 0; i < max; i++) {
+ if (counts[i].ip > PAGE_OFFSET) {
+ sym = ksym_search(counts[i].ip);
+ printf("0x%-17llx %-32s %u\n", counts[i].ip, sym->name,
+ counts[i].count);
+ } else {
+ printf("0x%-17llx %-32s %u\n", counts[i].ip, "(user)",
+ counts[i].count);
+ }
+ }
+
+ if (max == MAX_IPS) {
+ printf("WARNING: IP hash was full (max %d entries); ", max);
+ printf("may have dropped samples\n");
+ }
+}
+
+static void int_exit(int sig)
+{
+ printf("\n");
+ print_ip_map(map_fd[0]);
+ exit(0);
+}
+
+int main(int argc, char **argv)
+{
+ char filename[256];
+ int *pmu_fd, opt, freq = DEFAULT_FREQ, secs = DEFAULT_SECS;
+
+ /* process arguments */
+ while ((opt = getopt(argc, argv, "F:h")) != -1) {
+ switch (opt) {
+ case 'F':
+ freq = atoi(optarg);
+ break;
+ case 'h':
+ default:
+ usage();
+ return 0;
+ }
+ }
+ if (argc - optind == 1)
+ secs = atoi(argv[optind]);
+ if (freq == 0 || secs == 0) {
+ usage();
+ return 1;
+ }
+
+ /* initialize kernel symbol translation */
+ if (load_kallsyms()) {
+ fprintf(stderr, "ERROR: loading /proc/kallsyms\n");
+ return 2;
+ }
+
+ /* create perf FDs for each CPU */
+ nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+ pmu_fd = malloc(nr_cpus * sizeof(int));
+ if (pmu_fd == NULL) {
+ fprintf(stderr, "ERROR: malloc of pmu_fd\n");
+ return 1;
+ }
+
+ /* load BPF program */
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ if (load_bpf_file(filename)) {
+ fprintf(stderr, "ERROR: loading BPF program (errno %d):\n",
+ errno);
+ if (strcmp(bpf_log_buf, "") == 0)
+ fprintf(stderr, "Try: ulimit -l unlimited\n");
+ else
+ fprintf(stderr, "%s", bpf_log_buf);
+ return 1;
+ }
+ signal(SIGINT, int_exit);
+ signal(SIGTERM, int_exit);
+
+ /* do sampling */
+ printf("Sampling at %d Hertz for %d seconds. Ctrl-C also ends.\n",
+ freq, secs);
+ if (sampling_start(pmu_fd, freq) != 0)
+ return 1;
+ sleep(secs);
+ sampling_end(pmu_fd);
+ free(pmu_fd);
+
+ /* output sample counts */
+ print_ip_map(map_fd[0]);
+
+ return 0;
+}
diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c
new file mode 100644
index 000000000..60ec467c7
--- /dev/null
+++ b/samples/bpf/sock_example.c
@@ -0,0 +1,106 @@
+/* eBPF example program:
+ * - creates arraymap in kernel with key 4 bytes and value 8 bytes
+ *
+ * - loads eBPF program:
+ * r0 = skb->data[ETH_HLEN + offsetof(struct iphdr, protocol)];
+ * *(u32*)(fp - 4) = r0;
+ * // assuming packet is IPv4, lookup ip->proto in a map
+ * value = bpf_map_lookup_elem(map_fd, fp - 4);
+ * if (value)
+ * (*(u64*)value) += 1;
+ *
+ * - attaches this program to loopback interface "lo" raw socket
+ *
+ * - every second user space reads map[tcp], map[udp], map[icmp] to see
+ * how many packets of given protocol were seen on "lo"
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <assert.h>
+#include <linux/bpf.h>
+#include <string.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <stddef.h>
+#include <bpf/bpf.h>
+#include "bpf_insn.h"
+#include "sock_example.h"
+
+char bpf_log_buf[BPF_LOG_BUF_SIZE];
+
+static int test_sock(void)
+{
+ int sock = -1, map_fd, prog_fd, i, key;
+ long long value = 0, tcp_cnt, udp_cnt, icmp_cnt;
+
+ map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value),
+ 256, 0);
+ if (map_fd < 0) {
+ printf("failed to create map '%s'\n", strerror(errno));
+ goto cleanup;
+ }
+
+ struct bpf_insn prog[] = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_LD_ABS(BPF_B, ETH_HLEN + offsetof(struct iphdr, protocol) /* R0 = ip->proto */),
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
+ BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
+ BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ };
+ size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);
+
+ prog_fd = bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, prog, insns_cnt,
+ "GPL", 0, bpf_log_buf, BPF_LOG_BUF_SIZE);
+ if (prog_fd < 0) {
+ printf("failed to load prog '%s'\n", strerror(errno));
+ goto cleanup;
+ }
+
+ sock = open_raw_sock("lo");
+
+ if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd,
+ sizeof(prog_fd)) < 0) {
+ printf("setsockopt %s\n", strerror(errno));
+ goto cleanup;
+ }
+
+ for (i = 0; i < 10; i++) {
+ key = IPPROTO_TCP;
+ assert(bpf_map_lookup_elem(map_fd, &key, &tcp_cnt) == 0);
+
+ key = IPPROTO_UDP;
+ assert(bpf_map_lookup_elem(map_fd, &key, &udp_cnt) == 0);
+
+ key = IPPROTO_ICMP;
+ assert(bpf_map_lookup_elem(map_fd, &key, &icmp_cnt) == 0);
+
+ printf("TCP %lld UDP %lld ICMP %lld packets\n",
+ tcp_cnt, udp_cnt, icmp_cnt);
+ sleep(1);
+ }
+
+cleanup:
+ /* maps, programs, raw sockets will auto cleanup on process exit */
+ return 0;
+}
+
+int main(void)
+{
+ FILE *f;
+
+ f = popen("ping -c5 localhost", "r");
+ (void)f;
+
+ return test_sock();
+}
diff --git a/samples/bpf/sock_example.h b/samples/bpf/sock_example.h
new file mode 100644
index 000000000..a27d7579b
--- /dev/null
+++ b/samples/bpf/sock_example.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <stdlib.h>
+#include <stdio.h>
+#include <linux/unistd.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <linux/if_ether.h>
+#include <net/if.h>
+#include <linux/if_packet.h>
+#include <arpa/inet.h>
+
+static inline int open_raw_sock(const char *name)
+{
+ struct sockaddr_ll sll;
+ int sock;
+
+ sock = socket(PF_PACKET, SOCK_RAW | SOCK_NONBLOCK | SOCK_CLOEXEC, htons(ETH_P_ALL));
+ if (sock < 0) {
+ printf("cannot create raw socket\n");
+ return -1;
+ }
+
+ memset(&sll, 0, sizeof(sll));
+ sll.sll_family = AF_PACKET;
+ sll.sll_ifindex = if_nametoindex(name);
+ sll.sll_protocol = htons(ETH_P_ALL);
+ if (bind(sock, (struct sockaddr *)&sll, sizeof(sll)) < 0) {
+ printf("bind to %s: %s\n", name, strerror(errno));
+ close(sock);
+ return -1;
+ }
+
+ return sock;
+}
diff --git a/samples/bpf/sock_flags_kern.c b/samples/bpf/sock_flags_kern.c
new file mode 100644
index 000000000..05dcdf8a4
--- /dev/null
+++ b/samples/bpf/sock_flags_kern.c
@@ -0,0 +1,49 @@
+#include <uapi/linux/bpf.h>
+#include <linux/socket.h>
+#include <linux/net.h>
+#include <uapi/linux/in.h>
+#include <uapi/linux/in6.h>
+#include "bpf_helpers.h"
+
+SEC("cgroup/sock1")
+int bpf_prog1(struct bpf_sock *sk)
+{
+ char fmt[] = "socket: family %d type %d protocol %d\n";
+ char fmt2[] = "socket: uid %u gid %u\n";
+ __u64 gid_uid = bpf_get_current_uid_gid();
+ __u32 uid = gid_uid & 0xffffffff;
+ __u32 gid = gid_uid >> 32;
+
+ bpf_trace_printk(fmt, sizeof(fmt), sk->family, sk->type, sk->protocol);
+ bpf_trace_printk(fmt2, sizeof(fmt2), uid, gid);
+
+ /* block PF_INET6, SOCK_RAW, IPPROTO_ICMPV6 sockets
+ * ie., make ping6 fail
+ */
+ if (sk->family == PF_INET6 &&
+ sk->type == SOCK_RAW &&
+ sk->protocol == IPPROTO_ICMPV6)
+ return 0;
+
+ return 1;
+}
+
+SEC("cgroup/sock2")
+int bpf_prog2(struct bpf_sock *sk)
+{
+ char fmt[] = "socket: family %d type %d protocol %d\n";
+
+ bpf_trace_printk(fmt, sizeof(fmt), sk->family, sk->type, sk->protocol);
+
+ /* block PF_INET, SOCK_RAW, IPPROTO_ICMP sockets
+ * ie., make ping fail
+ */
+ if (sk->family == PF_INET &&
+ sk->type == SOCK_RAW &&
+ sk->protocol == IPPROTO_ICMP)
+ return 0;
+
+ return 1;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/sockex1_kern.c b/samples/bpf/sockex1_kern.c
new file mode 100644
index 000000000..ed18e9a49
--- /dev/null
+++ b/samples/bpf/sockex1_kern.c
@@ -0,0 +1,29 @@
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(long),
+ .max_entries = 256,
+};
+
+SEC("socket1")
+int bpf_prog1(struct __sk_buff *skb)
+{
+ int index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));
+ long *value;
+
+ if (skb->pkt_type != PACKET_OUTGOING)
+ return 0;
+
+ value = bpf_map_lookup_elem(&my_map, &index);
+ if (value)
+ __sync_fetch_and_add(value, skb->len);
+
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/sockex1_user.c b/samples/bpf/sockex1_user.c
new file mode 100644
index 000000000..93ec01c56
--- /dev/null
+++ b/samples/bpf/sockex1_user.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <assert.h>
+#include <linux/bpf.h>
+#include <bpf/bpf.h>
+#include "bpf_load.h"
+#include "sock_example.h"
+#include <unistd.h>
+#include <arpa/inet.h>
+
+int main(int ac, char **argv)
+{
+ char filename[256];
+ FILE *f;
+ int i, sock;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ sock = open_raw_sock("lo");
+
+ assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd,
+ sizeof(prog_fd[0])) == 0);
+
+ f = popen("ping -c5 localhost", "r");
+ (void) f;
+
+ for (i = 0; i < 5; i++) {
+ long long tcp_cnt, udp_cnt, icmp_cnt;
+ int key;
+
+ key = IPPROTO_TCP;
+ assert(bpf_map_lookup_elem(map_fd[0], &key, &tcp_cnt) == 0);
+
+ key = IPPROTO_UDP;
+ assert(bpf_map_lookup_elem(map_fd[0], &key, &udp_cnt) == 0);
+
+ key = IPPROTO_ICMP;
+ assert(bpf_map_lookup_elem(map_fd[0], &key, &icmp_cnt) == 0);
+
+ printf("TCP %lld UDP %lld ICMP %lld bytes\n",
+ tcp_cnt, udp_cnt, icmp_cnt);
+ sleep(1);
+ }
+
+ return 0;
+}
diff --git a/samples/bpf/sockex2_kern.c b/samples/bpf/sockex2_kern.c
new file mode 100644
index 000000000..f2f9dbc02
--- /dev/null
+++ b/samples/bpf/sockex2_kern.c
@@ -0,0 +1,222 @@
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+#include <uapi/linux/in.h>
+#include <uapi/linux/if.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/ip.h>
+#include <uapi/linux/ipv6.h>
+#include <uapi/linux/if_tunnel.h>
+#define IP_MF 0x2000
+#define IP_OFFSET 0x1FFF
+
+struct vlan_hdr {
+ __be16 h_vlan_TCI;
+ __be16 h_vlan_encapsulated_proto;
+};
+
+struct flow_key_record {
+ __be32 src;
+ __be32 dst;
+ union {
+ __be32 ports;
+ __be16 port16[2];
+ };
+ __u16 thoff;
+ __u8 ip_proto;
+};
+
+static inline int proto_ports_offset(__u64 proto)
+{
+ switch (proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_DCCP:
+ case IPPROTO_ESP:
+ case IPPROTO_SCTP:
+ case IPPROTO_UDPLITE:
+ return 0;
+ case IPPROTO_AH:
+ return 4;
+ default:
+ return 0;
+ }
+}
+
+static inline int ip_is_fragment(struct __sk_buff *ctx, __u64 nhoff)
+{
+ return load_half(ctx, nhoff + offsetof(struct iphdr, frag_off))
+ & (IP_MF | IP_OFFSET);
+}
+
+static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off)
+{
+ __u64 w0 = load_word(ctx, off);
+ __u64 w1 = load_word(ctx, off + 4);
+ __u64 w2 = load_word(ctx, off + 8);
+ __u64 w3 = load_word(ctx, off + 12);
+
+ return (__u32)(w0 ^ w1 ^ w2 ^ w3);
+}
+
+static inline __u64 parse_ip(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto,
+ struct flow_key_record *flow)
+{
+ __u64 verlen;
+
+ if (unlikely(ip_is_fragment(skb, nhoff)))
+ *ip_proto = 0;
+ else
+ *ip_proto = load_byte(skb, nhoff + offsetof(struct iphdr, protocol));
+
+ if (*ip_proto != IPPROTO_GRE) {
+ flow->src = load_word(skb, nhoff + offsetof(struct iphdr, saddr));
+ flow->dst = load_word(skb, nhoff + offsetof(struct iphdr, daddr));
+ }
+
+ verlen = load_byte(skb, nhoff + 0/*offsetof(struct iphdr, ihl)*/);
+ if (likely(verlen == 0x45))
+ nhoff += 20;
+ else
+ nhoff += (verlen & 0xF) << 2;
+
+ return nhoff;
+}
+
+static inline __u64 parse_ipv6(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto,
+ struct flow_key_record *flow)
+{
+ *ip_proto = load_byte(skb,
+ nhoff + offsetof(struct ipv6hdr, nexthdr));
+ flow->src = ipv6_addr_hash(skb,
+ nhoff + offsetof(struct ipv6hdr, saddr));
+ flow->dst = ipv6_addr_hash(skb,
+ nhoff + offsetof(struct ipv6hdr, daddr));
+ nhoff += sizeof(struct ipv6hdr);
+
+ return nhoff;
+}
+
+static inline bool flow_dissector(struct __sk_buff *skb,
+ struct flow_key_record *flow)
+{
+ __u64 nhoff = ETH_HLEN;
+ __u64 ip_proto;
+ __u64 proto = load_half(skb, 12);
+ int poff;
+
+ if (proto == ETH_P_8021AD) {
+ proto = load_half(skb, nhoff + offsetof(struct vlan_hdr,
+ h_vlan_encapsulated_proto));
+ nhoff += sizeof(struct vlan_hdr);
+ }
+
+ if (proto == ETH_P_8021Q) {
+ proto = load_half(skb, nhoff + offsetof(struct vlan_hdr,
+ h_vlan_encapsulated_proto));
+ nhoff += sizeof(struct vlan_hdr);
+ }
+
+ if (likely(proto == ETH_P_IP))
+ nhoff = parse_ip(skb, nhoff, &ip_proto, flow);
+ else if (proto == ETH_P_IPV6)
+ nhoff = parse_ipv6(skb, nhoff, &ip_proto, flow);
+ else
+ return false;
+
+ switch (ip_proto) {
+ case IPPROTO_GRE: {
+ struct gre_hdr {
+ __be16 flags;
+ __be16 proto;
+ };
+
+ __u64 gre_flags = load_half(skb,
+ nhoff + offsetof(struct gre_hdr, flags));
+ __u64 gre_proto = load_half(skb,
+ nhoff + offsetof(struct gre_hdr, proto));
+
+ if (gre_flags & (GRE_VERSION|GRE_ROUTING))
+ break;
+
+ proto = gre_proto;
+ nhoff += 4;
+ if (gre_flags & GRE_CSUM)
+ nhoff += 4;
+ if (gre_flags & GRE_KEY)
+ nhoff += 4;
+ if (gre_flags & GRE_SEQ)
+ nhoff += 4;
+
+ if (proto == ETH_P_8021Q) {
+ proto = load_half(skb,
+ nhoff + offsetof(struct vlan_hdr,
+ h_vlan_encapsulated_proto));
+ nhoff += sizeof(struct vlan_hdr);
+ }
+
+ if (proto == ETH_P_IP)
+ nhoff = parse_ip(skb, nhoff, &ip_proto, flow);
+ else if (proto == ETH_P_IPV6)
+ nhoff = parse_ipv6(skb, nhoff, &ip_proto, flow);
+ else
+ return false;
+ break;
+ }
+ case IPPROTO_IPIP:
+ nhoff = parse_ip(skb, nhoff, &ip_proto, flow);
+ break;
+ case IPPROTO_IPV6:
+ nhoff = parse_ipv6(skb, nhoff, &ip_proto, flow);
+ break;
+ default:
+ break;
+ }
+
+ flow->ip_proto = ip_proto;
+ poff = proto_ports_offset(ip_proto);
+ if (poff >= 0) {
+ nhoff += poff;
+ flow->ports = load_word(skb, nhoff);
+ }
+
+ flow->thoff = (__u16) nhoff;
+
+ return true;
+}
+
+struct pair {
+ long packets;
+ long bytes;
+};
+
+struct bpf_map_def SEC("maps") hash_map = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(__be32),
+ .value_size = sizeof(struct pair),
+ .max_entries = 1024,
+};
+
+SEC("socket2")
+int bpf_prog2(struct __sk_buff *skb)
+{
+ struct flow_key_record flow = {};
+ struct pair *value;
+ u32 key;
+
+ if (!flow_dissector(skb, &flow))
+ return 0;
+
+ key = flow.dst;
+ value = bpf_map_lookup_elem(&hash_map, &key);
+ if (value) {
+ __sync_fetch_and_add(&value->packets, 1);
+ __sync_fetch_and_add(&value->bytes, skb->len);
+ } else {
+ struct pair val = {1, skb->len};
+
+ bpf_map_update_elem(&hash_map, &key, &val, BPF_ANY);
+ }
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/sockex2_user.c b/samples/bpf/sockex2_user.c
new file mode 100644
index 000000000..1d5c6e9a6
--- /dev/null
+++ b/samples/bpf/sockex2_user.c
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <assert.h>
+#include <linux/bpf.h>
+#include <bpf/bpf.h>
+#include "bpf_load.h"
+#include "sock_example.h"
+#include <unistd.h>
+#include <arpa/inet.h>
+#include <sys/resource.h>
+
+struct pair {
+ __u64 packets;
+ __u64 bytes;
+};
+
+int main(int ac, char **argv)
+{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ char filename[256];
+ FILE *f;
+ int i, sock;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ setrlimit(RLIMIT_MEMLOCK, &r);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ sock = open_raw_sock("lo");
+
+ assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd,
+ sizeof(prog_fd[0])) == 0);
+
+ f = popen("ping -c5 localhost", "r");
+ (void) f;
+
+ for (i = 0; i < 5; i++) {
+ int key = 0, next_key;
+ struct pair value;
+
+ while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0) {
+ bpf_map_lookup_elem(map_fd[0], &next_key, &value);
+ printf("ip %s bytes %lld packets %lld\n",
+ inet_ntoa((struct in_addr){htonl(next_key)}),
+ value.bytes, value.packets);
+ key = next_key;
+ }
+ sleep(1);
+ }
+ return 0;
+}
diff --git a/samples/bpf/sockex3_kern.c b/samples/bpf/sockex3_kern.c
new file mode 100644
index 000000000..c527b57d3
--- /dev/null
+++ b/samples/bpf/sockex3_kern.c
@@ -0,0 +1,290 @@
+/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+#include <uapi/linux/in.h>
+#include <uapi/linux/if.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/ip.h>
+#include <uapi/linux/ipv6.h>
+#include <uapi/linux/if_tunnel.h>
+#include <uapi/linux/mpls.h>
+#define IP_MF 0x2000
+#define IP_OFFSET 0x1FFF
+
+#define PROG(F) SEC("socket/"__stringify(F)) int bpf_func_##F
+
+struct bpf_map_def SEC("maps") jmp_table = {
+ .type = BPF_MAP_TYPE_PROG_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u32),
+ .max_entries = 8,
+};
+
+#define PARSE_VLAN 1
+#define PARSE_MPLS 2
+#define PARSE_IP 3
+#define PARSE_IPV6 4
+
+/* protocol dispatch routine.
+ * It tail-calls next BPF program depending on eth proto
+ * Note, we could have used:
+ * bpf_tail_call(skb, &jmp_table, proto);
+ * but it would need large prog_array
+ */
+static inline void parse_eth_proto(struct __sk_buff *skb, u32 proto)
+{
+ switch (proto) {
+ case ETH_P_8021Q:
+ case ETH_P_8021AD:
+ bpf_tail_call(skb, &jmp_table, PARSE_VLAN);
+ break;
+ case ETH_P_MPLS_UC:
+ case ETH_P_MPLS_MC:
+ bpf_tail_call(skb, &jmp_table, PARSE_MPLS);
+ break;
+ case ETH_P_IP:
+ bpf_tail_call(skb, &jmp_table, PARSE_IP);
+ break;
+ case ETH_P_IPV6:
+ bpf_tail_call(skb, &jmp_table, PARSE_IPV6);
+ break;
+ }
+}
+
+struct vlan_hdr {
+ __be16 h_vlan_TCI;
+ __be16 h_vlan_encapsulated_proto;
+};
+
+struct flow_key_record {
+ __be32 src;
+ __be32 dst;
+ union {
+ __be32 ports;
+ __be16 port16[2];
+ };
+ __u32 ip_proto;
+};
+
+static inline int ip_is_fragment(struct __sk_buff *ctx, __u64 nhoff)
+{
+ return load_half(ctx, nhoff + offsetof(struct iphdr, frag_off))
+ & (IP_MF | IP_OFFSET);
+}
+
+static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off)
+{
+ __u64 w0 = load_word(ctx, off);
+ __u64 w1 = load_word(ctx, off + 4);
+ __u64 w2 = load_word(ctx, off + 8);
+ __u64 w3 = load_word(ctx, off + 12);
+
+ return (__u32)(w0 ^ w1 ^ w2 ^ w3);
+}
+
+struct globals {
+ struct flow_key_record flow;
+};
+
+struct bpf_map_def SEC("maps") percpu_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(__u32),
+ .value_size = sizeof(struct globals),
+ .max_entries = 32,
+};
+
+/* user poor man's per_cpu until native support is ready */
+static struct globals *this_cpu_globals(void)
+{
+ u32 key = bpf_get_smp_processor_id();
+
+ return bpf_map_lookup_elem(&percpu_map, &key);
+}
+
+/* some simple stats for user space consumption */
+struct pair {
+ __u64 packets;
+ __u64 bytes;
+};
+
+struct bpf_map_def SEC("maps") hash_map = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(struct flow_key_record),
+ .value_size = sizeof(struct pair),
+ .max_entries = 1024,
+};
+
+static void update_stats(struct __sk_buff *skb, struct globals *g)
+{
+ struct flow_key_record key = g->flow;
+ struct pair *value;
+
+ value = bpf_map_lookup_elem(&hash_map, &key);
+ if (value) {
+ __sync_fetch_and_add(&value->packets, 1);
+ __sync_fetch_and_add(&value->bytes, skb->len);
+ } else {
+ struct pair val = {1, skb->len};
+
+ bpf_map_update_elem(&hash_map, &key, &val, BPF_ANY);
+ }
+}
+
+static __always_inline void parse_ip_proto(struct __sk_buff *skb,
+ struct globals *g, __u32 ip_proto)
+{
+ __u32 nhoff = skb->cb[0];
+ int poff;
+
+ switch (ip_proto) {
+ case IPPROTO_GRE: {
+ struct gre_hdr {
+ __be16 flags;
+ __be16 proto;
+ };
+
+ __u32 gre_flags = load_half(skb,
+ nhoff + offsetof(struct gre_hdr, flags));
+ __u32 gre_proto = load_half(skb,
+ nhoff + offsetof(struct gre_hdr, proto));
+
+ if (gre_flags & (GRE_VERSION|GRE_ROUTING))
+ break;
+
+ nhoff += 4;
+ if (gre_flags & GRE_CSUM)
+ nhoff += 4;
+ if (gre_flags & GRE_KEY)
+ nhoff += 4;
+ if (gre_flags & GRE_SEQ)
+ nhoff += 4;
+
+ skb->cb[0] = nhoff;
+ parse_eth_proto(skb, gre_proto);
+ break;
+ }
+ case IPPROTO_IPIP:
+ parse_eth_proto(skb, ETH_P_IP);
+ break;
+ case IPPROTO_IPV6:
+ parse_eth_proto(skb, ETH_P_IPV6);
+ break;
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ g->flow.ports = load_word(skb, nhoff);
+ case IPPROTO_ICMP:
+ g->flow.ip_proto = ip_proto;
+ update_stats(skb, g);
+ break;
+ default:
+ break;
+ }
+}
+
+PROG(PARSE_IP)(struct __sk_buff *skb)
+{
+ struct globals *g = this_cpu_globals();
+ __u32 nhoff, verlen, ip_proto;
+
+ if (!g)
+ return 0;
+
+ nhoff = skb->cb[0];
+
+ if (unlikely(ip_is_fragment(skb, nhoff)))
+ return 0;
+
+ ip_proto = load_byte(skb, nhoff + offsetof(struct iphdr, protocol));
+
+ if (ip_proto != IPPROTO_GRE) {
+ g->flow.src = load_word(skb, nhoff + offsetof(struct iphdr, saddr));
+ g->flow.dst = load_word(skb, nhoff + offsetof(struct iphdr, daddr));
+ }
+
+ verlen = load_byte(skb, nhoff + 0/*offsetof(struct iphdr, ihl)*/);
+ nhoff += (verlen & 0xF) << 2;
+
+ skb->cb[0] = nhoff;
+ parse_ip_proto(skb, g, ip_proto);
+ return 0;
+}
+
+PROG(PARSE_IPV6)(struct __sk_buff *skb)
+{
+ struct globals *g = this_cpu_globals();
+ __u32 nhoff, ip_proto;
+
+ if (!g)
+ return 0;
+
+ nhoff = skb->cb[0];
+
+ ip_proto = load_byte(skb,
+ nhoff + offsetof(struct ipv6hdr, nexthdr));
+ g->flow.src = ipv6_addr_hash(skb,
+ nhoff + offsetof(struct ipv6hdr, saddr));
+ g->flow.dst = ipv6_addr_hash(skb,
+ nhoff + offsetof(struct ipv6hdr, daddr));
+ nhoff += sizeof(struct ipv6hdr);
+
+ skb->cb[0] = nhoff;
+ parse_ip_proto(skb, g, ip_proto);
+ return 0;
+}
+
+PROG(PARSE_VLAN)(struct __sk_buff *skb)
+{
+ __u32 nhoff, proto;
+
+ nhoff = skb->cb[0];
+
+ proto = load_half(skb, nhoff + offsetof(struct vlan_hdr,
+ h_vlan_encapsulated_proto));
+ nhoff += sizeof(struct vlan_hdr);
+ skb->cb[0] = nhoff;
+
+ parse_eth_proto(skb, proto);
+
+ return 0;
+}
+
+PROG(PARSE_MPLS)(struct __sk_buff *skb)
+{
+ __u32 nhoff, label;
+
+ nhoff = skb->cb[0];
+
+ label = load_word(skb, nhoff);
+ nhoff += sizeof(struct mpls_label);
+ skb->cb[0] = nhoff;
+
+ if (label & MPLS_LS_S_MASK) {
+ __u8 verlen = load_byte(skb, nhoff);
+ if ((verlen & 0xF0) == 4)
+ parse_eth_proto(skb, ETH_P_IP);
+ else
+ parse_eth_proto(skb, ETH_P_IPV6);
+ } else {
+ parse_eth_proto(skb, ETH_P_MPLS_UC);
+ }
+
+ return 0;
+}
+
+SEC("socket/0")
+int main_prog(struct __sk_buff *skb)
+{
+ __u32 nhoff = ETH_HLEN;
+ __u32 proto = load_half(skb, 12);
+
+ skb->cb[0] = nhoff;
+ parse_eth_proto(skb, proto);
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c
new file mode 100644
index 000000000..9d02e0404
--- /dev/null
+++ b/samples/bpf/sockex3_user.c
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <assert.h>
+#include <linux/bpf.h>
+#include <bpf/bpf.h>
+#include "bpf_load.h"
+#include "sock_example.h"
+#include <unistd.h>
+#include <arpa/inet.h>
+#include <sys/resource.h>
+
+#define PARSE_IP 3
+#define PARSE_IP_PROG_FD (prog_fd[0])
+#define PROG_ARRAY_FD (map_fd[0])
+
+struct flow_key_record {
+ __be32 src;
+ __be32 dst;
+ union {
+ __be32 ports;
+ __be16 port16[2];
+ };
+ __u32 ip_proto;
+};
+
+struct pair {
+ __u64 packets;
+ __u64 bytes;
+};
+
+int main(int argc, char **argv)
+{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ char filename[256];
+ FILE *f;
+ int i, sock, err, id, key = PARSE_IP;
+ struct bpf_prog_info info = {};
+ uint32_t info_len = sizeof(info);
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ setrlimit(RLIMIT_MEMLOCK, &r);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ /* Test fd array lookup which returns the id of the bpf_prog */
+ err = bpf_obj_get_info_by_fd(PARSE_IP_PROG_FD, &info, &info_len);
+ assert(!err);
+ err = bpf_map_lookup_elem(PROG_ARRAY_FD, &key, &id);
+ assert(!err);
+ assert(id == info.id);
+
+ sock = open_raw_sock("lo");
+
+ assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd[4],
+ sizeof(__u32)) == 0);
+
+ if (argc > 1)
+ f = popen("ping -c5 localhost", "r");
+ else
+ f = popen("netperf -l 4 localhost", "r");
+ (void) f;
+
+ for (i = 0; i < 5; i++) {
+ struct flow_key_record key = {}, next_key;
+ struct pair value;
+
+ sleep(1);
+ printf("IP src.port -> dst.port bytes packets\n");
+ while (bpf_map_get_next_key(map_fd[2], &key, &next_key) == 0) {
+ bpf_map_lookup_elem(map_fd[2], &next_key, &value);
+ printf("%s.%05d -> %s.%05d %12lld %12lld\n",
+ inet_ntoa((struct in_addr){htonl(next_key.src)}),
+ next_key.port16[0],
+ inet_ntoa((struct in_addr){htonl(next_key.dst)}),
+ next_key.port16[1],
+ value.bytes, value.packets);
+ key = next_key;
+ }
+ }
+ return 0;
+}
diff --git a/samples/bpf/spintest_kern.c b/samples/bpf/spintest_kern.c
new file mode 100644
index 000000000..ce0167d09
--- /dev/null
+++ b/samples/bpf/spintest_kern.c
@@ -0,0 +1,68 @@
+/* Copyright (c) 2016, Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/perf_event.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(long),
+ .value_size = sizeof(long),
+ .max_entries = 1024,
+};
+struct bpf_map_def SEC("maps") my_map2 = {
+ .type = BPF_MAP_TYPE_PERCPU_HASH,
+ .key_size = sizeof(long),
+ .value_size = sizeof(long),
+ .max_entries = 1024,
+};
+
+struct bpf_map_def SEC("maps") stackmap = {
+ .type = BPF_MAP_TYPE_STACK_TRACE,
+ .key_size = sizeof(u32),
+ .value_size = PERF_MAX_STACK_DEPTH * sizeof(u64),
+ .max_entries = 10000,
+};
+
+#define PROG(foo) \
+int foo(struct pt_regs *ctx) \
+{ \
+ long v = PT_REGS_IP(ctx), *val; \
+\
+ val = bpf_map_lookup_elem(&my_map, &v); \
+ bpf_map_update_elem(&my_map, &v, &v, BPF_ANY); \
+ bpf_map_update_elem(&my_map2, &v, &v, BPF_ANY); \
+ bpf_map_delete_elem(&my_map2, &v); \
+ bpf_get_stackid(ctx, &stackmap, BPF_F_REUSE_STACKID); \
+ return 0; \
+}
+
+/* add kprobes to all possible *spin* functions */
+SEC("kprobe/spin_unlock")PROG(p1)
+SEC("kprobe/spin_lock")PROG(p2)
+SEC("kprobe/mutex_spin_on_owner")PROG(p3)
+SEC("kprobe/rwsem_spin_on_owner")PROG(p4)
+SEC("kprobe/spin_unlock_irqrestore")PROG(p5)
+SEC("kprobe/_raw_spin_unlock_irqrestore")PROG(p6)
+SEC("kprobe/_raw_spin_unlock_bh")PROG(p7)
+SEC("kprobe/_raw_spin_unlock")PROG(p8)
+SEC("kprobe/_raw_spin_lock_irqsave")PROG(p9)
+SEC("kprobe/_raw_spin_trylock_bh")PROG(p10)
+SEC("kprobe/_raw_spin_lock_irq")PROG(p11)
+SEC("kprobe/_raw_spin_trylock")PROG(p12)
+SEC("kprobe/_raw_spin_lock")PROG(p13)
+SEC("kprobe/_raw_spin_lock_bh")PROG(p14)
+/* and to inner bpf helpers */
+SEC("kprobe/htab_map_update_elem")PROG(p15)
+SEC("kprobe/__htab_percpu_map_update_elem")PROG(p16)
+SEC("kprobe/htab_map_alloc")PROG(p17)
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/spintest_user.c b/samples/bpf/spintest_user.c
new file mode 100644
index 000000000..8d3e9cfa1
--- /dev/null
+++ b/samples/bpf/spintest_user.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <unistd.h>
+#include <linux/bpf.h>
+#include <string.h>
+#include <assert.h>
+#include <sys/resource.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+#include "trace_helpers.h"
+
+int main(int ac, char **argv)
+{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ long key, next_key, value;
+ char filename[256];
+ struct ksym *sym;
+ int i;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ setrlimit(RLIMIT_MEMLOCK, &r);
+
+ if (load_kallsyms()) {
+ printf("failed to process /proc/kallsyms\n");
+ return 2;
+ }
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ for (i = 0; i < 5; i++) {
+ key = 0;
+ printf("kprobing funcs:");
+ while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0) {
+ bpf_map_lookup_elem(map_fd[0], &next_key, &value);
+ assert(next_key == value);
+ sym = ksym_search(value);
+ printf(" %s", sym->name);
+ key = next_key;
+ }
+ if (key)
+ printf("\n");
+ key = 0;
+ while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0)
+ bpf_map_delete_elem(map_fd[0], &next_key);
+ sleep(1);
+ }
+
+ return 0;
+}
diff --git a/samples/bpf/syscall_nrs.c b/samples/bpf/syscall_nrs.c
new file mode 100644
index 000000000..516e255cb
--- /dev/null
+++ b/samples/bpf/syscall_nrs.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <uapi/linux/unistd.h>
+#include <linux/kbuild.h>
+
+#define SYSNR(_NR) DEFINE(SYS ## _NR, _NR)
+
+void syscall_defines(void)
+{
+ COMMENT("Linux system call numbers.");
+ SYSNR(__NR_write);
+ SYSNR(__NR_read);
+ SYSNR(__NR_mmap);
+}
diff --git a/samples/bpf/syscall_tp_kern.c b/samples/bpf/syscall_tp_kern.c
new file mode 100644
index 000000000..8833aacb9
--- /dev/null
+++ b/samples/bpf/syscall_tp_kern.c
@@ -0,0 +1,76 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct syscalls_enter_open_args {
+ unsigned long long unused;
+ long syscall_nr;
+ long filename_ptr;
+ long flags;
+ long mode;
+};
+
+struct syscalls_exit_open_args {
+ unsigned long long unused;
+ long syscall_nr;
+ long ret;
+};
+
+struct bpf_map_def SEC("maps") enter_open_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u32),
+ .max_entries = 1,
+};
+
+struct bpf_map_def SEC("maps") exit_open_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u32),
+ .max_entries = 1,
+};
+
+static __always_inline void count(void *map)
+{
+ u32 key = 0;
+ u32 *value, init_val = 1;
+
+ value = bpf_map_lookup_elem(map, &key);
+ if (value)
+ *value += 1;
+ else
+ bpf_map_update_elem(map, &key, &init_val, BPF_NOEXIST);
+}
+
+SEC("tracepoint/syscalls/sys_enter_open")
+int trace_enter_open(struct syscalls_enter_open_args *ctx)
+{
+ count(&enter_open_map);
+ return 0;
+}
+
+SEC("tracepoint/syscalls/sys_enter_openat")
+int trace_enter_open_at(struct syscalls_enter_open_args *ctx)
+{
+ count(&enter_open_map);
+ return 0;
+}
+
+SEC("tracepoint/syscalls/sys_exit_open")
+int trace_enter_exit(struct syscalls_exit_open_args *ctx)
+{
+ count(&exit_open_map);
+ return 0;
+}
+
+SEC("tracepoint/syscalls/sys_exit_openat")
+int trace_enter_exit_at(struct syscalls_exit_open_args *ctx)
+{
+ count(&exit_open_map);
+ return 0;
+}
diff --git a/samples/bpf/syscall_tp_user.c b/samples/bpf/syscall_tp_user.c
new file mode 100644
index 000000000..1a1d0059a
--- /dev/null
+++ b/samples/bpf/syscall_tp_user.c
@@ -0,0 +1,111 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/bpf.h>
+#include <string.h>
+#include <linux/perf_event.h>
+#include <errno.h>
+#include <assert.h>
+#include <stdbool.h>
+#include <sys/resource.h>
+#include <bpf/bpf.h>
+#include "bpf_load.h"
+
+/* This program verifies bpf attachment to tracepoint sys_enter_* and sys_exit_*.
+ * This requires kernel CONFIG_FTRACE_SYSCALLS to be set.
+ */
+
+static void usage(const char *cmd)
+{
+ printf("USAGE: %s [-i num_progs] [-h]\n", cmd);
+ printf(" -i num_progs # number of progs of the test\n");
+ printf(" -h # help\n");
+}
+
+static void verify_map(int map_id)
+{
+ __u32 key = 0;
+ __u32 val;
+
+ if (bpf_map_lookup_elem(map_id, &key, &val) != 0) {
+ fprintf(stderr, "map_lookup failed: %s\n", strerror(errno));
+ return;
+ }
+ if (val == 0) {
+ fprintf(stderr, "failed: map #%d returns value 0\n", map_id);
+ return;
+ }
+ val = 0;
+ if (bpf_map_update_elem(map_id, &key, &val, BPF_ANY) != 0) {
+ fprintf(stderr, "map_update failed: %s\n", strerror(errno));
+ return;
+ }
+}
+
+static int test(char *filename, int num_progs)
+{
+ int i, fd, map0_fds[num_progs], map1_fds[num_progs];
+
+ for (i = 0; i < num_progs; i++) {
+ if (load_bpf_file(filename)) {
+ fprintf(stderr, "%s", bpf_log_buf);
+ return 1;
+ }
+ printf("prog #%d: map ids %d %d\n", i, map_fd[0], map_fd[1]);
+ map0_fds[i] = map_fd[0];
+ map1_fds[i] = map_fd[1];
+ }
+
+ /* current load_bpf_file has perf_event_open default pid = -1
+ * and cpu = 0, which permits attached bpf execution on
+ * all cpus for all pid's. bpf program execution ignores
+ * cpu affinity.
+ */
+ /* trigger some "open" operations */
+ fd = open(filename, O_RDONLY);
+ if (fd < 0) {
+ fprintf(stderr, "open failed: %s\n", strerror(errno));
+ return 1;
+ }
+ close(fd);
+
+ /* verify the map */
+ for (i = 0; i < num_progs; i++) {
+ verify_map(map0_fds[i]);
+ verify_map(map1_fds[i]);
+ }
+
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ int opt, num_progs = 1;
+ char filename[256];
+
+ while ((opt = getopt(argc, argv, "i:h")) != -1) {
+ switch (opt) {
+ case 'i':
+ num_progs = atoi(optarg);
+ break;
+ case 'h':
+ default:
+ usage(argv[0]);
+ return 0;
+ }
+ }
+
+ setrlimit(RLIMIT_MEMLOCK, &r);
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ return test(filename, num_progs);
+}
diff --git a/samples/bpf/task_fd_query_kern.c b/samples/bpf/task_fd_query_kern.c
new file mode 100644
index 000000000..f4b0a9ea6
--- /dev/null
+++ b/samples/bpf/task_fd_query_kern.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/version.h>
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+SEC("kprobe/blk_start_request")
+int bpf_prog1(struct pt_regs *ctx)
+{
+ return 0;
+}
+
+SEC("kretprobe/blk_account_io_completion")
+int bpf_prog2(struct pt_regs *ctx)
+{
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c
new file mode 100644
index 000000000..06957f0fb
--- /dev/null
+++ b/samples/bpf/task_fd_query_user.c
@@ -0,0 +1,382 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <linux/bpf.h>
+#include <sys/ioctl.h>
+#include <sys/resource.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "libbpf.h"
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include "perf-sys.h"
+#include "trace_helpers.h"
+
+#define CHECK_PERROR_RET(condition) ({ \
+ int __ret = !!(condition); \
+ if (__ret) { \
+ printf("FAIL: %s:\n", __func__); \
+ perror(" "); \
+ return -1; \
+ } \
+})
+
+#define CHECK_AND_RET(condition) ({ \
+ int __ret = !!(condition); \
+ if (__ret) \
+ return -1; \
+})
+
+static __u64 ptr_to_u64(void *ptr)
+{
+ return (__u64) (unsigned long) ptr;
+}
+
+#define PMU_TYPE_FILE "/sys/bus/event_source/devices/%s/type"
+static int bpf_find_probe_type(const char *event_type)
+{
+ char buf[256];
+ int fd, ret;
+
+ ret = snprintf(buf, sizeof(buf), PMU_TYPE_FILE, event_type);
+ CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+ fd = open(buf, O_RDONLY);
+ CHECK_PERROR_RET(fd < 0);
+
+ ret = read(fd, buf, sizeof(buf));
+ close(fd);
+ CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+ errno = 0;
+ ret = (int)strtol(buf, NULL, 10);
+ CHECK_PERROR_RET(errno);
+ return ret;
+}
+
+#define PMU_RETPROBE_FILE "/sys/bus/event_source/devices/%s/format/retprobe"
+static int bpf_get_retprobe_bit(const char *event_type)
+{
+ char buf[256];
+ int fd, ret;
+
+ ret = snprintf(buf, sizeof(buf), PMU_RETPROBE_FILE, event_type);
+ CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+ fd = open(buf, O_RDONLY);
+ CHECK_PERROR_RET(fd < 0);
+
+ ret = read(fd, buf, sizeof(buf));
+ close(fd);
+ CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+ CHECK_PERROR_RET(strlen(buf) < strlen("config:"));
+
+ errno = 0;
+ ret = (int)strtol(buf + strlen("config:"), NULL, 10);
+ CHECK_PERROR_RET(errno);
+ return ret;
+}
+
+static int test_debug_fs_kprobe(int prog_fd_idx, const char *fn_name,
+ __u32 expected_fd_type)
+{
+ __u64 probe_offset, probe_addr;
+ __u32 len, prog_id, fd_type;
+ char buf[256];
+ int err;
+
+ len = sizeof(buf);
+ err = bpf_task_fd_query(getpid(), event_fd[prog_fd_idx], 0, buf, &len,
+ &prog_id, &fd_type, &probe_offset,
+ &probe_addr);
+ if (err < 0) {
+ printf("FAIL: %s, for event_fd idx %d, fn_name %s\n",
+ __func__, prog_fd_idx, fn_name);
+ perror(" :");
+ return -1;
+ }
+ if (strcmp(buf, fn_name) != 0 ||
+ fd_type != expected_fd_type ||
+ probe_offset != 0x0 || probe_addr != 0x0) {
+ printf("FAIL: bpf_trace_event_query(event_fd[%d]):\n",
+ prog_fd_idx);
+ printf("buf: %s, fd_type: %u, probe_offset: 0x%llx,"
+ " probe_addr: 0x%llx\n",
+ buf, fd_type, probe_offset, probe_addr);
+ return -1;
+ }
+ return 0;
+}
+
+static int test_nondebug_fs_kuprobe_common(const char *event_type,
+ const char *name, __u64 offset, __u64 addr, bool is_return,
+ char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type,
+ __u64 *probe_offset, __u64 *probe_addr)
+{
+ int is_return_bit = bpf_get_retprobe_bit(event_type);
+ int type = bpf_find_probe_type(event_type);
+ struct perf_event_attr attr = {};
+ int fd;
+
+ if (type < 0 || is_return_bit < 0) {
+ printf("FAIL: %s incorrect type (%d) or is_return_bit (%d)\n",
+ __func__, type, is_return_bit);
+ return -1;
+ }
+
+ attr.sample_period = 1;
+ attr.wakeup_events = 1;
+ if (is_return)
+ attr.config |= 1 << is_return_bit;
+
+ if (name) {
+ attr.config1 = ptr_to_u64((void *)name);
+ attr.config2 = offset;
+ } else {
+ attr.config1 = 0;
+ attr.config2 = addr;
+ }
+ attr.size = sizeof(attr);
+ attr.type = type;
+
+ fd = sys_perf_event_open(&attr, -1, 0, -1, 0);
+ CHECK_PERROR_RET(fd < 0);
+
+ CHECK_PERROR_RET(ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) < 0);
+ CHECK_PERROR_RET(ioctl(fd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0);
+ CHECK_PERROR_RET(bpf_task_fd_query(getpid(), fd, 0, buf, buf_len,
+ prog_id, fd_type, probe_offset, probe_addr) < 0);
+
+ return 0;
+}
+
+static int test_nondebug_fs_probe(const char *event_type, const char *name,
+ __u64 offset, __u64 addr, bool is_return,
+ __u32 expected_fd_type,
+ __u32 expected_ret_fd_type,
+ char *buf, __u32 buf_len)
+{
+ __u64 probe_offset, probe_addr;
+ __u32 prog_id, fd_type;
+ int err;
+
+ err = test_nondebug_fs_kuprobe_common(event_type, name,
+ offset, addr, is_return,
+ buf, &buf_len, &prog_id,
+ &fd_type, &probe_offset,
+ &probe_addr);
+ if (err < 0) {
+ printf("FAIL: %s, "
+ "for name %s, offset 0x%llx, addr 0x%llx, is_return %d\n",
+ __func__, name ? name : "", offset, addr, is_return);
+ perror(" :");
+ return -1;
+ }
+ if ((is_return && fd_type != expected_ret_fd_type) ||
+ (!is_return && fd_type != expected_fd_type)) {
+ printf("FAIL: %s, incorrect fd_type %u\n",
+ __func__, fd_type);
+ return -1;
+ }
+ if (name) {
+ if (strcmp(name, buf) != 0) {
+ printf("FAIL: %s, incorrect buf %s\n", __func__, buf);
+ return -1;
+ }
+ if (probe_offset != offset) {
+ printf("FAIL: %s, incorrect probe_offset 0x%llx\n",
+ __func__, probe_offset);
+ return -1;
+ }
+ } else {
+ if (buf_len != 0) {
+ printf("FAIL: %s, incorrect buf %p\n",
+ __func__, buf);
+ return -1;
+ }
+
+ if (probe_addr != addr) {
+ printf("FAIL: %s, incorrect probe_addr 0x%llx\n",
+ __func__, probe_addr);
+ return -1;
+ }
+ }
+ return 0;
+}
+
+static int test_debug_fs_uprobe(char *binary_path, long offset, bool is_return)
+{
+ const char *event_type = "uprobe";
+ struct perf_event_attr attr = {};
+ char buf[256], event_alias[sizeof("test_1234567890")];
+ __u64 probe_offset, probe_addr;
+ __u32 len, prog_id, fd_type;
+ int err, res, kfd, efd;
+ ssize_t bytes;
+
+ snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events",
+ event_type);
+ kfd = open(buf, O_WRONLY | O_APPEND, 0);
+ CHECK_PERROR_RET(kfd < 0);
+
+ res = snprintf(event_alias, sizeof(event_alias), "test_%d", getpid());
+ CHECK_PERROR_RET(res < 0 || res >= sizeof(event_alias));
+
+ res = snprintf(buf, sizeof(buf), "%c:%ss/%s %s:0x%lx",
+ is_return ? 'r' : 'p', event_type, event_alias,
+ binary_path, offset);
+ CHECK_PERROR_RET(res < 0 || res >= sizeof(buf));
+ CHECK_PERROR_RET(write(kfd, buf, strlen(buf)) < 0);
+
+ close(kfd);
+ kfd = -1;
+
+ snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s/id",
+ event_type, event_alias);
+ efd = open(buf, O_RDONLY, 0);
+ CHECK_PERROR_RET(efd < 0);
+
+ bytes = read(efd, buf, sizeof(buf));
+ CHECK_PERROR_RET(bytes <= 0 || bytes >= sizeof(buf));
+ close(efd);
+ buf[bytes] = '\0';
+
+ attr.config = strtol(buf, NULL, 0);
+ attr.type = PERF_TYPE_TRACEPOINT;
+ attr.sample_period = 1;
+ attr.wakeup_events = 1;
+ kfd = sys_perf_event_open(&attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC);
+ CHECK_PERROR_RET(kfd < 0);
+ CHECK_PERROR_RET(ioctl(kfd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0);
+ CHECK_PERROR_RET(ioctl(kfd, PERF_EVENT_IOC_ENABLE, 0) < 0);
+
+ len = sizeof(buf);
+ err = bpf_task_fd_query(getpid(), kfd, 0, buf, &len,
+ &prog_id, &fd_type, &probe_offset,
+ &probe_addr);
+ if (err < 0) {
+ printf("FAIL: %s, binary_path %s\n", __func__, binary_path);
+ perror(" :");
+ return -1;
+ }
+ if ((is_return && fd_type != BPF_FD_TYPE_URETPROBE) ||
+ (!is_return && fd_type != BPF_FD_TYPE_UPROBE)) {
+ printf("FAIL: %s, incorrect fd_type %u\n", __func__,
+ fd_type);
+ return -1;
+ }
+ if (strcmp(binary_path, buf) != 0) {
+ printf("FAIL: %s, incorrect buf %s\n", __func__, buf);
+ return -1;
+ }
+ if (probe_offset != offset) {
+ printf("FAIL: %s, incorrect probe_offset 0x%llx\n", __func__,
+ probe_offset);
+ return -1;
+ }
+
+ close(kfd);
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ struct rlimit r = {1024*1024, RLIM_INFINITY};
+ extern char __executable_start;
+ char filename[256], buf[256];
+ __u64 uprobe_file_offset;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK)");
+ return 1;
+ }
+
+ if (load_kallsyms()) {
+ printf("failed to process /proc/kallsyms\n");
+ return 1;
+ }
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ /* test two functions in the corresponding *_kern.c file */
+ CHECK_AND_RET(test_debug_fs_kprobe(0, "blk_start_request",
+ BPF_FD_TYPE_KPROBE));
+ CHECK_AND_RET(test_debug_fs_kprobe(1, "blk_account_io_completion",
+ BPF_FD_TYPE_KRETPROBE));
+
+ /* test nondebug fs kprobe */
+ CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x0, 0x0,
+ false, BPF_FD_TYPE_KPROBE,
+ BPF_FD_TYPE_KRETPROBE,
+ buf, sizeof(buf)));
+#ifdef __x86_64__
+ /* set a kprobe on "bpf_check + 0x5", which is x64 specific */
+ CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x5, 0x0,
+ false, BPF_FD_TYPE_KPROBE,
+ BPF_FD_TYPE_KRETPROBE,
+ buf, sizeof(buf)));
+#endif
+ CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x0, 0x0,
+ true, BPF_FD_TYPE_KPROBE,
+ BPF_FD_TYPE_KRETPROBE,
+ buf, sizeof(buf)));
+ CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
+ ksym_get_addr("bpf_check"), false,
+ BPF_FD_TYPE_KPROBE,
+ BPF_FD_TYPE_KRETPROBE,
+ buf, sizeof(buf)));
+ CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
+ ksym_get_addr("bpf_check"), false,
+ BPF_FD_TYPE_KPROBE,
+ BPF_FD_TYPE_KRETPROBE,
+ NULL, 0));
+ CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
+ ksym_get_addr("bpf_check"), true,
+ BPF_FD_TYPE_KPROBE,
+ BPF_FD_TYPE_KRETPROBE,
+ buf, sizeof(buf)));
+ CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
+ ksym_get_addr("bpf_check"), true,
+ BPF_FD_TYPE_KPROBE,
+ BPF_FD_TYPE_KRETPROBE,
+ 0, 0));
+
+ /* test nondebug fs uprobe */
+ /* the calculation of uprobe file offset is based on gcc 7.3.1 on x64
+ * and the default linker script, which defines __executable_start as
+ * the start of the .text section. The calculation could be different
+ * on different systems with different compilers. The right way is
+ * to parse the ELF file. We took a shortcut here.
+ */
+ uprobe_file_offset = (__u64)main - (__u64)&__executable_start;
+ CHECK_AND_RET(test_nondebug_fs_probe("uprobe", (char *)argv[0],
+ uprobe_file_offset, 0x0, false,
+ BPF_FD_TYPE_UPROBE,
+ BPF_FD_TYPE_URETPROBE,
+ buf, sizeof(buf)));
+ CHECK_AND_RET(test_nondebug_fs_probe("uprobe", (char *)argv[0],
+ uprobe_file_offset, 0x0, true,
+ BPF_FD_TYPE_UPROBE,
+ BPF_FD_TYPE_URETPROBE,
+ buf, sizeof(buf)));
+
+ /* test debug fs uprobe */
+ CHECK_AND_RET(test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset,
+ false));
+ CHECK_AND_RET(test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset,
+ true));
+
+ return 0;
+}
diff --git a/samples/bpf/tc_l2_redirect.sh b/samples/bpf/tc_l2_redirect.sh
new file mode 100755
index 000000000..37d95ef3c
--- /dev/null
+++ b/samples/bpf/tc_l2_redirect.sh
@@ -0,0 +1,174 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+[[ -z $TC ]] && TC='tc'
+[[ -z $IP ]] && IP='ip'
+
+REDIRECT_USER='./tc_l2_redirect'
+REDIRECT_BPF='./tc_l2_redirect_kern.o'
+
+RP_FILTER=$(< /proc/sys/net/ipv4/conf/all/rp_filter)
+IPV6_FORWARDING=$(< /proc/sys/net/ipv6/conf/all/forwarding)
+
+function config_common {
+ local tun_type=$1
+
+ $IP netns add ns1
+ $IP netns add ns2
+ $IP link add ve1 type veth peer name vens1
+ $IP link add ve2 type veth peer name vens2
+ $IP link set dev ve1 up
+ $IP link set dev ve2 up
+ $IP link set dev ve1 mtu 1500
+ $IP link set dev ve2 mtu 1500
+ $IP link set dev vens1 netns ns1
+ $IP link set dev vens2 netns ns2
+
+ $IP -n ns1 link set dev lo up
+ $IP -n ns1 link set dev vens1 up
+ $IP -n ns1 addr add 10.1.1.101/24 dev vens1
+ $IP -n ns1 addr add 2401:db01::65/64 dev vens1 nodad
+ $IP -n ns1 route add default via 10.1.1.1 dev vens1
+ $IP -n ns1 route add default via 2401:db01::1 dev vens1
+
+ $IP -n ns2 link set dev lo up
+ $IP -n ns2 link set dev vens2 up
+ $IP -n ns2 addr add 10.2.1.102/24 dev vens2
+ $IP -n ns2 addr add 2401:db02::66/64 dev vens2 nodad
+ $IP -n ns2 addr add 10.10.1.102 dev lo
+ $IP -n ns2 addr add 2401:face::66/64 dev lo nodad
+ $IP -n ns2 link add ipt2 type ipip local 10.2.1.102 remote 10.2.1.1
+ $IP -n ns2 link add ip6t2 type ip6tnl mode any local 2401:db02::66 remote 2401:db02::1
+ $IP -n ns2 link set dev ipt2 up
+ $IP -n ns2 link set dev ip6t2 up
+ $IP netns exec ns2 $TC qdisc add dev vens2 clsact
+ $IP netns exec ns2 $TC filter add dev vens2 ingress bpf da obj $REDIRECT_BPF sec drop_non_tun_vip
+ if [[ $tun_type == "ipip" ]]; then
+ $IP -n ns2 route add 10.1.1.0/24 dev ipt2
+ $IP netns exec ns2 sysctl -q -w net.ipv4.conf.all.rp_filter=0
+ $IP netns exec ns2 sysctl -q -w net.ipv4.conf.ipt2.rp_filter=0
+ else
+ $IP -n ns2 route add 10.1.1.0/24 dev ip6t2
+ $IP -n ns2 route add 2401:db01::/64 dev ip6t2
+ $IP netns exec ns2 sysctl -q -w net.ipv4.conf.all.rp_filter=0
+ $IP netns exec ns2 sysctl -q -w net.ipv4.conf.ip6t2.rp_filter=0
+ fi
+
+ $IP addr add 10.1.1.1/24 dev ve1
+ $IP addr add 2401:db01::1/64 dev ve1 nodad
+ $IP addr add 10.2.1.1/24 dev ve2
+ $IP addr add 2401:db02::1/64 dev ve2 nodad
+
+ $TC qdisc add dev ve2 clsact
+ $TC filter add dev ve2 ingress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_forward
+
+ sysctl -q -w net.ipv4.conf.all.rp_filter=0
+ sysctl -q -w net.ipv6.conf.all.forwarding=1
+}
+
+function cleanup {
+ set +e
+ [[ -z $DEBUG ]] || set +x
+ $IP netns delete ns1 >& /dev/null
+ $IP netns delete ns2 >& /dev/null
+ $IP link del ve1 >& /dev/null
+ $IP link del ve2 >& /dev/null
+ $IP link del ipt >& /dev/null
+ $IP link del ip6t >& /dev/null
+ sysctl -q -w net.ipv4.conf.all.rp_filter=$RP_FILTER
+ sysctl -q -w net.ipv6.conf.all.forwarding=$IPV6_FORWARDING
+ rm -f /sys/fs/bpf/tc/globals/tun_iface
+ [[ -z $DEBUG ]] || set -x
+ set -e
+}
+
+function l2_to_ipip {
+ echo -n "l2_to_ipip $1: "
+
+ local dir=$1
+
+ config_common ipip
+
+ $IP link add ipt type ipip external
+ $IP link set dev ipt up
+ sysctl -q -w net.ipv4.conf.ipt.rp_filter=0
+ sysctl -q -w net.ipv4.conf.ipt.forwarding=1
+
+ if [[ $dir == "egress" ]]; then
+ $IP route add 10.10.1.0/24 via 10.2.1.102 dev ve2
+ $TC filter add dev ve2 egress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_redirect
+ sysctl -q -w net.ipv4.conf.ve1.forwarding=1
+ else
+ $TC qdisc add dev ve1 clsact
+ $TC filter add dev ve1 ingress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_redirect
+ fi
+
+ $REDIRECT_USER -U /sys/fs/bpf/tc/globals/tun_iface -i $(< /sys/class/net/ipt/ifindex)
+
+ $IP netns exec ns1 ping -c1 10.10.1.102 >& /dev/null
+
+ if [[ $dir == "egress" ]]; then
+ # test direct egress to ve2 (i.e. not forwarding from
+ # ve1 to ve2).
+ ping -c1 10.10.1.102 >& /dev/null
+ fi
+
+ cleanup
+
+ echo "OK"
+}
+
+function l2_to_ip6tnl {
+ echo -n "l2_to_ip6tnl $1: "
+
+ local dir=$1
+
+ config_common ip6tnl
+
+ $IP link add ip6t type ip6tnl mode any external
+ $IP link set dev ip6t up
+ sysctl -q -w net.ipv4.conf.ip6t.rp_filter=0
+ sysctl -q -w net.ipv4.conf.ip6t.forwarding=1
+
+ if [[ $dir == "egress" ]]; then
+ $IP route add 10.10.1.0/24 via 10.2.1.102 dev ve2
+ $IP route add 2401:face::/64 via 2401:db02::66 dev ve2
+ $TC filter add dev ve2 egress bpf da obj $REDIRECT_BPF sec l2_to_ip6tun_ingress_redirect
+ sysctl -q -w net.ipv4.conf.ve1.forwarding=1
+ else
+ $TC qdisc add dev ve1 clsact
+ $TC filter add dev ve1 ingress bpf da obj $REDIRECT_BPF sec l2_to_ip6tun_ingress_redirect
+ fi
+
+ $REDIRECT_USER -U /sys/fs/bpf/tc/globals/tun_iface -i $(< /sys/class/net/ip6t/ifindex)
+
+ $IP netns exec ns1 ping -c1 10.10.1.102 >& /dev/null
+ $IP netns exec ns1 ping -6 -c1 2401:face::66 >& /dev/null
+
+ if [[ $dir == "egress" ]]; then
+ # test direct egress to ve2 (i.e. not forwarding from
+ # ve1 to ve2).
+ ping -c1 10.10.1.102 >& /dev/null
+ ping -6 -c1 2401:face::66 >& /dev/null
+ fi
+
+ cleanup
+
+ echo "OK"
+}
+
+cleanup
+test_names="l2_to_ipip l2_to_ip6tnl"
+test_dirs="ingress egress"
+if [[ $# -ge 2 ]]; then
+ test_names=$1
+ test_dirs=$2
+elif [[ $# -ge 1 ]]; then
+ test_names=$1
+fi
+
+for t in $test_names; do
+ for d in $test_dirs; do
+ $t $d
+ done
+done
diff --git a/samples/bpf/tc_l2_redirect_kern.c b/samples/bpf/tc_l2_redirect_kern.c
new file mode 100644
index 000000000..7ef2a12b2
--- /dev/null
+++ b/samples/bpf/tc_l2_redirect_kern.c
@@ -0,0 +1,237 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <uapi/linux/ipv6.h>
+#include <uapi/linux/in.h>
+#include <uapi/linux/tcp.h>
+#include <uapi/linux/filter.h>
+#include <uapi/linux/pkt_cls.h>
+#include <net/ipv6.h>
+#include "bpf_helpers.h"
+
+#define _htonl __builtin_bswap32
+
+#define PIN_GLOBAL_NS 2
+struct bpf_elf_map {
+ __u32 type;
+ __u32 size_key;
+ __u32 size_value;
+ __u32 max_elem;
+ __u32 flags;
+ __u32 id;
+ __u32 pinning;
+};
+
+/* copy of 'struct ethhdr' without __packed */
+struct eth_hdr {
+ unsigned char h_dest[ETH_ALEN];
+ unsigned char h_source[ETH_ALEN];
+ unsigned short h_proto;
+};
+
+struct bpf_elf_map SEC("maps") tun_iface = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .size_key = sizeof(int),
+ .size_value = sizeof(int),
+ .pinning = PIN_GLOBAL_NS,
+ .max_elem = 1,
+};
+
+static __always_inline bool is_vip_addr(__be16 eth_proto, __be32 daddr)
+{
+ if (eth_proto == htons(ETH_P_IP))
+ return (_htonl(0xffffff00) & daddr) == _htonl(0x0a0a0100);
+ else if (eth_proto == htons(ETH_P_IPV6))
+ return (daddr == _htonl(0x2401face));
+
+ return false;
+}
+
+SEC("l2_to_iptun_ingress_forward")
+int _l2_to_iptun_ingress_forward(struct __sk_buff *skb)
+{
+ struct bpf_tunnel_key tkey = {};
+ void *data = (void *)(long)skb->data;
+ struct eth_hdr *eth = data;
+ void *data_end = (void *)(long)skb->data_end;
+ int key = 0, *ifindex;
+
+ int ret;
+
+ if (data + sizeof(*eth) > data_end)
+ return TC_ACT_OK;
+
+ ifindex = bpf_map_lookup_elem(&tun_iface, &key);
+ if (!ifindex)
+ return TC_ACT_OK;
+
+ if (eth->h_proto == htons(ETH_P_IP)) {
+ char fmt4[] = "ingress forward to ifindex:%d daddr4:%x\n";
+ struct iphdr *iph = data + sizeof(*eth);
+
+ if (data + sizeof(*eth) + sizeof(*iph) > data_end)
+ return TC_ACT_OK;
+
+ if (iph->protocol != IPPROTO_IPIP)
+ return TC_ACT_OK;
+
+ bpf_trace_printk(fmt4, sizeof(fmt4), *ifindex,
+ _htonl(iph->daddr));
+ return bpf_redirect(*ifindex, BPF_F_INGRESS);
+ } else if (eth->h_proto == htons(ETH_P_IPV6)) {
+ char fmt6[] = "ingress forward to ifindex:%d daddr6:%x::%x\n";
+ struct ipv6hdr *ip6h = data + sizeof(*eth);
+
+ if (data + sizeof(*eth) + sizeof(*ip6h) > data_end)
+ return TC_ACT_OK;
+
+ if (ip6h->nexthdr != IPPROTO_IPIP &&
+ ip6h->nexthdr != IPPROTO_IPV6)
+ return TC_ACT_OK;
+
+ bpf_trace_printk(fmt6, sizeof(fmt6), *ifindex,
+ _htonl(ip6h->daddr.s6_addr32[0]),
+ _htonl(ip6h->daddr.s6_addr32[3]));
+ return bpf_redirect(*ifindex, BPF_F_INGRESS);
+ }
+
+ return TC_ACT_OK;
+}
+
+SEC("l2_to_iptun_ingress_redirect")
+int _l2_to_iptun_ingress_redirect(struct __sk_buff *skb)
+{
+ struct bpf_tunnel_key tkey = {};
+ void *data = (void *)(long)skb->data;
+ struct eth_hdr *eth = data;
+ void *data_end = (void *)(long)skb->data_end;
+ int key = 0, *ifindex;
+
+ int ret;
+
+ if (data + sizeof(*eth) > data_end)
+ return TC_ACT_OK;
+
+ ifindex = bpf_map_lookup_elem(&tun_iface, &key);
+ if (!ifindex)
+ return TC_ACT_OK;
+
+ if (eth->h_proto == htons(ETH_P_IP)) {
+ char fmt4[] = "e/ingress redirect daddr4:%x to ifindex:%d\n";
+ struct iphdr *iph = data + sizeof(*eth);
+ __be32 daddr = iph->daddr;
+
+ if (data + sizeof(*eth) + sizeof(*iph) > data_end)
+ return TC_ACT_OK;
+
+ if (!is_vip_addr(eth->h_proto, daddr))
+ return TC_ACT_OK;
+
+ bpf_trace_printk(fmt4, sizeof(fmt4), _htonl(daddr), *ifindex);
+ } else {
+ return TC_ACT_OK;
+ }
+
+ tkey.tunnel_id = 10000;
+ tkey.tunnel_ttl = 64;
+ tkey.remote_ipv4 = 0x0a020166; /* 10.2.1.102 */
+ bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), 0);
+ return bpf_redirect(*ifindex, 0);
+}
+
+SEC("l2_to_ip6tun_ingress_redirect")
+int _l2_to_ip6tun_ingress_redirect(struct __sk_buff *skb)
+{
+ struct bpf_tunnel_key tkey = {};
+ void *data = (void *)(long)skb->data;
+ struct eth_hdr *eth = data;
+ void *data_end = (void *)(long)skb->data_end;
+ int key = 0, *ifindex;
+
+ if (data + sizeof(*eth) > data_end)
+ return TC_ACT_OK;
+
+ ifindex = bpf_map_lookup_elem(&tun_iface, &key);
+ if (!ifindex)
+ return TC_ACT_OK;
+
+ if (eth->h_proto == htons(ETH_P_IP)) {
+ char fmt4[] = "e/ingress redirect daddr4:%x to ifindex:%d\n";
+ struct iphdr *iph = data + sizeof(*eth);
+
+ if (data + sizeof(*eth) + sizeof(*iph) > data_end)
+ return TC_ACT_OK;
+
+ if (!is_vip_addr(eth->h_proto, iph->daddr))
+ return TC_ACT_OK;
+
+ bpf_trace_printk(fmt4, sizeof(fmt4), _htonl(iph->daddr),
+ *ifindex);
+ } else if (eth->h_proto == htons(ETH_P_IPV6)) {
+ char fmt6[] = "e/ingress redirect daddr6:%x to ifindex:%d\n";
+ struct ipv6hdr *ip6h = data + sizeof(*eth);
+
+ if (data + sizeof(*eth) + sizeof(*ip6h) > data_end)
+ return TC_ACT_OK;
+
+ if (!is_vip_addr(eth->h_proto, ip6h->daddr.s6_addr32[0]))
+ return TC_ACT_OK;
+
+ bpf_trace_printk(fmt6, sizeof(fmt6),
+ _htonl(ip6h->daddr.s6_addr32[0]), *ifindex);
+ } else {
+ return TC_ACT_OK;
+ }
+
+ tkey.tunnel_id = 10000;
+ tkey.tunnel_ttl = 64;
+ /* 2401:db02:0:0:0:0:0:66 */
+ tkey.remote_ipv6[0] = _htonl(0x2401db02);
+ tkey.remote_ipv6[1] = 0;
+ tkey.remote_ipv6[2] = 0;
+ tkey.remote_ipv6[3] = _htonl(0x00000066);
+ bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), BPF_F_TUNINFO_IPV6);
+ return bpf_redirect(*ifindex, 0);
+}
+
+SEC("drop_non_tun_vip")
+int _drop_non_tun_vip(struct __sk_buff *skb)
+{
+ struct bpf_tunnel_key tkey = {};
+ void *data = (void *)(long)skb->data;
+ struct eth_hdr *eth = data;
+ void *data_end = (void *)(long)skb->data_end;
+
+ if (data + sizeof(*eth) > data_end)
+ return TC_ACT_OK;
+
+ if (eth->h_proto == htons(ETH_P_IP)) {
+ struct iphdr *iph = data + sizeof(*eth);
+
+ if (data + sizeof(*eth) + sizeof(*iph) > data_end)
+ return TC_ACT_OK;
+
+ if (is_vip_addr(eth->h_proto, iph->daddr))
+ return TC_ACT_SHOT;
+ } else if (eth->h_proto == htons(ETH_P_IPV6)) {
+ struct ipv6hdr *ip6h = data + sizeof(*eth);
+
+ if (data + sizeof(*eth) + sizeof(*ip6h) > data_end)
+ return TC_ACT_OK;
+
+ if (is_vip_addr(eth->h_proto, ip6h->daddr.s6_addr32[0]))
+ return TC_ACT_SHOT;
+ }
+
+ return TC_ACT_OK;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tc_l2_redirect_user.c b/samples/bpf/tc_l2_redirect_user.c
new file mode 100644
index 000000000..7ec45c3e8
--- /dev/null
+++ b/samples/bpf/tc_l2_redirect_user.c
@@ -0,0 +1,73 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/unistd.h>
+#include <linux/bpf.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+
+#include <bpf/bpf.h>
+
+static void usage(void)
+{
+ printf("Usage: tc_l2_ipip_redirect [...]\n");
+ printf(" -U <file> Update an already pinned BPF array\n");
+ printf(" -i <ifindex> Interface index\n");
+ printf(" -h Display this help\n");
+}
+
+int main(int argc, char **argv)
+{
+ const char *pinned_file = NULL;
+ int ifindex = -1;
+ int array_key = 0;
+ int array_fd = -1;
+ int ret = -1;
+ int opt;
+
+ while ((opt = getopt(argc, argv, "F:U:i:")) != -1) {
+ switch (opt) {
+ /* General args */
+ case 'U':
+ pinned_file = optarg;
+ break;
+ case 'i':
+ ifindex = atoi(optarg);
+ break;
+ default:
+ usage();
+ goto out;
+ }
+ }
+
+ if (ifindex < 0 || !pinned_file) {
+ usage();
+ goto out;
+ }
+
+ array_fd = bpf_obj_get(pinned_file);
+ if (array_fd < 0) {
+ fprintf(stderr, "bpf_obj_get(%s): %s(%d)\n",
+ pinned_file, strerror(errno), errno);
+ goto out;
+ }
+
+ /* bpf_tunnel_key.remote_ipv4 expects host byte orders */
+ ret = bpf_map_update_elem(array_fd, &array_key, &ifindex, 0);
+ if (ret) {
+ perror("bpf_map_update_elem");
+ goto out;
+ }
+
+out:
+ if (array_fd != -1)
+ close(array_fd);
+ return ret;
+}
diff --git a/samples/bpf/tcbpf1_kern.c b/samples/bpf/tcbpf1_kern.c
new file mode 100644
index 000000000..274c884c8
--- /dev/null
+++ b/samples/bpf/tcbpf1_kern.c
@@ -0,0 +1,90 @@
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <uapi/linux/in.h>
+#include <uapi/linux/tcp.h>
+#include <uapi/linux/filter.h>
+#include <uapi/linux/pkt_cls.h>
+#include "bpf_helpers.h"
+
+/* compiler workaround */
+#define _htonl __builtin_bswap32
+
+static inline void set_dst_mac(struct __sk_buff *skb, char *mac)
+{
+ bpf_skb_store_bytes(skb, 0, mac, ETH_ALEN, 1);
+}
+
+#define IP_CSUM_OFF (ETH_HLEN + offsetof(struct iphdr, check))
+#define TOS_OFF (ETH_HLEN + offsetof(struct iphdr, tos))
+
+static inline void set_ip_tos(struct __sk_buff *skb, __u8 new_tos)
+{
+ __u8 old_tos = load_byte(skb, TOS_OFF);
+
+ bpf_l3_csum_replace(skb, IP_CSUM_OFF, htons(old_tos), htons(new_tos), 2);
+ bpf_skb_store_bytes(skb, TOS_OFF, &new_tos, sizeof(new_tos), 0);
+}
+
+#define TCP_CSUM_OFF (ETH_HLEN + sizeof(struct iphdr) + offsetof(struct tcphdr, check))
+#define IP_SRC_OFF (ETH_HLEN + offsetof(struct iphdr, saddr))
+
+#define IS_PSEUDO 0x10
+
+static inline void set_tcp_ip_src(struct __sk_buff *skb, __u32 new_ip)
+{
+ __u32 old_ip = _htonl(load_word(skb, IP_SRC_OFF));
+
+ bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_ip, new_ip, IS_PSEUDO | sizeof(new_ip));
+ bpf_l3_csum_replace(skb, IP_CSUM_OFF, old_ip, new_ip, sizeof(new_ip));
+ bpf_skb_store_bytes(skb, IP_SRC_OFF, &new_ip, sizeof(new_ip), 0);
+}
+
+#define TCP_DPORT_OFF (ETH_HLEN + sizeof(struct iphdr) + offsetof(struct tcphdr, dest))
+static inline void set_tcp_dest_port(struct __sk_buff *skb, __u16 new_port)
+{
+ __u16 old_port = htons(load_half(skb, TCP_DPORT_OFF));
+
+ bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_port, new_port, sizeof(new_port));
+ bpf_skb_store_bytes(skb, TCP_DPORT_OFF, &new_port, sizeof(new_port), 0);
+}
+
+SEC("classifier")
+int bpf_prog1(struct __sk_buff *skb)
+{
+ __u8 proto = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));
+ long *value;
+
+ if (proto == IPPROTO_TCP) {
+ set_ip_tos(skb, 8);
+ set_tcp_ip_src(skb, 0xA010101);
+ set_tcp_dest_port(skb, 5001);
+ }
+
+ return 0;
+}
+SEC("redirect_xmit")
+int _redirect_xmit(struct __sk_buff *skb)
+{
+ return bpf_redirect(skb->ifindex + 1, 0);
+}
+SEC("redirect_recv")
+int _redirect_recv(struct __sk_buff *skb)
+{
+ return bpf_redirect(skb->ifindex + 1, 1);
+}
+SEC("clone_redirect_xmit")
+int _clone_redirect_xmit(struct __sk_buff *skb)
+{
+ bpf_clone_redirect(skb, skb->ifindex + 1, 0);
+ return TC_ACT_SHOT;
+}
+SEC("clone_redirect_recv")
+int _clone_redirect_recv(struct __sk_buff *skb)
+{
+ bpf_clone_redirect(skb, skb->ifindex + 1, 1);
+ return TC_ACT_SHOT;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tcp_basertt_kern.c b/samples/bpf/tcp_basertt_kern.c
new file mode 100644
index 000000000..4bf4fc597
--- /dev/null
+++ b/samples/bpf/tcp_basertt_kern.c
@@ -0,0 +1,78 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set base_rtt to 80us when host is running TCP-NV and
+ * both hosts are in the same datacenter (as determined by IPv6 prefix).
+ *
+ * Use load_sock_ops to load this BPF program.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/tcp.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <linux/socket.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define DEBUG 1
+
+#define bpf_printk(fmt, ...) \
+({ \
+ char ____fmt[] = fmt; \
+ bpf_trace_printk(____fmt, sizeof(____fmt), \
+ ##__VA_ARGS__); \
+})
+
+SEC("sockops")
+int bpf_basertt(struct bpf_sock_ops *skops)
+{
+ char cong[20];
+ char nv[] = "nv";
+ int rv = 0, n;
+ int op;
+
+ op = (int) skops->op;
+
+#ifdef DEBUG
+ bpf_printk("BPF command: %d\n", op);
+#endif
+
+ /* Check if both hosts are in the same datacenter. For this
+ * example they are if the 1st 5.5 bytes in the IPv6 address
+ * are the same.
+ */
+ if (skops->family == AF_INET6 &&
+ skops->local_ip6[0] == skops->remote_ip6[0] &&
+ (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) ==
+ (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) {
+ switch (op) {
+ case BPF_SOCK_OPS_BASE_RTT:
+ n = bpf_getsockopt(skops, SOL_TCP, TCP_CONGESTION,
+ cong, sizeof(cong));
+ if (!n && !__builtin_memcmp(cong, nv, sizeof(nv)+1)) {
+ /* Set base_rtt to 80us */
+ rv = 80;
+ } else if (n) {
+ rv = n;
+ } else {
+ rv = -1;
+ }
+ break;
+ default:
+ rv = -1;
+ }
+ } else {
+ rv = -1;
+ }
+#ifdef DEBUG
+ bpf_printk("Returning %d\n", rv);
+#endif
+ skops->reply = rv;
+ return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tcp_bpf.readme b/samples/bpf/tcp_bpf.readme
new file mode 100644
index 000000000..831fb601e
--- /dev/null
+++ b/samples/bpf/tcp_bpf.readme
@@ -0,0 +1,26 @@
+This file describes how to run the tcp_*_kern.o tcp_bpf (or socket_ops)
+programs. These programs attach to a cgroupv2. The following commands create
+a cgroupv2 and attach a bash shell to the group.
+
+ mkdir -p /tmp/cgroupv2
+ mount -t cgroup2 none /tmp/cgroupv2
+ mkdir -p /tmp/cgroupv2/foo
+ bash
+ echo $$ >> /tmp/cgroupv2/foo/cgroup.procs
+
+Anything that runs under this shell belongs to the foo cgroupv2 To load
+(attach) one of the tcp_*_kern.o programs:
+
+ ./load_sock_ops -l /tmp/cgroupv2/foo tcp_basertt_kern.o
+
+If the "-l" flag is used, the load_sock_ops program will continue to run
+printing the BPF log buffer. The tcp_*_kern.o programs use special print
+functions to print logging information (if enabled by the ifdef).
+
+If using netperf/netserver to create traffic, you need to run them under the
+cgroupv2 to which the BPF programs are attached (i.e. under bash shell
+attached to the cgroupv2).
+
+To remove (unattach) a socket_ops BPF program from a cgroupv2:
+
+ ./load_sock_ops -r /tmp/cgroupv2/foo
diff --git a/samples/bpf/tcp_bufs_kern.c b/samples/bpf/tcp_bufs_kern.c
new file mode 100644
index 000000000..0566b7fa3
--- /dev/null
+++ b/samples/bpf/tcp_bufs_kern.c
@@ -0,0 +1,88 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set initial receive window to 40 packets and send
+ * and receive buffers to 1.5MB. This would usually be done after
+ * doing appropriate checks that indicate the hosts are far enough
+ * away (i.e. large RTT).
+ *
+ * Use load_sock_ops to load this BPF program.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <linux/socket.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define DEBUG 1
+
+#define bpf_printk(fmt, ...) \
+({ \
+ char ____fmt[] = fmt; \
+ bpf_trace_printk(____fmt, sizeof(____fmt), \
+ ##__VA_ARGS__); \
+})
+
+SEC("sockops")
+int bpf_bufs(struct bpf_sock_ops *skops)
+{
+ int bufsize = 1500000;
+ int rwnd_init = 40;
+ int rv = 0;
+ int op;
+
+ /* For testing purposes, only execute rest of BPF program
+ * if neither port numberis 55601
+ */
+ if (bpf_ntohl(skops->remote_port) != 55601 &&
+ skops->local_port != 55601) {
+ skops->reply = -1;
+ return 1;
+ }
+
+ op = (int) skops->op;
+
+#ifdef DEBUG
+ bpf_printk("Returning %d\n", rv);
+#endif
+
+ /* Usually there would be a check to insure the hosts are far
+ * from each other so it makes sense to increase buffer sizes
+ */
+ switch (op) {
+ case BPF_SOCK_OPS_RWND_INIT:
+ rv = rwnd_init;
+ break;
+ case BPF_SOCK_OPS_TCP_CONNECT_CB:
+ /* Set sndbuf and rcvbuf of active connections */
+ rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize,
+ sizeof(bufsize));
+ rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
+ &bufsize, sizeof(bufsize));
+ break;
+ case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+ /* Nothing to do */
+ break;
+ case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+ /* Set sndbuf and rcvbuf of passive connections */
+ rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize,
+ sizeof(bufsize));
+ rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
+ &bufsize, sizeof(bufsize));
+ break;
+ default:
+ rv = -1;
+ }
+#ifdef DEBUG
+ bpf_printk("Returning %d\n", rv);
+#endif
+ skops->reply = rv;
+ return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tcp_clamp_kern.c b/samples/bpf/tcp_clamp_kern.c
new file mode 100644
index 000000000..f4225c9d2
--- /dev/null
+++ b/samples/bpf/tcp_clamp_kern.c
@@ -0,0 +1,104 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Sample BPF program to set send and receive buffers to 150KB, sndcwnd clamp
+ * to 100 packets and SYN and SYN_ACK RTOs to 10ms when both hosts are within
+ * the same datacenter. For his example, we assume they are within the same
+ * datacenter when the first 5.5 bytes of their IPv6 addresses are the same.
+ *
+ * Use load_sock_ops to load this BPF program.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <linux/socket.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define DEBUG 1
+
+#define bpf_printk(fmt, ...) \
+({ \
+ char ____fmt[] = fmt; \
+ bpf_trace_printk(____fmt, sizeof(____fmt), \
+ ##__VA_ARGS__); \
+})
+
+SEC("sockops")
+int bpf_clamp(struct bpf_sock_ops *skops)
+{
+ int bufsize = 150000;
+ int to_init = 10;
+ int clamp = 100;
+ int rv = 0;
+ int op;
+
+ /* For testing purposes, only execute rest of BPF program
+ * if neither port numberis 55601
+ */
+ if (bpf_ntohl(skops->remote_port) != 55601 && skops->local_port != 55601) {
+ skops->reply = -1;
+ return 0;
+ }
+
+ op = (int) skops->op;
+
+#ifdef DEBUG
+ bpf_printk("BPF command: %d\n", op);
+#endif
+
+ /* Check that both hosts are within same datacenter. For this example
+ * it is the case when the first 5.5 bytes of their IPv6 addresses are
+ * the same.
+ */
+ if (skops->family == AF_INET6 &&
+ skops->local_ip6[0] == skops->remote_ip6[0] &&
+ (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) ==
+ (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) {
+ switch (op) {
+ case BPF_SOCK_OPS_TIMEOUT_INIT:
+ rv = to_init;
+ break;
+ case BPF_SOCK_OPS_TCP_CONNECT_CB:
+ /* Set sndbuf and rcvbuf of active connections */
+ rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF,
+ &bufsize, sizeof(bufsize));
+ rv += bpf_setsockopt(skops, SOL_SOCKET,
+ SO_RCVBUF, &bufsize,
+ sizeof(bufsize));
+ break;
+ case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+ rv = bpf_setsockopt(skops, SOL_TCP,
+ TCP_BPF_SNDCWND_CLAMP,
+ &clamp, sizeof(clamp));
+ break;
+ case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+ /* Set sndbuf and rcvbuf of passive connections */
+ rv = bpf_setsockopt(skops, SOL_TCP,
+ TCP_BPF_SNDCWND_CLAMP,
+ &clamp, sizeof(clamp));
+ rv += bpf_setsockopt(skops, SOL_SOCKET,
+ SO_SNDBUF, &bufsize,
+ sizeof(bufsize));
+ rv += bpf_setsockopt(skops, SOL_SOCKET,
+ SO_RCVBUF, &bufsize,
+ sizeof(bufsize));
+ break;
+ default:
+ rv = -1;
+ }
+ } else {
+ rv = -1;
+ }
+#ifdef DEBUG
+ bpf_printk("Returning %d\n", rv);
+#endif
+ skops->reply = rv;
+ return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tcp_cong_kern.c b/samples/bpf/tcp_cong_kern.c
new file mode 100644
index 000000000..ad0f1ba82
--- /dev/null
+++ b/samples/bpf/tcp_cong_kern.c
@@ -0,0 +1,85 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set congestion control to dctcp when both hosts are
+ * in the same datacenter (as deteremined by IPv6 prefix).
+ *
+ * Use load_sock_ops to load this BPF program.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/tcp.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <linux/socket.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define DEBUG 1
+
+#define bpf_printk(fmt, ...) \
+({ \
+ char ____fmt[] = fmt; \
+ bpf_trace_printk(____fmt, sizeof(____fmt), \
+ ##__VA_ARGS__); \
+})
+
+SEC("sockops")
+int bpf_cong(struct bpf_sock_ops *skops)
+{
+ char cong[] = "dctcp";
+ int rv = 0;
+ int op;
+
+ /* For testing purposes, only execute rest of BPF program
+ * if neither port numberis 55601
+ */
+ if (bpf_ntohl(skops->remote_port) != 55601 &&
+ skops->local_port != 55601) {
+ skops->reply = -1;
+ return 1;
+ }
+
+ op = (int) skops->op;
+
+#ifdef DEBUG
+ bpf_printk("BPF command: %d\n", op);
+#endif
+
+ /* Check if both hosts are in the same datacenter. For this
+ * example they are if the 1st 5.5 bytes in the IPv6 address
+ * are the same.
+ */
+ if (skops->family == AF_INET6 &&
+ skops->local_ip6[0] == skops->remote_ip6[0] &&
+ (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) ==
+ (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) {
+ switch (op) {
+ case BPF_SOCK_OPS_NEEDS_ECN:
+ rv = 1;
+ break;
+ case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+ rv = bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION,
+ cong, sizeof(cong));
+ break;
+ case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+ rv = bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION,
+ cong, sizeof(cong));
+ break;
+ default:
+ rv = -1;
+ }
+ } else {
+ rv = -1;
+ }
+#ifdef DEBUG
+ bpf_printk("Returning %d\n", rv);
+#endif
+ skops->reply = rv;
+ return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tcp_iw_kern.c b/samples/bpf/tcp_iw_kern.c
new file mode 100644
index 000000000..4ca5ecc9f
--- /dev/null
+++ b/samples/bpf/tcp_iw_kern.c
@@ -0,0 +1,90 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set initial congestion window and initial receive
+ * window to 40 packets and send and receive buffers to 1.5MB. This
+ * would usually be done after doing appropriate checks that indicate
+ * the hosts are far enough away (i.e. large RTT).
+ *
+ * Use load_sock_ops to load this BPF program.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <linux/socket.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define DEBUG 1
+
+#define bpf_printk(fmt, ...) \
+({ \
+ char ____fmt[] = fmt; \
+ bpf_trace_printk(____fmt, sizeof(____fmt), \
+ ##__VA_ARGS__); \
+})
+
+SEC("sockops")
+int bpf_iw(struct bpf_sock_ops *skops)
+{
+ int bufsize = 1500000;
+ int rwnd_init = 40;
+ int iw = 40;
+ int rv = 0;
+ int op;
+
+ /* For testing purposes, only execute rest of BPF program
+ * if neither port numberis 55601
+ */
+ if (bpf_ntohl(skops->remote_port) != 55601 &&
+ skops->local_port != 55601) {
+ skops->reply = -1;
+ return 1;
+ }
+
+ op = (int) skops->op;
+
+#ifdef DEBUG
+ bpf_printk("BPF command: %d\n", op);
+#endif
+
+ /* Usually there would be a check to insure the hosts are far
+ * from each other so it makes sense to increase buffer sizes
+ */
+ switch (op) {
+ case BPF_SOCK_OPS_RWND_INIT:
+ rv = rwnd_init;
+ break;
+ case BPF_SOCK_OPS_TCP_CONNECT_CB:
+ /* Set sndbuf and rcvbuf of active connections */
+ rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize,
+ sizeof(bufsize));
+ rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
+ &bufsize, sizeof(bufsize));
+ break;
+ case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+ rv = bpf_setsockopt(skops, SOL_TCP, TCP_BPF_IW, &iw,
+ sizeof(iw));
+ break;
+ case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+ /* Set sndbuf and rcvbuf of passive connections */
+ rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize,
+ sizeof(bufsize));
+ rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
+ &bufsize, sizeof(bufsize));
+ break;
+ default:
+ rv = -1;
+ }
+#ifdef DEBUG
+ bpf_printk("Returning %d\n", rv);
+#endif
+ skops->reply = rv;
+ return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tcp_rwnd_kern.c b/samples/bpf/tcp_rwnd_kern.c
new file mode 100644
index 000000000..09ff65b40
--- /dev/null
+++ b/samples/bpf/tcp_rwnd_kern.c
@@ -0,0 +1,71 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set initial receive window to 40 packets when using IPv6
+ * and the first 5.5 bytes of the IPv6 addresses are not the same (in this
+ * example that means both hosts are not the same datacenter).
+ *
+ * Use load_sock_ops to load this BPF program.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <linux/socket.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define DEBUG 1
+
+#define bpf_printk(fmt, ...) \
+({ \
+ char ____fmt[] = fmt; \
+ bpf_trace_printk(____fmt, sizeof(____fmt), \
+ ##__VA_ARGS__); \
+})
+
+SEC("sockops")
+int bpf_rwnd(struct bpf_sock_ops *skops)
+{
+ int rv = -1;
+ int op;
+
+ /* For testing purposes, only execute rest of BPF program
+ * if neither port numberis 55601
+ */
+ if (bpf_ntohl(skops->remote_port) !=
+ 55601 && skops->local_port != 55601) {
+ skops->reply = -1;
+ return 1;
+ }
+
+ op = (int) skops->op;
+
+#ifdef DEBUG
+ bpf_printk("BPF command: %d\n", op);
+#endif
+
+ /* Check for RWND_INIT operation and IPv6 addresses */
+ if (op == BPF_SOCK_OPS_RWND_INIT &&
+ skops->family == AF_INET6) {
+
+ /* If the first 5.5 bytes of the IPv6 address are not the same
+ * then both hosts are not in the same datacenter
+ * so use a larger initial advertized window (40 packets)
+ */
+ if (skops->local_ip6[0] != skops->remote_ip6[0] ||
+ (bpf_ntohl(skops->local_ip6[1]) & 0xfffff000) !=
+ (bpf_ntohl(skops->remote_ip6[1]) & 0xfffff000))
+ rv = 40;
+ }
+#ifdef DEBUG
+ bpf_printk("Returning %d\n", rv);
+#endif
+ skops->reply = rv;
+ return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tcp_synrto_kern.c b/samples/bpf/tcp_synrto_kern.c
new file mode 100644
index 000000000..232bb2428
--- /dev/null
+++ b/samples/bpf/tcp_synrto_kern.c
@@ -0,0 +1,71 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set SYN and SYN-ACK RTOs to 10ms when using IPv6 addresses
+ * and the first 5.5 bytes of the IPv6 addresses are the same (in this example
+ * that means both hosts are in the same datacenter).
+ *
+ * Use load_sock_ops to load this BPF program.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <linux/socket.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define DEBUG 1
+
+#define bpf_printk(fmt, ...) \
+({ \
+ char ____fmt[] = fmt; \
+ bpf_trace_printk(____fmt, sizeof(____fmt), \
+ ##__VA_ARGS__); \
+})
+
+SEC("sockops")
+int bpf_synrto(struct bpf_sock_ops *skops)
+{
+ int rv = -1;
+ int op;
+
+ /* For testing purposes, only execute rest of BPF program
+ * if neither port numberis 55601
+ */
+ if (bpf_ntohl(skops->remote_port) != 55601 &&
+ skops->local_port != 55601) {
+ skops->reply = -1;
+ return 1;
+ }
+
+ op = (int) skops->op;
+
+#ifdef DEBUG
+ bpf_printk("BPF command: %d\n", op);
+#endif
+
+ /* Check for TIMEOUT_INIT operation and IPv6 addresses */
+ if (op == BPF_SOCK_OPS_TIMEOUT_INIT &&
+ skops->family == AF_INET6) {
+
+ /* If the first 5.5 bytes of the IPv6 address are the same
+ * then both hosts are in the same datacenter
+ * so use an RTO of 10ms
+ */
+ if (skops->local_ip6[0] == skops->remote_ip6[0] &&
+ (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) ==
+ (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000))
+ rv = 10;
+ }
+#ifdef DEBUG
+ bpf_printk("Returning %d\n", rv);
+#endif
+ skops->reply = rv;
+ return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/test_cgrp2_array_pin.c b/samples/bpf/test_cgrp2_array_pin.c
new file mode 100644
index 000000000..242184292
--- /dev/null
+++ b/samples/bpf/test_cgrp2_array_pin.c
@@ -0,0 +1,109 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/unistd.h>
+#include <linux/bpf.h>
+
+#include <stdio.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+
+#include <bpf/bpf.h>
+
+static void usage(void)
+{
+ printf("Usage: test_cgrp2_array_pin [...]\n");
+ printf(" -F <file> File to pin an BPF cgroup array\n");
+ printf(" -U <file> Update an already pinned BPF cgroup array\n");
+ printf(" -v <value> Full path of the cgroup2\n");
+ printf(" -h Display this help\n");
+}
+
+int main(int argc, char **argv)
+{
+ const char *pinned_file = NULL, *cg2 = NULL;
+ int create_array = 1;
+ int array_key = 0;
+ int array_fd = -1;
+ int cg2_fd = -1;
+ int ret = -1;
+ int opt;
+
+ while ((opt = getopt(argc, argv, "F:U:v:")) != -1) {
+ switch (opt) {
+ /* General args */
+ case 'F':
+ pinned_file = optarg;
+ break;
+ case 'U':
+ pinned_file = optarg;
+ create_array = 0;
+ break;
+ case 'v':
+ cg2 = optarg;
+ break;
+ default:
+ usage();
+ goto out;
+ }
+ }
+
+ if (!cg2 || !pinned_file) {
+ usage();
+ goto out;
+ }
+
+ cg2_fd = open(cg2, O_RDONLY);
+ if (cg2_fd < 0) {
+ fprintf(stderr, "open(%s,...): %s(%d)\n",
+ cg2, strerror(errno), errno);
+ goto out;
+ }
+
+ if (create_array) {
+ array_fd = bpf_create_map(BPF_MAP_TYPE_CGROUP_ARRAY,
+ sizeof(uint32_t), sizeof(uint32_t),
+ 1, 0);
+ if (array_fd < 0) {
+ fprintf(stderr,
+ "bpf_create_map(BPF_MAP_TYPE_CGROUP_ARRAY,...): %s(%d)\n",
+ strerror(errno), errno);
+ goto out;
+ }
+ } else {
+ array_fd = bpf_obj_get(pinned_file);
+ if (array_fd < 0) {
+ fprintf(stderr, "bpf_obj_get(%s): %s(%d)\n",
+ pinned_file, strerror(errno), errno);
+ goto out;
+ }
+ }
+
+ ret = bpf_map_update_elem(array_fd, &array_key, &cg2_fd, 0);
+ if (ret) {
+ perror("bpf_map_update_elem");
+ goto out;
+ }
+
+ if (create_array) {
+ ret = bpf_obj_pin(array_fd, pinned_file);
+ if (ret) {
+ fprintf(stderr, "bpf_obj_pin(..., %s): %s(%d)\n",
+ pinned_file, strerror(errno), errno);
+ goto out;
+ }
+ }
+
+out:
+ if (array_fd != -1)
+ close(array_fd);
+ if (cg2_fd != -1)
+ close(cg2_fd);
+ return ret;
+}
diff --git a/samples/bpf/test_cgrp2_attach.c b/samples/bpf/test_cgrp2_attach.c
new file mode 100644
index 000000000..20fbd1241
--- /dev/null
+++ b/samples/bpf/test_cgrp2_attach.c
@@ -0,0 +1,172 @@
+/* eBPF example program:
+ *
+ * - Creates arraymap in kernel with 4 bytes keys and 8 byte values
+ *
+ * - Loads eBPF program
+ *
+ * The eBPF program accesses the map passed in to store two pieces of
+ * information. The number of invocations of the program, which maps
+ * to the number of packets received, is stored to key 0. Key 1 is
+ * incremented on each iteration by the number of bytes stored in
+ * the skb.
+ *
+ * - Attaches the new program to a cgroup using BPF_PROG_ATTACH
+ *
+ * - Every second, reads map[0] and map[1] to see how many bytes and
+ * packets were seen on any socket of tasks in the given cgroup.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+
+#include <linux/bpf.h>
+#include <bpf/bpf.h>
+
+#include "bpf_insn.h"
+
+enum {
+ MAP_KEY_PACKETS,
+ MAP_KEY_BYTES,
+};
+
+char bpf_log_buf[BPF_LOG_BUF_SIZE];
+
+static int prog_load(int map_fd, int verdict)
+{
+ struct bpf_insn prog[] = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), /* save r6 so it's not clobbered by BPF_CALL */
+
+ /* Count packets */
+ BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
+ BPF_LD_MAP_FD(BPF_REG_1, map_fd), /* load map fd to r1 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
+ BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+
+ /* Count bytes */
+ BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
+ BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
+ BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+
+ BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */
+ BPF_EXIT_INSN(),
+ };
+ size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);
+
+ return bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB,
+ prog, insns_cnt, "GPL", 0,
+ bpf_log_buf, BPF_LOG_BUF_SIZE);
+}
+
+static int usage(const char *argv0)
+{
+ printf("Usage: %s [-d] [-D] <cg-path> <egress|ingress>\n", argv0);
+ printf(" -d Drop Traffic\n");
+ printf(" -D Detach filter, and exit\n");
+ return EXIT_FAILURE;
+}
+
+static int attach_filter(int cg_fd, int type, int verdict)
+{
+ int prog_fd, map_fd, ret, key;
+ long long pkt_cnt, byte_cnt;
+
+ map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY,
+ sizeof(key), sizeof(byte_cnt),
+ 256, 0);
+ if (map_fd < 0) {
+ printf("Failed to create map: '%s'\n", strerror(errno));
+ return EXIT_FAILURE;
+ }
+
+ prog_fd = prog_load(map_fd, verdict);
+ printf("Output from kernel verifier:\n%s\n-------\n", bpf_log_buf);
+
+ if (prog_fd < 0) {
+ printf("Failed to load prog: '%s'\n", strerror(errno));
+ return EXIT_FAILURE;
+ }
+
+ ret = bpf_prog_attach(prog_fd, cg_fd, type, 0);
+ if (ret < 0) {
+ printf("Failed to attach prog to cgroup: '%s'\n",
+ strerror(errno));
+ return EXIT_FAILURE;
+ }
+ while (1) {
+ key = MAP_KEY_PACKETS;
+ assert(bpf_map_lookup_elem(map_fd, &key, &pkt_cnt) == 0);
+
+ key = MAP_KEY_BYTES;
+ assert(bpf_map_lookup_elem(map_fd, &key, &byte_cnt) == 0);
+
+ printf("cgroup received %lld packets, %lld bytes\n",
+ pkt_cnt, byte_cnt);
+ sleep(1);
+ }
+
+ return EXIT_SUCCESS;
+}
+
+int main(int argc, char **argv)
+{
+ int detach_only = 0, verdict = 1;
+ enum bpf_attach_type type;
+ int opt, cg_fd, ret;
+
+ while ((opt = getopt(argc, argv, "Dd")) != -1) {
+ switch (opt) {
+ case 'd':
+ verdict = 0;
+ break;
+ case 'D':
+ detach_only = 1;
+ break;
+ default:
+ return usage(argv[0]);
+ }
+ }
+
+ if (argc - optind < 2)
+ return usage(argv[0]);
+
+ if (strcmp(argv[optind + 1], "ingress") == 0)
+ type = BPF_CGROUP_INET_INGRESS;
+ else if (strcmp(argv[optind + 1], "egress") == 0)
+ type = BPF_CGROUP_INET_EGRESS;
+ else
+ return usage(argv[0]);
+
+ cg_fd = open(argv[optind], O_DIRECTORY | O_RDONLY);
+ if (cg_fd < 0) {
+ printf("Failed to open cgroup path: '%s'\n", strerror(errno));
+ return EXIT_FAILURE;
+ }
+
+ if (detach_only) {
+ ret = bpf_prog_detach(cg_fd, type);
+ printf("bpf_prog_detach() returned '%s' (%d)\n",
+ strerror(errno), errno);
+ } else
+ ret = attach_filter(cg_fd, type, verdict);
+
+ return ret;
+}
diff --git a/samples/bpf/test_cgrp2_attach2.c b/samples/bpf/test_cgrp2_attach2.c
new file mode 100644
index 000000000..180f9d813
--- /dev/null
+++ b/samples/bpf/test_cgrp2_attach2.c
@@ -0,0 +1,442 @@
+/* eBPF example program:
+ *
+ * - Creates arraymap in kernel with 4 bytes keys and 8 byte values
+ *
+ * - Loads eBPF program
+ *
+ * The eBPF program accesses the map passed in to store two pieces of
+ * information. The number of invocations of the program, which maps
+ * to the number of packets received, is stored to key 0. Key 1 is
+ * incremented on each iteration by the number of bytes stored in
+ * the skb. The program also stores the number of received bytes
+ * in the cgroup storage.
+ *
+ * - Attaches the new program to a cgroup using BPF_PROG_ATTACH
+ *
+ * - Every second, reads map[0] and map[1] to see how many bytes and
+ * packets were seen on any socket of tasks in the given cgroup.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include <linux/bpf.h>
+#include <bpf/bpf.h>
+
+#include "bpf_insn.h"
+#include "bpf_rlimit.h"
+#include "cgroup_helpers.h"
+
+#define FOO "/foo"
+#define BAR "/foo/bar/"
+#define PING_CMD "ping -c1 -w1 127.0.0.1 > /dev/null"
+
+char bpf_log_buf[BPF_LOG_BUF_SIZE];
+
+static int prog_load(int verdict)
+{
+ int ret;
+ struct bpf_insn prog[] = {
+ BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */
+ BPF_EXIT_INSN(),
+ };
+ size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);
+
+ ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB,
+ prog, insns_cnt, "GPL", 0,
+ bpf_log_buf, BPF_LOG_BUF_SIZE);
+
+ if (ret < 0) {
+ log_err("Loading program");
+ printf("Output from verifier:\n%s\n-------\n", bpf_log_buf);
+ return 0;
+ }
+ return ret;
+}
+
+static int test_foo_bar(void)
+{
+ int drop_prog, allow_prog, foo = 0, bar = 0, rc = 0;
+
+ allow_prog = prog_load(1);
+ if (!allow_prog)
+ goto err;
+
+ drop_prog = prog_load(0);
+ if (!drop_prog)
+ goto err;
+
+ if (setup_cgroup_environment())
+ goto err;
+
+ /* Create cgroup /foo, get fd, and join it */
+ foo = create_and_get_cgroup(FOO);
+ if (!foo)
+ goto err;
+
+ if (join_cgroup(FOO))
+ goto err;
+
+ if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_OVERRIDE)) {
+ log_err("Attaching prog to /foo");
+ goto err;
+ }
+
+ printf("Attached DROP prog. This ping in cgroup /foo should fail...\n");
+ assert(system(PING_CMD) != 0);
+
+ /* Create cgroup /foo/bar, get fd, and join it */
+ bar = create_and_get_cgroup(BAR);
+ if (!bar)
+ goto err;
+
+ if (join_cgroup(BAR))
+ goto err;
+
+ printf("Attached DROP prog. This ping in cgroup /foo/bar should fail...\n");
+ assert(system(PING_CMD) != 0);
+
+ if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_OVERRIDE)) {
+ log_err("Attaching prog to /foo/bar");
+ goto err;
+ }
+
+ printf("Attached PASS prog. This ping in cgroup /foo/bar should pass...\n");
+ assert(system(PING_CMD) == 0);
+
+ if (bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS)) {
+ log_err("Detaching program from /foo/bar");
+ goto err;
+ }
+
+ printf("Detached PASS from /foo/bar while DROP is attached to /foo.\n"
+ "This ping in cgroup /foo/bar should fail...\n");
+ assert(system(PING_CMD) != 0);
+
+ if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_OVERRIDE)) {
+ log_err("Attaching prog to /foo/bar");
+ goto err;
+ }
+
+ if (bpf_prog_detach(foo, BPF_CGROUP_INET_EGRESS)) {
+ log_err("Detaching program from /foo");
+ goto err;
+ }
+
+ printf("Attached PASS from /foo/bar and detached DROP from /foo.\n"
+ "This ping in cgroup /foo/bar should pass...\n");
+ assert(system(PING_CMD) == 0);
+
+ if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_OVERRIDE)) {
+ log_err("Attaching prog to /foo/bar");
+ goto err;
+ }
+
+ if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 0)) {
+ errno = 0;
+ log_err("Unexpected success attaching prog to /foo/bar");
+ goto err;
+ }
+
+ if (bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS)) {
+ log_err("Detaching program from /foo/bar");
+ goto err;
+ }
+
+ if (!bpf_prog_detach(foo, BPF_CGROUP_INET_EGRESS)) {
+ errno = 0;
+ log_err("Unexpected success in double detach from /foo");
+ goto err;
+ }
+
+ if (bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, 0)) {
+ log_err("Attaching non-overridable prog to /foo");
+ goto err;
+ }
+
+ if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 0)) {
+ errno = 0;
+ log_err("Unexpected success attaching non-overridable prog to /foo/bar");
+ goto err;
+ }
+
+ if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_OVERRIDE)) {
+ errno = 0;
+ log_err("Unexpected success attaching overridable prog to /foo/bar");
+ goto err;
+ }
+
+ if (!bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_OVERRIDE)) {
+ errno = 0;
+ log_err("Unexpected success attaching overridable prog to /foo");
+ goto err;
+ }
+
+ if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, 0)) {
+ log_err("Attaching different non-overridable prog to /foo");
+ goto err;
+ }
+
+ goto out;
+
+err:
+ rc = 1;
+
+out:
+ close(foo);
+ close(bar);
+ cleanup_cgroup_environment();
+ if (!rc)
+ printf("### override:PASS\n");
+ else
+ printf("### override:FAIL\n");
+ return rc;
+}
+
+static int map_fd = -1;
+
+static int prog_load_cnt(int verdict, int val)
+{
+ int cgroup_storage_fd;
+
+ if (map_fd < 0)
+ map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0);
+ if (map_fd < 0) {
+ printf("failed to create map '%s'\n", strerror(errno));
+ return -1;
+ }
+
+ cgroup_storage_fd = bpf_create_map(BPF_MAP_TYPE_CGROUP_STORAGE,
+ sizeof(struct bpf_cgroup_storage_key), 8, 0, 0);
+ if (cgroup_storage_fd < 0) {
+ printf("failed to create map '%s'\n", strerror(errno));
+ return -1;
+ }
+
+ struct bpf_insn prog[] = {
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
+ BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_1, val), /* r1 = 1 */
+ BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+ BPF_LD_MAP_FD(BPF_REG_1, cgroup_storage_fd),
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage),
+ BPF_MOV64_IMM(BPF_REG_1, val),
+ BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_W, BPF_REG_0, BPF_REG_1, 0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */
+ BPF_EXIT_INSN(),
+ };
+ size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);
+ int ret;
+
+ ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB,
+ prog, insns_cnt, "GPL", 0,
+ bpf_log_buf, BPF_LOG_BUF_SIZE);
+
+ if (ret < 0) {
+ log_err("Loading program");
+ printf("Output from verifier:\n%s\n-------\n", bpf_log_buf);
+ return 0;
+ }
+ close(cgroup_storage_fd);
+ return ret;
+}
+
+
+static int test_multiprog(void)
+{
+ __u32 prog_ids[4], prog_cnt = 0, attach_flags, saved_prog_id;
+ int cg1 = 0, cg2 = 0, cg3 = 0, cg4 = 0, cg5 = 0, key = 0;
+ int drop_prog, allow_prog[6] = {}, rc = 0;
+ unsigned long long value;
+ int i = 0;
+
+ for (i = 0; i < 6; i++) {
+ allow_prog[i] = prog_load_cnt(1, 1 << i);
+ if (!allow_prog[i])
+ goto err;
+ }
+ drop_prog = prog_load_cnt(0, 1);
+ if (!drop_prog)
+ goto err;
+
+ if (setup_cgroup_environment())
+ goto err;
+
+ cg1 = create_and_get_cgroup("/cg1");
+ if (!cg1)
+ goto err;
+ cg2 = create_and_get_cgroup("/cg1/cg2");
+ if (!cg2)
+ goto err;
+ cg3 = create_and_get_cgroup("/cg1/cg2/cg3");
+ if (!cg3)
+ goto err;
+ cg4 = create_and_get_cgroup("/cg1/cg2/cg3/cg4");
+ if (!cg4)
+ goto err;
+ cg5 = create_and_get_cgroup("/cg1/cg2/cg3/cg4/cg5");
+ if (!cg5)
+ goto err;
+
+ if (join_cgroup("/cg1/cg2/cg3/cg4/cg5"))
+ goto err;
+
+ if (bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_MULTI)) {
+ log_err("Attaching prog to cg1");
+ goto err;
+ }
+ if (!bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_MULTI)) {
+ log_err("Unexpected success attaching the same prog to cg1");
+ goto err;
+ }
+ if (bpf_prog_attach(allow_prog[1], cg1, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_MULTI)) {
+ log_err("Attaching prog2 to cg1");
+ goto err;
+ }
+ if (bpf_prog_attach(allow_prog[2], cg2, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_OVERRIDE)) {
+ log_err("Attaching prog to cg2");
+ goto err;
+ }
+ if (bpf_prog_attach(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_MULTI)) {
+ log_err("Attaching prog to cg3");
+ goto err;
+ }
+ if (bpf_prog_attach(allow_prog[4], cg4, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_OVERRIDE)) {
+ log_err("Attaching prog to cg4");
+ goto err;
+ }
+ if (bpf_prog_attach(allow_prog[5], cg5, BPF_CGROUP_INET_EGRESS, 0)) {
+ log_err("Attaching prog to cg5");
+ goto err;
+ }
+ assert(system(PING_CMD) == 0);
+ assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0);
+ assert(value == 1 + 2 + 8 + 32);
+
+ /* query the number of effective progs in cg5 */
+ assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE,
+ NULL, NULL, &prog_cnt) == 0);
+ assert(prog_cnt == 4);
+ /* retrieve prog_ids of effective progs in cg5 */
+ assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE,
+ &attach_flags, prog_ids, &prog_cnt) == 0);
+ assert(prog_cnt == 4);
+ assert(attach_flags == 0);
+ saved_prog_id = prog_ids[0];
+ /* check enospc handling */
+ prog_ids[0] = 0;
+ prog_cnt = 2;
+ assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE,
+ &attach_flags, prog_ids, &prog_cnt) == -1 &&
+ errno == ENOSPC);
+ assert(prog_cnt == 4);
+ /* check that prog_ids are returned even when buffer is too small */
+ assert(prog_ids[0] == saved_prog_id);
+ /* retrieve prog_id of single attached prog in cg5 */
+ prog_ids[0] = 0;
+ assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, 0,
+ NULL, prog_ids, &prog_cnt) == 0);
+ assert(prog_cnt == 1);
+ assert(prog_ids[0] == saved_prog_id);
+
+ /* detach bottom program and ping again */
+ if (bpf_prog_detach2(-1, cg5, BPF_CGROUP_INET_EGRESS)) {
+ log_err("Detaching prog from cg5");
+ goto err;
+ }
+ value = 0;
+ assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0);
+ assert(system(PING_CMD) == 0);
+ assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0);
+ assert(value == 1 + 2 + 8 + 16);
+
+ /* detach 3rd from bottom program and ping again */
+ errno = 0;
+ if (!bpf_prog_detach2(0, cg3, BPF_CGROUP_INET_EGRESS)) {
+ log_err("Unexpected success on detach from cg3");
+ goto err;
+ }
+ if (bpf_prog_detach2(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS)) {
+ log_err("Detaching from cg3");
+ goto err;
+ }
+ value = 0;
+ assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0);
+ assert(system(PING_CMD) == 0);
+ assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0);
+ assert(value == 1 + 2 + 16);
+
+ /* detach 2nd from bottom program and ping again */
+ if (bpf_prog_detach2(-1, cg4, BPF_CGROUP_INET_EGRESS)) {
+ log_err("Detaching prog from cg4");
+ goto err;
+ }
+ value = 0;
+ assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0);
+ assert(system(PING_CMD) == 0);
+ assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0);
+ assert(value == 1 + 2 + 4);
+
+ prog_cnt = 4;
+ assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE,
+ &attach_flags, prog_ids, &prog_cnt) == 0);
+ assert(prog_cnt == 3);
+ assert(attach_flags == 0);
+ assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, 0,
+ NULL, prog_ids, &prog_cnt) == 0);
+ assert(prog_cnt == 0);
+ goto out;
+err:
+ rc = 1;
+
+out:
+ for (i = 0; i < 6; i++)
+ if (allow_prog[i] > 0)
+ close(allow_prog[i]);
+ close(cg1);
+ close(cg2);
+ close(cg3);
+ close(cg4);
+ close(cg5);
+ cleanup_cgroup_environment();
+ if (!rc)
+ printf("### multi:PASS\n");
+ else
+ printf("### multi:FAIL\n");
+ return rc;
+}
+
+int main(int argc, char **argv)
+{
+ int rc = 0;
+
+ rc = test_foo_bar();
+ if (rc)
+ return rc;
+
+ return test_multiprog();
+}
diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c
new file mode 100644
index 000000000..b0811da5a
--- /dev/null
+++ b/samples/bpf/test_cgrp2_sock.c
@@ -0,0 +1,290 @@
+/* eBPF example program:
+ *
+ * - Loads eBPF program
+ *
+ * The eBPF program sets the sk_bound_dev_if index in new AF_INET{6}
+ * sockets opened by processes in the cgroup.
+ *
+ * - Attaches the new program to a cgroup using BPF_PROG_ATTACH
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <net/if.h>
+#include <inttypes.h>
+#include <linux/bpf.h>
+#include <bpf/bpf.h>
+
+#include "bpf_insn.h"
+
+char bpf_log_buf[BPF_LOG_BUF_SIZE];
+
+static int prog_load(__u32 idx, __u32 mark, __u32 prio)
+{
+ /* save pointer to context */
+ struct bpf_insn prog_start[] = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ };
+ struct bpf_insn prog_end[] = {
+ BPF_MOV64_IMM(BPF_REG_0, 1), /* r0 = verdict */
+ BPF_EXIT_INSN(),
+ };
+
+ /* set sk_bound_dev_if on socket */
+ struct bpf_insn prog_dev[] = {
+ BPF_MOV64_IMM(BPF_REG_3, idx),
+ BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, bound_dev_if)),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, bound_dev_if)),
+ };
+
+ /* set mark on socket */
+ struct bpf_insn prog_mark[] = {
+ /* get uid of process */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_get_current_uid_gid),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xffffffff),
+
+ /* if uid is 0, use given mark, else use the uid as the mark */
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_0),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_MOV64_IMM(BPF_REG_3, mark),
+
+ /* set the mark on the new socket */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, mark)),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, mark)),
+ };
+
+ /* set priority on socket */
+ struct bpf_insn prog_prio[] = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_MOV64_IMM(BPF_REG_3, prio),
+ BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, priority)),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, priority)),
+ };
+
+ struct bpf_insn *prog;
+ size_t insns_cnt;
+ void *p;
+ int ret;
+
+ insns_cnt = sizeof(prog_start) + sizeof(prog_end);
+ if (idx)
+ insns_cnt += sizeof(prog_dev);
+
+ if (mark)
+ insns_cnt += sizeof(prog_mark);
+
+ if (prio)
+ insns_cnt += sizeof(prog_prio);
+
+ p = prog = malloc(insns_cnt);
+ if (!prog) {
+ fprintf(stderr, "Failed to allocate memory for instructions\n");
+ return EXIT_FAILURE;
+ }
+
+ memcpy(p, prog_start, sizeof(prog_start));
+ p += sizeof(prog_start);
+
+ if (idx) {
+ memcpy(p, prog_dev, sizeof(prog_dev));
+ p += sizeof(prog_dev);
+ }
+
+ if (mark) {
+ memcpy(p, prog_mark, sizeof(prog_mark));
+ p += sizeof(prog_mark);
+ }
+
+ if (prio) {
+ memcpy(p, prog_prio, sizeof(prog_prio));
+ p += sizeof(prog_prio);
+ }
+
+ memcpy(p, prog_end, sizeof(prog_end));
+ p += sizeof(prog_end);
+
+ insns_cnt /= sizeof(struct bpf_insn);
+
+ ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SOCK, prog, insns_cnt,
+ "GPL", 0, bpf_log_buf, BPF_LOG_BUF_SIZE);
+
+ free(prog);
+
+ return ret;
+}
+
+static int get_bind_to_device(int sd, char *name, size_t len)
+{
+ socklen_t optlen = len;
+ int rc;
+
+ name[0] = '\0';
+ rc = getsockopt(sd, SOL_SOCKET, SO_BINDTODEVICE, name, &optlen);
+ if (rc < 0)
+ perror("setsockopt(SO_BINDTODEVICE)");
+
+ return rc;
+}
+
+static unsigned int get_somark(int sd)
+{
+ unsigned int mark = 0;
+ socklen_t optlen = sizeof(mark);
+ int rc;
+
+ rc = getsockopt(sd, SOL_SOCKET, SO_MARK, &mark, &optlen);
+ if (rc < 0)
+ perror("getsockopt(SO_MARK)");
+
+ return mark;
+}
+
+static unsigned int get_priority(int sd)
+{
+ unsigned int prio = 0;
+ socklen_t optlen = sizeof(prio);
+ int rc;
+
+ rc = getsockopt(sd, SOL_SOCKET, SO_PRIORITY, &prio, &optlen);
+ if (rc < 0)
+ perror("getsockopt(SO_PRIORITY)");
+
+ return prio;
+}
+
+static int show_sockopts(int family)
+{
+ unsigned int mark, prio;
+ char name[16];
+ int sd;
+
+ sd = socket(family, SOCK_DGRAM, 17);
+ if (sd < 0) {
+ perror("socket");
+ return 1;
+ }
+
+ if (get_bind_to_device(sd, name, sizeof(name)) < 0)
+ return 1;
+
+ mark = get_somark(sd);
+ prio = get_priority(sd);
+
+ close(sd);
+
+ printf("sd %d: dev %s, mark %u, priority %u\n", sd, name, mark, prio);
+
+ return 0;
+}
+
+static int usage(const char *argv0)
+{
+ printf("Usage:\n");
+ printf(" Attach a program\n");
+ printf(" %s -b bind-to-dev -m mark -p prio cg-path\n", argv0);
+ printf("\n");
+ printf(" Detach a program\n");
+ printf(" %s -d cg-path\n", argv0);
+ printf("\n");
+ printf(" Show inherited socket settings (mark, priority, and device)\n");
+ printf(" %s [-6]\n", argv0);
+ return EXIT_FAILURE;
+}
+
+int main(int argc, char **argv)
+{
+ __u32 idx = 0, mark = 0, prio = 0;
+ const char *cgrp_path = NULL;
+ int cg_fd, prog_fd, ret;
+ int family = PF_INET;
+ int do_attach = 1;
+ int rc;
+
+ while ((rc = getopt(argc, argv, "db:m:p:6")) != -1) {
+ switch (rc) {
+ case 'd':
+ do_attach = 0;
+ break;
+ case 'b':
+ idx = if_nametoindex(optarg);
+ if (!idx) {
+ idx = strtoumax(optarg, NULL, 0);
+ if (!idx) {
+ printf("Invalid device name\n");
+ return EXIT_FAILURE;
+ }
+ }
+ break;
+ case 'm':
+ mark = strtoumax(optarg, NULL, 0);
+ break;
+ case 'p':
+ prio = strtoumax(optarg, NULL, 0);
+ break;
+ case '6':
+ family = PF_INET6;
+ break;
+ default:
+ return usage(argv[0]);
+ }
+ }
+
+ if (optind == argc)
+ return show_sockopts(family);
+
+ cgrp_path = argv[optind];
+ if (!cgrp_path) {
+ fprintf(stderr, "cgroup path not given\n");
+ return EXIT_FAILURE;
+ }
+
+ if (do_attach && !idx && !mark && !prio) {
+ fprintf(stderr,
+ "One of device, mark or priority must be given\n");
+ return EXIT_FAILURE;
+ }
+
+ cg_fd = open(cgrp_path, O_DIRECTORY | O_RDONLY);
+ if (cg_fd < 0) {
+ printf("Failed to open cgroup path: '%s'\n", strerror(errno));
+ return EXIT_FAILURE;
+ }
+
+ if (do_attach) {
+ prog_fd = prog_load(idx, mark, prio);
+ if (prog_fd < 0) {
+ printf("Failed to load prog: '%s'\n", strerror(errno));
+ printf("Output from kernel verifier:\n%s\n-------\n",
+ bpf_log_buf);
+ return EXIT_FAILURE;
+ }
+
+ ret = bpf_prog_attach(prog_fd, cg_fd,
+ BPF_CGROUP_INET_SOCK_CREATE, 0);
+ if (ret < 0) {
+ printf("Failed to attach prog to cgroup: '%s'\n",
+ strerror(errno));
+ return EXIT_FAILURE;
+ }
+ } else {
+ ret = bpf_prog_detach(cg_fd, BPF_CGROUP_INET_SOCK_CREATE);
+ if (ret < 0) {
+ printf("Failed to detach prog from cgroup: '%s'\n",
+ strerror(errno));
+ return EXIT_FAILURE;
+ }
+ }
+
+ close(cg_fd);
+ return EXIT_SUCCESS;
+}
diff --git a/samples/bpf/test_cgrp2_sock.sh b/samples/bpf/test_cgrp2_sock.sh
new file mode 100755
index 000000000..9f6174236
--- /dev/null
+++ b/samples/bpf/test_cgrp2_sock.sh
@@ -0,0 +1,135 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+# Test various socket options that can be set by attaching programs to cgroups.
+
+CGRP_MNT="/tmp/cgroupv2-test_cgrp2_sock"
+
+################################################################################
+#
+print_result()
+{
+ local rc=$1
+ local status=" OK "
+
+ [ $rc -ne 0 ] && status="FAIL"
+
+ printf "%-50s [%4s]\n" "$2" "$status"
+}
+
+check_sock()
+{
+ out=$(test_cgrp2_sock)
+ echo $out | grep -q "$1"
+ if [ $? -ne 0 ]; then
+ print_result 1 "IPv4: $2"
+ echo " expected: $1"
+ echo " have: $out"
+ rc=1
+ else
+ print_result 0 "IPv4: $2"
+ fi
+}
+
+check_sock6()
+{
+ out=$(test_cgrp2_sock -6)
+ echo $out | grep -q "$1"
+ if [ $? -ne 0 ]; then
+ print_result 1 "IPv6: $2"
+ echo " expected: $1"
+ echo " have: $out"
+ rc=1
+ else
+ print_result 0 "IPv6: $2"
+ fi
+}
+
+################################################################################
+#
+
+cleanup()
+{
+ echo $$ >> ${CGRP_MNT}/cgroup.procs
+ rmdir ${CGRP_MNT}/sockopts
+}
+
+cleanup_and_exit()
+{
+ local rc=$1
+ local msg="$2"
+
+ [ -n "$msg" ] && echo "ERROR: $msg"
+
+ test_cgrp2_sock -d ${CGRP_MNT}/sockopts
+ ip li del cgrp2_sock
+ umount ${CGRP_MNT}
+
+ exit $rc
+}
+
+
+################################################################################
+# main
+
+rc=0
+
+ip li add cgrp2_sock type dummy 2>/dev/null
+
+set -e
+mkdir -p ${CGRP_MNT}
+mount -t cgroup2 none ${CGRP_MNT}
+set +e
+
+
+# make sure we have a known start point
+cleanup 2>/dev/null
+
+mkdir -p ${CGRP_MNT}/sockopts
+[ $? -ne 0 ] && cleanup_and_exit 1 "Failed to create cgroup hierarchy"
+
+
+# set pid into cgroup
+echo $$ > ${CGRP_MNT}/sockopts/cgroup.procs
+
+# no bpf program attached, so socket should show no settings
+check_sock "dev , mark 0, priority 0" "No programs attached"
+check_sock6 "dev , mark 0, priority 0" "No programs attached"
+
+# verify device is set
+#
+test_cgrp2_sock -b cgrp2_sock ${CGRP_MNT}/sockopts
+if [ $? -ne 0 ]; then
+ cleanup_and_exit 1 "Failed to install program to set device"
+fi
+check_sock "dev cgrp2_sock, mark 0, priority 0" "Device set"
+check_sock6 "dev cgrp2_sock, mark 0, priority 0" "Device set"
+
+# verify mark is set
+#
+test_cgrp2_sock -m 666 ${CGRP_MNT}/sockopts
+if [ $? -ne 0 ]; then
+ cleanup_and_exit 1 "Failed to install program to set mark"
+fi
+check_sock "dev , mark 666, priority 0" "Mark set"
+check_sock6 "dev , mark 666, priority 0" "Mark set"
+
+# verify priority is set
+#
+test_cgrp2_sock -p 123 ${CGRP_MNT}/sockopts
+if [ $? -ne 0 ]; then
+ cleanup_and_exit 1 "Failed to install program to set priority"
+fi
+check_sock "dev , mark 0, priority 123" "Priority set"
+check_sock6 "dev , mark 0, priority 123" "Priority set"
+
+# all 3 at once
+#
+test_cgrp2_sock -b cgrp2_sock -m 666 -p 123 ${CGRP_MNT}/sockopts
+if [ $? -ne 0 ]; then
+ cleanup_and_exit 1 "Failed to install program to set device, mark and priority"
+fi
+check_sock "dev cgrp2_sock, mark 666, priority 123" "Priority set"
+check_sock6 "dev cgrp2_sock, mark 666, priority 123" "Priority set"
+
+cleanup_and_exit $rc
diff --git a/samples/bpf/test_cgrp2_sock2.c b/samples/bpf/test_cgrp2_sock2.c
new file mode 100644
index 000000000..a9277b118
--- /dev/null
+++ b/samples/bpf/test_cgrp2_sock2.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+/* eBPF example program:
+ *
+ * - Loads eBPF program
+ *
+ * The eBPF program loads a filter from file and attaches the
+ * program to a cgroup using BPF_PROG_ATTACH
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <net/if.h>
+#include <linux/bpf.h>
+#include <bpf/bpf.h>
+
+#include "bpf_insn.h"
+#include "bpf_load.h"
+
+static int usage(const char *argv0)
+{
+ printf("Usage: %s cg-path filter-path [filter-id]\n", argv0);
+ return EXIT_FAILURE;
+}
+
+int main(int argc, char **argv)
+{
+ int cg_fd, ret, filter_id = 0;
+
+ if (argc < 3)
+ return usage(argv[0]);
+
+ cg_fd = open(argv[1], O_DIRECTORY | O_RDONLY);
+ if (cg_fd < 0) {
+ printf("Failed to open cgroup path: '%s'\n", strerror(errno));
+ return EXIT_FAILURE;
+ }
+
+ if (load_bpf_file(argv[2]))
+ return EXIT_FAILURE;
+
+ printf("Output from kernel verifier:\n%s\n-------\n", bpf_log_buf);
+
+ if (argc > 3)
+ filter_id = atoi(argv[3]);
+
+ if (filter_id >= prog_cnt) {
+ printf("Invalid program id; program not found in file\n");
+ return EXIT_FAILURE;
+ }
+
+ ret = bpf_prog_attach(prog_fd[filter_id], cg_fd,
+ BPF_CGROUP_INET_SOCK_CREATE, 0);
+ if (ret < 0) {
+ printf("Failed to attach prog to cgroup: '%s'\n",
+ strerror(errno));
+ return EXIT_FAILURE;
+ }
+
+ return EXIT_SUCCESS;
+}
diff --git a/samples/bpf/test_cgrp2_sock2.sh b/samples/bpf/test_cgrp2_sock2.sh
new file mode 100755
index 000000000..0f396a86e
--- /dev/null
+++ b/samples/bpf/test_cgrp2_sock2.sh
@@ -0,0 +1,85 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+function config_device {
+ ip netns add at_ns0
+ ip link add veth0 type veth peer name veth0b
+ ip link set veth0b up
+ ip link set veth0 netns at_ns0
+ ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0
+ ip netns exec at_ns0 ip addr add 2401:db00::1/64 dev veth0 nodad
+ ip netns exec at_ns0 ip link set dev veth0 up
+ ip addr add 172.16.1.101/24 dev veth0b
+ ip addr add 2401:db00::2/64 dev veth0b nodad
+}
+
+function config_cgroup {
+ rm -rf /tmp/cgroupv2
+ mkdir -p /tmp/cgroupv2
+ mount -t cgroup2 none /tmp/cgroupv2
+ mkdir -p /tmp/cgroupv2/foo
+ echo $$ >> /tmp/cgroupv2/foo/cgroup.procs
+}
+
+
+function attach_bpf {
+ test_cgrp2_sock2 /tmp/cgroupv2/foo sock_flags_kern.o $1
+ [ $? -ne 0 ] && exit 1
+}
+
+function cleanup {
+ if [ -d /tmp/cgroupv2/foo ]; then
+ test_cgrp2_sock -d /tmp/cgroupv2/foo
+ fi
+ ip link del veth0b
+ ip netns delete at_ns0
+ umount /tmp/cgroupv2
+ rm -rf /tmp/cgroupv2
+}
+
+cleanup 2>/dev/null
+
+set -e
+config_device
+config_cgroup
+set +e
+
+#
+# Test 1 - fail ping6
+#
+attach_bpf 0
+ping -c1 -w1 172.16.1.100
+if [ $? -ne 0 ]; then
+ echo "ping failed when it should succeed"
+ cleanup
+ exit 1
+fi
+
+ping6 -c1 -w1 2401:db00::1
+if [ $? -eq 0 ]; then
+ echo "ping6 succeeded when it should not"
+ cleanup
+ exit 1
+fi
+
+#
+# Test 2 - fail ping
+#
+attach_bpf 1
+ping6 -c1 -w1 2401:db00::1
+if [ $? -ne 0 ]; then
+ echo "ping6 failed when it should succeed"
+ cleanup
+ exit 1
+fi
+
+ping -c1 -w1 172.16.1.100
+if [ $? -eq 0 ]; then
+ echo "ping succeeded when it should not"
+ cleanup
+ exit 1
+fi
+
+cleanup
+echo
+echo "*** PASS ***"
diff --git a/samples/bpf/test_cgrp2_tc.sh b/samples/bpf/test_cgrp2_tc.sh
new file mode 100755
index 000000000..12faf5847
--- /dev/null
+++ b/samples/bpf/test_cgrp2_tc.sh
@@ -0,0 +1,185 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+MY_DIR=$(dirname $0)
+# Details on the bpf prog
+BPF_CGRP2_ARRAY_NAME='test_cgrp2_array_pin'
+BPF_PROG="$MY_DIR/test_cgrp2_tc_kern.o"
+BPF_SECTION='filter'
+
+[ -z "$TC" ] && TC='tc'
+[ -z "$IP" ] && IP='ip'
+
+# Names of the veth interface, net namespace...etc.
+HOST_IFC='ve'
+NS_IFC='vens'
+NS='ns'
+
+find_mnt() {
+ cat /proc/mounts | \
+ awk '{ if ($3 == "'$1'" && mnt == "") { mnt = $2 }} END { print mnt }'
+}
+
+# Init cgroup2 vars
+init_cgrp2_vars() {
+ CGRP2_ROOT=$(find_mnt cgroup2)
+ if [ -z "$CGRP2_ROOT" ]
+ then
+ CGRP2_ROOT='/mnt/cgroup2'
+ MOUNT_CGRP2="yes"
+ fi
+ CGRP2_TC="$CGRP2_ROOT/tc"
+ CGRP2_TC_LEAF="$CGRP2_TC/leaf"
+}
+
+# Init bpf fs vars
+init_bpf_fs_vars() {
+ local bpf_fs_root=$(find_mnt bpf)
+ [ -n "$bpf_fs_root" ] || return -1
+ BPF_FS_TC_SHARE="$bpf_fs_root/tc/globals"
+}
+
+setup_cgrp2() {
+ case $1 in
+ start)
+ if [ "$MOUNT_CGRP2" == 'yes' ]
+ then
+ [ -d $CGRP2_ROOT ] || mkdir -p $CGRP2_ROOT
+ mount -t cgroup2 none $CGRP2_ROOT || return $?
+ fi
+ mkdir -p $CGRP2_TC_LEAF
+ ;;
+ *)
+ rmdir $CGRP2_TC_LEAF && rmdir $CGRP2_TC
+ [ "$MOUNT_CGRP2" == 'yes' ] && umount $CGRP2_ROOT
+ ;;
+ esac
+}
+
+setup_bpf_cgrp2_array() {
+ local bpf_cgrp2_array="$BPF_FS_TC_SHARE/$BPF_CGRP2_ARRAY_NAME"
+ case $1 in
+ start)
+ $MY_DIR/test_cgrp2_array_pin -U $bpf_cgrp2_array -v $CGRP2_TC
+ ;;
+ *)
+ [ -d "$BPF_FS_TC_SHARE" ] && rm -f $bpf_cgrp2_array
+ ;;
+ esac
+}
+
+setup_net() {
+ case $1 in
+ start)
+ $IP link add $HOST_IFC type veth peer name $NS_IFC || return $?
+ $IP link set dev $HOST_IFC up || return $?
+ sysctl -q net.ipv6.conf.$HOST_IFC.accept_dad=0
+
+ $IP netns add ns || return $?
+ $IP link set dev $NS_IFC netns ns || return $?
+ $IP -n $NS link set dev $NS_IFC up || return $?
+ $IP netns exec $NS sysctl -q net.ipv6.conf.$NS_IFC.accept_dad=0
+ $TC qdisc add dev $HOST_IFC clsact || return $?
+ $TC filter add dev $HOST_IFC egress bpf da obj $BPF_PROG sec $BPF_SECTION || return $?
+ ;;
+ *)
+ $IP netns del $NS
+ $IP link del $HOST_IFC
+ ;;
+ esac
+}
+
+run_in_cgrp() {
+ # Fork another bash and move it under the specified cgroup.
+ # It makes the cgroup cleanup easier at the end of the test.
+ cmd='echo $$ > '
+ cmd="$cmd $1/cgroup.procs; exec $2"
+ bash -c "$cmd"
+}
+
+do_test() {
+ run_in_cgrp $CGRP2_TC_LEAF "ping -6 -c3 ff02::1%$HOST_IFC >& /dev/null"
+ local dropped=$($TC -s qdisc show dev $HOST_IFC | tail -3 | \
+ awk '/drop/{print substr($7, 0, index($7, ",")-1)}')
+ if [[ $dropped -eq 0 ]]
+ then
+ echo "FAIL"
+ return 1
+ else
+ echo "Successfully filtered $dropped packets"
+ return 0
+ fi
+}
+
+do_exit() {
+ if [ "$DEBUG" == "yes" ] && [ "$MODE" != 'cleanuponly' ]
+ then
+ echo "------ DEBUG ------"
+ echo "mount: "; mount | egrep '(cgroup2|bpf)'; echo
+ echo "$CGRP2_TC_LEAF: "; ls -l $CGRP2_TC_LEAF; echo
+ if [ -d "$BPF_FS_TC_SHARE" ]
+ then
+ echo "$BPF_FS_TC_SHARE: "; ls -l $BPF_FS_TC_SHARE; echo
+ fi
+ echo "Host net:"
+ $IP netns
+ $IP link show dev $HOST_IFC
+ $IP -6 a show dev $HOST_IFC
+ $TC -s qdisc show dev $HOST_IFC
+ echo
+ echo "$NS net:"
+ $IP -n $NS link show dev $NS_IFC
+ $IP -n $NS -6 link show dev $NS_IFC
+ echo "------ DEBUG ------"
+ echo
+ fi
+
+ if [ "$MODE" != 'nocleanup' ]
+ then
+ setup_net stop
+ setup_bpf_cgrp2_array stop
+ setup_cgrp2 stop
+ fi
+}
+
+init_cgrp2_vars
+init_bpf_fs_vars
+
+while [[ $# -ge 1 ]]
+do
+ a="$1"
+ case $a in
+ debug)
+ DEBUG='yes'
+ shift 1
+ ;;
+ cleanup-only)
+ MODE='cleanuponly'
+ shift 1
+ ;;
+ no-cleanup)
+ MODE='nocleanup'
+ shift 1
+ ;;
+ *)
+ echo "test_cgrp2_tc [debug] [cleanup-only | no-cleanup]"
+ echo " debug: Print cgrp and network setup details at the end of the test"
+ echo " cleanup-only: Try to cleanup things from last test. No test will be run"
+ echo " no-cleanup: Run the test but don't do cleanup at the end"
+ echo "[Note: If no arg is given, it will run the test and do cleanup at the end]"
+ echo
+ exit -1
+ ;;
+ esac
+done
+
+trap do_exit 0
+
+[ "$MODE" == 'cleanuponly' ] && exit
+
+setup_cgrp2 start || exit $?
+setup_net start || exit $?
+init_bpf_fs_vars || exit $?
+setup_bpf_cgrp2_array start || exit $?
+do_test
+echo
diff --git a/samples/bpf/test_cgrp2_tc_kern.c b/samples/bpf/test_cgrp2_tc_kern.c
new file mode 100644
index 000000000..1547b36a7
--- /dev/null
+++ b/samples/bpf/test_cgrp2_tc_kern.c
@@ -0,0 +1,70 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/in6.h>
+#include <uapi/linux/ipv6.h>
+#include <uapi/linux/pkt_cls.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+/* copy of 'struct ethhdr' without __packed */
+struct eth_hdr {
+ unsigned char h_dest[ETH_ALEN];
+ unsigned char h_source[ETH_ALEN];
+ unsigned short h_proto;
+};
+
+#define PIN_GLOBAL_NS 2
+struct bpf_elf_map {
+ __u32 type;
+ __u32 size_key;
+ __u32 size_value;
+ __u32 max_elem;
+ __u32 flags;
+ __u32 id;
+ __u32 pinning;
+};
+
+struct bpf_elf_map SEC("maps") test_cgrp2_array_pin = {
+ .type = BPF_MAP_TYPE_CGROUP_ARRAY,
+ .size_key = sizeof(uint32_t),
+ .size_value = sizeof(uint32_t),
+ .pinning = PIN_GLOBAL_NS,
+ .max_elem = 1,
+};
+
+SEC("filter")
+int handle_egress(struct __sk_buff *skb)
+{
+ void *data = (void *)(long)skb->data;
+ struct eth_hdr *eth = data;
+ struct ipv6hdr *ip6h = data + sizeof(*eth);
+ void *data_end = (void *)(long)skb->data_end;
+ char dont_care_msg[] = "dont care %04x %d\n";
+ char pass_msg[] = "pass\n";
+ char reject_msg[] = "reject\n";
+
+ /* single length check */
+ if (data + sizeof(*eth) + sizeof(*ip6h) > data_end)
+ return TC_ACT_OK;
+
+ if (eth->h_proto != htons(ETH_P_IPV6) ||
+ ip6h->nexthdr != IPPROTO_ICMPV6) {
+ bpf_trace_printk(dont_care_msg, sizeof(dont_care_msg),
+ eth->h_proto, ip6h->nexthdr);
+ return TC_ACT_OK;
+ } else if (bpf_skb_under_cgroup(skb, &test_cgrp2_array_pin, 0) != 1) {
+ bpf_trace_printk(pass_msg, sizeof(pass_msg));
+ return TC_ACT_OK;
+ } else {
+ bpf_trace_printk(reject_msg, sizeof(reject_msg));
+ return TC_ACT_SHOT;
+ }
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/test_cls_bpf.sh b/samples/bpf/test_cls_bpf.sh
new file mode 100755
index 000000000..aaddd67b3
--- /dev/null
+++ b/samples/bpf/test_cls_bpf.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+function pktgen {
+ ../pktgen/pktgen_bench_xmit_mode_netif_receive.sh -i $IFC -s 64 \
+ -m 90:e2:ba:ff:ff:ff -d 192.168.0.1 -t 4
+ local dropped=`tc -s qdisc show dev $IFC | tail -3 | awk '/drop/{print $7}'`
+ if [ "$dropped" == "0," ]; then
+ echo "FAIL"
+ else
+ echo "Successfully filtered " $dropped " packets"
+ fi
+}
+
+function test {
+ echo -n "Loading bpf program '$2'... "
+ tc qdisc add dev $IFC clsact
+ tc filter add dev $IFC ingress bpf da obj $1 sec $2
+ local status=$?
+ if [ $status -ne 0 ]; then
+ echo "FAIL"
+ else
+ echo "ok"
+ pktgen
+ fi
+ tc qdisc del dev $IFC clsact
+}
+
+IFC=test_veth
+
+ip link add name $IFC type veth peer name pair_$IFC
+ip link set $IFC up
+ip link set pair_$IFC up
+
+test ./parse_simple.o simple
+test ./parse_varlen.o varlen
+test ./parse_ldabs.o ldabs
+ip link del dev $IFC
diff --git a/samples/bpf/test_current_task_under_cgroup_kern.c b/samples/bpf/test_current_task_under_cgroup_kern.c
new file mode 100644
index 000000000..86b28d7d6
--- /dev/null
+++ b/samples/bpf/test_current_task_under_cgroup_kern.c
@@ -0,0 +1,43 @@
+/* Copyright (c) 2016 Sargun Dhillon <sargun@sargun.me>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include <linux/version.h>
+#include "bpf_helpers.h"
+#include <uapi/linux/utsname.h>
+
+struct bpf_map_def SEC("maps") cgroup_map = {
+ .type = BPF_MAP_TYPE_CGROUP_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u32),
+ .max_entries = 1,
+};
+
+struct bpf_map_def SEC("maps") perf_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u64),
+ .max_entries = 1,
+};
+
+/* Writes the last PID that called sync to a map at index 0 */
+SEC("kprobe/sys_sync")
+int bpf_prog1(struct pt_regs *ctx)
+{
+ u64 pid = bpf_get_current_pid_tgid();
+ int idx = 0;
+
+ if (!bpf_current_task_under_cgroup(&cgroup_map, 0))
+ return 0;
+
+ bpf_map_update_elem(&perf_map, &idx, &pid, BPF_ANY);
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/test_current_task_under_cgroup_user.c b/samples/bpf/test_current_task_under_cgroup_user.c
new file mode 100644
index 000000000..4be4874ca
--- /dev/null
+++ b/samples/bpf/test_current_task_under_cgroup_user.c
@@ -0,0 +1,85 @@
+/* Copyright (c) 2016 Sargun Dhillon <sargun@sargun.me>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <linux/bpf.h>
+#include <unistd.h>
+#include <bpf/bpf.h>
+#include "bpf_load.h"
+#include <linux/bpf.h>
+#include "cgroup_helpers.h"
+
+#define CGROUP_PATH "/my-cgroup"
+
+int main(int argc, char **argv)
+{
+ pid_t remote_pid, local_pid = getpid();
+ int cg2, idx = 0, rc = 0;
+ char filename[256];
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ if (setup_cgroup_environment())
+ goto err;
+
+ cg2 = create_and_get_cgroup(CGROUP_PATH);
+
+ if (!cg2)
+ goto err;
+
+ if (bpf_map_update_elem(map_fd[0], &idx, &cg2, BPF_ANY)) {
+ log_err("Adding target cgroup to map");
+ goto err;
+ }
+
+ if (join_cgroup(CGROUP_PATH))
+ goto err;
+
+ /*
+ * The installed helper program catched the sync call, and should
+ * write it to the map.
+ */
+
+ sync();
+ bpf_map_lookup_elem(map_fd[1], &idx, &remote_pid);
+
+ if (local_pid != remote_pid) {
+ fprintf(stderr,
+ "BPF Helper didn't write correct PID to map, but: %d\n",
+ remote_pid);
+ goto err;
+ }
+
+ /* Verify the negative scenario; leave the cgroup */
+ if (join_cgroup("/"))
+ goto err;
+
+ remote_pid = 0;
+ bpf_map_update_elem(map_fd[1], &idx, &remote_pid, BPF_ANY);
+
+ sync();
+ bpf_map_lookup_elem(map_fd[1], &idx, &remote_pid);
+
+ if (local_pid == remote_pid) {
+ fprintf(stderr, "BPF cgroup negative test did not work\n");
+ goto err;
+ }
+
+ goto out;
+err:
+ rc = 1;
+
+out:
+ close(cg2);
+ cleanup_cgroup_environment();
+ return rc;
+}
diff --git a/samples/bpf/test_ipip.sh b/samples/bpf/test_ipip.sh
new file mode 100755
index 000000000..9e507c305
--- /dev/null
+++ b/samples/bpf/test_ipip.sh
@@ -0,0 +1,179 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+function config_device {
+ ip netns add at_ns0
+ ip netns add at_ns1
+ ip netns add at_ns2
+ ip link add veth0 type veth peer name veth0b
+ ip link add veth1 type veth peer name veth1b
+ ip link add veth2 type veth peer name veth2b
+ ip link set veth0b up
+ ip link set veth1b up
+ ip link set veth2b up
+ ip link set dev veth0b mtu 1500
+ ip link set dev veth1b mtu 1500
+ ip link set dev veth2b mtu 1500
+ ip link set veth0 netns at_ns0
+ ip link set veth1 netns at_ns1
+ ip link set veth2 netns at_ns2
+ ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0
+ ip netns exec at_ns0 ip addr add 2401:db00::1/64 dev veth0 nodad
+ ip netns exec at_ns0 ip link set dev veth0 up
+ ip netns exec at_ns1 ip addr add 172.16.1.101/24 dev veth1
+ ip netns exec at_ns1 ip addr add 2401:db00::2/64 dev veth1 nodad
+ ip netns exec at_ns1 ip link set dev veth1 up
+ ip netns exec at_ns2 ip addr add 172.16.1.200/24 dev veth2
+ ip netns exec at_ns2 ip addr add 2401:db00::3/64 dev veth2 nodad
+ ip netns exec at_ns2 ip link set dev veth2 up
+ ip link add br0 type bridge
+ ip link set br0 up
+ ip link set dev br0 mtu 1500
+ ip link set veth0b master br0
+ ip link set veth1b master br0
+ ip link set veth2b master br0
+}
+
+function add_ipip_tunnel {
+ ip netns exec at_ns0 \
+ ip link add dev $DEV_NS type ipip local 172.16.1.100 remote 172.16.1.200
+ ip netns exec at_ns0 ip link set dev $DEV_NS up
+ ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+ ip netns exec at_ns1 \
+ ip link add dev $DEV_NS type ipip local 172.16.1.101 remote 172.16.1.200
+ ip netns exec at_ns1 ip link set dev $DEV_NS up
+ # same inner IP address in at_ns0 and at_ns1
+ ip netns exec at_ns1 ip addr add dev $DEV_NS 10.1.1.100/24
+
+ ip netns exec at_ns2 ip link add dev $DEV type ipip external
+ ip netns exec at_ns2 ip link set dev $DEV up
+ ip netns exec at_ns2 ip addr add dev $DEV 10.1.1.200/24
+}
+
+function add_ipip6_tunnel {
+ ip netns exec at_ns0 \
+ ip link add dev $DEV_NS type ip6tnl mode ipip6 local 2401:db00::1/64 remote 2401:db00::3/64
+ ip netns exec at_ns0 ip link set dev $DEV_NS up
+ ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+ ip netns exec at_ns1 \
+ ip link add dev $DEV_NS type ip6tnl mode ipip6 local 2401:db00::2/64 remote 2401:db00::3/64
+ ip netns exec at_ns1 ip link set dev $DEV_NS up
+ # same inner IP address in at_ns0 and at_ns1
+ ip netns exec at_ns1 ip addr add dev $DEV_NS 10.1.1.100/24
+
+ ip netns exec at_ns2 ip link add dev $DEV type ip6tnl mode ipip6 external
+ ip netns exec at_ns2 ip link set dev $DEV up
+ ip netns exec at_ns2 ip addr add dev $DEV 10.1.1.200/24
+}
+
+function add_ip6ip6_tunnel {
+ ip netns exec at_ns0 \
+ ip link add dev $DEV_NS type ip6tnl mode ip6ip6 local 2401:db00::1/64 remote 2401:db00::3/64
+ ip netns exec at_ns0 ip link set dev $DEV_NS up
+ ip netns exec at_ns0 ip addr add dev $DEV_NS 2601:646::1/64
+ ip netns exec at_ns1 \
+ ip link add dev $DEV_NS type ip6tnl mode ip6ip6 local 2401:db00::2/64 remote 2401:db00::3/64
+ ip netns exec at_ns1 ip link set dev $DEV_NS up
+ # same inner IP address in at_ns0 and at_ns1
+ ip netns exec at_ns1 ip addr add dev $DEV_NS 2601:646::1/64
+
+ ip netns exec at_ns2 ip link add dev $DEV type ip6tnl mode ip6ip6 external
+ ip netns exec at_ns2 ip link set dev $DEV up
+ ip netns exec at_ns2 ip addr add dev $DEV 2601:646::2/64
+}
+
+function attach_bpf {
+ DEV=$1
+ SET_TUNNEL=$2
+ GET_TUNNEL=$3
+ ip netns exec at_ns2 tc qdisc add dev $DEV clsact
+ ip netns exec at_ns2 tc filter add dev $DEV egress bpf da obj tcbpf2_kern.o sec $SET_TUNNEL
+ ip netns exec at_ns2 tc filter add dev $DEV ingress bpf da obj tcbpf2_kern.o sec $GET_TUNNEL
+}
+
+function test_ipip {
+ DEV_NS=ipip_std
+ DEV=ipip_bpf
+ config_device
+# tcpdump -nei br0 &
+ cat /sys/kernel/debug/tracing/trace_pipe &
+
+ add_ipip_tunnel
+ attach_bpf $DEV ipip_set_tunnel ipip_get_tunnel
+
+ ip netns exec at_ns0 ping -c 1 10.1.1.200
+ ip netns exec at_ns2 ping -c 1 10.1.1.100
+ ip netns exec at_ns0 iperf -sD -p 5200 > /dev/null
+ ip netns exec at_ns1 iperf -sD -p 5201 > /dev/null
+ sleep 0.2
+ # tcp check _same_ IP over different tunnels
+ ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5200
+ ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5201
+ cleanup
+}
+
+# IPv4 over IPv6 tunnel
+function test_ipip6 {
+ DEV_NS=ipip_std
+ DEV=ipip_bpf
+ config_device
+# tcpdump -nei br0 &
+ cat /sys/kernel/debug/tracing/trace_pipe &
+
+ add_ipip6_tunnel
+ attach_bpf $DEV ipip6_set_tunnel ipip6_get_tunnel
+
+ ip netns exec at_ns0 ping -c 1 10.1.1.200
+ ip netns exec at_ns2 ping -c 1 10.1.1.100
+ ip netns exec at_ns0 iperf -sD -p 5200 > /dev/null
+ ip netns exec at_ns1 iperf -sD -p 5201 > /dev/null
+ sleep 0.2
+ # tcp check _same_ IP over different tunnels
+ ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5200
+ ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5201
+ cleanup
+}
+
+# IPv6 over IPv6 tunnel
+function test_ip6ip6 {
+ DEV_NS=ipip_std
+ DEV=ipip_bpf
+ config_device
+# tcpdump -nei br0 &
+ cat /sys/kernel/debug/tracing/trace_pipe &
+
+ add_ip6ip6_tunnel
+ attach_bpf $DEV ip6ip6_set_tunnel ip6ip6_get_tunnel
+
+ ip netns exec at_ns0 ping -6 -c 1 2601:646::2
+ ip netns exec at_ns2 ping -6 -c 1 2601:646::1
+ ip netns exec at_ns0 iperf -6sD -p 5200 > /dev/null
+ ip netns exec at_ns1 iperf -6sD -p 5201 > /dev/null
+ sleep 0.2
+ # tcp check _same_ IP over different tunnels
+ ip netns exec at_ns2 iperf -6c 2601:646::1 -n 5k -p 5200
+ ip netns exec at_ns2 iperf -6c 2601:646::1 -n 5k -p 5201
+ cleanup
+}
+
+function cleanup {
+ set +ex
+ pkill iperf
+ ip netns delete at_ns0
+ ip netns delete at_ns1
+ ip netns delete at_ns2
+ ip link del veth0
+ ip link del veth1
+ ip link del veth2
+ ip link del br0
+ pkill tcpdump
+ pkill cat
+ set -ex
+}
+
+cleanup
+echo "Testing IP tunnels..."
+test_ipip
+test_ipip6
+test_ip6ip6
+echo "*** PASS ***"
diff --git a/samples/bpf/test_lru_dist.c b/samples/bpf/test_lru_dist.c
new file mode 100644
index 000000000..eec3e2509
--- /dev/null
+++ b/samples/bpf/test_lru_dist.c
@@ -0,0 +1,543 @@
+/*
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#define _GNU_SOURCE
+#include <linux/types.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <linux/bpf.h>
+#include <errno.h>
+#include <string.h>
+#include <assert.h>
+#include <sched.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include <bpf/bpf.h>
+#include "bpf_util.h"
+
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#ifndef offsetof
+# define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER)
+#endif
+#define container_of(ptr, type, member) ({ \
+ const typeof( ((type *)0)->member ) *__mptr = (ptr); \
+ (type *)( (char *)__mptr - offsetof(type,member) );})
+
+static int nr_cpus;
+static unsigned long long *dist_keys;
+static unsigned int dist_key_counts;
+
+struct list_head {
+ struct list_head *next, *prev;
+};
+
+static inline void INIT_LIST_HEAD(struct list_head *list)
+{
+ list->next = list;
+ list->prev = list;
+}
+
+static inline int list_empty(const struct list_head *head)
+{
+ return head->next == head;
+}
+
+static inline void __list_add(struct list_head *new,
+ struct list_head *prev,
+ struct list_head *next)
+{
+ next->prev = new;
+ new->next = next;
+ new->prev = prev;
+ prev->next = new;
+}
+
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+ __list_add(new, head, head->next);
+}
+
+static inline void __list_del(struct list_head *prev, struct list_head *next)
+{
+ next->prev = prev;
+ prev->next = next;
+}
+
+static inline void __list_del_entry(struct list_head *entry)
+{
+ __list_del(entry->prev, entry->next);
+}
+
+static inline void list_move(struct list_head *list, struct list_head *head)
+{
+ __list_del_entry(list);
+ list_add(list, head);
+}
+
+#define list_entry(ptr, type, member) \
+ container_of(ptr, type, member)
+
+#define list_last_entry(ptr, type, member) \
+ list_entry((ptr)->prev, type, member)
+
+struct pfect_lru_node {
+ struct list_head list;
+ unsigned long long key;
+};
+
+struct pfect_lru {
+ struct list_head list;
+ struct pfect_lru_node *free_nodes;
+ unsigned int cur_size;
+ unsigned int lru_size;
+ unsigned int nr_unique;
+ unsigned int nr_misses;
+ unsigned int total;
+ int map_fd;
+};
+
+static void pfect_lru_init(struct pfect_lru *lru, unsigned int lru_size,
+ unsigned int nr_possible_elems)
+{
+ lru->map_fd = bpf_create_map(BPF_MAP_TYPE_HASH,
+ sizeof(unsigned long long),
+ sizeof(struct pfect_lru_node *),
+ nr_possible_elems, 0);
+ assert(lru->map_fd != -1);
+
+ lru->free_nodes = malloc(lru_size * sizeof(struct pfect_lru_node));
+ assert(lru->free_nodes);
+
+ INIT_LIST_HEAD(&lru->list);
+ lru->cur_size = 0;
+ lru->lru_size = lru_size;
+ lru->nr_unique = lru->nr_misses = lru->total = 0;
+}
+
+static void pfect_lru_destroy(struct pfect_lru *lru)
+{
+ close(lru->map_fd);
+ free(lru->free_nodes);
+}
+
+static int pfect_lru_lookup_or_insert(struct pfect_lru *lru,
+ unsigned long long key)
+{
+ struct pfect_lru_node *node = NULL;
+ int seen = 0;
+
+ lru->total++;
+ if (!bpf_map_lookup_elem(lru->map_fd, &key, &node)) {
+ if (node) {
+ list_move(&node->list, &lru->list);
+ return 1;
+ }
+ seen = 1;
+ }
+
+ if (lru->cur_size < lru->lru_size) {
+ node = &lru->free_nodes[lru->cur_size++];
+ INIT_LIST_HEAD(&node->list);
+ } else {
+ struct pfect_lru_node *null_node = NULL;
+
+ node = list_last_entry(&lru->list,
+ struct pfect_lru_node,
+ list);
+ bpf_map_update_elem(lru->map_fd, &node->key, &null_node, BPF_EXIST);
+ }
+
+ node->key = key;
+ list_move(&node->list, &lru->list);
+
+ lru->nr_misses++;
+ if (seen) {
+ assert(!bpf_map_update_elem(lru->map_fd, &key, &node, BPF_EXIST));
+ } else {
+ lru->nr_unique++;
+ assert(!bpf_map_update_elem(lru->map_fd, &key, &node, BPF_NOEXIST));
+ }
+
+ return seen;
+}
+
+static unsigned int read_keys(const char *dist_file,
+ unsigned long long **keys)
+{
+ struct stat fst;
+ unsigned long long *retkeys;
+ unsigned int counts = 0;
+ int dist_fd;
+ char *b, *l;
+ int i;
+
+ dist_fd = open(dist_file, 0);
+ assert(dist_fd != -1);
+
+ assert(fstat(dist_fd, &fst) == 0);
+ b = malloc(fst.st_size);
+ assert(b);
+
+ assert(read(dist_fd, b, fst.st_size) == fst.st_size);
+ close(dist_fd);
+ for (i = 0; i < fst.st_size; i++) {
+ if (b[i] == '\n')
+ counts++;
+ }
+ counts++; /* in case the last line has no \n */
+
+ retkeys = malloc(counts * sizeof(unsigned long long));
+ assert(retkeys);
+
+ counts = 0;
+ for (l = strtok(b, "\n"); l; l = strtok(NULL, "\n"))
+ retkeys[counts++] = strtoull(l, NULL, 10);
+ free(b);
+
+ *keys = retkeys;
+
+ return counts;
+}
+
+static int create_map(int map_type, int map_flags, unsigned int size)
+{
+ int map_fd;
+
+ map_fd = bpf_create_map(map_type, sizeof(unsigned long long),
+ sizeof(unsigned long long), size, map_flags);
+
+ if (map_fd == -1)
+ perror("bpf_create_map");
+
+ return map_fd;
+}
+
+static int sched_next_online(int pid, int next_to_try)
+{
+ cpu_set_t cpuset;
+
+ if (next_to_try == nr_cpus)
+ return -1;
+
+ while (next_to_try < nr_cpus) {
+ CPU_ZERO(&cpuset);
+ CPU_SET(next_to_try++, &cpuset);
+ if (!sched_setaffinity(pid, sizeof(cpuset), &cpuset))
+ break;
+ }
+
+ return next_to_try;
+}
+
+static void run_parallel(unsigned int tasks, void (*fn)(int i, void *data),
+ void *data)
+{
+ int next_sched_cpu = 0;
+ pid_t pid[tasks];
+ int i;
+
+ for (i = 0; i < tasks; i++) {
+ pid[i] = fork();
+ if (pid[i] == 0) {
+ next_sched_cpu = sched_next_online(0, next_sched_cpu);
+ fn(i, data);
+ exit(0);
+ } else if (pid[i] == -1) {
+ printf("couldn't spawn #%d process\n", i);
+ exit(1);
+ }
+ /* It is mostly redundant and just allow the parent
+ * process to update next_shced_cpu for the next child
+ * process
+ */
+ next_sched_cpu = sched_next_online(pid[i], next_sched_cpu);
+ }
+ for (i = 0; i < tasks; i++) {
+ int status;
+
+ assert(waitpid(pid[i], &status, 0) == pid[i]);
+ assert(status == 0);
+ }
+}
+
+static void do_test_lru_dist(int task, void *data)
+{
+ unsigned int nr_misses = 0;
+ struct pfect_lru pfect_lru;
+ unsigned long long key, value = 1234;
+ unsigned int i;
+
+ unsigned int lru_map_fd = ((unsigned int *)data)[0];
+ unsigned int lru_size = ((unsigned int *)data)[1];
+ unsigned long long key_offset = task * dist_key_counts;
+
+ pfect_lru_init(&pfect_lru, lru_size, dist_key_counts);
+
+ for (i = 0; i < dist_key_counts; i++) {
+ key = dist_keys[i] + key_offset;
+
+ pfect_lru_lookup_or_insert(&pfect_lru, key);
+
+ if (!bpf_map_lookup_elem(lru_map_fd, &key, &value))
+ continue;
+
+ if (bpf_map_update_elem(lru_map_fd, &key, &value, BPF_NOEXIST)) {
+ printf("bpf_map_update_elem(lru_map_fd, %llu): errno:%d\n",
+ key, errno);
+ assert(0);
+ }
+
+ nr_misses++;
+ }
+
+ printf(" task:%d BPF LRU: nr_unique:%u(/%u) nr_misses:%u(/%u)\n",
+ task, pfect_lru.nr_unique, dist_key_counts, nr_misses,
+ dist_key_counts);
+ printf(" task:%d Perfect LRU: nr_unique:%u(/%u) nr_misses:%u(/%u)\n",
+ task, pfect_lru.nr_unique, pfect_lru.total,
+ pfect_lru.nr_misses, pfect_lru.total);
+
+ pfect_lru_destroy(&pfect_lru);
+ close(lru_map_fd);
+}
+
+static void test_parallel_lru_dist(int map_type, int map_flags,
+ int nr_tasks, unsigned int lru_size)
+{
+ int child_data[2];
+ int lru_map_fd;
+
+ printf("%s (map_type:%d map_flags:0x%X):\n", __func__, map_type,
+ map_flags);
+
+ if (map_flags & BPF_F_NO_COMMON_LRU)
+ lru_map_fd = create_map(map_type, map_flags,
+ nr_cpus * lru_size);
+ else
+ lru_map_fd = create_map(map_type, map_flags,
+ nr_tasks * lru_size);
+ assert(lru_map_fd != -1);
+
+ child_data[0] = lru_map_fd;
+ child_data[1] = lru_size;
+
+ run_parallel(nr_tasks, do_test_lru_dist, child_data);
+
+ close(lru_map_fd);
+}
+
+static void test_lru_loss0(int map_type, int map_flags)
+{
+ unsigned long long key, value[nr_cpus];
+ unsigned int old_unused_losses = 0;
+ unsigned int new_unused_losses = 0;
+ unsigned int used_losses = 0;
+ int map_fd;
+
+ printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+ map_flags);
+
+ assert(sched_next_online(0, 0) != -1);
+
+ if (map_flags & BPF_F_NO_COMMON_LRU)
+ map_fd = create_map(map_type, map_flags, 900 * nr_cpus);
+ else
+ map_fd = create_map(map_type, map_flags, 900);
+
+ assert(map_fd != -1);
+
+ value[0] = 1234;
+
+ for (key = 1; key <= 1000; key++) {
+ int start_key, end_key;
+
+ assert(bpf_map_update_elem(map_fd, &key, value, BPF_NOEXIST) == 0);
+
+ start_key = 101;
+ end_key = min(key, 900);
+
+ while (start_key <= end_key) {
+ bpf_map_lookup_elem(map_fd, &start_key, value);
+ start_key++;
+ }
+ }
+
+ for (key = 1; key <= 1000; key++) {
+ if (bpf_map_lookup_elem(map_fd, &key, value)) {
+ if (key <= 100)
+ old_unused_losses++;
+ else if (key <= 900)
+ used_losses++;
+ else
+ new_unused_losses++;
+ }
+ }
+
+ close(map_fd);
+
+ printf("older-elem-losses:%d(/100) active-elem-losses:%d(/800) "
+ "newer-elem-losses:%d(/100)\n",
+ old_unused_losses, used_losses, new_unused_losses);
+}
+
+static void test_lru_loss1(int map_type, int map_flags)
+{
+ unsigned long long key, value[nr_cpus];
+ int map_fd;
+ unsigned int nr_losses = 0;
+
+ printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+ map_flags);
+
+ assert(sched_next_online(0, 0) != -1);
+
+ if (map_flags & BPF_F_NO_COMMON_LRU)
+ map_fd = create_map(map_type, map_flags, 1000 * nr_cpus);
+ else
+ map_fd = create_map(map_type, map_flags, 1000);
+
+ assert(map_fd != -1);
+
+ value[0] = 1234;
+
+ for (key = 1; key <= 1000; key++)
+ assert(!bpf_map_update_elem(map_fd, &key, value, BPF_NOEXIST));
+
+ for (key = 1; key <= 1000; key++) {
+ if (bpf_map_lookup_elem(map_fd, &key, value))
+ nr_losses++;
+ }
+
+ close(map_fd);
+
+ printf("nr_losses:%d(/1000)\n", nr_losses);
+}
+
+static void do_test_parallel_lru_loss(int task, void *data)
+{
+ const unsigned int nr_stable_elems = 1000;
+ const unsigned int nr_repeats = 100000;
+
+ int map_fd = *(int *)data;
+ unsigned long long stable_base;
+ unsigned long long key, value[nr_cpus];
+ unsigned long long next_ins_key;
+ unsigned int nr_losses = 0;
+ unsigned int i;
+
+ stable_base = task * nr_repeats * 2 + 1;
+ next_ins_key = stable_base;
+ value[0] = 1234;
+ for (i = 0; i < nr_stable_elems; i++) {
+ assert(bpf_map_update_elem(map_fd, &next_ins_key, value,
+ BPF_NOEXIST) == 0);
+ next_ins_key++;
+ }
+
+ for (i = 0; i < nr_repeats; i++) {
+ int rn;
+
+ rn = rand();
+
+ if (rn % 10) {
+ key = rn % nr_stable_elems + stable_base;
+ bpf_map_lookup_elem(map_fd, &key, value);
+ } else {
+ bpf_map_update_elem(map_fd, &next_ins_key, value,
+ BPF_NOEXIST);
+ next_ins_key++;
+ }
+ }
+
+ key = stable_base;
+ for (i = 0; i < nr_stable_elems; i++) {
+ if (bpf_map_lookup_elem(map_fd, &key, value))
+ nr_losses++;
+ key++;
+ }
+
+ printf(" task:%d nr_losses:%u\n", task, nr_losses);
+}
+
+static void test_parallel_lru_loss(int map_type, int map_flags, int nr_tasks)
+{
+ int map_fd;
+
+ printf("%s (map_type:%d map_flags:0x%X):\n", __func__, map_type,
+ map_flags);
+
+ /* Give 20% more than the active working set */
+ if (map_flags & BPF_F_NO_COMMON_LRU)
+ map_fd = create_map(map_type, map_flags,
+ nr_cpus * (1000 + 200));
+ else
+ map_fd = create_map(map_type, map_flags,
+ nr_tasks * (1000 + 200));
+
+ assert(map_fd != -1);
+
+ run_parallel(nr_tasks, do_test_parallel_lru_loss, &map_fd);
+
+ close(map_fd);
+}
+
+int main(int argc, char **argv)
+{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ int map_flags[] = {0, BPF_F_NO_COMMON_LRU};
+ const char *dist_file;
+ int nr_tasks = 1;
+ int lru_size;
+ int f;
+
+ if (argc < 4) {
+ printf("Usage: %s <dist-file> <lru-size> <nr-tasks>\n",
+ argv[0]);
+ return -1;
+ }
+
+ dist_file = argv[1];
+ lru_size = atoi(argv[2]);
+ nr_tasks = atoi(argv[3]);
+
+ setbuf(stdout, NULL);
+
+ assert(!setrlimit(RLIMIT_MEMLOCK, &r));
+
+ srand(time(NULL));
+
+ nr_cpus = bpf_num_possible_cpus();
+ assert(nr_cpus != -1);
+ printf("nr_cpus:%d\n\n", nr_cpus);
+
+ nr_tasks = min(nr_tasks, nr_cpus);
+
+ dist_key_counts = read_keys(dist_file, &dist_keys);
+ if (!dist_key_counts) {
+ printf("%s has no key\n", dist_file);
+ return -1;
+ }
+
+ for (f = 0; f < sizeof(map_flags) / sizeof(*map_flags); f++) {
+ test_lru_loss0(BPF_MAP_TYPE_LRU_HASH, map_flags[f]);
+ test_lru_loss1(BPF_MAP_TYPE_LRU_HASH, map_flags[f]);
+ test_parallel_lru_loss(BPF_MAP_TYPE_LRU_HASH, map_flags[f],
+ nr_tasks);
+ test_parallel_lru_dist(BPF_MAP_TYPE_LRU_HASH, map_flags[f],
+ nr_tasks, lru_size);
+ printf("\n");
+ }
+
+ free(dist_keys);
+
+ return 0;
+}
diff --git a/samples/bpf/test_lwt_bpf.c b/samples/bpf/test_lwt_bpf.c
new file mode 100644
index 000000000..bacc80134
--- /dev/null
+++ b/samples/bpf/test_lwt_bpf.c
@@ -0,0 +1,253 @@
+/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <stdint.h>
+#include <stddef.h>
+#include <linux/bpf.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmpv6.h>
+#include <linux/if_ether.h>
+#include "bpf_helpers.h"
+#include <string.h>
+
+# define printk(fmt, ...) \
+ ({ \
+ char ____fmt[] = fmt; \
+ bpf_trace_printk(____fmt, sizeof(____fmt), \
+ ##__VA_ARGS__); \
+ })
+
+#define CB_MAGIC 1234
+
+/* Test: Pass all packets through */
+SEC("nop")
+int do_nop(struct __sk_buff *skb)
+{
+ return BPF_OK;
+}
+
+/* Test: Verify context information can be accessed */
+SEC("test_ctx")
+int do_test_ctx(struct __sk_buff *skb)
+{
+ skb->cb[0] = CB_MAGIC;
+ printk("len %d hash %d protocol %d\n", skb->len, skb->hash,
+ skb->protocol);
+ printk("cb %d ingress_ifindex %d ifindex %d\n", skb->cb[0],
+ skb->ingress_ifindex, skb->ifindex);
+
+ return BPF_OK;
+}
+
+/* Test: Ensure skb->cb[] buffer is cleared */
+SEC("test_cb")
+int do_test_cb(struct __sk_buff *skb)
+{
+ printk("cb0: %x cb1: %x cb2: %x\n", skb->cb[0], skb->cb[1],
+ skb->cb[2]);
+ printk("cb3: %x cb4: %x\n", skb->cb[3], skb->cb[4]);
+
+ return BPF_OK;
+}
+
+/* Test: Verify skb data can be read */
+SEC("test_data")
+int do_test_data(struct __sk_buff *skb)
+{
+ void *data = (void *)(long)skb->data;
+ void *data_end = (void *)(long)skb->data_end;
+ struct iphdr *iph = data;
+
+ if (data + sizeof(*iph) > data_end) {
+ printk("packet truncated\n");
+ return BPF_DROP;
+ }
+
+ printk("src: %x dst: %x\n", iph->saddr, iph->daddr);
+
+ return BPF_OK;
+}
+
+#define IP_CSUM_OFF offsetof(struct iphdr, check)
+#define IP_DST_OFF offsetof(struct iphdr, daddr)
+#define IP_SRC_OFF offsetof(struct iphdr, saddr)
+#define IP_PROTO_OFF offsetof(struct iphdr, protocol)
+#define TCP_CSUM_OFF offsetof(struct tcphdr, check)
+#define UDP_CSUM_OFF offsetof(struct udphdr, check)
+#define IS_PSEUDO 0x10
+
+static inline int rewrite(struct __sk_buff *skb, uint32_t old_ip,
+ uint32_t new_ip, int rw_daddr)
+{
+ int ret, off = 0, flags = IS_PSEUDO;
+ uint8_t proto;
+
+ ret = bpf_skb_load_bytes(skb, IP_PROTO_OFF, &proto, 1);
+ if (ret < 0) {
+ printk("bpf_l4_csum_replace failed: %d\n", ret);
+ return BPF_DROP;
+ }
+
+ switch (proto) {
+ case IPPROTO_TCP:
+ off = TCP_CSUM_OFF;
+ break;
+
+ case IPPROTO_UDP:
+ off = UDP_CSUM_OFF;
+ flags |= BPF_F_MARK_MANGLED_0;
+ break;
+
+ case IPPROTO_ICMPV6:
+ off = offsetof(struct icmp6hdr, icmp6_cksum);
+ break;
+ }
+
+ if (off) {
+ ret = bpf_l4_csum_replace(skb, off, old_ip, new_ip,
+ flags | sizeof(new_ip));
+ if (ret < 0) {
+ printk("bpf_l4_csum_replace failed: %d\n");
+ return BPF_DROP;
+ }
+ }
+
+ ret = bpf_l3_csum_replace(skb, IP_CSUM_OFF, old_ip, new_ip, sizeof(new_ip));
+ if (ret < 0) {
+ printk("bpf_l3_csum_replace failed: %d\n", ret);
+ return BPF_DROP;
+ }
+
+ if (rw_daddr)
+ ret = bpf_skb_store_bytes(skb, IP_DST_OFF, &new_ip, sizeof(new_ip), 0);
+ else
+ ret = bpf_skb_store_bytes(skb, IP_SRC_OFF, &new_ip, sizeof(new_ip), 0);
+
+ if (ret < 0) {
+ printk("bpf_skb_store_bytes() failed: %d\n", ret);
+ return BPF_DROP;
+ }
+
+ return BPF_OK;
+}
+
+/* Test: Verify skb data can be modified */
+SEC("test_rewrite")
+int do_test_rewrite(struct __sk_buff *skb)
+{
+ uint32_t old_ip, new_ip = 0x3fea8c0;
+ int ret;
+
+ ret = bpf_skb_load_bytes(skb, IP_DST_OFF, &old_ip, 4);
+ if (ret < 0) {
+ printk("bpf_skb_load_bytes failed: %d\n", ret);
+ return BPF_DROP;
+ }
+
+ if (old_ip == 0x2fea8c0) {
+ printk("out: rewriting from %x to %x\n", old_ip, new_ip);
+ return rewrite(skb, old_ip, new_ip, 1);
+ }
+
+ return BPF_OK;
+}
+
+static inline int __do_push_ll_and_redirect(struct __sk_buff *skb)
+{
+ uint64_t smac = SRC_MAC, dmac = DST_MAC;
+ int ret, ifindex = DST_IFINDEX;
+ struct ethhdr ehdr;
+
+ ret = bpf_skb_change_head(skb, 14, 0);
+ if (ret < 0) {
+ printk("skb_change_head() failed: %d\n", ret);
+ }
+
+ ehdr.h_proto = __constant_htons(ETH_P_IP);
+ memcpy(&ehdr.h_source, &smac, 6);
+ memcpy(&ehdr.h_dest, &dmac, 6);
+
+ ret = bpf_skb_store_bytes(skb, 0, &ehdr, sizeof(ehdr), 0);
+ if (ret < 0) {
+ printk("skb_store_bytes() failed: %d\n", ret);
+ return BPF_DROP;
+ }
+
+ return bpf_redirect(ifindex, 0);
+}
+
+SEC("push_ll_and_redirect_silent")
+int do_push_ll_and_redirect_silent(struct __sk_buff *skb)
+{
+ return __do_push_ll_and_redirect(skb);
+}
+
+SEC("push_ll_and_redirect")
+int do_push_ll_and_redirect(struct __sk_buff *skb)
+{
+ int ret, ifindex = DST_IFINDEX;
+
+ ret = __do_push_ll_and_redirect(skb);
+ if (ret >= 0)
+ printk("redirected to %d\n", ifindex);
+
+ return ret;
+}
+
+static inline void __fill_garbage(struct __sk_buff *skb)
+{
+ uint64_t f = 0xFFFFFFFFFFFFFFFF;
+
+ bpf_skb_store_bytes(skb, 0, &f, sizeof(f), 0);
+ bpf_skb_store_bytes(skb, 8, &f, sizeof(f), 0);
+ bpf_skb_store_bytes(skb, 16, &f, sizeof(f), 0);
+ bpf_skb_store_bytes(skb, 24, &f, sizeof(f), 0);
+ bpf_skb_store_bytes(skb, 32, &f, sizeof(f), 0);
+ bpf_skb_store_bytes(skb, 40, &f, sizeof(f), 0);
+ bpf_skb_store_bytes(skb, 48, &f, sizeof(f), 0);
+ bpf_skb_store_bytes(skb, 56, &f, sizeof(f), 0);
+ bpf_skb_store_bytes(skb, 64, &f, sizeof(f), 0);
+ bpf_skb_store_bytes(skb, 72, &f, sizeof(f), 0);
+ bpf_skb_store_bytes(skb, 80, &f, sizeof(f), 0);
+ bpf_skb_store_bytes(skb, 88, &f, sizeof(f), 0);
+}
+
+SEC("fill_garbage")
+int do_fill_garbage(struct __sk_buff *skb)
+{
+ __fill_garbage(skb);
+ printk("Set initial 96 bytes of header to FF\n");
+ return BPF_OK;
+}
+
+SEC("fill_garbage_and_redirect")
+int do_fill_garbage_and_redirect(struct __sk_buff *skb)
+{
+ int ifindex = DST_IFINDEX;
+ __fill_garbage(skb);
+ printk("redirected to %d\n", ifindex);
+ return bpf_redirect(ifindex, 0);
+}
+
+/* Drop all packets */
+SEC("drop_all")
+int do_drop_all(struct __sk_buff *skb)
+{
+ printk("dropping with: %d\n", BPF_DROP);
+ return BPF_DROP;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/test_lwt_bpf.sh b/samples/bpf/test_lwt_bpf.sh
new file mode 100755
index 000000000..65a976058
--- /dev/null
+++ b/samples/bpf/test_lwt_bpf.sh
@@ -0,0 +1,400 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Uncomment to see generated bytecode
+#VERBOSE=verbose
+
+NS1=lwt_ns1
+NS2=lwt_ns2
+VETH0=tst_lwt1a
+VETH1=tst_lwt1b
+VETH2=tst_lwt2a
+VETH3=tst_lwt2b
+IPVETH0="192.168.254.1"
+IPVETH1="192.168.254.2"
+IPVETH1b="192.168.254.3"
+
+IPVETH2="192.168.111.1"
+IPVETH3="192.168.111.2"
+
+IP_LOCAL="192.168.99.1"
+
+TRACE_ROOT=/sys/kernel/debug/tracing
+
+function lookup_mac()
+{
+ set +x
+ if [ ! -z "$2" ]; then
+ MAC=$(ip netns exec $2 ip link show $1 | grep ether | awk '{print $2}')
+ else
+ MAC=$(ip link show $1 | grep ether | awk '{print $2}')
+ fi
+ MAC="${MAC//:/}"
+ echo "0x${MAC:10:2}${MAC:8:2}${MAC:6:2}${MAC:4:2}${MAC:2:2}${MAC:0:2}"
+ set -x
+}
+
+function cleanup {
+ set +ex
+ rm test_lwt_bpf.o 2> /dev/null
+ ip link del $VETH0 2> /dev/null
+ ip link del $VETH1 2> /dev/null
+ ip link del $VETH2 2> /dev/null
+ ip link del $VETH3 2> /dev/null
+ ip netns exec $NS1 killall netserver
+ ip netns delete $NS1 2> /dev/null
+ ip netns delete $NS2 2> /dev/null
+ set -ex
+}
+
+function setup_one_veth {
+ ip netns add $1
+ ip link add $2 type veth peer name $3
+ ip link set dev $2 up
+ ip addr add $4/24 dev $2
+ ip link set $3 netns $1
+ ip netns exec $1 ip link set dev $3 up
+ ip netns exec $1 ip addr add $5/24 dev $3
+
+ if [ "$6" ]; then
+ ip netns exec $1 ip addr add $6/32 dev $3
+ fi
+}
+
+function get_trace {
+ set +x
+ cat ${TRACE_ROOT}/trace | grep -v '^#'
+ set -x
+}
+
+function cleanup_routes {
+ ip route del ${IPVETH1}/32 dev $VETH0 2> /dev/null || true
+ ip route del table local local ${IP_LOCAL}/32 dev lo 2> /dev/null || true
+}
+
+function install_test {
+ cleanup_routes
+ cp /dev/null ${TRACE_ROOT}/trace
+
+ OPTS="encap bpf headroom 14 $1 obj test_lwt_bpf.o section $2 $VERBOSE"
+
+ if [ "$1" == "in" ]; then
+ ip route add table local local ${IP_LOCAL}/32 $OPTS dev lo
+ else
+ ip route add ${IPVETH1}/32 $OPTS dev $VETH0
+ fi
+}
+
+function remove_prog {
+ if [ "$1" == "in" ]; then
+ ip route del table local local ${IP_LOCAL}/32 dev lo
+ else
+ ip route del ${IPVETH1}/32 dev $VETH0
+ fi
+}
+
+function filter_trace {
+ # Add newline to allow starting EXPECT= variables on newline
+ NL=$'\n'
+ echo "${NL}$*" | sed -e 's/^.*: : //g'
+}
+
+function expect_fail {
+ set +x
+ echo "FAIL:"
+ echo "Expected: $1"
+ echo "Got: $2"
+ set -x
+ exit 1
+}
+
+function match_trace {
+ set +x
+ RET=0
+ TRACE=$1
+ EXPECT=$2
+ GOT="$(filter_trace "$TRACE")"
+
+ [ "$GOT" != "$EXPECT" ] && {
+ expect_fail "$EXPECT" "$GOT"
+ RET=1
+ }
+ set -x
+ return $RET
+}
+
+function test_start {
+ set +x
+ echo "----------------------------------------------------------------"
+ echo "Starting test: $*"
+ echo "----------------------------------------------------------------"
+ set -x
+}
+
+function failure {
+ get_trace
+ echo "FAIL: $*"
+ exit 1
+}
+
+function test_ctx_xmit {
+ test_start "test_ctx on lwt xmit"
+ install_test xmit test_ctx
+ ping -c 3 $IPVETH1 || {
+ failure "test_ctx xmit: packets are dropped"
+ }
+ match_trace "$(get_trace)" "
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 0 ifindex $DST_IFINDEX
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 0 ifindex $DST_IFINDEX
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 0 ifindex $DST_IFINDEX" || exit 1
+ remove_prog xmit
+}
+
+function test_ctx_out {
+ test_start "test_ctx on lwt out"
+ install_test out test_ctx
+ ping -c 3 $IPVETH1 || {
+ failure "test_ctx out: packets are dropped"
+ }
+ match_trace "$(get_trace)" "
+len 84 hash 0 protocol 0
+cb 1234 ingress_ifindex 0 ifindex 0
+len 84 hash 0 protocol 0
+cb 1234 ingress_ifindex 0 ifindex 0
+len 84 hash 0 protocol 0
+cb 1234 ingress_ifindex 0 ifindex 0" || exit 1
+ remove_prog out
+}
+
+function test_ctx_in {
+ test_start "test_ctx on lwt in"
+ install_test in test_ctx
+ ping -c 3 $IP_LOCAL || {
+ failure "test_ctx out: packets are dropped"
+ }
+ # We will both request & reply packets as the packets will
+ # be from $IP_LOCAL => $IP_LOCAL
+ match_trace "$(get_trace)" "
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 1 ifindex 1
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 1 ifindex 1
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 1 ifindex 1
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 1 ifindex 1
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 1 ifindex 1
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 1 ifindex 1" || exit 1
+ remove_prog in
+}
+
+function test_data {
+ test_start "test_data on lwt $1"
+ install_test $1 test_data
+ ping -c 3 $IPVETH1 || {
+ failure "test_data ${1}: packets are dropped"
+ }
+ match_trace "$(get_trace)" "
+src: 1fea8c0 dst: 2fea8c0
+src: 1fea8c0 dst: 2fea8c0
+src: 1fea8c0 dst: 2fea8c0" || exit 1
+ remove_prog $1
+}
+
+function test_data_in {
+ test_start "test_data on lwt in"
+ install_test in test_data
+ ping -c 3 $IP_LOCAL || {
+ failure "test_data in: packets are dropped"
+ }
+ # We will both request & reply packets as the packets will
+ # be from $IP_LOCAL => $IP_LOCAL
+ match_trace "$(get_trace)" "
+src: 163a8c0 dst: 163a8c0
+src: 163a8c0 dst: 163a8c0
+src: 163a8c0 dst: 163a8c0
+src: 163a8c0 dst: 163a8c0
+src: 163a8c0 dst: 163a8c0
+src: 163a8c0 dst: 163a8c0" || exit 1
+ remove_prog in
+}
+
+function test_cb {
+ test_start "test_cb on lwt $1"
+ install_test $1 test_cb
+ ping -c 3 $IPVETH1 || {
+ failure "test_cb ${1}: packets are dropped"
+ }
+ match_trace "$(get_trace)" "
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0" || exit 1
+ remove_prog $1
+}
+
+function test_cb_in {
+ test_start "test_cb on lwt in"
+ install_test in test_cb
+ ping -c 3 $IP_LOCAL || {
+ failure "test_cb in: packets are dropped"
+ }
+ # We will both request & reply packets as the packets will
+ # be from $IP_LOCAL => $IP_LOCAL
+ match_trace "$(get_trace)" "
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0" || exit 1
+ remove_prog in
+}
+
+function test_drop_all {
+ test_start "test_drop_all on lwt $1"
+ install_test $1 drop_all
+ ping -c 3 $IPVETH1 && {
+ failure "test_drop_all ${1}: Unexpected success of ping"
+ }
+ match_trace "$(get_trace)" "
+dropping with: 2
+dropping with: 2
+dropping with: 2" || exit 1
+ remove_prog $1
+}
+
+function test_drop_all_in {
+ test_start "test_drop_all on lwt in"
+ install_test in drop_all
+ ping -c 3 $IP_LOCAL && {
+ failure "test_drop_all in: Unexpected success of ping"
+ }
+ match_trace "$(get_trace)" "
+dropping with: 2
+dropping with: 2
+dropping with: 2" || exit 1
+ remove_prog in
+}
+
+function test_push_ll_and_redirect {
+ test_start "test_push_ll_and_redirect on lwt xmit"
+ install_test xmit push_ll_and_redirect
+ ping -c 3 $IPVETH1 || {
+ failure "Redirected packets appear to be dropped"
+ }
+ match_trace "$(get_trace)" "
+redirected to $DST_IFINDEX
+redirected to $DST_IFINDEX
+redirected to $DST_IFINDEX" || exit 1
+ remove_prog xmit
+}
+
+function test_no_l2_and_redirect {
+ test_start "test_no_l2_and_redirect on lwt xmit"
+ install_test xmit fill_garbage_and_redirect
+ ping -c 3 $IPVETH1 && {
+ failure "Unexpected success despite lack of L2 header"
+ }
+ match_trace "$(get_trace)" "
+redirected to $DST_IFINDEX
+redirected to $DST_IFINDEX
+redirected to $DST_IFINDEX" || exit 1
+ remove_prog xmit
+}
+
+function test_rewrite {
+ test_start "test_rewrite on lwt xmit"
+ install_test xmit test_rewrite
+ ping -c 3 $IPVETH1 || {
+ failure "Rewritten packets appear to be dropped"
+ }
+ match_trace "$(get_trace)" "
+out: rewriting from 2fea8c0 to 3fea8c0
+out: rewriting from 2fea8c0 to 3fea8c0
+out: rewriting from 2fea8c0 to 3fea8c0" || exit 1
+ remove_prog out
+}
+
+function test_fill_garbage {
+ test_start "test_fill_garbage on lwt xmit"
+ install_test xmit fill_garbage
+ ping -c 3 $IPVETH1 && {
+ failure "test_drop_all ${1}: Unexpected success of ping"
+ }
+ match_trace "$(get_trace)" "
+Set initial 96 bytes of header to FF
+Set initial 96 bytes of header to FF
+Set initial 96 bytes of header to FF" || exit 1
+ remove_prog xmit
+}
+
+function test_netperf_nop {
+ test_start "test_netperf_nop on lwt xmit"
+ install_test xmit nop
+ netperf -H $IPVETH1 -t TCP_STREAM || {
+ failure "packets appear to be dropped"
+ }
+ match_trace "$(get_trace)" ""|| exit 1
+ remove_prog xmit
+}
+
+function test_netperf_redirect {
+ test_start "test_netperf_redirect on lwt xmit"
+ install_test xmit push_ll_and_redirect_silent
+ netperf -H $IPVETH1 -t TCP_STREAM || {
+ failure "Rewritten packets appear to be dropped"
+ }
+ match_trace "$(get_trace)" ""|| exit 1
+ remove_prog xmit
+}
+
+cleanup
+setup_one_veth $NS1 $VETH0 $VETH1 $IPVETH0 $IPVETH1 $IPVETH1b
+setup_one_veth $NS2 $VETH2 $VETH3 $IPVETH2 $IPVETH3
+ip netns exec $NS1 netserver
+echo 1 > ${TRACE_ROOT}/tracing_on
+
+DST_MAC=$(lookup_mac $VETH1 $NS1)
+SRC_MAC=$(lookup_mac $VETH0)
+DST_IFINDEX=$(cat /sys/class/net/$VETH0/ifindex)
+
+CLANG_OPTS="-O2 -target bpf -I ../include/"
+CLANG_OPTS+=" -DSRC_MAC=$SRC_MAC -DDST_MAC=$DST_MAC -DDST_IFINDEX=$DST_IFINDEX"
+clang $CLANG_OPTS -c test_lwt_bpf.c -o test_lwt_bpf.o
+
+test_ctx_xmit
+test_ctx_out
+test_ctx_in
+test_data "xmit"
+test_data "out"
+test_data_in
+test_cb "xmit"
+test_cb "out"
+test_cb_in
+test_drop_all "xmit"
+test_drop_all "out"
+test_drop_all_in
+test_rewrite
+test_push_ll_and_redirect
+test_no_l2_and_redirect
+test_fill_garbage
+test_netperf_nop
+test_netperf_redirect
+
+cleanup
+echo 0 > ${TRACE_ROOT}/tracing_on
+exit 0
diff --git a/samples/bpf/test_map_in_map_kern.c b/samples/bpf/test_map_in_map_kern.c
new file mode 100644
index 000000000..42c44d091
--- /dev/null
+++ b/samples/bpf/test_map_in_map_kern.c
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#define KBUILD_MODNAME "foo"
+#include <linux/ptrace.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/in6.h>
+#include "bpf_helpers.h"
+
+#define MAX_NR_PORTS 65536
+
+/* map #0 */
+struct bpf_map_def SEC("maps") port_a = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(int),
+ .max_entries = MAX_NR_PORTS,
+};
+
+/* map #1 */
+struct bpf_map_def SEC("maps") port_h = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(int),
+ .max_entries = 1,
+};
+
+/* map #2 */
+struct bpf_map_def SEC("maps") reg_result_h = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(int),
+ .max_entries = 1,
+};
+
+/* map #3 */
+struct bpf_map_def SEC("maps") inline_result_h = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(int),
+ .max_entries = 1,
+};
+
+/* map #4 */ /* Test case #0 */
+struct bpf_map_def SEC("maps") a_of_port_a = {
+ .type = BPF_MAP_TYPE_ARRAY_OF_MAPS,
+ .key_size = sizeof(u32),
+ .inner_map_idx = 0, /* map_fd[0] is port_a */
+ .max_entries = MAX_NR_PORTS,
+};
+
+/* map #5 */ /* Test case #1 */
+struct bpf_map_def SEC("maps") h_of_port_a = {
+ .type = BPF_MAP_TYPE_HASH_OF_MAPS,
+ .key_size = sizeof(u32),
+ .inner_map_idx = 0, /* map_fd[0] is port_a */
+ .max_entries = 1,
+};
+
+/* map #6 */ /* Test case #2 */
+struct bpf_map_def SEC("maps") h_of_port_h = {
+ .type = BPF_MAP_TYPE_HASH_OF_MAPS,
+ .key_size = sizeof(u32),
+ .inner_map_idx = 1, /* map_fd[1] is port_h */
+ .max_entries = 1,
+};
+
+static __always_inline int do_reg_lookup(void *inner_map, u32 port)
+{
+ int *result;
+
+ result = bpf_map_lookup_elem(inner_map, &port);
+ return result ? *result : -ENOENT;
+}
+
+static __always_inline int do_inline_array_lookup(void *inner_map, u32 port)
+{
+ int *result;
+
+ if (inner_map != &port_a)
+ return -EINVAL;
+
+ result = bpf_map_lookup_elem(&port_a, &port);
+ return result ? *result : -ENOENT;
+}
+
+static __always_inline int do_inline_hash_lookup(void *inner_map, u32 port)
+{
+ int *result;
+
+ if (inner_map != &port_h)
+ return -EINVAL;
+
+ result = bpf_map_lookup_elem(&port_h, &port);
+ return result ? *result : -ENOENT;
+}
+
+SEC("kprobe/sys_connect")
+int trace_sys_connect(struct pt_regs *ctx)
+{
+ struct sockaddr_in6 *in6;
+ u16 test_case, port, dst6[8];
+ int addrlen, ret, inline_ret, ret_key = 0;
+ u32 port_key;
+ void *outer_map, *inner_map;
+ bool inline_hash = false;
+
+ in6 = (struct sockaddr_in6 *)PT_REGS_PARM2(ctx);
+ addrlen = (int)PT_REGS_PARM3(ctx);
+
+ if (addrlen != sizeof(*in6))
+ return 0;
+
+ ret = bpf_probe_read(dst6, sizeof(dst6), &in6->sin6_addr);
+ if (ret) {
+ inline_ret = ret;
+ goto done;
+ }
+
+ if (dst6[0] != 0xdead || dst6[1] != 0xbeef)
+ return 0;
+
+ test_case = dst6[7];
+
+ ret = bpf_probe_read(&port, sizeof(port), &in6->sin6_port);
+ if (ret) {
+ inline_ret = ret;
+ goto done;
+ }
+
+ port_key = port;
+
+ ret = -ENOENT;
+ if (test_case == 0) {
+ outer_map = &a_of_port_a;
+ } else if (test_case == 1) {
+ outer_map = &h_of_port_a;
+ } else if (test_case == 2) {
+ outer_map = &h_of_port_h;
+ } else {
+ ret = __LINE__;
+ inline_ret = ret;
+ goto done;
+ }
+
+ inner_map = bpf_map_lookup_elem(outer_map, &port_key);
+ if (!inner_map) {
+ ret = __LINE__;
+ inline_ret = ret;
+ goto done;
+ }
+
+ ret = do_reg_lookup(inner_map, port_key);
+
+ if (test_case == 0 || test_case == 1)
+ inline_ret = do_inline_array_lookup(inner_map, port_key);
+ else
+ inline_ret = do_inline_hash_lookup(inner_map, port_key);
+
+done:
+ bpf_map_update_elem(&reg_result_h, &ret_key, &ret, BPF_ANY);
+ bpf_map_update_elem(&inline_result_h, &ret_key, &inline_ret, BPF_ANY);
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/test_map_in_map_user.c b/samples/bpf/test_map_in_map_user.c
new file mode 100644
index 000000000..e308858f7
--- /dev/null
+++ b/samples/bpf/test_map_in_map_user.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <sys/resource.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
+#include <stdint.h>
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <bpf/bpf.h>
+#include "bpf_load.h"
+
+#define PORT_A (map_fd[0])
+#define PORT_H (map_fd[1])
+#define REG_RESULT_H (map_fd[2])
+#define INLINE_RESULT_H (map_fd[3])
+#define A_OF_PORT_A (map_fd[4]) /* Test case #0 */
+#define H_OF_PORT_A (map_fd[5]) /* Test case #1 */
+#define H_OF_PORT_H (map_fd[6]) /* Test case #2 */
+
+static const char * const test_names[] = {
+ "Array of Array",
+ "Hash of Array",
+ "Hash of Hash",
+};
+
+#define NR_TESTS (sizeof(test_names) / sizeof(*test_names))
+
+static void check_map_id(int inner_map_fd, int map_in_map_fd, uint32_t key)
+{
+ struct bpf_map_info info = {};
+ uint32_t info_len = sizeof(info);
+ int ret, id;
+
+ ret = bpf_obj_get_info_by_fd(inner_map_fd, &info, &info_len);
+ assert(!ret);
+
+ ret = bpf_map_lookup_elem(map_in_map_fd, &key, &id);
+ assert(!ret);
+ assert(id == info.id);
+}
+
+static void populate_map(uint32_t port_key, int magic_result)
+{
+ int ret;
+
+ ret = bpf_map_update_elem(PORT_A, &port_key, &magic_result, BPF_ANY);
+ assert(!ret);
+
+ ret = bpf_map_update_elem(PORT_H, &port_key, &magic_result,
+ BPF_NOEXIST);
+ assert(!ret);
+
+ ret = bpf_map_update_elem(A_OF_PORT_A, &port_key, &PORT_A, BPF_ANY);
+ assert(!ret);
+ check_map_id(PORT_A, A_OF_PORT_A, port_key);
+
+ ret = bpf_map_update_elem(H_OF_PORT_A, &port_key, &PORT_A, BPF_NOEXIST);
+ assert(!ret);
+ check_map_id(PORT_A, H_OF_PORT_A, port_key);
+
+ ret = bpf_map_update_elem(H_OF_PORT_H, &port_key, &PORT_H, BPF_NOEXIST);
+ assert(!ret);
+ check_map_id(PORT_H, H_OF_PORT_H, port_key);
+}
+
+static void test_map_in_map(void)
+{
+ struct sockaddr_in6 in6 = { .sin6_family = AF_INET6 };
+ uint32_t result_key = 0, port_key;
+ int result, inline_result;
+ int magic_result = 0xfaceb00c;
+ int ret;
+ int i;
+
+ port_key = rand() & 0x00FF;
+ populate_map(port_key, magic_result);
+
+ in6.sin6_addr.s6_addr16[0] = 0xdead;
+ in6.sin6_addr.s6_addr16[1] = 0xbeef;
+ in6.sin6_port = port_key;
+
+ for (i = 0; i < NR_TESTS; i++) {
+ printf("%s: ", test_names[i]);
+
+ in6.sin6_addr.s6_addr16[7] = i;
+ ret = connect(-1, (struct sockaddr *)&in6, sizeof(in6));
+ assert(ret == -1 && errno == EBADF);
+
+ ret = bpf_map_lookup_elem(REG_RESULT_H, &result_key, &result);
+ assert(!ret);
+
+ ret = bpf_map_lookup_elem(INLINE_RESULT_H, &result_key,
+ &inline_result);
+ assert(!ret);
+
+ if (result != magic_result || inline_result != magic_result) {
+ printf("Error. result:%d inline_result:%d\n",
+ result, inline_result);
+ exit(1);
+ }
+
+ bpf_map_delete_elem(REG_RESULT_H, &result_key);
+ bpf_map_delete_elem(INLINE_RESULT_H, &result_key);
+
+ printf("Pass\n");
+ }
+}
+
+int main(int argc, char **argv)
+{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ char filename[256];
+
+ assert(!setrlimit(RLIMIT_MEMLOCK, &r));
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ test_map_in_map();
+
+ return 0;
+}
diff --git a/samples/bpf/test_overhead_kprobe_kern.c b/samples/bpf/test_overhead_kprobe_kern.c
new file mode 100644
index 000000000..468a66a92
--- /dev/null
+++ b/samples/bpf/test_overhead_kprobe_kern.c
@@ -0,0 +1,41 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/version.h>
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;})
+
+SEC("kprobe/__set_task_comm")
+int prog(struct pt_regs *ctx)
+{
+ struct signal_struct *signal;
+ struct task_struct *tsk;
+ char oldcomm[16] = {};
+ char newcomm[16] = {};
+ u16 oom_score_adj;
+ u32 pid;
+
+ tsk = (void *)PT_REGS_PARM1(ctx);
+
+ pid = _(tsk->pid);
+ bpf_probe_read(oldcomm, sizeof(oldcomm), &tsk->comm);
+ bpf_probe_read(newcomm, sizeof(newcomm), (void *)PT_REGS_PARM2(ctx));
+ signal = _(tsk->signal);
+ oom_score_adj = _(signal->oom_score_adj);
+ return 0;
+}
+
+SEC("kprobe/urandom_read")
+int prog2(struct pt_regs *ctx)
+{
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/test_overhead_raw_tp_kern.c b/samples/bpf/test_overhead_raw_tp_kern.c
new file mode 100644
index 000000000..d2af8bc1c
--- /dev/null
+++ b/samples/bpf/test_overhead_raw_tp_kern.c
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018 Facebook */
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+SEC("raw_tracepoint/task_rename")
+int prog(struct bpf_raw_tracepoint_args *ctx)
+{
+ return 0;
+}
+
+SEC("raw_tracepoint/urandom_read")
+int prog2(struct bpf_raw_tracepoint_args *ctx)
+{
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/test_overhead_tp_kern.c b/samples/bpf/test_overhead_tp_kern.c
new file mode 100644
index 000000000..38f5c0b9d
--- /dev/null
+++ b/samples/bpf/test_overhead_tp_kern.c
@@ -0,0 +1,36 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+/* from /sys/kernel/debug/tracing/events/task/task_rename/format */
+struct task_rename {
+ __u64 pad;
+ __u32 pid;
+ char oldcomm[16];
+ char newcomm[16];
+ __u16 oom_score_adj;
+};
+SEC("tracepoint/task/task_rename")
+int prog(struct task_rename *ctx)
+{
+ return 0;
+}
+
+/* from /sys/kernel/debug/tracing/events/random/urandom_read/format */
+struct urandom_read {
+ __u64 pad;
+ int got_bits;
+ int pool_left;
+ int input_left;
+};
+SEC("tracepoint/random/urandom_read")
+int prog2(struct urandom_read *ctx)
+{
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/test_overhead_user.c b/samples/bpf/test_overhead_user.c
new file mode 100644
index 000000000..9d6dcaa9d
--- /dev/null
+++ b/samples/bpf/test_overhead_user.c
@@ -0,0 +1,185 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#define _GNU_SOURCE
+#include <sched.h>
+#include <errno.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <asm/unistd.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <assert.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/bpf.h>
+#include <string.h>
+#include <time.h>
+#include <sys/resource.h>
+#include <bpf/bpf.h>
+#include "bpf_load.h"
+
+#define MAX_CNT 1000000
+
+static __u64 time_get_ns(void)
+{
+ struct timespec ts;
+
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ return ts.tv_sec * 1000000000ull + ts.tv_nsec;
+}
+
+static void test_task_rename(int cpu)
+{
+ __u64 start_time;
+ char buf[] = "test\n";
+ int i, fd;
+
+ fd = open("/proc/self/comm", O_WRONLY|O_TRUNC);
+ if (fd < 0) {
+ printf("couldn't open /proc\n");
+ exit(1);
+ }
+ start_time = time_get_ns();
+ for (i = 0; i < MAX_CNT; i++) {
+ if (write(fd, buf, sizeof(buf)) < 0) {
+ printf("task rename failed: %s\n", strerror(errno));
+ close(fd);
+ return;
+ }
+ }
+ printf("task_rename:%d: %lld events per sec\n",
+ cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
+ close(fd);
+}
+
+static void test_urandom_read(int cpu)
+{
+ __u64 start_time;
+ char buf[4];
+ int i, fd;
+
+ fd = open("/dev/urandom", O_RDONLY);
+ if (fd < 0) {
+ printf("couldn't open /dev/urandom\n");
+ exit(1);
+ }
+ start_time = time_get_ns();
+ for (i = 0; i < MAX_CNT; i++) {
+ if (read(fd, buf, sizeof(buf)) < 0) {
+ printf("failed to read from /dev/urandom: %s\n", strerror(errno));
+ close(fd);
+ return;
+ }
+ }
+ printf("urandom_read:%d: %lld events per sec\n",
+ cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
+ close(fd);
+}
+
+static void loop(int cpu, int flags)
+{
+ cpu_set_t cpuset;
+
+ CPU_ZERO(&cpuset);
+ CPU_SET(cpu, &cpuset);
+ sched_setaffinity(0, sizeof(cpuset), &cpuset);
+
+ if (flags & 1)
+ test_task_rename(cpu);
+ if (flags & 2)
+ test_urandom_read(cpu);
+}
+
+static void run_perf_test(int tasks, int flags)
+{
+ pid_t pid[tasks];
+ int i;
+
+ for (i = 0; i < tasks; i++) {
+ pid[i] = fork();
+ if (pid[i] == 0) {
+ loop(i, flags);
+ exit(0);
+ } else if (pid[i] == -1) {
+ printf("couldn't spawn #%d process\n", i);
+ exit(1);
+ }
+ }
+ for (i = 0; i < tasks; i++) {
+ int status;
+
+ assert(waitpid(pid[i], &status, 0) == pid[i]);
+ assert(status == 0);
+ }
+}
+
+static void unload_progs(void)
+{
+ close(prog_fd[0]);
+ close(prog_fd[1]);
+ close(event_fd[0]);
+ close(event_fd[1]);
+}
+
+int main(int argc, char **argv)
+{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ char filename[256];
+ int num_cpu = 8;
+ int test_flags = ~0;
+
+ setrlimit(RLIMIT_MEMLOCK, &r);
+
+ if (argc > 1)
+ test_flags = atoi(argv[1]) ? : test_flags;
+ if (argc > 2)
+ num_cpu = atoi(argv[2]) ? : num_cpu;
+
+ if (test_flags & 0x3) {
+ printf("BASE\n");
+ run_perf_test(num_cpu, test_flags);
+ }
+
+ if (test_flags & 0xC) {
+ snprintf(filename, sizeof(filename),
+ "%s_kprobe_kern.o", argv[0]);
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+ printf("w/KPROBE\n");
+ run_perf_test(num_cpu, test_flags >> 2);
+ unload_progs();
+ }
+
+ if (test_flags & 0x30) {
+ snprintf(filename, sizeof(filename),
+ "%s_tp_kern.o", argv[0]);
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+ printf("w/TRACEPOINT\n");
+ run_perf_test(num_cpu, test_flags >> 4);
+ unload_progs();
+ }
+
+ if (test_flags & 0xC0) {
+ snprintf(filename, sizeof(filename),
+ "%s_raw_tp_kern.o", argv[0]);
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+ printf("w/RAW_TRACEPOINT\n");
+ run_perf_test(num_cpu, test_flags >> 6);
+ unload_progs();
+ }
+
+ return 0;
+}
diff --git a/samples/bpf/test_override_return.sh b/samples/bpf/test_override_return.sh
new file mode 100755
index 000000000..35db26f73
--- /dev/null
+++ b/samples/bpf/test_override_return.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+rm -r tmpmnt
+rm -f testfile.img
+dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1
+DEVICE=$(losetup --show -f testfile.img)
+mkfs.btrfs -f $DEVICE
+mkdir tmpmnt
+./tracex7 $DEVICE
+if [ $? -eq 0 ]
+then
+ echo "SUCCESS!"
+else
+ echo "FAILED!"
+fi
+losetup -d $DEVICE
diff --git a/samples/bpf/test_probe_write_user_kern.c b/samples/bpf/test_probe_write_user_kern.c
new file mode 100644
index 000000000..3a677c807
--- /dev/null
+++ b/samples/bpf/test_probe_write_user_kern.c
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 Sargun Dhillon <sargun@sargun.me>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <linux/version.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") dnat_map = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(struct sockaddr_in),
+ .value_size = sizeof(struct sockaddr_in),
+ .max_entries = 256,
+};
+
+/* kprobe is NOT a stable ABI
+ * kernel functions can be removed, renamed or completely change semantics.
+ * Number of arguments and their positions can change, etc.
+ * In such case this bpf+kprobe example will no longer be meaningful
+ *
+ * This example sits on a syscall, and the syscall ABI is relatively stable
+ * of course, across platforms, and over time, the ABI may change.
+ */
+SEC("kprobe/sys_connect")
+int bpf_prog1(struct pt_regs *ctx)
+{
+ struct sockaddr_in new_addr, orig_addr = {};
+ struct sockaddr_in *mapped_addr;
+ void *sockaddr_arg = (void *)PT_REGS_PARM2(ctx);
+ int sockaddr_len = (int)PT_REGS_PARM3(ctx);
+
+ if (sockaddr_len > sizeof(orig_addr))
+ return 0;
+
+ if (bpf_probe_read(&orig_addr, sizeof(orig_addr), sockaddr_arg) != 0)
+ return 0;
+
+ mapped_addr = bpf_map_lookup_elem(&dnat_map, &orig_addr);
+ if (mapped_addr != NULL) {
+ memcpy(&new_addr, mapped_addr, sizeof(new_addr));
+ bpf_probe_write_user(sockaddr_arg, &new_addr,
+ sizeof(new_addr));
+ }
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/test_probe_write_user_user.c b/samples/bpf/test_probe_write_user_user.c
new file mode 100644
index 000000000..045eb5e30
--- /dev/null
+++ b/samples/bpf/test_probe_write_user_user.c
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <assert.h>
+#include <linux/bpf.h>
+#include <unistd.h>
+#include <bpf/bpf.h>
+#include "bpf_load.h"
+#include <sys/socket.h>
+#include <string.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+int main(int ac, char **argv)
+{
+ int serverfd, serverconnfd, clientfd;
+ socklen_t sockaddr_len;
+ struct sockaddr serv_addr, mapped_addr, tmp_addr;
+ struct sockaddr_in *serv_addr_in, *mapped_addr_in, *tmp_addr_in;
+ char filename[256];
+ char *ip;
+
+ serv_addr_in = (struct sockaddr_in *)&serv_addr;
+ mapped_addr_in = (struct sockaddr_in *)&mapped_addr;
+ tmp_addr_in = (struct sockaddr_in *)&tmp_addr;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ assert((serverfd = socket(AF_INET, SOCK_STREAM, 0)) > 0);
+ assert((clientfd = socket(AF_INET, SOCK_STREAM, 0)) > 0);
+
+ /* Bind server to ephemeral port on lo */
+ memset(&serv_addr, 0, sizeof(serv_addr));
+ serv_addr_in->sin_family = AF_INET;
+ serv_addr_in->sin_port = 0;
+ serv_addr_in->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+
+ assert(bind(serverfd, &serv_addr, sizeof(serv_addr)) == 0);
+
+ sockaddr_len = sizeof(serv_addr);
+ assert(getsockname(serverfd, &serv_addr, &sockaddr_len) == 0);
+ ip = inet_ntoa(serv_addr_in->sin_addr);
+ printf("Server bound to: %s:%d\n", ip, ntohs(serv_addr_in->sin_port));
+
+ memset(&mapped_addr, 0, sizeof(mapped_addr));
+ mapped_addr_in->sin_family = AF_INET;
+ mapped_addr_in->sin_port = htons(5555);
+ mapped_addr_in->sin_addr.s_addr = inet_addr("255.255.255.255");
+
+ assert(!bpf_map_update_elem(map_fd[0], &mapped_addr, &serv_addr, BPF_ANY));
+
+ assert(listen(serverfd, 5) == 0);
+
+ ip = inet_ntoa(mapped_addr_in->sin_addr);
+ printf("Client connecting to: %s:%d\n",
+ ip, ntohs(mapped_addr_in->sin_port));
+ assert(connect(clientfd, &mapped_addr, sizeof(mapped_addr)) == 0);
+
+ sockaddr_len = sizeof(tmp_addr);
+ ip = inet_ntoa(tmp_addr_in->sin_addr);
+ assert((serverconnfd = accept(serverfd, &tmp_addr, &sockaddr_len)) > 0);
+ printf("Server received connection from: %s:%d\n",
+ ip, ntohs(tmp_addr_in->sin_port));
+
+ sockaddr_len = sizeof(tmp_addr);
+ assert(getpeername(clientfd, &tmp_addr, &sockaddr_len) == 0);
+ ip = inet_ntoa(tmp_addr_in->sin_addr);
+ printf("Client's peer address: %s:%d\n",
+ ip, ntohs(tmp_addr_in->sin_port));
+
+ /* Is the server's getsockname = the socket getpeername */
+ assert(memcmp(&serv_addr, &tmp_addr, sizeof(struct sockaddr_in)) == 0);
+
+ return 0;
+}
diff --git a/samples/bpf/trace_event_kern.c b/samples/bpf/trace_event_kern.c
new file mode 100644
index 000000000..7068fbdde
--- /dev/null
+++ b/samples/bpf/trace_event_kern.c
@@ -0,0 +1,79 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/ptrace.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/bpf_perf_event.h>
+#include <uapi/linux/perf_event.h>
+#include "bpf_helpers.h"
+
+struct key_t {
+ char comm[TASK_COMM_LEN];
+ u32 kernstack;
+ u32 userstack;
+};
+
+struct bpf_map_def SEC("maps") counts = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(struct key_t),
+ .value_size = sizeof(u64),
+ .max_entries = 10000,
+};
+
+struct bpf_map_def SEC("maps") stackmap = {
+ .type = BPF_MAP_TYPE_STACK_TRACE,
+ .key_size = sizeof(u32),
+ .value_size = PERF_MAX_STACK_DEPTH * sizeof(u64),
+ .max_entries = 10000,
+};
+
+#define KERN_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP)
+#define USER_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK)
+
+SEC("perf_event")
+int bpf_prog1(struct bpf_perf_event_data *ctx)
+{
+ char time_fmt1[] = "Time Enabled: %llu, Time Running: %llu";
+ char time_fmt2[] = "Get Time Failed, ErrCode: %d";
+ char addr_fmt[] = "Address recorded on event: %llx";
+ char fmt[] = "CPU-%d period %lld ip %llx";
+ u32 cpu = bpf_get_smp_processor_id();
+ struct bpf_perf_event_value value_buf;
+ struct key_t key;
+ u64 *val, one = 1;
+ int ret;
+
+ if (ctx->sample_period < 10000)
+ /* ignore warmup */
+ return 0;
+ bpf_get_current_comm(&key.comm, sizeof(key.comm));
+ key.kernstack = bpf_get_stackid(ctx, &stackmap, KERN_STACKID_FLAGS);
+ key.userstack = bpf_get_stackid(ctx, &stackmap, USER_STACKID_FLAGS);
+ if ((int)key.kernstack < 0 && (int)key.userstack < 0) {
+ bpf_trace_printk(fmt, sizeof(fmt), cpu, ctx->sample_period,
+ PT_REGS_IP(&ctx->regs));
+ return 0;
+ }
+
+ ret = bpf_perf_prog_read_value(ctx, (void *)&value_buf, sizeof(struct bpf_perf_event_value));
+ if (!ret)
+ bpf_trace_printk(time_fmt1, sizeof(time_fmt1), value_buf.enabled, value_buf.running);
+ else
+ bpf_trace_printk(time_fmt2, sizeof(time_fmt2), ret);
+
+ if (ctx->addr != 0)
+ bpf_trace_printk(addr_fmt, sizeof(addr_fmt), ctx->addr);
+
+ val = bpf_map_lookup_elem(&counts, &key);
+ if (val)
+ (*val)++;
+ else
+ bpf_map_update_elem(&counts, &key, &one, BPF_NOEXIST);
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c
new file mode 100644
index 000000000..d33022447
--- /dev/null
+++ b/samples/bpf/trace_event_user.c
@@ -0,0 +1,308 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <sys/ioctl.h>
+#include <linux/perf_event.h>
+#include <linux/bpf.h>
+#include <signal.h>
+#include <assert.h>
+#include <errno.h>
+#include <sys/resource.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+#include "perf-sys.h"
+#include "trace_helpers.h"
+
+#define SAMPLE_FREQ 50
+
+static bool sys_read_seen, sys_write_seen;
+
+static void print_ksym(__u64 addr)
+{
+ struct ksym *sym;
+
+ if (!addr)
+ return;
+ sym = ksym_search(addr);
+ printf("%s;", sym->name);
+ if (!strstr(sym->name, "sys_read"))
+ sys_read_seen = true;
+ else if (!strstr(sym->name, "sys_write"))
+ sys_write_seen = true;
+}
+
+static void print_addr(__u64 addr)
+{
+ if (!addr)
+ return;
+ printf("%llx;", addr);
+}
+
+#define TASK_COMM_LEN 16
+
+struct key_t {
+ char comm[TASK_COMM_LEN];
+ __u32 kernstack;
+ __u32 userstack;
+};
+
+static void print_stack(struct key_t *key, __u64 count)
+{
+ __u64 ip[PERF_MAX_STACK_DEPTH] = {};
+ static bool warned;
+ int i;
+
+ printf("%3lld %s;", count, key->comm);
+ if (bpf_map_lookup_elem(map_fd[1], &key->kernstack, ip) != 0) {
+ printf("---;");
+ } else {
+ for (i = PERF_MAX_STACK_DEPTH - 1; i >= 0; i--)
+ print_ksym(ip[i]);
+ }
+ printf("-;");
+ if (bpf_map_lookup_elem(map_fd[1], &key->userstack, ip) != 0) {
+ printf("---;");
+ } else {
+ for (i = PERF_MAX_STACK_DEPTH - 1; i >= 0; i--)
+ print_addr(ip[i]);
+ }
+ if (count < 6)
+ printf("\r");
+ else
+ printf("\n");
+
+ if (key->kernstack == -EEXIST && !warned) {
+ printf("stackmap collisions seen. Consider increasing size\n");
+ warned = true;
+ } else if ((int)key->kernstack < 0 && (int)key->userstack < 0) {
+ printf("err stackid %d %d\n", key->kernstack, key->userstack);
+ }
+}
+
+static void int_exit(int sig)
+{
+ kill(0, SIGKILL);
+ exit(0);
+}
+
+static void print_stacks(void)
+{
+ struct key_t key = {}, next_key;
+ __u64 value;
+ __u32 stackid = 0, next_id;
+ int fd = map_fd[0], stack_map = map_fd[1];
+
+ sys_read_seen = sys_write_seen = false;
+ while (bpf_map_get_next_key(fd, &key, &next_key) == 0) {
+ bpf_map_lookup_elem(fd, &next_key, &value);
+ print_stack(&next_key, value);
+ bpf_map_delete_elem(fd, &next_key);
+ key = next_key;
+ }
+ printf("\n");
+ if (!sys_read_seen || !sys_write_seen) {
+ printf("BUG kernel stack doesn't contain sys_read() and sys_write()\n");
+ int_exit(0);
+ }
+
+ /* clear stack map */
+ while (bpf_map_get_next_key(stack_map, &stackid, &next_id) == 0) {
+ bpf_map_delete_elem(stack_map, &next_id);
+ stackid = next_id;
+ }
+}
+
+static inline int generate_load(void)
+{
+ if (system("dd if=/dev/zero of=/dev/null count=5000k status=none") < 0) {
+ printf("failed to generate some load with dd: %s\n", strerror(errno));
+ return -1;
+ }
+
+ return 0;
+}
+
+static void test_perf_event_all_cpu(struct perf_event_attr *attr)
+{
+ int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+ int *pmu_fd = malloc(nr_cpus * sizeof(int));
+ int i, error = 0;
+
+ /* system wide perf event, no need to inherit */
+ attr->inherit = 0;
+
+ /* open perf_event on all cpus */
+ for (i = 0; i < nr_cpus; i++) {
+ pmu_fd[i] = sys_perf_event_open(attr, -1, i, -1, 0);
+ if (pmu_fd[i] < 0) {
+ printf("sys_perf_event_open failed\n");
+ error = 1;
+ goto all_cpu_err;
+ }
+ assert(ioctl(pmu_fd[i], PERF_EVENT_IOC_SET_BPF, prog_fd[0]) == 0);
+ assert(ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE) == 0);
+ }
+
+ if (generate_load() < 0) {
+ error = 1;
+ goto all_cpu_err;
+ }
+ print_stacks();
+all_cpu_err:
+ for (i--; i >= 0; i--) {
+ ioctl(pmu_fd[i], PERF_EVENT_IOC_DISABLE);
+ close(pmu_fd[i]);
+ }
+ free(pmu_fd);
+ if (error)
+ int_exit(0);
+}
+
+static void test_perf_event_task(struct perf_event_attr *attr)
+{
+ int pmu_fd, error = 0;
+
+ /* per task perf event, enable inherit so the "dd ..." command can be traced properly.
+ * Enabling inherit will cause bpf_perf_prog_read_time helper failure.
+ */
+ attr->inherit = 1;
+
+ /* open task bound event */
+ pmu_fd = sys_perf_event_open(attr, 0, -1, -1, 0);
+ if (pmu_fd < 0) {
+ printf("sys_perf_event_open failed\n");
+ int_exit(0);
+ }
+ assert(ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) == 0);
+ assert(ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE) == 0);
+
+ if (generate_load() < 0) {
+ error = 1;
+ goto err;
+ }
+ print_stacks();
+err:
+ ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE);
+ close(pmu_fd);
+ if (error)
+ int_exit(0);
+}
+
+static void test_bpf_perf_event(void)
+{
+ struct perf_event_attr attr_type_hw = {
+ .sample_freq = SAMPLE_FREQ,
+ .freq = 1,
+ .type = PERF_TYPE_HARDWARE,
+ .config = PERF_COUNT_HW_CPU_CYCLES,
+ };
+ struct perf_event_attr attr_type_sw = {
+ .sample_freq = SAMPLE_FREQ,
+ .freq = 1,
+ .type = PERF_TYPE_SOFTWARE,
+ .config = PERF_COUNT_SW_CPU_CLOCK,
+ };
+ struct perf_event_attr attr_hw_cache_l1d = {
+ .sample_freq = SAMPLE_FREQ,
+ .freq = 1,
+ .type = PERF_TYPE_HW_CACHE,
+ .config =
+ PERF_COUNT_HW_CACHE_L1D |
+ (PERF_COUNT_HW_CACHE_OP_READ << 8) |
+ (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16),
+ };
+ struct perf_event_attr attr_hw_cache_branch_miss = {
+ .sample_freq = SAMPLE_FREQ,
+ .freq = 1,
+ .type = PERF_TYPE_HW_CACHE,
+ .config =
+ PERF_COUNT_HW_CACHE_BPU |
+ (PERF_COUNT_HW_CACHE_OP_READ << 8) |
+ (PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
+ };
+ struct perf_event_attr attr_type_raw = {
+ .sample_freq = SAMPLE_FREQ,
+ .freq = 1,
+ .type = PERF_TYPE_RAW,
+ /* Intel Instruction Retired */
+ .config = 0xc0,
+ };
+ struct perf_event_attr attr_type_raw_lock_load = {
+ .sample_freq = SAMPLE_FREQ,
+ .freq = 1,
+ .type = PERF_TYPE_RAW,
+ /* Intel MEM_UOPS_RETIRED.LOCK_LOADS */
+ .config = 0x21d0,
+ /* Request to record lock address from PEBS */
+ .sample_type = PERF_SAMPLE_ADDR,
+ /* Record address value requires precise event */
+ .precise_ip = 2,
+ };
+
+ printf("Test HW_CPU_CYCLES\n");
+ test_perf_event_all_cpu(&attr_type_hw);
+ test_perf_event_task(&attr_type_hw);
+
+ printf("Test SW_CPU_CLOCK\n");
+ test_perf_event_all_cpu(&attr_type_sw);
+ test_perf_event_task(&attr_type_sw);
+
+ printf("Test HW_CACHE_L1D\n");
+ test_perf_event_all_cpu(&attr_hw_cache_l1d);
+ test_perf_event_task(&attr_hw_cache_l1d);
+
+ printf("Test HW_CACHE_BPU\n");
+ test_perf_event_all_cpu(&attr_hw_cache_branch_miss);
+ test_perf_event_task(&attr_hw_cache_branch_miss);
+
+ printf("Test Instruction Retired\n");
+ test_perf_event_all_cpu(&attr_type_raw);
+ test_perf_event_task(&attr_type_raw);
+
+ printf("Test Lock Load\n");
+ test_perf_event_all_cpu(&attr_type_raw_lock_load);
+ test_perf_event_task(&attr_type_raw_lock_load);
+
+ printf("*** PASS ***\n");
+}
+
+
+int main(int argc, char **argv)
+{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ char filename[256];
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ setrlimit(RLIMIT_MEMLOCK, &r);
+
+ signal(SIGINT, int_exit);
+ signal(SIGTERM, int_exit);
+
+ if (load_kallsyms()) {
+ printf("failed to process /proc/kallsyms\n");
+ return 1;
+ }
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 2;
+ }
+
+ if (fork() == 0) {
+ read_trace_pipe();
+ return 0;
+ }
+ test_bpf_perf_event();
+ int_exit(0);
+ return 0;
+}
diff --git a/samples/bpf/trace_output_kern.c b/samples/bpf/trace_output_kern.c
new file mode 100644
index 000000000..9b96f4fb8
--- /dev/null
+++ b/samples/bpf/trace_output_kern.c
@@ -0,0 +1,30 @@
+#include <linux/ptrace.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+ .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(u32),
+ .max_entries = 2,
+};
+
+SEC("kprobe/sys_write")
+int bpf_prog1(struct pt_regs *ctx)
+{
+ struct S {
+ u64 pid;
+ u64 cookie;
+ } data;
+
+ data.pid = bpf_get_current_pid_tgid();
+ data.cookie = 0x12345678;
+
+ bpf_perf_event_output(ctx, &my_map, 0, &data, sizeof(data));
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/trace_output_user.c b/samples/bpf/trace_output_user.c
new file mode 100644
index 000000000..4837d73ed
--- /dev/null
+++ b/samples/bpf/trace_output_user.c
@@ -0,0 +1,106 @@
+/* This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <linux/perf_event.h>
+#include <linux/bpf.h>
+#include <errno.h>
+#include <assert.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <signal.h>
+#include <libbpf.h>
+#include "bpf_load.h"
+#include "perf-sys.h"
+#include "trace_helpers.h"
+
+static int pmu_fd;
+
+static __u64 time_get_ns(void)
+{
+ struct timespec ts;
+
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ return ts.tv_sec * 1000000000ull + ts.tv_nsec;
+}
+
+static __u64 start_time;
+
+#define MAX_CNT 100000ll
+
+static int print_bpf_output(void *data, int size)
+{
+ static __u64 cnt;
+ struct {
+ __u64 pid;
+ __u64 cookie;
+ } *e = data;
+
+ if (e->cookie != 0x12345678) {
+ printf("BUG pid %llx cookie %llx sized %d\n",
+ e->pid, e->cookie, size);
+ return LIBBPF_PERF_EVENT_ERROR;
+ }
+
+ cnt++;
+
+ if (cnt == MAX_CNT) {
+ printf("recv %lld events per sec\n",
+ MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
+ return LIBBPF_PERF_EVENT_DONE;
+ }
+
+ return LIBBPF_PERF_EVENT_CONT;
+}
+
+static void test_bpf_perf_event(void)
+{
+ struct perf_event_attr attr = {
+ .sample_type = PERF_SAMPLE_RAW,
+ .type = PERF_TYPE_SOFTWARE,
+ .config = PERF_COUNT_SW_BPF_OUTPUT,
+ };
+ int key = 0;
+
+ pmu_fd = sys_perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
+
+ assert(pmu_fd >= 0);
+ assert(bpf_map_update_elem(map_fd[0], &key, &pmu_fd, BPF_ANY) == 0);
+ ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+}
+
+int main(int argc, char **argv)
+{
+ char filename[256];
+ FILE *f;
+ int ret;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ test_bpf_perf_event();
+
+ if (perf_event_mmap(pmu_fd) < 0)
+ return 1;
+
+ f = popen("taskset 1 dd if=/dev/zero of=/dev/null", "r");
+ (void) f;
+
+ start_time = time_get_ns();
+ ret = perf_event_poller(pmu_fd, print_bpf_output);
+ kill(0, SIGINT);
+ return ret;
+}
diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c
new file mode 100644
index 000000000..9c74b45c5
--- /dev/null
+++ b/samples/bpf/tracex1_kern.c
@@ -0,0 +1,48 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <linux/version.h>
+#include "bpf_helpers.h"
+
+#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;})
+
+/* kprobe is NOT a stable ABI
+ * kernel functions can be removed, renamed or completely change semantics.
+ * Number of arguments and their positions can change, etc.
+ * In such case this bpf+kprobe example will no longer be meaningful
+ */
+SEC("kprobe/__netif_receive_skb_core")
+int bpf_prog1(struct pt_regs *ctx)
+{
+ /* attaches to kprobe __netif_receive_skb_core,
+ * looks for packets on loobpack device and prints them
+ */
+ char devname[IFNAMSIZ];
+ struct net_device *dev;
+ struct sk_buff *skb;
+ int len;
+
+ /* non-portable! works for the given kernel only */
+ bpf_probe_read_kernel(&skb, sizeof(skb), (void *)PT_REGS_PARM1(ctx));
+ dev = _(skb->dev);
+ len = _(skb->len);
+
+ bpf_probe_read(devname, sizeof(devname), dev->name);
+
+ if (devname[0] == 'l' && devname[1] == 'o') {
+ char fmt[] = "skb %p len %d\n";
+ /* using bpf_trace_printk() for DEBUG ONLY */
+ bpf_trace_printk(fmt, sizeof(fmt), skb, len);
+ }
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex1_user.c b/samples/bpf/tracex1_user.c
new file mode 100644
index 000000000..af8c20608
--- /dev/null
+++ b/samples/bpf/tracex1_user.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <linux/bpf.h>
+#include <unistd.h>
+#include <bpf/bpf.h>
+#include "bpf_load.h"
+
+int main(int ac, char **argv)
+{
+ FILE *f;
+ char filename[256];
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ f = popen("taskset 1 ping -c5 localhost", "r");
+ (void) f;
+
+ read_trace_pipe();
+
+ return 0;
+}
diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c
new file mode 100644
index 000000000..5e11c20ce
--- /dev/null
+++ b/samples/bpf/tracex2_kern.c
@@ -0,0 +1,100 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(long),
+ .value_size = sizeof(long),
+ .max_entries = 1024,
+};
+
+/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
+ * example will no longer be meaningful
+ */
+SEC("kprobe/kfree_skb")
+int bpf_prog2(struct pt_regs *ctx)
+{
+ long loc = 0;
+ long init_val = 1;
+ long *value;
+
+ /* read ip of kfree_skb caller.
+ * non-portable version of __builtin_return_address(0)
+ */
+ BPF_KPROBE_READ_RET_IP(loc, ctx);
+
+ value = bpf_map_lookup_elem(&my_map, &loc);
+ if (value)
+ *value += 1;
+ else
+ bpf_map_update_elem(&my_map, &loc, &init_val, BPF_ANY);
+ return 0;
+}
+
+static unsigned int log2(unsigned int v)
+{
+ unsigned int r;
+ unsigned int shift;
+
+ r = (v > 0xFFFF) << 4; v >>= r;
+ shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+ shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+ shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+ r |= (v >> 1);
+ return r;
+}
+
+static unsigned int log2l(unsigned long v)
+{
+ unsigned int hi = v >> 32;
+ if (hi)
+ return log2(hi) + 32;
+ else
+ return log2(v);
+}
+
+struct hist_key {
+ char comm[16];
+ u64 pid_tgid;
+ u64 uid_gid;
+ u64 index;
+};
+
+struct bpf_map_def SEC("maps") my_hist_map = {
+ .type = BPF_MAP_TYPE_PERCPU_HASH,
+ .key_size = sizeof(struct hist_key),
+ .value_size = sizeof(long),
+ .max_entries = 1024,
+};
+
+SEC("kprobe/sys_write")
+int bpf_prog3(struct pt_regs *ctx)
+{
+ long write_size = PT_REGS_PARM3(ctx);
+ long init_val = 1;
+ long *value;
+ struct hist_key key;
+
+ key.index = log2l(write_size);
+ key.pid_tgid = bpf_get_current_pid_tgid();
+ key.uid_gid = bpf_get_current_uid_gid();
+ bpf_get_current_comm(&key.comm, sizeof(key.comm));
+
+ value = bpf_map_lookup_elem(&my_hist_map, &key);
+ if (value)
+ __sync_fetch_and_add(value, 1);
+ else
+ bpf_map_update_elem(&my_hist_map, &key, &init_val, BPF_ANY);
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c
new file mode 100644
index 000000000..1a81e6a5c
--- /dev/null
+++ b/samples/bpf/tracex2_user.c
@@ -0,0 +1,160 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/bpf.h>
+#include <string.h>
+#include <sys/resource.h>
+
+#include <bpf/bpf.h>
+#include "bpf_load.h"
+#include "bpf_util.h"
+
+#define MAX_INDEX 64
+#define MAX_STARS 38
+
+static void stars(char *str, long val, long max, int width)
+{
+ int i;
+
+ for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
+ str[i] = '*';
+ if (val > max)
+ str[i - 1] = '+';
+ str[i] = '\0';
+}
+
+struct task {
+ char comm[16];
+ __u64 pid_tgid;
+ __u64 uid_gid;
+};
+
+struct hist_key {
+ struct task t;
+ __u32 index;
+};
+
+#define SIZE sizeof(struct task)
+
+static void print_hist_for_pid(int fd, void *task)
+{
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ struct hist_key key = {}, next_key;
+ long values[nr_cpus];
+ char starstr[MAX_STARS];
+ long value;
+ long data[MAX_INDEX] = {};
+ int max_ind = -1;
+ long max_value = 0;
+ int i, ind;
+
+ while (bpf_map_get_next_key(fd, &key, &next_key) == 0) {
+ if (memcmp(&next_key, task, SIZE)) {
+ key = next_key;
+ continue;
+ }
+ bpf_map_lookup_elem(fd, &next_key, values);
+ value = 0;
+ for (i = 0; i < nr_cpus; i++)
+ value += values[i];
+ ind = next_key.index;
+ data[ind] = value;
+ if (value && ind > max_ind)
+ max_ind = ind;
+ if (value > max_value)
+ max_value = value;
+ key = next_key;
+ }
+
+ printf(" syscall write() stats\n");
+ printf(" byte_size : count distribution\n");
+ for (i = 1; i <= max_ind + 1; i++) {
+ stars(starstr, data[i - 1], max_value, MAX_STARS);
+ printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
+ (1l << i) >> 1, (1l << i) - 1, data[i - 1],
+ MAX_STARS, starstr);
+ }
+}
+
+static void print_hist(int fd)
+{
+ struct hist_key key = {}, next_key;
+ static struct task tasks[1024];
+ int task_cnt = 0;
+ int i;
+
+ while (bpf_map_get_next_key(fd, &key, &next_key) == 0) {
+ int found = 0;
+
+ for (i = 0; i < task_cnt; i++)
+ if (memcmp(&tasks[i], &next_key, SIZE) == 0)
+ found = 1;
+ if (!found)
+ memcpy(&tasks[task_cnt++], &next_key, SIZE);
+ key = next_key;
+ }
+
+ for (i = 0; i < task_cnt; i++) {
+ printf("\npid %d cmd %s uid %d\n",
+ (__u32) tasks[i].pid_tgid,
+ tasks[i].comm,
+ (__u32) tasks[i].uid_gid);
+ print_hist_for_pid(fd, &tasks[i]);
+ }
+
+}
+
+static void int_exit(int sig)
+{
+ print_hist(map_fd[1]);
+ exit(0);
+}
+
+int main(int ac, char **argv)
+{
+ struct rlimit r = {1024*1024, RLIM_INFINITY};
+ char filename[256];
+ long key, next_key, value;
+ FILE *f;
+ int i;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK)");
+ return 1;
+ }
+
+ signal(SIGINT, int_exit);
+ signal(SIGTERM, int_exit);
+
+ /* start 'ping' in the background to have some kfree_skb events */
+ f = popen("ping -c5 localhost", "r");
+ (void) f;
+
+ /* start 'dd' in the background to have plenty of 'write' syscalls */
+ f = popen("dd if=/dev/zero of=/dev/null count=5000000", "r");
+ (void) f;
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ for (i = 0; i < 5; i++) {
+ key = 0;
+ while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0) {
+ bpf_map_lookup_elem(map_fd[0], &next_key, &value);
+ printf("location 0x%lx count %ld\n", next_key, value);
+ key = next_key;
+ }
+ if (key)
+ printf("\n");
+ sleep(1);
+ }
+ print_hist(map_fd[1]);
+
+ return 0;
+}
diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c
new file mode 100644
index 000000000..9974c3d7c
--- /dev/null
+++ b/samples/bpf/tracex3_kern.c
@@ -0,0 +1,89 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(long),
+ .value_size = sizeof(u64),
+ .max_entries = 4096,
+};
+
+/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
+ * example will no longer be meaningful
+ */
+SEC("kprobe/blk_start_request")
+int bpf_prog1(struct pt_regs *ctx)
+{
+ long rq = PT_REGS_PARM1(ctx);
+ u64 val = bpf_ktime_get_ns();
+
+ bpf_map_update_elem(&my_map, &rq, &val, BPF_ANY);
+ return 0;
+}
+
+static unsigned int log2l(unsigned long long n)
+{
+#define S(k) if (n >= (1ull << k)) { i += k; n >>= k; }
+ int i = -(n == 0);
+ S(32); S(16); S(8); S(4); S(2); S(1);
+ return i;
+#undef S
+}
+
+#define SLOTS 100
+
+struct bpf_map_def SEC("maps") lat_map = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u64),
+ .max_entries = SLOTS,
+};
+
+SEC("kprobe/blk_account_io_completion")
+int bpf_prog2(struct pt_regs *ctx)
+{
+ long rq = PT_REGS_PARM1(ctx);
+ u64 *value, l, base;
+ u32 index;
+
+ value = bpf_map_lookup_elem(&my_map, &rq);
+ if (!value)
+ return 0;
+
+ u64 cur_time = bpf_ktime_get_ns();
+ u64 delta = cur_time - *value;
+
+ bpf_map_delete_elem(&my_map, &rq);
+
+ /* the lines below are computing index = log10(delta)*10
+ * using integer arithmetic
+ * index = 29 ~ 1 usec
+ * index = 59 ~ 1 msec
+ * index = 89 ~ 1 sec
+ * index = 99 ~ 10sec or more
+ * log10(x)*10 = log2(x)*10/log2(10) = log2(x)*3
+ */
+ l = log2l(delta);
+ base = 1ll << l;
+ index = (l * 64 + (delta - base) * 64 / base) * 3 / 64;
+
+ if (index >= SLOTS)
+ index = SLOTS - 1;
+
+ value = bpf_map_lookup_elem(&lat_map, &index);
+ if (value)
+ *value += 1;
+
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c
new file mode 100644
index 000000000..6c6b10f4c
--- /dev/null
+++ b/samples/bpf/tracex3_user.c
@@ -0,0 +1,166 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <sys/resource.h>
+
+#include <bpf/bpf.h>
+#include "bpf_load.h"
+#include "bpf_util.h"
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
+#define SLOTS 100
+
+static void clear_stats(int fd)
+{
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ __u64 values[nr_cpus];
+ __u32 key;
+
+ memset(values, 0, sizeof(values));
+ for (key = 0; key < SLOTS; key++)
+ bpf_map_update_elem(fd, &key, values, BPF_ANY);
+}
+
+const char *color[] = {
+ "\033[48;5;255m",
+ "\033[48;5;252m",
+ "\033[48;5;250m",
+ "\033[48;5;248m",
+ "\033[48;5;246m",
+ "\033[48;5;244m",
+ "\033[48;5;242m",
+ "\033[48;5;240m",
+ "\033[48;5;238m",
+ "\033[48;5;236m",
+ "\033[48;5;234m",
+ "\033[48;5;232m",
+};
+const int num_colors = ARRAY_SIZE(color);
+
+const char nocolor[] = "\033[00m";
+
+const char *sym[] = {
+ " ",
+ " ",
+ ".",
+ ".",
+ "*",
+ "*",
+ "o",
+ "o",
+ "O",
+ "O",
+ "#",
+ "#",
+};
+
+bool full_range = false;
+bool text_only = false;
+
+static void print_banner(void)
+{
+ if (full_range)
+ printf("|1ns |10ns |100ns |1us |10us |100us"
+ " |1ms |10ms |100ms |1s |10s\n");
+ else
+ printf("|1us |10us |100us |1ms |10ms "
+ "|100ms |1s |10s\n");
+}
+
+static void print_hist(int fd)
+{
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ __u64 total_events = 0;
+ long values[nr_cpus];
+ __u64 max_cnt = 0;
+ __u64 cnt[SLOTS];
+ __u64 value;
+ __u32 key;
+ int i;
+
+ for (key = 0; key < SLOTS; key++) {
+ bpf_map_lookup_elem(fd, &key, values);
+ value = 0;
+ for (i = 0; i < nr_cpus; i++)
+ value += values[i];
+ cnt[key] = value;
+ total_events += value;
+ if (value > max_cnt)
+ max_cnt = value;
+ }
+ clear_stats(fd);
+ for (key = full_range ? 0 : 29; key < SLOTS; key++) {
+ int c = num_colors * cnt[key] / (max_cnt + 1);
+
+ if (text_only)
+ printf("%s", sym[c]);
+ else
+ printf("%s %s", color[c], nocolor);
+ }
+ printf(" # %lld\n", total_events);
+}
+
+int main(int ac, char **argv)
+{
+ struct rlimit r = {1024*1024, RLIM_INFINITY};
+ char filename[256];
+ int i;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK)");
+ return 1;
+ }
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ for (i = 1; i < ac; i++) {
+ if (strcmp(argv[i], "-a") == 0) {
+ full_range = true;
+ } else if (strcmp(argv[i], "-t") == 0) {
+ text_only = true;
+ } else if (strcmp(argv[i], "-h") == 0) {
+ printf("Usage:\n"
+ " -a display wider latency range\n"
+ " -t text only\n");
+ return 1;
+ }
+ }
+
+ printf(" heatmap of IO latency\n");
+ if (text_only)
+ printf(" %s", sym[num_colors - 1]);
+ else
+ printf(" %s %s", color[num_colors - 1], nocolor);
+ printf(" - many events with this latency\n");
+
+ if (text_only)
+ printf(" %s", sym[0]);
+ else
+ printf(" %s %s", color[0], nocolor);
+ printf(" - few events\n");
+
+ for (i = 0; ; i++) {
+ if (i % 20 == 0)
+ print_banner();
+ print_hist(map_fd[1]);
+ sleep(2);
+ }
+
+ return 0;
+}
diff --git a/samples/bpf/tracex4_kern.c b/samples/bpf/tracex4_kern.c
new file mode 100644
index 000000000..6dd8e384d
--- /dev/null
+++ b/samples/bpf/tracex4_kern.c
@@ -0,0 +1,54 @@
+/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/ptrace.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct pair {
+ u64 val;
+ u64 ip;
+};
+
+struct bpf_map_def SEC("maps") my_map = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(long),
+ .value_size = sizeof(struct pair),
+ .max_entries = 1000000,
+};
+
+/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
+ * example will no longer be meaningful
+ */
+SEC("kprobe/kmem_cache_free")
+int bpf_prog1(struct pt_regs *ctx)
+{
+ long ptr = PT_REGS_PARM2(ctx);
+
+ bpf_map_delete_elem(&my_map, &ptr);
+ return 0;
+}
+
+SEC("kretprobe/kmem_cache_alloc_node")
+int bpf_prog2(struct pt_regs *ctx)
+{
+ long ptr = PT_REGS_RC(ctx);
+ long ip = 0;
+
+ /* get ip address of kmem_cache_alloc_node() caller */
+ BPF_KRETPROBE_READ_RET_IP(ip, ctx);
+
+ struct pair v = {
+ .val = bpf_ktime_get_ns(),
+ .ip = ip,
+ };
+
+ bpf_map_update_elem(&my_map, &ptr, &v, BPF_ANY);
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c
new file mode 100644
index 000000000..14625c898
--- /dev/null
+++ b/samples/bpf/tracex4_user.c
@@ -0,0 +1,77 @@
+/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <time.h>
+#include <linux/bpf.h>
+#include <sys/resource.h>
+
+#include <bpf/bpf.h>
+#include "bpf_load.h"
+
+struct pair {
+ long long val;
+ __u64 ip;
+};
+
+static __u64 time_get_ns(void)
+{
+ struct timespec ts;
+
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ return ts.tv_sec * 1000000000ull + ts.tv_nsec;
+}
+
+static void print_old_objects(int fd)
+{
+ long long val = time_get_ns();
+ __u64 key, next_key;
+ struct pair v;
+
+ key = write(1, "\e[1;1H\e[2J", 12); /* clear screen */
+
+ key = -1;
+ while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0) {
+ bpf_map_lookup_elem(map_fd[0], &next_key, &v);
+ key = next_key;
+ if (val - v.val < 1000000000ll)
+ /* object was allocated more then 1 sec ago */
+ continue;
+ printf("obj 0x%llx is %2lldsec old was allocated at ip %llx\n",
+ next_key, (val - v.val) / 1000000000ll, v.ip);
+ }
+}
+
+int main(int ac, char **argv)
+{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ char filename[256];
+ int i;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)");
+ return 1;
+ }
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ for (i = 0; ; i++) {
+ print_old_objects(map_fd[1]);
+ sleep(1);
+ }
+
+ return 0;
+}
diff --git a/samples/bpf/tracex5_kern.c b/samples/bpf/tracex5_kern.c
new file mode 100644
index 000000000..f57f4e1ea
--- /dev/null
+++ b/samples/bpf/tracex5_kern.c
@@ -0,0 +1,79 @@
+/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/ptrace.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/seccomp.h>
+#include <uapi/linux/unistd.h>
+#include "syscall_nrs.h"
+#include "bpf_helpers.h"
+
+#define PROG(F) SEC("kprobe/"__stringify(F)) int bpf_func_##F
+
+struct bpf_map_def SEC("maps") progs = {
+ .type = BPF_MAP_TYPE_PROG_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u32),
+#ifdef __mips__
+ .max_entries = 6000, /* MIPS n64 syscalls start at 5000 */
+#else
+ .max_entries = 1024,
+#endif
+};
+
+SEC("kprobe/__seccomp_filter")
+int bpf_prog1(struct pt_regs *ctx)
+{
+ int sc_nr = (int)PT_REGS_PARM1(ctx);
+
+ /* dispatch into next BPF program depending on syscall number */
+ bpf_tail_call(ctx, &progs, sc_nr);
+
+ /* fall through -> unknown syscall */
+ if (sc_nr >= __NR_getuid && sc_nr <= __NR_getsid) {
+ char fmt[] = "syscall=%d (one of get/set uid/pid/gid)\n";
+ bpf_trace_printk(fmt, sizeof(fmt), sc_nr);
+ }
+ return 0;
+}
+
+/* we jump here when syscall number == __NR_write */
+PROG(SYS__NR_write)(struct pt_regs *ctx)
+{
+ struct seccomp_data sd;
+
+ bpf_probe_read(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx));
+ if (sd.args[2] == 512) {
+ char fmt[] = "write(fd=%d, buf=%p, size=%d)\n";
+ bpf_trace_printk(fmt, sizeof(fmt),
+ sd.args[0], sd.args[1], sd.args[2]);
+ }
+ return 0;
+}
+
+PROG(SYS__NR_read)(struct pt_regs *ctx)
+{
+ struct seccomp_data sd;
+
+ bpf_probe_read(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx));
+ if (sd.args[2] > 128 && sd.args[2] <= 1024) {
+ char fmt[] = "read(fd=%d, buf=%p, size=%d)\n";
+ bpf_trace_printk(fmt, sizeof(fmt),
+ sd.args[0], sd.args[1], sd.args[2]);
+ }
+ return 0;
+}
+
+PROG(SYS__NR_mmap)(struct pt_regs *ctx)
+{
+ char fmt[] = "mmap\n";
+ bpf_trace_printk(fmt, sizeof(fmt));
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex5_user.c b/samples/bpf/tracex5_user.c
new file mode 100644
index 000000000..c4ab91c89
--- /dev/null
+++ b/samples/bpf/tracex5_user.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <linux/bpf.h>
+#include <unistd.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <sys/prctl.h>
+#include <bpf/bpf.h>
+#include "bpf_load.h"
+#include <sys/resource.h>
+
+/* install fake seccomp program to enable seccomp code path inside the kernel,
+ * so that our kprobe attached to seccomp_phase1() can be triggered
+ */
+static void install_accept_all_seccomp(void)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
+ .filter = filter,
+ };
+ if (prctl(PR_SET_SECCOMP, 2, &prog))
+ perror("prctl");
+}
+
+int main(int ac, char **argv)
+{
+ FILE *f;
+ char filename[256];
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ setrlimit(RLIMIT_MEMLOCK, &r);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ install_accept_all_seccomp();
+
+ f = popen("dd if=/dev/zero of=/dev/null count=5", "r");
+ (void) f;
+
+ read_trace_pipe();
+
+ return 0;
+}
diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c
new file mode 100644
index 000000000..46c557afa
--- /dev/null
+++ b/samples/bpf/tracex6_kern.c
@@ -0,0 +1,67 @@
+#include <linux/ptrace.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") counters = {
+ .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(u32),
+ .max_entries = 64,
+};
+struct bpf_map_def SEC("maps") values = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(int),
+ .value_size = sizeof(u64),
+ .max_entries = 64,
+};
+struct bpf_map_def SEC("maps") values2 = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(int),
+ .value_size = sizeof(struct bpf_perf_event_value),
+ .max_entries = 64,
+};
+
+SEC("kprobe/htab_map_get_next_key")
+int bpf_prog1(struct pt_regs *ctx)
+{
+ u32 key = bpf_get_smp_processor_id();
+ u64 count, *val;
+ s64 error;
+
+ count = bpf_perf_event_read(&counters, key);
+ error = (s64)count;
+ if (error <= -2 && error >= -22)
+ return 0;
+
+ val = bpf_map_lookup_elem(&values, &key);
+ if (val)
+ *val = count;
+ else
+ bpf_map_update_elem(&values, &key, &count, BPF_NOEXIST);
+
+ return 0;
+}
+
+SEC("kprobe/htab_map_lookup_elem")
+int bpf_prog2(struct pt_regs *ctx)
+{
+ u32 key = bpf_get_smp_processor_id();
+ struct bpf_perf_event_value *val, buf;
+ int error;
+
+ error = bpf_perf_event_read_value(&counters, key, &buf, sizeof(buf));
+ if (error)
+ return 0;
+
+ val = bpf_map_lookup_elem(&values2, &key);
+ if (val)
+ *val = buf;
+ else
+ bpf_map_update_elem(&values2, &key, &buf, BPF_NOEXIST);
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c
new file mode 100644
index 000000000..4bb3c830a
--- /dev/null
+++ b/samples/bpf/tracex6_user.c
@@ -0,0 +1,189 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+
+#include <assert.h>
+#include <fcntl.h>
+#include <linux/perf_event.h>
+#include <linux/bpf.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "bpf_load.h"
+#include <bpf/bpf.h>
+#include "perf-sys.h"
+
+#define SAMPLE_PERIOD 0x7fffffffffffffffULL
+
+static void check_on_cpu(int cpu, struct perf_event_attr *attr)
+{
+ struct bpf_perf_event_value value2;
+ int pmu_fd, error = 0;
+ cpu_set_t set;
+ __u64 value;
+
+ /* Move to target CPU */
+ CPU_ZERO(&set);
+ CPU_SET(cpu, &set);
+ assert(sched_setaffinity(0, sizeof(set), &set) == 0);
+ /* Open perf event and attach to the perf_event_array */
+ pmu_fd = sys_perf_event_open(attr, -1/*pid*/, cpu/*cpu*/, -1/*group_fd*/, 0);
+ if (pmu_fd < 0) {
+ fprintf(stderr, "sys_perf_event_open failed on CPU %d\n", cpu);
+ error = 1;
+ goto on_exit;
+ }
+ assert(bpf_map_update_elem(map_fd[0], &cpu, &pmu_fd, BPF_ANY) == 0);
+ assert(ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0) == 0);
+ /* Trigger the kprobe */
+ bpf_map_get_next_key(map_fd[1], &cpu, NULL);
+ /* Check the value */
+ if (bpf_map_lookup_elem(map_fd[1], &cpu, &value)) {
+ fprintf(stderr, "Value missing for CPU %d\n", cpu);
+ error = 1;
+ goto on_exit;
+ } else {
+ fprintf(stderr, "CPU %d: %llu\n", cpu, value);
+ }
+ /* The above bpf_map_lookup_elem should trigger the second kprobe */
+ if (bpf_map_lookup_elem(map_fd[2], &cpu, &value2)) {
+ fprintf(stderr, "Value2 missing for CPU %d\n", cpu);
+ error = 1;
+ goto on_exit;
+ } else {
+ fprintf(stderr, "CPU %d: counter: %llu, enabled: %llu, running: %llu\n", cpu,
+ value2.counter, value2.enabled, value2.running);
+ }
+
+on_exit:
+ assert(bpf_map_delete_elem(map_fd[0], &cpu) == 0 || error);
+ assert(ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE, 0) == 0 || error);
+ assert(close(pmu_fd) == 0 || error);
+ assert(bpf_map_delete_elem(map_fd[1], &cpu) == 0 || error);
+ exit(error);
+}
+
+static void test_perf_event_array(struct perf_event_attr *attr,
+ const char *name)
+{
+ int i, status, nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+ pid_t pid[nr_cpus];
+ int err = 0;
+
+ printf("Test reading %s counters\n", name);
+
+ for (i = 0; i < nr_cpus; i++) {
+ pid[i] = fork();
+ assert(pid[i] >= 0);
+ if (pid[i] == 0) {
+ check_on_cpu(i, attr);
+ exit(1);
+ }
+ }
+
+ for (i = 0; i < nr_cpus; i++) {
+ assert(waitpid(pid[i], &status, 0) == pid[i]);
+ err |= status;
+ }
+
+ if (err)
+ printf("Test: %s FAILED\n", name);
+}
+
+static void test_bpf_perf_event(void)
+{
+ struct perf_event_attr attr_cycles = {
+ .freq = 0,
+ .sample_period = SAMPLE_PERIOD,
+ .inherit = 0,
+ .type = PERF_TYPE_HARDWARE,
+ .read_format = 0,
+ .sample_type = 0,
+ .config = PERF_COUNT_HW_CPU_CYCLES,
+ };
+ struct perf_event_attr attr_clock = {
+ .freq = 0,
+ .sample_period = SAMPLE_PERIOD,
+ .inherit = 0,
+ .type = PERF_TYPE_SOFTWARE,
+ .read_format = 0,
+ .sample_type = 0,
+ .config = PERF_COUNT_SW_CPU_CLOCK,
+ };
+ struct perf_event_attr attr_raw = {
+ .freq = 0,
+ .sample_period = SAMPLE_PERIOD,
+ .inherit = 0,
+ .type = PERF_TYPE_RAW,
+ .read_format = 0,
+ .sample_type = 0,
+ /* Intel Instruction Retired */
+ .config = 0xc0,
+ };
+ struct perf_event_attr attr_l1d_load = {
+ .freq = 0,
+ .sample_period = SAMPLE_PERIOD,
+ .inherit = 0,
+ .type = PERF_TYPE_HW_CACHE,
+ .read_format = 0,
+ .sample_type = 0,
+ .config =
+ PERF_COUNT_HW_CACHE_L1D |
+ (PERF_COUNT_HW_CACHE_OP_READ << 8) |
+ (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16),
+ };
+ struct perf_event_attr attr_llc_miss = {
+ .freq = 0,
+ .sample_period = SAMPLE_PERIOD,
+ .inherit = 0,
+ .type = PERF_TYPE_HW_CACHE,
+ .read_format = 0,
+ .sample_type = 0,
+ .config =
+ PERF_COUNT_HW_CACHE_LL |
+ (PERF_COUNT_HW_CACHE_OP_READ << 8) |
+ (PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
+ };
+ struct perf_event_attr attr_msr_tsc = {
+ .freq = 0,
+ .sample_period = 0,
+ .inherit = 0,
+ /* From /sys/bus/event_source/devices/msr/ */
+ .type = 7,
+ .read_format = 0,
+ .sample_type = 0,
+ .config = 0,
+ };
+
+ test_perf_event_array(&attr_cycles, "HARDWARE-cycles");
+ test_perf_event_array(&attr_clock, "SOFTWARE-clock");
+ test_perf_event_array(&attr_raw, "RAW-instruction-retired");
+ test_perf_event_array(&attr_l1d_load, "HW_CACHE-L1D-load");
+
+ /* below tests may fail in qemu */
+ test_perf_event_array(&attr_llc_miss, "HW_CACHE-LLC-miss");
+ test_perf_event_array(&attr_msr_tsc, "Dynamic-msr-tsc");
+}
+
+int main(int argc, char **argv)
+{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ char filename[256];
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ setrlimit(RLIMIT_MEMLOCK, &r);
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ test_bpf_perf_event();
+ return 0;
+}
diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c
new file mode 100644
index 000000000..1ab308a43
--- /dev/null
+++ b/samples/bpf/tracex7_kern.c
@@ -0,0 +1,16 @@
+#include <uapi/linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include <linux/version.h>
+#include "bpf_helpers.h"
+
+SEC("kprobe/open_ctree")
+int bpf_prog1(struct pt_regs *ctx)
+{
+ unsigned long rc = -12;
+
+ bpf_override_return(ctx, rc);
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c
new file mode 100644
index 000000000..2ed13e9f3
--- /dev/null
+++ b/samples/bpf/tracex7_user.c
@@ -0,0 +1,33 @@
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <linux/bpf.h>
+#include <unistd.h>
+#include <bpf/bpf.h>
+#include "bpf_load.h"
+
+int main(int argc, char **argv)
+{
+ FILE *f;
+ char filename[256];
+ char command[256];
+ int ret;
+
+ if (!argv[1]) {
+ fprintf(stderr, "ERROR: Run with the btrfs device argument!\n");
+ return 0;
+ }
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ snprintf(command, 256, "mount %s tmpmnt/", argv[1]);
+ f = popen(command, "r");
+ ret = pclose(f);
+
+ return ret ? 0 : 1;
+}
diff --git a/samples/bpf/xdp1_kern.c b/samples/bpf/xdp1_kern.c
new file mode 100644
index 000000000..219742106
--- /dev/null
+++ b/samples/bpf/xdp1_kern.c
@@ -0,0 +1,93 @@
+/* Copyright (c) 2016 PLUMgrid
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") rxcnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(long),
+ .max_entries = 256,
+};
+
+static int parse_ipv4(void *data, u64 nh_off, void *data_end)
+{
+ struct iphdr *iph = data + nh_off;
+
+ if (iph + 1 > data_end)
+ return 0;
+ return iph->protocol;
+}
+
+static int parse_ipv6(void *data, u64 nh_off, void *data_end)
+{
+ struct ipv6hdr *ip6h = data + nh_off;
+
+ if (ip6h + 1 > data_end)
+ return 0;
+ return ip6h->nexthdr;
+}
+
+SEC("xdp1")
+int xdp_prog1(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct ethhdr *eth = data;
+ int rc = XDP_DROP;
+ long *value;
+ u16 h_proto;
+ u64 nh_off;
+ u32 ipproto;
+
+ nh_off = sizeof(*eth);
+ if (data + nh_off > data_end)
+ return rc;
+
+ h_proto = eth->h_proto;
+
+ if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
+ struct vlan_hdr *vhdr;
+
+ vhdr = data + nh_off;
+ nh_off += sizeof(struct vlan_hdr);
+ if (data + nh_off > data_end)
+ return rc;
+ h_proto = vhdr->h_vlan_encapsulated_proto;
+ }
+ if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
+ struct vlan_hdr *vhdr;
+
+ vhdr = data + nh_off;
+ nh_off += sizeof(struct vlan_hdr);
+ if (data + nh_off > data_end)
+ return rc;
+ h_proto = vhdr->h_vlan_encapsulated_proto;
+ }
+
+ if (h_proto == htons(ETH_P_IP))
+ ipproto = parse_ipv4(data, nh_off, data_end);
+ else if (h_proto == htons(ETH_P_IPV6))
+ ipproto = parse_ipv6(data, nh_off, data_end);
+ else
+ ipproto = 0;
+
+ value = bpf_map_lookup_elem(&rxcnt, &ipproto);
+ if (value)
+ *value += 1;
+
+ return rc;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c
new file mode 100644
index 000000000..b02c53151
--- /dev/null
+++ b/samples/bpf/xdp1_user.c
@@ -0,0 +1,138 @@
+/* Copyright (c) 2016 PLUMgrid
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <libgen.h>
+#include <sys/resource.h>
+
+#include "bpf_util.h"
+#include "bpf/bpf.h"
+#include "bpf/libbpf.h"
+
+static int ifindex;
+static __u32 xdp_flags;
+
+static void int_exit(int sig)
+{
+ bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
+ exit(0);
+}
+
+/* simple per-protocol drop counter
+ */
+static void poll_stats(int map_fd, int interval)
+{
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ const unsigned int nr_keys = 256;
+ __u64 values[nr_cpus], prev[nr_keys][nr_cpus];
+ __u32 key;
+ int i;
+
+ memset(prev, 0, sizeof(prev));
+
+ while (1) {
+ sleep(interval);
+
+ for (key = 0; key < nr_keys; key++) {
+ __u64 sum = 0;
+
+ assert(bpf_map_lookup_elem(map_fd, &key, values) == 0);
+ for (i = 0; i < nr_cpus; i++)
+ sum += (values[i] - prev[key][i]);
+ if (sum)
+ printf("proto %u: %10llu pkt/s\n",
+ key, sum / interval);
+ memcpy(prev[key], values, sizeof(values));
+ }
+ }
+}
+
+static void usage(const char *prog)
+{
+ fprintf(stderr,
+ "usage: %s [OPTS] IFINDEX\n\n"
+ "OPTS:\n"
+ " -S use skb-mode\n"
+ " -N enforce native mode\n",
+ prog);
+}
+
+int main(int argc, char **argv)
+{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ struct bpf_prog_load_attr prog_load_attr = {
+ .prog_type = BPF_PROG_TYPE_XDP,
+ };
+ const char *optstr = "SN";
+ int prog_fd, map_fd, opt;
+ struct bpf_object *obj;
+ struct bpf_map *map;
+ char filename[256];
+
+ while ((opt = getopt(argc, argv, optstr)) != -1) {
+ switch (opt) {
+ case 'S':
+ xdp_flags |= XDP_FLAGS_SKB_MODE;
+ break;
+ case 'N':
+ xdp_flags |= XDP_FLAGS_DRV_MODE;
+ break;
+ default:
+ usage(basename(argv[0]));
+ return 1;
+ }
+ }
+
+ if (optind == argc) {
+ usage(basename(argv[0]));
+ return 1;
+ }
+
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK)");
+ return 1;
+ }
+
+ ifindex = strtoul(argv[optind], NULL, 0);
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ prog_load_attr.file = filename;
+
+ if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
+ return 1;
+
+ map = bpf_map__next(NULL, obj);
+ if (!map) {
+ printf("finding a map in obj file failed\n");
+ return 1;
+ }
+ map_fd = bpf_map__fd(map);
+
+ if (!prog_fd) {
+ printf("load_bpf_file: %s\n", strerror(errno));
+ return 1;
+ }
+
+ signal(SIGINT, int_exit);
+ signal(SIGTERM, int_exit);
+
+ if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) {
+ printf("link set xdp fd failed\n");
+ return 1;
+ }
+
+ poll_stats(map_fd, 2);
+
+ return 0;
+}
diff --git a/samples/bpf/xdp2_kern.c b/samples/bpf/xdp2_kern.c
new file mode 100644
index 000000000..e01288867
--- /dev/null
+++ b/samples/bpf/xdp2_kern.c
@@ -0,0 +1,114 @@
+/* Copyright (c) 2016 PLUMgrid
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") rxcnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(long),
+ .max_entries = 256,
+};
+
+static void swap_src_dst_mac(void *data)
+{
+ unsigned short *p = data;
+ unsigned short dst[3];
+
+ dst[0] = p[0];
+ dst[1] = p[1];
+ dst[2] = p[2];
+ p[0] = p[3];
+ p[1] = p[4];
+ p[2] = p[5];
+ p[3] = dst[0];
+ p[4] = dst[1];
+ p[5] = dst[2];
+}
+
+static int parse_ipv4(void *data, u64 nh_off, void *data_end)
+{
+ struct iphdr *iph = data + nh_off;
+
+ if (iph + 1 > data_end)
+ return 0;
+ return iph->protocol;
+}
+
+static int parse_ipv6(void *data, u64 nh_off, void *data_end)
+{
+ struct ipv6hdr *ip6h = data + nh_off;
+
+ if (ip6h + 1 > data_end)
+ return 0;
+ return ip6h->nexthdr;
+}
+
+SEC("xdp1")
+int xdp_prog1(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct ethhdr *eth = data;
+ int rc = XDP_DROP;
+ long *value;
+ u16 h_proto;
+ u64 nh_off;
+ u32 ipproto;
+
+ nh_off = sizeof(*eth);
+ if (data + nh_off > data_end)
+ return rc;
+
+ h_proto = eth->h_proto;
+
+ if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
+ struct vlan_hdr *vhdr;
+
+ vhdr = data + nh_off;
+ nh_off += sizeof(struct vlan_hdr);
+ if (data + nh_off > data_end)
+ return rc;
+ h_proto = vhdr->h_vlan_encapsulated_proto;
+ }
+ if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
+ struct vlan_hdr *vhdr;
+
+ vhdr = data + nh_off;
+ nh_off += sizeof(struct vlan_hdr);
+ if (data + nh_off > data_end)
+ return rc;
+ h_proto = vhdr->h_vlan_encapsulated_proto;
+ }
+
+ if (h_proto == htons(ETH_P_IP))
+ ipproto = parse_ipv4(data, nh_off, data_end);
+ else if (h_proto == htons(ETH_P_IPV6))
+ ipproto = parse_ipv6(data, nh_off, data_end);
+ else
+ ipproto = 0;
+
+ value = bpf_map_lookup_elem(&rxcnt, &ipproto);
+ if (value)
+ *value += 1;
+
+ if (ipproto == IPPROTO_UDP) {
+ swap_src_dst_mac(data);
+ rc = XDP_TX;
+ }
+
+ return rc;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp2skb_meta.sh b/samples/bpf/xdp2skb_meta.sh
new file mode 100755
index 000000000..4bde9d066
--- /dev/null
+++ b/samples/bpf/xdp2skb_meta.sh
@@ -0,0 +1,220 @@
+#!/bin/bash
+#
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2018 Jesper Dangaard Brouer, Red Hat Inc.
+#
+# Bash-shell example on using iproute2 tools 'tc' and 'ip' to load
+# eBPF programs, both for XDP and clsbpf. Shell script function
+# wrappers and even long options parsing is illustrated, for ease of
+# use.
+#
+# Related to sample/bpf/xdp2skb_meta_kern.c, which contains BPF-progs
+# that need to collaborate between XDP and TC hooks. Thus, it is
+# convenient that the same tool load both programs that need to work
+# together.
+#
+BPF_FILE=xdp2skb_meta_kern.o
+DIR=$(dirname $0)
+
+[ -z "$TC" ] && TC=tc
+[ -z "$IP" ] && IP=ip
+
+function usage() {
+ echo ""
+ echo "Usage: $0 [-vfh] --dev ethX"
+ echo " -d | --dev : Network device (required)"
+ echo " --flush : Cleanup flush TC and XDP progs"
+ echo " --list : (\$LIST) List TC and XDP progs"
+ echo " -v | --verbose : (\$VERBOSE) Verbose"
+ echo " --dry-run : (\$DRYRUN) Dry-run only (echo commands)"
+ echo ""
+}
+
+## -- General shell logging cmds --
+function err() {
+ local exitcode=$1
+ shift
+ echo "ERROR: $@" >&2
+ exit $exitcode
+}
+
+function info() {
+ if [[ -n "$VERBOSE" ]]; then
+ echo "# $@"
+ fi
+}
+
+## -- Helper function calls --
+
+# Wrapper call for TC and IP
+# - Will display the offending command on failure
+function _call_cmd() {
+ local cmd="$1"
+ local allow_fail="$2"
+ shift 2
+ if [[ -n "$VERBOSE" ]]; then
+ echo "$cmd $@"
+ fi
+ if [[ -n "$DRYRUN" ]]; then
+ return
+ fi
+ $cmd "$@"
+ local status=$?
+ if (( $status != 0 )); then
+ if [[ "$allow_fail" == "" ]]; then
+ err 2 "Exec error($status) occurred cmd: \"$cmd $@\""
+ fi
+ fi
+}
+function call_tc() {
+ _call_cmd "$TC" "" "$@"
+}
+function call_tc_allow_fail() {
+ _call_cmd "$TC" "allow_fail" "$@"
+}
+function call_ip() {
+ _call_cmd "$IP" "" "$@"
+}
+
+## --- Parse command line arguments / parameters ---
+# Using external program "getopt" to get --long-options
+OPTIONS=$(getopt -o vfhd: \
+ --long verbose,flush,help,list,dev:,dry-run -- "$@")
+if (( $? != 0 )); then
+ err 4 "Error calling getopt"
+fi
+eval set -- "$OPTIONS"
+
+unset DEV
+unset FLUSH
+while true; do
+ case "$1" in
+ -d | --dev ) # device
+ DEV=$2
+ info "Device set to: DEV=$DEV" >&2
+ shift 2
+ ;;
+ -v | --verbose)
+ VERBOSE=yes
+ # info "Verbose mode: VERBOSE=$VERBOSE" >&2
+ shift
+ ;;
+ --dry-run )
+ DRYRUN=yes
+ VERBOSE=yes
+ info "Dry-run mode: enable VERBOSE and don't call TC+IP" >&2
+ shift
+ ;;
+ -f | --flush )
+ FLUSH=yes
+ shift
+ ;;
+ --list )
+ LIST=yes
+ shift
+ ;;
+ -- )
+ shift
+ break
+ ;;
+ -h | --help )
+ usage;
+ exit 0
+ ;;
+ * )
+ shift
+ break
+ ;;
+ esac
+done
+
+FILE="$DIR/$BPF_FILE"
+if [[ ! -e $FILE ]]; then
+ err 3 "Missing BPF object file ($FILE)"
+fi
+
+if [[ -z $DEV ]]; then
+ usage
+ err 2 "Please specify network device -- required option --dev"
+fi
+
+## -- Function calls --
+
+function list_tc()
+{
+ local device="$1"
+ shift
+ info "Listing current TC ingress rules"
+ call_tc filter show dev $device ingress
+}
+
+function list_xdp()
+{
+ local device="$1"
+ shift
+ info "Listing current XDP device($device) setting"
+ call_ip link show dev $device | grep --color=auto xdp
+}
+
+function flush_tc()
+{
+ local device="$1"
+ shift
+ info "Flush TC on device: $device"
+ call_tc_allow_fail filter del dev $device ingress
+ call_tc_allow_fail qdisc del dev $device clsact
+}
+
+function flush_xdp()
+{
+ local device="$1"
+ shift
+ info "Flush XDP on device: $device"
+ call_ip link set dev $device xdp off
+}
+
+function attach_tc_mark()
+{
+ local device="$1"
+ local file="$2"
+ local prog="tc_mark"
+ shift 2
+
+ # Re-attach clsact to clear/flush existing role
+ call_tc_allow_fail qdisc del dev $device clsact 2> /dev/null
+ call_tc qdisc add dev $device clsact
+
+ # Attach BPF prog
+ call_tc filter add dev $device ingress \
+ prio 1 handle 1 bpf da obj $file sec $prog
+}
+
+function attach_xdp_mark()
+{
+ local device="$1"
+ local file="$2"
+ local prog="xdp_mark"
+ shift 2
+
+ # Remove XDP prog in-case it's already loaded
+ # TODO: Need ip-link option to override/replace existing XDP prog
+ flush_xdp $device
+
+ # Attach XDP/BPF prog
+ call_ip link set dev $device xdp obj $file sec $prog
+}
+
+if [[ -n $FLUSH ]]; then
+ flush_tc $DEV
+ flush_xdp $DEV
+ exit 0
+fi
+
+if [[ -n $LIST ]]; then
+ list_tc $DEV
+ list_xdp $DEV
+ exit 0
+fi
+
+attach_tc_mark $DEV $FILE
+attach_xdp_mark $DEV $FILE
diff --git a/samples/bpf/xdp2skb_meta_kern.c b/samples/bpf/xdp2skb_meta_kern.c
new file mode 100644
index 000000000..0c12048ac
--- /dev/null
+++ b/samples/bpf/xdp2skb_meta_kern.c
@@ -0,0 +1,105 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2018 Jesper Dangaard Brouer, Red Hat Inc.
+ *
+ * Example howto transfer info from XDP to SKB, e.g. skb->mark
+ * -----------------------------------------------------------
+ * This uses the XDP data_meta infrastructure, and is a cooperation
+ * between two bpf-programs (1) XDP and (2) clsact at TC-ingress hook.
+ *
+ * Notice: This example does not use the BPF C-loader (bpf_load.c),
+ * but instead rely on the iproute2 TC tool for loading BPF-objects.
+ */
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/pkt_cls.h>
+
+#include "bpf_helpers.h"
+
+/*
+ * This struct is stored in the XDP 'data_meta' area, which is located
+ * just in-front-of the raw packet payload data. The meaning is
+ * specific to these two BPF programs that use it as a communication
+ * channel. XDP adjust/increase the area via a bpf-helper, and TC use
+ * boundary checks to see if data have been provided.
+ *
+ * The struct must be 4 byte aligned, which here is enforced by the
+ * struct __attribute__((aligned(4))).
+ */
+struct meta_info {
+ __u32 mark;
+} __attribute__((aligned(4)));
+
+SEC("xdp_mark")
+int _xdp_mark(struct xdp_md *ctx)
+{
+ struct meta_info *meta;
+ void *data, *data_end;
+ int ret;
+
+ /* Reserve space in-front of data pointer for our meta info.
+ * (Notice drivers not supporting data_meta will fail here!)
+ */
+ ret = bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta));
+ if (ret < 0)
+ return XDP_ABORTED;
+
+ /* Notice: Kernel-side verifier requires that loading of
+ * ctx->data MUST happen _after_ helper bpf_xdp_adjust_meta(),
+ * as pkt-data pointers are invalidated. Helpers that require
+ * this are determined/marked by bpf_helper_changes_pkt_data()
+ */
+ data = (void *)(unsigned long)ctx->data;
+
+ /* Check data_meta have room for meta_info struct */
+ meta = (void *)(unsigned long)ctx->data_meta;
+ if (meta + 1 > data)
+ return XDP_ABORTED;
+
+ meta->mark = 42;
+
+ return XDP_PASS;
+}
+
+SEC("tc_mark")
+int _tc_mark(struct __sk_buff *ctx)
+{
+ void *data = (void *)(unsigned long)ctx->data;
+ void *data_end = (void *)(unsigned long)ctx->data_end;
+ void *data_meta = (void *)(unsigned long)ctx->data_meta;
+ struct meta_info *meta = data_meta;
+
+ /* Check XDP gave us some data_meta */
+ if (meta + 1 > data) {
+ ctx->mark = 41;
+ /* Skip "accept" if no data_meta is avail */
+ return TC_ACT_OK;
+ }
+
+ /* Hint: See func tc_cls_act_is_valid_access() for BPF_WRITE access */
+ ctx->mark = meta->mark; /* Transfer XDP-mark to SKB-mark */
+
+ return TC_ACT_OK;
+}
+
+/* Manually attaching these programs:
+export DEV=ixgbe2
+export FILE=xdp2skb_meta_kern.o
+
+# via TC command
+tc qdisc del dev $DEV clsact 2> /dev/null
+tc qdisc add dev $DEV clsact
+tc filter add dev $DEV ingress prio 1 handle 1 bpf da obj $FILE sec tc_mark
+tc filter show dev $DEV ingress
+
+# XDP via IP command:
+ip link set dev $DEV xdp off
+ip link set dev $DEV xdp obj $FILE sec xdp_mark
+
+# Use iptable to "see" if SKBs are marked
+iptables -I INPUT -p icmp -m mark --mark 41 # == 0x29
+iptables -I INPUT -p icmp -m mark --mark 42 # == 0x2a
+
+# Hint: catch XDP_ABORTED errors via
+perf record -e xdp:*
+perf script
+
+*/
diff --git a/samples/bpf/xdp_adjust_tail_kern.c b/samples/bpf/xdp_adjust_tail_kern.c
new file mode 100644
index 000000000..411fdb21f
--- /dev/null
+++ b/samples/bpf/xdp_adjust_tail_kern.c
@@ -0,0 +1,152 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2018 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program shows how to use bpf_xdp_adjust_tail() by
+ * generating ICMPv4 "packet to big" (unreachable/ df bit set frag needed
+ * to be more preice in case of v4)" where receiving packets bigger then
+ * 600 bytes.
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include "bpf_helpers.h"
+
+#define DEFAULT_TTL 64
+#define MAX_PCKT_SIZE 600
+#define ICMP_TOOBIG_SIZE 98
+#define ICMP_TOOBIG_PAYLOAD_SIZE 92
+
+struct bpf_map_def SEC("maps") icmpcnt = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(__u32),
+ .value_size = sizeof(__u64),
+ .max_entries = 1,
+};
+
+static __always_inline void count_icmp(void)
+{
+ u64 key = 0;
+ u64 *icmp_count;
+
+ icmp_count = bpf_map_lookup_elem(&icmpcnt, &key);
+ if (icmp_count)
+ *icmp_count += 1;
+}
+
+static __always_inline void swap_mac(void *data, struct ethhdr *orig_eth)
+{
+ struct ethhdr *eth;
+
+ eth = data;
+ memcpy(eth->h_source, orig_eth->h_dest, ETH_ALEN);
+ memcpy(eth->h_dest, orig_eth->h_source, ETH_ALEN);
+ eth->h_proto = orig_eth->h_proto;
+}
+
+static __always_inline __u16 csum_fold_helper(__u32 csum)
+{
+ return ~((csum & 0xffff) + (csum >> 16));
+}
+
+static __always_inline void ipv4_csum(void *data_start, int data_size,
+ __u32 *csum)
+{
+ *csum = bpf_csum_diff(0, 0, data_start, data_size, *csum);
+ *csum = csum_fold_helper(*csum);
+}
+
+static __always_inline int send_icmp4_too_big(struct xdp_md *xdp)
+{
+ int headroom = (int)sizeof(struct iphdr) + (int)sizeof(struct icmphdr);
+
+ if (bpf_xdp_adjust_head(xdp, 0 - headroom))
+ return XDP_DROP;
+ void *data = (void *)(long)xdp->data;
+ void *data_end = (void *)(long)xdp->data_end;
+
+ if (data + (ICMP_TOOBIG_SIZE + headroom) > data_end)
+ return XDP_DROP;
+
+ struct iphdr *iph, *orig_iph;
+ struct icmphdr *icmp_hdr;
+ struct ethhdr *orig_eth;
+ __u32 csum = 0;
+ __u64 off = 0;
+
+ orig_eth = data + headroom;
+ swap_mac(data, orig_eth);
+ off += sizeof(struct ethhdr);
+ iph = data + off;
+ off += sizeof(struct iphdr);
+ icmp_hdr = data + off;
+ off += sizeof(struct icmphdr);
+ orig_iph = data + off;
+ icmp_hdr->type = ICMP_DEST_UNREACH;
+ icmp_hdr->code = ICMP_FRAG_NEEDED;
+ icmp_hdr->un.frag.mtu = htons(MAX_PCKT_SIZE-sizeof(struct ethhdr));
+ icmp_hdr->checksum = 0;
+ ipv4_csum(icmp_hdr, ICMP_TOOBIG_PAYLOAD_SIZE, &csum);
+ icmp_hdr->checksum = csum;
+ iph->ttl = DEFAULT_TTL;
+ iph->daddr = orig_iph->saddr;
+ iph->saddr = orig_iph->daddr;
+ iph->version = 4;
+ iph->ihl = 5;
+ iph->protocol = IPPROTO_ICMP;
+ iph->tos = 0;
+ iph->tot_len = htons(
+ ICMP_TOOBIG_SIZE + headroom - sizeof(struct ethhdr));
+ iph->check = 0;
+ csum = 0;
+ ipv4_csum(iph, sizeof(struct iphdr), &csum);
+ iph->check = csum;
+ count_icmp();
+ return XDP_TX;
+}
+
+
+static __always_inline int handle_ipv4(struct xdp_md *xdp)
+{
+ void *data_end = (void *)(long)xdp->data_end;
+ void *data = (void *)(long)xdp->data;
+ int pckt_size = data_end - data;
+ int offset;
+
+ if (pckt_size > MAX_PCKT_SIZE) {
+ offset = pckt_size - ICMP_TOOBIG_SIZE;
+ if (bpf_xdp_adjust_tail(xdp, 0 - offset))
+ return XDP_PASS;
+ return send_icmp4_too_big(xdp);
+ }
+ return XDP_PASS;
+}
+
+SEC("xdp_icmp")
+int _xdp_icmp(struct xdp_md *xdp)
+{
+ void *data_end = (void *)(long)xdp->data_end;
+ void *data = (void *)(long)xdp->data;
+ struct ethhdr *eth = data;
+ __u16 h_proto;
+
+ if (eth + 1 > data_end)
+ return XDP_DROP;
+
+ h_proto = eth->h_proto;
+
+ if (h_proto == htons(ETH_P_IP))
+ return handle_ipv4(xdp);
+ else
+ return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_adjust_tail_user.c b/samples/bpf/xdp_adjust_tail_user.c
new file mode 100644
index 000000000..3042ce37d
--- /dev/null
+++ b/samples/bpf/xdp_adjust_tail_user.c
@@ -0,0 +1,150 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2018 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <arpa/inet.h>
+#include <netinet/ether.h>
+#include <unistd.h>
+#include <time.h>
+#include "bpf/bpf.h"
+#include "bpf/libbpf.h"
+
+#define STATS_INTERVAL_S 2U
+
+static int ifindex = -1;
+static __u32 xdp_flags;
+
+static void int_exit(int sig)
+{
+ if (ifindex > -1)
+ bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
+ exit(0);
+}
+
+/* simple "icmp packet too big sent" counter
+ */
+static void poll_stats(unsigned int map_fd, unsigned int kill_after_s)
+{
+ time_t started_at = time(NULL);
+ __u64 value = 0;
+ int key = 0;
+
+
+ while (!kill_after_s || time(NULL) - started_at <= kill_after_s) {
+ sleep(STATS_INTERVAL_S);
+
+ assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0);
+
+ printf("icmp \"packet too big\" sent: %10llu pkts\n", value);
+ }
+}
+
+static void usage(const char *cmd)
+{
+ printf("Start a XDP prog which send ICMP \"packet too big\" \n"
+ "messages if ingress packet is bigger then MAX_SIZE bytes\n");
+ printf("Usage: %s [...]\n", cmd);
+ printf(" -i <ifindex> Interface Index\n");
+ printf(" -T <stop-after-X-seconds> Default: 0 (forever)\n");
+ printf(" -S use skb-mode\n");
+ printf(" -N enforce native mode\n");
+ printf(" -h Display this help\n");
+}
+
+int main(int argc, char **argv)
+{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ struct bpf_prog_load_attr prog_load_attr = {
+ .prog_type = BPF_PROG_TYPE_XDP,
+ };
+ unsigned char opt_flags[256] = {};
+ unsigned int kill_after_s = 0;
+ const char *optstr = "i:T:SNh";
+ int i, prog_fd, map_fd, opt;
+ struct bpf_object *obj;
+ struct bpf_map *map;
+ char filename[256];
+
+ for (i = 0; i < strlen(optstr); i++)
+ if (optstr[i] != 'h' && 'a' <= optstr[i] && optstr[i] <= 'z')
+ opt_flags[(unsigned char)optstr[i]] = 1;
+
+ while ((opt = getopt(argc, argv, optstr)) != -1) {
+
+ switch (opt) {
+ case 'i':
+ ifindex = atoi(optarg);
+ break;
+ case 'T':
+ kill_after_s = atoi(optarg);
+ break;
+ case 'S':
+ xdp_flags |= XDP_FLAGS_SKB_MODE;
+ break;
+ case 'N':
+ xdp_flags |= XDP_FLAGS_DRV_MODE;
+ break;
+ default:
+ usage(argv[0]);
+ return 1;
+ }
+ opt_flags[opt] = 0;
+ }
+
+ for (i = 0; i < strlen(optstr); i++) {
+ if (opt_flags[(unsigned int)optstr[i]]) {
+ fprintf(stderr, "Missing argument -%c\n", optstr[i]);
+ usage(argv[0]);
+ return 1;
+ }
+ }
+
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)");
+ return 1;
+ }
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ prog_load_attr.file = filename;
+
+ if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
+ return 1;
+
+ map = bpf_map__next(NULL, obj);
+ if (!map) {
+ printf("finding a map in obj file failed\n");
+ return 1;
+ }
+ map_fd = bpf_map__fd(map);
+
+ if (!prog_fd) {
+ printf("load_bpf_file: %s\n", strerror(errno));
+ return 1;
+ }
+
+ signal(SIGINT, int_exit);
+ signal(SIGTERM, int_exit);
+
+ if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) {
+ printf("link set xdp fd failed\n");
+ return 1;
+ }
+
+ poll_stats(map_fd, kill_after_s);
+
+ bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
+
+ return 0;
+}
diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c
new file mode 100644
index 000000000..a7e94e7ff
--- /dev/null
+++ b/samples/bpf/xdp_fwd_kern.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017-18 David Ahern <dsahern@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+
+#include "bpf_helpers.h"
+
+#define IPV6_FLOWINFO_MASK cpu_to_be32(0x0FFFFFFF)
+
+struct bpf_map_def SEC("maps") tx_port = {
+ .type = BPF_MAP_TYPE_DEVMAP,
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .max_entries = 64,
+};
+
+/* from include/net/ip.h */
+static __always_inline int ip_decrease_ttl(struct iphdr *iph)
+{
+ u32 check = (__force u32)iph->check;
+
+ check += (__force u32)htons(0x0100);
+ iph->check = (__force __sum16)(check + (check >= 0xFFFF));
+ return --iph->ttl;
+}
+
+static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct bpf_fib_lookup fib_params;
+ struct ethhdr *eth = data;
+ struct ipv6hdr *ip6h;
+ struct iphdr *iph;
+ u16 h_proto;
+ u64 nh_off;
+ int rc;
+
+ nh_off = sizeof(*eth);
+ if (data + nh_off > data_end)
+ return XDP_DROP;
+
+ __builtin_memset(&fib_params, 0, sizeof(fib_params));
+
+ h_proto = eth->h_proto;
+ if (h_proto == htons(ETH_P_IP)) {
+ iph = data + nh_off;
+
+ if (iph + 1 > data_end)
+ return XDP_DROP;
+
+ if (iph->ttl <= 1)
+ return XDP_PASS;
+
+ fib_params.family = AF_INET;
+ fib_params.tos = iph->tos;
+ fib_params.l4_protocol = iph->protocol;
+ fib_params.sport = 0;
+ fib_params.dport = 0;
+ fib_params.tot_len = ntohs(iph->tot_len);
+ fib_params.ipv4_src = iph->saddr;
+ fib_params.ipv4_dst = iph->daddr;
+ } else if (h_proto == htons(ETH_P_IPV6)) {
+ struct in6_addr *src = (struct in6_addr *) fib_params.ipv6_src;
+ struct in6_addr *dst = (struct in6_addr *) fib_params.ipv6_dst;
+
+ ip6h = data + nh_off;
+ if (ip6h + 1 > data_end)
+ return XDP_DROP;
+
+ if (ip6h->hop_limit <= 1)
+ return XDP_PASS;
+
+ fib_params.family = AF_INET6;
+ fib_params.flowinfo = *(__be32 *)ip6h & IPV6_FLOWINFO_MASK;
+ fib_params.l4_protocol = ip6h->nexthdr;
+ fib_params.sport = 0;
+ fib_params.dport = 0;
+ fib_params.tot_len = ntohs(ip6h->payload_len);
+ *src = ip6h->saddr;
+ *dst = ip6h->daddr;
+ } else {
+ return XDP_PASS;
+ }
+
+ fib_params.ifindex = ctx->ingress_ifindex;
+
+ rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
+
+ /* verify egress index has xdp support
+ * TO-DO bpf_map_lookup_elem(&tx_port, &key) fails with
+ * cannot pass map_type 14 into func bpf_map_lookup_elem#1:
+ * NOTE: without verification that egress index supports XDP
+ * forwarding packets are dropped.
+ */
+ if (rc == 0) {
+ if (h_proto == htons(ETH_P_IP))
+ ip_decrease_ttl(iph);
+ else if (h_proto == htons(ETH_P_IPV6))
+ ip6h->hop_limit--;
+
+ memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN);
+ memcpy(eth->h_source, fib_params.smac, ETH_ALEN);
+ return bpf_redirect_map(&tx_port, fib_params.ifindex, 0);
+ }
+
+ return XDP_PASS;
+}
+
+SEC("xdp_fwd")
+int xdp_fwd_prog(struct xdp_md *ctx)
+{
+ return xdp_fwd_flags(ctx, 0);
+}
+
+SEC("xdp_fwd_direct")
+int xdp_fwd_direct_prog(struct xdp_md *ctx)
+{
+ return xdp_fwd_flags(ctx, BPF_FIB_LOOKUP_DIRECT);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_fwd_user.c b/samples/bpf/xdp_fwd_user.c
new file mode 100644
index 000000000..f88e1d709
--- /dev/null
+++ b/samples/bpf/xdp_fwd_user.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017-18 David Ahern <dsahern@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <linux/limits.h>
+#include <net/if.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <libgen.h>
+
+#include "bpf/libbpf.h"
+#include <bpf/bpf.h>
+
+
+static int do_attach(int idx, int fd, const char *name)
+{
+ int err;
+
+ err = bpf_set_link_xdp_fd(idx, fd, 0);
+ if (err < 0)
+ printf("ERROR: failed to attach program to %s\n", name);
+
+ return err;
+}
+
+static int do_detach(int idx, const char *name)
+{
+ int err;
+
+ err = bpf_set_link_xdp_fd(idx, -1, 0);
+ if (err < 0)
+ printf("ERROR: failed to detach program from %s\n", name);
+
+ return err;
+}
+
+static void usage(const char *prog)
+{
+ fprintf(stderr,
+ "usage: %s [OPTS] interface-list\n"
+ "\nOPTS:\n"
+ " -d detach program\n"
+ " -D direct table lookups (skip fib rules)\n",
+ prog);
+}
+
+int main(int argc, char **argv)
+{
+ struct bpf_prog_load_attr prog_load_attr = {
+ .prog_type = BPF_PROG_TYPE_XDP,
+ };
+ const char *prog_name = "xdp_fwd";
+ struct bpf_program *prog;
+ char filename[PATH_MAX];
+ struct bpf_object *obj;
+ int opt, i, idx, err;
+ int prog_fd, map_fd;
+ int attach = 1;
+ int ret = 0;
+
+ while ((opt = getopt(argc, argv, ":dD")) != -1) {
+ switch (opt) {
+ case 'd':
+ attach = 0;
+ break;
+ case 'D':
+ prog_name = "xdp_fwd_direct";
+ break;
+ default:
+ usage(basename(argv[0]));
+ return 1;
+ }
+ }
+
+ if (optind == argc) {
+ usage(basename(argv[0]));
+ return 1;
+ }
+
+ if (attach) {
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ prog_load_attr.file = filename;
+
+ if (access(filename, O_RDONLY) < 0) {
+ printf("error accessing file %s: %s\n",
+ filename, strerror(errno));
+ return 1;
+ }
+
+ if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
+ return 1;
+
+ prog = bpf_object__find_program_by_title(obj, prog_name);
+ prog_fd = bpf_program__fd(prog);
+ if (prog_fd < 0) {
+ printf("program not found: %s\n", strerror(prog_fd));
+ return 1;
+ }
+ map_fd = bpf_map__fd(bpf_object__find_map_by_name(obj,
+ "tx_port"));
+ if (map_fd < 0) {
+ printf("map not found: %s\n", strerror(map_fd));
+ return 1;
+ }
+ }
+ if (attach) {
+ for (i = 1; i < 64; ++i)
+ bpf_map_update_elem(map_fd, &i, &i, 0);
+ }
+
+ for (i = optind; i < argc; ++i) {
+ idx = if_nametoindex(argv[i]);
+ if (!idx)
+ idx = strtoul(argv[i], NULL, 0);
+
+ if (!idx) {
+ fprintf(stderr, "Invalid arg\n");
+ return 1;
+ }
+ if (!attach) {
+ err = do_detach(idx, argv[i]);
+ if (err)
+ ret = err;
+ } else {
+ err = do_attach(idx, prog_fd, argv[i]);
+ if (err)
+ ret = err;
+ }
+ }
+
+ return ret;
+}
diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c
new file mode 100644
index 000000000..ad10fe700
--- /dev/null
+++ b/samples/bpf/xdp_monitor_kern.c
@@ -0,0 +1,259 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright(c) 2017-2018 Jesper Dangaard Brouer, Red Hat Inc.
+ *
+ * XDP monitor tool, based on tracepoints
+ */
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") redirect_err_cnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u64),
+ .max_entries = 2,
+ /* TODO: have entries for all possible errno's */
+};
+
+#define XDP_UNKNOWN XDP_REDIRECT + 1
+struct bpf_map_def SEC("maps") exception_cnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u64),
+ .max_entries = XDP_UNKNOWN + 1,
+};
+
+/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
+ * Code in: kernel/include/trace/events/xdp.h
+ */
+struct xdp_redirect_ctx {
+ u64 __pad; // First 8 bytes are not accessible by bpf code
+ int prog_id; // offset:8; size:4; signed:1;
+ u32 act; // offset:12 size:4; signed:0;
+ int ifindex; // offset:16 size:4; signed:1;
+ int err; // offset:20 size:4; signed:1;
+ int to_ifindex; // offset:24 size:4; signed:1;
+ u32 map_id; // offset:28 size:4; signed:0;
+ int map_index; // offset:32 size:4; signed:1;
+}; // offset:36
+
+enum {
+ XDP_REDIRECT_SUCCESS = 0,
+ XDP_REDIRECT_ERROR = 1
+};
+
+static __always_inline
+int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx)
+{
+ u32 key = XDP_REDIRECT_ERROR;
+ int err = ctx->err;
+ u64 *cnt;
+
+ if (!err)
+ key = XDP_REDIRECT_SUCCESS;
+
+ cnt = bpf_map_lookup_elem(&redirect_err_cnt, &key);
+ if (!cnt)
+ return 1;
+ *cnt += 1;
+
+ return 0; /* Indicate event was filtered (no further processing)*/
+ /*
+ * Returning 1 here would allow e.g. a perf-record tracepoint
+ * to see and record these events, but it doesn't work well
+ * in-practice as stopping perf-record also unload this
+ * bpf_prog. Plus, there is additional overhead of doing so.
+ */
+}
+
+SEC("tracepoint/xdp/xdp_redirect_err")
+int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx)
+{
+ return xdp_redirect_collect_stat(ctx);
+}
+
+
+SEC("tracepoint/xdp/xdp_redirect_map_err")
+int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx)
+{
+ return xdp_redirect_collect_stat(ctx);
+}
+
+/* Likely unloaded when prog starts */
+SEC("tracepoint/xdp/xdp_redirect")
+int trace_xdp_redirect(struct xdp_redirect_ctx *ctx)
+{
+ return xdp_redirect_collect_stat(ctx);
+}
+
+/* Likely unloaded when prog starts */
+SEC("tracepoint/xdp/xdp_redirect_map")
+int trace_xdp_redirect_map(struct xdp_redirect_ctx *ctx)
+{
+ return xdp_redirect_collect_stat(ctx);
+}
+
+/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format
+ * Code in: kernel/include/trace/events/xdp.h
+ */
+struct xdp_exception_ctx {
+ u64 __pad; // First 8 bytes are not accessible by bpf code
+ int prog_id; // offset:8; size:4; signed:1;
+ u32 act; // offset:12; size:4; signed:0;
+ int ifindex; // offset:16; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_exception")
+int trace_xdp_exception(struct xdp_exception_ctx *ctx)
+{
+ u64 *cnt;
+ u32 key;
+
+ key = ctx->act;
+ if (key > XDP_REDIRECT)
+ key = XDP_UNKNOWN;
+
+ cnt = bpf_map_lookup_elem(&exception_cnt, &key);
+ if (!cnt)
+ return 1;
+ *cnt += 1;
+
+ return 0;
+}
+
+/* Common stats data record shared with _user.c */
+struct datarec {
+ u64 processed;
+ u64 dropped;
+ u64 info;
+ u64 err;
+};
+#define MAX_CPUS 64
+
+struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct datarec),
+ .max_entries = MAX_CPUS,
+};
+
+struct bpf_map_def SEC("maps") cpumap_kthread_cnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct datarec),
+ .max_entries = 1,
+};
+
+/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
+ * Code in: kernel/include/trace/events/xdp.h
+ */
+struct cpumap_enqueue_ctx {
+ u64 __pad; // First 8 bytes are not accessible by bpf code
+ int map_id; // offset:8; size:4; signed:1;
+ u32 act; // offset:12; size:4; signed:0;
+ int cpu; // offset:16; size:4; signed:1;
+ unsigned int drops; // offset:20; size:4; signed:0;
+ unsigned int processed; // offset:24; size:4; signed:0;
+ int to_cpu; // offset:28; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_cpumap_enqueue")
+int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx)
+{
+ u32 to_cpu = ctx->to_cpu;
+ struct datarec *rec;
+
+ if (to_cpu >= MAX_CPUS)
+ return 1;
+
+ rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu);
+ if (!rec)
+ return 0;
+ rec->processed += ctx->processed;
+ rec->dropped += ctx->drops;
+
+ /* Record bulk events, then userspace can calc average bulk size */
+ if (ctx->processed > 0)
+ rec->info += 1;
+
+ return 0;
+}
+
+/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format
+ * Code in: kernel/include/trace/events/xdp.h
+ */
+struct cpumap_kthread_ctx {
+ u64 __pad; // First 8 bytes are not accessible by bpf code
+ int map_id; // offset:8; size:4; signed:1;
+ u32 act; // offset:12; size:4; signed:0;
+ int cpu; // offset:16; size:4; signed:1;
+ unsigned int drops; // offset:20; size:4; signed:0;
+ unsigned int processed; // offset:24; size:4; signed:0;
+ int sched; // offset:28; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_cpumap_kthread")
+int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
+{
+ struct datarec *rec;
+ u32 key = 0;
+
+ rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key);
+ if (!rec)
+ return 0;
+ rec->processed += ctx->processed;
+ rec->dropped += ctx->drops;
+
+ /* Count times kthread yielded CPU via schedule call */
+ if (ctx->sched)
+ rec->info++;
+
+ return 0;
+}
+
+struct bpf_map_def SEC("maps") devmap_xmit_cnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct datarec),
+ .max_entries = 1,
+};
+
+/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_devmap_xmit/format
+ * Code in: kernel/include/trace/events/xdp.h
+ */
+struct devmap_xmit_ctx {
+ u64 __pad; // First 8 bytes are not accessible by bpf code
+ int map_id; // offset:8; size:4; signed:1;
+ u32 act; // offset:12; size:4; signed:0;
+ u32 map_index; // offset:16; size:4; signed:0;
+ int drops; // offset:20; size:4; signed:1;
+ int sent; // offset:24; size:4; signed:1;
+ int from_ifindex; // offset:28; size:4; signed:1;
+ int to_ifindex; // offset:32; size:4; signed:1;
+ int err; // offset:36; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_devmap_xmit")
+int trace_xdp_devmap_xmit(struct devmap_xmit_ctx *ctx)
+{
+ struct datarec *rec;
+ u32 key = 0;
+
+ rec = bpf_map_lookup_elem(&devmap_xmit_cnt, &key);
+ if (!rec)
+ return 0;
+ rec->processed += ctx->sent;
+ rec->dropped += ctx->drops;
+
+ /* Record bulk events, then userspace can calc average bulk size */
+ rec->info += 1;
+
+ /* Record error cases, where no frame were sent */
+ if (ctx->err)
+ rec->err++;
+
+ /* Catch API error of drv ndo_xdp_xmit sent more than count */
+ if (ctx->drops < 0)
+ rec->err++;
+
+ return 1;
+}
diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c
new file mode 100644
index 000000000..dd558cbb2
--- /dev/null
+++ b/samples/bpf/xdp_monitor_user.c
@@ -0,0 +1,715 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
+ */
+static const char *__doc__=
+ "XDP monitor tool, based on tracepoints\n"
+;
+
+static const char *__doc_err_only__=
+ " NOTICE: Only tracking XDP redirect errors\n"
+ " Enable TX success stats via '--stats'\n"
+ " (which comes with a per packet processing overhead)\n"
+;
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+#include <ctype.h>
+#include <unistd.h>
+#include <locale.h>
+
+#include <sys/resource.h>
+#include <getopt.h>
+#include <net/if.h>
+#include <time.h>
+
+#include <bpf/bpf.h>
+#include "bpf_load.h"
+#include "bpf_util.h"
+
+static int verbose = 1;
+static bool debug = false;
+
+static const struct option long_options[] = {
+ {"help", no_argument, NULL, 'h' },
+ {"debug", no_argument, NULL, 'D' },
+ {"stats", no_argument, NULL, 'S' },
+ {"sec", required_argument, NULL, 's' },
+ {0, 0, NULL, 0 }
+};
+
+/* C standard specifies two constants, EXIT_SUCCESS(0) and EXIT_FAILURE(1) */
+#define EXIT_FAIL_MEM 5
+
+static void usage(char *argv[])
+{
+ int i;
+ printf("\nDOCUMENTATION:\n%s\n", __doc__);
+ printf("\n");
+ printf(" Usage: %s (options-see-below)\n",
+ argv[0]);
+ printf(" Listing options:\n");
+ for (i = 0; long_options[i].name != 0; i++) {
+ printf(" --%-15s", long_options[i].name);
+ if (long_options[i].flag != NULL)
+ printf(" flag (internal value:%d)",
+ *long_options[i].flag);
+ else
+ printf("short-option: -%c",
+ long_options[i].val);
+ printf("\n");
+ }
+ printf("\n");
+}
+
+#define NANOSEC_PER_SEC 1000000000 /* 10^9 */
+static __u64 gettime(void)
+{
+ struct timespec t;
+ int res;
+
+ res = clock_gettime(CLOCK_MONOTONIC, &t);
+ if (res < 0) {
+ fprintf(stderr, "Error with gettimeofday! (%i)\n", res);
+ exit(EXIT_FAILURE);
+ }
+ return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec;
+}
+
+enum {
+ REDIR_SUCCESS = 0,
+ REDIR_ERROR = 1,
+};
+#define REDIR_RES_MAX 2
+static const char *redir_names[REDIR_RES_MAX] = {
+ [REDIR_SUCCESS] = "Success",
+ [REDIR_ERROR] = "Error",
+};
+static const char *err2str(int err)
+{
+ if (err < REDIR_RES_MAX)
+ return redir_names[err];
+ return NULL;
+}
+/* enum xdp_action */
+#define XDP_UNKNOWN XDP_REDIRECT + 1
+#define XDP_ACTION_MAX (XDP_UNKNOWN + 1)
+static const char *xdp_action_names[XDP_ACTION_MAX] = {
+ [XDP_ABORTED] = "XDP_ABORTED",
+ [XDP_DROP] = "XDP_DROP",
+ [XDP_PASS] = "XDP_PASS",
+ [XDP_TX] = "XDP_TX",
+ [XDP_REDIRECT] = "XDP_REDIRECT",
+ [XDP_UNKNOWN] = "XDP_UNKNOWN",
+};
+static const char *action2str(int action)
+{
+ if (action < XDP_ACTION_MAX)
+ return xdp_action_names[action];
+ return NULL;
+}
+
+/* Common stats data record shared with _kern.c */
+struct datarec {
+ __u64 processed;
+ __u64 dropped;
+ __u64 info;
+ __u64 err;
+};
+#define MAX_CPUS 64
+
+/* Userspace structs for collection of stats from maps */
+struct record {
+ __u64 timestamp;
+ struct datarec total;
+ struct datarec *cpu;
+};
+struct u64rec {
+ __u64 processed;
+};
+struct record_u64 {
+ /* record for _kern side __u64 values */
+ __u64 timestamp;
+ struct u64rec total;
+ struct u64rec *cpu;
+};
+
+struct stats_record {
+ struct record_u64 xdp_redirect[REDIR_RES_MAX];
+ struct record_u64 xdp_exception[XDP_ACTION_MAX];
+ struct record xdp_cpumap_kthread;
+ struct record xdp_cpumap_enqueue[MAX_CPUS];
+ struct record xdp_devmap_xmit;
+};
+
+static bool map_collect_record(int fd, __u32 key, struct record *rec)
+{
+ /* For percpu maps, userspace gets a value per possible CPU */
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ struct datarec values[nr_cpus];
+ __u64 sum_processed = 0;
+ __u64 sum_dropped = 0;
+ __u64 sum_info = 0;
+ __u64 sum_err = 0;
+ int i;
+
+ if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
+ fprintf(stderr,
+ "ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
+ return false;
+ }
+ /* Get time as close as possible to reading map contents */
+ rec->timestamp = gettime();
+
+ /* Record and sum values from each CPU */
+ for (i = 0; i < nr_cpus; i++) {
+ rec->cpu[i].processed = values[i].processed;
+ sum_processed += values[i].processed;
+ rec->cpu[i].dropped = values[i].dropped;
+ sum_dropped += values[i].dropped;
+ rec->cpu[i].info = values[i].info;
+ sum_info += values[i].info;
+ rec->cpu[i].err = values[i].err;
+ sum_err += values[i].err;
+ }
+ rec->total.processed = sum_processed;
+ rec->total.dropped = sum_dropped;
+ rec->total.info = sum_info;
+ rec->total.err = sum_err;
+ return true;
+}
+
+static bool map_collect_record_u64(int fd, __u32 key, struct record_u64 *rec)
+{
+ /* For percpu maps, userspace gets a value per possible CPU */
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ struct u64rec values[nr_cpus];
+ __u64 sum_total = 0;
+ int i;
+
+ if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
+ fprintf(stderr,
+ "ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
+ return false;
+ }
+ /* Get time as close as possible to reading map contents */
+ rec->timestamp = gettime();
+
+ /* Record and sum values from each CPU */
+ for (i = 0; i < nr_cpus; i++) {
+ rec->cpu[i].processed = values[i].processed;
+ sum_total += values[i].processed;
+ }
+ rec->total.processed = sum_total;
+ return true;
+}
+
+static double calc_period(struct record *r, struct record *p)
+{
+ double period_ = 0;
+ __u64 period = 0;
+
+ period = r->timestamp - p->timestamp;
+ if (period > 0)
+ period_ = ((double) period / NANOSEC_PER_SEC);
+
+ return period_;
+}
+
+static double calc_period_u64(struct record_u64 *r, struct record_u64 *p)
+{
+ double period_ = 0;
+ __u64 period = 0;
+
+ period = r->timestamp - p->timestamp;
+ if (period > 0)
+ period_ = ((double) period / NANOSEC_PER_SEC);
+
+ return period_;
+}
+
+static double calc_pps(struct datarec *r, struct datarec *p, double period)
+{
+ __u64 packets = 0;
+ double pps = 0;
+
+ if (period > 0) {
+ packets = r->processed - p->processed;
+ pps = packets / period;
+ }
+ return pps;
+}
+
+static double calc_pps_u64(struct u64rec *r, struct u64rec *p, double period)
+{
+ __u64 packets = 0;
+ double pps = 0;
+
+ if (period > 0) {
+ packets = r->processed - p->processed;
+ pps = packets / period;
+ }
+ return pps;
+}
+
+static double calc_drop(struct datarec *r, struct datarec *p, double period)
+{
+ __u64 packets = 0;
+ double pps = 0;
+
+ if (period > 0) {
+ packets = r->dropped - p->dropped;
+ pps = packets / period;
+ }
+ return pps;
+}
+
+static double calc_info(struct datarec *r, struct datarec *p, double period)
+{
+ __u64 packets = 0;
+ double pps = 0;
+
+ if (period > 0) {
+ packets = r->info - p->info;
+ pps = packets / period;
+ }
+ return pps;
+}
+
+static double calc_err(struct datarec *r, struct datarec *p, double period)
+{
+ __u64 packets = 0;
+ double pps = 0;
+
+ if (period > 0) {
+ packets = r->err - p->err;
+ pps = packets / period;
+ }
+ return pps;
+}
+
+static void stats_print(struct stats_record *stats_rec,
+ struct stats_record *stats_prev,
+ bool err_only)
+{
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ int rec_i = 0, i, to_cpu;
+ double t = 0, pps = 0;
+
+ /* Header */
+ printf("%-15s %-7s %-12s %-12s %-9s\n",
+ "XDP-event", "CPU:to", "pps", "drop-pps", "extra-info");
+
+ /* tracepoint: xdp:xdp_redirect_* */
+ if (err_only)
+ rec_i = REDIR_ERROR;
+
+ for (; rec_i < REDIR_RES_MAX; rec_i++) {
+ struct record_u64 *rec, *prev;
+ char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n";
+ char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n";
+
+ rec = &stats_rec->xdp_redirect[rec_i];
+ prev = &stats_prev->xdp_redirect[rec_i];
+ t = calc_period_u64(rec, prev);
+
+ for (i = 0; i < nr_cpus; i++) {
+ struct u64rec *r = &rec->cpu[i];
+ struct u64rec *p = &prev->cpu[i];
+
+ pps = calc_pps_u64(r, p, t);
+ if (pps > 0)
+ printf(fmt1, "XDP_REDIRECT", i,
+ rec_i ? 0.0: pps, rec_i ? pps : 0.0,
+ err2str(rec_i));
+ }
+ pps = calc_pps_u64(&rec->total, &prev->total, t);
+ printf(fmt2, "XDP_REDIRECT", "total",
+ rec_i ? 0.0: pps, rec_i ? pps : 0.0, err2str(rec_i));
+ }
+
+ /* tracepoint: xdp:xdp_exception */
+ for (rec_i = 0; rec_i < XDP_ACTION_MAX; rec_i++) {
+ struct record_u64 *rec, *prev;
+ char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n";
+ char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n";
+
+ rec = &stats_rec->xdp_exception[rec_i];
+ prev = &stats_prev->xdp_exception[rec_i];
+ t = calc_period_u64(rec, prev);
+
+ for (i = 0; i < nr_cpus; i++) {
+ struct u64rec *r = &rec->cpu[i];
+ struct u64rec *p = &prev->cpu[i];
+
+ pps = calc_pps_u64(r, p, t);
+ if (pps > 0)
+ printf(fmt1, "Exception", i,
+ 0.0, pps, action2str(rec_i));
+ }
+ pps = calc_pps_u64(&rec->total, &prev->total, t);
+ if (pps > 0)
+ printf(fmt2, "Exception", "total",
+ 0.0, pps, action2str(rec_i));
+ }
+
+ /* cpumap enqueue stats */
+ for (to_cpu = 0; to_cpu < MAX_CPUS; to_cpu++) {
+ char *fmt1 = "%-15s %3d:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n";
+ char *fmt2 = "%-15s %3s:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n";
+ struct record *rec, *prev;
+ char *info_str = "";
+ double drop, info;
+
+ rec = &stats_rec->xdp_cpumap_enqueue[to_cpu];
+ prev = &stats_prev->xdp_cpumap_enqueue[to_cpu];
+ t = calc_period(rec, prev);
+ for (i = 0; i < nr_cpus; i++) {
+ struct datarec *r = &rec->cpu[i];
+ struct datarec *p = &prev->cpu[i];
+
+ pps = calc_pps(r, p, t);
+ drop = calc_drop(r, p, t);
+ info = calc_info(r, p, t);
+ if (info > 0) {
+ info_str = "bulk-average";
+ info = pps / info; /* calc average bulk size */
+ }
+ if (pps > 0)
+ printf(fmt1, "cpumap-enqueue",
+ i, to_cpu, pps, drop, info, info_str);
+ }
+ pps = calc_pps(&rec->total, &prev->total, t);
+ if (pps > 0) {
+ drop = calc_drop(&rec->total, &prev->total, t);
+ info = calc_info(&rec->total, &prev->total, t);
+ if (info > 0) {
+ info_str = "bulk-average";
+ info = pps / info; /* calc average bulk size */
+ }
+ printf(fmt2, "cpumap-enqueue",
+ "sum", to_cpu, pps, drop, info, info_str);
+ }
+ }
+
+ /* cpumap kthread stats */
+ {
+ char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.0f %s\n";
+ char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.0f %s\n";
+ struct record *rec, *prev;
+ double drop, info;
+ char *i_str = "";
+
+ rec = &stats_rec->xdp_cpumap_kthread;
+ prev = &stats_prev->xdp_cpumap_kthread;
+ t = calc_period(rec, prev);
+ for (i = 0; i < nr_cpus; i++) {
+ struct datarec *r = &rec->cpu[i];
+ struct datarec *p = &prev->cpu[i];
+
+ pps = calc_pps(r, p, t);
+ drop = calc_drop(r, p, t);
+ info = calc_info(r, p, t);
+ if (info > 0)
+ i_str = "sched";
+ if (pps > 0 || drop > 0)
+ printf(fmt1, "cpumap-kthread",
+ i, pps, drop, info, i_str);
+ }
+ pps = calc_pps(&rec->total, &prev->total, t);
+ drop = calc_drop(&rec->total, &prev->total, t);
+ info = calc_info(&rec->total, &prev->total, t);
+ if (info > 0)
+ i_str = "sched-sum";
+ printf(fmt2, "cpumap-kthread", "total", pps, drop, info, i_str);
+ }
+
+ /* devmap ndo_xdp_xmit stats */
+ {
+ char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.2f %s %s\n";
+ char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.2f %s %s\n";
+ struct record *rec, *prev;
+ double drop, info, err;
+ char *i_str = "";
+ char *err_str = "";
+
+ rec = &stats_rec->xdp_devmap_xmit;
+ prev = &stats_prev->xdp_devmap_xmit;
+ t = calc_period(rec, prev);
+ for (i = 0; i < nr_cpus; i++) {
+ struct datarec *r = &rec->cpu[i];
+ struct datarec *p = &prev->cpu[i];
+
+ pps = calc_pps(r, p, t);
+ drop = calc_drop(r, p, t);
+ info = calc_info(r, p, t);
+ err = calc_err(r, p, t);
+ if (info > 0) {
+ i_str = "bulk-average";
+ info = (pps+drop) / info; /* calc avg bulk */
+ }
+ if (err > 0)
+ err_str = "drv-err";
+ if (pps > 0 || drop > 0)
+ printf(fmt1, "devmap-xmit",
+ i, pps, drop, info, i_str, err_str);
+ }
+ pps = calc_pps(&rec->total, &prev->total, t);
+ drop = calc_drop(&rec->total, &prev->total, t);
+ info = calc_info(&rec->total, &prev->total, t);
+ err = calc_err(&rec->total, &prev->total, t);
+ if (info > 0) {
+ i_str = "bulk-average";
+ info = (pps+drop) / info; /* calc avg bulk */
+ }
+ if (err > 0)
+ err_str = "drv-err";
+ printf(fmt2, "devmap-xmit", "total", pps, drop,
+ info, i_str, err_str);
+ }
+
+ printf("\n");
+}
+
+static bool stats_collect(struct stats_record *rec)
+{
+ int fd;
+ int i;
+
+ /* TODO: Detect if someone unloaded the perf event_fd's, as
+ * this can happen by someone running perf-record -e
+ */
+
+ fd = map_data[0].fd; /* map0: redirect_err_cnt */
+ for (i = 0; i < REDIR_RES_MAX; i++)
+ map_collect_record_u64(fd, i, &rec->xdp_redirect[i]);
+
+ fd = map_data[1].fd; /* map1: exception_cnt */
+ for (i = 0; i < XDP_ACTION_MAX; i++) {
+ map_collect_record_u64(fd, i, &rec->xdp_exception[i]);
+ }
+
+ fd = map_data[2].fd; /* map2: cpumap_enqueue_cnt */
+ for (i = 0; i < MAX_CPUS; i++)
+ map_collect_record(fd, i, &rec->xdp_cpumap_enqueue[i]);
+
+ fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */
+ map_collect_record(fd, 0, &rec->xdp_cpumap_kthread);
+
+ fd = map_data[4].fd; /* map4: devmap_xmit_cnt */
+ map_collect_record(fd, 0, &rec->xdp_devmap_xmit);
+
+ return true;
+}
+
+static void *alloc_rec_per_cpu(int record_size)
+{
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ void *array;
+ size_t size;
+
+ size = record_size * nr_cpus;
+ array = malloc(size);
+ memset(array, 0, size);
+ if (!array) {
+ fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus);
+ exit(EXIT_FAIL_MEM);
+ }
+ return array;
+}
+
+static struct stats_record *alloc_stats_record(void)
+{
+ struct stats_record *rec;
+ int rec_sz;
+ int i;
+
+ /* Alloc main stats_record structure */
+ rec = malloc(sizeof(*rec));
+ memset(rec, 0, sizeof(*rec));
+ if (!rec) {
+ fprintf(stderr, "Mem alloc error\n");
+ exit(EXIT_FAIL_MEM);
+ }
+
+ /* Alloc stats stored per CPU for each record */
+ rec_sz = sizeof(struct u64rec);
+ for (i = 0; i < REDIR_RES_MAX; i++)
+ rec->xdp_redirect[i].cpu = alloc_rec_per_cpu(rec_sz);
+
+ for (i = 0; i < XDP_ACTION_MAX; i++)
+ rec->xdp_exception[i].cpu = alloc_rec_per_cpu(rec_sz);
+
+ rec_sz = sizeof(struct datarec);
+ rec->xdp_cpumap_kthread.cpu = alloc_rec_per_cpu(rec_sz);
+ rec->xdp_devmap_xmit.cpu = alloc_rec_per_cpu(rec_sz);
+
+ for (i = 0; i < MAX_CPUS; i++)
+ rec->xdp_cpumap_enqueue[i].cpu = alloc_rec_per_cpu(rec_sz);
+
+ return rec;
+}
+
+static void free_stats_record(struct stats_record *r)
+{
+ int i;
+
+ for (i = 0; i < REDIR_RES_MAX; i++)
+ free(r->xdp_redirect[i].cpu);
+
+ for (i = 0; i < XDP_ACTION_MAX; i++)
+ free(r->xdp_exception[i].cpu);
+
+ free(r->xdp_cpumap_kthread.cpu);
+ free(r->xdp_devmap_xmit.cpu);
+
+ for (i = 0; i < MAX_CPUS; i++)
+ free(r->xdp_cpumap_enqueue[i].cpu);
+
+ free(r);
+}
+
+/* Pointer swap trick */
+static inline void swap(struct stats_record **a, struct stats_record **b)
+{
+ struct stats_record *tmp;
+
+ tmp = *a;
+ *a = *b;
+ *b = tmp;
+}
+
+static void stats_poll(int interval, bool err_only)
+{
+ struct stats_record *rec, *prev;
+
+ rec = alloc_stats_record();
+ prev = alloc_stats_record();
+ stats_collect(rec);
+
+ if (err_only)
+ printf("\n%s\n", __doc_err_only__);
+
+ /* Trick to pretty printf with thousands separators use %' */
+ setlocale(LC_NUMERIC, "en_US");
+
+ /* Header */
+ if (verbose)
+ printf("\n%s", __doc__);
+
+ /* TODO Need more advanced stats on error types */
+ if (verbose) {
+ printf(" - Stats map0: %s\n", map_data[0].name);
+ printf(" - Stats map1: %s\n", map_data[1].name);
+ printf("\n");
+ }
+ fflush(stdout);
+
+ while (1) {
+ swap(&prev, &rec);
+ stats_collect(rec);
+ stats_print(rec, prev, err_only);
+ fflush(stdout);
+ sleep(interval);
+ }
+
+ free_stats_record(rec);
+ free_stats_record(prev);
+}
+
+static void print_bpf_prog_info(void)
+{
+ int i;
+
+ /* Prog info */
+ printf("Loaded BPF prog have %d bpf program(s)\n", prog_cnt);
+ for (i = 0; i < prog_cnt; i++) {
+ printf(" - prog_fd[%d] = fd(%d)\n", i, prog_fd[i]);
+ }
+
+ /* Maps info */
+ printf("Loaded BPF prog have %d map(s)\n", map_data_count);
+ for (i = 0; i < map_data_count; i++) {
+ char *name = map_data[i].name;
+ int fd = map_data[i].fd;
+
+ printf(" - map_data[%d] = fd(%d) name:%s\n", i, fd, name);
+ }
+
+ /* Event info */
+ printf("Searching for (max:%d) event file descriptor(s)\n", prog_cnt);
+ for (i = 0; i < prog_cnt; i++) {
+ if (event_fd[i] != -1)
+ printf(" - event_fd[%d] = fd(%d)\n", i, event_fd[i]);
+ }
+}
+
+int main(int argc, char **argv)
+{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ int longindex = 0, opt;
+ int ret = EXIT_SUCCESS;
+ char bpf_obj_file[256];
+
+ /* Default settings: */
+ bool errors_only = true;
+ int interval = 2;
+
+ snprintf(bpf_obj_file, sizeof(bpf_obj_file), "%s_kern.o", argv[0]);
+
+ /* Parse commands line args */
+ while ((opt = getopt_long(argc, argv, "hDSs:",
+ long_options, &longindex)) != -1) {
+ switch (opt) {
+ case 'D':
+ debug = true;
+ break;
+ case 'S':
+ errors_only = false;
+ break;
+ case 's':
+ interval = atoi(optarg);
+ break;
+ case 'h':
+ default:
+ usage(argv);
+ return EXIT_FAILURE;
+ }
+ }
+
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK)");
+ return EXIT_FAILURE;
+ }
+
+ if (load_bpf_file(bpf_obj_file)) {
+ printf("ERROR - bpf_log_buf: %s", bpf_log_buf);
+ return EXIT_FAILURE;
+ }
+ if (!prog_fd[0]) {
+ printf("ERROR - load_bpf_file: %s\n", strerror(errno));
+ return EXIT_FAILURE;
+ }
+
+ if (debug) {
+ print_bpf_prog_info();
+ }
+
+ /* Unload/stop tracepoint event by closing fd's */
+ if (errors_only) {
+ /* The prog_fd[i] and event_fd[i] depend on the
+ * order the functions was defined in _kern.c
+ */
+ close(event_fd[2]); /* tracepoint/xdp/xdp_redirect */
+ close(prog_fd[2]); /* func: trace_xdp_redirect */
+ close(event_fd[3]); /* tracepoint/xdp/xdp_redirect_map */
+ close(prog_fd[3]); /* func: trace_xdp_redirect_map */
+ }
+
+ stats_poll(interval, errors_only);
+
+ return ret;
+}
diff --git a/samples/bpf/xdp_redirect_cpu_kern.c b/samples/bpf/xdp_redirect_cpu_kern.c
new file mode 100644
index 000000000..a306d1c75
--- /dev/null
+++ b/samples/bpf/xdp_redirect_cpu_kern.c
@@ -0,0 +1,721 @@
+/* XDP redirect to CPUs via cpumap (BPF_MAP_TYPE_CPUMAP)
+ *
+ * GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
+ */
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/if_vlan.h>
+#include <uapi/linux/ip.h>
+#include <uapi/linux/ipv6.h>
+#include <uapi/linux/in.h>
+#include <uapi/linux/tcp.h>
+#include <uapi/linux/udp.h>
+
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+#include "hash_func01.h"
+
+#define MAX_CPUS 64 /* WARNING - sync with _user.c */
+
+/* Special map type that can XDP_REDIRECT frames to another CPU */
+struct bpf_map_def SEC("maps") cpu_map = {
+ .type = BPF_MAP_TYPE_CPUMAP,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u32),
+ .max_entries = MAX_CPUS,
+};
+
+/* Common stats data record to keep userspace more simple */
+struct datarec {
+ __u64 processed;
+ __u64 dropped;
+ __u64 issue;
+};
+
+/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success
+ * feedback. Redirect TX errors can be caught via a tracepoint.
+ */
+struct bpf_map_def SEC("maps") rx_cnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct datarec),
+ .max_entries = 1,
+};
+
+/* Used by trace point */
+struct bpf_map_def SEC("maps") redirect_err_cnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct datarec),
+ .max_entries = 2,
+ /* TODO: have entries for all possible errno's */
+};
+
+/* Used by trace point */
+struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct datarec),
+ .max_entries = MAX_CPUS,
+};
+
+/* Used by trace point */
+struct bpf_map_def SEC("maps") cpumap_kthread_cnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct datarec),
+ .max_entries = 1,
+};
+
+/* Set of maps controlling available CPU, and for iterating through
+ * selectable redirect CPUs.
+ */
+struct bpf_map_def SEC("maps") cpus_available = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u32),
+ .max_entries = MAX_CPUS,
+};
+struct bpf_map_def SEC("maps") cpus_count = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u32),
+ .max_entries = 1,
+};
+struct bpf_map_def SEC("maps") cpus_iterator = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u32),
+ .max_entries = 1,
+};
+
+/* Used by trace point */
+struct bpf_map_def SEC("maps") exception_cnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct datarec),
+ .max_entries = 1,
+};
+
+/* Helper parse functions */
+
+/* Parse Ethernet layer 2, extract network layer 3 offset and protocol
+ *
+ * Returns false on error and non-supported ether-type
+ */
+struct vlan_hdr {
+ __be16 h_vlan_TCI;
+ __be16 h_vlan_encapsulated_proto;
+};
+
+static __always_inline
+bool parse_eth(struct ethhdr *eth, void *data_end,
+ u16 *eth_proto, u64 *l3_offset)
+{
+ u16 eth_type;
+ u64 offset;
+
+ offset = sizeof(*eth);
+ if ((void *)eth + offset > data_end)
+ return false;
+
+ eth_type = eth->h_proto;
+
+ /* Skip non 802.3 Ethertypes */
+ if (unlikely(ntohs(eth_type) < ETH_P_802_3_MIN))
+ return false;
+
+ /* Handle VLAN tagged packet */
+ if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) {
+ struct vlan_hdr *vlan_hdr;
+
+ vlan_hdr = (void *)eth + offset;
+ offset += sizeof(*vlan_hdr);
+ if ((void *)eth + offset > data_end)
+ return false;
+ eth_type = vlan_hdr->h_vlan_encapsulated_proto;
+ }
+ /* Handle double VLAN tagged packet */
+ if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) {
+ struct vlan_hdr *vlan_hdr;
+
+ vlan_hdr = (void *)eth + offset;
+ offset += sizeof(*vlan_hdr);
+ if ((void *)eth + offset > data_end)
+ return false;
+ eth_type = vlan_hdr->h_vlan_encapsulated_proto;
+ }
+
+ *eth_proto = ntohs(eth_type);
+ *l3_offset = offset;
+ return true;
+}
+
+static __always_inline
+u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct iphdr *iph = data + nh_off;
+ struct udphdr *udph;
+ u16 dport;
+
+ if (iph + 1 > data_end)
+ return 0;
+ if (!(iph->protocol == IPPROTO_UDP))
+ return 0;
+
+ udph = (void *)(iph + 1);
+ if (udph + 1 > data_end)
+ return 0;
+
+ dport = ntohs(udph->dest);
+ return dport;
+}
+
+static __always_inline
+int get_proto_ipv4(struct xdp_md *ctx, u64 nh_off)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct iphdr *iph = data + nh_off;
+
+ if (iph + 1 > data_end)
+ return 0;
+ return iph->protocol;
+}
+
+static __always_inline
+int get_proto_ipv6(struct xdp_md *ctx, u64 nh_off)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct ipv6hdr *ip6h = data + nh_off;
+
+ if (ip6h + 1 > data_end)
+ return 0;
+ return ip6h->nexthdr;
+}
+
+SEC("xdp_cpu_map0")
+int xdp_prognum0_no_touch(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct datarec *rec;
+ u32 *cpu_selected;
+ u32 cpu_dest;
+ u32 key = 0;
+
+ /* Only use first entry in cpus_available */
+ cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
+ if (!cpu_selected)
+ return XDP_ABORTED;
+ cpu_dest = *cpu_selected;
+
+ /* Count RX packet in map */
+ rec = bpf_map_lookup_elem(&rx_cnt, &key);
+ if (!rec)
+ return XDP_ABORTED;
+ rec->processed++;
+
+ if (cpu_dest >= MAX_CPUS) {
+ rec->issue++;
+ return XDP_ABORTED;
+ }
+
+ return bpf_redirect_map(&cpu_map, cpu_dest, 0);
+}
+
+SEC("xdp_cpu_map1_touch_data")
+int xdp_prognum1_touch_data(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct ethhdr *eth = data;
+ struct datarec *rec;
+ u32 *cpu_selected;
+ u32 cpu_dest;
+ u16 eth_type;
+ u32 key = 0;
+
+ /* Only use first entry in cpus_available */
+ cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
+ if (!cpu_selected)
+ return XDP_ABORTED;
+ cpu_dest = *cpu_selected;
+
+ /* Validate packet length is minimum Eth header size */
+ if (eth + 1 > data_end)
+ return XDP_ABORTED;
+
+ /* Count RX packet in map */
+ rec = bpf_map_lookup_elem(&rx_cnt, &key);
+ if (!rec)
+ return XDP_ABORTED;
+ rec->processed++;
+
+ /* Read packet data, and use it (drop non 802.3 Ethertypes) */
+ eth_type = eth->h_proto;
+ if (ntohs(eth_type) < ETH_P_802_3_MIN) {
+ rec->dropped++;
+ return XDP_DROP;
+ }
+
+ if (cpu_dest >= MAX_CPUS) {
+ rec->issue++;
+ return XDP_ABORTED;
+ }
+
+ return bpf_redirect_map(&cpu_map, cpu_dest, 0);
+}
+
+SEC("xdp_cpu_map2_round_robin")
+int xdp_prognum2_round_robin(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct ethhdr *eth = data;
+ struct datarec *rec;
+ u32 cpu_dest;
+ u32 *cpu_lookup;
+ u32 key0 = 0;
+
+ u32 *cpu_selected;
+ u32 *cpu_iterator;
+ u32 *cpu_max;
+ u32 cpu_idx;
+
+ cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
+ if (!cpu_max)
+ return XDP_ABORTED;
+
+ cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key0);
+ if (!cpu_iterator)
+ return XDP_ABORTED;
+ cpu_idx = *cpu_iterator;
+
+ *cpu_iterator += 1;
+ if (*cpu_iterator == *cpu_max)
+ *cpu_iterator = 0;
+
+ cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
+ if (!cpu_selected)
+ return XDP_ABORTED;
+ cpu_dest = *cpu_selected;
+
+ /* Count RX packet in map */
+ rec = bpf_map_lookup_elem(&rx_cnt, &key0);
+ if (!rec)
+ return XDP_ABORTED;
+ rec->processed++;
+
+ if (cpu_dest >= MAX_CPUS) {
+ rec->issue++;
+ return XDP_ABORTED;
+ }
+
+ return bpf_redirect_map(&cpu_map, cpu_dest, 0);
+}
+
+SEC("xdp_cpu_map3_proto_separate")
+int xdp_prognum3_proto_separate(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct ethhdr *eth = data;
+ u8 ip_proto = IPPROTO_UDP;
+ struct datarec *rec;
+ u16 eth_proto = 0;
+ u64 l3_offset = 0;
+ u32 cpu_dest = 0;
+ u32 cpu_idx = 0;
+ u32 *cpu_lookup;
+ u32 key = 0;
+
+ /* Count RX packet in map */
+ rec = bpf_map_lookup_elem(&rx_cnt, &key);
+ if (!rec)
+ return XDP_ABORTED;
+ rec->processed++;
+
+ if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
+ return XDP_PASS; /* Just skip */
+
+ /* Extract L4 protocol */
+ switch (eth_proto) {
+ case ETH_P_IP:
+ ip_proto = get_proto_ipv4(ctx, l3_offset);
+ break;
+ case ETH_P_IPV6:
+ ip_proto = get_proto_ipv6(ctx, l3_offset);
+ break;
+ case ETH_P_ARP:
+ cpu_idx = 0; /* ARP packet handled on separate CPU */
+ break;
+ default:
+ cpu_idx = 0;
+ }
+
+ /* Choose CPU based on L4 protocol */
+ switch (ip_proto) {
+ case IPPROTO_ICMP:
+ case IPPROTO_ICMPV6:
+ cpu_idx = 2;
+ break;
+ case IPPROTO_TCP:
+ cpu_idx = 0;
+ break;
+ case IPPROTO_UDP:
+ cpu_idx = 1;
+ break;
+ default:
+ cpu_idx = 0;
+ }
+
+ cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
+ if (!cpu_lookup)
+ return XDP_ABORTED;
+ cpu_dest = *cpu_lookup;
+
+ if (cpu_dest >= MAX_CPUS) {
+ rec->issue++;
+ return XDP_ABORTED;
+ }
+
+ return bpf_redirect_map(&cpu_map, cpu_dest, 0);
+}
+
+SEC("xdp_cpu_map4_ddos_filter_pktgen")
+int xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct ethhdr *eth = data;
+ u8 ip_proto = IPPROTO_UDP;
+ struct datarec *rec;
+ u16 eth_proto = 0;
+ u64 l3_offset = 0;
+ u32 cpu_dest = 0;
+ u32 cpu_idx = 0;
+ u16 dest_port;
+ u32 *cpu_lookup;
+ u32 key = 0;
+
+ /* Count RX packet in map */
+ rec = bpf_map_lookup_elem(&rx_cnt, &key);
+ if (!rec)
+ return XDP_ABORTED;
+ rec->processed++;
+
+ if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
+ return XDP_PASS; /* Just skip */
+
+ /* Extract L4 protocol */
+ switch (eth_proto) {
+ case ETH_P_IP:
+ ip_proto = get_proto_ipv4(ctx, l3_offset);
+ break;
+ case ETH_P_IPV6:
+ ip_proto = get_proto_ipv6(ctx, l3_offset);
+ break;
+ case ETH_P_ARP:
+ cpu_idx = 0; /* ARP packet handled on separate CPU */
+ break;
+ default:
+ cpu_idx = 0;
+ }
+
+ /* Choose CPU based on L4 protocol */
+ switch (ip_proto) {
+ case IPPROTO_ICMP:
+ case IPPROTO_ICMPV6:
+ cpu_idx = 2;
+ break;
+ case IPPROTO_TCP:
+ cpu_idx = 0;
+ break;
+ case IPPROTO_UDP:
+ cpu_idx = 1;
+ /* DDoS filter UDP port 9 (pktgen) */
+ dest_port = get_dest_port_ipv4_udp(ctx, l3_offset);
+ if (dest_port == 9) {
+ if (rec)
+ rec->dropped++;
+ return XDP_DROP;
+ }
+ break;
+ default:
+ cpu_idx = 0;
+ }
+
+ cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
+ if (!cpu_lookup)
+ return XDP_ABORTED;
+ cpu_dest = *cpu_lookup;
+
+ if (cpu_dest >= MAX_CPUS) {
+ rec->issue++;
+ return XDP_ABORTED;
+ }
+
+ return bpf_redirect_map(&cpu_map, cpu_dest, 0);
+}
+
+/* Hashing initval */
+#define INITVAL 15485863
+
+static __always_inline
+u32 get_ipv4_hash_ip_pair(struct xdp_md *ctx, u64 nh_off)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct iphdr *iph = data + nh_off;
+ u32 cpu_hash;
+
+ if (iph + 1 > data_end)
+ return 0;
+
+ cpu_hash = iph->saddr + iph->daddr;
+ cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + iph->protocol);
+
+ return cpu_hash;
+}
+
+static __always_inline
+u32 get_ipv6_hash_ip_pair(struct xdp_md *ctx, u64 nh_off)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct ipv6hdr *ip6h = data + nh_off;
+ u32 cpu_hash;
+
+ if (ip6h + 1 > data_end)
+ return 0;
+
+ cpu_hash = ip6h->saddr.s6_addr32[0] + ip6h->daddr.s6_addr32[0];
+ cpu_hash += ip6h->saddr.s6_addr32[1] + ip6h->daddr.s6_addr32[1];
+ cpu_hash += ip6h->saddr.s6_addr32[2] + ip6h->daddr.s6_addr32[2];
+ cpu_hash += ip6h->saddr.s6_addr32[3] + ip6h->daddr.s6_addr32[3];
+ cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + ip6h->nexthdr);
+
+ return cpu_hash;
+}
+
+/* Load-Balance traffic based on hashing IP-addrs + L4-proto. The
+ * hashing scheme is symmetric, meaning swapping IP src/dest still hit
+ * same CPU.
+ */
+SEC("xdp_cpu_map5_lb_hash_ip_pairs")
+int xdp_prognum5_lb_hash_ip_pairs(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct ethhdr *eth = data;
+ u8 ip_proto = IPPROTO_UDP;
+ struct datarec *rec;
+ u16 eth_proto = 0;
+ u64 l3_offset = 0;
+ u32 cpu_dest = 0;
+ u32 cpu_idx = 0;
+ u32 *cpu_lookup;
+ u32 *cpu_max;
+ u32 cpu_hash;
+ u32 key = 0;
+
+ /* Count RX packet in map */
+ rec = bpf_map_lookup_elem(&rx_cnt, &key);
+ if (!rec)
+ return XDP_ABORTED;
+ rec->processed++;
+
+ cpu_max = bpf_map_lookup_elem(&cpus_count, &key);
+ if (!cpu_max)
+ return XDP_ABORTED;
+
+ if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
+ return XDP_PASS; /* Just skip */
+
+ /* Hash for IPv4 and IPv6 */
+ switch (eth_proto) {
+ case ETH_P_IP:
+ cpu_hash = get_ipv4_hash_ip_pair(ctx, l3_offset);
+ break;
+ case ETH_P_IPV6:
+ cpu_hash = get_ipv6_hash_ip_pair(ctx, l3_offset);
+ break;
+ case ETH_P_ARP: /* ARP packet handled on CPU idx 0 */
+ default:
+ cpu_hash = 0;
+ }
+
+ /* Choose CPU based on hash */
+ cpu_idx = cpu_hash % *cpu_max;
+
+ cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
+ if (!cpu_lookup)
+ return XDP_ABORTED;
+ cpu_dest = *cpu_lookup;
+
+ if (cpu_dest >= MAX_CPUS) {
+ rec->issue++;
+ return XDP_ABORTED;
+ }
+
+ return bpf_redirect_map(&cpu_map, cpu_dest, 0);
+}
+
+char _license[] SEC("license") = "GPL";
+
+/*** Trace point code ***/
+
+/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
+ * Code in: kernel/include/trace/events/xdp.h
+ */
+struct xdp_redirect_ctx {
+ u64 __pad; // First 8 bytes are not accessible by bpf code
+ int prog_id; // offset:8; size:4; signed:1;
+ u32 act; // offset:12 size:4; signed:0;
+ int ifindex; // offset:16 size:4; signed:1;
+ int err; // offset:20 size:4; signed:1;
+ int to_ifindex; // offset:24 size:4; signed:1;
+ u32 map_id; // offset:28 size:4; signed:0;
+ int map_index; // offset:32 size:4; signed:1;
+}; // offset:36
+
+enum {
+ XDP_REDIRECT_SUCCESS = 0,
+ XDP_REDIRECT_ERROR = 1
+};
+
+static __always_inline
+int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx)
+{
+ u32 key = XDP_REDIRECT_ERROR;
+ struct datarec *rec;
+ int err = ctx->err;
+
+ if (!err)
+ key = XDP_REDIRECT_SUCCESS;
+
+ rec = bpf_map_lookup_elem(&redirect_err_cnt, &key);
+ if (!rec)
+ return 0;
+ rec->dropped += 1;
+
+ return 0; /* Indicate event was filtered (no further processing)*/
+ /*
+ * Returning 1 here would allow e.g. a perf-record tracepoint
+ * to see and record these events, but it doesn't work well
+ * in-practice as stopping perf-record also unload this
+ * bpf_prog. Plus, there is additional overhead of doing so.
+ */
+}
+
+SEC("tracepoint/xdp/xdp_redirect_err")
+int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx)
+{
+ return xdp_redirect_collect_stat(ctx);
+}
+
+SEC("tracepoint/xdp/xdp_redirect_map_err")
+int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx)
+{
+ return xdp_redirect_collect_stat(ctx);
+}
+
+/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format
+ * Code in: kernel/include/trace/events/xdp.h
+ */
+struct xdp_exception_ctx {
+ u64 __pad; // First 8 bytes are not accessible by bpf code
+ int prog_id; // offset:8; size:4; signed:1;
+ u32 act; // offset:12; size:4; signed:0;
+ int ifindex; // offset:16; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_exception")
+int trace_xdp_exception(struct xdp_exception_ctx *ctx)
+{
+ struct datarec *rec;
+ u32 key = 0;
+
+ rec = bpf_map_lookup_elem(&exception_cnt, &key);
+ if (!rec)
+ return 1;
+ rec->dropped += 1;
+
+ return 0;
+}
+
+/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
+ * Code in: kernel/include/trace/events/xdp.h
+ */
+struct cpumap_enqueue_ctx {
+ u64 __pad; // First 8 bytes are not accessible by bpf code
+ int map_id; // offset:8; size:4; signed:1;
+ u32 act; // offset:12; size:4; signed:0;
+ int cpu; // offset:16; size:4; signed:1;
+ unsigned int drops; // offset:20; size:4; signed:0;
+ unsigned int processed; // offset:24; size:4; signed:0;
+ int to_cpu; // offset:28; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_cpumap_enqueue")
+int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx)
+{
+ u32 to_cpu = ctx->to_cpu;
+ struct datarec *rec;
+
+ if (to_cpu >= MAX_CPUS)
+ return 1;
+
+ rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu);
+ if (!rec)
+ return 0;
+ rec->processed += ctx->processed;
+ rec->dropped += ctx->drops;
+
+ /* Record bulk events, then userspace can calc average bulk size */
+ if (ctx->processed > 0)
+ rec->issue += 1;
+
+ /* Inception: It's possible to detect overload situations, via
+ * this tracepoint. This can be used for creating a feedback
+ * loop to XDP, which can take appropriate actions to mitigate
+ * this overload situation.
+ */
+ return 0;
+}
+
+/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format
+ * Code in: kernel/include/trace/events/xdp.h
+ */
+struct cpumap_kthread_ctx {
+ u64 __pad; // First 8 bytes are not accessible by bpf code
+ int map_id; // offset:8; size:4; signed:1;
+ u32 act; // offset:12; size:4; signed:0;
+ int cpu; // offset:16; size:4; signed:1;
+ unsigned int drops; // offset:20; size:4; signed:0;
+ unsigned int processed; // offset:24; size:4; signed:0;
+ int sched; // offset:28; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_cpumap_kthread")
+int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
+{
+ struct datarec *rec;
+ u32 key = 0;
+
+ rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key);
+ if (!rec)
+ return 0;
+ rec->processed += ctx->processed;
+ rec->dropped += ctx->drops;
+
+ /* Count times kthread yielded CPU via schedule call */
+ if (ctx->sched)
+ rec->issue++;
+
+ return 0;
+}
diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c
new file mode 100644
index 000000000..2d23054aa
--- /dev/null
+++ b/samples/bpf/xdp_redirect_cpu_user.c
@@ -0,0 +1,698 @@
+/* GPLv2 Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
+ */
+static const char *__doc__ =
+ " XDP redirect with a CPU-map type \"BPF_MAP_TYPE_CPUMAP\"";
+
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <locale.h>
+#include <sys/resource.h>
+#include <getopt.h>
+#include <net/if.h>
+#include <time.h>
+
+#include <arpa/inet.h>
+#include <linux/if_link.h>
+
+#define MAX_CPUS 64 /* WARNING - sync with _kern.c */
+
+/* How many xdp_progs are defined in _kern.c */
+#define MAX_PROG 6
+
+/* Wanted to get rid of bpf_load.h and fake-"libbpf.h" (and instead
+ * use bpf/libbpf.h), but cannot as (currently) needed for XDP
+ * attaching to a device via bpf_set_link_xdp_fd()
+ */
+#include <bpf/bpf.h>
+#include "bpf_load.h"
+
+#include "bpf_util.h"
+
+static int ifindex = -1;
+static char ifname_buf[IF_NAMESIZE];
+static char *ifname;
+
+static __u32 xdp_flags;
+
+/* Exit return codes */
+#define EXIT_OK 0
+#define EXIT_FAIL 1
+#define EXIT_FAIL_OPTION 2
+#define EXIT_FAIL_XDP 3
+#define EXIT_FAIL_BPF 4
+#define EXIT_FAIL_MEM 5
+
+static const struct option long_options[] = {
+ {"help", no_argument, NULL, 'h' },
+ {"dev", required_argument, NULL, 'd' },
+ {"skb-mode", no_argument, NULL, 'S' },
+ {"debug", no_argument, NULL, 'D' },
+ {"sec", required_argument, NULL, 's' },
+ {"prognum", required_argument, NULL, 'p' },
+ {"qsize", required_argument, NULL, 'q' },
+ {"cpu", required_argument, NULL, 'c' },
+ {"stress-mode", no_argument, NULL, 'x' },
+ {"no-separators", no_argument, NULL, 'z' },
+ {0, 0, NULL, 0 }
+};
+
+static void int_exit(int sig)
+{
+ fprintf(stderr,
+ "Interrupted: Removing XDP program on ifindex:%d device:%s\n",
+ ifindex, ifname);
+ if (ifindex > -1)
+ bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
+ exit(EXIT_OK);
+}
+
+static void usage(char *argv[])
+{
+ int i;
+
+ printf("\nDOCUMENTATION:\n%s\n", __doc__);
+ printf("\n");
+ printf(" Usage: %s (options-see-below)\n", argv[0]);
+ printf(" Listing options:\n");
+ for (i = 0; long_options[i].name != 0; i++) {
+ printf(" --%-12s", long_options[i].name);
+ if (long_options[i].flag != NULL)
+ printf(" flag (internal value:%d)",
+ *long_options[i].flag);
+ else
+ printf(" short-option: -%c",
+ long_options[i].val);
+ printf("\n");
+ }
+ printf("\n");
+}
+
+/* gettime returns the current time of day in nanoseconds.
+ * Cost: clock_gettime (ns) => 26ns (CLOCK_MONOTONIC)
+ * clock_gettime (ns) => 9ns (CLOCK_MONOTONIC_COARSE)
+ */
+#define NANOSEC_PER_SEC 1000000000 /* 10^9 */
+static __u64 gettime(void)
+{
+ struct timespec t;
+ int res;
+
+ res = clock_gettime(CLOCK_MONOTONIC, &t);
+ if (res < 0) {
+ fprintf(stderr, "Error with gettimeofday! (%i)\n", res);
+ exit(EXIT_FAIL);
+ }
+ return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec;
+}
+
+/* Common stats data record shared with _kern.c */
+struct datarec {
+ __u64 processed;
+ __u64 dropped;
+ __u64 issue;
+};
+struct record {
+ __u64 timestamp;
+ struct datarec total;
+ struct datarec *cpu;
+};
+struct stats_record {
+ struct record rx_cnt;
+ struct record redir_err;
+ struct record kthread;
+ struct record exception;
+ struct record enq[MAX_CPUS];
+};
+
+static bool map_collect_percpu(int fd, __u32 key, struct record *rec)
+{
+ /* For percpu maps, userspace gets a value per possible CPU */
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ struct datarec values[nr_cpus];
+ __u64 sum_processed = 0;
+ __u64 sum_dropped = 0;
+ __u64 sum_issue = 0;
+ int i;
+
+ if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
+ fprintf(stderr,
+ "ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
+ return false;
+ }
+ /* Get time as close as possible to reading map contents */
+ rec->timestamp = gettime();
+
+ /* Record and sum values from each CPU */
+ for (i = 0; i < nr_cpus; i++) {
+ rec->cpu[i].processed = values[i].processed;
+ sum_processed += values[i].processed;
+ rec->cpu[i].dropped = values[i].dropped;
+ sum_dropped += values[i].dropped;
+ rec->cpu[i].issue = values[i].issue;
+ sum_issue += values[i].issue;
+ }
+ rec->total.processed = sum_processed;
+ rec->total.dropped = sum_dropped;
+ rec->total.issue = sum_issue;
+ return true;
+}
+
+static struct datarec *alloc_record_per_cpu(void)
+{
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ struct datarec *array;
+ size_t size;
+
+ size = sizeof(struct datarec) * nr_cpus;
+ array = malloc(size);
+ memset(array, 0, size);
+ if (!array) {
+ fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus);
+ exit(EXIT_FAIL_MEM);
+ }
+ return array;
+}
+
+static struct stats_record *alloc_stats_record(void)
+{
+ struct stats_record *rec;
+ int i;
+
+ rec = malloc(sizeof(*rec));
+ memset(rec, 0, sizeof(*rec));
+ if (!rec) {
+ fprintf(stderr, "Mem alloc error\n");
+ exit(EXIT_FAIL_MEM);
+ }
+ rec->rx_cnt.cpu = alloc_record_per_cpu();
+ rec->redir_err.cpu = alloc_record_per_cpu();
+ rec->kthread.cpu = alloc_record_per_cpu();
+ rec->exception.cpu = alloc_record_per_cpu();
+ for (i = 0; i < MAX_CPUS; i++)
+ rec->enq[i].cpu = alloc_record_per_cpu();
+
+ return rec;
+}
+
+static void free_stats_record(struct stats_record *r)
+{
+ int i;
+
+ for (i = 0; i < MAX_CPUS; i++)
+ free(r->enq[i].cpu);
+ free(r->exception.cpu);
+ free(r->kthread.cpu);
+ free(r->redir_err.cpu);
+ free(r->rx_cnt.cpu);
+ free(r);
+}
+
+static double calc_period(struct record *r, struct record *p)
+{
+ double period_ = 0;
+ __u64 period = 0;
+
+ period = r->timestamp - p->timestamp;
+ if (period > 0)
+ period_ = ((double) period / NANOSEC_PER_SEC);
+
+ return period_;
+}
+
+static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_)
+{
+ __u64 packets = 0;
+ __u64 pps = 0;
+
+ if (period_ > 0) {
+ packets = r->processed - p->processed;
+ pps = packets / period_;
+ }
+ return pps;
+}
+
+static __u64 calc_drop_pps(struct datarec *r, struct datarec *p, double period_)
+{
+ __u64 packets = 0;
+ __u64 pps = 0;
+
+ if (period_ > 0) {
+ packets = r->dropped - p->dropped;
+ pps = packets / period_;
+ }
+ return pps;
+}
+
+static __u64 calc_errs_pps(struct datarec *r,
+ struct datarec *p, double period_)
+{
+ __u64 packets = 0;
+ __u64 pps = 0;
+
+ if (period_ > 0) {
+ packets = r->issue - p->issue;
+ pps = packets / period_;
+ }
+ return pps;
+}
+
+static void stats_print(struct stats_record *stats_rec,
+ struct stats_record *stats_prev,
+ int prog_num)
+{
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ double pps = 0, drop = 0, err = 0;
+ struct record *rec, *prev;
+ int to_cpu;
+ double t;
+ int i;
+
+ /* Header */
+ printf("Running XDP/eBPF prog_num:%d\n", prog_num);
+ printf("%-15s %-7s %-14s %-11s %-9s\n",
+ "XDP-cpumap", "CPU:to", "pps", "drop-pps", "extra-info");
+
+ /* XDP rx_cnt */
+ {
+ char *fmt_rx = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f %s\n";
+ char *fm2_rx = "%-15s %-7s %'-14.0f %'-11.0f\n";
+ char *errstr = "";
+
+ rec = &stats_rec->rx_cnt;
+ prev = &stats_prev->rx_cnt;
+ t = calc_period(rec, prev);
+ for (i = 0; i < nr_cpus; i++) {
+ struct datarec *r = &rec->cpu[i];
+ struct datarec *p = &prev->cpu[i];
+
+ pps = calc_pps(r, p, t);
+ drop = calc_drop_pps(r, p, t);
+ err = calc_errs_pps(r, p, t);
+ if (err > 0)
+ errstr = "cpu-dest/err";
+ if (pps > 0)
+ printf(fmt_rx, "XDP-RX",
+ i, pps, drop, err, errstr);
+ }
+ pps = calc_pps(&rec->total, &prev->total, t);
+ drop = calc_drop_pps(&rec->total, &prev->total, t);
+ err = calc_errs_pps(&rec->total, &prev->total, t);
+ printf(fm2_rx, "XDP-RX", "total", pps, drop);
+ }
+
+ /* cpumap enqueue stats */
+ for (to_cpu = 0; to_cpu < MAX_CPUS; to_cpu++) {
+ char *fmt = "%-15s %3d:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n";
+ char *fm2 = "%-15s %3s:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n";
+ char *errstr = "";
+
+ rec = &stats_rec->enq[to_cpu];
+ prev = &stats_prev->enq[to_cpu];
+ t = calc_period(rec, prev);
+ for (i = 0; i < nr_cpus; i++) {
+ struct datarec *r = &rec->cpu[i];
+ struct datarec *p = &prev->cpu[i];
+
+ pps = calc_pps(r, p, t);
+ drop = calc_drop_pps(r, p, t);
+ err = calc_errs_pps(r, p, t);
+ if (err > 0) {
+ errstr = "bulk-average";
+ err = pps / err; /* calc average bulk size */
+ }
+ if (pps > 0)
+ printf(fmt, "cpumap-enqueue",
+ i, to_cpu, pps, drop, err, errstr);
+ }
+ pps = calc_pps(&rec->total, &prev->total, t);
+ if (pps > 0) {
+ drop = calc_drop_pps(&rec->total, &prev->total, t);
+ err = calc_errs_pps(&rec->total, &prev->total, t);
+ if (err > 0) {
+ errstr = "bulk-average";
+ err = pps / err; /* calc average bulk size */
+ }
+ printf(fm2, "cpumap-enqueue",
+ "sum", to_cpu, pps, drop, err, errstr);
+ }
+ }
+
+ /* cpumap kthread stats */
+ {
+ char *fmt_k = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f %s\n";
+ char *fm2_k = "%-15s %-7s %'-14.0f %'-11.0f %'-10.0f %s\n";
+ char *e_str = "";
+
+ rec = &stats_rec->kthread;
+ prev = &stats_prev->kthread;
+ t = calc_period(rec, prev);
+ for (i = 0; i < nr_cpus; i++) {
+ struct datarec *r = &rec->cpu[i];
+ struct datarec *p = &prev->cpu[i];
+
+ pps = calc_pps(r, p, t);
+ drop = calc_drop_pps(r, p, t);
+ err = calc_errs_pps(r, p, t);
+ if (err > 0)
+ e_str = "sched";
+ if (pps > 0)
+ printf(fmt_k, "cpumap_kthread",
+ i, pps, drop, err, e_str);
+ }
+ pps = calc_pps(&rec->total, &prev->total, t);
+ drop = calc_drop_pps(&rec->total, &prev->total, t);
+ err = calc_errs_pps(&rec->total, &prev->total, t);
+ if (err > 0)
+ e_str = "sched-sum";
+ printf(fm2_k, "cpumap_kthread", "total", pps, drop, err, e_str);
+ }
+
+ /* XDP redirect err tracepoints (very unlikely) */
+ {
+ char *fmt_err = "%-15s %-7d %'-14.0f %'-11.0f\n";
+ char *fm2_err = "%-15s %-7s %'-14.0f %'-11.0f\n";
+
+ rec = &stats_rec->redir_err;
+ prev = &stats_prev->redir_err;
+ t = calc_period(rec, prev);
+ for (i = 0; i < nr_cpus; i++) {
+ struct datarec *r = &rec->cpu[i];
+ struct datarec *p = &prev->cpu[i];
+
+ pps = calc_pps(r, p, t);
+ drop = calc_drop_pps(r, p, t);
+ if (pps > 0)
+ printf(fmt_err, "redirect_err", i, pps, drop);
+ }
+ pps = calc_pps(&rec->total, &prev->total, t);
+ drop = calc_drop_pps(&rec->total, &prev->total, t);
+ printf(fm2_err, "redirect_err", "total", pps, drop);
+ }
+
+ /* XDP general exception tracepoints */
+ {
+ char *fmt_err = "%-15s %-7d %'-14.0f %'-11.0f\n";
+ char *fm2_err = "%-15s %-7s %'-14.0f %'-11.0f\n";
+
+ rec = &stats_rec->exception;
+ prev = &stats_prev->exception;
+ t = calc_period(rec, prev);
+ for (i = 0; i < nr_cpus; i++) {
+ struct datarec *r = &rec->cpu[i];
+ struct datarec *p = &prev->cpu[i];
+
+ pps = calc_pps(r, p, t);
+ drop = calc_drop_pps(r, p, t);
+ if (pps > 0)
+ printf(fmt_err, "xdp_exception", i, pps, drop);
+ }
+ pps = calc_pps(&rec->total, &prev->total, t);
+ drop = calc_drop_pps(&rec->total, &prev->total, t);
+ printf(fm2_err, "xdp_exception", "total", pps, drop);
+ }
+
+ printf("\n");
+ fflush(stdout);
+}
+
+static void stats_collect(struct stats_record *rec)
+{
+ int fd, i;
+
+ fd = map_fd[1]; /* map: rx_cnt */
+ map_collect_percpu(fd, 0, &rec->rx_cnt);
+
+ fd = map_fd[2]; /* map: redirect_err_cnt */
+ map_collect_percpu(fd, 1, &rec->redir_err);
+
+ fd = map_fd[3]; /* map: cpumap_enqueue_cnt */
+ for (i = 0; i < MAX_CPUS; i++)
+ map_collect_percpu(fd, i, &rec->enq[i]);
+
+ fd = map_fd[4]; /* map: cpumap_kthread_cnt */
+ map_collect_percpu(fd, 0, &rec->kthread);
+
+ fd = map_fd[8]; /* map: exception_cnt */
+ map_collect_percpu(fd, 0, &rec->exception);
+}
+
+
+/* Pointer swap trick */
+static inline void swap(struct stats_record **a, struct stats_record **b)
+{
+ struct stats_record *tmp;
+
+ tmp = *a;
+ *a = *b;
+ *b = tmp;
+}
+
+static int create_cpu_entry(__u32 cpu, __u32 queue_size,
+ __u32 avail_idx, bool new)
+{
+ __u32 curr_cpus_count = 0;
+ __u32 key = 0;
+ int ret;
+
+ /* Add a CPU entry to cpumap, as this allocate a cpu entry in
+ * the kernel for the cpu.
+ */
+ ret = bpf_map_update_elem(map_fd[0], &cpu, &queue_size, 0);
+ if (ret) {
+ fprintf(stderr, "Create CPU entry failed (err:%d)\n", ret);
+ exit(EXIT_FAIL_BPF);
+ }
+
+ /* Inform bpf_prog's that a new CPU is available to select
+ * from via some control maps.
+ */
+ /* map_fd[5] = cpus_available */
+ ret = bpf_map_update_elem(map_fd[5], &avail_idx, &cpu, 0);
+ if (ret) {
+ fprintf(stderr, "Add to avail CPUs failed\n");
+ exit(EXIT_FAIL_BPF);
+ }
+
+ /* When not replacing/updating existing entry, bump the count */
+ /* map_fd[6] = cpus_count */
+ ret = bpf_map_lookup_elem(map_fd[6], &key, &curr_cpus_count);
+ if (ret) {
+ fprintf(stderr, "Failed reading curr cpus_count\n");
+ exit(EXIT_FAIL_BPF);
+ }
+ if (new) {
+ curr_cpus_count++;
+ ret = bpf_map_update_elem(map_fd[6], &key, &curr_cpus_count, 0);
+ if (ret) {
+ fprintf(stderr, "Failed write curr cpus_count\n");
+ exit(EXIT_FAIL_BPF);
+ }
+ }
+ /* map_fd[7] = cpus_iterator */
+ printf("%s CPU:%u as idx:%u queue_size:%d (total cpus_count:%u)\n",
+ new ? "Add-new":"Replace", cpu, avail_idx,
+ queue_size, curr_cpus_count);
+
+ return 0;
+}
+
+/* CPUs are zero-indexed. Thus, add a special sentinel default value
+ * in map cpus_available to mark CPU index'es not configured
+ */
+static void mark_cpus_unavailable(void)
+{
+ __u32 invalid_cpu = MAX_CPUS;
+ int ret, i;
+
+ for (i = 0; i < MAX_CPUS; i++) {
+ /* map_fd[5] = cpus_available */
+ ret = bpf_map_update_elem(map_fd[5], &i, &invalid_cpu, 0);
+ if (ret) {
+ fprintf(stderr, "Failed marking CPU unavailable\n");
+ exit(EXIT_FAIL_BPF);
+ }
+ }
+}
+
+/* Stress cpumap management code by concurrently changing underlying cpumap */
+static void stress_cpumap(void)
+{
+ /* Changing qsize will cause kernel to free and alloc a new
+ * bpf_cpu_map_entry, with an associated/complicated tear-down
+ * procedure.
+ */
+ create_cpu_entry(1, 1024, 0, false);
+ create_cpu_entry(1, 8, 0, false);
+ create_cpu_entry(1, 16000, 0, false);
+}
+
+static void stats_poll(int interval, bool use_separators, int prog_num,
+ bool stress_mode)
+{
+ struct stats_record *record, *prev;
+
+ record = alloc_stats_record();
+ prev = alloc_stats_record();
+ stats_collect(record);
+
+ /* Trick to pretty printf with thousands separators use %' */
+ if (use_separators)
+ setlocale(LC_NUMERIC, "en_US");
+
+ while (1) {
+ swap(&prev, &record);
+ stats_collect(record);
+ stats_print(record, prev, prog_num);
+ sleep(interval);
+ if (stress_mode)
+ stress_cpumap();
+ }
+
+ free_stats_record(record);
+ free_stats_record(prev);
+}
+
+int main(int argc, char **argv)
+{
+ struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
+ bool use_separators = true;
+ bool stress_mode = false;
+ char filename[256];
+ bool debug = false;
+ int added_cpus = 0;
+ int longindex = 0;
+ int interval = 2;
+ int prog_num = 5;
+ int add_cpu = -1;
+ __u32 qsize;
+ int opt;
+
+ /* Notice: choosing he queue size is very important with the
+ * ixgbe driver, because it's driver page recycling trick is
+ * dependend on pages being returned quickly. The number of
+ * out-standing packets in the system must be less-than 2x
+ * RX-ring size.
+ */
+ qsize = 128+64;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK)");
+ return 1;
+ }
+
+ if (load_bpf_file(filename)) {
+ fprintf(stderr, "ERR in load_bpf_file(): %s", bpf_log_buf);
+ return EXIT_FAIL;
+ }
+
+ if (!prog_fd[0]) {
+ fprintf(stderr, "ERR: load_bpf_file: %s\n", strerror(errno));
+ return EXIT_FAIL;
+ }
+
+ mark_cpus_unavailable();
+
+ /* Parse commands line args */
+ while ((opt = getopt_long(argc, argv, "hSd:",
+ long_options, &longindex)) != -1) {
+ switch (opt) {
+ case 'd':
+ if (strlen(optarg) >= IF_NAMESIZE) {
+ fprintf(stderr, "ERR: --dev name too long\n");
+ goto error;
+ }
+ ifname = (char *)&ifname_buf;
+ strncpy(ifname, optarg, IF_NAMESIZE);
+ ifindex = if_nametoindex(ifname);
+ if (ifindex == 0) {
+ fprintf(stderr,
+ "ERR: --dev name unknown err(%d):%s\n",
+ errno, strerror(errno));
+ goto error;
+ }
+ break;
+ case 's':
+ interval = atoi(optarg);
+ break;
+ case 'S':
+ xdp_flags |= XDP_FLAGS_SKB_MODE;
+ break;
+ case 'D':
+ debug = true;
+ break;
+ case 'x':
+ stress_mode = true;
+ break;
+ case 'z':
+ use_separators = false;
+ break;
+ case 'p':
+ /* Selecting eBPF prog to load */
+ prog_num = atoi(optarg);
+ if (prog_num < 0 || prog_num >= MAX_PROG) {
+ fprintf(stderr,
+ "--prognum too large err(%d):%s\n",
+ errno, strerror(errno));
+ goto error;
+ }
+ break;
+ case 'c':
+ /* Add multiple CPUs */
+ add_cpu = strtoul(optarg, NULL, 0);
+ if (add_cpu >= MAX_CPUS) {
+ fprintf(stderr,
+ "--cpu nr too large for cpumap err(%d):%s\n",
+ errno, strerror(errno));
+ goto error;
+ }
+ create_cpu_entry(add_cpu, qsize, added_cpus, true);
+ added_cpus++;
+ break;
+ case 'q':
+ qsize = atoi(optarg);
+ break;
+ case 'h':
+ error:
+ default:
+ usage(argv);
+ return EXIT_FAIL_OPTION;
+ }
+ }
+ /* Required option */
+ if (ifindex == -1) {
+ fprintf(stderr, "ERR: required option --dev missing\n");
+ usage(argv);
+ return EXIT_FAIL_OPTION;
+ }
+ /* Required option */
+ if (add_cpu == -1) {
+ fprintf(stderr, "ERR: required option --cpu missing\n");
+ fprintf(stderr, " Specify multiple --cpu option to add more\n");
+ usage(argv);
+ return EXIT_FAIL_OPTION;
+ }
+
+ /* Remove XDP program when program is interrupted or killed */
+ signal(SIGINT, int_exit);
+ signal(SIGTERM, int_exit);
+
+ if (bpf_set_link_xdp_fd(ifindex, prog_fd[prog_num], xdp_flags) < 0) {
+ fprintf(stderr, "link set xdp fd failed\n");
+ return EXIT_FAIL_XDP;
+ }
+
+ if (debug) {
+ printf("Debug-mode reading trace pipe (fix #define DEBUG)\n");
+ read_trace_pipe();
+ }
+
+ stats_poll(interval, use_separators, prog_num, stress_mode);
+ return EXIT_OK;
+}
diff --git a/samples/bpf/xdp_redirect_kern.c b/samples/bpf/xdp_redirect_kern.c
new file mode 100644
index 000000000..8abb151e3
--- /dev/null
+++ b/samples/bpf/xdp_redirect_kern.c
@@ -0,0 +1,90 @@
+/* Copyright (c) 2016 John Fastabend <john.r.fastabend@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") tx_port = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .max_entries = 1,
+};
+
+/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success
+ * feedback. Redirect TX errors can be caught via a tracepoint.
+ */
+struct bpf_map_def SEC("maps") rxcnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(long),
+ .max_entries = 1,
+};
+
+static void swap_src_dst_mac(void *data)
+{
+ unsigned short *p = data;
+ unsigned short dst[3];
+
+ dst[0] = p[0];
+ dst[1] = p[1];
+ dst[2] = p[2];
+ p[0] = p[3];
+ p[1] = p[4];
+ p[2] = p[5];
+ p[3] = dst[0];
+ p[4] = dst[1];
+ p[5] = dst[2];
+}
+
+SEC("xdp_redirect")
+int xdp_redirect_prog(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct ethhdr *eth = data;
+ int rc = XDP_DROP;
+ int *ifindex, port = 0;
+ long *value;
+ u32 key = 0;
+ u64 nh_off;
+
+ nh_off = sizeof(*eth);
+ if (data + nh_off > data_end)
+ return rc;
+
+ ifindex = bpf_map_lookup_elem(&tx_port, &port);
+ if (!ifindex)
+ return rc;
+
+ value = bpf_map_lookup_elem(&rxcnt, &key);
+ if (value)
+ *value += 1;
+
+ swap_src_dst_mac(data);
+ return bpf_redirect(*ifindex, 0);
+}
+
+/* Redirect require an XDP bpf_prog loaded on the TX device */
+SEC("xdp_redirect_dummy")
+int xdp_redirect_dummy_prog(struct xdp_md *ctx)
+{
+ return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_redirect_map_kern.c b/samples/bpf/xdp_redirect_map_kern.c
new file mode 100644
index 000000000..740a529ba
--- /dev/null
+++ b/samples/bpf/xdp_redirect_map_kern.c
@@ -0,0 +1,92 @@
+/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") tx_port = {
+ .type = BPF_MAP_TYPE_DEVMAP,
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .max_entries = 100,
+};
+
+/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success
+ * feedback. Redirect TX errors can be caught via a tracepoint.
+ */
+struct bpf_map_def SEC("maps") rxcnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(long),
+ .max_entries = 1,
+};
+
+static void swap_src_dst_mac(void *data)
+{
+ unsigned short *p = data;
+ unsigned short dst[3];
+
+ dst[0] = p[0];
+ dst[1] = p[1];
+ dst[2] = p[2];
+ p[0] = p[3];
+ p[1] = p[4];
+ p[2] = p[5];
+ p[3] = dst[0];
+ p[4] = dst[1];
+ p[5] = dst[2];
+}
+
+SEC("xdp_redirect_map")
+int xdp_redirect_map_prog(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct ethhdr *eth = data;
+ int rc = XDP_DROP;
+ int vport, port = 0, m = 0;
+ long *value;
+ u32 key = 0;
+ u64 nh_off;
+
+ nh_off = sizeof(*eth);
+ if (data + nh_off > data_end)
+ return rc;
+
+ /* constant virtual port */
+ vport = 0;
+
+ /* count packet in global counter */
+ value = bpf_map_lookup_elem(&rxcnt, &key);
+ if (value)
+ *value += 1;
+
+ swap_src_dst_mac(data);
+
+ /* send packet out physical port */
+ return bpf_redirect_map(&tx_port, vport, 0);
+}
+
+/* Redirect require an XDP bpf_prog loaded on the TX device */
+SEC("xdp_redirect_dummy")
+int xdp_redirect_dummy_prog(struct xdp_md *ctx)
+{
+ return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c
new file mode 100644
index 000000000..4445e7685
--- /dev/null
+++ b/samples/bpf/xdp_redirect_map_user.c
@@ -0,0 +1,152 @@
+/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <libgen.h>
+#include <sys/resource.h>
+
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include <bpf/bpf.h>
+
+static int ifindex_in;
+static int ifindex_out;
+static bool ifindex_out_xdp_dummy_attached = true;
+
+static __u32 xdp_flags;
+
+static void int_exit(int sig)
+{
+ bpf_set_link_xdp_fd(ifindex_in, -1, xdp_flags);
+ if (ifindex_out_xdp_dummy_attached)
+ bpf_set_link_xdp_fd(ifindex_out, -1, xdp_flags);
+ exit(0);
+}
+
+static void poll_stats(int interval, int ifindex)
+{
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ __u64 values[nr_cpus], prev[nr_cpus];
+
+ memset(prev, 0, sizeof(prev));
+
+ while (1) {
+ __u64 sum = 0;
+ __u32 key = 0;
+ int i;
+
+ sleep(interval);
+ assert(bpf_map_lookup_elem(map_fd[1], &key, values) == 0);
+ for (i = 0; i < nr_cpus; i++)
+ sum += (values[i] - prev[i]);
+ if (sum)
+ printf("ifindex %i: %10llu pkt/s\n",
+ ifindex, sum / interval);
+ memcpy(prev, values, sizeof(values));
+ }
+}
+
+static void usage(const char *prog)
+{
+ fprintf(stderr,
+ "usage: %s [OPTS] IFINDEX_IN IFINDEX_OUT\n\n"
+ "OPTS:\n"
+ " -S use skb-mode\n"
+ " -N enforce native mode\n",
+ prog);
+}
+
+int main(int argc, char **argv)
+{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ const char *optstr = "SN";
+ char filename[256];
+ int ret, opt, key = 0;
+
+ while ((opt = getopt(argc, argv, optstr)) != -1) {
+ switch (opt) {
+ case 'S':
+ xdp_flags |= XDP_FLAGS_SKB_MODE;
+ break;
+ case 'N':
+ xdp_flags |= XDP_FLAGS_DRV_MODE;
+ break;
+ default:
+ usage(basename(argv[0]));
+ return 1;
+ }
+ }
+
+ if (optind == argc) {
+ printf("usage: %s IFINDEX_IN IFINDEX_OUT\n", argv[0]);
+ return 1;
+ }
+
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK)");
+ return 1;
+ }
+
+ ifindex_in = strtoul(argv[optind], NULL, 0);
+ ifindex_out = strtoul(argv[optind + 1], NULL, 0);
+ printf("input: %d output: %d\n", ifindex_in, ifindex_out);
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ if (!prog_fd[0]) {
+ printf("load_bpf_file: %s\n", strerror(errno));
+ return 1;
+ }
+
+ if (bpf_set_link_xdp_fd(ifindex_in, prog_fd[0], xdp_flags) < 0) {
+ printf("ERROR: link set xdp fd failed on %d\n", ifindex_in);
+ return 1;
+ }
+
+ /* Loading dummy XDP prog on out-device */
+ if (bpf_set_link_xdp_fd(ifindex_out, prog_fd[1],
+ (xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST)) < 0) {
+ printf("WARN: link set xdp fd failed on %d\n", ifindex_out);
+ ifindex_out_xdp_dummy_attached = false;
+ }
+
+ signal(SIGINT, int_exit);
+ signal(SIGTERM, int_exit);
+
+ printf("map[0] (vports) = %i, map[1] (map) = %i, map[2] (count) = %i\n",
+ map_fd[0], map_fd[1], map_fd[2]);
+
+ /* populate virtual to physical port map */
+ ret = bpf_map_update_elem(map_fd[0], &key, &ifindex_out, 0);
+ if (ret) {
+ perror("bpf_update_elem");
+ goto out;
+ }
+
+ poll_stats(2, ifindex_out);
+
+out:
+ return 0;
+}
diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c
new file mode 100644
index 000000000..0f96a26b6
--- /dev/null
+++ b/samples/bpf/xdp_redirect_user.c
@@ -0,0 +1,150 @@
+/* Copyright (c) 2016 John Fastabend <john.r.fastabend@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <libgen.h>
+#include <sys/resource.h>
+
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include <bpf/bpf.h>
+
+static int ifindex_in;
+static int ifindex_out;
+static bool ifindex_out_xdp_dummy_attached = true;
+
+static __u32 xdp_flags;
+
+static void int_exit(int sig)
+{
+ bpf_set_link_xdp_fd(ifindex_in, -1, xdp_flags);
+ if (ifindex_out_xdp_dummy_attached)
+ bpf_set_link_xdp_fd(ifindex_out, -1, xdp_flags);
+ exit(0);
+}
+
+static void poll_stats(int interval, int ifindex)
+{
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ __u64 values[nr_cpus], prev[nr_cpus];
+
+ memset(prev, 0, sizeof(prev));
+
+ while (1) {
+ __u64 sum = 0;
+ __u32 key = 0;
+ int i;
+
+ sleep(interval);
+ assert(bpf_map_lookup_elem(map_fd[1], &key, values) == 0);
+ for (i = 0; i < nr_cpus; i++)
+ sum += (values[i] - prev[i]);
+ if (sum)
+ printf("ifindex %i: %10llu pkt/s\n",
+ ifindex, sum / interval);
+ memcpy(prev, values, sizeof(values));
+ }
+}
+
+static void usage(const char *prog)
+{
+ fprintf(stderr,
+ "usage: %s [OPTS] IFINDEX_IN IFINDEX_OUT\n\n"
+ "OPTS:\n"
+ " -S use skb-mode\n"
+ " -N enforce native mode\n",
+ prog);
+}
+
+
+int main(int argc, char **argv)
+{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ const char *optstr = "SN";
+ char filename[256];
+ int ret, opt, key = 0;
+
+ while ((opt = getopt(argc, argv, optstr)) != -1) {
+ switch (opt) {
+ case 'S':
+ xdp_flags |= XDP_FLAGS_SKB_MODE;
+ break;
+ case 'N':
+ xdp_flags |= XDP_FLAGS_DRV_MODE;
+ break;
+ default:
+ usage(basename(argv[0]));
+ return 1;
+ }
+ }
+
+ if (optind == argc) {
+ printf("usage: %s IFINDEX_IN IFINDEX_OUT\n", argv[0]);
+ return 1;
+ }
+
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK)");
+ return 1;
+ }
+
+ ifindex_in = strtoul(argv[optind], NULL, 0);
+ ifindex_out = strtoul(argv[optind + 1], NULL, 0);
+ printf("input: %d output: %d\n", ifindex_in, ifindex_out);
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ if (!prog_fd[0]) {
+ printf("load_bpf_file: %s\n", strerror(errno));
+ return 1;
+ }
+
+ if (bpf_set_link_xdp_fd(ifindex_in, prog_fd[0], xdp_flags) < 0) {
+ printf("ERROR: link set xdp fd failed on %d\n", ifindex_in);
+ return 1;
+ }
+
+ /* Loading dummy XDP prog on out-device */
+ if (bpf_set_link_xdp_fd(ifindex_out, prog_fd[1],
+ (xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST)) < 0) {
+ printf("WARN: link set xdp fd failed on %d\n", ifindex_out);
+ ifindex_out_xdp_dummy_attached = false;
+ }
+
+ signal(SIGINT, int_exit);
+ signal(SIGTERM, int_exit);
+
+ /* bpf redirect port */
+ ret = bpf_map_update_elem(map_fd[0], &key, &ifindex_out, 0);
+ if (ret) {
+ perror("bpf_update_elem");
+ goto out;
+ }
+
+ poll_stats(2, ifindex_out);
+
+out:
+ return ret;
+}
diff --git a/samples/bpf/xdp_router_ipv4_kern.c b/samples/bpf/xdp_router_ipv4_kern.c
new file mode 100644
index 000000000..993f56bc7
--- /dev/null
+++ b/samples/bpf/xdp_router_ipv4_kern.c
@@ -0,0 +1,186 @@
+/* Copyright (C) 2017 Cavium, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include "bpf_helpers.h"
+#include <linux/slab.h>
+#include <net/ip_fib.h>
+
+struct trie_value {
+ __u8 prefix[4];
+ __be64 value;
+ int ifindex;
+ int metric;
+ __be32 gw;
+};
+
+/* Key for lpm_trie*/
+union key_4 {
+ u32 b32[2];
+ u8 b8[8];
+};
+
+struct arp_entry {
+ __be64 mac;
+ __be32 dst;
+};
+
+struct direct_map {
+ struct arp_entry arp;
+ int ifindex;
+ __be64 mac;
+};
+
+/* Map for trie implementation*/
+struct bpf_map_def SEC("maps") lpm_map = {
+ .type = BPF_MAP_TYPE_LPM_TRIE,
+ .key_size = 8,
+ .value_size = sizeof(struct trie_value),
+ .max_entries = 50,
+ .map_flags = BPF_F_NO_PREALLOC,
+};
+
+/* Map for counter*/
+struct bpf_map_def SEC("maps") rxcnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u64),
+ .max_entries = 256,
+};
+
+/* Map for ARP table*/
+struct bpf_map_def SEC("maps") arp_table = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(__be32),
+ .value_size = sizeof(__be64),
+ .max_entries = 50,
+};
+
+/* Map to keep the exact match entries in the route table*/
+struct bpf_map_def SEC("maps") exact_match = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(__be32),
+ .value_size = sizeof(struct direct_map),
+ .max_entries = 50,
+};
+
+struct bpf_map_def SEC("maps") tx_port = {
+ .type = BPF_MAP_TYPE_DEVMAP,
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .max_entries = 100,
+};
+
+/* Function to set source and destination mac of the packet */
+static inline void set_src_dst_mac(void *data, void *src, void *dst)
+{
+ unsigned short *source = src;
+ unsigned short *dest = dst;
+ unsigned short *p = data;
+
+ __builtin_memcpy(p, dest, 6);
+ __builtin_memcpy(p + 3, source, 6);
+}
+
+/* Parse IPV4 packet to get SRC, DST IP and protocol */
+static inline int parse_ipv4(void *data, u64 nh_off, void *data_end,
+ __be32 *src, __be32 *dest)
+{
+ struct iphdr *iph = data + nh_off;
+
+ if (iph + 1 > data_end)
+ return 0;
+ *src = iph->saddr;
+ *dest = iph->daddr;
+ return iph->protocol;
+}
+
+SEC("xdp_router_ipv4")
+int xdp_router_ipv4_prog(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ __be64 *dest_mac = NULL, *src_mac = NULL;
+ void *data = (void *)(long)ctx->data;
+ struct trie_value *prefix_value;
+ int rc = XDP_DROP, forward_to;
+ struct ethhdr *eth = data;
+ union key_4 key4;
+ long *value;
+ u16 h_proto;
+ u32 ipproto;
+ u64 nh_off;
+
+ nh_off = sizeof(*eth);
+ if (data + nh_off > data_end)
+ return rc;
+
+ h_proto = eth->h_proto;
+
+ if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
+ struct vlan_hdr *vhdr;
+
+ vhdr = data + nh_off;
+ nh_off += sizeof(struct vlan_hdr);
+ if (data + nh_off > data_end)
+ return rc;
+ h_proto = vhdr->h_vlan_encapsulated_proto;
+ }
+ if (h_proto == htons(ETH_P_ARP)) {
+ return XDP_PASS;
+ } else if (h_proto == htons(ETH_P_IP)) {
+ struct direct_map *direct_entry;
+ __be32 src_ip = 0, dest_ip = 0;
+
+ ipproto = parse_ipv4(data, nh_off, data_end, &src_ip, &dest_ip);
+ direct_entry = bpf_map_lookup_elem(&exact_match, &dest_ip);
+ /* Check for exact match, this would give a faster lookup*/
+ if (direct_entry && direct_entry->mac && direct_entry->arp.mac) {
+ src_mac = &direct_entry->mac;
+ dest_mac = &direct_entry->arp.mac;
+ forward_to = direct_entry->ifindex;
+ } else {
+ /* Look up in the trie for lpm*/
+ key4.b32[0] = 32;
+ key4.b8[4] = dest_ip & 0xff;
+ key4.b8[5] = (dest_ip >> 8) & 0xff;
+ key4.b8[6] = (dest_ip >> 16) & 0xff;
+ key4.b8[7] = (dest_ip >> 24) & 0xff;
+ prefix_value = bpf_map_lookup_elem(&lpm_map, &key4);
+ if (!prefix_value)
+ return XDP_DROP;
+ src_mac = &prefix_value->value;
+ if (!src_mac)
+ return XDP_DROP;
+ dest_mac = bpf_map_lookup_elem(&arp_table, &dest_ip);
+ if (!dest_mac) {
+ if (!prefix_value->gw)
+ return XDP_DROP;
+ dest_ip = prefix_value->gw;
+ dest_mac = bpf_map_lookup_elem(&arp_table, &dest_ip);
+ }
+ forward_to = prefix_value->ifindex;
+ }
+ } else {
+ ipproto = 0;
+ }
+ if (src_mac && dest_mac) {
+ set_src_dst_mac(data, src_mac, dest_mac);
+ value = bpf_map_lookup_elem(&rxcnt, &ipproto);
+ if (value)
+ *value += 1;
+ return bpf_redirect_map(&tx_port, forward_to, 0);
+ }
+ return rc;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c
new file mode 100644
index 000000000..b2b4dfa77
--- /dev/null
+++ b/samples/bpf/xdp_router_ipv4_user.c
@@ -0,0 +1,660 @@
+/* Copyright (C) 2017 Cavium, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ */
+#include <linux/bpf.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include "bpf_load.h"
+#include <bpf/bpf.h>
+#include <arpa/inet.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <net/if.h>
+#include <netdb.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+#include "bpf_util.h"
+
+int sock, sock_arp, flags = 0;
+static int total_ifindex;
+int *ifindex_list;
+char buf[8192];
+
+static int get_route_table(int rtm_family);
+static void int_exit(int sig)
+{
+ int i = 0;
+
+ for (i = 0; i < total_ifindex; i++)
+ bpf_set_link_xdp_fd(ifindex_list[i], -1, flags);
+ exit(0);
+}
+
+static void close_and_exit(int sig)
+{
+ int i = 0;
+
+ close(sock);
+ close(sock_arp);
+
+ for (i = 0; i < total_ifindex; i++)
+ bpf_set_link_xdp_fd(ifindex_list[i], -1, flags);
+ exit(0);
+}
+
+/* Get the mac address of the interface given interface name */
+static __be64 getmac(char *iface)
+{
+ struct ifreq ifr;
+ __be64 mac = 0;
+ int fd, i;
+
+ fd = socket(AF_INET, SOCK_DGRAM, 0);
+ ifr.ifr_addr.sa_family = AF_INET;
+ strncpy(ifr.ifr_name, iface, IFNAMSIZ - 1);
+ if (ioctl(fd, SIOCGIFHWADDR, &ifr) < 0) {
+ printf("ioctl failed leaving....\n");
+ return -1;
+ }
+ for (i = 0; i < 6 ; i++)
+ *((__u8 *)&mac + i) = (__u8)ifr.ifr_hwaddr.sa_data[i];
+ close(fd);
+ return mac;
+}
+
+static int recv_msg(struct sockaddr_nl sock_addr, int sock)
+{
+ struct nlmsghdr *nh;
+ int len, nll = 0;
+ char *buf_ptr;
+
+ buf_ptr = buf;
+ while (1) {
+ len = recv(sock, buf_ptr, sizeof(buf) - nll, 0);
+ if (len < 0)
+ return len;
+
+ nh = (struct nlmsghdr *)buf_ptr;
+
+ if (nh->nlmsg_type == NLMSG_DONE)
+ break;
+ buf_ptr += len;
+ nll += len;
+ if ((sock_addr.nl_groups & RTMGRP_NEIGH) == RTMGRP_NEIGH)
+ break;
+
+ if ((sock_addr.nl_groups & RTMGRP_IPV4_ROUTE) == RTMGRP_IPV4_ROUTE)
+ break;
+ }
+ return nll;
+}
+
+/* Function to parse the route entry returned by netlink
+ * Updates the route entry related map entries
+ */
+static void read_route(struct nlmsghdr *nh, int nll)
+{
+ char dsts[24], gws[24], ifs[16], dsts_len[24], metrics[24];
+ struct bpf_lpm_trie_key *prefix_key;
+ struct rtattr *rt_attr;
+ struct rtmsg *rt_msg;
+ int rtm_family;
+ int rtl;
+ int i;
+ struct route_table {
+ int dst_len, iface, metric;
+ char *iface_name;
+ __be32 dst, gw;
+ __be64 mac;
+ } route;
+ struct arp_table {
+ __be64 mac;
+ __be32 dst;
+ };
+
+ struct direct_map {
+ struct arp_table arp;
+ int ifindex;
+ __be64 mac;
+ } direct_entry;
+
+ if (nh->nlmsg_type == RTM_DELROUTE)
+ printf("DELETING Route entry\n");
+ else if (nh->nlmsg_type == RTM_GETROUTE)
+ printf("READING Route entry\n");
+ else if (nh->nlmsg_type == RTM_NEWROUTE)
+ printf("NEW Route entry\n");
+ else
+ printf("%d\n", nh->nlmsg_type);
+
+ memset(&route, 0, sizeof(route));
+ printf("Destination\t\tGateway\t\tGenmask\t\tMetric\t\tIface\n");
+ for (; NLMSG_OK(nh, nll); nh = NLMSG_NEXT(nh, nll)) {
+ rt_msg = (struct rtmsg *)NLMSG_DATA(nh);
+ rtm_family = rt_msg->rtm_family;
+ if (rtm_family == AF_INET)
+ if (rt_msg->rtm_table != RT_TABLE_MAIN)
+ continue;
+ rt_attr = (struct rtattr *)RTM_RTA(rt_msg);
+ rtl = RTM_PAYLOAD(nh);
+
+ for (; RTA_OK(rt_attr, rtl); rt_attr = RTA_NEXT(rt_attr, rtl)) {
+ switch (rt_attr->rta_type) {
+ case NDA_DST:
+ sprintf(dsts, "%u",
+ (*((__be32 *)RTA_DATA(rt_attr))));
+ break;
+ case RTA_GATEWAY:
+ sprintf(gws, "%u",
+ *((__be32 *)RTA_DATA(rt_attr)));
+ break;
+ case RTA_OIF:
+ sprintf(ifs, "%u",
+ *((int *)RTA_DATA(rt_attr)));
+ break;
+ case RTA_METRICS:
+ sprintf(metrics, "%u",
+ *((int *)RTA_DATA(rt_attr)));
+ default:
+ break;
+ }
+ }
+ sprintf(dsts_len, "%d", rt_msg->rtm_dst_len);
+ route.dst = atoi(dsts);
+ route.dst_len = atoi(dsts_len);
+ route.gw = atoi(gws);
+ route.iface = atoi(ifs);
+ route.metric = atoi(metrics);
+ route.iface_name = alloca(sizeof(char *) * IFNAMSIZ);
+ route.iface_name = if_indextoname(route.iface, route.iface_name);
+ route.mac = getmac(route.iface_name);
+ if (route.mac == -1) {
+ int i = 0;
+
+ for (i = 0; i < total_ifindex; i++)
+ bpf_set_link_xdp_fd(ifindex_list[i], -1, flags);
+ exit(0);
+ }
+ assert(bpf_map_update_elem(map_fd[4], &route.iface, &route.iface, 0) == 0);
+ if (rtm_family == AF_INET) {
+ struct trie_value {
+ __u8 prefix[4];
+ __be64 value;
+ int ifindex;
+ int metric;
+ __be32 gw;
+ } *prefix_value;
+
+ prefix_key = alloca(sizeof(*prefix_key) + 3);
+ prefix_value = alloca(sizeof(*prefix_value));
+
+ prefix_key->prefixlen = 32;
+ prefix_key->prefixlen = route.dst_len;
+ direct_entry.mac = route.mac & 0xffffffffffff;
+ direct_entry.ifindex = route.iface;
+ direct_entry.arp.mac = 0;
+ direct_entry.arp.dst = 0;
+ if (route.dst_len == 32) {
+ if (nh->nlmsg_type == RTM_DELROUTE) {
+ assert(bpf_map_delete_elem(map_fd[3], &route.dst) == 0);
+ } else {
+ if (bpf_map_lookup_elem(map_fd[2], &route.dst, &direct_entry.arp.mac) == 0)
+ direct_entry.arp.dst = route.dst;
+ assert(bpf_map_update_elem(map_fd[3], &route.dst, &direct_entry, 0) == 0);
+ }
+ }
+ for (i = 0; i < 4; i++)
+ prefix_key->data[i] = (route.dst >> i * 8) & 0xff;
+
+ printf("%3d.%d.%d.%d\t\t%3x\t\t%d\t\t%d\t\t%s\n",
+ (int)prefix_key->data[0],
+ (int)prefix_key->data[1],
+ (int)prefix_key->data[2],
+ (int)prefix_key->data[3],
+ route.gw, route.dst_len,
+ route.metric,
+ route.iface_name);
+ if (bpf_map_lookup_elem(map_fd[0], prefix_key,
+ prefix_value) < 0) {
+ for (i = 0; i < 4; i++)
+ prefix_value->prefix[i] = prefix_key->data[i];
+ prefix_value->value = route.mac & 0xffffffffffff;
+ prefix_value->ifindex = route.iface;
+ prefix_value->gw = route.gw;
+ prefix_value->metric = route.metric;
+
+ assert(bpf_map_update_elem(map_fd[0],
+ prefix_key,
+ prefix_value, 0
+ ) == 0);
+ } else {
+ if (nh->nlmsg_type == RTM_DELROUTE) {
+ printf("deleting entry\n");
+ printf("prefix key=%d.%d.%d.%d/%d",
+ prefix_key->data[0],
+ prefix_key->data[1],
+ prefix_key->data[2],
+ prefix_key->data[3],
+ prefix_key->prefixlen);
+ assert(bpf_map_delete_elem(map_fd[0],
+ prefix_key
+ ) == 0);
+ /* Rereading the route table to check if
+ * there is an entry with the same
+ * prefix but a different metric as the
+ * deleted enty.
+ */
+ get_route_table(AF_INET);
+ } else if (prefix_key->data[0] ==
+ prefix_value->prefix[0] &&
+ prefix_key->data[1] ==
+ prefix_value->prefix[1] &&
+ prefix_key->data[2] ==
+ prefix_value->prefix[2] &&
+ prefix_key->data[3] ==
+ prefix_value->prefix[3] &&
+ route.metric >= prefix_value->metric) {
+ continue;
+ } else {
+ for (i = 0; i < 4; i++)
+ prefix_value->prefix[i] =
+ prefix_key->data[i];
+ prefix_value->value =
+ route.mac & 0xffffffffffff;
+ prefix_value->ifindex = route.iface;
+ prefix_value->gw = route.gw;
+ prefix_value->metric = route.metric;
+ assert(bpf_map_update_elem(
+ map_fd[0],
+ prefix_key,
+ prefix_value,
+ 0) == 0);
+ }
+ }
+ }
+ memset(&route, 0, sizeof(route));
+ memset(dsts, 0, sizeof(dsts));
+ memset(dsts_len, 0, sizeof(dsts_len));
+ memset(gws, 0, sizeof(gws));
+ memset(ifs, 0, sizeof(ifs));
+ memset(&route, 0, sizeof(route));
+ }
+}
+
+/* Function to read the existing route table when the process is launched*/
+static int get_route_table(int rtm_family)
+{
+ struct sockaddr_nl sa;
+ struct nlmsghdr *nh;
+ int sock, seq = 0;
+ struct msghdr msg;
+ struct iovec iov;
+ int ret = 0;
+ int nll;
+
+ struct {
+ struct nlmsghdr nl;
+ struct rtmsg rt;
+ char buf[8192];
+ } req;
+
+ sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+ if (sock < 0) {
+ printf("open netlink socket: %s\n", strerror(errno));
+ return -1;
+ }
+ memset(&sa, 0, sizeof(sa));
+ sa.nl_family = AF_NETLINK;
+ if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
+ printf("bind to netlink: %s\n", strerror(errno));
+ ret = -1;
+ goto cleanup;
+ }
+ memset(&req, 0, sizeof(req));
+ req.nl.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
+ req.nl.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+ req.nl.nlmsg_type = RTM_GETROUTE;
+
+ req.rt.rtm_family = rtm_family;
+ req.rt.rtm_table = RT_TABLE_MAIN;
+ req.nl.nlmsg_pid = 0;
+ req.nl.nlmsg_seq = ++seq;
+ memset(&msg, 0, sizeof(msg));
+ iov.iov_base = (void *)&req.nl;
+ iov.iov_len = req.nl.nlmsg_len;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ ret = sendmsg(sock, &msg, 0);
+ if (ret < 0) {
+ printf("send to netlink: %s\n", strerror(errno));
+ ret = -1;
+ goto cleanup;
+ }
+ memset(buf, 0, sizeof(buf));
+ nll = recv_msg(sa, sock);
+ if (nll < 0) {
+ printf("recv from netlink: %s\n", strerror(nll));
+ ret = -1;
+ goto cleanup;
+ }
+ nh = (struct nlmsghdr *)buf;
+ read_route(nh, nll);
+cleanup:
+ close(sock);
+ return ret;
+}
+
+/* Function to parse the arp entry returned by netlink
+ * Updates the arp entry related map entries
+ */
+static void read_arp(struct nlmsghdr *nh, int nll)
+{
+ struct rtattr *rt_attr;
+ char dsts[24], mac[24];
+ struct ndmsg *rt_msg;
+ int rtl, ndm_family;
+
+ struct arp_table {
+ __be64 mac;
+ __be32 dst;
+ } arp_entry;
+ struct direct_map {
+ struct arp_table arp;
+ int ifindex;
+ __be64 mac;
+ } direct_entry;
+
+ if (nh->nlmsg_type == RTM_GETNEIGH)
+ printf("READING arp entry\n");
+ printf("Address\tHwAddress\n");
+ for (; NLMSG_OK(nh, nll); nh = NLMSG_NEXT(nh, nll)) {
+ rt_msg = (struct ndmsg *)NLMSG_DATA(nh);
+ rt_attr = (struct rtattr *)RTM_RTA(rt_msg);
+ ndm_family = rt_msg->ndm_family;
+ rtl = RTM_PAYLOAD(nh);
+ for (; RTA_OK(rt_attr, rtl); rt_attr = RTA_NEXT(rt_attr, rtl)) {
+ switch (rt_attr->rta_type) {
+ case NDA_DST:
+ sprintf(dsts, "%u",
+ *((__be32 *)RTA_DATA(rt_attr)));
+ break;
+ case NDA_LLADDR:
+ sprintf(mac, "%lld",
+ *((__be64 *)RTA_DATA(rt_attr)));
+ break;
+ default:
+ break;
+ }
+ }
+ arp_entry.dst = atoi(dsts);
+ arp_entry.mac = atol(mac);
+ printf("%x\t\t%llx\n", arp_entry.dst, arp_entry.mac);
+ if (ndm_family == AF_INET) {
+ if (bpf_map_lookup_elem(map_fd[3], &arp_entry.dst,
+ &direct_entry) == 0) {
+ if (nh->nlmsg_type == RTM_DELNEIGH) {
+ direct_entry.arp.dst = 0;
+ direct_entry.arp.mac = 0;
+ } else if (nh->nlmsg_type == RTM_NEWNEIGH) {
+ direct_entry.arp.dst = arp_entry.dst;
+ direct_entry.arp.mac = arp_entry.mac;
+ }
+ assert(bpf_map_update_elem(map_fd[3],
+ &arp_entry.dst,
+ &direct_entry, 0
+ ) == 0);
+ memset(&direct_entry, 0, sizeof(direct_entry));
+ }
+ if (nh->nlmsg_type == RTM_DELNEIGH) {
+ assert(bpf_map_delete_elem(map_fd[2], &arp_entry.dst) == 0);
+ } else if (nh->nlmsg_type == RTM_NEWNEIGH) {
+ assert(bpf_map_update_elem(map_fd[2],
+ &arp_entry.dst,
+ &arp_entry.mac, 0
+ ) == 0);
+ }
+ }
+ memset(&arp_entry, 0, sizeof(arp_entry));
+ memset(dsts, 0, sizeof(dsts));
+ }
+}
+
+/* Function to read the existing arp table when the process is launched*/
+static int get_arp_table(int rtm_family)
+{
+ struct sockaddr_nl sa;
+ struct nlmsghdr *nh;
+ int sock, seq = 0;
+ struct msghdr msg;
+ struct iovec iov;
+ int ret = 0;
+ int nll;
+ struct {
+ struct nlmsghdr nl;
+ struct ndmsg rt;
+ char buf[8192];
+ } req;
+
+ sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+ if (sock < 0) {
+ printf("open netlink socket: %s\n", strerror(errno));
+ return -1;
+ }
+ memset(&sa, 0, sizeof(sa));
+ sa.nl_family = AF_NETLINK;
+ if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
+ printf("bind to netlink: %s\n", strerror(errno));
+ ret = -1;
+ goto cleanup;
+ }
+ memset(&req, 0, sizeof(req));
+ req.nl.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
+ req.nl.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+ req.nl.nlmsg_type = RTM_GETNEIGH;
+ req.rt.ndm_state = NUD_REACHABLE;
+ req.rt.ndm_family = rtm_family;
+ req.nl.nlmsg_pid = 0;
+ req.nl.nlmsg_seq = ++seq;
+ memset(&msg, 0, sizeof(msg));
+ iov.iov_base = (void *)&req.nl;
+ iov.iov_len = req.nl.nlmsg_len;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ ret = sendmsg(sock, &msg, 0);
+ if (ret < 0) {
+ printf("send to netlink: %s\n", strerror(errno));
+ ret = -1;
+ goto cleanup;
+ }
+ memset(buf, 0, sizeof(buf));
+ nll = recv_msg(sa, sock);
+ if (nll < 0) {
+ printf("recv from netlink: %s\n", strerror(nll));
+ ret = -1;
+ goto cleanup;
+ }
+ nh = (struct nlmsghdr *)buf;
+ read_arp(nh, nll);
+cleanup:
+ close(sock);
+ return ret;
+}
+
+/* Function to keep track and update changes in route and arp table
+ * Give regular statistics of packets forwarded
+ */
+static int monitor_route(void)
+{
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ const unsigned int nr_keys = 256;
+ struct pollfd fds_route, fds_arp;
+ __u64 prev[nr_keys][nr_cpus];
+ struct sockaddr_nl la, lr;
+ __u64 values[nr_cpus];
+ struct nlmsghdr *nh;
+ int nll, ret = 0;
+ int interval = 5;
+ __u32 key;
+ int i;
+
+ sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+ if (sock < 0) {
+ printf("open netlink socket: %s\n", strerror(errno));
+ return -1;
+ }
+
+ fcntl(sock, F_SETFL, O_NONBLOCK);
+ memset(&lr, 0, sizeof(lr));
+ lr.nl_family = AF_NETLINK;
+ lr.nl_groups = RTMGRP_IPV6_ROUTE | RTMGRP_IPV4_ROUTE | RTMGRP_NOTIFY;
+ if (bind(sock, (struct sockaddr *)&lr, sizeof(lr)) < 0) {
+ printf("bind to netlink: %s\n", strerror(errno));
+ ret = -1;
+ goto cleanup;
+ }
+ fds_route.fd = sock;
+ fds_route.events = POLL_IN;
+
+ sock_arp = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+ if (sock_arp < 0) {
+ printf("open netlink socket: %s\n", strerror(errno));
+ return -1;
+ }
+
+ fcntl(sock_arp, F_SETFL, O_NONBLOCK);
+ memset(&la, 0, sizeof(la));
+ la.nl_family = AF_NETLINK;
+ la.nl_groups = RTMGRP_NEIGH | RTMGRP_NOTIFY;
+ if (bind(sock_arp, (struct sockaddr *)&la, sizeof(la)) < 0) {
+ printf("bind to netlink: %s\n", strerror(errno));
+ ret = -1;
+ goto cleanup;
+ }
+ fds_arp.fd = sock_arp;
+ fds_arp.events = POLL_IN;
+
+ memset(prev, 0, sizeof(prev));
+ do {
+ signal(SIGINT, close_and_exit);
+ signal(SIGTERM, close_and_exit);
+
+ sleep(interval);
+ for (key = 0; key < nr_keys; key++) {
+ __u64 sum = 0;
+
+ assert(bpf_map_lookup_elem(map_fd[1], &key, values) == 0);
+ for (i = 0; i < nr_cpus; i++)
+ sum += (values[i] - prev[key][i]);
+ if (sum)
+ printf("proto %u: %10llu pkt/s\n",
+ key, sum / interval);
+ memcpy(prev[key], values, sizeof(values));
+ }
+
+ memset(buf, 0, sizeof(buf));
+ if (poll(&fds_route, 1, 3) == POLL_IN) {
+ nll = recv_msg(lr, sock);
+ if (nll < 0) {
+ printf("recv from netlink: %s\n", strerror(nll));
+ ret = -1;
+ goto cleanup;
+ }
+
+ nh = (struct nlmsghdr *)buf;
+ printf("Routing table updated.\n");
+ read_route(nh, nll);
+ }
+ memset(buf, 0, sizeof(buf));
+ if (poll(&fds_arp, 1, 3) == POLL_IN) {
+ nll = recv_msg(la, sock_arp);
+ if (nll < 0) {
+ printf("recv from netlink: %s\n", strerror(nll));
+ ret = -1;
+ goto cleanup;
+ }
+
+ nh = (struct nlmsghdr *)buf;
+ read_arp(nh, nll);
+ }
+
+ } while (1);
+cleanup:
+ close(sock);
+ return ret;
+}
+
+int main(int ac, char **argv)
+{
+ char filename[256];
+ char **ifname_list;
+ int i = 1;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ if (ac < 2) {
+ printf("usage: %s [-S] Interface name list\n", argv[0]);
+ return 1;
+ }
+ if (!strcmp(argv[1], "-S")) {
+ flags = XDP_FLAGS_SKB_MODE;
+ total_ifindex = ac - 2;
+ ifname_list = (argv + 2);
+ } else {
+ flags = 0;
+ total_ifindex = ac - 1;
+ ifname_list = (argv + 1);
+ }
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+ printf("\n**************loading bpf file*********************\n\n\n");
+ if (!prog_fd[0]) {
+ printf("load_bpf_file: %s\n", strerror(errno));
+ return 1;
+ }
+ ifindex_list = (int *)malloc(total_ifindex * sizeof(int *));
+ for (i = 0; i < total_ifindex; i++) {
+ ifindex_list[i] = if_nametoindex(ifname_list[i]);
+ if (!ifindex_list[i]) {
+ printf("Couldn't translate interface name: %s",
+ strerror(errno));
+ return 1;
+ }
+ }
+ for (i = 0; i < total_ifindex; i++) {
+ if (bpf_set_link_xdp_fd(ifindex_list[i], prog_fd[0], flags) < 0) {
+ printf("link set xdp fd failed\n");
+ int recovery_index = i;
+
+ for (i = 0; i < recovery_index; i++)
+ bpf_set_link_xdp_fd(ifindex_list[i], -1, flags);
+
+ return 1;
+ }
+ printf("Attached to %d\n", ifindex_list[i]);
+ }
+ signal(SIGINT, int_exit);
+ signal(SIGTERM, int_exit);
+
+ printf("*******************ROUTE TABLE*************************\n\n\n");
+ get_route_table(AF_INET);
+ printf("*******************ARP TABLE***************************\n\n\n");
+ get_arp_table(AF_INET);
+ if (monitor_route() < 0) {
+ printf("Error in receiving route update");
+ return 1;
+ }
+
+ return 0;
+}
diff --git a/samples/bpf/xdp_rxq_info_kern.c b/samples/bpf/xdp_rxq_info_kern.c
new file mode 100644
index 000000000..222a83eed
--- /dev/null
+++ b/samples/bpf/xdp_rxq_info_kern.c
@@ -0,0 +1,139 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ *
+ * Example howto extract XDP RX-queue info
+ */
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/in.h>
+#include "bpf_helpers.h"
+
+/* Config setup from with userspace
+ *
+ * User-side setup ifindex in config_map, to verify that
+ * ctx->ingress_ifindex is correct (against configured ifindex)
+ */
+struct config {
+ __u32 action;
+ int ifindex;
+ __u32 options;
+};
+enum cfg_options_flags {
+ NO_TOUCH = 0x0U,
+ READ_MEM = 0x1U,
+ SWAP_MAC = 0x2U,
+};
+struct bpf_map_def SEC("maps") config_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(struct config),
+ .max_entries = 1,
+};
+
+/* Common stats data record (shared with userspace) */
+struct datarec {
+ __u64 processed;
+ __u64 issue;
+};
+
+struct bpf_map_def SEC("maps") stats_global_map = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct datarec),
+ .max_entries = 1,
+};
+
+#define MAX_RXQs 64
+
+/* Stats per rx_queue_index (per CPU) */
+struct bpf_map_def SEC("maps") rx_queue_index_map = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct datarec),
+ .max_entries = MAX_RXQs + 1,
+};
+
+static __always_inline
+void swap_src_dst_mac(void *data)
+{
+ unsigned short *p = data;
+ unsigned short dst[3];
+
+ dst[0] = p[0];
+ dst[1] = p[1];
+ dst[2] = p[2];
+ p[0] = p[3];
+ p[1] = p[4];
+ p[2] = p[5];
+ p[3] = dst[0];
+ p[4] = dst[1];
+ p[5] = dst[2];
+}
+
+SEC("xdp_prog0")
+int xdp_prognum0(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct datarec *rec, *rxq_rec;
+ int ingress_ifindex;
+ struct config *config;
+ u32 key = 0;
+
+ /* Global stats record */
+ rec = bpf_map_lookup_elem(&stats_global_map, &key);
+ if (!rec)
+ return XDP_ABORTED;
+ rec->processed++;
+
+ /* Accessing ctx->ingress_ifindex, cause BPF to rewrite BPF
+ * instructions inside kernel to access xdp_rxq->dev->ifindex
+ */
+ ingress_ifindex = ctx->ingress_ifindex;
+
+ config = bpf_map_lookup_elem(&config_map, &key);
+ if (!config)
+ return XDP_ABORTED;
+
+ /* Simple test: check ctx provided ifindex is as expected */
+ if (ingress_ifindex != config->ifindex) {
+ /* count this error case */
+ rec->issue++;
+ return XDP_ABORTED;
+ }
+
+ /* Update stats per rx_queue_index. Handle if rx_queue_index
+ * is larger than stats map can contain info for.
+ */
+ key = ctx->rx_queue_index;
+ if (key >= MAX_RXQs)
+ key = MAX_RXQs;
+ rxq_rec = bpf_map_lookup_elem(&rx_queue_index_map, &key);
+ if (!rxq_rec)
+ return XDP_ABORTED;
+ rxq_rec->processed++;
+ if (key == MAX_RXQs)
+ rxq_rec->issue++;
+
+ /* Default: Don't touch packet data, only count packets */
+ if (unlikely(config->options & (READ_MEM|SWAP_MAC))) {
+ struct ethhdr *eth = data;
+
+ if (eth + 1 > data_end)
+ return XDP_ABORTED;
+
+ /* Avoid compiler removing this: Drop non 802.3 Ethertypes */
+ if (ntohs(eth->h_proto) < ETH_P_802_3_MIN)
+ return XDP_ABORTED;
+
+ /* XDP_TX requires changing MAC-addrs, else HW may drop.
+ * Can also be enabled with --swapmac (for test purposes)
+ */
+ if (unlikely(config->options & SWAP_MAC))
+ swap_src_dst_mac(data);
+ }
+
+ return config->action;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_rxq_info_user.c b/samples/bpf/xdp_rxq_info_user.c
new file mode 100644
index 000000000..a55c81301
--- /dev/null
+++ b/samples/bpf/xdp_rxq_info_user.c
@@ -0,0 +1,581 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ */
+static const char *__doc__ = " XDP RX-queue info extract example\n\n"
+ "Monitor how many packets per sec (pps) are received\n"
+ "per NIC RX queue index and which CPU processed the packet\n"
+ ;
+
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <locale.h>
+#include <sys/resource.h>
+#include <getopt.h>
+#include <net/if.h>
+#include <time.h>
+
+#include <arpa/inet.h>
+#include <linux/if_link.h>
+
+#include "bpf/bpf.h"
+#include "bpf/libbpf.h"
+#include "bpf_util.h"
+
+static int ifindex = -1;
+static char ifname_buf[IF_NAMESIZE];
+static char *ifname;
+
+static __u32 xdp_flags;
+
+static struct bpf_map *stats_global_map;
+static struct bpf_map *rx_queue_index_map;
+
+/* Exit return codes */
+#define EXIT_OK 0
+#define EXIT_FAIL 1
+#define EXIT_FAIL_OPTION 2
+#define EXIT_FAIL_XDP 3
+#define EXIT_FAIL_BPF 4
+#define EXIT_FAIL_MEM 5
+
+static const struct option long_options[] = {
+ {"help", no_argument, NULL, 'h' },
+ {"dev", required_argument, NULL, 'd' },
+ {"skb-mode", no_argument, NULL, 'S' },
+ {"sec", required_argument, NULL, 's' },
+ {"no-separators", no_argument, NULL, 'z' },
+ {"action", required_argument, NULL, 'a' },
+ {"readmem", no_argument, NULL, 'r' },
+ {"swapmac", no_argument, NULL, 'm' },
+ {0, 0, NULL, 0 }
+};
+
+static void int_exit(int sig)
+{
+ fprintf(stderr,
+ "Interrupted: Removing XDP program on ifindex:%d device:%s\n",
+ ifindex, ifname);
+ if (ifindex > -1)
+ bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
+ exit(EXIT_OK);
+}
+
+struct config {
+ __u32 action;
+ int ifindex;
+ __u32 options;
+};
+enum cfg_options_flags {
+ NO_TOUCH = 0x0U,
+ READ_MEM = 0x1U,
+ SWAP_MAC = 0x2U,
+};
+#define XDP_ACTION_MAX (XDP_TX + 1)
+#define XDP_ACTION_MAX_STRLEN 11
+static const char *xdp_action_names[XDP_ACTION_MAX] = {
+ [XDP_ABORTED] = "XDP_ABORTED",
+ [XDP_DROP] = "XDP_DROP",
+ [XDP_PASS] = "XDP_PASS",
+ [XDP_TX] = "XDP_TX",
+};
+
+static const char *action2str(int action)
+{
+ if (action < XDP_ACTION_MAX)
+ return xdp_action_names[action];
+ return NULL;
+}
+
+static int parse_xdp_action(char *action_str)
+{
+ size_t maxlen;
+ __u64 action = -1;
+ int i;
+
+ for (i = 0; i < XDP_ACTION_MAX; i++) {
+ maxlen = XDP_ACTION_MAX_STRLEN;
+ if (strncmp(xdp_action_names[i], action_str, maxlen) == 0) {
+ action = i;
+ break;
+ }
+ }
+ return action;
+}
+
+static void list_xdp_actions(void)
+{
+ int i;
+
+ printf("Available XDP --action <options>\n");
+ for (i = 0; i < XDP_ACTION_MAX; i++)
+ printf("\t%s\n", xdp_action_names[i]);
+ printf("\n");
+}
+
+static char* options2str(enum cfg_options_flags flag)
+{
+ if (flag == NO_TOUCH)
+ return "no_touch";
+ if (flag & SWAP_MAC)
+ return "swapmac";
+ if (flag & READ_MEM)
+ return "read";
+ fprintf(stderr, "ERR: Unknown config option flags");
+ exit(EXIT_FAIL);
+}
+
+static void usage(char *argv[])
+{
+ int i;
+
+ printf("\nDOCUMENTATION:\n%s\n", __doc__);
+ printf(" Usage: %s (options-see-below)\n", argv[0]);
+ printf(" Listing options:\n");
+ for (i = 0; long_options[i].name != 0; i++) {
+ printf(" --%-12s", long_options[i].name);
+ if (long_options[i].flag != NULL)
+ printf(" flag (internal value:%d)",
+ *long_options[i].flag);
+ else
+ printf(" short-option: -%c",
+ long_options[i].val);
+ printf("\n");
+ }
+ printf("\n");
+ list_xdp_actions();
+}
+
+#define NANOSEC_PER_SEC 1000000000 /* 10^9 */
+static __u64 gettime(void)
+{
+ struct timespec t;
+ int res;
+
+ res = clock_gettime(CLOCK_MONOTONIC, &t);
+ if (res < 0) {
+ fprintf(stderr, "Error with gettimeofday! (%i)\n", res);
+ exit(EXIT_FAIL);
+ }
+ return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec;
+}
+
+/* Common stats data record shared with _kern.c */
+struct datarec {
+ __u64 processed;
+ __u64 issue;
+};
+struct record {
+ __u64 timestamp;
+ struct datarec total;
+ struct datarec *cpu;
+};
+struct stats_record {
+ struct record stats;
+ struct record *rxq;
+};
+
+static struct datarec *alloc_record_per_cpu(void)
+{
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ struct datarec *array;
+ size_t size;
+
+ size = sizeof(struct datarec) * nr_cpus;
+ array = malloc(size);
+ memset(array, 0, size);
+ if (!array) {
+ fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus);
+ exit(EXIT_FAIL_MEM);
+ }
+ return array;
+}
+
+static struct record *alloc_record_per_rxq(void)
+{
+ unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries;
+ struct record *array;
+ size_t size;
+
+ size = sizeof(struct record) * nr_rxqs;
+ array = malloc(size);
+ memset(array, 0, size);
+ if (!array) {
+ fprintf(stderr, "Mem alloc error (nr_rxqs:%u)\n", nr_rxqs);
+ exit(EXIT_FAIL_MEM);
+ }
+ return array;
+}
+
+static struct stats_record *alloc_stats_record(void)
+{
+ unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries;
+ struct stats_record *rec;
+ int i;
+
+ rec = malloc(sizeof(*rec));
+ memset(rec, 0, sizeof(*rec));
+ if (!rec) {
+ fprintf(stderr, "Mem alloc error\n");
+ exit(EXIT_FAIL_MEM);
+ }
+ rec->rxq = alloc_record_per_rxq();
+ for (i = 0; i < nr_rxqs; i++)
+ rec->rxq[i].cpu = alloc_record_per_cpu();
+
+ rec->stats.cpu = alloc_record_per_cpu();
+ return rec;
+}
+
+static void free_stats_record(struct stats_record *r)
+{
+ unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries;
+ int i;
+
+ for (i = 0; i < nr_rxqs; i++)
+ free(r->rxq[i].cpu);
+
+ free(r->rxq);
+ free(r->stats.cpu);
+ free(r);
+}
+
+static bool map_collect_percpu(int fd, __u32 key, struct record *rec)
+{
+ /* For percpu maps, userspace gets a value per possible CPU */
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ struct datarec values[nr_cpus];
+ __u64 sum_processed = 0;
+ __u64 sum_issue = 0;
+ int i;
+
+ if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
+ fprintf(stderr,
+ "ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
+ return false;
+ }
+ /* Get time as close as possible to reading map contents */
+ rec->timestamp = gettime();
+
+ /* Record and sum values from each CPU */
+ for (i = 0; i < nr_cpus; i++) {
+ rec->cpu[i].processed = values[i].processed;
+ sum_processed += values[i].processed;
+ rec->cpu[i].issue = values[i].issue;
+ sum_issue += values[i].issue;
+ }
+ rec->total.processed = sum_processed;
+ rec->total.issue = sum_issue;
+ return true;
+}
+
+static void stats_collect(struct stats_record *rec)
+{
+ int fd, i, max_rxqs;
+
+ fd = bpf_map__fd(stats_global_map);
+ map_collect_percpu(fd, 0, &rec->stats);
+
+ fd = bpf_map__fd(rx_queue_index_map);
+ max_rxqs = bpf_map__def(rx_queue_index_map)->max_entries;
+ for (i = 0; i < max_rxqs; i++)
+ map_collect_percpu(fd, i, &rec->rxq[i]);
+}
+
+static double calc_period(struct record *r, struct record *p)
+{
+ double period_ = 0;
+ __u64 period = 0;
+
+ period = r->timestamp - p->timestamp;
+ if (period > 0)
+ period_ = ((double) period / NANOSEC_PER_SEC);
+
+ return period_;
+}
+
+static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_)
+{
+ __u64 packets = 0;
+ __u64 pps = 0;
+
+ if (period_ > 0) {
+ packets = r->processed - p->processed;
+ pps = packets / period_;
+ }
+ return pps;
+}
+
+static __u64 calc_errs_pps(struct datarec *r,
+ struct datarec *p, double period_)
+{
+ __u64 packets = 0;
+ __u64 pps = 0;
+
+ if (period_ > 0) {
+ packets = r->issue - p->issue;
+ pps = packets / period_;
+ }
+ return pps;
+}
+
+static void stats_print(struct stats_record *stats_rec,
+ struct stats_record *stats_prev,
+ int action, __u32 cfg_opt)
+{
+ unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries;
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ double pps = 0, err = 0;
+ struct record *rec, *prev;
+ double t;
+ int rxq;
+ int i;
+
+ /* Header */
+ printf("\nRunning XDP on dev:%s (ifindex:%d) action:%s options:%s\n",
+ ifname, ifindex, action2str(action), options2str(cfg_opt));
+
+ /* stats_global_map */
+ {
+ char *fmt_rx = "%-15s %-7d %'-11.0f %'-10.0f %s\n";
+ char *fm2_rx = "%-15s %-7s %'-11.0f\n";
+ char *errstr = "";
+
+ printf("%-15s %-7s %-11s %-11s\n",
+ "XDP stats", "CPU", "pps", "issue-pps");
+
+ rec = &stats_rec->stats;
+ prev = &stats_prev->stats;
+ t = calc_period(rec, prev);
+ for (i = 0; i < nr_cpus; i++) {
+ struct datarec *r = &rec->cpu[i];
+ struct datarec *p = &prev->cpu[i];
+
+ pps = calc_pps (r, p, t);
+ err = calc_errs_pps(r, p, t);
+ if (err > 0)
+ errstr = "invalid-ifindex";
+ if (pps > 0)
+ printf(fmt_rx, "XDP-RX CPU",
+ i, pps, err, errstr);
+ }
+ pps = calc_pps (&rec->total, &prev->total, t);
+ err = calc_errs_pps(&rec->total, &prev->total, t);
+ printf(fm2_rx, "XDP-RX CPU", "total", pps, err);
+ }
+
+ /* rx_queue_index_map */
+ printf("\n%-15s %-7s %-11s %-11s\n",
+ "RXQ stats", "RXQ:CPU", "pps", "issue-pps");
+
+ for (rxq = 0; rxq < nr_rxqs; rxq++) {
+ char *fmt_rx = "%-15s %3d:%-3d %'-11.0f %'-10.0f %s\n";
+ char *fm2_rx = "%-15s %3d:%-3s %'-11.0f\n";
+ char *errstr = "";
+ int rxq_ = rxq;
+
+ /* Last RXQ in map catch overflows */
+ if (rxq_ == nr_rxqs - 1)
+ rxq_ = -1;
+
+ rec = &stats_rec->rxq[rxq];
+ prev = &stats_prev->rxq[rxq];
+ t = calc_period(rec, prev);
+ for (i = 0; i < nr_cpus; i++) {
+ struct datarec *r = &rec->cpu[i];
+ struct datarec *p = &prev->cpu[i];
+
+ pps = calc_pps (r, p, t);
+ err = calc_errs_pps(r, p, t);
+ if (err > 0) {
+ if (rxq_ == -1)
+ errstr = "map-overflow-RXQ";
+ else
+ errstr = "err";
+ }
+ if (pps > 0)
+ printf(fmt_rx, "rx_queue_index",
+ rxq_, i, pps, err, errstr);
+ }
+ pps = calc_pps (&rec->total, &prev->total, t);
+ err = calc_errs_pps(&rec->total, &prev->total, t);
+ if (pps || err)
+ printf(fm2_rx, "rx_queue_index", rxq_, "sum", pps, err);
+ }
+}
+
+
+/* Pointer swap trick */
+static inline void swap(struct stats_record **a, struct stats_record **b)
+{
+ struct stats_record *tmp;
+
+ tmp = *a;
+ *a = *b;
+ *b = tmp;
+}
+
+static void stats_poll(int interval, int action, __u32 cfg_opt)
+{
+ struct stats_record *record, *prev;
+
+ record = alloc_stats_record();
+ prev = alloc_stats_record();
+ stats_collect(record);
+
+ while (1) {
+ swap(&prev, &record);
+ stats_collect(record);
+ stats_print(record, prev, action, cfg_opt);
+ sleep(interval);
+ }
+
+ free_stats_record(record);
+ free_stats_record(prev);
+}
+
+
+int main(int argc, char **argv)
+{
+ __u32 cfg_options= NO_TOUCH ; /* Default: Don't touch packet memory */
+ struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
+ struct bpf_prog_load_attr prog_load_attr = {
+ .prog_type = BPF_PROG_TYPE_XDP,
+ };
+ int prog_fd, map_fd, opt, err;
+ bool use_separators = true;
+ struct config cfg = { 0 };
+ struct bpf_object *obj;
+ struct bpf_map *map;
+ char filename[256];
+ int longindex = 0;
+ int interval = 2;
+ __u32 key = 0;
+
+
+ char action_str_buf[XDP_ACTION_MAX_STRLEN + 1 /* for \0 */] = { 0 };
+ int action = XDP_PASS; /* Default action */
+ char *action_str = NULL;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ prog_load_attr.file = filename;
+
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK)");
+ return 1;
+ }
+
+ if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
+ return EXIT_FAIL;
+
+ map = bpf_object__find_map_by_name(obj, "config_map");
+ stats_global_map = bpf_object__find_map_by_name(obj, "stats_global_map");
+ rx_queue_index_map = bpf_object__find_map_by_name(obj, "rx_queue_index_map");
+ if (!map || !stats_global_map || !rx_queue_index_map) {
+ printf("finding a map in obj file failed\n");
+ return EXIT_FAIL;
+ }
+ map_fd = bpf_map__fd(map);
+
+ if (!prog_fd) {
+ fprintf(stderr, "ERR: load_bpf_file: %s\n", strerror(errno));
+ return EXIT_FAIL;
+ }
+
+ /* Parse commands line args */
+ while ((opt = getopt_long(argc, argv, "hSd:",
+ long_options, &longindex)) != -1) {
+ switch (opt) {
+ case 'd':
+ if (strlen(optarg) >= IF_NAMESIZE) {
+ fprintf(stderr, "ERR: --dev name too long\n");
+ goto error;
+ }
+ ifname = (char *)&ifname_buf;
+ strncpy(ifname, optarg, IF_NAMESIZE);
+ ifindex = if_nametoindex(ifname);
+ if (ifindex == 0) {
+ fprintf(stderr,
+ "ERR: --dev name unknown err(%d):%s\n",
+ errno, strerror(errno));
+ goto error;
+ }
+ break;
+ case 's':
+ interval = atoi(optarg);
+ break;
+ case 'S':
+ xdp_flags |= XDP_FLAGS_SKB_MODE;
+ break;
+ case 'z':
+ use_separators = false;
+ break;
+ case 'a':
+ action_str = (char *)&action_str_buf;
+ strncpy(action_str, optarg, XDP_ACTION_MAX_STRLEN);
+ break;
+ case 'r':
+ cfg_options |= READ_MEM;
+ break;
+ case 'm':
+ cfg_options |= SWAP_MAC;
+ break;
+ case 'h':
+ error:
+ default:
+ usage(argv);
+ return EXIT_FAIL_OPTION;
+ }
+ }
+ /* Required option */
+ if (ifindex == -1) {
+ fprintf(stderr, "ERR: required option --dev missing\n");
+ usage(argv);
+ return EXIT_FAIL_OPTION;
+ }
+ cfg.ifindex = ifindex;
+
+ /* Parse action string */
+ if (action_str) {
+ action = parse_xdp_action(action_str);
+ if (action < 0) {
+ fprintf(stderr, "ERR: Invalid XDP --action: %s\n",
+ action_str);
+ list_xdp_actions();
+ return EXIT_FAIL_OPTION;
+ }
+ }
+ cfg.action = action;
+
+ /* XDP_TX requires changing MAC-addrs, else HW may drop */
+ if (action == XDP_TX)
+ cfg_options |= SWAP_MAC;
+ cfg.options = cfg_options;
+
+ /* Trick to pretty printf with thousands separators use %' */
+ if (use_separators)
+ setlocale(LC_NUMERIC, "en_US");
+
+ /* User-side setup ifindex in config_map */
+ err = bpf_map_update_elem(map_fd, &key, &cfg, 0);
+ if (err) {
+ fprintf(stderr, "Store config failed (err:%d)\n", err);
+ exit(EXIT_FAIL_BPF);
+ }
+
+ /* Remove XDP program when program is interrupted or killed */
+ signal(SIGINT, int_exit);
+ signal(SIGTERM, int_exit);
+
+ if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) {
+ fprintf(stderr, "link set xdp fd failed\n");
+ return EXIT_FAIL_XDP;
+ }
+
+ stats_poll(interval, action, cfg_options);
+ return EXIT_OK;
+}
diff --git a/samples/bpf/xdp_sample_pkts_kern.c b/samples/bpf/xdp_sample_pkts_kern.c
new file mode 100644
index 000000000..f7ca8b850
--- /dev/null
+++ b/samples/bpf/xdp_sample_pkts_kern.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ptrace.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+#define SAMPLE_SIZE 64ul
+#define MAX_CPUS 128
+
+#define bpf_printk(fmt, ...) \
+({ \
+ char ____fmt[] = fmt; \
+ bpf_trace_printk(____fmt, sizeof(____fmt), \
+ ##__VA_ARGS__); \
+})
+
+struct bpf_map_def SEC("maps") my_map = {
+ .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(u32),
+ .max_entries = MAX_CPUS,
+};
+
+SEC("xdp_sample")
+int xdp_sample_prog(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+
+ /* Metadata will be in the perf event before the packet data. */
+ struct S {
+ u16 cookie;
+ u16 pkt_len;
+ } __packed metadata;
+
+ if (data < data_end) {
+ /* The XDP perf_event_output handler will use the upper 32 bits
+ * of the flags argument as a number of bytes to include of the
+ * packet payload in the event data. If the size is too big, the
+ * call to bpf_perf_event_output will fail and return -EFAULT.
+ *
+ * See bpf_xdp_event_output in net/core/filter.c.
+ *
+ * The BPF_F_CURRENT_CPU flag means that the event output fd
+ * will be indexed by the CPU number in the event map.
+ */
+ u64 flags = BPF_F_CURRENT_CPU;
+ u16 sample_size;
+ int ret;
+
+ metadata.cookie = 0xdead;
+ metadata.pkt_len = (u16)(data_end - data);
+ sample_size = min(metadata.pkt_len, SAMPLE_SIZE);
+ flags |= (u64)sample_size << 32;
+
+ ret = bpf_perf_event_output(ctx, &my_map, flags,
+ &metadata, sizeof(metadata));
+ if (ret)
+ bpf_printk("perf_event_output failed: %d\n", ret);
+ }
+
+ return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/xdp_sample_pkts_user.c b/samples/bpf/xdp_sample_pkts_user.c
new file mode 100644
index 000000000..8dd87c1eb
--- /dev/null
+++ b/samples/bpf/xdp_sample_pkts_user.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/perf_event.h>
+#include <linux/bpf.h>
+#include <net/if.h>
+#include <errno.h>
+#include <assert.h>
+#include <sys/sysinfo.h>
+#include <sys/ioctl.h>
+#include <signal.h>
+#include <libbpf.h>
+#include <bpf/bpf.h>
+
+#include "perf-sys.h"
+#include "trace_helpers.h"
+
+#define MAX_CPUS 128
+static int pmu_fds[MAX_CPUS], if_idx;
+static struct perf_event_mmap_page *headers[MAX_CPUS];
+static char *if_name;
+
+static int do_attach(int idx, int fd, const char *name)
+{
+ int err;
+
+ err = bpf_set_link_xdp_fd(idx, fd, 0);
+ if (err < 0)
+ printf("ERROR: failed to attach program to %s\n", name);
+
+ return err;
+}
+
+static int do_detach(int idx, const char *name)
+{
+ int err;
+
+ err = bpf_set_link_xdp_fd(idx, -1, 0);
+ if (err < 0)
+ printf("ERROR: failed to detach program from %s\n", name);
+
+ return err;
+}
+
+#define SAMPLE_SIZE 64
+
+static int print_bpf_output(void *data, int size)
+{
+ struct {
+ __u16 cookie;
+ __u16 pkt_len;
+ __u8 pkt_data[SAMPLE_SIZE];
+ } __packed *e = data;
+ int i;
+
+ if (e->cookie != 0xdead) {
+ printf("BUG cookie %x sized %d\n",
+ e->cookie, size);
+ return LIBBPF_PERF_EVENT_ERROR;
+ }
+
+ printf("Pkt len: %-5d bytes. Ethernet hdr: ", e->pkt_len);
+ for (i = 0; i < 14 && i < e->pkt_len; i++)
+ printf("%02x ", e->pkt_data[i]);
+ printf("\n");
+
+ return LIBBPF_PERF_EVENT_CONT;
+}
+
+static void test_bpf_perf_event(int map_fd, int num)
+{
+ struct perf_event_attr attr = {
+ .sample_type = PERF_SAMPLE_RAW,
+ .type = PERF_TYPE_SOFTWARE,
+ .config = PERF_COUNT_SW_BPF_OUTPUT,
+ .wakeup_events = 1, /* get an fd notification for every event */
+ };
+ int i;
+
+ for (i = 0; i < num; i++) {
+ int key = i;
+
+ pmu_fds[i] = sys_perf_event_open(&attr, -1/*pid*/, i/*cpu*/,
+ -1/*group_fd*/, 0);
+
+ assert(pmu_fds[i] >= 0);
+ assert(bpf_map_update_elem(map_fd, &key,
+ &pmu_fds[i], BPF_ANY) == 0);
+ ioctl(pmu_fds[i], PERF_EVENT_IOC_ENABLE, 0);
+ }
+}
+
+static void sig_handler(int signo)
+{
+ do_detach(if_idx, if_name);
+ exit(0);
+}
+
+int main(int argc, char **argv)
+{
+ struct bpf_prog_load_attr prog_load_attr = {
+ .prog_type = BPF_PROG_TYPE_XDP,
+ };
+ struct bpf_object *obj;
+ struct bpf_map *map;
+ int prog_fd, map_fd;
+ char filename[256];
+ int ret, err, i;
+ int numcpus;
+
+ if (argc < 2) {
+ printf("Usage: %s <ifname>\n", argv[0]);
+ return 1;
+ }
+
+ numcpus = get_nprocs();
+ if (numcpus > MAX_CPUS)
+ numcpus = MAX_CPUS;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ prog_load_attr.file = filename;
+
+ if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
+ return 1;
+
+ if (!prog_fd) {
+ printf("load_bpf_file: %s\n", strerror(errno));
+ return 1;
+ }
+
+ map = bpf_map__next(NULL, obj);
+ if (!map) {
+ printf("finding a map in obj file failed\n");
+ return 1;
+ }
+ map_fd = bpf_map__fd(map);
+
+ if_idx = if_nametoindex(argv[1]);
+ if (!if_idx)
+ if_idx = strtoul(argv[1], NULL, 0);
+
+ if (!if_idx) {
+ fprintf(stderr, "Invalid ifname\n");
+ return 1;
+ }
+ if_name = argv[1];
+ err = do_attach(if_idx, prog_fd, argv[1]);
+ if (err)
+ return err;
+
+ if (signal(SIGINT, sig_handler) ||
+ signal(SIGHUP, sig_handler) ||
+ signal(SIGTERM, sig_handler)) {
+ perror("signal");
+ return 1;
+ }
+
+ test_bpf_perf_event(map_fd, numcpus);
+
+ for (i = 0; i < numcpus; i++)
+ if (perf_event_mmap_header(pmu_fds[i], &headers[i]) < 0)
+ return 1;
+
+ ret = perf_event_poller_multi(pmu_fds, headers, numcpus,
+ print_bpf_output);
+ kill(0, SIGINT);
+ return ret;
+}
diff --git a/samples/bpf/xdp_tx_iptunnel_common.h b/samples/bpf/xdp_tx_iptunnel_common.h
new file mode 100644
index 000000000..dd12cc351
--- /dev/null
+++ b/samples/bpf/xdp_tx_iptunnel_common.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#ifndef _SAMPLES_BPF_XDP_TX_IPTNL_COMMON_H
+#define _SAMPLES_BPF_XDP_TX_IPTNL_COMMON_H
+
+#include <linux/types.h>
+
+#define MAX_IPTNL_ENTRIES 256U
+
+struct vip {
+ union {
+ __u32 v6[4];
+ __u32 v4;
+ } daddr;
+ __u16 dport;
+ __u16 family;
+ __u8 protocol;
+};
+
+struct iptnl_info {
+ union {
+ __u32 v6[4];
+ __u32 v4;
+ } saddr;
+ union {
+ __u32 v6[4];
+ __u32 v4;
+ } daddr;
+ __u16 family;
+ __u8 dmac[6];
+};
+
+#endif
diff --git a/samples/bpf/xdp_tx_iptunnel_kern.c b/samples/bpf/xdp_tx_iptunnel_kern.c
new file mode 100644
index 000000000..0f4f6e8c8
--- /dev/null
+++ b/samples/bpf/xdp_tx_iptunnel_kern.c
@@ -0,0 +1,237 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program shows how to use bpf_xdp_adjust_head() by
+ * encapsulating the incoming packet in an IPv4/v6 header
+ * and then XDP_TX it out.
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include "bpf_helpers.h"
+#include "xdp_tx_iptunnel_common.h"
+
+struct bpf_map_def SEC("maps") rxcnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(__u32),
+ .value_size = sizeof(__u64),
+ .max_entries = 256,
+};
+
+struct bpf_map_def SEC("maps") vip2tnl = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(struct vip),
+ .value_size = sizeof(struct iptnl_info),
+ .max_entries = MAX_IPTNL_ENTRIES,
+};
+
+static __always_inline void count_tx(u32 protocol)
+{
+ u64 *rxcnt_count;
+
+ rxcnt_count = bpf_map_lookup_elem(&rxcnt, &protocol);
+ if (rxcnt_count)
+ *rxcnt_count += 1;
+}
+
+static __always_inline int get_dport(void *trans_data, void *data_end,
+ u8 protocol)
+{
+ struct tcphdr *th;
+ struct udphdr *uh;
+
+ switch (protocol) {
+ case IPPROTO_TCP:
+ th = (struct tcphdr *)trans_data;
+ if (th + 1 > data_end)
+ return -1;
+ return th->dest;
+ case IPPROTO_UDP:
+ uh = (struct udphdr *)trans_data;
+ if (uh + 1 > data_end)
+ return -1;
+ return uh->dest;
+ default:
+ return 0;
+ }
+}
+
+static __always_inline void set_ethhdr(struct ethhdr *new_eth,
+ const struct ethhdr *old_eth,
+ const struct iptnl_info *tnl,
+ __be16 h_proto)
+{
+ memcpy(new_eth->h_source, old_eth->h_dest, sizeof(new_eth->h_source));
+ memcpy(new_eth->h_dest, tnl->dmac, sizeof(new_eth->h_dest));
+ new_eth->h_proto = h_proto;
+}
+
+static __always_inline int handle_ipv4(struct xdp_md *xdp)
+{
+ void *data_end = (void *)(long)xdp->data_end;
+ void *data = (void *)(long)xdp->data;
+ struct iptnl_info *tnl;
+ struct ethhdr *new_eth;
+ struct ethhdr *old_eth;
+ struct iphdr *iph = data + sizeof(struct ethhdr);
+ u16 *next_iph_u16;
+ u16 payload_len;
+ struct vip vip = {};
+ int dport;
+ u32 csum = 0;
+ int i;
+
+ if (iph + 1 > data_end)
+ return XDP_DROP;
+
+ dport = get_dport(iph + 1, data_end, iph->protocol);
+ if (dport == -1)
+ return XDP_DROP;
+
+ vip.protocol = iph->protocol;
+ vip.family = AF_INET;
+ vip.daddr.v4 = iph->daddr;
+ vip.dport = dport;
+ payload_len = ntohs(iph->tot_len);
+
+ tnl = bpf_map_lookup_elem(&vip2tnl, &vip);
+ /* It only does v4-in-v4 */
+ if (!tnl || tnl->family != AF_INET)
+ return XDP_PASS;
+
+ /* The vip key is found. Add an IP header and send it out */
+
+ if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct iphdr)))
+ return XDP_DROP;
+
+ data = (void *)(long)xdp->data;
+ data_end = (void *)(long)xdp->data_end;
+
+ new_eth = data;
+ iph = data + sizeof(*new_eth);
+ old_eth = data + sizeof(*iph);
+
+ if (new_eth + 1 > data_end ||
+ old_eth + 1 > data_end ||
+ iph + 1 > data_end)
+ return XDP_DROP;
+
+ set_ethhdr(new_eth, old_eth, tnl, htons(ETH_P_IP));
+
+ iph->version = 4;
+ iph->ihl = sizeof(*iph) >> 2;
+ iph->frag_off = 0;
+ iph->protocol = IPPROTO_IPIP;
+ iph->check = 0;
+ iph->tos = 0;
+ iph->tot_len = htons(payload_len + sizeof(*iph));
+ iph->daddr = tnl->daddr.v4;
+ iph->saddr = tnl->saddr.v4;
+ iph->ttl = 8;
+
+ next_iph_u16 = (u16 *)iph;
+#pragma clang loop unroll(full)
+ for (i = 0; i < sizeof(*iph) >> 1; i++)
+ csum += *next_iph_u16++;
+
+ iph->check = ~((csum & 0xffff) + (csum >> 16));
+
+ count_tx(vip.protocol);
+
+ return XDP_TX;
+}
+
+static __always_inline int handle_ipv6(struct xdp_md *xdp)
+{
+ void *data_end = (void *)(long)xdp->data_end;
+ void *data = (void *)(long)xdp->data;
+ struct iptnl_info *tnl;
+ struct ethhdr *new_eth;
+ struct ethhdr *old_eth;
+ struct ipv6hdr *ip6h = data + sizeof(struct ethhdr);
+ __u16 payload_len;
+ struct vip vip = {};
+ int dport;
+
+ if (ip6h + 1 > data_end)
+ return XDP_DROP;
+
+ dport = get_dport(ip6h + 1, data_end, ip6h->nexthdr);
+ if (dport == -1)
+ return XDP_DROP;
+
+ vip.protocol = ip6h->nexthdr;
+ vip.family = AF_INET6;
+ memcpy(vip.daddr.v6, ip6h->daddr.s6_addr32, sizeof(vip.daddr));
+ vip.dport = dport;
+ payload_len = ip6h->payload_len;
+
+ tnl = bpf_map_lookup_elem(&vip2tnl, &vip);
+ /* It only does v6-in-v6 */
+ if (!tnl || tnl->family != AF_INET6)
+ return XDP_PASS;
+
+ /* The vip key is found. Add an IP header and send it out */
+
+ if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct ipv6hdr)))
+ return XDP_DROP;
+
+ data = (void *)(long)xdp->data;
+ data_end = (void *)(long)xdp->data_end;
+
+ new_eth = data;
+ ip6h = data + sizeof(*new_eth);
+ old_eth = data + sizeof(*ip6h);
+
+ if (new_eth + 1 > data_end ||
+ old_eth + 1 > data_end ||
+ ip6h + 1 > data_end)
+ return XDP_DROP;
+
+ set_ethhdr(new_eth, old_eth, tnl, htons(ETH_P_IPV6));
+
+ ip6h->version = 6;
+ ip6h->priority = 0;
+ memset(ip6h->flow_lbl, 0, sizeof(ip6h->flow_lbl));
+ ip6h->payload_len = htons(ntohs(payload_len) + sizeof(*ip6h));
+ ip6h->nexthdr = IPPROTO_IPV6;
+ ip6h->hop_limit = 8;
+ memcpy(ip6h->saddr.s6_addr32, tnl->saddr.v6, sizeof(tnl->saddr.v6));
+ memcpy(ip6h->daddr.s6_addr32, tnl->daddr.v6, sizeof(tnl->daddr.v6));
+
+ count_tx(vip.protocol);
+
+ return XDP_TX;
+}
+
+SEC("xdp_tx_iptunnel")
+int _xdp_tx_iptunnel(struct xdp_md *xdp)
+{
+ void *data_end = (void *)(long)xdp->data_end;
+ void *data = (void *)(long)xdp->data;
+ struct ethhdr *eth = data;
+ __u16 h_proto;
+
+ if (eth + 1 > data_end)
+ return XDP_DROP;
+
+ h_proto = eth->h_proto;
+
+ if (h_proto == htons(ETH_P_IP))
+ return handle_ipv4(xdp);
+ else if (h_proto == htons(ETH_P_IPV6))
+
+ return handle_ipv6(xdp);
+ else
+ return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_tx_iptunnel_user.c b/samples/bpf/xdp_tx_iptunnel_user.c
new file mode 100644
index 000000000..a4ccc33ad
--- /dev/null
+++ b/samples/bpf/xdp_tx_iptunnel_user.c
@@ -0,0 +1,267 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <arpa/inet.h>
+#include <netinet/ether.h>
+#include <unistd.h>
+#include <time.h>
+#include "bpf_load.h"
+#include <bpf/bpf.h>
+#include "bpf_util.h"
+#include "xdp_tx_iptunnel_common.h"
+
+#define STATS_INTERVAL_S 2U
+
+static int ifindex = -1;
+static __u32 xdp_flags = 0;
+
+static void int_exit(int sig)
+{
+ if (ifindex > -1)
+ bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
+ exit(0);
+}
+
+/* simple per-protocol drop counter
+ */
+static void poll_stats(unsigned int kill_after_s)
+{
+ const unsigned int nr_protos = 256;
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ time_t started_at = time(NULL);
+ __u64 values[nr_cpus], prev[nr_protos][nr_cpus];
+ __u32 proto;
+ int i;
+
+ memset(prev, 0, sizeof(prev));
+
+ while (!kill_after_s || time(NULL) - started_at <= kill_after_s) {
+ sleep(STATS_INTERVAL_S);
+
+ for (proto = 0; proto < nr_protos; proto++) {
+ __u64 sum = 0;
+
+ assert(bpf_map_lookup_elem(map_fd[0], &proto, values) == 0);
+ for (i = 0; i < nr_cpus; i++)
+ sum += (values[i] - prev[proto][i]);
+
+ if (sum)
+ printf("proto %u: sum:%10llu pkts, rate:%10llu pkts/s\n",
+ proto, sum, sum / STATS_INTERVAL_S);
+ memcpy(prev[proto], values, sizeof(values));
+ }
+ }
+}
+
+static void usage(const char *cmd)
+{
+ printf("Start a XDP prog which encapsulates incoming packets\n"
+ "in an IPv4/v6 header and XDP_TX it out. The dst <VIP:PORT>\n"
+ "is used to select packets to encapsulate\n\n");
+ printf("Usage: %s [...]\n", cmd);
+ printf(" -i <ifindex> Interface Index\n");
+ printf(" -a <vip-service-address> IPv4 or IPv6\n");
+ printf(" -p <vip-service-port> A port range (e.g. 433-444) is also allowed\n");
+ printf(" -s <source-ip> Used in the IPTunnel header\n");
+ printf(" -d <dest-ip> Used in the IPTunnel header\n");
+ printf(" -m <dest-MAC> Used in sending the IP Tunneled pkt\n");
+ printf(" -T <stop-after-X-seconds> Default: 0 (forever)\n");
+ printf(" -P <IP-Protocol> Default is TCP\n");
+ printf(" -S use skb-mode\n");
+ printf(" -N enforce native mode\n");
+ printf(" -h Display this help\n");
+}
+
+static int parse_ipstr(const char *ipstr, unsigned int *addr)
+{
+ if (inet_pton(AF_INET6, ipstr, addr) == 1) {
+ return AF_INET6;
+ } else if (inet_pton(AF_INET, ipstr, addr) == 1) {
+ addr[1] = addr[2] = addr[3] = 0;
+ return AF_INET;
+ }
+
+ fprintf(stderr, "%s is an invalid IP\n", ipstr);
+ return AF_UNSPEC;
+}
+
+static int parse_ports(const char *port_str, int *min_port, int *max_port)
+{
+ char *end;
+ long tmp_min_port;
+ long tmp_max_port;
+
+ tmp_min_port = strtol(optarg, &end, 10);
+ if (tmp_min_port < 1 || tmp_min_port > 65535) {
+ fprintf(stderr, "Invalid port(s):%s\n", optarg);
+ return 1;
+ }
+
+ if (*end == '-') {
+ end++;
+ tmp_max_port = strtol(end, NULL, 10);
+ if (tmp_max_port < 1 || tmp_max_port > 65535) {
+ fprintf(stderr, "Invalid port(s):%s\n", optarg);
+ return 1;
+ }
+ } else {
+ tmp_max_port = tmp_min_port;
+ }
+
+ if (tmp_min_port > tmp_max_port) {
+ fprintf(stderr, "Invalid port(s):%s\n", optarg);
+ return 1;
+ }
+
+ if (tmp_max_port - tmp_min_port + 1 > MAX_IPTNL_ENTRIES) {
+ fprintf(stderr, "Port range (%s) is larger than %u\n",
+ port_str, MAX_IPTNL_ENTRIES);
+ return 1;
+ }
+ *min_port = tmp_min_port;
+ *max_port = tmp_max_port;
+
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ unsigned char opt_flags[256] = {};
+ unsigned int kill_after_s = 0;
+ const char *optstr = "i:a:p:s:d:m:T:P:SNh";
+ int min_port = 0, max_port = 0;
+ struct iptnl_info tnl = {};
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ struct vip vip = {};
+ char filename[256];
+ int opt;
+ int i;
+
+ tnl.family = AF_UNSPEC;
+ vip.protocol = IPPROTO_TCP;
+
+ for (i = 0; i < strlen(optstr); i++)
+ if (optstr[i] != 'h' && 'a' <= optstr[i] && optstr[i] <= 'z')
+ opt_flags[(unsigned char)optstr[i]] = 1;
+
+ while ((opt = getopt(argc, argv, optstr)) != -1) {
+ unsigned short family;
+ unsigned int *v6;
+
+ switch (opt) {
+ case 'i':
+ ifindex = atoi(optarg);
+ break;
+ case 'a':
+ vip.family = parse_ipstr(optarg, vip.daddr.v6);
+ if (vip.family == AF_UNSPEC)
+ return 1;
+ break;
+ case 'p':
+ if (parse_ports(optarg, &min_port, &max_port))
+ return 1;
+ break;
+ case 'P':
+ vip.protocol = atoi(optarg);
+ break;
+ case 's':
+ case 'd':
+ if (opt == 's')
+ v6 = tnl.saddr.v6;
+ else
+ v6 = tnl.daddr.v6;
+
+ family = parse_ipstr(optarg, v6);
+ if (family == AF_UNSPEC)
+ return 1;
+ if (tnl.family == AF_UNSPEC) {
+ tnl.family = family;
+ } else if (tnl.family != family) {
+ fprintf(stderr,
+ "The IP version of the src and dst addresses used in the IP encapsulation does not match\n");
+ return 1;
+ }
+ break;
+ case 'm':
+ if (!ether_aton_r(optarg,
+ (struct ether_addr *)tnl.dmac)) {
+ fprintf(stderr, "Invalid mac address:%s\n",
+ optarg);
+ return 1;
+ }
+ break;
+ case 'T':
+ kill_after_s = atoi(optarg);
+ break;
+ case 'S':
+ xdp_flags |= XDP_FLAGS_SKB_MODE;
+ break;
+ case 'N':
+ xdp_flags |= XDP_FLAGS_DRV_MODE;
+ break;
+ default:
+ usage(argv[0]);
+ return 1;
+ }
+ opt_flags[opt] = 0;
+ }
+
+ for (i = 0; i < strlen(optstr); i++) {
+ if (opt_flags[(unsigned int)optstr[i]]) {
+ fprintf(stderr, "Missing argument -%c\n", optstr[i]);
+ usage(argv[0]);
+ return 1;
+ }
+ }
+
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)");
+ return 1;
+ }
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ if (!prog_fd[0]) {
+ printf("load_bpf_file: %s\n", strerror(errno));
+ return 1;
+ }
+
+ signal(SIGINT, int_exit);
+ signal(SIGTERM, int_exit);
+
+ while (min_port <= max_port) {
+ vip.dport = htons(min_port++);
+ if (bpf_map_update_elem(map_fd[1], &vip, &tnl, BPF_NOEXIST)) {
+ perror("bpf_map_update_elem(&vip2tnl)");
+ return 1;
+ }
+ }
+
+ if (bpf_set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) {
+ printf("link set xdp fd failed\n");
+ return 1;
+ }
+
+ poll_stats(kill_after_s);
+
+ bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
+
+ return 0;
+}
diff --git a/samples/bpf/xdpsock.h b/samples/bpf/xdpsock.h
new file mode 100644
index 000000000..533ab81ad
--- /dev/null
+++ b/samples/bpf/xdpsock.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef XDPSOCK_H_
+#define XDPSOCK_H_
+
+/* Power-of-2 number of sockets */
+#define MAX_SOCKS 4
+
+/* Round-robin receive */
+#define RR_LB 0
+
+#endif /* XDPSOCK_H_ */
diff --git a/samples/bpf/xdpsock_kern.c b/samples/bpf/xdpsock_kern.c
new file mode 100644
index 000000000..d8806c413
--- /dev/null
+++ b/samples/bpf/xdpsock_kern.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+#include "xdpsock.h"
+
+struct bpf_map_def SEC("maps") qidconf_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .max_entries = 1,
+};
+
+struct bpf_map_def SEC("maps") xsks_map = {
+ .type = BPF_MAP_TYPE_XSKMAP,
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .max_entries = 4,
+};
+
+struct bpf_map_def SEC("maps") rr_map = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(unsigned int),
+ .max_entries = 1,
+};
+
+SEC("xdp_sock")
+int xdp_sock_prog(struct xdp_md *ctx)
+{
+ int *qidconf, key = 0, idx;
+ unsigned int *rr;
+
+ qidconf = bpf_map_lookup_elem(&qidconf_map, &key);
+ if (!qidconf)
+ return XDP_ABORTED;
+
+ if (*qidconf != ctx->rx_queue_index)
+ return XDP_PASS;
+
+#if RR_LB /* NB! RR_LB is configured in xdpsock.h */
+ rr = bpf_map_lookup_elem(&rr_map, &key);
+ if (!rr)
+ return XDP_ABORTED;
+
+ *rr = (*rr + 1) & (MAX_SOCKS - 1);
+ idx = *rr;
+#else
+ idx = 0;
+#endif
+
+ return bpf_redirect_map(&xsks_map, idx, 0);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
new file mode 100644
index 000000000..4914788b6
--- /dev/null
+++ b/samples/bpf/xdpsock_user.c
@@ -0,0 +1,987 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2017 - 2018 Intel Corporation. */
+
+#include <assert.h>
+#include <errno.h>
+#include <getopt.h>
+#include <libgen.h>
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <linux/if_xdp.h>
+#include <linux/if_ether.h>
+#include <net/if.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <net/ethernet.h>
+#include <sys/resource.h>
+#include <sys/socket.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <locale.h>
+#include <sys/types.h>
+#include <poll.h>
+
+#include "bpf/libbpf.h"
+#include "bpf_util.h"
+#include <bpf/bpf.h>
+
+#include "xdpsock.h"
+
+#ifndef SOL_XDP
+#define SOL_XDP 283
+#endif
+
+#ifndef AF_XDP
+#define AF_XDP 44
+#endif
+
+#ifndef PF_XDP
+#define PF_XDP AF_XDP
+#endif
+
+#define NUM_FRAMES 131072
+#define FRAME_HEADROOM 0
+#define FRAME_SHIFT 11
+#define FRAME_SIZE 2048
+#define NUM_DESCS 1024
+#define BATCH_SIZE 16
+
+#define FQ_NUM_DESCS 1024
+#define CQ_NUM_DESCS 1024
+
+#define DEBUG_HEXDUMP 0
+
+typedef __u64 u64;
+typedef __u32 u32;
+
+static unsigned long prev_time;
+
+enum benchmark_type {
+ BENCH_RXDROP = 0,
+ BENCH_TXONLY = 1,
+ BENCH_L2FWD = 2,
+};
+
+static enum benchmark_type opt_bench = BENCH_RXDROP;
+static u32 opt_xdp_flags;
+static const char *opt_if = "";
+static int opt_ifindex;
+static int opt_queue;
+static int opt_poll;
+static int opt_shared_packet_buffer;
+static int opt_interval = 1;
+static u32 opt_xdp_bind_flags;
+
+struct xdp_umem_uqueue {
+ u32 cached_prod;
+ u32 cached_cons;
+ u32 mask;
+ u32 size;
+ u32 *producer;
+ u32 *consumer;
+ u64 *ring;
+ void *map;
+};
+
+struct xdp_umem {
+ char *frames;
+ struct xdp_umem_uqueue fq;
+ struct xdp_umem_uqueue cq;
+ int fd;
+};
+
+struct xdp_uqueue {
+ u32 cached_prod;
+ u32 cached_cons;
+ u32 mask;
+ u32 size;
+ u32 *producer;
+ u32 *consumer;
+ struct xdp_desc *ring;
+ void *map;
+};
+
+struct xdpsock {
+ struct xdp_uqueue rx;
+ struct xdp_uqueue tx;
+ int sfd;
+ struct xdp_umem *umem;
+ u32 outstanding_tx;
+ unsigned long rx_npkts;
+ unsigned long tx_npkts;
+ unsigned long prev_rx_npkts;
+ unsigned long prev_tx_npkts;
+};
+
+#define MAX_SOCKS 4
+static int num_socks;
+struct xdpsock *xsks[MAX_SOCKS];
+
+static unsigned long get_nsecs(void)
+{
+ struct timespec ts;
+
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ return ts.tv_sec * 1000000000UL + ts.tv_nsec;
+}
+
+static void dump_stats(void);
+
+#define lassert(expr) \
+ do { \
+ if (!(expr)) { \
+ fprintf(stderr, "%s:%s:%i: Assertion failed: " \
+ #expr ": errno: %d/\"%s\"\n", \
+ __FILE__, __func__, __LINE__, \
+ errno, strerror(errno)); \
+ dump_stats(); \
+ exit(EXIT_FAILURE); \
+ } \
+ } while (0)
+
+#define barrier() __asm__ __volatile__("": : :"memory")
+#ifdef __aarch64__
+#define u_smp_rmb() __asm__ __volatile__("dmb ishld": : :"memory")
+#define u_smp_wmb() __asm__ __volatile__("dmb ishst": : :"memory")
+#else
+#define u_smp_rmb() barrier()
+#define u_smp_wmb() barrier()
+#endif
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
+static const char pkt_data[] =
+ "\x3c\xfd\xfe\x9e\x7f\x71\xec\xb1\xd7\x98\x3a\xc0\x08\x00\x45\x00"
+ "\x00\x2e\x00\x00\x00\x00\x40\x11\x88\x97\x05\x08\x07\x08\xc8\x14"
+ "\x1e\x04\x10\x92\x10\x92\x00\x1a\x6d\xa3\x34\x33\x1f\x69\x40\x6b"
+ "\x54\x59\xb6\x14\x2d\x11\x44\xbf\xaf\xd9\xbe\xaa";
+
+static inline u32 umem_nb_free(struct xdp_umem_uqueue *q, u32 nb)
+{
+ u32 free_entries = q->cached_cons - q->cached_prod;
+
+ if (free_entries >= nb)
+ return free_entries;
+
+ /* Refresh the local tail pointer */
+ q->cached_cons = *q->consumer + q->size;
+
+ return q->cached_cons - q->cached_prod;
+}
+
+static inline u32 xq_nb_free(struct xdp_uqueue *q, u32 ndescs)
+{
+ u32 free_entries = q->cached_cons - q->cached_prod;
+
+ if (free_entries >= ndescs)
+ return free_entries;
+
+ /* Refresh the local tail pointer */
+ q->cached_cons = *q->consumer + q->size;
+ return q->cached_cons - q->cached_prod;
+}
+
+static inline u32 umem_nb_avail(struct xdp_umem_uqueue *q, u32 nb)
+{
+ u32 entries = q->cached_prod - q->cached_cons;
+
+ if (entries == 0) {
+ q->cached_prod = *q->producer;
+ entries = q->cached_prod - q->cached_cons;
+ }
+
+ return (entries > nb) ? nb : entries;
+}
+
+static inline u32 xq_nb_avail(struct xdp_uqueue *q, u32 ndescs)
+{
+ u32 entries = q->cached_prod - q->cached_cons;
+
+ if (entries == 0) {
+ q->cached_prod = *q->producer;
+ entries = q->cached_prod - q->cached_cons;
+ }
+
+ return (entries > ndescs) ? ndescs : entries;
+}
+
+static inline int umem_fill_to_kernel_ex(struct xdp_umem_uqueue *fq,
+ struct xdp_desc *d,
+ size_t nb)
+{
+ u32 i;
+
+ if (umem_nb_free(fq, nb) < nb)
+ return -ENOSPC;
+
+ for (i = 0; i < nb; i++) {
+ u32 idx = fq->cached_prod++ & fq->mask;
+
+ fq->ring[idx] = d[i].addr;
+ }
+
+ u_smp_wmb();
+
+ *fq->producer = fq->cached_prod;
+
+ return 0;
+}
+
+static inline int umem_fill_to_kernel(struct xdp_umem_uqueue *fq, u64 *d,
+ size_t nb)
+{
+ u32 i;
+
+ if (umem_nb_free(fq, nb) < nb)
+ return -ENOSPC;
+
+ for (i = 0; i < nb; i++) {
+ u32 idx = fq->cached_prod++ & fq->mask;
+
+ fq->ring[idx] = d[i];
+ }
+
+ u_smp_wmb();
+
+ *fq->producer = fq->cached_prod;
+
+ return 0;
+}
+
+static inline size_t umem_complete_from_kernel(struct xdp_umem_uqueue *cq,
+ u64 *d, size_t nb)
+{
+ u32 idx, i, entries = umem_nb_avail(cq, nb);
+
+ u_smp_rmb();
+
+ for (i = 0; i < entries; i++) {
+ idx = cq->cached_cons++ & cq->mask;
+ d[i] = cq->ring[idx];
+ }
+
+ if (entries > 0) {
+ u_smp_wmb();
+
+ *cq->consumer = cq->cached_cons;
+ }
+
+ return entries;
+}
+
+static inline void *xq_get_data(struct xdpsock *xsk, u64 addr)
+{
+ return &xsk->umem->frames[addr];
+}
+
+static inline int xq_enq(struct xdp_uqueue *uq,
+ const struct xdp_desc *descs,
+ unsigned int ndescs)
+{
+ struct xdp_desc *r = uq->ring;
+ unsigned int i;
+
+ if (xq_nb_free(uq, ndescs) < ndescs)
+ return -ENOSPC;
+
+ for (i = 0; i < ndescs; i++) {
+ u32 idx = uq->cached_prod++ & uq->mask;
+
+ r[idx].addr = descs[i].addr;
+ r[idx].len = descs[i].len;
+ }
+
+ u_smp_wmb();
+
+ *uq->producer = uq->cached_prod;
+ return 0;
+}
+
+static inline int xq_enq_tx_only(struct xdp_uqueue *uq,
+ unsigned int id, unsigned int ndescs)
+{
+ struct xdp_desc *r = uq->ring;
+ unsigned int i;
+
+ if (xq_nb_free(uq, ndescs) < ndescs)
+ return -ENOSPC;
+
+ for (i = 0; i < ndescs; i++) {
+ u32 idx = uq->cached_prod++ & uq->mask;
+
+ r[idx].addr = (id + i) << FRAME_SHIFT;
+ r[idx].len = sizeof(pkt_data) - 1;
+ }
+
+ u_smp_wmb();
+
+ *uq->producer = uq->cached_prod;
+ return 0;
+}
+
+static inline int xq_deq(struct xdp_uqueue *uq,
+ struct xdp_desc *descs,
+ int ndescs)
+{
+ struct xdp_desc *r = uq->ring;
+ unsigned int idx;
+ int i, entries;
+
+ entries = xq_nb_avail(uq, ndescs);
+
+ u_smp_rmb();
+
+ for (i = 0; i < entries; i++) {
+ idx = uq->cached_cons++ & uq->mask;
+ descs[i] = r[idx];
+ }
+
+ if (entries > 0) {
+ u_smp_wmb();
+
+ *uq->consumer = uq->cached_cons;
+ }
+
+ return entries;
+}
+
+static void swap_mac_addresses(void *data)
+{
+ struct ether_header *eth = (struct ether_header *)data;
+ struct ether_addr *src_addr = (struct ether_addr *)&eth->ether_shost;
+ struct ether_addr *dst_addr = (struct ether_addr *)&eth->ether_dhost;
+ struct ether_addr tmp;
+
+ tmp = *src_addr;
+ *src_addr = *dst_addr;
+ *dst_addr = tmp;
+}
+
+static void hex_dump(void *pkt, size_t length, u64 addr)
+{
+ const unsigned char *address = (unsigned char *)pkt;
+ const unsigned char *line = address;
+ size_t line_size = 32;
+ unsigned char c;
+ char buf[32];
+ int i = 0;
+
+ if (!DEBUG_HEXDUMP)
+ return;
+
+ sprintf(buf, "addr=%llu", addr);
+ printf("length = %zu\n", length);
+ printf("%s | ", buf);
+ while (length-- > 0) {
+ printf("%02X ", *address++);
+ if (!(++i % line_size) || (length == 0 && i % line_size)) {
+ if (length == 0) {
+ while (i++ % line_size)
+ printf("__ ");
+ }
+ printf(" | "); /* right close */
+ while (line < address) {
+ c = *line++;
+ printf("%c", (c < 33 || c == 255) ? 0x2E : c);
+ }
+ printf("\n");
+ if (length > 0)
+ printf("%s | ", buf);
+ }
+ }
+ printf("\n");
+}
+
+static size_t gen_eth_frame(char *frame)
+{
+ memcpy(frame, pkt_data, sizeof(pkt_data) - 1);
+ return sizeof(pkt_data) - 1;
+}
+
+static struct xdp_umem *xdp_umem_configure(int sfd)
+{
+ int fq_size = FQ_NUM_DESCS, cq_size = CQ_NUM_DESCS;
+ struct xdp_mmap_offsets off;
+ struct xdp_umem_reg mr;
+ struct xdp_umem *umem;
+ socklen_t optlen;
+ void *bufs;
+
+ umem = calloc(1, sizeof(*umem));
+ lassert(umem);
+
+ lassert(posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */
+ NUM_FRAMES * FRAME_SIZE) == 0);
+
+ mr.addr = (__u64)bufs;
+ mr.len = NUM_FRAMES * FRAME_SIZE;
+ mr.chunk_size = FRAME_SIZE;
+ mr.headroom = FRAME_HEADROOM;
+
+ lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)) == 0);
+ lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_FILL_RING, &fq_size,
+ sizeof(int)) == 0);
+ lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &cq_size,
+ sizeof(int)) == 0);
+
+ optlen = sizeof(off);
+ lassert(getsockopt(sfd, SOL_XDP, XDP_MMAP_OFFSETS, &off,
+ &optlen) == 0);
+
+ umem->fq.map = mmap(0, off.fr.desc +
+ FQ_NUM_DESCS * sizeof(u64),
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, sfd,
+ XDP_UMEM_PGOFF_FILL_RING);
+ lassert(umem->fq.map != MAP_FAILED);
+
+ umem->fq.mask = FQ_NUM_DESCS - 1;
+ umem->fq.size = FQ_NUM_DESCS;
+ umem->fq.producer = umem->fq.map + off.fr.producer;
+ umem->fq.consumer = umem->fq.map + off.fr.consumer;
+ umem->fq.ring = umem->fq.map + off.fr.desc;
+ umem->fq.cached_cons = FQ_NUM_DESCS;
+
+ umem->cq.map = mmap(0, off.cr.desc +
+ CQ_NUM_DESCS * sizeof(u64),
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, sfd,
+ XDP_UMEM_PGOFF_COMPLETION_RING);
+ lassert(umem->cq.map != MAP_FAILED);
+
+ umem->cq.mask = CQ_NUM_DESCS - 1;
+ umem->cq.size = CQ_NUM_DESCS;
+ umem->cq.producer = umem->cq.map + off.cr.producer;
+ umem->cq.consumer = umem->cq.map + off.cr.consumer;
+ umem->cq.ring = umem->cq.map + off.cr.desc;
+
+ umem->frames = bufs;
+ umem->fd = sfd;
+
+ if (opt_bench == BENCH_TXONLY) {
+ int i;
+
+ for (i = 0; i < NUM_FRAMES * FRAME_SIZE; i += FRAME_SIZE)
+ (void)gen_eth_frame(&umem->frames[i]);
+ }
+
+ return umem;
+}
+
+static struct xdpsock *xsk_configure(struct xdp_umem *umem)
+{
+ struct sockaddr_xdp sxdp = {};
+ struct xdp_mmap_offsets off;
+ int sfd, ndescs = NUM_DESCS;
+ struct xdpsock *xsk;
+ bool shared = true;
+ socklen_t optlen;
+ u64 i;
+
+ sfd = socket(PF_XDP, SOCK_RAW, 0);
+ lassert(sfd >= 0);
+
+ xsk = calloc(1, sizeof(*xsk));
+ lassert(xsk);
+
+ xsk->sfd = sfd;
+ xsk->outstanding_tx = 0;
+
+ if (!umem) {
+ shared = false;
+ xsk->umem = xdp_umem_configure(sfd);
+ } else {
+ xsk->umem = umem;
+ }
+
+ lassert(setsockopt(sfd, SOL_XDP, XDP_RX_RING,
+ &ndescs, sizeof(int)) == 0);
+ lassert(setsockopt(sfd, SOL_XDP, XDP_TX_RING,
+ &ndescs, sizeof(int)) == 0);
+ optlen = sizeof(off);
+ lassert(getsockopt(sfd, SOL_XDP, XDP_MMAP_OFFSETS, &off,
+ &optlen) == 0);
+
+ /* Rx */
+ xsk->rx.map = mmap(NULL,
+ off.rx.desc +
+ NUM_DESCS * sizeof(struct xdp_desc),
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, sfd,
+ XDP_PGOFF_RX_RING);
+ lassert(xsk->rx.map != MAP_FAILED);
+
+ if (!shared) {
+ for (i = 0; i < NUM_DESCS * FRAME_SIZE; i += FRAME_SIZE)
+ lassert(umem_fill_to_kernel(&xsk->umem->fq, &i, 1)
+ == 0);
+ }
+
+ /* Tx */
+ xsk->tx.map = mmap(NULL,
+ off.tx.desc +
+ NUM_DESCS * sizeof(struct xdp_desc),
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, sfd,
+ XDP_PGOFF_TX_RING);
+ lassert(xsk->tx.map != MAP_FAILED);
+
+ xsk->rx.mask = NUM_DESCS - 1;
+ xsk->rx.size = NUM_DESCS;
+ xsk->rx.producer = xsk->rx.map + off.rx.producer;
+ xsk->rx.consumer = xsk->rx.map + off.rx.consumer;
+ xsk->rx.ring = xsk->rx.map + off.rx.desc;
+
+ xsk->tx.mask = NUM_DESCS - 1;
+ xsk->tx.size = NUM_DESCS;
+ xsk->tx.producer = xsk->tx.map + off.tx.producer;
+ xsk->tx.consumer = xsk->tx.map + off.tx.consumer;
+ xsk->tx.ring = xsk->tx.map + off.tx.desc;
+ xsk->tx.cached_cons = NUM_DESCS;
+
+ sxdp.sxdp_family = PF_XDP;
+ sxdp.sxdp_ifindex = opt_ifindex;
+ sxdp.sxdp_queue_id = opt_queue;
+
+ if (shared) {
+ sxdp.sxdp_flags = XDP_SHARED_UMEM;
+ sxdp.sxdp_shared_umem_fd = umem->fd;
+ } else {
+ sxdp.sxdp_flags = opt_xdp_bind_flags;
+ }
+
+ lassert(bind(sfd, (struct sockaddr *)&sxdp, sizeof(sxdp)) == 0);
+
+ return xsk;
+}
+
+static void print_benchmark(bool running)
+{
+ const char *bench_str = "INVALID";
+
+ if (opt_bench == BENCH_RXDROP)
+ bench_str = "rxdrop";
+ else if (opt_bench == BENCH_TXONLY)
+ bench_str = "txonly";
+ else if (opt_bench == BENCH_L2FWD)
+ bench_str = "l2fwd";
+
+ printf("%s:%d %s ", opt_if, opt_queue, bench_str);
+ if (opt_xdp_flags & XDP_FLAGS_SKB_MODE)
+ printf("xdp-skb ");
+ else if (opt_xdp_flags & XDP_FLAGS_DRV_MODE)
+ printf("xdp-drv ");
+ else
+ printf(" ");
+
+ if (opt_poll)
+ printf("poll() ");
+
+ if (running) {
+ printf("running...");
+ fflush(stdout);
+ }
+}
+
+static void dump_stats(void)
+{
+ unsigned long now = get_nsecs();
+ long dt = now - prev_time;
+ int i;
+
+ prev_time = now;
+
+ for (i = 0; i < num_socks; i++) {
+ char *fmt = "%-15s %'-11.0f %'-11lu\n";
+ double rx_pps, tx_pps;
+
+ rx_pps = (xsks[i]->rx_npkts - xsks[i]->prev_rx_npkts) *
+ 1000000000. / dt;
+ tx_pps = (xsks[i]->tx_npkts - xsks[i]->prev_tx_npkts) *
+ 1000000000. / dt;
+
+ printf("\n sock%d@", i);
+ print_benchmark(false);
+ printf("\n");
+
+ printf("%-15s %-11s %-11s %-11.2f\n", "", "pps", "pkts",
+ dt / 1000000000.);
+ printf(fmt, "rx", rx_pps, xsks[i]->rx_npkts);
+ printf(fmt, "tx", tx_pps, xsks[i]->tx_npkts);
+
+ xsks[i]->prev_rx_npkts = xsks[i]->rx_npkts;
+ xsks[i]->prev_tx_npkts = xsks[i]->tx_npkts;
+ }
+}
+
+static void *poller(void *arg)
+{
+ (void)arg;
+ for (;;) {
+ sleep(opt_interval);
+ dump_stats();
+ }
+
+ return NULL;
+}
+
+static void int_exit(int sig)
+{
+ (void)sig;
+ dump_stats();
+ bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags);
+ exit(EXIT_SUCCESS);
+}
+
+static struct option long_options[] = {
+ {"rxdrop", no_argument, 0, 'r'},
+ {"txonly", no_argument, 0, 't'},
+ {"l2fwd", no_argument, 0, 'l'},
+ {"interface", required_argument, 0, 'i'},
+ {"queue", required_argument, 0, 'q'},
+ {"poll", no_argument, 0, 'p'},
+ {"shared-buffer", no_argument, 0, 's'},
+ {"xdp-skb", no_argument, 0, 'S'},
+ {"xdp-native", no_argument, 0, 'N'},
+ {"interval", required_argument, 0, 'n'},
+ {0, 0, 0, 0}
+};
+
+static void usage(const char *prog)
+{
+ const char *str =
+ " Usage: %s [OPTIONS]\n"
+ " Options:\n"
+ " -r, --rxdrop Discard all incoming packets (default)\n"
+ " -t, --txonly Only send packets\n"
+ " -l, --l2fwd MAC swap L2 forwarding\n"
+ " -i, --interface=n Run on interface n\n"
+ " -q, --queue=n Use queue n (default 0)\n"
+ " -p, --poll Use poll syscall\n"
+ " -s, --shared-buffer Use shared packet buffer\n"
+ " -S, --xdp-skb=n Use XDP skb-mod\n"
+ " -N, --xdp-native=n Enfore XDP native mode\n"
+ " -n, --interval=n Specify statistics update interval (default 1 sec).\n"
+ "\n";
+ fprintf(stderr, str, prog);
+ exit(EXIT_FAILURE);
+}
+
+static void parse_command_line(int argc, char **argv)
+{
+ int option_index, c;
+
+ opterr = 0;
+
+ for (;;) {
+ c = getopt_long(argc, argv, "rtli:q:psSNn:", long_options,
+ &option_index);
+ if (c == -1)
+ break;
+
+ switch (c) {
+ case 'r':
+ opt_bench = BENCH_RXDROP;
+ break;
+ case 't':
+ opt_bench = BENCH_TXONLY;
+ break;
+ case 'l':
+ opt_bench = BENCH_L2FWD;
+ break;
+ case 'i':
+ opt_if = optarg;
+ break;
+ case 'q':
+ opt_queue = atoi(optarg);
+ break;
+ case 's':
+ opt_shared_packet_buffer = 1;
+ break;
+ case 'p':
+ opt_poll = 1;
+ break;
+ case 'S':
+ opt_xdp_flags |= XDP_FLAGS_SKB_MODE;
+ opt_xdp_bind_flags |= XDP_COPY;
+ break;
+ case 'N':
+ opt_xdp_flags |= XDP_FLAGS_DRV_MODE;
+ break;
+ case 'n':
+ opt_interval = atoi(optarg);
+ break;
+ default:
+ usage(basename(argv[0]));
+ }
+ }
+
+ opt_ifindex = if_nametoindex(opt_if);
+ if (!opt_ifindex) {
+ fprintf(stderr, "ERROR: interface \"%s\" does not exist\n",
+ opt_if);
+ usage(basename(argv[0]));
+ }
+}
+
+static void kick_tx(int fd)
+{
+ int ret;
+
+ ret = sendto(fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
+ if (ret >= 0 || errno == ENOBUFS || errno == EAGAIN || errno == EBUSY)
+ return;
+ lassert(0);
+}
+
+static inline void complete_tx_l2fwd(struct xdpsock *xsk)
+{
+ u64 descs[BATCH_SIZE];
+ unsigned int rcvd;
+ size_t ndescs;
+
+ if (!xsk->outstanding_tx)
+ return;
+
+ kick_tx(xsk->sfd);
+ ndescs = (xsk->outstanding_tx > BATCH_SIZE) ? BATCH_SIZE :
+ xsk->outstanding_tx;
+
+ /* re-add completed Tx buffers */
+ rcvd = umem_complete_from_kernel(&xsk->umem->cq, descs, ndescs);
+ if (rcvd > 0) {
+ umem_fill_to_kernel(&xsk->umem->fq, descs, rcvd);
+ xsk->outstanding_tx -= rcvd;
+ xsk->tx_npkts += rcvd;
+ }
+}
+
+static inline void complete_tx_only(struct xdpsock *xsk)
+{
+ u64 descs[BATCH_SIZE];
+ unsigned int rcvd;
+
+ if (!xsk->outstanding_tx)
+ return;
+
+ kick_tx(xsk->sfd);
+
+ rcvd = umem_complete_from_kernel(&xsk->umem->cq, descs, BATCH_SIZE);
+ if (rcvd > 0) {
+ xsk->outstanding_tx -= rcvd;
+ xsk->tx_npkts += rcvd;
+ }
+}
+
+static void rx_drop(struct xdpsock *xsk)
+{
+ struct xdp_desc descs[BATCH_SIZE];
+ unsigned int rcvd, i;
+
+ rcvd = xq_deq(&xsk->rx, descs, BATCH_SIZE);
+ if (!rcvd)
+ return;
+
+ for (i = 0; i < rcvd; i++) {
+ char *pkt = xq_get_data(xsk, descs[i].addr);
+
+ hex_dump(pkt, descs[i].len, descs[i].addr);
+ }
+
+ xsk->rx_npkts += rcvd;
+
+ umem_fill_to_kernel_ex(&xsk->umem->fq, descs, rcvd);
+}
+
+static void rx_drop_all(void)
+{
+ struct pollfd fds[MAX_SOCKS + 1];
+ int i, ret, timeout, nfds = 1;
+
+ memset(fds, 0, sizeof(fds));
+
+ for (i = 0; i < num_socks; i++) {
+ fds[i].fd = xsks[i]->sfd;
+ fds[i].events = POLLIN;
+ timeout = 1000; /* 1sn */
+ }
+
+ for (;;) {
+ if (opt_poll) {
+ ret = poll(fds, nfds, timeout);
+ if (ret <= 0)
+ continue;
+ }
+
+ for (i = 0; i < num_socks; i++)
+ rx_drop(xsks[i]);
+ }
+}
+
+static void tx_only(struct xdpsock *xsk)
+{
+ int timeout, ret, nfds = 1;
+ struct pollfd fds[nfds + 1];
+ unsigned int idx = 0;
+
+ memset(fds, 0, sizeof(fds));
+ fds[0].fd = xsk->sfd;
+ fds[0].events = POLLOUT;
+ timeout = 1000; /* 1sn */
+
+ for (;;) {
+ if (opt_poll) {
+ ret = poll(fds, nfds, timeout);
+ if (ret <= 0)
+ continue;
+
+ if (fds[0].fd != xsk->sfd ||
+ !(fds[0].revents & POLLOUT))
+ continue;
+ }
+
+ if (xq_nb_free(&xsk->tx, BATCH_SIZE) >= BATCH_SIZE) {
+ lassert(xq_enq_tx_only(&xsk->tx, idx, BATCH_SIZE) == 0);
+
+ xsk->outstanding_tx += BATCH_SIZE;
+ idx += BATCH_SIZE;
+ idx %= NUM_FRAMES;
+ }
+
+ complete_tx_only(xsk);
+ }
+}
+
+static void l2fwd(struct xdpsock *xsk)
+{
+ for (;;) {
+ struct xdp_desc descs[BATCH_SIZE];
+ unsigned int rcvd, i;
+ int ret;
+
+ for (;;) {
+ complete_tx_l2fwd(xsk);
+
+ rcvd = xq_deq(&xsk->rx, descs, BATCH_SIZE);
+ if (rcvd > 0)
+ break;
+ }
+
+ for (i = 0; i < rcvd; i++) {
+ char *pkt = xq_get_data(xsk, descs[i].addr);
+
+ swap_mac_addresses(pkt);
+
+ hex_dump(pkt, descs[i].len, descs[i].addr);
+ }
+
+ xsk->rx_npkts += rcvd;
+
+ ret = xq_enq(&xsk->tx, descs, rcvd);
+ lassert(ret == 0);
+ xsk->outstanding_tx += rcvd;
+ }
+}
+
+int main(int argc, char **argv)
+{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ struct bpf_prog_load_attr prog_load_attr = {
+ .prog_type = BPF_PROG_TYPE_XDP,
+ };
+ int prog_fd, qidconf_map, xsks_map;
+ struct bpf_object *obj;
+ char xdp_filename[256];
+ struct bpf_map *map;
+ int i, ret, key = 0;
+ pthread_t pt;
+
+ parse_command_line(argc, argv);
+
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ fprintf(stderr, "ERROR: setrlimit(RLIMIT_MEMLOCK) \"%s\"\n",
+ strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv[0]);
+ prog_load_attr.file = xdp_filename;
+
+ if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
+ exit(EXIT_FAILURE);
+ if (prog_fd < 0) {
+ fprintf(stderr, "ERROR: no program found: %s\n",
+ strerror(prog_fd));
+ exit(EXIT_FAILURE);
+ }
+
+ map = bpf_object__find_map_by_name(obj, "qidconf_map");
+ qidconf_map = bpf_map__fd(map);
+ if (qidconf_map < 0) {
+ fprintf(stderr, "ERROR: no qidconf map found: %s\n",
+ strerror(qidconf_map));
+ exit(EXIT_FAILURE);
+ }
+
+ map = bpf_object__find_map_by_name(obj, "xsks_map");
+ xsks_map = bpf_map__fd(map);
+ if (xsks_map < 0) {
+ fprintf(stderr, "ERROR: no xsks map found: %s\n",
+ strerror(xsks_map));
+ exit(EXIT_FAILURE);
+ }
+
+ if (bpf_set_link_xdp_fd(opt_ifindex, prog_fd, opt_xdp_flags) < 0) {
+ fprintf(stderr, "ERROR: link set xdp fd failed\n");
+ exit(EXIT_FAILURE);
+ }
+
+ ret = bpf_map_update_elem(qidconf_map, &key, &opt_queue, 0);
+ if (ret) {
+ fprintf(stderr, "ERROR: bpf_map_update_elem qidconf\n");
+ exit(EXIT_FAILURE);
+ }
+
+ /* Create sockets... */
+ xsks[num_socks++] = xsk_configure(NULL);
+
+#if RR_LB
+ for (i = 0; i < MAX_SOCKS - 1; i++)
+ xsks[num_socks++] = xsk_configure(xsks[0]->umem);
+#endif
+
+ /* ...and insert them into the map. */
+ for (i = 0; i < num_socks; i++) {
+ key = i;
+ ret = bpf_map_update_elem(xsks_map, &key, &xsks[i]->sfd, 0);
+ if (ret) {
+ fprintf(stderr, "ERROR: bpf_map_update_elem %d\n", i);
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ signal(SIGINT, int_exit);
+ signal(SIGTERM, int_exit);
+ signal(SIGABRT, int_exit);
+
+ setlocale(LC_ALL, "");
+
+ ret = pthread_create(&pt, NULL, poller, NULL);
+ lassert(ret == 0);
+
+ prev_time = get_nsecs();
+
+ if (opt_bench == BENCH_RXDROP)
+ rx_drop_all();
+ else if (opt_bench == BENCH_TXONLY)
+ tx_only(xsks[0]);
+ else
+ l2fwd(xsks[0]);
+
+ return 0;
+}
diff --git a/samples/configfs/Makefile b/samples/configfs/Makefile
new file mode 100644
index 000000000..a9afd9963
--- /dev/null
+++ b/samples/configfs/Makefile
@@ -0,0 +1,2 @@
+
+obj-$(CONFIG_SAMPLE_CONFIGFS) += configfs_sample.o
diff --git a/samples/configfs/configfs_sample.c b/samples/configfs/configfs_sample.c
new file mode 100644
index 000000000..004a4e201
--- /dev/null
+++ b/samples/configfs/configfs_sample.c
@@ -0,0 +1,404 @@
+/*
+ * vim: noexpandtab ts=8 sts=0 sw=8:
+ *
+ * configfs_example_macros.c - This file is a demonstration module
+ * containing a number of configfs subsystems. It uses the helper
+ * macros defined by configfs.h
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle. All rights reserved.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+
+
+
+/*
+ * 01-childless
+ *
+ * This first example is a childless subsystem. It cannot create
+ * any config_items. It just has attributes.
+ *
+ * Note that we are enclosing the configfs_subsystem inside a container.
+ * This is not necessary if a subsystem has no attributes directly
+ * on the subsystem. See the next example, 02-simple-children, for
+ * such a subsystem.
+ */
+
+struct childless {
+ struct configfs_subsystem subsys;
+ int showme;
+ int storeme;
+};
+
+static inline struct childless *to_childless(struct config_item *item)
+{
+ return item ? container_of(to_configfs_subsystem(to_config_group(item)),
+ struct childless, subsys) : NULL;
+}
+
+static ssize_t childless_showme_show(struct config_item *item, char *page)
+{
+ struct childless *childless = to_childless(item);
+ ssize_t pos;
+
+ pos = sprintf(page, "%d\n", childless->showme);
+ childless->showme++;
+
+ return pos;
+}
+
+static ssize_t childless_storeme_show(struct config_item *item, char *page)
+{
+ return sprintf(page, "%d\n", to_childless(item)->storeme);
+}
+
+static ssize_t childless_storeme_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ struct childless *childless = to_childless(item);
+ unsigned long tmp;
+ char *p = (char *) page;
+
+ tmp = simple_strtoul(p, &p, 10);
+ if (!p || (*p && (*p != '\n')))
+ return -EINVAL;
+
+ if (tmp > INT_MAX)
+ return -ERANGE;
+
+ childless->storeme = tmp;
+
+ return count;
+}
+
+static ssize_t childless_description_show(struct config_item *item, char *page)
+{
+ return sprintf(page,
+"[01-childless]\n"
+"\n"
+"The childless subsystem is the simplest possible subsystem in\n"
+"configfs. It does not support the creation of child config_items.\n"
+"It only has a few attributes. In fact, it isn't much different\n"
+"than a directory in /proc.\n");
+}
+
+CONFIGFS_ATTR_RO(childless_, showme);
+CONFIGFS_ATTR(childless_, storeme);
+CONFIGFS_ATTR_RO(childless_, description);
+
+static struct configfs_attribute *childless_attrs[] = {
+ &childless_attr_showme,
+ &childless_attr_storeme,
+ &childless_attr_description,
+ NULL,
+};
+
+static const struct config_item_type childless_type = {
+ .ct_attrs = childless_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct childless childless_subsys = {
+ .subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "01-childless",
+ .ci_type = &childless_type,
+ },
+ },
+ },
+};
+
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * 02-simple-children
+ *
+ * This example merely has a simple one-attribute child. Note that
+ * there is no extra attribute structure, as the child's attribute is
+ * known from the get-go. Also, there is no container for the
+ * subsystem, as it has no attributes of its own.
+ */
+
+struct simple_child {
+ struct config_item item;
+ int storeme;
+};
+
+static inline struct simple_child *to_simple_child(struct config_item *item)
+{
+ return item ? container_of(item, struct simple_child, item) : NULL;
+}
+
+static ssize_t simple_child_storeme_show(struct config_item *item, char *page)
+{
+ return sprintf(page, "%d\n", to_simple_child(item)->storeme);
+}
+
+static ssize_t simple_child_storeme_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ struct simple_child *simple_child = to_simple_child(item);
+ unsigned long tmp;
+ char *p = (char *) page;
+
+ tmp = simple_strtoul(p, &p, 10);
+ if (!p || (*p && (*p != '\n')))
+ return -EINVAL;
+
+ if (tmp > INT_MAX)
+ return -ERANGE;
+
+ simple_child->storeme = tmp;
+
+ return count;
+}
+
+CONFIGFS_ATTR(simple_child_, storeme);
+
+static struct configfs_attribute *simple_child_attrs[] = {
+ &simple_child_attr_storeme,
+ NULL,
+};
+
+static void simple_child_release(struct config_item *item)
+{
+ kfree(to_simple_child(item));
+}
+
+static struct configfs_item_operations simple_child_item_ops = {
+ .release = simple_child_release,
+};
+
+static const struct config_item_type simple_child_type = {
+ .ct_item_ops = &simple_child_item_ops,
+ .ct_attrs = simple_child_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+
+struct simple_children {
+ struct config_group group;
+};
+
+static inline struct simple_children *to_simple_children(struct config_item *item)
+{
+ return item ? container_of(to_config_group(item),
+ struct simple_children, group) : NULL;
+}
+
+static struct config_item *simple_children_make_item(struct config_group *group,
+ const char *name)
+{
+ struct simple_child *simple_child;
+
+ simple_child = kzalloc(sizeof(struct simple_child), GFP_KERNEL);
+ if (!simple_child)
+ return ERR_PTR(-ENOMEM);
+
+ config_item_init_type_name(&simple_child->item, name,
+ &simple_child_type);
+
+ simple_child->storeme = 0;
+
+ return &simple_child->item;
+}
+
+static ssize_t simple_children_description_show(struct config_item *item,
+ char *page)
+{
+ return sprintf(page,
+"[02-simple-children]\n"
+"\n"
+"This subsystem allows the creation of child config_items. These\n"
+"items have only one attribute that is readable and writeable.\n");
+}
+
+CONFIGFS_ATTR_RO(simple_children_, description);
+
+static struct configfs_attribute *simple_children_attrs[] = {
+ &simple_children_attr_description,
+ NULL,
+};
+
+static void simple_children_release(struct config_item *item)
+{
+ kfree(to_simple_children(item));
+}
+
+static struct configfs_item_operations simple_children_item_ops = {
+ .release = simple_children_release,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations simple_children_group_ops = {
+ .make_item = simple_children_make_item,
+};
+
+static const struct config_item_type simple_children_type = {
+ .ct_item_ops = &simple_children_item_ops,
+ .ct_group_ops = &simple_children_group_ops,
+ .ct_attrs = simple_children_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct configfs_subsystem simple_children_subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "02-simple-children",
+ .ci_type = &simple_children_type,
+ },
+ },
+};
+
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * 03-group-children
+ *
+ * This example reuses the simple_children group from above. However,
+ * the simple_children group is not the subsystem itself, it is a
+ * child of the subsystem. Creation of a group in the subsystem creates
+ * a new simple_children group. That group can then have simple_child
+ * children of its own.
+ */
+
+static struct config_group *group_children_make_group(
+ struct config_group *group, const char *name)
+{
+ struct simple_children *simple_children;
+
+ simple_children = kzalloc(sizeof(struct simple_children),
+ GFP_KERNEL);
+ if (!simple_children)
+ return ERR_PTR(-ENOMEM);
+
+ config_group_init_type_name(&simple_children->group, name,
+ &simple_children_type);
+
+ return &simple_children->group;
+}
+
+static ssize_t group_children_description_show(struct config_item *item,
+ char *page)
+{
+ return sprintf(page,
+"[03-group-children]\n"
+"\n"
+"This subsystem allows the creation of child config_groups. These\n"
+"groups are like the subsystem simple-children.\n");
+}
+
+CONFIGFS_ATTR_RO(group_children_, description);
+
+static struct configfs_attribute *group_children_attrs[] = {
+ &group_children_attr_description,
+ NULL,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations group_children_group_ops = {
+ .make_group = group_children_make_group,
+};
+
+static const struct config_item_type group_children_type = {
+ .ct_group_ops = &group_children_group_ops,
+ .ct_attrs = group_children_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct configfs_subsystem group_children_subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "03-group-children",
+ .ci_type = &group_children_type,
+ },
+ },
+};
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * We're now done with our subsystem definitions.
+ * For convenience in this module, here's a list of them all. It
+ * allows the init function to easily register them. Most modules
+ * will only have one subsystem, and will only call register_subsystem
+ * on it directly.
+ */
+static struct configfs_subsystem *example_subsys[] = {
+ &childless_subsys.subsys,
+ &simple_children_subsys,
+ &group_children_subsys,
+ NULL,
+};
+
+static int __init configfs_example_init(void)
+{
+ int ret;
+ int i;
+ struct configfs_subsystem *subsys;
+
+ for (i = 0; example_subsys[i]; i++) {
+ subsys = example_subsys[i];
+
+ config_group_init(&subsys->su_group);
+ mutex_init(&subsys->su_mutex);
+ ret = configfs_register_subsystem(subsys);
+ if (ret) {
+ printk(KERN_ERR "Error %d while registering subsystem %s\n",
+ ret,
+ subsys->su_group.cg_item.ci_namebuf);
+ goto out_unregister;
+ }
+ }
+
+ return 0;
+
+out_unregister:
+ for (i--; i >= 0; i--)
+ configfs_unregister_subsystem(example_subsys[i]);
+
+ return ret;
+}
+
+static void __exit configfs_example_exit(void)
+{
+ int i;
+
+ for (i = 0; example_subsys[i]; i++)
+ configfs_unregister_subsystem(example_subsys[i]);
+}
+
+module_init(configfs_example_init);
+module_exit(configfs_example_exit);
+MODULE_LICENSE("GPL");
diff --git a/samples/connector/.gitignore b/samples/connector/.gitignore
new file mode 100644
index 000000000..d2b9c32ac
--- /dev/null
+++ b/samples/connector/.gitignore
@@ -0,0 +1 @@
+ucon
diff --git a/samples/connector/Makefile b/samples/connector/Makefile
new file mode 100644
index 000000000..fe3c8542a
--- /dev/null
+++ b/samples/connector/Makefile
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_SAMPLE_CONNECTOR) += cn_test.o
+
+# List of programs to build
+ifdef CONFIG_SAMPLE_CONNECTOR
+hostprogs-y := ucon
+endif
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
+
+HOSTCFLAGS_ucon.o += -I$(objtree)/usr/include
+
+all: modules
+
+modules clean:
+ $(MAKE) -C ../.. SUBDIRS=$(CURDIR) $@
diff --git a/samples/connector/cn_test.c b/samples/connector/cn_test.c
new file mode 100644
index 000000000..95cd06f4e
--- /dev/null
+++ b/samples/connector/cn_test.c
@@ -0,0 +1,201 @@
+/*
+ * cn_test.c
+ *
+ * 2004+ Copyright (c) Evgeniy Polyakov <zbr@ioremap.net>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#define pr_fmt(fmt) "cn_test: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/timer.h>
+
+#include <linux/connector.h>
+
+static struct cb_id cn_test_id = { CN_NETLINK_USERS + 3, 0x456 };
+static char cn_test_name[] = "cn_test";
+static struct sock *nls;
+static struct timer_list cn_test_timer;
+
+static void cn_test_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
+{
+ pr_info("%s: %lu: idx=%x, val=%x, seq=%u, ack=%u, len=%d: %s.\n",
+ __func__, jiffies, msg->id.idx, msg->id.val,
+ msg->seq, msg->ack, msg->len,
+ msg->len ? (char *)msg->data : "");
+}
+
+/*
+ * Do not remove this function even if no one is using it as
+ * this is an example of how to get notifications about new
+ * connector user registration
+ */
+#if 0
+static int cn_test_want_notify(void)
+{
+ struct cn_ctl_msg *ctl;
+ struct cn_notify_req *req;
+ struct cn_msg *msg = NULL;
+ int size, size0;
+ struct sk_buff *skb;
+ struct nlmsghdr *nlh;
+ u32 group = 1;
+
+ size0 = sizeof(*msg) + sizeof(*ctl) + 3 * sizeof(*req);
+
+ size = NLMSG_SPACE(size0);
+
+ skb = alloc_skb(size, GFP_ATOMIC);
+ if (!skb) {
+ pr_err("failed to allocate new skb with size=%u\n", size);
+ return -ENOMEM;
+ }
+
+ nlh = nlmsg_put(skb, 0, 0x123, NLMSG_DONE, size - sizeof(*nlh), 0);
+ if (!nlh) {
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+
+ msg = nlmsg_data(nlh);
+
+ memset(msg, 0, size0);
+
+ msg->id.idx = -1;
+ msg->id.val = -1;
+ msg->seq = 0x123;
+ msg->ack = 0x345;
+ msg->len = size0 - sizeof(*msg);
+
+ ctl = (struct cn_ctl_msg *)(msg + 1);
+
+ ctl->idx_notify_num = 1;
+ ctl->val_notify_num = 2;
+ ctl->group = group;
+ ctl->len = msg->len - sizeof(*ctl);
+
+ req = (struct cn_notify_req *)(ctl + 1);
+
+ /*
+ * Idx.
+ */
+ req->first = cn_test_id.idx;
+ req->range = 10;
+
+ /*
+ * Val 0.
+ */
+ req++;
+ req->first = cn_test_id.val;
+ req->range = 10;
+
+ /*
+ * Val 1.
+ */
+ req++;
+ req->first = cn_test_id.val + 20;
+ req->range = 10;
+
+ NETLINK_CB(skb).dst_group = ctl->group;
+ //netlink_broadcast(nls, skb, 0, ctl->group, GFP_ATOMIC);
+ netlink_unicast(nls, skb, 0, 0);
+
+ pr_info("request was sent: group=0x%x\n", ctl->group);
+
+ return 0;
+}
+#endif
+
+static u32 cn_test_timer_counter;
+static void cn_test_timer_func(struct timer_list *unused)
+{
+ struct cn_msg *m;
+ char data[32];
+
+ pr_debug("%s: timer fired\n", __func__);
+
+ m = kzalloc(sizeof(*m) + sizeof(data), GFP_ATOMIC);
+ if (m) {
+
+ memcpy(&m->id, &cn_test_id, sizeof(m->id));
+ m->seq = cn_test_timer_counter;
+ m->len = sizeof(data);
+
+ m->len =
+ scnprintf(data, sizeof(data), "counter = %u",
+ cn_test_timer_counter) + 1;
+
+ memcpy(m + 1, data, m->len);
+
+ cn_netlink_send(m, 0, 0, GFP_ATOMIC);
+ kfree(m);
+ }
+
+ cn_test_timer_counter++;
+
+ mod_timer(&cn_test_timer, jiffies + msecs_to_jiffies(1000));
+}
+
+static int cn_test_init(void)
+{
+ int err;
+
+ err = cn_add_callback(&cn_test_id, cn_test_name, cn_test_callback);
+ if (err)
+ goto err_out;
+ cn_test_id.val++;
+ err = cn_add_callback(&cn_test_id, cn_test_name, cn_test_callback);
+ if (err) {
+ cn_del_callback(&cn_test_id);
+ goto err_out;
+ }
+
+ timer_setup(&cn_test_timer, cn_test_timer_func, 0);
+ mod_timer(&cn_test_timer, jiffies + msecs_to_jiffies(1000));
+
+ pr_info("initialized with id={%u.%u}\n",
+ cn_test_id.idx, cn_test_id.val);
+
+ return 0;
+
+ err_out:
+ if (nls && nls->sk_socket)
+ sock_release(nls->sk_socket);
+
+ return err;
+}
+
+static void cn_test_fini(void)
+{
+ del_timer_sync(&cn_test_timer);
+ cn_del_callback(&cn_test_id);
+ cn_test_id.val--;
+ cn_del_callback(&cn_test_id);
+ if (nls && nls->sk_socket)
+ sock_release(nls->sk_socket);
+}
+
+module_init(cn_test_init);
+module_exit(cn_test_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>");
+MODULE_DESCRIPTION("Connector's test module");
diff --git a/samples/connector/ucon.c b/samples/connector/ucon.c
new file mode 100644
index 000000000..8a4da64e0
--- /dev/null
+++ b/samples/connector/ucon.c
@@ -0,0 +1,250 @@
+/*
+ * ucon.c
+ *
+ * Copyright (c) 2004+ Evgeniy Polyakov <zbr@ioremap.net>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <asm/types.h>
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/poll.h>
+
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+
+#include <arpa/inet.h>
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <time.h>
+#include <getopt.h>
+
+#include <linux/connector.h>
+
+#define DEBUG
+#define NETLINK_CONNECTOR 11
+
+/* Hopefully your userspace connector.h matches this kernel */
+#define CN_TEST_IDX CN_NETLINK_USERS + 3
+#define CN_TEST_VAL 0x456
+
+#ifdef DEBUG
+#define ulog(f, a...) fprintf(stdout, f, ##a)
+#else
+#define ulog(f, a...) do {} while (0)
+#endif
+
+static int need_exit;
+static __u32 seq;
+
+static int netlink_send(int s, struct cn_msg *msg)
+{
+ struct nlmsghdr *nlh;
+ unsigned int size;
+ int err;
+ char buf[128];
+ struct cn_msg *m;
+
+ size = NLMSG_SPACE(sizeof(struct cn_msg) + msg->len);
+
+ nlh = (struct nlmsghdr *)buf;
+ nlh->nlmsg_seq = seq++;
+ nlh->nlmsg_pid = getpid();
+ nlh->nlmsg_type = NLMSG_DONE;
+ nlh->nlmsg_len = size;
+ nlh->nlmsg_flags = 0;
+
+ m = NLMSG_DATA(nlh);
+#if 0
+ ulog("%s: [%08x.%08x] len=%u, seq=%u, ack=%u.\n",
+ __func__, msg->id.idx, msg->id.val, msg->len, msg->seq, msg->ack);
+#endif
+ memcpy(m, msg, sizeof(*m) + msg->len);
+
+ err = send(s, nlh, size, 0);
+ if (err == -1)
+ ulog("Failed to send: %s [%d].\n",
+ strerror(errno), errno);
+
+ return err;
+}
+
+static void usage(void)
+{
+ printf(
+ "Usage: ucon [options] [output file]\n"
+ "\n"
+ "\t-h\tthis help screen\n"
+ "\t-s\tsend buffers to the test module\n"
+ "\n"
+ "The default behavior of ucon is to subscribe to the test module\n"
+ "and wait for state messages. Any ones received are dumped to the\n"
+ "specified output file (or stdout). The test module is assumed to\n"
+ "have an id of {%u.%u}\n"
+ "\n"
+ "If you get no output, then verify the cn_test module id matches\n"
+ "the expected id above.\n"
+ , CN_TEST_IDX, CN_TEST_VAL
+ );
+}
+
+int main(int argc, char *argv[])
+{
+ int s;
+ char buf[1024];
+ int len;
+ struct nlmsghdr *reply;
+ struct sockaddr_nl l_local;
+ struct cn_msg *data;
+ FILE *out;
+ time_t tm;
+ struct pollfd pfd;
+ bool send_msgs = false;
+
+ while ((s = getopt(argc, argv, "hs")) != -1) {
+ switch (s) {
+ case 's':
+ send_msgs = true;
+ break;
+
+ case 'h':
+ usage();
+ return 0;
+
+ default:
+ /* getopt() outputs an error for us */
+ usage();
+ return 1;
+ }
+ }
+
+ if (argc != optind) {
+ out = fopen(argv[optind], "a+");
+ if (!out) {
+ ulog("Unable to open %s for writing: %s\n",
+ argv[1], strerror(errno));
+ out = stdout;
+ }
+ } else
+ out = stdout;
+
+ memset(buf, 0, sizeof(buf));
+
+ s = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);
+ if (s == -1) {
+ perror("socket");
+ return -1;
+ }
+
+ l_local.nl_family = AF_NETLINK;
+ l_local.nl_groups = -1; /* bitmask of requested groups */
+ l_local.nl_pid = 0;
+
+ ulog("subscribing to %u.%u\n", CN_TEST_IDX, CN_TEST_VAL);
+
+ if (bind(s, (struct sockaddr *)&l_local, sizeof(struct sockaddr_nl)) == -1) {
+ perror("bind");
+ close(s);
+ return -1;
+ }
+
+#if 0
+ {
+ int on = 0x57; /* Additional group number */
+ setsockopt(s, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &on, sizeof(on));
+ }
+#endif
+ if (send_msgs) {
+ int i, j;
+
+ memset(buf, 0, sizeof(buf));
+
+ data = (struct cn_msg *)buf;
+
+ data->id.idx = CN_TEST_IDX;
+ data->id.val = CN_TEST_VAL;
+ data->seq = seq++;
+ data->ack = 0;
+ data->len = 0;
+
+ for (j=0; j<10; ++j) {
+ for (i=0; i<1000; ++i) {
+ len = netlink_send(s, data);
+ }
+
+ ulog("%d messages have been sent to %08x.%08x.\n", i, data->id.idx, data->id.val);
+ }
+
+ return 0;
+ }
+
+
+ pfd.fd = s;
+
+ while (!need_exit) {
+ pfd.events = POLLIN;
+ pfd.revents = 0;
+ switch (poll(&pfd, 1, -1)) {
+ case 0:
+ need_exit = 1;
+ break;
+ case -1:
+ if (errno != EINTR) {
+ need_exit = 1;
+ break;
+ }
+ continue;
+ }
+ if (need_exit)
+ break;
+
+ memset(buf, 0, sizeof(buf));
+ len = recv(s, buf, sizeof(buf), 0);
+ if (len == -1) {
+ perror("recv buf");
+ close(s);
+ return -1;
+ }
+ reply = (struct nlmsghdr *)buf;
+
+ switch (reply->nlmsg_type) {
+ case NLMSG_ERROR:
+ fprintf(out, "Error message received.\n");
+ fflush(out);
+ break;
+ case NLMSG_DONE:
+ data = (struct cn_msg *)NLMSG_DATA(reply);
+
+ time(&tm);
+ fprintf(out, "%.24s : [%x.%x] [%08u.%08u].\n",
+ ctime(&tm), data->id.idx, data->id.val, data->seq, data->ack);
+ fflush(out);
+ break;
+ default:
+ break;
+ }
+ }
+
+ close(s);
+ return 0;
+}
diff --git a/samples/hidraw/.gitignore b/samples/hidraw/.gitignore
new file mode 100644
index 000000000..05e51a685
--- /dev/null
+++ b/samples/hidraw/.gitignore
@@ -0,0 +1 @@
+hid-example
diff --git a/samples/hidraw/Makefile b/samples/hidraw/Makefile
new file mode 100644
index 000000000..dec1b22ad
--- /dev/null
+++ b/samples/hidraw/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
+# List of programs to build
+hostprogs-y := hid-example
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
+
+HOSTCFLAGS_hid-example.o += -I$(objtree)/usr/include
+
+all: hid-example
diff --git a/samples/hidraw/hid-example.c b/samples/hidraw/hid-example.c
new file mode 100644
index 000000000..9bfd8ff6d
--- /dev/null
+++ b/samples/hidraw/hid-example.c
@@ -0,0 +1,182 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Hidraw Userspace Example
+ *
+ * Copyright (c) 2010 Alan Ott <alan@signal11.us>
+ * Copyright (c) 2010 Signal 11 Software
+ *
+ * The code may be used by anyone for any purpose,
+ * and can serve as a starting point for developing
+ * applications using hidraw.
+ */
+
+/* Linux */
+#include <linux/types.h>
+#include <linux/input.h>
+#include <linux/hidraw.h>
+
+/*
+ * Ugly hack to work around failing compilation on systems that don't
+ * yet populate new version of hidraw.h to userspace.
+ */
+#ifndef HIDIOCSFEATURE
+#warning Please have your distro update the userspace kernel headers
+#define HIDIOCSFEATURE(len) _IOC(_IOC_WRITE|_IOC_READ, 'H', 0x06, len)
+#define HIDIOCGFEATURE(len) _IOC(_IOC_WRITE|_IOC_READ, 'H', 0x07, len)
+#endif
+
+/* Unix */
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+/* C */
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <errno.h>
+
+const char *bus_str(int bus);
+
+int main(int argc, char **argv)
+{
+ int fd;
+ int i, res, desc_size = 0;
+ char buf[256];
+ struct hidraw_report_descriptor rpt_desc;
+ struct hidraw_devinfo info;
+ char *device = "/dev/hidraw0";
+
+ if (argc > 1)
+ device = argv[1];
+
+ /* Open the Device with non-blocking reads. In real life,
+ don't use a hard coded path; use libudev instead. */
+ fd = open(device, O_RDWR|O_NONBLOCK);
+
+ if (fd < 0) {
+ perror("Unable to open device");
+ return 1;
+ }
+
+ memset(&rpt_desc, 0x0, sizeof(rpt_desc));
+ memset(&info, 0x0, sizeof(info));
+ memset(buf, 0x0, sizeof(buf));
+
+ /* Get Report Descriptor Size */
+ res = ioctl(fd, HIDIOCGRDESCSIZE, &desc_size);
+ if (res < 0)
+ perror("HIDIOCGRDESCSIZE");
+ else
+ printf("Report Descriptor Size: %d\n", desc_size);
+
+ /* Get Report Descriptor */
+ rpt_desc.size = desc_size;
+ res = ioctl(fd, HIDIOCGRDESC, &rpt_desc);
+ if (res < 0) {
+ perror("HIDIOCGRDESC");
+ } else {
+ printf("Report Descriptor:\n");
+ for (i = 0; i < rpt_desc.size; i++)
+ printf("%hhx ", rpt_desc.value[i]);
+ puts("\n");
+ }
+
+ /* Get Raw Name */
+ res = ioctl(fd, HIDIOCGRAWNAME(256), buf);
+ if (res < 0)
+ perror("HIDIOCGRAWNAME");
+ else
+ printf("Raw Name: %s\n", buf);
+
+ /* Get Physical Location */
+ res = ioctl(fd, HIDIOCGRAWPHYS(256), buf);
+ if (res < 0)
+ perror("HIDIOCGRAWPHYS");
+ else
+ printf("Raw Phys: %s\n", buf);
+
+ /* Get Raw Info */
+ res = ioctl(fd, HIDIOCGRAWINFO, &info);
+ if (res < 0) {
+ perror("HIDIOCGRAWINFO");
+ } else {
+ printf("Raw Info:\n");
+ printf("\tbustype: %d (%s)\n",
+ info.bustype, bus_str(info.bustype));
+ printf("\tvendor: 0x%04hx\n", info.vendor);
+ printf("\tproduct: 0x%04hx\n", info.product);
+ }
+
+ /* Set Feature */
+ buf[0] = 0x9; /* Report Number */
+ buf[1] = 0xff;
+ buf[2] = 0xff;
+ buf[3] = 0xff;
+ res = ioctl(fd, HIDIOCSFEATURE(4), buf);
+ if (res < 0)
+ perror("HIDIOCSFEATURE");
+ else
+ printf("ioctl HIDIOCGFEATURE returned: %d\n", res);
+
+ /* Get Feature */
+ buf[0] = 0x9; /* Report Number */
+ res = ioctl(fd, HIDIOCGFEATURE(256), buf);
+ if (res < 0) {
+ perror("HIDIOCGFEATURE");
+ } else {
+ printf("ioctl HIDIOCGFEATURE returned: %d\n", res);
+ printf("Report data (not containing the report number):\n\t");
+ for (i = 0; i < res; i++)
+ printf("%hhx ", buf[i]);
+ puts("\n");
+ }
+
+ /* Send a Report to the Device */
+ buf[0] = 0x1; /* Report Number */
+ buf[1] = 0x77;
+ res = write(fd, buf, 2);
+ if (res < 0) {
+ printf("Error: %d\n", errno);
+ perror("write");
+ } else {
+ printf("write() wrote %d bytes\n", res);
+ }
+
+ /* Get a report from the device */
+ res = read(fd, buf, 16);
+ if (res < 0) {
+ perror("read");
+ } else {
+ printf("read() read %d bytes:\n\t", res);
+ for (i = 0; i < res; i++)
+ printf("%hhx ", buf[i]);
+ puts("\n");
+ }
+ close(fd);
+ return 0;
+}
+
+const char *
+bus_str(int bus)
+{
+ switch (bus) {
+ case BUS_USB:
+ return "USB";
+ break;
+ case BUS_HIL:
+ return "HIL";
+ break;
+ case BUS_BLUETOOTH:
+ return "Bluetooth";
+ break;
+ case BUS_VIRTUAL:
+ return "Virtual";
+ break;
+ default:
+ return "Other";
+ break;
+ }
+}
diff --git a/samples/hw_breakpoint/Makefile b/samples/hw_breakpoint/Makefile
new file mode 100644
index 000000000..0f5c31c2f
--- /dev/null
+++ b/samples/hw_breakpoint/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_SAMPLE_HW_BREAKPOINT) += data_breakpoint.o
diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c
new file mode 100644
index 000000000..ef7f32291
--- /dev/null
+++ b/samples/hw_breakpoint/data_breakpoint.c
@@ -0,0 +1,90 @@
+/*
+ * data_breakpoint.c - Sample HW Breakpoint file to watch kernel data address
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * usage: insmod data_breakpoint.ko ksym=<ksym_name>
+ *
+ * This file is a kernel module that places a breakpoint over ksym_name kernel
+ * variable using Hardware Breakpoint register. The corresponding handler which
+ * prints a backtrace is invoked every time a write operation is performed on
+ * that variable.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ *
+ * Author: K.Prasad <prasad@linux.vnet.ibm.com>
+ */
+#include <linux/module.h> /* Needed by all modules */
+#include <linux/kernel.h> /* Needed for KERN_INFO */
+#include <linux/init.h> /* Needed for the macros */
+#include <linux/kallsyms.h>
+
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
+
+struct perf_event * __percpu *sample_hbp;
+
+static char ksym_name[KSYM_NAME_LEN] = "pid_max";
+module_param_string(ksym, ksym_name, KSYM_NAME_LEN, S_IRUGO);
+MODULE_PARM_DESC(ksym, "Kernel symbol to monitor; this module will report any"
+ " write operations on the kernel symbol");
+
+static void sample_hbp_handler(struct perf_event *bp,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ printk(KERN_INFO "%s value is changed\n", ksym_name);
+ dump_stack();
+ printk(KERN_INFO "Dump stack from sample_hbp_handler\n");
+}
+
+static int __init hw_break_module_init(void)
+{
+ int ret;
+ struct perf_event_attr attr;
+
+ hw_breakpoint_init(&attr);
+ attr.bp_addr = kallsyms_lookup_name(ksym_name);
+ attr.bp_len = HW_BREAKPOINT_LEN_4;
+ attr.bp_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
+
+ sample_hbp = register_wide_hw_breakpoint(&attr, sample_hbp_handler, NULL);
+ if (IS_ERR((void __force *)sample_hbp)) {
+ ret = PTR_ERR((void __force *)sample_hbp);
+ goto fail;
+ }
+
+ printk(KERN_INFO "HW Breakpoint for %s write installed\n", ksym_name);
+
+ return 0;
+
+fail:
+ printk(KERN_INFO "Breakpoint registration failed\n");
+
+ return ret;
+}
+
+static void __exit hw_break_module_exit(void)
+{
+ unregister_wide_hw_breakpoint(sample_hbp);
+ printk(KERN_INFO "HW Breakpoint for %s write uninstalled\n", ksym_name);
+}
+
+module_init(hw_break_module_init);
+module_exit(hw_break_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("K.Prasad");
+MODULE_DESCRIPTION("ksym breakpoint");
diff --git a/samples/kdb/Makefile b/samples/kdb/Makefile
new file mode 100644
index 000000000..fbedf39d9
--- /dev/null
+++ b/samples/kdb/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_SAMPLE_KDB) += kdb_hello.o
diff --git a/samples/kdb/kdb_hello.c b/samples/kdb/kdb_hello.c
new file mode 100644
index 000000000..c1c2fa0f6
--- /dev/null
+++ b/samples/kdb/kdb_hello.c
@@ -0,0 +1,60 @@
+/*
+ * Created by: Jason Wessel <jason.wessel@windriver.com>
+ *
+ * Copyright (c) 2010 Wind River Systems, Inc. All Rights Reserved.
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/module.h>
+#include <linux/kdb.h>
+
+/*
+ * All kdb shell command call backs receive argc and argv, where
+ * argv[0] is the command the end user typed
+ */
+static int kdb_hello_cmd(int argc, const char **argv)
+{
+ if (argc > 1)
+ return KDB_ARGCOUNT;
+
+ if (argc)
+ kdb_printf("Hello %s.\n", argv[1]);
+ else
+ kdb_printf("Hello world!\n");
+
+ return 0;
+}
+
+
+static int __init kdb_hello_cmd_init(void)
+{
+ /*
+ * Registration of a dynamically added kdb command is done with
+ * kdb_register() with the arguments being:
+ * 1: The name of the shell command
+ * 2: The function that processes the command
+ * 3: Description of the usage of any arguments
+ * 4: Descriptive text when you run help
+ * 5: Number of characters to complete the command
+ * 0 == type the whole command
+ * 1 == match both "g" and "go" for example
+ */
+ kdb_register("hello", kdb_hello_cmd, "[string]",
+ "Say Hello World or Hello [string]", 0);
+ return 0;
+}
+
+static void __exit kdb_hello_cmd_exit(void)
+{
+ kdb_unregister("hello");
+}
+
+module_init(kdb_hello_cmd_init);
+module_exit(kdb_hello_cmd_exit);
+
+MODULE_AUTHOR("WindRiver");
+MODULE_DESCRIPTION("KDB example to add a hello command");
+MODULE_LICENSE("GPL");
diff --git a/samples/kfifo/Makefile b/samples/kfifo/Makefile
new file mode 100644
index 000000000..bcc9484a1
--- /dev/null
+++ b/samples/kfifo/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_SAMPLE_KFIFO) += bytestream-example.o dma-example.o inttype-example.o record-example.o
diff --git a/samples/kfifo/bytestream-example.c b/samples/kfifo/bytestream-example.c
new file mode 100644
index 000000000..a7f5ee8b6
--- /dev/null
+++ b/samples/kfifo/bytestream-example.c
@@ -0,0 +1,198 @@
+/*
+ * Sample kfifo byte stream implementation
+ *
+ * Copyright (C) 2010 Stefani Seibold <stefani@seibold.net>
+ *
+ * Released under the GPL version 2 only.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/mutex.h>
+#include <linux/kfifo.h>
+
+/*
+ * This module shows how to create a byte stream fifo.
+ */
+
+/* fifo size in elements (bytes) */
+#define FIFO_SIZE 32
+
+/* name of the proc entry */
+#define PROC_FIFO "bytestream-fifo"
+
+/* lock for procfs read access */
+static DEFINE_MUTEX(read_lock);
+
+/* lock for procfs write access */
+static DEFINE_MUTEX(write_lock);
+
+/*
+ * define DYNAMIC in this example for a dynamically allocated fifo.
+ *
+ * Otherwise the fifo storage will be a part of the fifo structure.
+ */
+#if 0
+#define DYNAMIC
+#endif
+
+#ifdef DYNAMIC
+static struct kfifo test;
+#else
+static DECLARE_KFIFO(test, unsigned char, FIFO_SIZE);
+#endif
+
+static const unsigned char expected_result[FIFO_SIZE] = {
+ 3, 4, 5, 6, 7, 8, 9, 0,
+ 1, 20, 21, 22, 23, 24, 25, 26,
+ 27, 28, 29, 30, 31, 32, 33, 34,
+ 35, 36, 37, 38, 39, 40, 41, 42,
+};
+
+static int __init testfunc(void)
+{
+ unsigned char buf[6];
+ unsigned char i, j;
+ unsigned int ret;
+
+ printk(KERN_INFO "byte stream fifo test start\n");
+
+ /* put string into the fifo */
+ kfifo_in(&test, "hello", 5);
+
+ /* put values into the fifo */
+ for (i = 0; i != 10; i++)
+ kfifo_put(&test, i);
+
+ /* show the number of used elements */
+ printk(KERN_INFO "fifo len: %u\n", kfifo_len(&test));
+
+ /* get max of 5 bytes from the fifo */
+ i = kfifo_out(&test, buf, 5);
+ printk(KERN_INFO "buf: %.*s\n", i, buf);
+
+ /* get max of 2 elements from the fifo */
+ ret = kfifo_out(&test, buf, 2);
+ printk(KERN_INFO "ret: %d\n", ret);
+ /* and put it back to the end of the fifo */
+ ret = kfifo_in(&test, buf, ret);
+ printk(KERN_INFO "ret: %d\n", ret);
+
+ /* skip first element of the fifo */
+ printk(KERN_INFO "skip 1st element\n");
+ kfifo_skip(&test);
+
+ /* put values into the fifo until is full */
+ for (i = 20; kfifo_put(&test, i); i++)
+ ;
+
+ printk(KERN_INFO "queue len: %u\n", kfifo_len(&test));
+
+ /* show the first value without removing from the fifo */
+ if (kfifo_peek(&test, &i))
+ printk(KERN_INFO "%d\n", i);
+
+ /* check the correctness of all values in the fifo */
+ j = 0;
+ while (kfifo_get(&test, &i)) {
+ printk(KERN_INFO "item = %d\n", i);
+ if (i != expected_result[j++]) {
+ printk(KERN_WARNING "value mismatch: test failed\n");
+ return -EIO;
+ }
+ }
+ if (j != ARRAY_SIZE(expected_result)) {
+ printk(KERN_WARNING "size mismatch: test failed\n");
+ return -EIO;
+ }
+ printk(KERN_INFO "test passed\n");
+
+ return 0;
+}
+
+static ssize_t fifo_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ int ret;
+ unsigned int copied;
+
+ if (mutex_lock_interruptible(&write_lock))
+ return -ERESTARTSYS;
+
+ ret = kfifo_from_user(&test, buf, count, &copied);
+
+ mutex_unlock(&write_lock);
+ if (ret)
+ return ret;
+
+ return copied;
+}
+
+static ssize_t fifo_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ int ret;
+ unsigned int copied;
+
+ if (mutex_lock_interruptible(&read_lock))
+ return -ERESTARTSYS;
+
+ ret = kfifo_to_user(&test, buf, count, &copied);
+
+ mutex_unlock(&read_lock);
+ if (ret)
+ return ret;
+
+ return copied;
+}
+
+static const struct file_operations fifo_fops = {
+ .owner = THIS_MODULE,
+ .read = fifo_read,
+ .write = fifo_write,
+ .llseek = noop_llseek,
+};
+
+static int __init example_init(void)
+{
+#ifdef DYNAMIC
+ int ret;
+
+ ret = kfifo_alloc(&test, FIFO_SIZE, GFP_KERNEL);
+ if (ret) {
+ printk(KERN_ERR "error kfifo_alloc\n");
+ return ret;
+ }
+#else
+ INIT_KFIFO(test);
+#endif
+ if (testfunc() < 0) {
+#ifdef DYNAMIC
+ kfifo_free(&test);
+#endif
+ return -EIO;
+ }
+
+ if (proc_create(PROC_FIFO, 0, NULL, &fifo_fops) == NULL) {
+#ifdef DYNAMIC
+ kfifo_free(&test);
+#endif
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+static void __exit example_exit(void)
+{
+ remove_proc_entry(PROC_FIFO, NULL);
+#ifdef DYNAMIC
+ kfifo_free(&test);
+#endif
+}
+
+module_init(example_init);
+module_exit(example_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Stefani Seibold <stefani@seibold.net>");
diff --git a/samples/kfifo/dma-example.c b/samples/kfifo/dma-example.c
new file mode 100644
index 000000000..be0d4a5fd
--- /dev/null
+++ b/samples/kfifo/dma-example.c
@@ -0,0 +1,143 @@
+/*
+ * Sample fifo dma implementation
+ *
+ * Copyright (C) 2010 Stefani Seibold <stefani@seibold.net>
+ *
+ * Released under the GPL version 2 only.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kfifo.h>
+
+/*
+ * This module shows how to handle fifo dma operations.
+ */
+
+/* fifo size in elements (bytes) */
+#define FIFO_SIZE 32
+
+static struct kfifo fifo;
+
+static int __init example_init(void)
+{
+ int i;
+ unsigned int ret;
+ unsigned int nents;
+ struct scatterlist sg[10];
+
+ printk(KERN_INFO "DMA fifo test start\n");
+
+ if (kfifo_alloc(&fifo, FIFO_SIZE, GFP_KERNEL)) {
+ printk(KERN_WARNING "error kfifo_alloc\n");
+ return -ENOMEM;
+ }
+
+ printk(KERN_INFO "queue size: %u\n", kfifo_size(&fifo));
+
+ kfifo_in(&fifo, "test", 4);
+
+ for (i = 0; i != 9; i++)
+ kfifo_put(&fifo, i);
+
+ /* kick away first byte */
+ kfifo_skip(&fifo);
+
+ printk(KERN_INFO "queue len: %u\n", kfifo_len(&fifo));
+
+ /*
+ * Configure the kfifo buffer to receive data from DMA input.
+ *
+ * .--------------------------------------.
+ * | 0 | 1 | 2 | ... | 12 | 13 | ... | 31 |
+ * |---|------------------|---------------|
+ * \_/ \________________/ \_____________/
+ * \ \ \
+ * \ \_allocated data \
+ * \_*free space* \_*free space*
+ *
+ * We need two different SG entries: one for the free space area at the
+ * end of the kfifo buffer (19 bytes) and another for the first free
+ * byte at the beginning, after the kfifo_skip().
+ */
+ sg_init_table(sg, ARRAY_SIZE(sg));
+ nents = kfifo_dma_in_prepare(&fifo, sg, ARRAY_SIZE(sg), FIFO_SIZE);
+ printk(KERN_INFO "DMA sgl entries: %d\n", nents);
+ if (!nents) {
+ /* fifo is full and no sgl was created */
+ printk(KERN_WARNING "error kfifo_dma_in_prepare\n");
+ return -EIO;
+ }
+
+ /* receive data */
+ printk(KERN_INFO "scatterlist for receive:\n");
+ for (i = 0; i < nents; i++) {
+ printk(KERN_INFO
+ "sg[%d] -> "
+ "page %p offset 0x%.8x length 0x%.8x\n",
+ i, sg_page(&sg[i]), sg[i].offset, sg[i].length);
+
+ if (sg_is_last(&sg[i]))
+ break;
+ }
+
+ /* put here your code to setup and exectute the dma operation */
+ /* ... */
+
+ /* example: zero bytes received */
+ ret = 0;
+
+ /* finish the dma operation and update the received data */
+ kfifo_dma_in_finish(&fifo, ret);
+
+ /* Prepare to transmit data, example: 8 bytes */
+ nents = kfifo_dma_out_prepare(&fifo, sg, ARRAY_SIZE(sg), 8);
+ printk(KERN_INFO "DMA sgl entries: %d\n", nents);
+ if (!nents) {
+ /* no data was available and no sgl was created */
+ printk(KERN_WARNING "error kfifo_dma_out_prepare\n");
+ return -EIO;
+ }
+
+ printk(KERN_INFO "scatterlist for transmit:\n");
+ for (i = 0; i < nents; i++) {
+ printk(KERN_INFO
+ "sg[%d] -> "
+ "page %p offset 0x%.8x length 0x%.8x\n",
+ i, sg_page(&sg[i]), sg[i].offset, sg[i].length);
+
+ if (sg_is_last(&sg[i]))
+ break;
+ }
+
+ /* put here your code to setup and exectute the dma operation */
+ /* ... */
+
+ /* example: 5 bytes transmitted */
+ ret = 5;
+
+ /* finish the dma operation and update the transmitted data */
+ kfifo_dma_out_finish(&fifo, ret);
+
+ ret = kfifo_len(&fifo);
+ printk(KERN_INFO "queue len: %u\n", kfifo_len(&fifo));
+
+ if (ret != 7) {
+ printk(KERN_WARNING "size mismatch: test failed");
+ return -EIO;
+ }
+ printk(KERN_INFO "test passed\n");
+
+ return 0;
+}
+
+static void __exit example_exit(void)
+{
+ kfifo_free(&fifo);
+}
+
+module_init(example_init);
+module_exit(example_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Stefani Seibold <stefani@seibold.net>");
diff --git a/samples/kfifo/inttype-example.c b/samples/kfifo/inttype-example.c
new file mode 100644
index 000000000..a326a37e9
--- /dev/null
+++ b/samples/kfifo/inttype-example.c
@@ -0,0 +1,189 @@
+/*
+ * Sample kfifo int type implementation
+ *
+ * Copyright (C) 2010 Stefani Seibold <stefani@seibold.net>
+ *
+ * Released under the GPL version 2 only.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/mutex.h>
+#include <linux/kfifo.h>
+
+/*
+ * This module shows how to create a int type fifo.
+ */
+
+/* fifo size in elements (ints) */
+#define FIFO_SIZE 32
+
+/* name of the proc entry */
+#define PROC_FIFO "int-fifo"
+
+/* lock for procfs read access */
+static DEFINE_MUTEX(read_lock);
+
+/* lock for procfs write access */
+static DEFINE_MUTEX(write_lock);
+
+/*
+ * define DYNAMIC in this example for a dynamically allocated fifo.
+ *
+ * Otherwise the fifo storage will be a part of the fifo structure.
+ */
+#if 0
+#define DYNAMIC
+#endif
+
+#ifdef DYNAMIC
+static DECLARE_KFIFO_PTR(test, int);
+#else
+static DEFINE_KFIFO(test, int, FIFO_SIZE);
+#endif
+
+static const int expected_result[FIFO_SIZE] = {
+ 3, 4, 5, 6, 7, 8, 9, 0,
+ 1, 20, 21, 22, 23, 24, 25, 26,
+ 27, 28, 29, 30, 31, 32, 33, 34,
+ 35, 36, 37, 38, 39, 40, 41, 42,
+};
+
+static int __init testfunc(void)
+{
+ int buf[6];
+ int i, j;
+ unsigned int ret;
+
+ printk(KERN_INFO "int fifo test start\n");
+
+ /* put values into the fifo */
+ for (i = 0; i != 10; i++)
+ kfifo_put(&test, i);
+
+ /* show the number of used elements */
+ printk(KERN_INFO "fifo len: %u\n", kfifo_len(&test));
+
+ /* get max of 2 elements from the fifo */
+ ret = kfifo_out(&test, buf, 2);
+ printk(KERN_INFO "ret: %d\n", ret);
+ /* and put it back to the end of the fifo */
+ ret = kfifo_in(&test, buf, ret);
+ printk(KERN_INFO "ret: %d\n", ret);
+
+ /* skip first element of the fifo */
+ printk(KERN_INFO "skip 1st element\n");
+ kfifo_skip(&test);
+
+ /* put values into the fifo until is full */
+ for (i = 20; kfifo_put(&test, i); i++)
+ ;
+
+ printk(KERN_INFO "queue len: %u\n", kfifo_len(&test));
+
+ /* show the first value without removing from the fifo */
+ if (kfifo_peek(&test, &i))
+ printk(KERN_INFO "%d\n", i);
+
+ /* check the correctness of all values in the fifo */
+ j = 0;
+ while (kfifo_get(&test, &i)) {
+ printk(KERN_INFO "item = %d\n", i);
+ if (i != expected_result[j++]) {
+ printk(KERN_WARNING "value mismatch: test failed\n");
+ return -EIO;
+ }
+ }
+ if (j != ARRAY_SIZE(expected_result)) {
+ printk(KERN_WARNING "size mismatch: test failed\n");
+ return -EIO;
+ }
+ printk(KERN_INFO "test passed\n");
+
+ return 0;
+}
+
+static ssize_t fifo_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ int ret;
+ unsigned int copied;
+
+ if (mutex_lock_interruptible(&write_lock))
+ return -ERESTARTSYS;
+
+ ret = kfifo_from_user(&test, buf, count, &copied);
+
+ mutex_unlock(&write_lock);
+ if (ret)
+ return ret;
+
+ return copied;
+}
+
+static ssize_t fifo_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ int ret;
+ unsigned int copied;
+
+ if (mutex_lock_interruptible(&read_lock))
+ return -ERESTARTSYS;
+
+ ret = kfifo_to_user(&test, buf, count, &copied);
+
+ mutex_unlock(&read_lock);
+ if (ret)
+ return ret;
+
+ return copied;
+}
+
+static const struct file_operations fifo_fops = {
+ .owner = THIS_MODULE,
+ .read = fifo_read,
+ .write = fifo_write,
+ .llseek = noop_llseek,
+};
+
+static int __init example_init(void)
+{
+#ifdef DYNAMIC
+ int ret;
+
+ ret = kfifo_alloc(&test, FIFO_SIZE, GFP_KERNEL);
+ if (ret) {
+ printk(KERN_ERR "error kfifo_alloc\n");
+ return ret;
+ }
+#endif
+ if (testfunc() < 0) {
+#ifdef DYNAMIC
+ kfifo_free(&test);
+#endif
+ return -EIO;
+ }
+
+ if (proc_create(PROC_FIFO, 0, NULL, &fifo_fops) == NULL) {
+#ifdef DYNAMIC
+ kfifo_free(&test);
+#endif
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+static void __exit example_exit(void)
+{
+ remove_proc_entry(PROC_FIFO, NULL);
+#ifdef DYNAMIC
+ kfifo_free(&test);
+#endif
+}
+
+module_init(example_init);
+module_exit(example_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Stefani Seibold <stefani@seibold.net>");
diff --git a/samples/kfifo/record-example.c b/samples/kfifo/record-example.c
new file mode 100644
index 000000000..deb87a2e4
--- /dev/null
+++ b/samples/kfifo/record-example.c
@@ -0,0 +1,205 @@
+/*
+ * Sample dynamic sized record fifo implementation
+ *
+ * Copyright (C) 2010 Stefani Seibold <stefani@seibold.net>
+ *
+ * Released under the GPL version 2 only.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/mutex.h>
+#include <linux/kfifo.h>
+
+/*
+ * This module shows how to create a variable sized record fifo.
+ */
+
+/* fifo size in elements (bytes) */
+#define FIFO_SIZE 128
+
+/* name of the proc entry */
+#define PROC_FIFO "record-fifo"
+
+/* lock for procfs read access */
+static DEFINE_MUTEX(read_lock);
+
+/* lock for procfs write access */
+static DEFINE_MUTEX(write_lock);
+
+/*
+ * define DYNAMIC in this example for a dynamically allocated fifo.
+ *
+ * Otherwise the fifo storage will be a part of the fifo structure.
+ */
+#if 0
+#define DYNAMIC
+#endif
+
+/*
+ * struct kfifo_rec_ptr_1 and STRUCT_KFIFO_REC_1 can handle records of a
+ * length between 0 and 255 bytes.
+ *
+ * struct kfifo_rec_ptr_2 and STRUCT_KFIFO_REC_2 can handle records of a
+ * length between 0 and 65535 bytes.
+ */
+
+#ifdef DYNAMIC
+struct kfifo_rec_ptr_1 test;
+
+#else
+typedef STRUCT_KFIFO_REC_1(FIFO_SIZE) mytest;
+
+static mytest test;
+#endif
+
+static const char *expected_result[] = {
+ "a",
+ "bb",
+ "ccc",
+ "dddd",
+ "eeeee",
+ "ffffff",
+ "ggggggg",
+ "hhhhhhhh",
+ "iiiiiiiii",
+ "jjjjjjjjjj",
+};
+
+static int __init testfunc(void)
+{
+ char buf[100];
+ unsigned int i;
+ unsigned int ret;
+ struct { unsigned char buf[6]; } hello = { "hello" };
+
+ printk(KERN_INFO "record fifo test start\n");
+
+ kfifo_in(&test, &hello, sizeof(hello));
+
+ /* show the size of the next record in the fifo */
+ printk(KERN_INFO "fifo peek len: %u\n", kfifo_peek_len(&test));
+
+ /* put in variable length data */
+ for (i = 0; i < 10; i++) {
+ memset(buf, 'a' + i, i + 1);
+ kfifo_in(&test, buf, i + 1);
+ }
+
+ /* skip first element of the fifo */
+ printk(KERN_INFO "skip 1st element\n");
+ kfifo_skip(&test);
+
+ printk(KERN_INFO "fifo len: %u\n", kfifo_len(&test));
+
+ /* show the first record without removing from the fifo */
+ ret = kfifo_out_peek(&test, buf, sizeof(buf));
+ if (ret)
+ printk(KERN_INFO "%.*s\n", ret, buf);
+
+ /* check the correctness of all values in the fifo */
+ i = 0;
+ while (!kfifo_is_empty(&test)) {
+ ret = kfifo_out(&test, buf, sizeof(buf));
+ buf[ret] = '\0';
+ printk(KERN_INFO "item = %.*s\n", ret, buf);
+ if (strcmp(buf, expected_result[i++])) {
+ printk(KERN_WARNING "value mismatch: test failed\n");
+ return -EIO;
+ }
+ }
+ if (i != ARRAY_SIZE(expected_result)) {
+ printk(KERN_WARNING "size mismatch: test failed\n");
+ return -EIO;
+ }
+ printk(KERN_INFO "test passed\n");
+
+ return 0;
+}
+
+static ssize_t fifo_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ int ret;
+ unsigned int copied;
+
+ if (mutex_lock_interruptible(&write_lock))
+ return -ERESTARTSYS;
+
+ ret = kfifo_from_user(&test, buf, count, &copied);
+
+ mutex_unlock(&write_lock);
+ if (ret)
+ return ret;
+
+ return copied;
+}
+
+static ssize_t fifo_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ int ret;
+ unsigned int copied;
+
+ if (mutex_lock_interruptible(&read_lock))
+ return -ERESTARTSYS;
+
+ ret = kfifo_to_user(&test, buf, count, &copied);
+
+ mutex_unlock(&read_lock);
+ if (ret)
+ return ret;
+
+ return copied;
+}
+
+static const struct file_operations fifo_fops = {
+ .owner = THIS_MODULE,
+ .read = fifo_read,
+ .write = fifo_write,
+ .llseek = noop_llseek,
+};
+
+static int __init example_init(void)
+{
+#ifdef DYNAMIC
+ int ret;
+
+ ret = kfifo_alloc(&test, FIFO_SIZE, GFP_KERNEL);
+ if (ret) {
+ printk(KERN_ERR "error kfifo_alloc\n");
+ return ret;
+ }
+#else
+ INIT_KFIFO(test);
+#endif
+ if (testfunc() < 0) {
+#ifdef DYNAMIC
+ kfifo_free(&test);
+#endif
+ return -EIO;
+ }
+
+ if (proc_create(PROC_FIFO, 0, NULL, &fifo_fops) == NULL) {
+#ifdef DYNAMIC
+ kfifo_free(&test);
+#endif
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+static void __exit example_exit(void)
+{
+ remove_proc_entry(PROC_FIFO, NULL);
+#ifdef DYNAMIC
+ kfifo_free(&test);
+#endif
+}
+
+module_init(example_init);
+module_exit(example_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Stefani Seibold <stefani@seibold.net>");
diff --git a/samples/kobject/Makefile b/samples/kobject/Makefile
new file mode 100644
index 000000000..4a194203c
--- /dev/null
+++ b/samples/kobject/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_SAMPLE_KOBJECT) += kobject-example.o kset-example.o
diff --git a/samples/kobject/kobject-example.c b/samples/kobject/kobject-example.c
new file mode 100644
index 000000000..9e383fdba
--- /dev/null
+++ b/samples/kobject/kobject-example.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Sample kobject implementation
+ *
+ * Copyright (C) 2004-2007 Greg Kroah-Hartman <greg@kroah.com>
+ * Copyright (C) 2007 Novell Inc.
+ */
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+/*
+ * This module shows how to create a simple subdirectory in sysfs called
+ * /sys/kernel/kobject-example In that directory, 3 files are created:
+ * "foo", "baz", and "bar". If an integer is written to these files, it can be
+ * later read out of it.
+ */
+
+static int foo;
+static int baz;
+static int bar;
+
+/*
+ * The "foo" file where a static variable is read from and written to.
+ */
+static ssize_t foo_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%d\n", foo);
+}
+
+static ssize_t foo_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int ret;
+
+ ret = kstrtoint(buf, 10, &foo);
+ if (ret < 0)
+ return ret;
+
+ return count;
+}
+
+/* Sysfs attributes cannot be world-writable. */
+static struct kobj_attribute foo_attribute =
+ __ATTR(foo, 0664, foo_show, foo_store);
+
+/*
+ * More complex function where we determine which variable is being accessed by
+ * looking at the attribute for the "baz" and "bar" files.
+ */
+static ssize_t b_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ int var;
+
+ if (strcmp(attr->attr.name, "baz") == 0)
+ var = baz;
+ else
+ var = bar;
+ return sprintf(buf, "%d\n", var);
+}
+
+static ssize_t b_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int var, ret;
+
+ ret = kstrtoint(buf, 10, &var);
+ if (ret < 0)
+ return ret;
+
+ if (strcmp(attr->attr.name, "baz") == 0)
+ baz = var;
+ else
+ bar = var;
+ return count;
+}
+
+static struct kobj_attribute baz_attribute =
+ __ATTR(baz, 0664, b_show, b_store);
+static struct kobj_attribute bar_attribute =
+ __ATTR(bar, 0664, b_show, b_store);
+
+
+/*
+ * Create a group of attributes so that we can create and destroy them all
+ * at once.
+ */
+static struct attribute *attrs[] = {
+ &foo_attribute.attr,
+ &baz_attribute.attr,
+ &bar_attribute.attr,
+ NULL, /* need to NULL terminate the list of attributes */
+};
+
+/*
+ * An unnamed attribute group will put all of the attributes directly in
+ * the kobject directory. If we specify a name, a subdirectory will be
+ * created for the attributes with the directory being the name of the
+ * attribute group.
+ */
+static struct attribute_group attr_group = {
+ .attrs = attrs,
+};
+
+static struct kobject *example_kobj;
+
+static int __init example_init(void)
+{
+ int retval;
+
+ /*
+ * Create a simple kobject with the name of "kobject_example",
+ * located under /sys/kernel/
+ *
+ * As this is a simple directory, no uevent will be sent to
+ * userspace. That is why this function should not be used for
+ * any type of dynamic kobjects, where the name and number are
+ * not known ahead of time.
+ */
+ example_kobj = kobject_create_and_add("kobject_example", kernel_kobj);
+ if (!example_kobj)
+ return -ENOMEM;
+
+ /* Create the files associated with this kobject */
+ retval = sysfs_create_group(example_kobj, &attr_group);
+ if (retval)
+ kobject_put(example_kobj);
+
+ return retval;
+}
+
+static void __exit example_exit(void)
+{
+ kobject_put(example_kobj);
+}
+
+module_init(example_init);
+module_exit(example_exit);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Greg Kroah-Hartman <greg@kroah.com>");
diff --git a/samples/kobject/kset-example.c b/samples/kobject/kset-example.c
new file mode 100644
index 000000000..401328fd6
--- /dev/null
+++ b/samples/kobject/kset-example.c
@@ -0,0 +1,287 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Sample kset and ktype implementation
+ *
+ * Copyright (C) 2004-2007 Greg Kroah-Hartman <greg@kroah.com>
+ * Copyright (C) 2007 Novell Inc.
+ */
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+/*
+ * This module shows how to create a kset in sysfs called
+ * /sys/kernel/kset-example
+ * Then tree kobjects are created and assigned to this kset, "foo", "baz",
+ * and "bar". In those kobjects, attributes of the same name are also
+ * created and if an integer is written to these files, it can be later
+ * read out of it.
+ */
+
+
+/*
+ * This is our "object" that we will create a few of and register them with
+ * sysfs.
+ */
+struct foo_obj {
+ struct kobject kobj;
+ int foo;
+ int baz;
+ int bar;
+};
+#define to_foo_obj(x) container_of(x, struct foo_obj, kobj)
+
+/* a custom attribute that works just for a struct foo_obj. */
+struct foo_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct foo_obj *foo, struct foo_attribute *attr, char *buf);
+ ssize_t (*store)(struct foo_obj *foo, struct foo_attribute *attr, const char *buf, size_t count);
+};
+#define to_foo_attr(x) container_of(x, struct foo_attribute, attr)
+
+/*
+ * The default show function that must be passed to sysfs. This will be
+ * called by sysfs for whenever a show function is called by the user on a
+ * sysfs file associated with the kobjects we have registered. We need to
+ * transpose back from a "default" kobject to our custom struct foo_obj and
+ * then call the show function for that specific object.
+ */
+static ssize_t foo_attr_show(struct kobject *kobj,
+ struct attribute *attr,
+ char *buf)
+{
+ struct foo_attribute *attribute;
+ struct foo_obj *foo;
+
+ attribute = to_foo_attr(attr);
+ foo = to_foo_obj(kobj);
+
+ if (!attribute->show)
+ return -EIO;
+
+ return attribute->show(foo, attribute, buf);
+}
+
+/*
+ * Just like the default show function above, but this one is for when the
+ * sysfs "store" is requested (when a value is written to a file.)
+ */
+static ssize_t foo_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct foo_attribute *attribute;
+ struct foo_obj *foo;
+
+ attribute = to_foo_attr(attr);
+ foo = to_foo_obj(kobj);
+
+ if (!attribute->store)
+ return -EIO;
+
+ return attribute->store(foo, attribute, buf, len);
+}
+
+/* Our custom sysfs_ops that we will associate with our ktype later on */
+static const struct sysfs_ops foo_sysfs_ops = {
+ .show = foo_attr_show,
+ .store = foo_attr_store,
+};
+
+/*
+ * The release function for our object. This is REQUIRED by the kernel to
+ * have. We free the memory held in our object here.
+ *
+ * NEVER try to get away with just a "blank" release function to try to be
+ * smarter than the kernel. Turns out, no one ever is...
+ */
+static void foo_release(struct kobject *kobj)
+{
+ struct foo_obj *foo;
+
+ foo = to_foo_obj(kobj);
+ kfree(foo);
+}
+
+/*
+ * The "foo" file where the .foo variable is read from and written to.
+ */
+static ssize_t foo_show(struct foo_obj *foo_obj, struct foo_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%d\n", foo_obj->foo);
+}
+
+static ssize_t foo_store(struct foo_obj *foo_obj, struct foo_attribute *attr,
+ const char *buf, size_t count)
+{
+ int ret;
+
+ ret = kstrtoint(buf, 10, &foo_obj->foo);
+ if (ret < 0)
+ return ret;
+
+ return count;
+}
+
+/* Sysfs attributes cannot be world-writable. */
+static struct foo_attribute foo_attribute =
+ __ATTR(foo, 0664, foo_show, foo_store);
+
+/*
+ * More complex function where we determine which variable is being accessed by
+ * looking at the attribute for the "baz" and "bar" files.
+ */
+static ssize_t b_show(struct foo_obj *foo_obj, struct foo_attribute *attr,
+ char *buf)
+{
+ int var;
+
+ if (strcmp(attr->attr.name, "baz") == 0)
+ var = foo_obj->baz;
+ else
+ var = foo_obj->bar;
+ return sprintf(buf, "%d\n", var);
+}
+
+static ssize_t b_store(struct foo_obj *foo_obj, struct foo_attribute *attr,
+ const char *buf, size_t count)
+{
+ int var, ret;
+
+ ret = kstrtoint(buf, 10, &var);
+ if (ret < 0)
+ return ret;
+
+ if (strcmp(attr->attr.name, "baz") == 0)
+ foo_obj->baz = var;
+ else
+ foo_obj->bar = var;
+ return count;
+}
+
+static struct foo_attribute baz_attribute =
+ __ATTR(baz, 0664, b_show, b_store);
+static struct foo_attribute bar_attribute =
+ __ATTR(bar, 0664, b_show, b_store);
+
+/*
+ * Create a group of attributes so that we can create and destroy them all
+ * at once.
+ */
+static struct attribute *foo_default_attrs[] = {
+ &foo_attribute.attr,
+ &baz_attribute.attr,
+ &bar_attribute.attr,
+ NULL, /* need to NULL terminate the list of attributes */
+};
+
+/*
+ * Our own ktype for our kobjects. Here we specify our sysfs ops, the
+ * release function, and the set of default attributes we want created
+ * whenever a kobject of this type is registered with the kernel.
+ */
+static struct kobj_type foo_ktype = {
+ .sysfs_ops = &foo_sysfs_ops,
+ .release = foo_release,
+ .default_attrs = foo_default_attrs,
+};
+
+static struct kset *example_kset;
+static struct foo_obj *foo_obj;
+static struct foo_obj *bar_obj;
+static struct foo_obj *baz_obj;
+
+static struct foo_obj *create_foo_obj(const char *name)
+{
+ struct foo_obj *foo;
+ int retval;
+
+ /* allocate the memory for the whole object */
+ foo = kzalloc(sizeof(*foo), GFP_KERNEL);
+ if (!foo)
+ return NULL;
+
+ /*
+ * As we have a kset for this kobject, we need to set it before calling
+ * the kobject core.
+ */
+ foo->kobj.kset = example_kset;
+
+ /*
+ * Initialize and add the kobject to the kernel. All the default files
+ * will be created here. As we have already specified a kset for this
+ * kobject, we don't have to set a parent for the kobject, the kobject
+ * will be placed beneath that kset automatically.
+ */
+ retval = kobject_init_and_add(&foo->kobj, &foo_ktype, NULL, "%s", name);
+ if (retval) {
+ kobject_put(&foo->kobj);
+ return NULL;
+ }
+
+ /*
+ * We are always responsible for sending the uevent that the kobject
+ * was added to the system.
+ */
+ kobject_uevent(&foo->kobj, KOBJ_ADD);
+
+ return foo;
+}
+
+static void destroy_foo_obj(struct foo_obj *foo)
+{
+ kobject_put(&foo->kobj);
+}
+
+static int __init example_init(void)
+{
+ /*
+ * Create a kset with the name of "kset_example",
+ * located under /sys/kernel/
+ */
+ example_kset = kset_create_and_add("kset_example", NULL, kernel_kobj);
+ if (!example_kset)
+ return -ENOMEM;
+
+ /*
+ * Create three objects and register them with our kset
+ */
+ foo_obj = create_foo_obj("foo");
+ if (!foo_obj)
+ goto foo_error;
+
+ bar_obj = create_foo_obj("bar");
+ if (!bar_obj)
+ goto bar_error;
+
+ baz_obj = create_foo_obj("baz");
+ if (!baz_obj)
+ goto baz_error;
+
+ return 0;
+
+baz_error:
+ destroy_foo_obj(bar_obj);
+bar_error:
+ destroy_foo_obj(foo_obj);
+foo_error:
+ kset_unregister(example_kset);
+ return -EINVAL;
+}
+
+static void __exit example_exit(void)
+{
+ destroy_foo_obj(baz_obj);
+ destroy_foo_obj(bar_obj);
+ destroy_foo_obj(foo_obj);
+ kset_unregister(example_kset);
+}
+
+module_init(example_init);
+module_exit(example_exit);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Greg Kroah-Hartman <greg@kroah.com>");
diff --git a/samples/kprobes/Makefile b/samples/kprobes/Makefile
new file mode 100644
index 000000000..880e54d2c
--- /dev/null
+++ b/samples/kprobes/Makefile
@@ -0,0 +1,5 @@
+# builds the kprobes example kernel modules;
+# then to use one (as root): insmod <module_name.ko>
+
+obj-$(CONFIG_SAMPLE_KPROBES) += kprobe_example.o
+obj-$(CONFIG_SAMPLE_KRETPROBES) += kretprobe_example.o
diff --git a/samples/kprobes/kprobe_example.c b/samples/kprobes/kprobe_example.c
new file mode 100644
index 000000000..02be8984c
--- /dev/null
+++ b/samples/kprobes/kprobe_example.c
@@ -0,0 +1,117 @@
+/*
+ * NOTE: This example is works on x86 and powerpc.
+ * Here's a sample kernel module showing the use of kprobes to dump a
+ * stack trace and selected registers when _do_fork() is called.
+ *
+ * For more information on theory of operation of kprobes, see
+ * Documentation/kprobes.txt
+ *
+ * You will see the trace data in /var/log/messages and on the console
+ * whenever _do_fork() is invoked to create a new process.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kprobes.h>
+
+#define MAX_SYMBOL_LEN 64
+static char symbol[MAX_SYMBOL_LEN] = "_do_fork";
+module_param_string(symbol, symbol, sizeof(symbol), 0644);
+
+/* For each probe you need to allocate a kprobe structure */
+static struct kprobe kp = {
+ .symbol_name = symbol,
+};
+
+/* kprobe pre_handler: called just before the probed instruction is executed */
+static int handler_pre(struct kprobe *p, struct pt_regs *regs)
+{
+#ifdef CONFIG_X86
+ pr_info("<%s> pre_handler: p->addr = 0x%p, ip = %lx, flags = 0x%lx\n",
+ p->symbol_name, p->addr, regs->ip, regs->flags);
+#endif
+#ifdef CONFIG_PPC
+ pr_info("<%s> pre_handler: p->addr = 0x%p, nip = 0x%lx, msr = 0x%lx\n",
+ p->symbol_name, p->addr, regs->nip, regs->msr);
+#endif
+#ifdef CONFIG_MIPS
+ pr_info("<%s> pre_handler: p->addr = 0x%p, epc = 0x%lx, status = 0x%lx\n",
+ p->symbol_name, p->addr, regs->cp0_epc, regs->cp0_status);
+#endif
+#ifdef CONFIG_ARM64
+ pr_info("<%s> pre_handler: p->addr = 0x%p, pc = 0x%lx,"
+ " pstate = 0x%lx\n",
+ p->symbol_name, p->addr, (long)regs->pc, (long)regs->pstate);
+#endif
+#ifdef CONFIG_S390
+ pr_info("<%s> pre_handler: p->addr, 0x%p, ip = 0x%lx, flags = 0x%lx\n",
+ p->symbol_name, p->addr, regs->psw.addr, regs->flags);
+#endif
+
+ /* A dump_stack() here will give a stack backtrace */
+ return 0;
+}
+
+/* kprobe post_handler: called after the probed instruction is executed */
+static void handler_post(struct kprobe *p, struct pt_regs *regs,
+ unsigned long flags)
+{
+#ifdef CONFIG_X86
+ pr_info("<%s> post_handler: p->addr = 0x%p, flags = 0x%lx\n",
+ p->symbol_name, p->addr, regs->flags);
+#endif
+#ifdef CONFIG_PPC
+ pr_info("<%s> post_handler: p->addr = 0x%p, msr = 0x%lx\n",
+ p->symbol_name, p->addr, regs->msr);
+#endif
+#ifdef CONFIG_MIPS
+ pr_info("<%s> post_handler: p->addr = 0x%p, status = 0x%lx\n",
+ p->symbol_name, p->addr, regs->cp0_status);
+#endif
+#ifdef CONFIG_ARM64
+ pr_info("<%s> post_handler: p->addr = 0x%p, pstate = 0x%lx\n",
+ p->symbol_name, p->addr, (long)regs->pstate);
+#endif
+#ifdef CONFIG_S390
+ pr_info("<%s> pre_handler: p->addr, 0x%p, flags = 0x%lx\n",
+ p->symbol_name, p->addr, regs->flags);
+#endif
+}
+
+/*
+ * fault_handler: this is called if an exception is generated for any
+ * instruction within the pre- or post-handler, or when Kprobes
+ * single-steps the probed instruction.
+ */
+static int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr)
+{
+ pr_info("fault_handler: p->addr = 0x%p, trap #%dn", p->addr, trapnr);
+ /* Return 0 because we don't handle the fault. */
+ return 0;
+}
+
+static int __init kprobe_init(void)
+{
+ int ret;
+ kp.pre_handler = handler_pre;
+ kp.post_handler = handler_post;
+ kp.fault_handler = handler_fault;
+
+ ret = register_kprobe(&kp);
+ if (ret < 0) {
+ pr_err("register_kprobe failed, returned %d\n", ret);
+ return ret;
+ }
+ pr_info("Planted kprobe at %p\n", kp.addr);
+ return 0;
+}
+
+static void __exit kprobe_exit(void)
+{
+ unregister_kprobe(&kp);
+ pr_info("kprobe at %p unregistered\n", kp.addr);
+}
+
+module_init(kprobe_init)
+module_exit(kprobe_exit)
+MODULE_LICENSE("GPL");
diff --git a/samples/kprobes/kretprobe_example.c b/samples/kprobes/kretprobe_example.c
new file mode 100644
index 000000000..da6de5e78
--- /dev/null
+++ b/samples/kprobes/kretprobe_example.c
@@ -0,0 +1,105 @@
+/*
+ * kretprobe_example.c
+ *
+ * Here's a sample kernel module showing the use of return probes to
+ * report the return value and total time taken for probed function
+ * to run.
+ *
+ * usage: insmod kretprobe_example.ko func=<func_name>
+ *
+ * If no func_name is specified, _do_fork is instrumented
+ *
+ * For more information on theory of operation of kretprobes, see
+ * Documentation/kprobes.txt
+ *
+ * Build and insert the kernel module as done in the kprobe example.
+ * You will see the trace data in /var/log/messages and on the console
+ * whenever the probed function returns. (Some messages may be suppressed
+ * if syslogd is configured to eliminate duplicate messages.)
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kprobes.h>
+#include <linux/ktime.h>
+#include <linux/limits.h>
+#include <linux/sched.h>
+
+static char func_name[NAME_MAX] = "_do_fork";
+module_param_string(func, func_name, NAME_MAX, S_IRUGO);
+MODULE_PARM_DESC(func, "Function to kretprobe; this module will report the"
+ " function's execution time");
+
+/* per-instance private data */
+struct my_data {
+ ktime_t entry_stamp;
+};
+
+/* Here we use the entry_hanlder to timestamp function entry */
+static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
+{
+ struct my_data *data;
+
+ if (!current->mm)
+ return 1; /* Skip kernel threads */
+
+ data = (struct my_data *)ri->data;
+ data->entry_stamp = ktime_get();
+ return 0;
+}
+
+/*
+ * Return-probe handler: Log the return value and duration. Duration may turn
+ * out to be zero consistently, depending upon the granularity of time
+ * accounting on the platform.
+ */
+static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
+{
+ unsigned long retval = regs_return_value(regs);
+ struct my_data *data = (struct my_data *)ri->data;
+ s64 delta;
+ ktime_t now;
+
+ now = ktime_get();
+ delta = ktime_to_ns(ktime_sub(now, data->entry_stamp));
+ pr_info("%s returned %lu and took %lld ns to execute\n",
+ func_name, retval, (long long)delta);
+ return 0;
+}
+
+static struct kretprobe my_kretprobe = {
+ .handler = ret_handler,
+ .entry_handler = entry_handler,
+ .data_size = sizeof(struct my_data),
+ /* Probe up to 20 instances concurrently. */
+ .maxactive = 20,
+};
+
+static int __init kretprobe_init(void)
+{
+ int ret;
+
+ my_kretprobe.kp.symbol_name = func_name;
+ ret = register_kretprobe(&my_kretprobe);
+ if (ret < 0) {
+ pr_err("register_kretprobe failed, returned %d\n", ret);
+ return ret;
+ }
+ pr_info("Planted return probe at %s: %p\n",
+ my_kretprobe.kp.symbol_name, my_kretprobe.kp.addr);
+ return 0;
+}
+
+static void __exit kretprobe_exit(void)
+{
+ unregister_kretprobe(&my_kretprobe);
+ pr_info("kretprobe at %p unregistered\n", my_kretprobe.kp.addr);
+
+ /* nmissed > 0 suggests that maxactive was set too low. */
+ pr_info("Missed probing %d instances of %s\n",
+ my_kretprobe.nmissed, my_kretprobe.kp.symbol_name);
+}
+
+module_init(kretprobe_init)
+module_exit(kretprobe_exit)
+MODULE_LICENSE("GPL");
diff --git a/samples/livepatch/Makefile b/samples/livepatch/Makefile
new file mode 100644
index 000000000..2472ce39a
--- /dev/null
+++ b/samples/livepatch/Makefile
@@ -0,0 +1,7 @@
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-sample.o
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-mod.o
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-fix1.o
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-fix2.o
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-demo.o
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-mod.o
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-busymod.o
diff --git a/samples/livepatch/livepatch-callbacks-busymod.c b/samples/livepatch/livepatch-callbacks-busymod.c
new file mode 100644
index 000000000..80d06e103
--- /dev/null
+++ b/samples/livepatch/livepatch-callbacks-busymod.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * livepatch-callbacks-busymod.c - (un)patching callbacks demo support module
+ *
+ *
+ * Purpose
+ * -------
+ *
+ * Simple module to demonstrate livepatch (un)patching callbacks.
+ *
+ *
+ * Usage
+ * -----
+ *
+ * This module is not intended to be standalone. See the "Usage"
+ * section of livepatch-callbacks-mod.c.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/workqueue.h>
+#include <linux/delay.h>
+
+static int sleep_secs;
+module_param(sleep_secs, int, 0644);
+MODULE_PARM_DESC(sleep_secs, "sleep_secs (default=0)");
+
+static void busymod_work_func(struct work_struct *work);
+static DECLARE_DELAYED_WORK(work, busymod_work_func);
+
+static void busymod_work_func(struct work_struct *work)
+{
+ pr_info("%s, sleeping %d seconds ...\n", __func__, sleep_secs);
+ msleep(sleep_secs * 1000);
+ pr_info("%s exit\n", __func__);
+}
+
+static int livepatch_callbacks_mod_init(void)
+{
+ pr_info("%s\n", __func__);
+ schedule_delayed_work(&work,
+ msecs_to_jiffies(1000 * 0));
+ return 0;
+}
+
+static void livepatch_callbacks_mod_exit(void)
+{
+ cancel_delayed_work_sync(&work);
+ pr_info("%s\n", __func__);
+}
+
+module_init(livepatch_callbacks_mod_init);
+module_exit(livepatch_callbacks_mod_exit);
+MODULE_LICENSE("GPL");
diff --git a/samples/livepatch/livepatch-callbacks-demo.c b/samples/livepatch/livepatch-callbacks-demo.c
new file mode 100644
index 000000000..72f9e6d13
--- /dev/null
+++ b/samples/livepatch/livepatch-callbacks-demo.c
@@ -0,0 +1,219 @@
+/*
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * livepatch-callbacks-demo.c - (un)patching callbacks livepatch demo
+ *
+ *
+ * Purpose
+ * -------
+ *
+ * Demonstration of registering livepatch (un)patching callbacks.
+ *
+ *
+ * Usage
+ * -----
+ *
+ * Step 1 - load the simple module
+ *
+ * insmod samples/livepatch/livepatch-callbacks-mod.ko
+ *
+ *
+ * Step 2 - load the demonstration livepatch (with callbacks)
+ *
+ * insmod samples/livepatch/livepatch-callbacks-demo.ko
+ *
+ *
+ * Step 3 - cleanup
+ *
+ * echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
+ * rmmod livepatch_callbacks_demo
+ * rmmod livepatch_callbacks_mod
+ *
+ * Watch dmesg output to see livepatch enablement, callback execution
+ * and patching operations for both vmlinux and module targets.
+ *
+ * NOTE: swap the insmod order of livepatch-callbacks-mod.ko and
+ * livepatch-callbacks-demo.ko to observe what happens when a
+ * target module is loaded after a livepatch with callbacks.
+ *
+ * NOTE: 'pre_patch_ret' is a module parameter that sets the pre-patch
+ * callback return status. Try setting up a non-zero status
+ * such as -19 (-ENODEV):
+ *
+ * # Load demo livepatch, vmlinux is patched
+ * insmod samples/livepatch/livepatch-callbacks-demo.ko
+ *
+ * # Setup next pre-patch callback to return -ENODEV
+ * echo -19 > /sys/module/livepatch_callbacks_demo/parameters/pre_patch_ret
+ *
+ * # Module loader refuses to load the target module
+ * insmod samples/livepatch/livepatch-callbacks-mod.ko
+ * insmod: ERROR: could not insert module samples/livepatch/livepatch-callbacks-mod.ko: No such device
+ *
+ * NOTE: There is a second target module,
+ * livepatch-callbacks-busymod.ko, available for experimenting
+ * with livepatch (un)patch callbacks. This module contains
+ * a 'sleep_secs' parameter that parks the module on one of the
+ * functions that the livepatch demo module wants to patch.
+ * Modifying this value and tweaking the order of module loads can
+ * effectively demonstrate stalled patch transitions:
+ *
+ * # Load a target module, let it park on 'busymod_work_func' for
+ * # thirty seconds
+ * insmod samples/livepatch/livepatch-callbacks-busymod.ko sleep_secs=30
+ *
+ * # Meanwhile load the livepatch
+ * insmod samples/livepatch/livepatch-callbacks-demo.ko
+ *
+ * # ... then load and unload another target module while the
+ * # transition is in progress
+ * insmod samples/livepatch/livepatch-callbacks-mod.ko
+ * rmmod samples/livepatch/livepatch-callbacks-mod.ko
+ *
+ * # Finally cleanup
+ * echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
+ * rmmod samples/livepatch/livepatch-callbacks-demo.ko
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/livepatch.h>
+
+static int pre_patch_ret;
+module_param(pre_patch_ret, int, 0644);
+MODULE_PARM_DESC(pre_patch_ret, "pre_patch_ret (default=0)");
+
+static const char *const module_state[] = {
+ [MODULE_STATE_LIVE] = "[MODULE_STATE_LIVE] Normal state",
+ [MODULE_STATE_COMING] = "[MODULE_STATE_COMING] Full formed, running module_init",
+ [MODULE_STATE_GOING] = "[MODULE_STATE_GOING] Going away",
+ [MODULE_STATE_UNFORMED] = "[MODULE_STATE_UNFORMED] Still setting it up",
+};
+
+static void callback_info(const char *callback, struct klp_object *obj)
+{
+ if (obj->mod)
+ pr_info("%s: %s -> %s\n", callback, obj->mod->name,
+ module_state[obj->mod->state]);
+ else
+ pr_info("%s: vmlinux\n", callback);
+}
+
+/* Executed on object patching (ie, patch enablement) */
+static int pre_patch_callback(struct klp_object *obj)
+{
+ callback_info(__func__, obj);
+ return pre_patch_ret;
+}
+
+/* Executed on object unpatching (ie, patch disablement) */
+static void post_patch_callback(struct klp_object *obj)
+{
+ callback_info(__func__, obj);
+}
+
+/* Executed on object unpatching (ie, patch disablement) */
+static void pre_unpatch_callback(struct klp_object *obj)
+{
+ callback_info(__func__, obj);
+}
+
+/* Executed on object unpatching (ie, patch disablement) */
+static void post_unpatch_callback(struct klp_object *obj)
+{
+ callback_info(__func__, obj);
+}
+
+static void patched_work_func(struct work_struct *work)
+{
+ pr_info("%s\n", __func__);
+}
+
+static struct klp_func no_funcs[] = {
+ { }
+};
+
+static struct klp_func busymod_funcs[] = {
+ {
+ .old_name = "busymod_work_func",
+ .new_func = patched_work_func,
+ }, { }
+};
+
+static struct klp_object objs[] = {
+ {
+ .name = NULL, /* vmlinux */
+ .funcs = no_funcs,
+ .callbacks = {
+ .pre_patch = pre_patch_callback,
+ .post_patch = post_patch_callback,
+ .pre_unpatch = pre_unpatch_callback,
+ .post_unpatch = post_unpatch_callback,
+ },
+ }, {
+ .name = "livepatch_callbacks_mod",
+ .funcs = no_funcs,
+ .callbacks = {
+ .pre_patch = pre_patch_callback,
+ .post_patch = post_patch_callback,
+ .pre_unpatch = pre_unpatch_callback,
+ .post_unpatch = post_unpatch_callback,
+ },
+ }, {
+ .name = "livepatch_callbacks_busymod",
+ .funcs = busymod_funcs,
+ .callbacks = {
+ .pre_patch = pre_patch_callback,
+ .post_patch = post_patch_callback,
+ .pre_unpatch = pre_unpatch_callback,
+ .post_unpatch = post_unpatch_callback,
+ },
+ }, { }
+};
+
+static struct klp_patch patch = {
+ .mod = THIS_MODULE,
+ .objs = objs,
+};
+
+static int livepatch_callbacks_demo_init(void)
+{
+ int ret;
+
+ ret = klp_register_patch(&patch);
+ if (ret)
+ return ret;
+ ret = klp_enable_patch(&patch);
+ if (ret) {
+ WARN_ON(klp_unregister_patch(&patch));
+ return ret;
+ }
+ return 0;
+}
+
+static void livepatch_callbacks_demo_exit(void)
+{
+ WARN_ON(klp_unregister_patch(&patch));
+}
+
+module_init(livepatch_callbacks_demo_init);
+module_exit(livepatch_callbacks_demo_exit);
+MODULE_LICENSE("GPL");
+MODULE_INFO(livepatch, "Y");
diff --git a/samples/livepatch/livepatch-callbacks-mod.c b/samples/livepatch/livepatch-callbacks-mod.c
new file mode 100644
index 000000000..e610ce29b
--- /dev/null
+++ b/samples/livepatch/livepatch-callbacks-mod.c
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * livepatch-callbacks-mod.c - (un)patching callbacks demo support module
+ *
+ *
+ * Purpose
+ * -------
+ *
+ * Simple module to demonstrate livepatch (un)patching callbacks.
+ *
+ *
+ * Usage
+ * -----
+ *
+ * This module is not intended to be standalone. See the "Usage"
+ * section of livepatch-callbacks-demo.c.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+static int livepatch_callbacks_mod_init(void)
+{
+ pr_info("%s\n", __func__);
+ return 0;
+}
+
+static void livepatch_callbacks_mod_exit(void)
+{
+ pr_info("%s\n", __func__);
+}
+
+module_init(livepatch_callbacks_mod_init);
+module_exit(livepatch_callbacks_mod_exit);
+MODULE_LICENSE("GPL");
diff --git a/samples/livepatch/livepatch-sample.c b/samples/livepatch/livepatch-sample.c
new file mode 100644
index 000000000..2d554dd93
--- /dev/null
+++ b/samples/livepatch/livepatch-sample.c
@@ -0,0 +1,93 @@
+/*
+ * livepatch-sample.c - Kernel Live Patching Sample Module
+ *
+ * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/livepatch.h>
+
+/*
+ * This (dumb) live patch overrides the function that prints the
+ * kernel boot cmdline when /proc/cmdline is read.
+ *
+ * Example:
+ *
+ * $ cat /proc/cmdline
+ * <your cmdline>
+ *
+ * $ insmod livepatch-sample.ko
+ * $ cat /proc/cmdline
+ * this has been live patched
+ *
+ * $ echo 0 > /sys/kernel/livepatch/livepatch_sample/enabled
+ * $ cat /proc/cmdline
+ * <your cmdline>
+ */
+
+#include <linux/seq_file.h>
+static int livepatch_cmdline_proc_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "%s\n", "this has been live patched");
+ return 0;
+}
+
+static struct klp_func funcs[] = {
+ {
+ .old_name = "cmdline_proc_show",
+ .new_func = livepatch_cmdline_proc_show,
+ }, { }
+};
+
+static struct klp_object objs[] = {
+ {
+ /* name being NULL means vmlinux */
+ .funcs = funcs,
+ }, { }
+};
+
+static struct klp_patch patch = {
+ .mod = THIS_MODULE,
+ .objs = objs,
+};
+
+static int livepatch_init(void)
+{
+ int ret;
+
+ ret = klp_register_patch(&patch);
+ if (ret)
+ return ret;
+ ret = klp_enable_patch(&patch);
+ if (ret) {
+ WARN_ON(klp_unregister_patch(&patch));
+ return ret;
+ }
+ return 0;
+}
+
+static void livepatch_exit(void)
+{
+ WARN_ON(klp_unregister_patch(&patch));
+}
+
+module_init(livepatch_init);
+module_exit(livepatch_exit);
+MODULE_LICENSE("GPL");
+MODULE_INFO(livepatch, "Y");
diff --git a/samples/livepatch/livepatch-shadow-fix1.c b/samples/livepatch/livepatch-shadow-fix1.c
new file mode 100644
index 000000000..e8f1bd6b2
--- /dev/null
+++ b/samples/livepatch/livepatch-shadow-fix1.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * livepatch-shadow-fix1.c - Shadow variables, livepatch demo
+ *
+ * Purpose
+ * -------
+ *
+ * Fixes the memory leak introduced in livepatch-shadow-mod through the
+ * use of a shadow variable. This fix demonstrates the "extending" of
+ * short-lived data structures by patching its allocation and release
+ * functions.
+ *
+ *
+ * Usage
+ * -----
+ *
+ * This module is not intended to be standalone. See the "Usage"
+ * section of livepatch-shadow-mod.c.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/livepatch.h>
+#include <linux/slab.h>
+
+/* Shadow variable enums */
+#define SV_LEAK 1
+
+/* Allocate new dummies every second */
+#define ALLOC_PERIOD 1
+/* Check for expired dummies after a few new ones have been allocated */
+#define CLEANUP_PERIOD (3 * ALLOC_PERIOD)
+/* Dummies expire after a few cleanup instances */
+#define EXPIRE_PERIOD (4 * CLEANUP_PERIOD)
+
+struct dummy {
+ struct list_head list;
+ unsigned long jiffies_expire;
+};
+
+/*
+ * The constructor makes more sense together with klp_shadow_get_or_alloc().
+ * In this example, it would be safe to assign the pointer also to the shadow
+ * variable returned by klp_shadow_alloc(). But we wanted to show the more
+ * complicated use of the API.
+ */
+static int shadow_leak_ctor(void *obj, void *shadow_data, void *ctor_data)
+{
+ void **shadow_leak = shadow_data;
+ void *leak = ctor_data;
+
+ *shadow_leak = leak;
+ return 0;
+}
+
+struct dummy *livepatch_fix1_dummy_alloc(void)
+{
+ struct dummy *d;
+ void *leak;
+
+ d = kzalloc(sizeof(*d), GFP_KERNEL);
+ if (!d)
+ return NULL;
+
+ d->jiffies_expire = jiffies +
+ msecs_to_jiffies(1000 * EXPIRE_PERIOD);
+
+ /*
+ * Patch: save the extra memory location into a SV_LEAK shadow
+ * variable. A patched dummy_free routine can later fetch this
+ * pointer to handle resource release.
+ */
+ leak = kzalloc(sizeof(int), GFP_KERNEL);
+ if (!leak) {
+ kfree(d);
+ return NULL;
+ }
+
+ klp_shadow_alloc(d, SV_LEAK, sizeof(leak), GFP_KERNEL,
+ shadow_leak_ctor, leak);
+
+ pr_info("%s: dummy @ %p, expires @ %lx\n",
+ __func__, d, d->jiffies_expire);
+
+ return d;
+}
+
+static void livepatch_fix1_dummy_leak_dtor(void *obj, void *shadow_data)
+{
+ void *d = obj;
+ void **shadow_leak = shadow_data;
+
+ kfree(*shadow_leak);
+ pr_info("%s: dummy @ %p, prevented leak @ %p\n",
+ __func__, d, *shadow_leak);
+}
+
+void livepatch_fix1_dummy_free(struct dummy *d)
+{
+ void **shadow_leak;
+
+ /*
+ * Patch: fetch the saved SV_LEAK shadow variable, detach and
+ * free it. Note: handle cases where this shadow variable does
+ * not exist (ie, dummy structures allocated before this livepatch
+ * was loaded.)
+ */
+ shadow_leak = klp_shadow_get(d, SV_LEAK);
+ if (shadow_leak)
+ klp_shadow_free(d, SV_LEAK, livepatch_fix1_dummy_leak_dtor);
+ else
+ pr_info("%s: dummy @ %p leaked!\n", __func__, d);
+
+ kfree(d);
+}
+
+static struct klp_func funcs[] = {
+ {
+ .old_name = "dummy_alloc",
+ .new_func = livepatch_fix1_dummy_alloc,
+ },
+ {
+ .old_name = "dummy_free",
+ .new_func = livepatch_fix1_dummy_free,
+ }, { }
+};
+
+static struct klp_object objs[] = {
+ {
+ .name = "livepatch_shadow_mod",
+ .funcs = funcs,
+ }, { }
+};
+
+static struct klp_patch patch = {
+ .mod = THIS_MODULE,
+ .objs = objs,
+};
+
+static int livepatch_shadow_fix1_init(void)
+{
+ int ret;
+
+ ret = klp_register_patch(&patch);
+ if (ret)
+ return ret;
+ ret = klp_enable_patch(&patch);
+ if (ret) {
+ WARN_ON(klp_unregister_patch(&patch));
+ return ret;
+ }
+ return 0;
+}
+
+static void livepatch_shadow_fix1_exit(void)
+{
+ /* Cleanup any existing SV_LEAK shadow variables */
+ klp_shadow_free_all(SV_LEAK, livepatch_fix1_dummy_leak_dtor);
+
+ WARN_ON(klp_unregister_patch(&patch));
+}
+
+module_init(livepatch_shadow_fix1_init);
+module_exit(livepatch_shadow_fix1_exit);
+MODULE_LICENSE("GPL");
+MODULE_INFO(livepatch, "Y");
diff --git a/samples/livepatch/livepatch-shadow-fix2.c b/samples/livepatch/livepatch-shadow-fix2.c
new file mode 100644
index 000000000..b34c7bf83
--- /dev/null
+++ b/samples/livepatch/livepatch-shadow-fix2.c
@@ -0,0 +1,156 @@
+/*
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * livepatch-shadow-fix2.c - Shadow variables, livepatch demo
+ *
+ * Purpose
+ * -------
+ *
+ * Adds functionality to livepatch-shadow-mod's in-flight data
+ * structures through a shadow variable. The livepatch patches a
+ * routine that periodically inspects data structures, incrementing a
+ * per-data-structure counter, creating the counter if needed.
+ *
+ *
+ * Usage
+ * -----
+ *
+ * This module is not intended to be standalone. See the "Usage"
+ * section of livepatch-shadow-mod.c.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/livepatch.h>
+#include <linux/slab.h>
+
+/* Shadow variable enums */
+#define SV_LEAK 1
+#define SV_COUNTER 2
+
+struct dummy {
+ struct list_head list;
+ unsigned long jiffies_expire;
+};
+
+bool livepatch_fix2_dummy_check(struct dummy *d, unsigned long jiffies)
+{
+ int *shadow_count;
+
+ /*
+ * Patch: handle in-flight dummy structures, if they do not
+ * already have a SV_COUNTER shadow variable, then attach a
+ * new one.
+ */
+ shadow_count = klp_shadow_get_or_alloc(d, SV_COUNTER,
+ sizeof(*shadow_count), GFP_NOWAIT,
+ NULL, NULL);
+ if (shadow_count)
+ *shadow_count += 1;
+
+ return time_after(jiffies, d->jiffies_expire);
+}
+
+static void livepatch_fix2_dummy_leak_dtor(void *obj, void *shadow_data)
+{
+ void *d = obj;
+ void **shadow_leak = shadow_data;
+
+ kfree(*shadow_leak);
+ pr_info("%s: dummy @ %p, prevented leak @ %p\n",
+ __func__, d, *shadow_leak);
+}
+
+void livepatch_fix2_dummy_free(struct dummy *d)
+{
+ void **shadow_leak;
+ int *shadow_count;
+
+ /* Patch: copy the memory leak patch from the fix1 module. */
+ shadow_leak = klp_shadow_get(d, SV_LEAK);
+ if (shadow_leak)
+ klp_shadow_free(d, SV_LEAK, livepatch_fix2_dummy_leak_dtor);
+ else
+ pr_info("%s: dummy @ %p leaked!\n", __func__, d);
+
+ /*
+ * Patch: fetch the SV_COUNTER shadow variable and display
+ * the final count. Detach the shadow variable.
+ */
+ shadow_count = klp_shadow_get(d, SV_COUNTER);
+ if (shadow_count) {
+ pr_info("%s: dummy @ %p, check counter = %d\n",
+ __func__, d, *shadow_count);
+ klp_shadow_free(d, SV_COUNTER, NULL);
+ }
+
+ kfree(d);
+}
+
+static struct klp_func funcs[] = {
+ {
+ .old_name = "dummy_check",
+ .new_func = livepatch_fix2_dummy_check,
+ },
+ {
+ .old_name = "dummy_free",
+ .new_func = livepatch_fix2_dummy_free,
+ }, { }
+};
+
+static struct klp_object objs[] = {
+ {
+ .name = "livepatch_shadow_mod",
+ .funcs = funcs,
+ }, { }
+};
+
+static struct klp_patch patch = {
+ .mod = THIS_MODULE,
+ .objs = objs,
+};
+
+static int livepatch_shadow_fix2_init(void)
+{
+ int ret;
+
+ ret = klp_register_patch(&patch);
+ if (ret)
+ return ret;
+ ret = klp_enable_patch(&patch);
+ if (ret) {
+ WARN_ON(klp_unregister_patch(&patch));
+ return ret;
+ }
+ return 0;
+}
+
+static void livepatch_shadow_fix2_exit(void)
+{
+ /* Cleanup any existing SV_COUNTER shadow variables */
+ klp_shadow_free_all(SV_COUNTER, NULL);
+
+ WARN_ON(klp_unregister_patch(&patch));
+}
+
+module_init(livepatch_shadow_fix2_init);
+module_exit(livepatch_shadow_fix2_exit);
+MODULE_LICENSE("GPL");
+MODULE_INFO(livepatch, "Y");
diff --git a/samples/livepatch/livepatch-shadow-mod.c b/samples/livepatch/livepatch-shadow-mod.c
new file mode 100644
index 000000000..4aa8a88d3
--- /dev/null
+++ b/samples/livepatch/livepatch-shadow-mod.c
@@ -0,0 +1,228 @@
+/*
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * livepatch-shadow-mod.c - Shadow variables, buggy module demo
+ *
+ * Purpose
+ * -------
+ *
+ * As a demonstration of livepatch shadow variable API, this module
+ * introduces memory leak behavior that livepatch modules
+ * livepatch-shadow-fix1.ko and livepatch-shadow-fix2.ko correct and
+ * enhance.
+ *
+ * WARNING - even though the livepatch-shadow-fix modules patch the
+ * memory leak, please load these modules at your own risk -- some
+ * amount of memory may leaked before the bug is patched.
+ *
+ *
+ * Usage
+ * -----
+ *
+ * Step 1 - Load the buggy demonstration module:
+ *
+ * insmod samples/livepatch/livepatch-shadow-mod.ko
+ *
+ * Watch dmesg output for a few moments to see new dummy being allocated
+ * and a periodic cleanup check. (Note: a small amount of memory is
+ * being leaked.)
+ *
+ *
+ * Step 2 - Load livepatch fix1:
+ *
+ * insmod samples/livepatch/livepatch-shadow-fix1.ko
+ *
+ * Continue watching dmesg and note that now livepatch_fix1_dummy_free()
+ * and livepatch_fix1_dummy_alloc() are logging messages about leaked
+ * memory and eventually leaks prevented.
+ *
+ *
+ * Step 3 - Load livepatch fix2 (on top of fix1):
+ *
+ * insmod samples/livepatch/livepatch-shadow-fix2.ko
+ *
+ * This module extends functionality through shadow variables, as a new
+ * "check" counter is added to the dummy structure. Periodic dmesg
+ * messages will log these as dummies are cleaned up.
+ *
+ *
+ * Step 4 - Cleanup
+ *
+ * Unwind the demonstration by disabling the livepatch fix modules, then
+ * removing them and the demo module:
+ *
+ * echo 0 > /sys/kernel/livepatch/livepatch_shadow_fix2/enabled
+ * echo 0 > /sys/kernel/livepatch/livepatch_shadow_fix1/enabled
+ * rmmod livepatch-shadow-fix2
+ * rmmod livepatch-shadow-fix1
+ * rmmod livepatch-shadow-mod
+ */
+
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/workqueue.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Joe Lawrence <joe.lawrence@redhat.com>");
+MODULE_DESCRIPTION("Buggy module for shadow variable demo");
+
+/* Allocate new dummies every second */
+#define ALLOC_PERIOD 1
+/* Check for expired dummies after a few new ones have been allocated */
+#define CLEANUP_PERIOD (3 * ALLOC_PERIOD)
+/* Dummies expire after a few cleanup instances */
+#define EXPIRE_PERIOD (4 * CLEANUP_PERIOD)
+
+/*
+ * Keep a list of all the dummies so we can clean up any residual ones
+ * on module exit
+ */
+LIST_HEAD(dummy_list);
+DEFINE_MUTEX(dummy_list_mutex);
+
+struct dummy {
+ struct list_head list;
+ unsigned long jiffies_expire;
+};
+
+noinline struct dummy *dummy_alloc(void)
+{
+ struct dummy *d;
+ void *leak;
+
+ d = kzalloc(sizeof(*d), GFP_KERNEL);
+ if (!d)
+ return NULL;
+
+ d->jiffies_expire = jiffies +
+ msecs_to_jiffies(1000 * EXPIRE_PERIOD);
+
+ /* Oops, forgot to save leak! */
+ leak = kzalloc(sizeof(int), GFP_KERNEL);
+ if (!leak) {
+ kfree(d);
+ return NULL;
+ }
+
+ pr_info("%s: dummy @ %p, expires @ %lx\n",
+ __func__, d, d->jiffies_expire);
+
+ return d;
+}
+
+noinline void dummy_free(struct dummy *d)
+{
+ pr_info("%s: dummy @ %p, expired = %lx\n",
+ __func__, d, d->jiffies_expire);
+
+ kfree(d);
+}
+
+noinline bool dummy_check(struct dummy *d, unsigned long jiffies)
+{
+ return time_after(jiffies, d->jiffies_expire);
+}
+
+/*
+ * alloc_work_func: allocates new dummy structures, allocates additional
+ * memory, aptly named "leak", but doesn't keep
+ * permanent record of it.
+ */
+
+static void alloc_work_func(struct work_struct *work);
+static DECLARE_DELAYED_WORK(alloc_dwork, alloc_work_func);
+
+static void alloc_work_func(struct work_struct *work)
+{
+ struct dummy *d;
+
+ d = dummy_alloc();
+ if (!d)
+ return;
+
+ mutex_lock(&dummy_list_mutex);
+ list_add(&d->list, &dummy_list);
+ mutex_unlock(&dummy_list_mutex);
+
+ schedule_delayed_work(&alloc_dwork,
+ msecs_to_jiffies(1000 * ALLOC_PERIOD));
+}
+
+/*
+ * cleanup_work_func: frees dummy structures. Without knownledge of
+ * "leak", it leaks the additional memory that
+ * alloc_work_func created.
+ */
+
+static void cleanup_work_func(struct work_struct *work);
+static DECLARE_DELAYED_WORK(cleanup_dwork, cleanup_work_func);
+
+static void cleanup_work_func(struct work_struct *work)
+{
+ struct dummy *d, *tmp;
+ unsigned long j;
+
+ j = jiffies;
+ pr_info("%s: jiffies = %lx\n", __func__, j);
+
+ mutex_lock(&dummy_list_mutex);
+ list_for_each_entry_safe(d, tmp, &dummy_list, list) {
+
+ /* Kick out and free any expired dummies */
+ if (dummy_check(d, j)) {
+ list_del(&d->list);
+ dummy_free(d);
+ }
+ }
+ mutex_unlock(&dummy_list_mutex);
+
+ schedule_delayed_work(&cleanup_dwork,
+ msecs_to_jiffies(1000 * CLEANUP_PERIOD));
+}
+
+static int livepatch_shadow_mod_init(void)
+{
+ schedule_delayed_work(&alloc_dwork,
+ msecs_to_jiffies(1000 * ALLOC_PERIOD));
+ schedule_delayed_work(&cleanup_dwork,
+ msecs_to_jiffies(1000 * CLEANUP_PERIOD));
+
+ return 0;
+}
+
+static void livepatch_shadow_mod_exit(void)
+{
+ struct dummy *d, *tmp;
+
+ /* Wait for any dummies at work */
+ cancel_delayed_work_sync(&alloc_dwork);
+ cancel_delayed_work_sync(&cleanup_dwork);
+
+ /* Cleanup residual dummies */
+ list_for_each_entry_safe(d, tmp, &dummy_list, list) {
+ list_del(&d->list);
+ dummy_free(d);
+ }
+}
+
+module_init(livepatch_shadow_mod_init);
+module_exit(livepatch_shadow_mod_exit);
diff --git a/samples/mei/.gitignore b/samples/mei/.gitignore
new file mode 100644
index 000000000..f356b81ca
--- /dev/null
+++ b/samples/mei/.gitignore
@@ -0,0 +1 @@
+mei-amt-version
diff --git a/samples/mei/Makefile b/samples/mei/Makefile
new file mode 100644
index 000000000..c7e52e9e9
--- /dev/null
+++ b/samples/mei/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
+CC := $(CROSS_COMPILE)gcc
+CFLAGS := -I../../usr/include
+
+PROGS := mei-amt-version
+
+all: $(PROGS)
+
+clean:
+ rm -fr $(PROGS)
diff --git a/samples/mei/mei-amt-version.c b/samples/mei/mei-amt-version.c
new file mode 100644
index 000000000..32234481a
--- /dev/null
+++ b/samples/mei/mei-amt-version.c
@@ -0,0 +1,479 @@
+/******************************************************************************
+ * Intel Management Engine Interface (Intel MEI) Linux driver
+ * Intel MEI Interface Header
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110,
+ * USA
+ *
+ * The full GNU General Public License is included in this distribution
+ * in the file called LICENSE.GPL.
+ *
+ * Contact Information:
+ * Intel Corporation.
+ * linux-mei@linux.intel.com
+ * http://www.intel.com
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2003 - 2012 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <bits/wordsize.h>
+#include <linux/mei.h>
+
+/*****************************************************************************
+ * Intel Management Engine Interface
+ *****************************************************************************/
+
+#define mei_msg(_me, fmt, ARGS...) do { \
+ if (_me->verbose) \
+ fprintf(stderr, fmt, ##ARGS); \
+} while (0)
+
+#define mei_err(_me, fmt, ARGS...) do { \
+ fprintf(stderr, "Error: " fmt, ##ARGS); \
+} while (0)
+
+struct mei {
+ uuid_le guid;
+ bool initialized;
+ bool verbose;
+ unsigned int buf_size;
+ unsigned char prot_ver;
+ int fd;
+};
+
+static void mei_deinit(struct mei *cl)
+{
+ if (cl->fd != -1)
+ close(cl->fd);
+ cl->fd = -1;
+ cl->buf_size = 0;
+ cl->prot_ver = 0;
+ cl->initialized = false;
+}
+
+static bool mei_init(struct mei *me, const uuid_le *guid,
+ unsigned char req_protocol_version, bool verbose)
+{
+ int result;
+ struct mei_client *cl;
+ struct mei_connect_client_data data;
+
+ me->verbose = verbose;
+
+ me->fd = open("/dev/mei0", O_RDWR);
+ if (me->fd == -1) {
+ mei_err(me, "Cannot establish a handle to the Intel MEI driver\n");
+ goto err;
+ }
+ memcpy(&me->guid, guid, sizeof(*guid));
+ memset(&data, 0, sizeof(data));
+ me->initialized = true;
+
+ memcpy(&data.in_client_uuid, &me->guid, sizeof(me->guid));
+ result = ioctl(me->fd, IOCTL_MEI_CONNECT_CLIENT, &data);
+ if (result) {
+ mei_err(me, "IOCTL_MEI_CONNECT_CLIENT receive message. err=%d\n", result);
+ goto err;
+ }
+ cl = &data.out_client_properties;
+ mei_msg(me, "max_message_length %d\n", cl->max_msg_length);
+ mei_msg(me, "protocol_version %d\n", cl->protocol_version);
+
+ if ((req_protocol_version > 0) &&
+ (cl->protocol_version != req_protocol_version)) {
+ mei_err(me, "Intel MEI protocol version not supported\n");
+ goto err;
+ }
+
+ me->buf_size = cl->max_msg_length;
+ me->prot_ver = cl->protocol_version;
+
+ return true;
+err:
+ mei_deinit(me);
+ return false;
+}
+
+static ssize_t mei_recv_msg(struct mei *me, unsigned char *buffer,
+ ssize_t len, unsigned long timeout)
+{
+ ssize_t rc;
+
+ mei_msg(me, "call read length = %zd\n", len);
+
+ rc = read(me->fd, buffer, len);
+ if (rc < 0) {
+ mei_err(me, "read failed with status %zd %s\n",
+ rc, strerror(errno));
+ mei_deinit(me);
+ } else {
+ mei_msg(me, "read succeeded with result %zd\n", rc);
+ }
+ return rc;
+}
+
+static ssize_t mei_send_msg(struct mei *me, const unsigned char *buffer,
+ ssize_t len, unsigned long timeout)
+{
+ struct timeval tv;
+ ssize_t written;
+ ssize_t rc;
+ fd_set set;
+
+ tv.tv_sec = timeout / 1000;
+ tv.tv_usec = (timeout % 1000) * 1000000;
+
+ mei_msg(me, "call write length = %zd\n", len);
+
+ written = write(me->fd, buffer, len);
+ if (written < 0) {
+ rc = -errno;
+ mei_err(me, "write failed with status %zd %s\n",
+ written, strerror(errno));
+ goto out;
+ }
+
+ FD_ZERO(&set);
+ FD_SET(me->fd, &set);
+ rc = select(me->fd + 1 , &set, NULL, NULL, &tv);
+ if (rc > 0 && FD_ISSET(me->fd, &set)) {
+ mei_msg(me, "write success\n");
+ } else if (rc == 0) {
+ mei_err(me, "write failed on timeout with status\n");
+ goto out;
+ } else { /* rc < 0 */
+ mei_err(me, "write failed on select with status %zd\n", rc);
+ goto out;
+ }
+
+ rc = written;
+out:
+ if (rc < 0)
+ mei_deinit(me);
+
+ return rc;
+}
+
+/***************************************************************************
+ * Intel Advanced Management Technology ME Client
+ ***************************************************************************/
+
+#define AMT_MAJOR_VERSION 1
+#define AMT_MINOR_VERSION 1
+
+#define AMT_STATUS_SUCCESS 0x0
+#define AMT_STATUS_INTERNAL_ERROR 0x1
+#define AMT_STATUS_NOT_READY 0x2
+#define AMT_STATUS_INVALID_AMT_MODE 0x3
+#define AMT_STATUS_INVALID_MESSAGE_LENGTH 0x4
+
+#define AMT_STATUS_HOST_IF_EMPTY_RESPONSE 0x4000
+#define AMT_STATUS_SDK_RESOURCES 0x1004
+
+
+#define AMT_BIOS_VERSION_LEN 65
+#define AMT_VERSIONS_NUMBER 50
+#define AMT_UNICODE_STRING_LEN 20
+
+struct amt_unicode_string {
+ uint16_t length;
+ char string[AMT_UNICODE_STRING_LEN];
+} __attribute__((packed));
+
+struct amt_version_type {
+ struct amt_unicode_string description;
+ struct amt_unicode_string version;
+} __attribute__((packed));
+
+struct amt_version {
+ uint8_t major;
+ uint8_t minor;
+} __attribute__((packed));
+
+struct amt_code_versions {
+ uint8_t bios[AMT_BIOS_VERSION_LEN];
+ uint32_t count;
+ struct amt_version_type versions[AMT_VERSIONS_NUMBER];
+} __attribute__((packed));
+
+/***************************************************************************
+ * Intel Advanced Management Technology Host Interface
+ ***************************************************************************/
+
+struct amt_host_if_msg_header {
+ struct amt_version version;
+ uint16_t _reserved;
+ uint32_t command;
+ uint32_t length;
+} __attribute__((packed));
+
+struct amt_host_if_resp_header {
+ struct amt_host_if_msg_header header;
+ uint32_t status;
+ unsigned char data[0];
+} __attribute__((packed));
+
+const uuid_le MEI_IAMTHIF = UUID_LE(0x12f80028, 0xb4b7, 0x4b2d, \
+ 0xac, 0xa8, 0x46, 0xe0, 0xff, 0x65, 0x81, 0x4c);
+
+#define AMT_HOST_IF_CODE_VERSIONS_REQUEST 0x0400001A
+#define AMT_HOST_IF_CODE_VERSIONS_RESPONSE 0x0480001A
+
+const struct amt_host_if_msg_header CODE_VERSION_REQ = {
+ .version = {AMT_MAJOR_VERSION, AMT_MINOR_VERSION},
+ ._reserved = 0,
+ .command = AMT_HOST_IF_CODE_VERSIONS_REQUEST,
+ .length = 0
+};
+
+
+struct amt_host_if {
+ struct mei mei_cl;
+ unsigned long send_timeout;
+ bool initialized;
+};
+
+
+static bool amt_host_if_init(struct amt_host_if *acmd,
+ unsigned long send_timeout, bool verbose)
+{
+ acmd->send_timeout = (send_timeout) ? send_timeout : 20000;
+ acmd->initialized = mei_init(&acmd->mei_cl, &MEI_IAMTHIF, 0, verbose);
+ return acmd->initialized;
+}
+
+static void amt_host_if_deinit(struct amt_host_if *acmd)
+{
+ mei_deinit(&acmd->mei_cl);
+ acmd->initialized = false;
+}
+
+static uint32_t amt_verify_code_versions(const struct amt_host_if_resp_header *resp)
+{
+ uint32_t status = AMT_STATUS_SUCCESS;
+ struct amt_code_versions *code_ver;
+ size_t code_ver_len;
+ uint32_t ver_type_cnt;
+ uint32_t len;
+ uint32_t i;
+
+ code_ver = (struct amt_code_versions *)resp->data;
+ /* length - sizeof(status) */
+ code_ver_len = resp->header.length - sizeof(uint32_t);
+ ver_type_cnt = code_ver_len -
+ sizeof(code_ver->bios) -
+ sizeof(code_ver->count);
+ if (code_ver->count != ver_type_cnt / sizeof(struct amt_version_type)) {
+ status = AMT_STATUS_INTERNAL_ERROR;
+ goto out;
+ }
+
+ for (i = 0; i < code_ver->count; i++) {
+ len = code_ver->versions[i].description.length;
+
+ if (len > AMT_UNICODE_STRING_LEN) {
+ status = AMT_STATUS_INTERNAL_ERROR;
+ goto out;
+ }
+
+ len = code_ver->versions[i].version.length;
+ if (code_ver->versions[i].version.string[len] != '\0' ||
+ len != strlen(code_ver->versions[i].version.string)) {
+ status = AMT_STATUS_INTERNAL_ERROR;
+ goto out;
+ }
+ }
+out:
+ return status;
+}
+
+static uint32_t amt_verify_response_header(uint32_t command,
+ const struct amt_host_if_msg_header *resp_hdr,
+ uint32_t response_size)
+{
+ if (response_size < sizeof(struct amt_host_if_resp_header)) {
+ return AMT_STATUS_INTERNAL_ERROR;
+ } else if (response_size != (resp_hdr->length +
+ sizeof(struct amt_host_if_msg_header))) {
+ return AMT_STATUS_INTERNAL_ERROR;
+ } else if (resp_hdr->command != command) {
+ return AMT_STATUS_INTERNAL_ERROR;
+ } else if (resp_hdr->_reserved != 0) {
+ return AMT_STATUS_INTERNAL_ERROR;
+ } else if (resp_hdr->version.major != AMT_MAJOR_VERSION ||
+ resp_hdr->version.minor < AMT_MINOR_VERSION) {
+ return AMT_STATUS_INTERNAL_ERROR;
+ }
+ return AMT_STATUS_SUCCESS;
+}
+
+static uint32_t amt_host_if_call(struct amt_host_if *acmd,
+ const unsigned char *command, ssize_t command_sz,
+ uint8_t **read_buf, uint32_t rcmd,
+ unsigned int expected_sz)
+{
+ uint32_t in_buf_sz;
+ ssize_t out_buf_sz;
+ ssize_t written;
+ uint32_t status;
+ struct amt_host_if_resp_header *msg_hdr;
+
+ in_buf_sz = acmd->mei_cl.buf_size;
+ *read_buf = (uint8_t *)malloc(sizeof(uint8_t) * in_buf_sz);
+ if (*read_buf == NULL)
+ return AMT_STATUS_SDK_RESOURCES;
+ memset(*read_buf, 0, in_buf_sz);
+ msg_hdr = (struct amt_host_if_resp_header *)*read_buf;
+
+ written = mei_send_msg(&acmd->mei_cl,
+ command, command_sz, acmd->send_timeout);
+ if (written != command_sz)
+ return AMT_STATUS_INTERNAL_ERROR;
+
+ out_buf_sz = mei_recv_msg(&acmd->mei_cl, *read_buf, in_buf_sz, 2000);
+ if (out_buf_sz <= 0)
+ return AMT_STATUS_HOST_IF_EMPTY_RESPONSE;
+
+ status = msg_hdr->status;
+ if (status != AMT_STATUS_SUCCESS)
+ return status;
+
+ status = amt_verify_response_header(rcmd,
+ &msg_hdr->header, out_buf_sz);
+ if (status != AMT_STATUS_SUCCESS)
+ return status;
+
+ if (expected_sz && expected_sz != out_buf_sz)
+ return AMT_STATUS_INTERNAL_ERROR;
+
+ return AMT_STATUS_SUCCESS;
+}
+
+
+static uint32_t amt_get_code_versions(struct amt_host_if *cmd,
+ struct amt_code_versions *versions)
+{
+ struct amt_host_if_resp_header *response = NULL;
+ uint32_t status;
+
+ status = amt_host_if_call(cmd,
+ (const unsigned char *)&CODE_VERSION_REQ,
+ sizeof(CODE_VERSION_REQ),
+ (uint8_t **)&response,
+ AMT_HOST_IF_CODE_VERSIONS_RESPONSE, 0);
+
+ if (status != AMT_STATUS_SUCCESS)
+ goto out;
+
+ status = amt_verify_code_versions(response);
+ if (status != AMT_STATUS_SUCCESS)
+ goto out;
+
+ memcpy(versions, response->data, sizeof(struct amt_code_versions));
+out:
+ if (response != NULL)
+ free(response);
+
+ return status;
+}
+
+/************************** end of amt_host_if_command ***********************/
+int main(int argc, char **argv)
+{
+ struct amt_code_versions ver;
+ struct amt_host_if acmd;
+ unsigned int i;
+ uint32_t status;
+ int ret;
+ bool verbose;
+
+ verbose = (argc > 1 && strcmp(argv[1], "-v") == 0);
+
+ if (!amt_host_if_init(&acmd, 5000, verbose)) {
+ ret = 1;
+ goto out;
+ }
+
+ status = amt_get_code_versions(&acmd, &ver);
+
+ amt_host_if_deinit(&acmd);
+
+ switch (status) {
+ case AMT_STATUS_HOST_IF_EMPTY_RESPONSE:
+ printf("Intel AMT: DISABLED\n");
+ ret = 0;
+ break;
+ case AMT_STATUS_SUCCESS:
+ printf("Intel AMT: ENABLED\n");
+ for (i = 0; i < ver.count; i++) {
+ printf("%s:\t%s\n", ver.versions[i].description.string,
+ ver.versions[i].version.string);
+ }
+ ret = 0;
+ break;
+ default:
+ printf("An error has occurred\n");
+ ret = 1;
+ break;
+ }
+
+out:
+ return ret;
+}
diff --git a/samples/mic/mpssd/.gitignore b/samples/mic/mpssd/.gitignore
new file mode 100644
index 000000000..8b7c72f07
--- /dev/null
+++ b/samples/mic/mpssd/.gitignore
@@ -0,0 +1 @@
+mpssd
diff --git a/samples/mic/mpssd/Makefile b/samples/mic/mpssd/Makefile
new file mode 100644
index 000000000..a7a6e0c70
--- /dev/null
+++ b/samples/mic/mpssd/Makefile
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: GPL-2.0
+ifndef CROSS_COMPILE
+uname_M := $(shell uname -m 2>/dev/null || echo not)
+ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/)
+
+ifeq ($(ARCH),x86)
+
+PROGS := mpssd
+CC = $(CROSS_COMPILE)gcc
+CFLAGS := -I../../../usr/include -I../../../tools/include
+
+ifdef DEBUG
+CFLAGS += -DDEBUG=$(DEBUG)
+endif
+
+all: $(PROGS)
+mpssd: mpssd.c sysfs.c
+ $(CC) $(CFLAGS) mpssd.c sysfs.c -o mpssd -lpthread
+
+install:
+ install mpssd /usr/sbin/mpssd
+ install micctrl /usr/sbin/micctrl
+
+clean:
+ rm -fr $(PROGS)
+
+endif
+endif
diff --git a/samples/mic/mpssd/micctrl b/samples/mic/mpssd/micctrl
new file mode 100755
index 000000000..8f2629b41
--- /dev/null
+++ b/samples/mic/mpssd/micctrl
@@ -0,0 +1,173 @@
+#!/bin/bash
+# Intel MIC Platform Software Stack (MPSS)
+#
+# Copyright(c) 2013 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License, version 2, as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# The full GNU General Public License is included in this distribution in
+# the file called "COPYING".
+#
+# Intel MIC User Space Tools.
+#
+# micctrl - Controls MIC boot/start/stop.
+#
+# chkconfig: 2345 95 05
+# description: start MPSS stack processing.
+#
+### BEGIN INIT INFO
+# Provides: micctrl
+### END INIT INFO
+
+# Source function library.
+. /etc/init.d/functions
+
+sysfs="/sys/class/mic"
+
+_status()
+{
+ f=$sysfs/$1
+ echo -e $1 state: "`cat $f/state`" shutdown_status: "`cat $f/shutdown_status`"
+}
+
+status()
+{
+ if [ "`echo $1 | head -c3`" == "mic" ]; then
+ _status $1
+ return $?
+ fi
+ for f in $sysfs/*
+ do
+ _status `basename $f`
+ RETVAL=$?
+ [ $RETVAL -ne 0 ] && return $RETVAL
+ done
+ return 0
+}
+
+_reset()
+{
+ f=$sysfs/$1
+ echo reset > $f/state
+}
+
+reset()
+{
+ if [ "`echo $1 | head -c3`" == "mic" ]; then
+ _reset $1
+ return $?
+ fi
+ for f in $sysfs/*
+ do
+ _reset `basename $f`
+ RETVAL=$?
+ [ $RETVAL -ne 0 ] && return $RETVAL
+ done
+ return 0
+}
+
+_boot()
+{
+ f=$sysfs/$1
+ echo "linux" > $f/bootmode
+ echo "mic/uos.img" > $f/firmware
+ echo "mic/$1.image" > $f/ramdisk
+ echo "boot" > $f/state
+}
+
+boot()
+{
+ if [ "`echo $1 | head -c3`" == "mic" ]; then
+ _boot $1
+ return $?
+ fi
+ for f in $sysfs/*
+ do
+ _boot `basename $f`
+ RETVAL=$?
+ [ $RETVAL -ne 0 ] && return $RETVAL
+ done
+ return 0
+}
+
+_shutdown()
+{
+ f=$sysfs/$1
+ echo shutdown > $f/state
+}
+
+shutdown()
+{
+ if [ "`echo $1 | head -c3`" == "mic" ]; then
+ _shutdown $1
+ return $?
+ fi
+ for f in $sysfs/*
+ do
+ _shutdown `basename $f`
+ RETVAL=$?
+ [ $RETVAL -ne 0 ] && return $RETVAL
+ done
+ return 0
+}
+
+_wait()
+{
+ f=$sysfs/$1
+ while [ "`cat $f/state`" != "offline" -a "`cat $f/state`" != "online" ]
+ do
+ sleep 1
+ echo -e "Waiting for $1 to go offline"
+ done
+}
+
+wait()
+{
+ if [ "`echo $1 | head -c3`" == "mic" ]; then
+ _wait $1
+ return $?
+ fi
+ # Wait for the cards to go offline
+ for f in $sysfs/*
+ do
+ _wait `basename $f`
+ RETVAL=$?
+ [ $RETVAL -ne 0 ] && return $RETVAL
+ done
+ return 0
+}
+
+if [ ! -d "$sysfs" ]; then
+ echo -e $"Module unloaded "
+ exit 3
+fi
+
+case $1 in
+ -s)
+ status $2
+ ;;
+ -r)
+ reset $2
+ ;;
+ -b)
+ boot $2
+ ;;
+ -S)
+ shutdown $2
+ ;;
+ -w)
+ wait $2
+ ;;
+ *)
+ echo $"Usage: $0 {-s (status) |-r (reset) |-b (boot) |-S (shutdown) |-w (wait)}"
+ exit 2
+esac
+
+exit $?
diff --git a/samples/mic/mpssd/mpss b/samples/mic/mpssd/mpss
new file mode 100755
index 000000000..5fcf9fa4b
--- /dev/null
+++ b/samples/mic/mpssd/mpss
@@ -0,0 +1,200 @@
+#!/bin/bash
+# Intel MIC Platform Software Stack (MPSS)
+#
+# Copyright(c) 2013 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License, version 2, as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# The full GNU General Public License is included in this distribution in
+# the file called "COPYING".
+#
+# Intel MIC User Space Tools.
+#
+# mpss Start mpssd.
+#
+# chkconfig: 2345 95 05
+# description: start MPSS stack processing.
+#
+### BEGIN INIT INFO
+# Provides: mpss
+# Required-Start:
+# Required-Stop:
+# Short-Description: MPSS stack control
+# Description: MPSS stack control
+### END INIT INFO
+
+# Source function library.
+. /etc/init.d/functions
+
+exec=/usr/sbin/mpssd
+sysfs="/sys/class/mic"
+mic_modules="mic_host mic_x100_dma scif vop"
+
+start()
+{
+ [ -x $exec ] || exit 5
+
+ if [ "`ps -e | awk '{print $4}' | grep mpssd | head -1`" = "mpssd" ]; then
+ echo -e $"MPSSD already running! "
+ success
+ echo
+ return 0
+ fi
+
+ echo -e $"Starting MPSS Stack"
+ echo -e $"Loading MIC drivers:" $mic_modules
+
+ modprobe -a $mic_modules
+ RETVAL=$?
+ if [ $RETVAL -ne 0 ]; then
+ failure
+ echo
+ return $RETVAL
+ fi
+
+ # Start the daemon
+ echo -n $"Starting MPSSD "
+ $exec
+ RETVAL=$?
+ if [ $RETVAL -ne 0 ]; then
+ failure
+ echo
+ return $RETVAL
+ fi
+ success
+ echo
+
+ sleep 5
+
+ # Boot the cards
+ micctrl -b
+
+ # Wait till ping works
+ for f in $sysfs/*
+ do
+ count=100
+ ipaddr=`cat $f/cmdline`
+ ipaddr=${ipaddr#*address,}
+ ipaddr=`echo $ipaddr | cut -d, -f1 | cut -d\; -f1`
+ while [ $count -ge 0 ]
+ do
+ echo -e "Pinging "`basename $f`" "
+ ping -c 1 $ipaddr &> /dev/null
+ RETVAL=$?
+ if [ $RETVAL -eq 0 ]; then
+ success
+ break
+ fi
+ sleep 1
+ count=`expr $count - 1`
+ done
+ [ $RETVAL -ne 0 ] && failure || success
+ echo
+ done
+ return $RETVAL
+}
+
+stop()
+{
+ echo -e $"Shutting down MPSS Stack: "
+
+ # Bail out if module is unloaded
+ if [ ! -d "$sysfs" ]; then
+ echo -n $"Module unloaded "
+ success
+ echo
+ return 0
+ fi
+
+ # Shut down the cards.
+ micctrl -S
+
+ # Wait for the cards to go offline
+ for f in $sysfs/*
+ do
+ while [ "`cat $f/state`" != "ready" ]
+ do
+ sleep 1
+ echo -e "Waiting for "`basename $f`" to become ready"
+ done
+ done
+
+ # Display the status of the cards
+ micctrl -s
+
+ # Kill MPSSD now
+ echo -n $"Killing MPSSD"
+ killall -9 mpssd 2>/dev/null
+ RETVAL=$?
+ [ $RETVAL -ne 0 ] && failure || success
+ echo
+ return $RETVAL
+}
+
+restart()
+{
+ stop
+ sleep 5
+ start
+}
+
+status()
+{
+ micctrl -s
+ if [ "`ps -e | awk '{print $4}' | grep mpssd | head -n 1`" = "mpssd" ]; then
+ echo "mpssd is running"
+ else
+ echo "mpssd is stopped"
+ fi
+ return 0
+}
+
+unload()
+{
+ if [ ! -d "$sysfs" ]; then
+ echo -n $"No MIC_HOST Module: "
+ success
+ echo
+ return
+ fi
+
+ stop
+
+ sleep 5
+ echo -n $"Removing MIC drivers:" $mic_modules
+ modprobe -r $mic_modules
+ RETVAL=$?
+ [ $RETVAL -ne 0 ] && failure || success
+ echo
+ return $RETVAL
+}
+
+case $1 in
+ start)
+ start
+ ;;
+ stop)
+ stop
+ ;;
+ restart)
+ restart
+ ;;
+ status)
+ status
+ ;;
+ unload)
+ unload
+ ;;
+ *)
+ echo $"Usage: $0 {start|stop|restart|status|unload}"
+ exit 2
+esac
+
+exit $?
diff --git a/samples/mic/mpssd/mpssd.c b/samples/mic/mpssd/mpssd.c
new file mode 100644
index 000000000..a50d27473
--- /dev/null
+++ b/samples/mic/mpssd/mpssd.c
@@ -0,0 +1,1826 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * Copyright(c) 2013 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Intel MIC User Space Tools.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdlib.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <assert.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <poll.h>
+#include <features.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <linux/virtio_ring.h>
+#include <linux/virtio_net.h>
+#include <linux/virtio_console.h>
+#include <linux/virtio_blk.h>
+#include <linux/version.h>
+#include "mpssd.h"
+#include <linux/mic_ioctl.h>
+#include <linux/mic_common.h>
+#include <tools/endian.h>
+
+static void *init_mic(void *arg);
+
+static FILE *logfp;
+static struct mic_info mic_list;
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+#define min_t(type, x, y) ({ \
+ type __min1 = (x); \
+ type __min2 = (y); \
+ __min1 < __min2 ? __min1 : __min2; })
+
+/* align addr on a size boundary - adjust address up/down if needed */
+#define _ALIGN_DOWN(addr, size) ((addr)&(~((size)-1)))
+#define _ALIGN_UP(addr, size) _ALIGN_DOWN(addr + size - 1, size)
+
+/* align addr on a size boundary - adjust address up if needed */
+#define _ALIGN(addr, size) _ALIGN_UP(addr, size)
+
+/* to align the pointer to the (next) page boundary */
+#define PAGE_ALIGN(addr) _ALIGN(addr, PAGE_SIZE)
+
+#define READ_ONCE(x) (*(volatile typeof(x) *)&(x))
+
+#define GSO_ENABLED 1
+#define MAX_GSO_SIZE (64 * 1024)
+#define ETH_H_LEN 14
+#define MAX_NET_PKT_SIZE (_ALIGN_UP(MAX_GSO_SIZE + ETH_H_LEN, 64))
+#define MIC_DEVICE_PAGE_END 0x1000
+
+#ifndef VIRTIO_NET_HDR_F_DATA_VALID
+#define VIRTIO_NET_HDR_F_DATA_VALID 2 /* Csum is valid */
+#endif
+
+static struct {
+ struct mic_device_desc dd;
+ struct mic_vqconfig vqconfig[2];
+ __u32 host_features, guest_acknowledgements;
+ struct virtio_console_config cons_config;
+} virtcons_dev_page = {
+ .dd = {
+ .type = VIRTIO_ID_CONSOLE,
+ .num_vq = ARRAY_SIZE(virtcons_dev_page.vqconfig),
+ .feature_len = sizeof(virtcons_dev_page.host_features),
+ .config_len = sizeof(virtcons_dev_page.cons_config),
+ },
+ .vqconfig[0] = {
+ .num = htole16(MIC_VRING_ENTRIES),
+ },
+ .vqconfig[1] = {
+ .num = htole16(MIC_VRING_ENTRIES),
+ },
+};
+
+static struct {
+ struct mic_device_desc dd;
+ struct mic_vqconfig vqconfig[2];
+ __u32 host_features, guest_acknowledgements;
+ struct virtio_net_config net_config;
+} virtnet_dev_page = {
+ .dd = {
+ .type = VIRTIO_ID_NET,
+ .num_vq = ARRAY_SIZE(virtnet_dev_page.vqconfig),
+ .feature_len = sizeof(virtnet_dev_page.host_features),
+ .config_len = sizeof(virtnet_dev_page.net_config),
+ },
+ .vqconfig[0] = {
+ .num = htole16(MIC_VRING_ENTRIES),
+ },
+ .vqconfig[1] = {
+ .num = htole16(MIC_VRING_ENTRIES),
+ },
+#if GSO_ENABLED
+ .host_features = htole32(
+ 1 << VIRTIO_NET_F_CSUM |
+ 1 << VIRTIO_NET_F_GSO |
+ 1 << VIRTIO_NET_F_GUEST_TSO4 |
+ 1 << VIRTIO_NET_F_GUEST_TSO6 |
+ 1 << VIRTIO_NET_F_GUEST_ECN),
+#else
+ .host_features = 0,
+#endif
+};
+
+static const char *mic_config_dir = "/etc/mpss";
+static const char *virtblk_backend = "VIRTBLK_BACKEND";
+static struct {
+ struct mic_device_desc dd;
+ struct mic_vqconfig vqconfig[1];
+ __u32 host_features, guest_acknowledgements;
+ struct virtio_blk_config blk_config;
+} virtblk_dev_page = {
+ .dd = {
+ .type = VIRTIO_ID_BLOCK,
+ .num_vq = ARRAY_SIZE(virtblk_dev_page.vqconfig),
+ .feature_len = sizeof(virtblk_dev_page.host_features),
+ .config_len = sizeof(virtblk_dev_page.blk_config),
+ },
+ .vqconfig[0] = {
+ .num = htole16(MIC_VRING_ENTRIES),
+ },
+ .host_features =
+ htole32(1<<VIRTIO_BLK_F_SEG_MAX),
+ .blk_config = {
+ .seg_max = htole32(MIC_VRING_ENTRIES - 2),
+ .capacity = htole64(0),
+ }
+};
+
+static char *myname;
+
+static int
+tap_configure(struct mic_info *mic, char *dev)
+{
+ pid_t pid;
+ char *ifargv[7];
+ char ipaddr[IFNAMSIZ];
+ int ret = 0;
+
+ pid = fork();
+ if (pid == 0) {
+ ifargv[0] = "ip";
+ ifargv[1] = "link";
+ ifargv[2] = "set";
+ ifargv[3] = dev;
+ ifargv[4] = "up";
+ ifargv[5] = NULL;
+ mpsslog("Configuring %s\n", dev);
+ ret = execvp("ip", ifargv);
+ if (ret < 0) {
+ mpsslog("%s execvp failed errno %s\n",
+ mic->name, strerror(errno));
+ return ret;
+ }
+ }
+ if (pid < 0) {
+ mpsslog("%s fork failed errno %s\n",
+ mic->name, strerror(errno));
+ return ret;
+ }
+
+ ret = waitpid(pid, NULL, 0);
+ if (ret < 0) {
+ mpsslog("%s waitpid failed errno %s\n",
+ mic->name, strerror(errno));
+ return ret;
+ }
+
+ snprintf(ipaddr, IFNAMSIZ, "172.31.%d.254/24", mic->id + 1);
+
+ pid = fork();
+ if (pid == 0) {
+ ifargv[0] = "ip";
+ ifargv[1] = "addr";
+ ifargv[2] = "add";
+ ifargv[3] = ipaddr;
+ ifargv[4] = "dev";
+ ifargv[5] = dev;
+ ifargv[6] = NULL;
+ mpsslog("Configuring %s ipaddr %s\n", dev, ipaddr);
+ ret = execvp("ip", ifargv);
+ if (ret < 0) {
+ mpsslog("%s execvp failed errno %s\n",
+ mic->name, strerror(errno));
+ return ret;
+ }
+ }
+ if (pid < 0) {
+ mpsslog("%s fork failed errno %s\n",
+ mic->name, strerror(errno));
+ return ret;
+ }
+
+ ret = waitpid(pid, NULL, 0);
+ if (ret < 0) {
+ mpsslog("%s waitpid failed errno %s\n",
+ mic->name, strerror(errno));
+ return ret;
+ }
+ mpsslog("MIC name %s %s %d DONE!\n",
+ mic->name, __func__, __LINE__);
+ return 0;
+}
+
+static int tun_alloc(struct mic_info *mic, char *dev)
+{
+ struct ifreq ifr;
+ int fd, err;
+#if GSO_ENABLED
+ unsigned offload;
+#endif
+ fd = open("/dev/net/tun", O_RDWR);
+ if (fd < 0) {
+ mpsslog("Could not open /dev/net/tun %s\n", strerror(errno));
+ goto done;
+ }
+
+ memset(&ifr, 0, sizeof(ifr));
+
+ ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
+ if (*dev)
+ strncpy(ifr.ifr_name, dev, IFNAMSIZ);
+
+ err = ioctl(fd, TUNSETIFF, (void *)&ifr);
+ if (err < 0) {
+ mpsslog("%s %s %d TUNSETIFF failed %s\n",
+ mic->name, __func__, __LINE__, strerror(errno));
+ close(fd);
+ return err;
+ }
+#if GSO_ENABLED
+ offload = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_TSO_ECN;
+
+ err = ioctl(fd, TUNSETOFFLOAD, offload);
+ if (err < 0) {
+ mpsslog("%s %s %d TUNSETOFFLOAD failed %s\n",
+ mic->name, __func__, __LINE__, strerror(errno));
+ close(fd);
+ return err;
+ }
+#endif
+ strcpy(dev, ifr.ifr_name);
+ mpsslog("Created TAP %s\n", dev);
+done:
+ return fd;
+}
+
+#define NET_FD_VIRTIO_NET 0
+#define NET_FD_TUN 1
+#define MAX_NET_FD 2
+
+static void set_dp(struct mic_info *mic, int type, void *dp)
+{
+ switch (type) {
+ case VIRTIO_ID_CONSOLE:
+ mic->mic_console.console_dp = dp;
+ return;
+ case VIRTIO_ID_NET:
+ mic->mic_net.net_dp = dp;
+ return;
+ case VIRTIO_ID_BLOCK:
+ mic->mic_virtblk.block_dp = dp;
+ return;
+ }
+ mpsslog("%s %s %d not found\n", mic->name, __func__, type);
+ assert(0);
+}
+
+static void *get_dp(struct mic_info *mic, int type)
+{
+ switch (type) {
+ case VIRTIO_ID_CONSOLE:
+ return mic->mic_console.console_dp;
+ case VIRTIO_ID_NET:
+ return mic->mic_net.net_dp;
+ case VIRTIO_ID_BLOCK:
+ return mic->mic_virtblk.block_dp;
+ }
+ mpsslog("%s %s %d not found\n", mic->name, __func__, type);
+ assert(0);
+ return NULL;
+}
+
+static struct mic_device_desc *get_device_desc(struct mic_info *mic, int type)
+{
+ struct mic_device_desc *d;
+ int i;
+ void *dp = get_dp(mic, type);
+
+ for (i = sizeof(struct mic_bootparam); i < PAGE_SIZE;
+ i += mic_total_desc_size(d)) {
+ d = dp + i;
+
+ /* End of list */
+ if (d->type == 0)
+ break;
+
+ if (d->type == -1)
+ continue;
+
+ mpsslog("%s %s d-> type %d d %p\n",
+ mic->name, __func__, d->type, d);
+
+ if (d->type == (__u8)type)
+ return d;
+ }
+ mpsslog("%s %s %d not found\n", mic->name, __func__, type);
+ return NULL;
+}
+
+/* See comments in vhost.c for explanation of next_desc() */
+static unsigned next_desc(struct vring_desc *desc)
+{
+ unsigned int next;
+
+ if (!(le16toh(desc->flags) & VRING_DESC_F_NEXT))
+ return -1U;
+ next = le16toh(desc->next);
+ return next;
+}
+
+/* Sum up all the IOVEC length */
+static ssize_t
+sum_iovec_len(struct mic_copy_desc *copy)
+{
+ ssize_t sum = 0;
+ unsigned int i;
+
+ for (i = 0; i < copy->iovcnt; i++)
+ sum += copy->iov[i].iov_len;
+ return sum;
+}
+
+static inline void verify_out_len(struct mic_info *mic,
+ struct mic_copy_desc *copy)
+{
+ if (copy->out_len != sum_iovec_len(copy)) {
+ mpsslog("%s %s %d BUG copy->out_len 0x%x len 0x%zx\n",
+ mic->name, __func__, __LINE__,
+ copy->out_len, sum_iovec_len(copy));
+ assert(copy->out_len == sum_iovec_len(copy));
+ }
+}
+
+/* Display an iovec */
+static void
+disp_iovec(struct mic_info *mic, struct mic_copy_desc *copy,
+ const char *s, int line)
+{
+ unsigned int i;
+
+ for (i = 0; i < copy->iovcnt; i++)
+ mpsslog("%s %s %d copy->iov[%d] addr %p len 0x%zx\n",
+ mic->name, s, line, i,
+ copy->iov[i].iov_base, copy->iov[i].iov_len);
+}
+
+static inline __u16 read_avail_idx(struct mic_vring *vr)
+{
+ return READ_ONCE(vr->info->avail_idx);
+}
+
+static inline void txrx_prepare(int type, bool tx, struct mic_vring *vr,
+ struct mic_copy_desc *copy, ssize_t len)
+{
+ copy->vr_idx = tx ? 0 : 1;
+ copy->update_used = true;
+ if (type == VIRTIO_ID_NET)
+ copy->iov[1].iov_len = len - sizeof(struct virtio_net_hdr);
+ else
+ copy->iov[0].iov_len = len;
+}
+
+/* Central API which triggers the copies */
+static int
+mic_virtio_copy(struct mic_info *mic, int fd,
+ struct mic_vring *vr, struct mic_copy_desc *copy)
+{
+ int ret;
+
+ ret = ioctl(fd, MIC_VIRTIO_COPY_DESC, copy);
+ if (ret) {
+ mpsslog("%s %s %d errno %s ret %d\n",
+ mic->name, __func__, __LINE__,
+ strerror(errno), ret);
+ }
+ return ret;
+}
+
+static inline unsigned _vring_size(unsigned int num, unsigned long align)
+{
+ return _ALIGN_UP(((sizeof(struct vring_desc) * num + sizeof(__u16) * (3 + num)
+ + align - 1) & ~(align - 1))
+ + sizeof(__u16) * 3 + sizeof(struct vring_used_elem) * num, 4);
+}
+
+/*
+ * This initialization routine requires at least one
+ * vring i.e. vr0. vr1 is optional.
+ */
+static void *
+init_vr(struct mic_info *mic, int fd, int type,
+ struct mic_vring *vr0, struct mic_vring *vr1, int num_vq)
+{
+ int vr_size;
+ char *va;
+
+ vr_size = PAGE_ALIGN(_vring_size(MIC_VRING_ENTRIES,
+ MIC_VIRTIO_RING_ALIGN) +
+ sizeof(struct _mic_vring_info));
+ va = mmap(NULL, MIC_DEVICE_PAGE_END + vr_size * num_vq,
+ PROT_READ, MAP_SHARED, fd, 0);
+ if (MAP_FAILED == va) {
+ mpsslog("%s %s %d mmap failed errno %s\n",
+ mic->name, __func__, __LINE__,
+ strerror(errno));
+ goto done;
+ }
+ set_dp(mic, type, va);
+ vr0->va = (struct mic_vring *)&va[MIC_DEVICE_PAGE_END];
+ vr0->info = vr0->va +
+ _vring_size(MIC_VRING_ENTRIES, MIC_VIRTIO_RING_ALIGN);
+ vring_init(&vr0->vr,
+ MIC_VRING_ENTRIES, vr0->va, MIC_VIRTIO_RING_ALIGN);
+ mpsslog("%s %s vr0 %p vr0->info %p vr_size 0x%x vring 0x%x ",
+ __func__, mic->name, vr0->va, vr0->info, vr_size,
+ _vring_size(MIC_VRING_ENTRIES, MIC_VIRTIO_RING_ALIGN));
+ mpsslog("magic 0x%x expected 0x%x\n",
+ le32toh(vr0->info->magic), MIC_MAGIC + type);
+ assert(le32toh(vr0->info->magic) == MIC_MAGIC + type);
+ if (vr1) {
+ vr1->va = (struct mic_vring *)
+ &va[MIC_DEVICE_PAGE_END + vr_size];
+ vr1->info = vr1->va + _vring_size(MIC_VRING_ENTRIES,
+ MIC_VIRTIO_RING_ALIGN);
+ vring_init(&vr1->vr,
+ MIC_VRING_ENTRIES, vr1->va, MIC_VIRTIO_RING_ALIGN);
+ mpsslog("%s %s vr1 %p vr1->info %p vr_size 0x%x vring 0x%x ",
+ __func__, mic->name, vr1->va, vr1->info, vr_size,
+ _vring_size(MIC_VRING_ENTRIES, MIC_VIRTIO_RING_ALIGN));
+ mpsslog("magic 0x%x expected 0x%x\n",
+ le32toh(vr1->info->magic), MIC_MAGIC + type + 1);
+ assert(le32toh(vr1->info->magic) == MIC_MAGIC + type + 1);
+ }
+done:
+ return va;
+}
+
+static int
+wait_for_card_driver(struct mic_info *mic, int fd, int type)
+{
+ struct pollfd pollfd;
+ int err;
+ struct mic_device_desc *desc = get_device_desc(mic, type);
+ __u8 prev_status;
+
+ if (!desc)
+ return -ENODEV;
+ prev_status = desc->status;
+ pollfd.fd = fd;
+ mpsslog("%s %s Waiting .... desc-> type %d status 0x%x\n",
+ mic->name, __func__, type, desc->status);
+
+ while (1) {
+ pollfd.events = POLLIN;
+ pollfd.revents = 0;
+ err = poll(&pollfd, 1, -1);
+ if (err < 0) {
+ mpsslog("%s %s poll failed %s\n",
+ mic->name, __func__, strerror(errno));
+ continue;
+ }
+
+ if (pollfd.revents) {
+ if (desc->status != prev_status) {
+ mpsslog("%s %s Waiting... desc-> type %d "
+ "status 0x%x\n",
+ mic->name, __func__, type,
+ desc->status);
+ prev_status = desc->status;
+ }
+ if (desc->status & VIRTIO_CONFIG_S_DRIVER_OK) {
+ mpsslog("%s %s poll.revents %d\n",
+ mic->name, __func__, pollfd.revents);
+ mpsslog("%s %s desc-> type %d status 0x%x\n",
+ mic->name, __func__, type,
+ desc->status);
+ break;
+ }
+ }
+ }
+ return 0;
+}
+
+/* Spin till we have some descriptors */
+static void
+spin_for_descriptors(struct mic_info *mic, struct mic_vring *vr)
+{
+ __u16 avail_idx = read_avail_idx(vr);
+
+ while (avail_idx == le16toh(READ_ONCE(vr->vr.avail->idx))) {
+#ifdef DEBUG
+ mpsslog("%s %s waiting for desc avail %d info_avail %d\n",
+ mic->name, __func__,
+ le16toh(vr->vr.avail->idx), vr->info->avail_idx);
+#endif
+ sched_yield();
+ }
+}
+
+static void *
+virtio_net(void *arg)
+{
+ static __u8 vnet_hdr[2][sizeof(struct virtio_net_hdr)];
+ static __u8 vnet_buf[2][MAX_NET_PKT_SIZE] __attribute__ ((aligned(64)));
+ struct iovec vnet_iov[2][2] = {
+ { { .iov_base = vnet_hdr[0], .iov_len = sizeof(vnet_hdr[0]) },
+ { .iov_base = vnet_buf[0], .iov_len = sizeof(vnet_buf[0]) } },
+ { { .iov_base = vnet_hdr[1], .iov_len = sizeof(vnet_hdr[1]) },
+ { .iov_base = vnet_buf[1], .iov_len = sizeof(vnet_buf[1]) } },
+ };
+ struct iovec *iov0 = vnet_iov[0], *iov1 = vnet_iov[1];
+ struct mic_info *mic = (struct mic_info *)arg;
+ char if_name[IFNAMSIZ];
+ struct pollfd net_poll[MAX_NET_FD];
+ struct mic_vring tx_vr, rx_vr;
+ struct mic_copy_desc copy;
+ struct mic_device_desc *desc;
+ int err;
+
+ snprintf(if_name, IFNAMSIZ, "mic%d", mic->id);
+ mic->mic_net.tap_fd = tun_alloc(mic, if_name);
+ if (mic->mic_net.tap_fd < 0)
+ goto done;
+
+ if (tap_configure(mic, if_name))
+ goto done;
+ mpsslog("MIC name %s id %d\n", mic->name, mic->id);
+
+ net_poll[NET_FD_VIRTIO_NET].fd = mic->mic_net.virtio_net_fd;
+ net_poll[NET_FD_VIRTIO_NET].events = POLLIN;
+ net_poll[NET_FD_TUN].fd = mic->mic_net.tap_fd;
+ net_poll[NET_FD_TUN].events = POLLIN;
+
+ if (MAP_FAILED == init_vr(mic, mic->mic_net.virtio_net_fd,
+ VIRTIO_ID_NET, &tx_vr, &rx_vr,
+ virtnet_dev_page.dd.num_vq)) {
+ mpsslog("%s init_vr failed %s\n",
+ mic->name, strerror(errno));
+ goto done;
+ }
+
+ copy.iovcnt = 2;
+ desc = get_device_desc(mic, VIRTIO_ID_NET);
+
+ while (1) {
+ ssize_t len;
+
+ net_poll[NET_FD_VIRTIO_NET].revents = 0;
+ net_poll[NET_FD_TUN].revents = 0;
+
+ /* Start polling for data from tap and virtio net */
+ err = poll(net_poll, 2, -1);
+ if (err < 0) {
+ mpsslog("%s poll failed %s\n",
+ __func__, strerror(errno));
+ continue;
+ }
+ if (!(desc->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
+ err = wait_for_card_driver(mic,
+ mic->mic_net.virtio_net_fd,
+ VIRTIO_ID_NET);
+ if (err) {
+ mpsslog("%s %s %d Exiting...\n",
+ mic->name, __func__, __LINE__);
+ break;
+ }
+ }
+ /*
+ * Check if there is data to be read from TUN and write to
+ * virtio net fd if there is.
+ */
+ if (net_poll[NET_FD_TUN].revents & POLLIN) {
+ copy.iov = iov0;
+ len = readv(net_poll[NET_FD_TUN].fd,
+ copy.iov, copy.iovcnt);
+ if (len > 0) {
+ struct virtio_net_hdr *hdr
+ = (struct virtio_net_hdr *)vnet_hdr[0];
+
+ /* Disable checksums on the card since we are on
+ a reliable PCIe link */
+ hdr->flags |= VIRTIO_NET_HDR_F_DATA_VALID;
+#ifdef DEBUG
+ mpsslog("%s %s %d hdr->flags 0x%x ", mic->name,
+ __func__, __LINE__, hdr->flags);
+ mpsslog("copy.out_len %d hdr->gso_type 0x%x\n",
+ copy.out_len, hdr->gso_type);
+#endif
+#ifdef DEBUG
+ disp_iovec(mic, copy, __func__, __LINE__);
+ mpsslog("%s %s %d read from tap 0x%lx\n",
+ mic->name, __func__, __LINE__,
+ len);
+#endif
+ spin_for_descriptors(mic, &tx_vr);
+ txrx_prepare(VIRTIO_ID_NET, 1, &tx_vr, &copy,
+ len);
+
+ err = mic_virtio_copy(mic,
+ mic->mic_net.virtio_net_fd, &tx_vr,
+ &copy);
+ if (err < 0) {
+ mpsslog("%s %s %d mic_virtio_copy %s\n",
+ mic->name, __func__, __LINE__,
+ strerror(errno));
+ }
+ if (!err)
+ verify_out_len(mic, &copy);
+#ifdef DEBUG
+ disp_iovec(mic, copy, __func__, __LINE__);
+ mpsslog("%s %s %d wrote to net 0x%lx\n",
+ mic->name, __func__, __LINE__,
+ sum_iovec_len(&copy));
+#endif
+ /* Reinitialize IOV for next run */
+ iov0[1].iov_len = MAX_NET_PKT_SIZE;
+ } else if (len < 0) {
+ disp_iovec(mic, &copy, __func__, __LINE__);
+ mpsslog("%s %s %d read failed %s ", mic->name,
+ __func__, __LINE__, strerror(errno));
+ mpsslog("cnt %d sum %zd\n",
+ copy.iovcnt, sum_iovec_len(&copy));
+ }
+ }
+
+ /*
+ * Check if there is data to be read from virtio net and
+ * write to TUN if there is.
+ */
+ if (net_poll[NET_FD_VIRTIO_NET].revents & POLLIN) {
+ while (rx_vr.info->avail_idx !=
+ le16toh(rx_vr.vr.avail->idx)) {
+ copy.iov = iov1;
+ txrx_prepare(VIRTIO_ID_NET, 0, &rx_vr, &copy,
+ MAX_NET_PKT_SIZE
+ + sizeof(struct virtio_net_hdr));
+
+ err = mic_virtio_copy(mic,
+ mic->mic_net.virtio_net_fd, &rx_vr,
+ &copy);
+ if (!err) {
+#ifdef DEBUG
+ struct virtio_net_hdr *hdr
+ = (struct virtio_net_hdr *)
+ vnet_hdr[1];
+
+ mpsslog("%s %s %d hdr->flags 0x%x, ",
+ mic->name, __func__, __LINE__,
+ hdr->flags);
+ mpsslog("out_len %d gso_type 0x%x\n",
+ copy.out_len,
+ hdr->gso_type);
+#endif
+ /* Set the correct output iov_len */
+ iov1[1].iov_len = copy.out_len -
+ sizeof(struct virtio_net_hdr);
+ verify_out_len(mic, &copy);
+#ifdef DEBUG
+ disp_iovec(mic, copy, __func__,
+ __LINE__);
+ mpsslog("%s %s %d ",
+ mic->name, __func__, __LINE__);
+ mpsslog("read from net 0x%lx\n",
+ sum_iovec_len(copy));
+#endif
+ len = writev(net_poll[NET_FD_TUN].fd,
+ copy.iov, copy.iovcnt);
+ if (len != sum_iovec_len(&copy)) {
+ mpsslog("Tun write failed %s ",
+ strerror(errno));
+ mpsslog("len 0x%zx ", len);
+ mpsslog("read_len 0x%zx\n",
+ sum_iovec_len(&copy));
+ } else {
+#ifdef DEBUG
+ disp_iovec(mic, &copy, __func__,
+ __LINE__);
+ mpsslog("%s %s %d ",
+ mic->name, __func__,
+ __LINE__);
+ mpsslog("wrote to tap 0x%lx\n",
+ len);
+#endif
+ }
+ } else {
+ mpsslog("%s %s %d mic_virtio_copy %s\n",
+ mic->name, __func__, __LINE__,
+ strerror(errno));
+ break;
+ }
+ }
+ }
+ if (net_poll[NET_FD_VIRTIO_NET].revents & POLLERR)
+ mpsslog("%s: %s: POLLERR\n", __func__, mic->name);
+ }
+done:
+ pthread_exit(NULL);
+}
+
+/* virtio_console */
+#define VIRTIO_CONSOLE_FD 0
+#define MONITOR_FD (VIRTIO_CONSOLE_FD + 1)
+#define MAX_CONSOLE_FD (MONITOR_FD + 1) /* must be the last one + 1 */
+#define MAX_BUFFER_SIZE PAGE_SIZE
+
+static void *
+virtio_console(void *arg)
+{
+ static __u8 vcons_buf[2][PAGE_SIZE];
+ struct iovec vcons_iov[2] = {
+ { .iov_base = vcons_buf[0], .iov_len = sizeof(vcons_buf[0]) },
+ { .iov_base = vcons_buf[1], .iov_len = sizeof(vcons_buf[1]) },
+ };
+ struct iovec *iov0 = &vcons_iov[0], *iov1 = &vcons_iov[1];
+ struct mic_info *mic = (struct mic_info *)arg;
+ int err;
+ struct pollfd console_poll[MAX_CONSOLE_FD];
+ int pty_fd;
+ char *pts_name;
+ ssize_t len;
+ struct mic_vring tx_vr, rx_vr;
+ struct mic_copy_desc copy;
+ struct mic_device_desc *desc;
+
+ pty_fd = posix_openpt(O_RDWR);
+ if (pty_fd < 0) {
+ mpsslog("can't open a pseudoterminal master device: %s\n",
+ strerror(errno));
+ goto _return;
+ }
+ pts_name = ptsname(pty_fd);
+ if (pts_name == NULL) {
+ mpsslog("can't get pts name\n");
+ goto _close_pty;
+ }
+ printf("%s console message goes to %s\n", mic->name, pts_name);
+ mpsslog("%s console message goes to %s\n", mic->name, pts_name);
+ err = grantpt(pty_fd);
+ if (err < 0) {
+ mpsslog("can't grant access: %s %s\n",
+ pts_name, strerror(errno));
+ goto _close_pty;
+ }
+ err = unlockpt(pty_fd);
+ if (err < 0) {
+ mpsslog("can't unlock a pseudoterminal: %s %s\n",
+ pts_name, strerror(errno));
+ goto _close_pty;
+ }
+ console_poll[MONITOR_FD].fd = pty_fd;
+ console_poll[MONITOR_FD].events = POLLIN;
+
+ console_poll[VIRTIO_CONSOLE_FD].fd = mic->mic_console.virtio_console_fd;
+ console_poll[VIRTIO_CONSOLE_FD].events = POLLIN;
+
+ if (MAP_FAILED == init_vr(mic, mic->mic_console.virtio_console_fd,
+ VIRTIO_ID_CONSOLE, &tx_vr, &rx_vr,
+ virtcons_dev_page.dd.num_vq)) {
+ mpsslog("%s init_vr failed %s\n",
+ mic->name, strerror(errno));
+ goto _close_pty;
+ }
+
+ copy.iovcnt = 1;
+ desc = get_device_desc(mic, VIRTIO_ID_CONSOLE);
+
+ for (;;) {
+ console_poll[MONITOR_FD].revents = 0;
+ console_poll[VIRTIO_CONSOLE_FD].revents = 0;
+ err = poll(console_poll, MAX_CONSOLE_FD, -1);
+ if (err < 0) {
+ mpsslog("%s %d: poll failed: %s\n", __func__, __LINE__,
+ strerror(errno));
+ continue;
+ }
+ if (!(desc->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
+ err = wait_for_card_driver(mic,
+ mic->mic_console.virtio_console_fd,
+ VIRTIO_ID_CONSOLE);
+ if (err) {
+ mpsslog("%s %s %d Exiting...\n",
+ mic->name, __func__, __LINE__);
+ break;
+ }
+ }
+
+ if (console_poll[MONITOR_FD].revents & POLLIN) {
+ copy.iov = iov0;
+ len = readv(pty_fd, copy.iov, copy.iovcnt);
+ if (len > 0) {
+#ifdef DEBUG
+ disp_iovec(mic, copy, __func__, __LINE__);
+ mpsslog("%s %s %d read from tap 0x%lx\n",
+ mic->name, __func__, __LINE__,
+ len);
+#endif
+ spin_for_descriptors(mic, &tx_vr);
+ txrx_prepare(VIRTIO_ID_CONSOLE, 1, &tx_vr,
+ &copy, len);
+
+ err = mic_virtio_copy(mic,
+ mic->mic_console.virtio_console_fd,
+ &tx_vr, &copy);
+ if (err < 0) {
+ mpsslog("%s %s %d mic_virtio_copy %s\n",
+ mic->name, __func__, __LINE__,
+ strerror(errno));
+ }
+ if (!err)
+ verify_out_len(mic, &copy);
+#ifdef DEBUG
+ disp_iovec(mic, copy, __func__, __LINE__);
+ mpsslog("%s %s %d wrote to net 0x%lx\n",
+ mic->name, __func__, __LINE__,
+ sum_iovec_len(copy));
+#endif
+ /* Reinitialize IOV for next run */
+ iov0->iov_len = PAGE_SIZE;
+ } else if (len < 0) {
+ disp_iovec(mic, &copy, __func__, __LINE__);
+ mpsslog("%s %s %d read failed %s ",
+ mic->name, __func__, __LINE__,
+ strerror(errno));
+ mpsslog("cnt %d sum %zd\n",
+ copy.iovcnt, sum_iovec_len(&copy));
+ }
+ }
+
+ if (console_poll[VIRTIO_CONSOLE_FD].revents & POLLIN) {
+ while (rx_vr.info->avail_idx !=
+ le16toh(rx_vr.vr.avail->idx)) {
+ copy.iov = iov1;
+ txrx_prepare(VIRTIO_ID_CONSOLE, 0, &rx_vr,
+ &copy, PAGE_SIZE);
+
+ err = mic_virtio_copy(mic,
+ mic->mic_console.virtio_console_fd,
+ &rx_vr, &copy);
+ if (!err) {
+ /* Set the correct output iov_len */
+ iov1->iov_len = copy.out_len;
+ verify_out_len(mic, &copy);
+#ifdef DEBUG
+ disp_iovec(mic, copy, __func__,
+ __LINE__);
+ mpsslog("%s %s %d ",
+ mic->name, __func__, __LINE__);
+ mpsslog("read from net 0x%lx\n",
+ sum_iovec_len(copy));
+#endif
+ len = writev(pty_fd,
+ copy.iov, copy.iovcnt);
+ if (len != sum_iovec_len(&copy)) {
+ mpsslog("Tun write failed %s ",
+ strerror(errno));
+ mpsslog("len 0x%zx ", len);
+ mpsslog("read_len 0x%zx\n",
+ sum_iovec_len(&copy));
+ } else {
+#ifdef DEBUG
+ disp_iovec(mic, copy, __func__,
+ __LINE__);
+ mpsslog("%s %s %d ",
+ mic->name, __func__,
+ __LINE__);
+ mpsslog("wrote to tap 0x%lx\n",
+ len);
+#endif
+ }
+ } else {
+ mpsslog("%s %s %d mic_virtio_copy %s\n",
+ mic->name, __func__, __LINE__,
+ strerror(errno));
+ break;
+ }
+ }
+ }
+ if (console_poll[NET_FD_VIRTIO_NET].revents & POLLERR)
+ mpsslog("%s: %s: POLLERR\n", __func__, mic->name);
+ }
+_close_pty:
+ close(pty_fd);
+_return:
+ pthread_exit(NULL);
+}
+
+static void
+add_virtio_device(struct mic_info *mic, struct mic_device_desc *dd)
+{
+ char path[PATH_MAX];
+ int fd, err;
+
+ snprintf(path, PATH_MAX, "/dev/vop_virtio%d", mic->id);
+ fd = open(path, O_RDWR);
+ if (fd < 0) {
+ mpsslog("Could not open %s %s\n", path, strerror(errno));
+ return;
+ }
+
+ err = ioctl(fd, MIC_VIRTIO_ADD_DEVICE, dd);
+ if (err < 0) {
+ mpsslog("Could not add %d %s\n", dd->type, strerror(errno));
+ close(fd);
+ return;
+ }
+ switch (dd->type) {
+ case VIRTIO_ID_NET:
+ mic->mic_net.virtio_net_fd = fd;
+ mpsslog("Added VIRTIO_ID_NET for %s\n", mic->name);
+ break;
+ case VIRTIO_ID_CONSOLE:
+ mic->mic_console.virtio_console_fd = fd;
+ mpsslog("Added VIRTIO_ID_CONSOLE for %s\n", mic->name);
+ break;
+ case VIRTIO_ID_BLOCK:
+ mic->mic_virtblk.virtio_block_fd = fd;
+ mpsslog("Added VIRTIO_ID_BLOCK for %s\n", mic->name);
+ break;
+ }
+}
+
+static bool
+set_backend_file(struct mic_info *mic)
+{
+ FILE *config;
+ char buff[PATH_MAX], *line, *evv, *p;
+
+ snprintf(buff, PATH_MAX, "%s/mpssd%03d.conf", mic_config_dir, mic->id);
+ config = fopen(buff, "r");
+ if (config == NULL)
+ return false;
+ do { /* look for "virtblk_backend=XXXX" */
+ line = fgets(buff, PATH_MAX, config);
+ if (line == NULL)
+ break;
+ if (*line == '#')
+ continue;
+ p = strchr(line, '\n');
+ if (p)
+ *p = '\0';
+ } while (strncmp(line, virtblk_backend, strlen(virtblk_backend)) != 0);
+ fclose(config);
+ if (line == NULL)
+ return false;
+ evv = strchr(line, '=');
+ if (evv == NULL)
+ return false;
+ mic->mic_virtblk.backend_file = malloc(strlen(evv) + 1);
+ if (mic->mic_virtblk.backend_file == NULL) {
+ mpsslog("%s %d can't allocate memory\n", mic->name, mic->id);
+ return false;
+ }
+ strcpy(mic->mic_virtblk.backend_file, evv + 1);
+ return true;
+}
+
+#define SECTOR_SIZE 512
+static bool
+set_backend_size(struct mic_info *mic)
+{
+ mic->mic_virtblk.backend_size = lseek(mic->mic_virtblk.backend, 0,
+ SEEK_END);
+ if (mic->mic_virtblk.backend_size < 0) {
+ mpsslog("%s: can't seek: %s\n",
+ mic->name, mic->mic_virtblk.backend_file);
+ return false;
+ }
+ virtblk_dev_page.blk_config.capacity =
+ mic->mic_virtblk.backend_size / SECTOR_SIZE;
+ if ((mic->mic_virtblk.backend_size % SECTOR_SIZE) != 0)
+ virtblk_dev_page.blk_config.capacity++;
+
+ virtblk_dev_page.blk_config.capacity =
+ htole64(virtblk_dev_page.blk_config.capacity);
+
+ return true;
+}
+
+static bool
+open_backend(struct mic_info *mic)
+{
+ if (!set_backend_file(mic))
+ goto _error_exit;
+ mic->mic_virtblk.backend = open(mic->mic_virtblk.backend_file, O_RDWR);
+ if (mic->mic_virtblk.backend < 0) {
+ mpsslog("%s: can't open: %s\n", mic->name,
+ mic->mic_virtblk.backend_file);
+ goto _error_free;
+ }
+ if (!set_backend_size(mic))
+ goto _error_close;
+ mic->mic_virtblk.backend_addr = mmap(NULL,
+ mic->mic_virtblk.backend_size,
+ PROT_READ|PROT_WRITE, MAP_SHARED,
+ mic->mic_virtblk.backend, 0L);
+ if (mic->mic_virtblk.backend_addr == MAP_FAILED) {
+ mpsslog("%s: can't map: %s %s\n",
+ mic->name, mic->mic_virtblk.backend_file,
+ strerror(errno));
+ goto _error_close;
+ }
+ return true;
+
+ _error_close:
+ close(mic->mic_virtblk.backend);
+ _error_free:
+ free(mic->mic_virtblk.backend_file);
+ _error_exit:
+ return false;
+}
+
+static void
+close_backend(struct mic_info *mic)
+{
+ munmap(mic->mic_virtblk.backend_addr, mic->mic_virtblk.backend_size);
+ close(mic->mic_virtblk.backend);
+ free(mic->mic_virtblk.backend_file);
+}
+
+static bool
+start_virtblk(struct mic_info *mic, struct mic_vring *vring)
+{
+ if (((unsigned long)&virtblk_dev_page.blk_config % 8) != 0) {
+ mpsslog("%s: blk_config is not 8 byte aligned.\n",
+ mic->name);
+ return false;
+ }
+ add_virtio_device(mic, &virtblk_dev_page.dd);
+ if (MAP_FAILED == init_vr(mic, mic->mic_virtblk.virtio_block_fd,
+ VIRTIO_ID_BLOCK, vring, NULL,
+ virtblk_dev_page.dd.num_vq)) {
+ mpsslog("%s init_vr failed %s\n",
+ mic->name, strerror(errno));
+ return false;
+ }
+ return true;
+}
+
+static void
+stop_virtblk(struct mic_info *mic)
+{
+ int vr_size, ret;
+
+ vr_size = PAGE_ALIGN(_vring_size(MIC_VRING_ENTRIES,
+ MIC_VIRTIO_RING_ALIGN) +
+ sizeof(struct _mic_vring_info));
+ ret = munmap(mic->mic_virtblk.block_dp,
+ MIC_DEVICE_PAGE_END + vr_size * virtblk_dev_page.dd.num_vq);
+ if (ret < 0)
+ mpsslog("%s munmap errno %d\n", mic->name, errno);
+ close(mic->mic_virtblk.virtio_block_fd);
+}
+
+static __u8
+header_error_check(struct vring_desc *desc)
+{
+ if (le32toh(desc->len) != sizeof(struct virtio_blk_outhdr)) {
+ mpsslog("%s() %d: length is not sizeof(virtio_blk_outhd)\n",
+ __func__, __LINE__);
+ return -EIO;
+ }
+ if (!(le16toh(desc->flags) & VRING_DESC_F_NEXT)) {
+ mpsslog("%s() %d: alone\n",
+ __func__, __LINE__);
+ return -EIO;
+ }
+ if (le16toh(desc->flags) & VRING_DESC_F_WRITE) {
+ mpsslog("%s() %d: not read\n",
+ __func__, __LINE__);
+ return -EIO;
+ }
+ return 0;
+}
+
+static int
+read_header(int fd, struct virtio_blk_outhdr *hdr, __u32 desc_idx)
+{
+ struct iovec iovec;
+ struct mic_copy_desc copy;
+
+ iovec.iov_len = sizeof(*hdr);
+ iovec.iov_base = hdr;
+ copy.iov = &iovec;
+ copy.iovcnt = 1;
+ copy.vr_idx = 0; /* only one vring on virtio_block */
+ copy.update_used = false; /* do not update used index */
+ return ioctl(fd, MIC_VIRTIO_COPY_DESC, &copy);
+}
+
+static int
+transfer_blocks(int fd, struct iovec *iovec, __u32 iovcnt)
+{
+ struct mic_copy_desc copy;
+
+ copy.iov = iovec;
+ copy.iovcnt = iovcnt;
+ copy.vr_idx = 0; /* only one vring on virtio_block */
+ copy.update_used = false; /* do not update used index */
+ return ioctl(fd, MIC_VIRTIO_COPY_DESC, &copy);
+}
+
+static __u8
+status_error_check(struct vring_desc *desc)
+{
+ if (le32toh(desc->len) != sizeof(__u8)) {
+ mpsslog("%s() %d: length is not sizeof(status)\n",
+ __func__, __LINE__);
+ return -EIO;
+ }
+ return 0;
+}
+
+static int
+write_status(int fd, __u8 *status)
+{
+ struct iovec iovec;
+ struct mic_copy_desc copy;
+
+ iovec.iov_base = status;
+ iovec.iov_len = sizeof(*status);
+ copy.iov = &iovec;
+ copy.iovcnt = 1;
+ copy.vr_idx = 0; /* only one vring on virtio_block */
+ copy.update_used = true; /* Update used index */
+ return ioctl(fd, MIC_VIRTIO_COPY_DESC, &copy);
+}
+
+#ifndef VIRTIO_BLK_T_GET_ID
+#define VIRTIO_BLK_T_GET_ID 8
+#endif
+
+static void *
+virtio_block(void *arg)
+{
+ struct mic_info *mic = (struct mic_info *)arg;
+ int ret;
+ struct pollfd block_poll;
+ struct mic_vring vring;
+ __u16 avail_idx;
+ __u32 desc_idx;
+ struct vring_desc *desc;
+ struct iovec *iovec, *piov;
+ __u8 status;
+ __u32 buffer_desc_idx;
+ struct virtio_blk_outhdr hdr;
+ void *fos;
+
+ for (;;) { /* forever */
+ if (!open_backend(mic)) { /* No virtblk */
+ for (mic->mic_virtblk.signaled = 0;
+ !mic->mic_virtblk.signaled;)
+ sleep(1);
+ continue;
+ }
+
+ /* backend file is specified. */
+ if (!start_virtblk(mic, &vring))
+ goto _close_backend;
+ iovec = malloc(sizeof(*iovec) *
+ le32toh(virtblk_dev_page.blk_config.seg_max));
+ if (!iovec) {
+ mpsslog("%s: can't alloc iovec: %s\n",
+ mic->name, strerror(ENOMEM));
+ goto _stop_virtblk;
+ }
+
+ block_poll.fd = mic->mic_virtblk.virtio_block_fd;
+ block_poll.events = POLLIN;
+ for (mic->mic_virtblk.signaled = 0;
+ !mic->mic_virtblk.signaled;) {
+ block_poll.revents = 0;
+ /* timeout in 1 sec to see signaled */
+ ret = poll(&block_poll, 1, 1000);
+ if (ret < 0) {
+ mpsslog("%s %d: poll failed: %s\n",
+ __func__, __LINE__,
+ strerror(errno));
+ continue;
+ }
+
+ if (!(block_poll.revents & POLLIN)) {
+#ifdef DEBUG
+ mpsslog("%s %d: block_poll.revents=0x%x\n",
+ __func__, __LINE__, block_poll.revents);
+#endif
+ continue;
+ }
+
+ /* POLLIN */
+ while (vring.info->avail_idx !=
+ le16toh(vring.vr.avail->idx)) {
+ /* read header element */
+ avail_idx =
+ vring.info->avail_idx &
+ (vring.vr.num - 1);
+ desc_idx = le16toh(
+ vring.vr.avail->ring[avail_idx]);
+ desc = &vring.vr.desc[desc_idx];
+#ifdef DEBUG
+ mpsslog("%s() %d: avail_idx=%d ",
+ __func__, __LINE__,
+ vring.info->avail_idx);
+ mpsslog("vring.vr.num=%d desc=%p\n",
+ vring.vr.num, desc);
+#endif
+ status = header_error_check(desc);
+ ret = read_header(
+ mic->mic_virtblk.virtio_block_fd,
+ &hdr, desc_idx);
+ if (ret < 0) {
+ mpsslog("%s() %d %s: ret=%d %s\n",
+ __func__, __LINE__,
+ mic->name, ret,
+ strerror(errno));
+ break;
+ }
+ /* buffer element */
+ piov = iovec;
+ status = 0;
+ fos = mic->mic_virtblk.backend_addr +
+ (hdr.sector * SECTOR_SIZE);
+ buffer_desc_idx = next_desc(desc);
+ desc_idx = buffer_desc_idx;
+ for (desc = &vring.vr.desc[buffer_desc_idx];
+ desc->flags & VRING_DESC_F_NEXT;
+ desc_idx = next_desc(desc),
+ desc = &vring.vr.desc[desc_idx]) {
+ piov->iov_len = desc->len;
+ piov->iov_base = fos;
+ piov++;
+ fos += desc->len;
+ }
+ /* Returning NULLs for VIRTIO_BLK_T_GET_ID. */
+ if (hdr.type & ~(VIRTIO_BLK_T_OUT |
+ VIRTIO_BLK_T_GET_ID)) {
+ /*
+ VIRTIO_BLK_T_IN - does not do
+ anything. Probably for documenting.
+ VIRTIO_BLK_T_SCSI_CMD - for
+ virtio_scsi.
+ VIRTIO_BLK_T_FLUSH - turned off in
+ config space.
+ VIRTIO_BLK_T_BARRIER - defined but not
+ used in anywhere.
+ */
+ mpsslog("%s() %d: type %x ",
+ __func__, __LINE__,
+ hdr.type);
+ mpsslog("is not supported\n");
+ status = -ENOTSUP;
+
+ } else {
+ ret = transfer_blocks(
+ mic->mic_virtblk.virtio_block_fd,
+ iovec,
+ piov - iovec);
+ if (ret < 0 &&
+ status != 0)
+ status = ret;
+ }
+ /* write status and update used pointer */
+ if (status != 0)
+ status = status_error_check(desc);
+ ret = write_status(
+ mic->mic_virtblk.virtio_block_fd,
+ &status);
+#ifdef DEBUG
+ mpsslog("%s() %d: write status=%d on desc=%p\n",
+ __func__, __LINE__,
+ status, desc);
+#endif
+ }
+ }
+ free(iovec);
+_stop_virtblk:
+ stop_virtblk(mic);
+_close_backend:
+ close_backend(mic);
+ } /* forever */
+
+ pthread_exit(NULL);
+}
+
+static void
+reset(struct mic_info *mic)
+{
+#define RESET_TIMEOUT 120
+ int i = RESET_TIMEOUT;
+ setsysfs(mic->name, "state", "reset");
+ while (i) {
+ char *state;
+ state = readsysfs(mic->name, "state");
+ if (!state)
+ goto retry;
+ mpsslog("%s: %s %d state %s\n",
+ mic->name, __func__, __LINE__, state);
+
+ if (!strcmp(state, "ready")) {
+ free(state);
+ break;
+ }
+ free(state);
+retry:
+ sleep(1);
+ i--;
+ }
+}
+
+static int
+get_mic_shutdown_status(struct mic_info *mic, char *shutdown_status)
+{
+ if (!strcmp(shutdown_status, "nop"))
+ return MIC_NOP;
+ if (!strcmp(shutdown_status, "crashed"))
+ return MIC_CRASHED;
+ if (!strcmp(shutdown_status, "halted"))
+ return MIC_HALTED;
+ if (!strcmp(shutdown_status, "poweroff"))
+ return MIC_POWER_OFF;
+ if (!strcmp(shutdown_status, "restart"))
+ return MIC_RESTART;
+ mpsslog("%s: BUG invalid status %s\n", mic->name, shutdown_status);
+ /* Invalid state */
+ assert(0);
+};
+
+static int get_mic_state(struct mic_info *mic)
+{
+ char *state = NULL;
+ enum mic_states mic_state;
+
+ while (!state) {
+ state = readsysfs(mic->name, "state");
+ sleep(1);
+ }
+ mpsslog("%s: %s %d state %s\n",
+ mic->name, __func__, __LINE__, state);
+
+ if (!strcmp(state, "ready")) {
+ mic_state = MIC_READY;
+ } else if (!strcmp(state, "booting")) {
+ mic_state = MIC_BOOTING;
+ } else if (!strcmp(state, "online")) {
+ mic_state = MIC_ONLINE;
+ } else if (!strcmp(state, "shutting_down")) {
+ mic_state = MIC_SHUTTING_DOWN;
+ } else if (!strcmp(state, "reset_failed")) {
+ mic_state = MIC_RESET_FAILED;
+ } else if (!strcmp(state, "resetting")) {
+ mic_state = MIC_RESETTING;
+ } else {
+ mpsslog("%s: BUG invalid state %s\n", mic->name, state);
+ assert(0);
+ }
+
+ free(state);
+ return mic_state;
+};
+
+static void mic_handle_shutdown(struct mic_info *mic)
+{
+#define SHUTDOWN_TIMEOUT 60
+ int i = SHUTDOWN_TIMEOUT;
+ char *shutdown_status;
+ while (i) {
+ shutdown_status = readsysfs(mic->name, "shutdown_status");
+ if (!shutdown_status) {
+ sleep(1);
+ continue;
+ }
+ mpsslog("%s: %s %d shutdown_status %s\n",
+ mic->name, __func__, __LINE__, shutdown_status);
+ switch (get_mic_shutdown_status(mic, shutdown_status)) {
+ case MIC_RESTART:
+ mic->restart = 1;
+ case MIC_HALTED:
+ case MIC_POWER_OFF:
+ case MIC_CRASHED:
+ free(shutdown_status);
+ goto reset;
+ default:
+ break;
+ }
+ free(shutdown_status);
+ sleep(1);
+ i--;
+ }
+reset:
+ if (!i)
+ mpsslog("%s: %s %d timing out waiting for shutdown_status %s\n",
+ mic->name, __func__, __LINE__, shutdown_status);
+ reset(mic);
+}
+
+static int open_state_fd(struct mic_info *mic)
+{
+ char pathname[PATH_MAX];
+ int fd;
+
+ snprintf(pathname, PATH_MAX - 1, "%s/%s/%s",
+ MICSYSFSDIR, mic->name, "state");
+
+ fd = open(pathname, O_RDONLY);
+ if (fd < 0)
+ mpsslog("%s: opening file %s failed %s\n",
+ mic->name, pathname, strerror(errno));
+ return fd;
+}
+
+static int block_till_state_change(int fd, struct mic_info *mic)
+{
+ struct pollfd ufds[1];
+ char value[PAGE_SIZE];
+ int ret;
+
+ ufds[0].fd = fd;
+ ufds[0].events = POLLERR | POLLPRI;
+ ret = poll(ufds, 1, -1);
+ if (ret < 0) {
+ mpsslog("%s: %s %d poll failed %s\n",
+ mic->name, __func__, __LINE__, strerror(errno));
+ return ret;
+ }
+
+ ret = lseek(fd, 0, SEEK_SET);
+ if (ret < 0) {
+ mpsslog("%s: %s %d Failed to seek to 0: %s\n",
+ mic->name, __func__, __LINE__, strerror(errno));
+ return ret;
+ }
+
+ ret = read(fd, value, sizeof(value));
+ if (ret < 0) {
+ mpsslog("%s: %s %d Failed to read sysfs entry: %s\n",
+ mic->name, __func__, __LINE__, strerror(errno));
+ return ret;
+ }
+
+ return 0;
+}
+
+static void *
+mic_config(void *arg)
+{
+ struct mic_info *mic = (struct mic_info *)arg;
+ int fd, ret, stat = 0;
+
+ fd = open_state_fd(mic);
+ if (fd < 0) {
+ mpsslog("%s: %s %d open state fd failed %s\n",
+ mic->name, __func__, __LINE__, strerror(errno));
+ goto exit;
+ }
+
+ do {
+ ret = block_till_state_change(fd, mic);
+ if (ret < 0) {
+ mpsslog("%s: %s %d block_till_state_change error %s\n",
+ mic->name, __func__, __LINE__, strerror(errno));
+ goto close_exit;
+ }
+
+ switch (get_mic_state(mic)) {
+ case MIC_SHUTTING_DOWN:
+ mic_handle_shutdown(mic);
+ break;
+ case MIC_READY:
+ case MIC_RESET_FAILED:
+ ret = kill(mic->pid, SIGTERM);
+ mpsslog("%s: %s %d kill pid %d ret %d\n",
+ mic->name, __func__, __LINE__,
+ mic->pid, ret);
+ if (!ret) {
+ ret = waitpid(mic->pid, &stat,
+ WIFSIGNALED(stat));
+ mpsslog("%s: %s %d waitpid ret %d pid %d\n",
+ mic->name, __func__, __LINE__,
+ ret, mic->pid);
+ }
+ if (mic->boot_on_resume) {
+ setsysfs(mic->name, "state", "boot");
+ mic->boot_on_resume = 0;
+ }
+ goto close_exit;
+ default:
+ break;
+ }
+ } while (1);
+
+close_exit:
+ close(fd);
+exit:
+ init_mic(mic);
+ pthread_exit(NULL);
+}
+
+static void
+set_cmdline(struct mic_info *mic)
+{
+ char buffer[PATH_MAX];
+ int len;
+
+ len = snprintf(buffer, PATH_MAX,
+ "clocksource=tsc highres=off nohz=off ");
+ len += snprintf(buffer + len, PATH_MAX - len,
+ "cpufreq_on;corec6_off;pc3_off;pc6_off ");
+ len += snprintf(buffer + len, PATH_MAX - len,
+ "ifcfg=static;address,172.31.%d.1;netmask,255.255.255.0",
+ mic->id + 1);
+
+ setsysfs(mic->name, "cmdline", buffer);
+ mpsslog("%s: Command line: \"%s\"\n", mic->name, buffer);
+ snprintf(buffer, PATH_MAX, "172.31.%d.1", mic->id + 1);
+ mpsslog("%s: IPADDR: \"%s\"\n", mic->name, buffer);
+}
+
+static void
+set_log_buf_info(struct mic_info *mic)
+{
+ int fd;
+ off_t len;
+ char system_map[] = "/lib/firmware/mic/System.map";
+ char *map, *temp, log_buf[17] = {'\0'};
+
+ fd = open(system_map, O_RDONLY);
+ if (fd < 0) {
+ mpsslog("%s: Opening System.map failed: %d\n",
+ mic->name, errno);
+ return;
+ }
+ len = lseek(fd, 0, SEEK_END);
+ if (len < 0) {
+ mpsslog("%s: Reading System.map size failed: %d\n",
+ mic->name, errno);
+ close(fd);
+ return;
+ }
+ map = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
+ if (map == MAP_FAILED) {
+ mpsslog("%s: mmap of System.map failed: %d\n",
+ mic->name, errno);
+ close(fd);
+ return;
+ }
+ temp = strstr(map, "__log_buf");
+ if (!temp) {
+ mpsslog("%s: __log_buf not found: %d\n", mic->name, errno);
+ munmap(map, len);
+ close(fd);
+ return;
+ }
+ strncpy(log_buf, temp - 19, 16);
+ setsysfs(mic->name, "log_buf_addr", log_buf);
+ mpsslog("%s: log_buf_addr: %s\n", mic->name, log_buf);
+ temp = strstr(map, "log_buf_len");
+ if (!temp) {
+ mpsslog("%s: log_buf_len not found: %d\n", mic->name, errno);
+ munmap(map, len);
+ close(fd);
+ return;
+ }
+ strncpy(log_buf, temp - 19, 16);
+ setsysfs(mic->name, "log_buf_len", log_buf);
+ mpsslog("%s: log_buf_len: %s\n", mic->name, log_buf);
+ munmap(map, len);
+ close(fd);
+}
+
+static void
+change_virtblk_backend(int x, siginfo_t *siginfo, void *p)
+{
+ struct mic_info *mic;
+
+ for (mic = mic_list.next; mic != NULL; mic = mic->next)
+ mic->mic_virtblk.signaled = 1/* true */;
+}
+
+static void
+set_mic_boot_params(struct mic_info *mic)
+{
+ set_log_buf_info(mic);
+ set_cmdline(mic);
+}
+
+static void *
+init_mic(void *arg)
+{
+ struct mic_info *mic = (struct mic_info *)arg;
+ struct sigaction ignore = {
+ .sa_flags = 0,
+ .sa_handler = SIG_IGN
+ };
+ struct sigaction act = {
+ .sa_flags = SA_SIGINFO,
+ .sa_sigaction = change_virtblk_backend,
+ };
+ char buffer[PATH_MAX];
+ int err, fd;
+
+ /*
+ * Currently, one virtio block device is supported for each MIC card
+ * at a time. Any user (or test) can send a SIGUSR1 to the MIC daemon.
+ * The signal informs the virtio block backend about a change in the
+ * configuration file which specifies the virtio backend file name on
+ * the host. Virtio block backend then re-reads the configuration file
+ * and switches to the new block device. This signalling mechanism may
+ * not be required once multiple virtio block devices are supported by
+ * the MIC daemon.
+ */
+ sigaction(SIGUSR1, &ignore, NULL);
+retry:
+ fd = open_state_fd(mic);
+ if (fd < 0) {
+ mpsslog("%s: %s %d open state fd failed %s\n",
+ mic->name, __func__, __LINE__, strerror(errno));
+ sleep(2);
+ goto retry;
+ }
+
+ if (mic->restart) {
+ snprintf(buffer, PATH_MAX, "boot");
+ setsysfs(mic->name, "state", buffer);
+ mpsslog("%s restarting mic %d\n",
+ mic->name, mic->restart);
+ mic->restart = 0;
+ }
+
+ while (1) {
+ while (block_till_state_change(fd, mic)) {
+ mpsslog("%s: %s %d block_till_state_change error %s\n",
+ mic->name, __func__, __LINE__, strerror(errno));
+ sleep(2);
+ continue;
+ }
+
+ if (get_mic_state(mic) == MIC_BOOTING)
+ break;
+ }
+
+ mic->pid = fork();
+ switch (mic->pid) {
+ case 0:
+ add_virtio_device(mic, &virtcons_dev_page.dd);
+ add_virtio_device(mic, &virtnet_dev_page.dd);
+ err = pthread_create(&mic->mic_console.console_thread, NULL,
+ virtio_console, mic);
+ if (err)
+ mpsslog("%s virtcons pthread_create failed %s\n",
+ mic->name, strerror(err));
+ err = pthread_create(&mic->mic_net.net_thread, NULL,
+ virtio_net, mic);
+ if (err)
+ mpsslog("%s virtnet pthread_create failed %s\n",
+ mic->name, strerror(err));
+ err = pthread_create(&mic->mic_virtblk.block_thread, NULL,
+ virtio_block, mic);
+ if (err)
+ mpsslog("%s virtblk pthread_create failed %s\n",
+ mic->name, strerror(err));
+ sigemptyset(&act.sa_mask);
+ err = sigaction(SIGUSR1, &act, NULL);
+ if (err)
+ mpsslog("%s sigaction SIGUSR1 failed %s\n",
+ mic->name, strerror(errno));
+ while (1)
+ sleep(60);
+ case -1:
+ mpsslog("fork failed MIC name %s id %d errno %d\n",
+ mic->name, mic->id, errno);
+ break;
+ default:
+ err = pthread_create(&mic->config_thread, NULL,
+ mic_config, mic);
+ if (err)
+ mpsslog("%s mic_config pthread_create failed %s\n",
+ mic->name, strerror(err));
+ }
+
+ return NULL;
+}
+
+static void
+start_daemon(void)
+{
+ struct mic_info *mic;
+ int err;
+
+ for (mic = mic_list.next; mic; mic = mic->next) {
+ set_mic_boot_params(mic);
+ err = pthread_create(&mic->init_thread, NULL, init_mic, mic);
+ if (err)
+ mpsslog("%s init_mic pthread_create failed %s\n",
+ mic->name, strerror(err));
+ }
+
+ while (1)
+ sleep(60);
+}
+
+static int
+init_mic_list(void)
+{
+ struct mic_info *mic = &mic_list;
+ struct dirent *file;
+ DIR *dp;
+ int cnt = 0;
+
+ dp = opendir(MICSYSFSDIR);
+ if (!dp)
+ return 0;
+
+ while ((file = readdir(dp)) != NULL) {
+ if (!strncmp(file->d_name, "mic", 3)) {
+ mic->next = calloc(1, sizeof(struct mic_info));
+ if (mic->next) {
+ mic = mic->next;
+ mic->id = atoi(&file->d_name[3]);
+ mic->name = malloc(strlen(file->d_name) + 16);
+ if (mic->name)
+ strcpy(mic->name, file->d_name);
+ mpsslog("MIC name %s id %d\n", mic->name,
+ mic->id);
+ cnt++;
+ }
+ }
+ }
+
+ closedir(dp);
+ return cnt;
+}
+
+void
+mpsslog(char *format, ...)
+{
+ va_list args;
+ char buffer[4096];
+ char ts[52], *ts1;
+ time_t t;
+
+ if (logfp == NULL)
+ return;
+
+ va_start(args, format);
+ vsprintf(buffer, format, args);
+ va_end(args);
+
+ time(&t);
+ ts1 = ctime_r(&t, ts);
+ ts1[strlen(ts1) - 1] = '\0';
+ fprintf(logfp, "%s: %s", ts1, buffer);
+
+ fflush(logfp);
+}
+
+int
+main(int argc, char *argv[])
+{
+ int cnt;
+ pid_t pid;
+
+ myname = argv[0];
+
+ logfp = fopen(LOGFILE_NAME, "a+");
+ if (!logfp) {
+ fprintf(stderr, "cannot open logfile '%s'\n", LOGFILE_NAME);
+ exit(1);
+ }
+ pid = fork();
+ switch (pid) {
+ case 0:
+ break;
+ case -1:
+ exit(2);
+ default:
+ exit(0);
+ }
+
+ mpsslog("MIC Daemon start\n");
+
+ cnt = init_mic_list();
+ if (cnt == 0) {
+ mpsslog("MIC module not loaded\n");
+ exit(3);
+ }
+ mpsslog("MIC found %d devices\n", cnt);
+
+ start_daemon();
+
+ exit(0);
+}
diff --git a/samples/mic/mpssd/mpssd.h b/samples/mic/mpssd/mpssd.h
new file mode 100644
index 000000000..8bd64944a
--- /dev/null
+++ b/samples/mic/mpssd/mpssd.h
@@ -0,0 +1,103 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * Copyright(c) 2013 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Intel MIC User Space Tools.
+ */
+#ifndef _MPSSD_H_
+#define _MPSSD_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <dirent.h>
+#include <libgen.h>
+#include <pthread.h>
+#include <stdarg.h>
+#include <time.h>
+#include <errno.h>
+#include <sys/dir.h>
+#include <sys/ioctl.h>
+#include <sys/poll.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/utsname.h>
+#include <sys/wait.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <netdb.h>
+#include <pthread.h>
+#include <signal.h>
+#include <limits.h>
+#include <syslog.h>
+#include <getopt.h>
+#include <net/if.h>
+#include <linux/if_tun.h>
+#include <linux/if_tun.h>
+#include <linux/virtio_ids.h>
+
+#define MICSYSFSDIR "/sys/class/mic"
+#define LOGFILE_NAME "/var/log/mpssd"
+#define PAGE_SIZE 4096
+
+struct mic_console_info {
+ pthread_t console_thread;
+ int virtio_console_fd;
+ void *console_dp;
+};
+
+struct mic_net_info {
+ pthread_t net_thread;
+ int virtio_net_fd;
+ int tap_fd;
+ void *net_dp;
+};
+
+struct mic_virtblk_info {
+ pthread_t block_thread;
+ int virtio_block_fd;
+ void *block_dp;
+ volatile sig_atomic_t signaled;
+ char *backend_file;
+ int backend;
+ void *backend_addr;
+ long backend_size;
+};
+
+struct mic_info {
+ int id;
+ char *name;
+ pthread_t config_thread;
+ pthread_t init_thread;
+ pid_t pid;
+ struct mic_console_info mic_console;
+ struct mic_net_info mic_net;
+ struct mic_virtblk_info mic_virtblk;
+ int restart;
+ int boot_on_resume;
+ struct mic_info *next;
+};
+
+__attribute__((format(printf, 1, 2)))
+void mpsslog(char *format, ...);
+char *readsysfs(char *dir, char *entry);
+int setsysfs(char *dir, char *entry, char *value);
+#endif
diff --git a/samples/mic/mpssd/sysfs.c b/samples/mic/mpssd/sysfs.c
new file mode 100644
index 000000000..8dd326936
--- /dev/null
+++ b/samples/mic/mpssd/sysfs.c
@@ -0,0 +1,102 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * Copyright(c) 2013 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Intel MIC User Space Tools.
+ */
+
+#include "mpssd.h"
+
+#define PAGE_SIZE 4096
+
+char *
+readsysfs(char *dir, char *entry)
+{
+ char filename[PATH_MAX];
+ char value[PAGE_SIZE];
+ char *string = NULL;
+ int fd;
+ int len;
+
+ if (dir == NULL)
+ snprintf(filename, PATH_MAX, "%s/%s", MICSYSFSDIR, entry);
+ else
+ snprintf(filename, PATH_MAX,
+ "%s/%s/%s", MICSYSFSDIR, dir, entry);
+
+ fd = open(filename, O_RDONLY);
+ if (fd < 0) {
+ mpsslog("Failed to open sysfs entry '%s': %s\n",
+ filename, strerror(errno));
+ return NULL;
+ }
+
+ len = read(fd, value, sizeof(value));
+ if (len < 0) {
+ mpsslog("Failed to read sysfs entry '%s': %s\n",
+ filename, strerror(errno));
+ goto readsys_ret;
+ }
+ if (len == 0)
+ goto readsys_ret;
+
+ value[len - 1] = '\0';
+
+ string = malloc(strlen(value) + 1);
+ if (string)
+ strcpy(string, value);
+
+readsys_ret:
+ close(fd);
+ return string;
+}
+
+int
+setsysfs(char *dir, char *entry, char *value)
+{
+ char filename[PATH_MAX];
+ char *oldvalue;
+ int fd, ret = 0;
+
+ if (dir == NULL)
+ snprintf(filename, PATH_MAX, "%s/%s", MICSYSFSDIR, entry);
+ else
+ snprintf(filename, PATH_MAX, "%s/%s/%s",
+ MICSYSFSDIR, dir, entry);
+
+ oldvalue = readsysfs(dir, entry);
+
+ fd = open(filename, O_RDWR);
+ if (fd < 0) {
+ ret = errno;
+ mpsslog("Failed to open sysfs entry '%s': %s\n",
+ filename, strerror(errno));
+ goto done;
+ }
+
+ if (!oldvalue || strcmp(value, oldvalue)) {
+ if (write(fd, value, strlen(value)) < 0) {
+ ret = errno;
+ mpsslog("Failed to write new sysfs entry '%s': %s\n",
+ filename, strerror(errno));
+ }
+ }
+ close(fd);
+done:
+ if (oldvalue)
+ free(oldvalue);
+ return ret;
+}
diff --git a/samples/pktgen/README.rst b/samples/pktgen/README.rst
new file mode 100644
index 000000000..ff8929da6
--- /dev/null
+++ b/samples/pktgen/README.rst
@@ -0,0 +1,45 @@
+Sample and benchmark scripts for pktgen (packet generator)
+==========================================================
+This directory contains some pktgen sample and benchmark scripts, that
+can easily be copied and adjusted for your own use-case.
+
+General doc is located in kernel: Documentation/networking/pktgen.txt
+
+Helper include files
+====================
+This directory contains two helper shell files, that can be "included"
+by shell source'ing. Namely "functions.sh" and "parameters.sh".
+
+Common parameters
+-----------------
+The parameters.sh file support easy and consistant parameter parsing
+across the sample scripts. Usage example is printed on errors::
+
+ Usage: ./pktgen_sample01_simple.sh [-vx] -i ethX
+ -i : ($DEV) output interface/device (required)
+ -s : ($PKT_SIZE) packet size
+ -d : ($DEST_IP) destination IP
+ -m : ($DST_MAC) destination MAC-addr
+ -t : ($THREADS) threads to start
+ -f : ($F_THREAD) index of first thread (zero indexed CPU number)
+ -c : ($SKB_CLONE) SKB clones send before alloc new SKB
+ -n : ($COUNT) num messages to send per thread, 0 means indefinitely
+ -b : ($BURST) HW level bursting of SKBs
+ -v : ($VERBOSE) verbose
+ -x : ($DEBUG) debug
+
+The global variable being set is also listed. E.g. the required
+interface/device parameter "-i" sets variable $DEV.
+
+Common functions
+----------------
+The functions.sh file provides; Three different shell functions for
+configuring the different components of pktgen: pg_ctrl(), pg_thread()
+and pg_set().
+
+These functions correspond to pktgens different components.
+ * pg_ctrl() control "pgctrl" (/proc/net/pktgen/pgctrl)
+ * pg_thread() control the kernel threads and binding to devices
+ * pg_set() control setup of individual devices
+
+See sample scripts for usage examples.
diff --git a/samples/pktgen/functions.sh b/samples/pktgen/functions.sh
new file mode 100644
index 000000000..7d928571b
--- /dev/null
+++ b/samples/pktgen/functions.sh
@@ -0,0 +1,169 @@
+#
+# Common functions used by pktgen scripts
+# - Depending on bash 3 (or higher) syntax
+#
+# Author: Jesper Dangaaard Brouer
+# License: GPL
+
+set -o errexit
+
+## -- General shell logging cmds --
+function err() {
+ local exitcode=$1
+ shift
+ echo "ERROR: $@" >&2
+ exit $exitcode
+}
+
+function warn() {
+ echo "WARN : $@" >&2
+}
+
+function info() {
+ if [[ -n "$VERBOSE" ]]; then
+ echo "INFO : $@" >&2
+ fi
+}
+
+## -- Pktgen proc config commands -- ##
+export PROC_DIR=/proc/net/pktgen
+#
+# Three different shell functions for configuring the different
+# components of pktgen:
+# pg_ctrl(), pg_thread() and pg_set().
+#
+# These functions correspond to pktgens different components.
+# * pg_ctrl() control "pgctrl" (/proc/net/pktgen/pgctrl)
+# * pg_thread() control the kernel threads and binding to devices
+# * pg_set() control setup of individual devices
+function pg_ctrl() {
+ local proc_file="pgctrl"
+ proc_cmd ${proc_file} "$@"
+}
+
+function pg_thread() {
+ local thread=$1
+ local proc_file="kpktgend_${thread}"
+ shift
+ proc_cmd ${proc_file} "$@"
+}
+
+function pg_set() {
+ local dev=$1
+ local proc_file="$dev"
+ shift
+ proc_cmd ${proc_file} "$@"
+}
+
+# More generic replacement for pgset(), that does not depend on global
+# variable for proc file.
+function proc_cmd() {
+ local result
+ local proc_file=$1
+ local status=0
+ # after shift, the remaining args are contained in $@
+ shift
+ local proc_ctrl=${PROC_DIR}/$proc_file
+ if [[ ! -e "$proc_ctrl" ]]; then
+ err 3 "proc file:$proc_ctrl does not exists (dev added to thread?)"
+ else
+ if [[ ! -w "$proc_ctrl" ]]; then
+ err 4 "proc file:$proc_ctrl not writable, not root?!"
+ fi
+ fi
+
+ if [[ "$DEBUG" == "yes" ]]; then
+ echo "cmd: $@ > $proc_ctrl"
+ fi
+ # Quoting of "$@" is important for space expansion
+ echo "$@" > "$proc_ctrl" || status=$?
+
+ if [[ "$proc_file" != "pgctrl" ]]; then
+ result=$(grep "Result: OK:" $proc_ctrl) || true
+ if [[ "$result" == "" ]]; then
+ grep "Result:" $proc_ctrl >&2
+ fi
+ fi
+ if (( $status != 0 )); then
+ err 5 "Write error($status) occurred cmd: \"$@ > $proc_ctrl\""
+ fi
+}
+
+# Old obsolete "pgset" function, with slightly improved err handling
+function pgset() {
+ local result
+
+ if [[ "$DEBUG" == "yes" ]]; then
+ echo "cmd: $1 > $PGDEV"
+ fi
+ echo $1 > $PGDEV
+ local status=$?
+
+ result=`cat $PGDEV | fgrep "Result: OK:"`
+ if [[ "$result" == "" ]]; then
+ cat $PGDEV | fgrep Result:
+ fi
+ if (( $status != 0 )); then
+ err 5 "Write error($status) occurred cmd: \"$1 > $PGDEV\""
+ fi
+}
+
+[[ $EUID -eq 0 ]] && trap 'pg_ctrl "reset"' EXIT
+
+## -- General shell tricks --
+
+function root_check_run_with_sudo() {
+ # Trick so, program can be run as normal user, will just use "sudo"
+ # call as root_check_run_as_sudo "$@"
+ if [ "$EUID" -ne 0 ]; then
+ if [ -x $0 ]; then # Directly executable use sudo
+ info "Not root, running with sudo"
+ sudo "$0" "$@"
+ exit $?
+ fi
+ err 4 "cannot perform sudo run of $0"
+ fi
+}
+
+# Exact input device's NUMA node info
+function get_iface_node()
+{
+ local node=$(</sys/class/net/$1/device/numa_node)
+ if [[ $node == -1 ]]; then
+ echo 0
+ else
+ echo $node
+ fi
+}
+
+# Given an Dev/iface, get its queues' irq numbers
+function get_iface_irqs()
+{
+ local IFACE=$1
+ local queues="${IFACE}-.*TxRx"
+
+ irqs=$(grep "$queues" /proc/interrupts | cut -f1 -d:)
+ [ -z "$irqs" ] && irqs=$(grep $IFACE /proc/interrupts | cut -f1 -d:)
+ [ -z "$irqs" ] && irqs=$(for i in `ls -Ux /sys/class/net/$IFACE/device/msi_irqs` ;\
+ do grep "$i:.*TxRx" /proc/interrupts | grep -v fdir | cut -f 1 -d : ;\
+ done)
+ [ -z "$irqs" ] && err 3 "Could not find interrupts for $IFACE"
+
+ echo $irqs
+}
+
+# Given a NUMA node, return cpu ids belonging to it.
+function get_node_cpus()
+{
+ local node=$1
+ local node_cpu_list
+ local node_cpu_range_list=`cut -f1- -d, --output-delimiter=" " \
+ /sys/devices/system/node/node$node/cpulist`
+
+ for cpu_range in $node_cpu_range_list
+ do
+ node_cpu_list="$node_cpu_list "`seq -s " " ${cpu_range//-/ }`
+ done
+
+ echo $node_cpu_list
+}
diff --git a/samples/pktgen/parameters.sh b/samples/pktgen/parameters.sh
new file mode 100644
index 000000000..72fc56287
--- /dev/null
+++ b/samples/pktgen/parameters.sh
@@ -0,0 +1,116 @@
+#
+# SPDX-License-Identifier: GPL-2.0
+# Common parameter parsing for pktgen scripts
+#
+
+function usage() {
+ echo ""
+ echo "Usage: $0 [-vx] -i ethX"
+ echo " -i : (\$DEV) output interface/device (required)"
+ echo " -s : (\$PKT_SIZE) packet size"
+ echo " -d : (\$DEST_IP) destination IP"
+ echo " -m : (\$DST_MAC) destination MAC-addr"
+ echo " -t : (\$THREADS) threads to start"
+ echo " -f : (\$F_THREAD) index of first thread (zero indexed CPU number)"
+ echo " -c : (\$SKB_CLONE) SKB clones send before alloc new SKB"
+ echo " -n : (\$COUNT) num messages to send per thread, 0 means indefinitely"
+ echo " -b : (\$BURST) HW level bursting of SKBs"
+ echo " -v : (\$VERBOSE) verbose"
+ echo " -x : (\$DEBUG) debug"
+ echo " -6 : (\$IP6) IPv6"
+ echo ""
+}
+
+## --- Parse command line arguments / parameters ---
+## echo "Commandline options:"
+while getopts "s:i:d:m:f:t:c:n:b:vxh6" option; do
+ case $option in
+ i) # interface
+ export DEV=$OPTARG
+ info "Output device set to: DEV=$DEV"
+ ;;
+ s)
+ export PKT_SIZE=$OPTARG
+ info "Packet size set to: PKT_SIZE=$PKT_SIZE bytes"
+ ;;
+ d) # destination IP
+ export DEST_IP=$OPTARG
+ info "Destination IP set to: DEST_IP=$DEST_IP"
+ ;;
+ m) # MAC
+ export DST_MAC=$OPTARG
+ info "Destination MAC set to: DST_MAC=$DST_MAC"
+ ;;
+ f)
+ export F_THREAD=$OPTARG
+ info "Index of first thread (zero indexed CPU number): $F_THREAD"
+ ;;
+ t)
+ export THREADS=$OPTARG
+ info "Number of threads to start: $THREADS"
+ ;;
+ c)
+ export CLONE_SKB=$OPTARG
+ info "CLONE_SKB=$CLONE_SKB"
+ ;;
+ n)
+ export COUNT=$OPTARG
+ info "COUNT=$COUNT"
+ ;;
+ b)
+ export BURST=$OPTARG
+ info "SKB bursting: BURST=$BURST"
+ ;;
+ v)
+ export VERBOSE=yes
+ info "Verbose mode: VERBOSE=$VERBOSE"
+ ;;
+ x)
+ export DEBUG=yes
+ info "Debug mode: DEBUG=$DEBUG"
+ ;;
+ 6)
+ export IP6=6
+ info "IP6: IP6=$IP6"
+ ;;
+ h|?|*)
+ usage;
+ err 2 "[ERROR] Unknown parameters!!!"
+ esac
+done
+shift $(( $OPTIND - 1 ))
+
+if [ -z "$PKT_SIZE" ]; then
+ # NIC adds 4 bytes CRC
+ export PKT_SIZE=60
+ info "Default packet size set to: set to: $PKT_SIZE bytes"
+fi
+
+if [ -z "$F_THREAD" ]; then
+ # First thread (F_THREAD) reference the zero indexed CPU number
+ export F_THREAD=0
+fi
+
+if [ -z "$THREADS" ]; then
+ export THREADS=1
+fi
+
+export L_THREAD=$(( THREADS + F_THREAD - 1 ))
+
+if [ -z "$DEV" ]; then
+ usage
+ err 2 "Please specify output device"
+fi
+
+if [ -z "$DST_MAC" ]; then
+ warn "Missing destination MAC address"
+fi
+
+if [ -z "$DEST_IP" ]; then
+ warn "Missing destination IP address"
+fi
+
+if [ ! -d /proc/net/pktgen ]; then
+ info "Loading kernel module: pktgen"
+ modprobe pktgen
+fi
diff --git a/samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh b/samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh
new file mode 100755
index 000000000..2839f7d31
--- /dev/null
+++ b/samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Benchmark script:
+# - developed for benchmarking ingress qdisc path
+#
+# Script for injecting packets into RX path of the stack with pktgen
+# "xmit_mode netif_receive". With an invalid dst_mac this will only
+# measure the ingress code path as packets gets dropped in ip_rcv().
+#
+# This script don't really need any hardware. It benchmarks software
+# RX path just after NIC driver level. With bursting is also
+# "removes" the SKB alloc/free overhead.
+#
+# Setup scenarios for measuring ingress qdisc (with invalid dst_mac):
+# ------------------------------------------------------------------
+# (1) no ingress (uses static_key_false(&ingress_needed))
+#
+# (2) ingress on other dev (change ingress_needed and calls
+# handle_ing() but exit early)
+#
+# config: tc qdisc add dev $SOMEDEV handle ffff: ingress
+#
+# (3) ingress on this dev, handle_ing() -> tc_classify()
+#
+# config: tc qdisc add dev $DEV handle ffff: ingress
+#
+# (4) ingress on this dev + drop at u32 classifier/action.
+#
+basedir=`dirname $0`
+source ${basedir}/functions.sh
+root_check_run_with_sudo "$@"
+
+# Parameter parsing via include
+source ${basedir}/parameters.sh
+# Using invalid DST_MAC will cause the packets to get dropped in
+# ip_rcv() which is part of the test
+if [ -z "$DEST_IP" ]; then
+ [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1"
+fi
+[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff"
+[ -z "$BURST" ] && BURST=1024
+[ -z "$COUNT" ] && COUNT="10000000" # Zero means indefinitely
+
+# Base Config
+DELAY="0" # Zero means max speed
+
+# General cleanup everything since last run
+pg_ctrl "reset"
+
+# Threads are specified with parameter -t value in $THREADS
+for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
+ # The device name is extended with @name, using thread number to
+ # make then unique, but any name will do.
+ dev=${DEV}@${thread}
+
+ # Add remove all other devices and add_device $dev to thread
+ pg_thread $thread "rem_device_all"
+ pg_thread $thread "add_device" $dev
+
+ # Base config of dev
+ pg_set $dev "flag QUEUE_MAP_CPU"
+ pg_set $dev "count $COUNT"
+ pg_set $dev "pkt_size $PKT_SIZE"
+ pg_set $dev "delay $DELAY"
+ pg_set $dev "flag NO_TIMESTAMP"
+
+ # Destination
+ pg_set $dev "dst_mac $DST_MAC"
+ pg_set $dev "dst$IP6 $DEST_IP"
+
+ # Inject packet into RX path of stack
+ pg_set $dev "xmit_mode netif_receive"
+
+ # Burst allow us to avoid measuring SKB alloc/free overhead
+ pg_set $dev "burst $BURST"
+done
+
+# start_run
+echo "Running... ctrl^C to stop" >&2
+pg_ctrl "start"
+echo "Done" >&2
+
+# Print results
+for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
+ dev=${DEV}@${thread}
+ echo "Device: $dev"
+ cat /proc/net/pktgen/$dev | grep -A2 "Result:"
+done
diff --git a/samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh b/samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh
new file mode 100755
index 000000000..e1ee54465
--- /dev/null
+++ b/samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Benchmark script:
+# - developed for benchmarking egress qdisc path, derived (more
+# like cut'n'pasted) from ingress benchmark script.
+#
+# Script for injecting packets into egress qdisc path of the stack
+# with pktgen "xmit_mode queue_xmit".
+#
+basedir=`dirname $0`
+source ${basedir}/functions.sh
+root_check_run_with_sudo "$@"
+
+# Parameter parsing via include
+source ${basedir}/parameters.sh
+if [ -z "$DEST_IP" ]; then
+ [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1"
+fi
+[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff"
+
+# Burst greater than 1 are invalid for queue_xmit mode
+if [[ -n "$BURST" ]]; then
+ err 1 "Bursting not supported for this mode"
+fi
+[ -z "$COUNT" ] && COUNT="10000000" # Zero means indefinitely
+
+# Base Config
+DELAY="0" # Zero means max speed
+
+# General cleanup everything since last run
+pg_ctrl "reset"
+
+# Threads are specified with parameter -t value in $THREADS
+for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
+ # The device name is extended with @name, using thread number to
+ # make then unique, but any name will do.
+ dev=${DEV}@${thread}
+
+ # Add remove all other devices and add_device $dev to thread
+ pg_thread $thread "rem_device_all"
+ pg_thread $thread "add_device" $dev
+
+ # Base config of dev
+ pg_set $dev "flag QUEUE_MAP_CPU"
+ pg_set $dev "count $COUNT"
+ pg_set $dev "pkt_size $PKT_SIZE"
+ pg_set $dev "delay $DELAY"
+ pg_set $dev "flag NO_TIMESTAMP"
+
+ # Destination
+ pg_set $dev "dst_mac $DST_MAC"
+ pg_set $dev "dst$IP6 $DEST_IP"
+
+ # Inject packet into TX qdisc egress path of stack
+ pg_set $dev "xmit_mode queue_xmit"
+done
+
+# start_run
+echo "Running... ctrl^C to stop" >&2
+pg_ctrl "start"
+echo "Done" >&2
+
+# Print results
+for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
+ dev=${DEV}@${thread}
+ echo "Device: $dev"
+ cat /proc/net/pktgen/$dev | grep -A2 "Result:"
+done
diff --git a/samples/pktgen/pktgen_sample01_simple.sh b/samples/pktgen/pktgen_sample01_simple.sh
new file mode 100755
index 000000000..e9ab4edba
--- /dev/null
+++ b/samples/pktgen/pktgen_sample01_simple.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Simple example:
+# * pktgen sending with single thread and single interface
+# * flow variation via random UDP source port
+#
+basedir=`dirname $0`
+source ${basedir}/functions.sh
+root_check_run_with_sudo "$@"
+
+# Parameter parsing via include
+# - go look in parameters.sh to see which setting are avail
+# - required param is the interface "-i" stored in $DEV
+source ${basedir}/parameters.sh
+#
+# Set some default params, if they didn't get set
+if [ -z "$DEST_IP" ]; then
+ [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1"
+fi
+[ -z "$CLONE_SKB" ] && CLONE_SKB="0"
+# Example enforce param "-m" for dst_mac
+[ -z "$DST_MAC" ] && usage && err 2 "Must specify -m dst_mac"
+[ -z "$COUNT" ] && COUNT="100000" # Zero means indefinitely
+
+# Base Config
+DELAY="0" # Zero means max speed
+
+# Flow variation random source port between min and max
+UDP_MIN=9
+UDP_MAX=109
+
+# General cleanup everything since last run
+# (especially important if other threads were configured by other scripts)
+pg_ctrl "reset"
+
+# Add remove all other devices and add_device $DEV to thread 0
+thread=0
+pg_thread $thread "rem_device_all"
+pg_thread $thread "add_device" $DEV
+
+# How many packets to send (zero means indefinitely)
+pg_set $DEV "count $COUNT"
+
+# Reduce alloc cost by sending same SKB many times
+# - this obviously affects the randomness within the packet
+pg_set $DEV "clone_skb $CLONE_SKB"
+
+# Set packet size
+pg_set $DEV "pkt_size $PKT_SIZE"
+
+# Delay between packets (zero means max speed)
+pg_set $DEV "delay $DELAY"
+
+# Flag example disabling timestamping
+pg_set $DEV "flag NO_TIMESTAMP"
+
+# Destination
+pg_set $DEV "dst_mac $DST_MAC"
+pg_set $DEV "dst$IP6 $DEST_IP"
+
+# Setup random UDP port src range
+pg_set $DEV "flag UDPSRC_RND"
+pg_set $DEV "udp_src_min $UDP_MIN"
+pg_set $DEV "udp_src_max $UDP_MAX"
+
+# start_run
+echo "Running... ctrl^C to stop" >&2
+pg_ctrl "start"
+echo "Done" >&2
+
+# Print results
+echo "Result device: $DEV"
+cat /proc/net/pktgen/$DEV
diff --git a/samples/pktgen/pktgen_sample02_multiqueue.sh b/samples/pktgen/pktgen_sample02_multiqueue.sh
new file mode 100755
index 000000000..99f740ae9
--- /dev/null
+++ b/samples/pktgen/pktgen_sample02_multiqueue.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Multiqueue: Using pktgen threads for sending on multiple CPUs
+# * adding devices to kernel threads
+# * notice the naming scheme for keeping device names unique
+# * nameing scheme: dev@thread_number
+# * flow variation via random UDP source port
+#
+basedir=`dirname $0`
+source ${basedir}/functions.sh
+root_check_run_with_sudo "$@"
+#
+# Required param: -i dev in $DEV
+source ${basedir}/parameters.sh
+
+[ -z "$COUNT" ] && COUNT="100000" # Zero means indefinitely
+
+# Base Config
+DELAY="0" # Zero means max speed
+[ -z "$CLONE_SKB" ] && CLONE_SKB="0"
+
+# Flow variation random source port between min and max
+UDP_MIN=9
+UDP_MAX=109
+
+# (example of setting default params in your script)
+if [ -z "$DEST_IP" ]; then
+ [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1"
+fi
+[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff"
+
+# General cleanup everything since last run
+pg_ctrl "reset"
+
+# Threads are specified with parameter -t value in $THREADS
+for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
+ # The device name is extended with @name, using thread number to
+ # make then unique, but any name will do.
+ dev=${DEV}@${thread}
+
+ # Add remove all other devices and add_device $dev to thread
+ pg_thread $thread "rem_device_all"
+ pg_thread $thread "add_device" $dev
+
+ # Notice config queue to map to cpu (mirrors smp_processor_id())
+ # It is beneficial to map IRQ /proc/irq/*/smp_affinity 1:1 to CPU number
+ pg_set $dev "flag QUEUE_MAP_CPU"
+
+ # Base config of dev
+ pg_set $dev "count $COUNT"
+ pg_set $dev "clone_skb $CLONE_SKB"
+ pg_set $dev "pkt_size $PKT_SIZE"
+ pg_set $dev "delay $DELAY"
+
+ # Flag example disabling timestamping
+ pg_set $dev "flag NO_TIMESTAMP"
+
+ # Destination
+ pg_set $dev "dst_mac $DST_MAC"
+ pg_set $dev "dst$IP6 $DEST_IP"
+
+ # Setup random UDP port src range
+ pg_set $dev "flag UDPSRC_RND"
+ pg_set $dev "udp_src_min $UDP_MIN"
+ pg_set $dev "udp_src_max $UDP_MAX"
+done
+
+# start_run
+echo "Running... ctrl^C to stop" >&2
+pg_ctrl "start"
+echo "Done" >&2
+
+# Print results
+for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
+ dev=${DEV}@${thread}
+ echo "Device: $dev"
+ cat /proc/net/pktgen/$dev | grep -A2 "Result:"
+done
diff --git a/samples/pktgen/pktgen_sample03_burst_single_flow.sh b/samples/pktgen/pktgen_sample03_burst_single_flow.sh
new file mode 100755
index 000000000..8fdd36722
--- /dev/null
+++ b/samples/pktgen/pktgen_sample03_burst_single_flow.sh
@@ -0,0 +1,85 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Script for max single flow performance
+# - If correctly tuned[1], single CPU 10G wirespeed small pkts is possible[2]
+#
+# Using pktgen "burst" option (use -b $N)
+# - To boost max performance
+# - Avail since: kernel v3.18
+# * commit 38b2cf2982dc73 ("net: pktgen: packet bursting via skb->xmit_more")
+# - This avoids writing the HW tailptr on every driver xmit
+# - The performance boost is impressive, see commit and blog [2]
+#
+# Notice: On purpose generates a single (UDP) flow towards target,
+# reason behind this is to only overload/activate a single CPU on
+# target host. And no randomness for pktgen also makes it faster.
+#
+# Tuning see:
+# [1] http://netoptimizer.blogspot.dk/2014/06/pktgen-for-network-overload-testing.html
+# [2] http://netoptimizer.blogspot.dk/2014/10/unlocked-10gbps-tx-wirespeed-smallest.html
+#
+basedir=`dirname $0`
+source ${basedir}/functions.sh
+root_check_run_with_sudo "$@"
+
+# Parameter parsing via include
+source ${basedir}/parameters.sh
+# Set some default params, if they didn't get set
+if [ -z "$DEST_IP" ]; then
+ [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1"
+fi
+[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff"
+[ -z "$BURST" ] && BURST=32
+[ -z "$CLONE_SKB" ] && CLONE_SKB="0" # No need for clones when bursting
+[ -z "$COUNT" ] && COUNT="0" # Zero means indefinitely
+
+# Base Config
+DELAY="0" # Zero means max speed
+
+# General cleanup everything since last run
+pg_ctrl "reset"
+
+# Threads are specified with parameter -t value in $THREADS
+for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
+ dev=${DEV}@${thread}
+
+ # Add remove all other devices and add_device $dev to thread
+ pg_thread $thread "rem_device_all"
+ pg_thread $thread "add_device" $dev
+
+ # Base config
+ pg_set $dev "flag QUEUE_MAP_CPU"
+ pg_set $dev "count $COUNT"
+ pg_set $dev "clone_skb $CLONE_SKB"
+ pg_set $dev "pkt_size $PKT_SIZE"
+ pg_set $dev "delay $DELAY"
+ pg_set $dev "flag NO_TIMESTAMP"
+
+ # Destination
+ pg_set $dev "dst_mac $DST_MAC"
+ pg_set $dev "dst$IP6 $DEST_IP"
+
+ # Setup burst, for easy testing -b 0 disable bursting
+ # (internally in pktgen default and minimum burst=1)
+ if [[ ${BURST} -ne 0 ]]; then
+ pg_set $dev "burst $BURST"
+ else
+ info "$dev: Not using burst"
+ fi
+done
+
+# Run if user hits control-c
+function control_c() {
+ # Print results
+ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
+ dev=${DEV}@${thread}
+ echo "Device: $dev"
+ cat /proc/net/pktgen/$dev | grep -A2 "Result:"
+ done
+}
+# trap keyboard interrupt (Ctrl-C)
+trap control_c SIGINT
+
+echo "Running... ctrl^C to stop" >&2
+pg_ctrl "start"
diff --git a/samples/pktgen/pktgen_sample04_many_flows.sh b/samples/pktgen/pktgen_sample04_many_flows.sh
new file mode 100755
index 000000000..4df92b717
--- /dev/null
+++ b/samples/pktgen/pktgen_sample04_many_flows.sh
@@ -0,0 +1,94 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Script example for many flows testing
+#
+# Number of simultaneous flows limited by variable $FLOWS
+# and number of packets per flow controlled by variable $FLOWLEN
+#
+basedir=`dirname $0`
+source ${basedir}/functions.sh
+root_check_run_with_sudo "$@"
+
+# Parameter parsing via include
+source ${basedir}/parameters.sh
+# Set some default params, if they didn't get set
+[ -z "$DEST_IP" ] && DEST_IP="198.18.0.42"
+[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff"
+[ -z "$CLONE_SKB" ] && CLONE_SKB="0"
+[ -z "$COUNT" ] && COUNT="0" # Zero means indefinitely
+
+# NOTICE: Script specific settings
+# =======
+# Limiting the number of concurrent flows ($FLOWS)
+# and also set how many packets each flow contains ($FLOWLEN)
+#
+[ -z "$FLOWS" ] && FLOWS="8000"
+[ -z "$FLOWLEN" ] && FLOWLEN="10"
+
+# Base Config
+DELAY="0" # Zero means max speed
+
+if [[ -n "$BURST" ]]; then
+ err 1 "Bursting not supported for this mode"
+fi
+
+# General cleanup everything since last run
+pg_ctrl "reset"
+
+# Threads are specified with parameter -t value in $THREADS
+for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
+ dev=${DEV}@${thread}
+
+ # Add remove all other devices and add_device $dev to thread
+ pg_thread $thread "rem_device_all"
+ pg_thread $thread "add_device" $dev
+
+ # Base config
+ pg_set $dev "flag QUEUE_MAP_CPU"
+ pg_set $dev "count $COUNT"
+ pg_set $dev "clone_skb $CLONE_SKB"
+ pg_set $dev "pkt_size $PKT_SIZE"
+ pg_set $dev "delay $DELAY"
+ pg_set $dev "flag NO_TIMESTAMP"
+
+ # Single destination
+ pg_set $dev "dst_mac $DST_MAC"
+ pg_set $dev "dst $DEST_IP"
+
+ # Randomize source IP-addresses
+ pg_set $dev "flag IPSRC_RND"
+ pg_set $dev "src_min 198.18.0.0"
+ pg_set $dev "src_max 198.19.255.255"
+
+ # Limit number of flows (max 65535)
+ pg_set $dev "flows $FLOWS"
+ #
+ # How many packets a flow will send, before flow "entry" is
+ # re-generated/setup.
+ pg_set $dev "flowlen $FLOWLEN"
+ #
+ # Flag FLOW_SEQ will cause $FLOWLEN packets from the same flow
+ # being send back-to-back, before next flow is selected
+ # incrementally. This helps lookup caches, and is more realistic.
+ #
+ pg_set $dev "flag FLOW_SEQ"
+
+done
+
+# Run if user hits control-c
+function print_result() {
+ # Print results
+ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
+ dev=${DEV}@${thread}
+ echo "Device: $dev"
+ cat /proc/net/pktgen/$dev | grep -A2 "Result:"
+ done
+}
+# trap keyboard interrupt (Ctrl-C)
+trap true SIGINT
+
+echo "Running... ctrl^C to stop" >&2
+pg_ctrl "start"
+
+print_result
diff --git a/samples/pktgen/pktgen_sample05_flow_per_thread.sh b/samples/pktgen/pktgen_sample05_flow_per_thread.sh
new file mode 100755
index 000000000..7f8b5e59f
--- /dev/null
+++ b/samples/pktgen/pktgen_sample05_flow_per_thread.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Script will generate one flow per thread (-t N)
+# - Same destination IP
+# - Fake source IPs for each flow (fixed based on thread number)
+#
+# Useful for scale testing on receiver, to see whether silo'ing flows
+# works and scales. For optimal scalability (on receiver) each
+# separate-flow should not access shared variables/data. This script
+# helps magnify any of these scaling issues by overloading the receiver.
+#
+basedir=`dirname $0`
+source ${basedir}/functions.sh
+root_check_run_with_sudo "$@"
+
+# Parameter parsing via include
+source ${basedir}/parameters.sh
+# Set some default params, if they didn't get set
+[ -z "$DEST_IP" ] && DEST_IP="198.18.0.42"
+[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff"
+[ -z "$CLONE_SKB" ] && CLONE_SKB="0"
+[ -z "$BURST" ] && BURST=32
+[ -z "$COUNT" ] && COUNT="0" # Zero means indefinitely
+
+
+# Base Config
+DELAY="0" # Zero means max speed
+
+# General cleanup everything since last run
+pg_ctrl "reset"
+
+# Threads are specified with parameter -t value in $THREADS
+for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
+ dev=${DEV}@${thread}
+
+ # Add remove all other devices and add_device $dev to thread
+ pg_thread $thread "rem_device_all"
+ pg_thread $thread "add_device" $dev
+
+ # Base config
+ pg_set $dev "flag QUEUE_MAP_CPU"
+ pg_set $dev "count $COUNT"
+ pg_set $dev "clone_skb $CLONE_SKB"
+ pg_set $dev "pkt_size $PKT_SIZE"
+ pg_set $dev "delay $DELAY"
+ pg_set $dev "flag NO_TIMESTAMP"
+
+ # Single destination
+ pg_set $dev "dst_mac $DST_MAC"
+ pg_set $dev "dst $DEST_IP"
+
+ # Setup source IP-addresses based on thread number
+ pg_set $dev "src_min 198.18.$((thread+1)).1"
+ pg_set $dev "src_max 198.18.$((thread+1)).1"
+
+ # Setup burst, for easy testing -b 0 disable bursting
+ # (internally in pktgen default and minimum burst=1)
+ if [[ ${BURST} -ne 0 ]]; then
+ pg_set $dev "burst $BURST"
+ else
+ info "$dev: Not using burst"
+ fi
+
+done
+
+# Run if user hits control-c
+function print_result() {
+ # Print results
+ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
+ dev=${DEV}@${thread}
+ echo "Device: $dev"
+ cat /proc/net/pktgen/$dev | grep -A2 "Result:"
+ done
+}
+# trap keyboard interrupt (Ctrl-C)
+trap true SIGINT
+
+echo "Running... ctrl^C to stop" >&2
+pg_ctrl "start"
+
+print_result
diff --git a/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh b/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh
new file mode 100755
index 000000000..353adc172
--- /dev/null
+++ b/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+#
+# Multiqueue: Using pktgen threads for sending on multiple CPUs
+# * adding devices to kernel threads which are in the same NUMA node
+# * bound devices queue's irq affinity to the threads, 1:1 mapping
+# * notice the naming scheme for keeping device names unique
+# * nameing scheme: dev@thread_number
+# * flow variation via random UDP source port
+#
+basedir=`dirname $0`
+source ${basedir}/functions.sh
+root_check_run_with_sudo "$@"
+#
+# Required param: -i dev in $DEV
+source ${basedir}/parameters.sh
+
+# Base Config
+DELAY="0" # Zero means max speed
+[ -z "$COUNT" ] && COUNT="20000000" # Zero means indefinitely
+[ -z "$CLONE_SKB" ] && CLONE_SKB="0"
+
+# Flow variation random source port between min and max
+UDP_MIN=9
+UDP_MAX=109
+
+node=`get_iface_node $DEV`
+irq_array=(`get_iface_irqs $DEV`)
+cpu_array=(`get_node_cpus $node`)
+
+[ $THREADS -gt ${#irq_array[*]} -o $THREADS -gt ${#cpu_array[*]} ] && \
+ err 1 "Thread number $THREADS exceeds: min (${#irq_array[*]},${#cpu_array[*]})"
+
+# (example of setting default params in your script)
+if [ -z "$DEST_IP" ]; then
+ [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1"
+fi
+[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff"
+
+# General cleanup everything since last run
+pg_ctrl "reset"
+
+# Threads are specified with parameter -t value in $THREADS
+for ((i = 0; i < $THREADS; i++)); do
+ # The device name is extended with @name, using thread number to
+ # make then unique, but any name will do.
+ # Set the queue's irq affinity to this $thread (processor)
+ # if '-f' is designated, offset cpu id
+ thread=${cpu_array[$((i+F_THREAD))]}
+ dev=${DEV}@${thread}
+ echo $thread > /proc/irq/${irq_array[$i]}/smp_affinity_list
+ info "irq ${irq_array[$i]} is set affinity to `cat /proc/irq/${irq_array[$i]}/smp_affinity_list`"
+
+ # Add remove all other devices and add_device $dev to thread
+ pg_thread $thread "rem_device_all"
+ pg_thread $thread "add_device" $dev
+
+ # select queue and bind the queue and $dev in 1:1 relationship
+ queue_num=$i
+ info "queue number is $queue_num"
+ pg_set $dev "queue_map_min $queue_num"
+ pg_set $dev "queue_map_max $queue_num"
+
+ # Notice config queue to map to cpu (mirrors smp_processor_id())
+ # It is beneficial to map IRQ /proc/irq/*/smp_affinity 1:1 to CPU number
+ pg_set $dev "flag QUEUE_MAP_CPU"
+
+ # Base config of dev
+ pg_set $dev "count $COUNT"
+ pg_set $dev "clone_skb $CLONE_SKB"
+ pg_set $dev "pkt_size $PKT_SIZE"
+ pg_set $dev "delay $DELAY"
+
+ # Flag example disabling timestamping
+ pg_set $dev "flag NO_TIMESTAMP"
+
+ # Destination
+ pg_set $dev "dst_mac $DST_MAC"
+ pg_set $dev "dst$IP6 $DEST_IP"
+
+ # Setup random UDP port src range
+ pg_set $dev "flag UDPSRC_RND"
+ pg_set $dev "udp_src_min $UDP_MIN"
+ pg_set $dev "udp_src_max $UDP_MAX"
+done
+
+# start_run
+echo "Running... ctrl^C to stop" >&2
+pg_ctrl "start"
+echo "Done" >&2
+
+# Print results
+for ((i = 0; i < $THREADS; i++)); do
+ thread=${cpu_array[$((i+F_THREAD))]}
+ dev=${DEV}@${thread}
+ echo "Device: $dev"
+ cat /proc/net/pktgen/$dev | grep -A2 "Result:"
+done
diff --git a/samples/qmi/Makefile b/samples/qmi/Makefile
new file mode 100644
index 000000000..2b111d276
--- /dev/null
+++ b/samples/qmi/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_SAMPLE_QMI_CLIENT) += qmi_sample_client.o
diff --git a/samples/qmi/qmi_sample_client.c b/samples/qmi/qmi_sample_client.c
new file mode 100644
index 000000000..c9e7276c3
--- /dev/null
+++ b/samples/qmi/qmi_sample_client.c
@@ -0,0 +1,622 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Sample in-kernel QMI client driver
+ *
+ * Copyright (c) 2013-2014, The Linux Foundation. All rights reserved.
+ * Copyright (C) 2017 Linaro Ltd.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/platform_device.h>
+#include <linux/qrtr.h>
+#include <linux/net.h>
+#include <linux/completion.h>
+#include <linux/idr.h>
+#include <linux/string.h>
+#include <net/sock.h>
+#include <linux/soc/qcom/qmi.h>
+
+#define PING_REQ1_TLV_TYPE 0x1
+#define PING_RESP1_TLV_TYPE 0x2
+#define PING_OPT1_TLV_TYPE 0x10
+#define PING_OPT2_TLV_TYPE 0x11
+
+#define DATA_REQ1_TLV_TYPE 0x1
+#define DATA_RESP1_TLV_TYPE 0x2
+#define DATA_OPT1_TLV_TYPE 0x10
+#define DATA_OPT2_TLV_TYPE 0x11
+
+#define TEST_MED_DATA_SIZE_V01 8192
+#define TEST_MAX_NAME_SIZE_V01 255
+
+#define TEST_PING_REQ_MSG_ID_V01 0x20
+#define TEST_DATA_REQ_MSG_ID_V01 0x21
+
+#define TEST_PING_REQ_MAX_MSG_LEN_V01 266
+#define TEST_DATA_REQ_MAX_MSG_LEN_V01 8456
+
+struct test_name_type_v01 {
+ u32 name_len;
+ char name[TEST_MAX_NAME_SIZE_V01];
+};
+
+static struct qmi_elem_info test_name_type_v01_ei[] = {
+ {
+ .data_type = QMI_DATA_LEN,
+ .elem_len = 1,
+ .elem_size = sizeof(u8),
+ .array_type = NO_ARRAY,
+ .tlv_type = QMI_COMMON_TLV_TYPE,
+ .offset = offsetof(struct test_name_type_v01,
+ name_len),
+ },
+ {
+ .data_type = QMI_UNSIGNED_1_BYTE,
+ .elem_len = TEST_MAX_NAME_SIZE_V01,
+ .elem_size = sizeof(char),
+ .array_type = VAR_LEN_ARRAY,
+ .tlv_type = QMI_COMMON_TLV_TYPE,
+ .offset = offsetof(struct test_name_type_v01,
+ name),
+ },
+ {}
+};
+
+struct test_ping_req_msg_v01 {
+ char ping[4];
+
+ u8 client_name_valid;
+ struct test_name_type_v01 client_name;
+};
+
+static struct qmi_elem_info test_ping_req_msg_v01_ei[] = {
+ {
+ .data_type = QMI_UNSIGNED_1_BYTE,
+ .elem_len = 4,
+ .elem_size = sizeof(char),
+ .array_type = STATIC_ARRAY,
+ .tlv_type = PING_REQ1_TLV_TYPE,
+ .offset = offsetof(struct test_ping_req_msg_v01,
+ ping),
+ },
+ {
+ .data_type = QMI_OPT_FLAG,
+ .elem_len = 1,
+ .elem_size = sizeof(u8),
+ .array_type = NO_ARRAY,
+ .tlv_type = PING_OPT1_TLV_TYPE,
+ .offset = offsetof(struct test_ping_req_msg_v01,
+ client_name_valid),
+ },
+ {
+ .data_type = QMI_STRUCT,
+ .elem_len = 1,
+ .elem_size = sizeof(struct test_name_type_v01),
+ .array_type = NO_ARRAY,
+ .tlv_type = PING_OPT1_TLV_TYPE,
+ .offset = offsetof(struct test_ping_req_msg_v01,
+ client_name),
+ .ei_array = test_name_type_v01_ei,
+ },
+ {}
+};
+
+struct test_ping_resp_msg_v01 {
+ struct qmi_response_type_v01 resp;
+
+ u8 pong_valid;
+ char pong[4];
+
+ u8 service_name_valid;
+ struct test_name_type_v01 service_name;
+};
+
+static struct qmi_elem_info test_ping_resp_msg_v01_ei[] = {
+ {
+ .data_type = QMI_STRUCT,
+ .elem_len = 1,
+ .elem_size = sizeof(struct qmi_response_type_v01),
+ .array_type = NO_ARRAY,
+ .tlv_type = PING_RESP1_TLV_TYPE,
+ .offset = offsetof(struct test_ping_resp_msg_v01,
+ resp),
+ .ei_array = qmi_response_type_v01_ei,
+ },
+ {
+ .data_type = QMI_OPT_FLAG,
+ .elem_len = 1,
+ .elem_size = sizeof(u8),
+ .array_type = NO_ARRAY,
+ .tlv_type = PING_OPT1_TLV_TYPE,
+ .offset = offsetof(struct test_ping_resp_msg_v01,
+ pong_valid),
+ },
+ {
+ .data_type = QMI_UNSIGNED_1_BYTE,
+ .elem_len = 4,
+ .elem_size = sizeof(char),
+ .array_type = STATIC_ARRAY,
+ .tlv_type = PING_OPT1_TLV_TYPE,
+ .offset = offsetof(struct test_ping_resp_msg_v01,
+ pong),
+ },
+ {
+ .data_type = QMI_OPT_FLAG,
+ .elem_len = 1,
+ .elem_size = sizeof(u8),
+ .array_type = NO_ARRAY,
+ .tlv_type = PING_OPT2_TLV_TYPE,
+ .offset = offsetof(struct test_ping_resp_msg_v01,
+ service_name_valid),
+ },
+ {
+ .data_type = QMI_STRUCT,
+ .elem_len = 1,
+ .elem_size = sizeof(struct test_name_type_v01),
+ .array_type = NO_ARRAY,
+ .tlv_type = PING_OPT2_TLV_TYPE,
+ .offset = offsetof(struct test_ping_resp_msg_v01,
+ service_name),
+ .ei_array = test_name_type_v01_ei,
+ },
+ {}
+};
+
+struct test_data_req_msg_v01 {
+ u32 data_len;
+ u8 data[TEST_MED_DATA_SIZE_V01];
+
+ u8 client_name_valid;
+ struct test_name_type_v01 client_name;
+};
+
+static struct qmi_elem_info test_data_req_msg_v01_ei[] = {
+ {
+ .data_type = QMI_DATA_LEN,
+ .elem_len = 1,
+ .elem_size = sizeof(u32),
+ .array_type = NO_ARRAY,
+ .tlv_type = DATA_REQ1_TLV_TYPE,
+ .offset = offsetof(struct test_data_req_msg_v01,
+ data_len),
+ },
+ {
+ .data_type = QMI_UNSIGNED_1_BYTE,
+ .elem_len = TEST_MED_DATA_SIZE_V01,
+ .elem_size = sizeof(u8),
+ .array_type = VAR_LEN_ARRAY,
+ .tlv_type = DATA_REQ1_TLV_TYPE,
+ .offset = offsetof(struct test_data_req_msg_v01,
+ data),
+ },
+ {
+ .data_type = QMI_OPT_FLAG,
+ .elem_len = 1,
+ .elem_size = sizeof(u8),
+ .array_type = NO_ARRAY,
+ .tlv_type = DATA_OPT1_TLV_TYPE,
+ .offset = offsetof(struct test_data_req_msg_v01,
+ client_name_valid),
+ },
+ {
+ .data_type = QMI_STRUCT,
+ .elem_len = 1,
+ .elem_size = sizeof(struct test_name_type_v01),
+ .array_type = NO_ARRAY,
+ .tlv_type = DATA_OPT1_TLV_TYPE,
+ .offset = offsetof(struct test_data_req_msg_v01,
+ client_name),
+ .ei_array = test_name_type_v01_ei,
+ },
+ {}
+};
+
+struct test_data_resp_msg_v01 {
+ struct qmi_response_type_v01 resp;
+
+ u8 data_valid;
+ u32 data_len;
+ u8 data[TEST_MED_DATA_SIZE_V01];
+
+ u8 service_name_valid;
+ struct test_name_type_v01 service_name;
+};
+
+static struct qmi_elem_info test_data_resp_msg_v01_ei[] = {
+ {
+ .data_type = QMI_STRUCT,
+ .elem_len = 1,
+ .elem_size = sizeof(struct qmi_response_type_v01),
+ .array_type = NO_ARRAY,
+ .tlv_type = DATA_RESP1_TLV_TYPE,
+ .offset = offsetof(struct test_data_resp_msg_v01,
+ resp),
+ .ei_array = qmi_response_type_v01_ei,
+ },
+ {
+ .data_type = QMI_OPT_FLAG,
+ .elem_len = 1,
+ .elem_size = sizeof(u8),
+ .array_type = NO_ARRAY,
+ .tlv_type = DATA_OPT1_TLV_TYPE,
+ .offset = offsetof(struct test_data_resp_msg_v01,
+ data_valid),
+ },
+ {
+ .data_type = QMI_DATA_LEN,
+ .elem_len = 1,
+ .elem_size = sizeof(u32),
+ .array_type = NO_ARRAY,
+ .tlv_type = DATA_OPT1_TLV_TYPE,
+ .offset = offsetof(struct test_data_resp_msg_v01,
+ data_len),
+ },
+ {
+ .data_type = QMI_UNSIGNED_1_BYTE,
+ .elem_len = TEST_MED_DATA_SIZE_V01,
+ .elem_size = sizeof(u8),
+ .array_type = VAR_LEN_ARRAY,
+ .tlv_type = DATA_OPT1_TLV_TYPE,
+ .offset = offsetof(struct test_data_resp_msg_v01,
+ data),
+ },
+ {
+ .data_type = QMI_OPT_FLAG,
+ .elem_len = 1,
+ .elem_size = sizeof(u8),
+ .array_type = NO_ARRAY,
+ .tlv_type = DATA_OPT2_TLV_TYPE,
+ .offset = offsetof(struct test_data_resp_msg_v01,
+ service_name_valid),
+ },
+ {
+ .data_type = QMI_STRUCT,
+ .elem_len = 1,
+ .elem_size = sizeof(struct test_name_type_v01),
+ .array_type = NO_ARRAY,
+ .tlv_type = DATA_OPT2_TLV_TYPE,
+ .offset = offsetof(struct test_data_resp_msg_v01,
+ service_name),
+ .ei_array = test_name_type_v01_ei,
+ },
+ {}
+};
+
+/*
+ * ping_write() - ping_pong debugfs file write handler
+ * @file: debugfs file context
+ * @user_buf: reference to the user data (ignored)
+ * @count: number of bytes in @user_buf
+ * @ppos: offset in @file to write
+ *
+ * This function allows user space to send out a ping_pong QMI encoded message
+ * to the associated remote test service and will return with the result of the
+ * transaction. It serves as an example of how to provide a custom response
+ * handler.
+ *
+ * Return: @count, or negative errno on failure.
+ */
+static ssize_t ping_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct qmi_handle *qmi = file->private_data;
+ struct test_ping_req_msg_v01 req = {};
+ struct qmi_txn txn;
+ int ret;
+
+ memcpy(req.ping, "ping", sizeof(req.ping));
+
+ ret = qmi_txn_init(qmi, &txn, NULL, NULL);
+ if (ret < 0)
+ return ret;
+
+ ret = qmi_send_request(qmi, NULL, &txn,
+ TEST_PING_REQ_MSG_ID_V01,
+ TEST_PING_REQ_MAX_MSG_LEN_V01,
+ test_ping_req_msg_v01_ei, &req);
+ if (ret < 0) {
+ qmi_txn_cancel(&txn);
+ return ret;
+ }
+
+ ret = qmi_txn_wait(&txn, 5 * HZ);
+ if (ret < 0)
+ count = ret;
+
+ return count;
+}
+
+static const struct file_operations ping_fops = {
+ .open = simple_open,
+ .write = ping_write,
+};
+
+static void ping_pong_cb(struct qmi_handle *qmi, struct sockaddr_qrtr *sq,
+ struct qmi_txn *txn, const void *data)
+{
+ const struct test_ping_resp_msg_v01 *resp = data;
+
+ if (!txn) {
+ pr_err("spurious ping response\n");
+ return;
+ }
+
+ if (resp->resp.result == QMI_RESULT_FAILURE_V01)
+ txn->result = -ENXIO;
+ else if (!resp->pong_valid || memcmp(resp->pong, "pong", 4))
+ txn->result = -EINVAL;
+
+ complete(&txn->completion);
+}
+
+/*
+ * data_write() - data debugfs file write handler
+ * @file: debugfs file context
+ * @user_buf: reference to the user data
+ * @count: number of bytes in @user_buf
+ * @ppos: offset in @file to write
+ *
+ * This function allows user space to send out a data QMI encoded message to
+ * the associated remote test service and will return with the result of the
+ * transaction. It serves as an example of how to have the QMI helpers decode a
+ * transaction response into a provided object automatically.
+ *
+ * Return: @count, or negative errno on failure.
+ */
+static ssize_t data_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+
+{
+ struct qmi_handle *qmi = file->private_data;
+ struct test_data_resp_msg_v01 *resp;
+ struct test_data_req_msg_v01 *req;
+ struct qmi_txn txn;
+ int ret;
+
+ req = kzalloc(sizeof(*req), GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ resp = kzalloc(sizeof(*resp), GFP_KERNEL);
+ if (!resp) {
+ kfree(req);
+ return -ENOMEM;
+ }
+
+ req->data_len = min_t(size_t, sizeof(req->data), count);
+ if (copy_from_user(req->data, user_buf, req->data_len)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ ret = qmi_txn_init(qmi, &txn, test_data_resp_msg_v01_ei, resp);
+ if (ret < 0)
+ goto out;
+
+ ret = qmi_send_request(qmi, NULL, &txn,
+ TEST_DATA_REQ_MSG_ID_V01,
+ TEST_DATA_REQ_MAX_MSG_LEN_V01,
+ test_data_req_msg_v01_ei, req);
+ if (ret < 0) {
+ qmi_txn_cancel(&txn);
+ goto out;
+ }
+
+ ret = qmi_txn_wait(&txn, 5 * HZ);
+ if (ret < 0) {
+ goto out;
+ } else if (!resp->data_valid ||
+ resp->data_len != req->data_len ||
+ memcmp(resp->data, req->data, req->data_len)) {
+ pr_err("response data doesn't match expectation\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = count;
+
+out:
+ kfree(resp);
+ kfree(req);
+
+ return ret;
+}
+
+static const struct file_operations data_fops = {
+ .open = simple_open,
+ .write = data_write,
+};
+
+static struct qmi_msg_handler qmi_sample_handlers[] = {
+ {
+ .type = QMI_RESPONSE,
+ .msg_id = TEST_PING_REQ_MSG_ID_V01,
+ .ei = test_ping_resp_msg_v01_ei,
+ .decoded_size = sizeof(struct test_ping_req_msg_v01),
+ .fn = ping_pong_cb
+ },
+ {}
+};
+
+struct qmi_sample {
+ struct qmi_handle qmi;
+
+ struct dentry *de_dir;
+ struct dentry *de_data;
+ struct dentry *de_ping;
+};
+
+static struct dentry *qmi_debug_dir;
+
+static int qmi_sample_probe(struct platform_device *pdev)
+{
+ struct sockaddr_qrtr *sq;
+ struct qmi_sample *sample;
+ char path[20];
+ int ret;
+
+ sample = devm_kzalloc(&pdev->dev, sizeof(*sample), GFP_KERNEL);
+ if (!sample)
+ return -ENOMEM;
+
+ ret = qmi_handle_init(&sample->qmi, TEST_DATA_REQ_MAX_MSG_LEN_V01,
+ NULL,
+ qmi_sample_handlers);
+ if (ret < 0)
+ return ret;
+
+ sq = dev_get_platdata(&pdev->dev);
+ ret = kernel_connect(sample->qmi.sock, (struct sockaddr *)sq,
+ sizeof(*sq), 0);
+ if (ret < 0) {
+ pr_err("failed to connect to remote service port\n");
+ goto err_release_qmi_handle;
+ }
+
+ snprintf(path, sizeof(path), "%d:%d", sq->sq_node, sq->sq_port);
+
+ sample->de_dir = debugfs_create_dir(path, qmi_debug_dir);
+ if (IS_ERR(sample->de_dir)) {
+ ret = PTR_ERR(sample->de_dir);
+ goto err_release_qmi_handle;
+ }
+
+ sample->de_data = debugfs_create_file("data", 0600, sample->de_dir,
+ sample, &data_fops);
+ if (IS_ERR(sample->de_data)) {
+ ret = PTR_ERR(sample->de_data);
+ goto err_remove_de_dir;
+ }
+
+ sample->de_ping = debugfs_create_file("ping", 0600, sample->de_dir,
+ sample, &ping_fops);
+ if (IS_ERR(sample->de_ping)) {
+ ret = PTR_ERR(sample->de_ping);
+ goto err_remove_de_data;
+ }
+
+ platform_set_drvdata(pdev, sample);
+
+ return 0;
+
+err_remove_de_data:
+ debugfs_remove(sample->de_data);
+err_remove_de_dir:
+ debugfs_remove(sample->de_dir);
+err_release_qmi_handle:
+ qmi_handle_release(&sample->qmi);
+
+ return ret;
+}
+
+static int qmi_sample_remove(struct platform_device *pdev)
+{
+ struct qmi_sample *sample = platform_get_drvdata(pdev);
+
+ debugfs_remove(sample->de_ping);
+ debugfs_remove(sample->de_data);
+ debugfs_remove(sample->de_dir);
+
+ qmi_handle_release(&sample->qmi);
+
+ return 0;
+}
+
+static struct platform_driver qmi_sample_driver = {
+ .probe = qmi_sample_probe,
+ .remove = qmi_sample_remove,
+ .driver = {
+ .name = "qmi_sample_client",
+ },
+};
+
+static int qmi_sample_new_server(struct qmi_handle *qmi,
+ struct qmi_service *service)
+{
+ struct platform_device *pdev;
+ struct sockaddr_qrtr sq = { AF_QIPCRTR, service->node, service->port };
+ int ret;
+
+ pdev = platform_device_alloc("qmi_sample_client", PLATFORM_DEVID_AUTO);
+ if (!pdev)
+ return -ENOMEM;
+
+ ret = platform_device_add_data(pdev, &sq, sizeof(sq));
+ if (ret)
+ goto err_put_device;
+
+ ret = platform_device_add(pdev);
+ if (ret)
+ goto err_put_device;
+
+ service->priv = pdev;
+
+ return 0;
+
+err_put_device:
+ platform_device_put(pdev);
+
+ return ret;
+}
+
+static void qmi_sample_del_server(struct qmi_handle *qmi,
+ struct qmi_service *service)
+{
+ struct platform_device *pdev = service->priv;
+
+ platform_device_unregister(pdev);
+}
+
+static struct qmi_handle lookup_client;
+
+static struct qmi_ops lookup_ops = {
+ .new_server = qmi_sample_new_server,
+ .del_server = qmi_sample_del_server,
+};
+
+static int qmi_sample_init(void)
+{
+ int ret;
+
+ qmi_debug_dir = debugfs_create_dir("qmi_sample", NULL);
+ if (IS_ERR(qmi_debug_dir)) {
+ pr_err("failed to create qmi_sample dir\n");
+ return PTR_ERR(qmi_debug_dir);
+ }
+
+ ret = platform_driver_register(&qmi_sample_driver);
+ if (ret)
+ goto err_remove_debug_dir;
+
+ ret = qmi_handle_init(&lookup_client, 0, &lookup_ops, NULL);
+ if (ret < 0)
+ goto err_unregister_driver;
+
+ qmi_add_lookup(&lookup_client, 15, 0, 0);
+
+ return 0;
+
+err_unregister_driver:
+ platform_driver_unregister(&qmi_sample_driver);
+err_remove_debug_dir:
+ debugfs_remove(qmi_debug_dir);
+
+ return ret;
+}
+
+static void qmi_sample_exit(void)
+{
+ qmi_handle_release(&lookup_client);
+
+ platform_driver_unregister(&qmi_sample_driver);
+
+ debugfs_remove(qmi_debug_dir);
+}
+
+module_init(qmi_sample_init);
+module_exit(qmi_sample_exit);
+
+MODULE_DESCRIPTION("Sample QMI client driver");
+MODULE_LICENSE("GPL v2");
diff --git a/samples/rpmsg/Makefile b/samples/rpmsg/Makefile
new file mode 100644
index 000000000..2d4973c69
--- /dev/null
+++ b/samples/rpmsg/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_SAMPLE_RPMSG_CLIENT) += rpmsg_client_sample.o
diff --git a/samples/rpmsg/rpmsg_client_sample.c b/samples/rpmsg/rpmsg_client_sample.c
new file mode 100644
index 000000000..f161dfd3e
--- /dev/null
+++ b/samples/rpmsg/rpmsg_client_sample.c
@@ -0,0 +1,102 @@
+/*
+ * Remote processor messaging - sample client driver
+ *
+ * Copyright (C) 2011 Texas Instruments, Inc.
+ * Copyright (C) 2011 Google, Inc.
+ *
+ * Ohad Ben-Cohen <ohad@wizery.com>
+ * Brian Swetland <swetland@google.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/rpmsg.h>
+
+#define MSG "hello world!"
+#define MSG_LIMIT 100
+
+struct instance_data {
+ int rx_count;
+};
+
+static int rpmsg_sample_cb(struct rpmsg_device *rpdev, void *data, int len,
+ void *priv, u32 src)
+{
+ int ret;
+ struct instance_data *idata = dev_get_drvdata(&rpdev->dev);
+
+ dev_info(&rpdev->dev, "incoming msg %d (src: 0x%x)\n",
+ ++idata->rx_count, src);
+
+ print_hex_dump(KERN_DEBUG, __func__, DUMP_PREFIX_NONE, 16, 1,
+ data, len, true);
+
+ /* samples should not live forever */
+ if (idata->rx_count >= MSG_LIMIT) {
+ dev_info(&rpdev->dev, "goodbye!\n");
+ return 0;
+ }
+
+ /* send a new message now */
+ ret = rpmsg_send(rpdev->ept, MSG, strlen(MSG));
+ if (ret)
+ dev_err(&rpdev->dev, "rpmsg_send failed: %d\n", ret);
+
+ return 0;
+}
+
+static int rpmsg_sample_probe(struct rpmsg_device *rpdev)
+{
+ int ret;
+ struct instance_data *idata;
+
+ dev_info(&rpdev->dev, "new channel: 0x%x -> 0x%x!\n",
+ rpdev->src, rpdev->dst);
+
+ idata = devm_kzalloc(&rpdev->dev, sizeof(*idata), GFP_KERNEL);
+ if (!idata)
+ return -ENOMEM;
+
+ dev_set_drvdata(&rpdev->dev, idata);
+
+ /* send a message to our remote processor */
+ ret = rpmsg_send(rpdev->ept, MSG, strlen(MSG));
+ if (ret) {
+ dev_err(&rpdev->dev, "rpmsg_send failed: %d\n", ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void rpmsg_sample_remove(struct rpmsg_device *rpdev)
+{
+ dev_info(&rpdev->dev, "rpmsg sample client driver is removed\n");
+}
+
+static struct rpmsg_device_id rpmsg_driver_sample_id_table[] = {
+ { .name = "rpmsg-client-sample" },
+ { },
+};
+MODULE_DEVICE_TABLE(rpmsg, rpmsg_driver_sample_id_table);
+
+static struct rpmsg_driver rpmsg_sample_client = {
+ .drv.name = KBUILD_MODNAME,
+ .id_table = rpmsg_driver_sample_id_table,
+ .probe = rpmsg_sample_probe,
+ .callback = rpmsg_sample_cb,
+ .remove = rpmsg_sample_remove,
+};
+module_rpmsg_driver(rpmsg_sample_client);
+
+MODULE_DESCRIPTION("Remote processor messaging sample client driver");
+MODULE_LICENSE("GPL v2");
diff --git a/samples/seccomp/.gitignore b/samples/seccomp/.gitignore
new file mode 100644
index 000000000..78fb78184
--- /dev/null
+++ b/samples/seccomp/.gitignore
@@ -0,0 +1,3 @@
+bpf-direct
+bpf-fancy
+dropper
diff --git a/samples/seccomp/Makefile b/samples/seccomp/Makefile
new file mode 100644
index 000000000..cf34ff6b4
--- /dev/null
+++ b/samples/seccomp/Makefile
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: GPL-2.0
+ifndef CROSS_COMPILE
+hostprogs-$(CONFIG_SAMPLE_SECCOMP) := bpf-fancy dropper bpf-direct
+
+HOSTCFLAGS_bpf-fancy.o += -I$(objtree)/usr/include
+HOSTCFLAGS_bpf-fancy.o += -idirafter $(objtree)/include
+HOSTCFLAGS_bpf-helper.o += -I$(objtree)/usr/include
+HOSTCFLAGS_bpf-helper.o += -idirafter $(objtree)/include
+bpf-fancy-objs := bpf-fancy.o bpf-helper.o
+
+HOSTCFLAGS_dropper.o += -I$(objtree)/usr/include
+HOSTCFLAGS_dropper.o += -idirafter $(objtree)/include
+dropper-objs := dropper.o
+
+HOSTCFLAGS_bpf-direct.o += -I$(objtree)/usr/include
+HOSTCFLAGS_bpf-direct.o += -idirafter $(objtree)/include
+bpf-direct-objs := bpf-direct.o
+
+# Try to match the kernel target.
+ifndef CONFIG_64BIT
+
+# s390 has -m31 flag to build 31 bit binaries
+ifndef CONFIG_S390
+MFLAG = -m32
+else
+MFLAG = -m31
+endif
+
+HOSTCFLAGS_bpf-direct.o += $(MFLAG)
+HOSTCFLAGS_dropper.o += $(MFLAG)
+HOSTCFLAGS_bpf-helper.o += $(MFLAG)
+HOSTCFLAGS_bpf-fancy.o += $(MFLAG)
+HOSTLDLIBS_bpf-direct += $(MFLAG)
+HOSTLDLIBS_bpf-fancy += $(MFLAG)
+HOSTLDLIBS_dropper += $(MFLAG)
+endif
+always := $(hostprogs-m)
+endif
diff --git a/samples/seccomp/bpf-direct.c b/samples/seccomp/bpf-direct.c
new file mode 100644
index 000000000..c09e4a17a
--- /dev/null
+++ b/samples/seccomp/bpf-direct.c
@@ -0,0 +1,191 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Seccomp filter example for x86 (32-bit and 64-bit) with BPF macros
+ *
+ * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ * Author: Will Drewry <wad@chromium.org>
+ *
+ * The code may be used by anyone for any purpose,
+ * and can serve as a starting point for developing
+ * applications using prctl(PR_SET_SECCOMP, 2, ...).
+ */
+#if defined(__i386__) || defined(__x86_64__)
+#define SUPPORTED_ARCH 1
+#endif
+
+#if defined(SUPPORTED_ARCH)
+#define __USE_GNU 1
+#define _GNU_SOURCE 1
+
+#include <linux/types.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <linux/unistd.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stddef.h>
+#include <string.h>
+#include <sys/prctl.h>
+#include <unistd.h>
+
+#define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]))
+#define syscall_nr (offsetof(struct seccomp_data, nr))
+
+#if defined(__i386__)
+#define REG_RESULT REG_EAX
+#define REG_SYSCALL REG_EAX
+#define REG_ARG0 REG_EBX
+#define REG_ARG1 REG_ECX
+#define REG_ARG2 REG_EDX
+#define REG_ARG3 REG_ESI
+#define REG_ARG4 REG_EDI
+#define REG_ARG5 REG_EBP
+#elif defined(__x86_64__)
+#define REG_RESULT REG_RAX
+#define REG_SYSCALL REG_RAX
+#define REG_ARG0 REG_RDI
+#define REG_ARG1 REG_RSI
+#define REG_ARG2 REG_RDX
+#define REG_ARG3 REG_R10
+#define REG_ARG4 REG_R8
+#define REG_ARG5 REG_R9
+#endif
+
+#ifndef PR_SET_NO_NEW_PRIVS
+#define PR_SET_NO_NEW_PRIVS 38
+#endif
+
+#ifndef SYS_SECCOMP
+#define SYS_SECCOMP 1
+#endif
+
+static void emulator(int nr, siginfo_t *info, void *void_context)
+{
+ ucontext_t *ctx = (ucontext_t *)(void_context);
+ int syscall;
+ char *buf;
+ ssize_t bytes;
+ size_t len;
+ if (info->si_code != SYS_SECCOMP)
+ return;
+ if (!ctx)
+ return;
+ syscall = ctx->uc_mcontext.gregs[REG_SYSCALL];
+ buf = (char *) ctx->uc_mcontext.gregs[REG_ARG1];
+ len = (size_t) ctx->uc_mcontext.gregs[REG_ARG2];
+
+ if (syscall != __NR_write)
+ return;
+ if (ctx->uc_mcontext.gregs[REG_ARG0] != STDERR_FILENO)
+ return;
+ /* Redirect stderr messages to stdout. Doesn't handle EINTR, etc */
+ ctx->uc_mcontext.gregs[REG_RESULT] = -1;
+ if (write(STDOUT_FILENO, "[ERR] ", 6) > 0) {
+ bytes = write(STDOUT_FILENO, buf, len);
+ ctx->uc_mcontext.gregs[REG_RESULT] = bytes;
+ }
+ return;
+}
+
+static int install_emulator(void)
+{
+ struct sigaction act;
+ sigset_t mask;
+ memset(&act, 0, sizeof(act));
+ sigemptyset(&mask);
+ sigaddset(&mask, SIGSYS);
+
+ act.sa_sigaction = &emulator;
+ act.sa_flags = SA_SIGINFO;
+ if (sigaction(SIGSYS, &act, NULL) < 0) {
+ perror("sigaction");
+ return -1;
+ }
+ if (sigprocmask(SIG_UNBLOCK, &mask, NULL)) {
+ perror("sigprocmask");
+ return -1;
+ }
+ return 0;
+}
+
+static int install_filter(void)
+{
+ struct sock_filter filter[] = {
+ /* Grab the system call number */
+ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_nr),
+ /* Jump table for the allowed syscalls */
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_rt_sigreturn, 0, 1),
+ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
+#ifdef __NR_sigreturn
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_sigreturn, 0, 1),
+ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
+#endif
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_exit_group, 0, 1),
+ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_exit, 0, 1),
+ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_read, 1, 0),
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_write, 3, 2),
+
+ /* Check that read is only using stdin. */
+ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_arg(0)),
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDIN_FILENO, 4, 0),
+ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),
+
+ /* Check that write is only using stdout */
+ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_arg(0)),
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDOUT_FILENO, 1, 0),
+ /* Trap attempts to write to stderr */
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDERR_FILENO, 1, 2),
+
+ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
+ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_TRAP),
+ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
+ .filter = filter,
+ };
+
+ if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+ perror("prctl(NO_NEW_PRIVS)");
+ return 1;
+ }
+
+
+ if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
+ perror("prctl");
+ return 1;
+ }
+ return 0;
+}
+
+#define payload(_c) (_c), sizeof((_c))
+int main(int argc, char **argv)
+{
+ char buf[4096];
+ ssize_t bytes = 0;
+ if (install_emulator())
+ return 1;
+ if (install_filter())
+ return 1;
+ syscall(__NR_write, STDOUT_FILENO,
+ payload("OHAI! WHAT IS YOUR NAME? "));
+ bytes = syscall(__NR_read, STDIN_FILENO, buf, sizeof(buf));
+ syscall(__NR_write, STDOUT_FILENO, payload("HELLO, "));
+ syscall(__NR_write, STDOUT_FILENO, buf, bytes);
+ syscall(__NR_write, STDERR_FILENO,
+ payload("Error message going to STDERR\n"));
+ return 0;
+}
+#else /* SUPPORTED_ARCH */
+/*
+ * This sample is x86-only. Since kernel samples are compiled with the
+ * host toolchain, a non-x86 host will result in using only the main()
+ * below.
+ */
+int main(void)
+{
+ return 1;
+}
+#endif /* SUPPORTED_ARCH */
diff --git a/samples/seccomp/bpf-fancy.c b/samples/seccomp/bpf-fancy.c
new file mode 100644
index 000000000..1ccb43502
--- /dev/null
+++ b/samples/seccomp/bpf-fancy.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Seccomp BPF example using a macro-based generator.
+ *
+ * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ * Author: Will Drewry <wad@chromium.org>
+ *
+ * The code may be used by anyone for any purpose,
+ * and can serve as a starting point for developing
+ * applications using prctl(PR_ATTACH_SECCOMP_FILTER).
+ */
+
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <linux/unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/prctl.h>
+#include <unistd.h>
+
+#include "bpf-helper.h"
+
+#ifndef PR_SET_NO_NEW_PRIVS
+#define PR_SET_NO_NEW_PRIVS 38
+#endif
+
+int main(int argc, char **argv)
+{
+ struct bpf_labels l = {
+ .count = 0,
+ };
+ static const char msg1[] = "Please type something: ";
+ static const char msg2[] = "You typed: ";
+ char buf[256];
+ struct sock_filter filter[] = {
+ /* TODO: LOAD_SYSCALL_NR(arch) and enforce an arch */
+ LOAD_SYSCALL_NR,
+ SYSCALL(__NR_exit, ALLOW),
+ SYSCALL(__NR_exit_group, ALLOW),
+ SYSCALL(__NR_write, JUMP(&l, write_fd)),
+ SYSCALL(__NR_read, JUMP(&l, read)),
+ DENY, /* Don't passthrough into a label */
+
+ LABEL(&l, read),
+ ARG(0),
+ JNE(STDIN_FILENO, DENY),
+ ARG(1),
+ JNE((unsigned long)buf, DENY),
+ ARG(2),
+ JGE(sizeof(buf), DENY),
+ ALLOW,
+
+ LABEL(&l, write_fd),
+ ARG(0),
+ JEQ(STDOUT_FILENO, JUMP(&l, write_buf)),
+ JEQ(STDERR_FILENO, JUMP(&l, write_buf)),
+ DENY,
+
+ LABEL(&l, write_buf),
+ ARG(1),
+ JEQ((unsigned long)msg1, JUMP(&l, msg1_len)),
+ JEQ((unsigned long)msg2, JUMP(&l, msg2_len)),
+ JEQ((unsigned long)buf, JUMP(&l, buf_len)),
+ DENY,
+
+ LABEL(&l, msg1_len),
+ ARG(2),
+ JLT(sizeof(msg1), ALLOW),
+ DENY,
+
+ LABEL(&l, msg2_len),
+ ARG(2),
+ JLT(sizeof(msg2), ALLOW),
+ DENY,
+
+ LABEL(&l, buf_len),
+ ARG(2),
+ JLT(sizeof(buf), ALLOW),
+ DENY,
+ };
+ struct sock_fprog prog = {
+ .filter = filter,
+ .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
+ };
+ ssize_t bytes;
+ bpf_resolve_jumps(&l, filter, sizeof(filter)/sizeof(*filter));
+
+ if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+ perror("prctl(NO_NEW_PRIVS)");
+ return 1;
+ }
+
+ if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
+ perror("prctl(SECCOMP)");
+ return 1;
+ }
+ syscall(__NR_write, STDOUT_FILENO, msg1, strlen(msg1));
+ bytes = syscall(__NR_read, STDIN_FILENO, buf, sizeof(buf)-1);
+ bytes = (bytes > 0 ? bytes : 0);
+ syscall(__NR_write, STDERR_FILENO, msg2, strlen(msg2));
+ syscall(__NR_write, STDERR_FILENO, buf, bytes);
+ /* Now get killed */
+ syscall(__NR_write, STDERR_FILENO, msg2, strlen(msg2)+2);
+ return 0;
+}
diff --git a/samples/seccomp/bpf-helper.c b/samples/seccomp/bpf-helper.c
new file mode 100644
index 000000000..ae260d77a
--- /dev/null
+++ b/samples/seccomp/bpf-helper.c
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Seccomp BPF helper functions
+ *
+ * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ * Author: Will Drewry <wad@chromium.org>
+ *
+ * The code may be used by anyone for any purpose,
+ * and can serve as a starting point for developing
+ * applications using prctl(PR_ATTACH_SECCOMP_FILTER).
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "bpf-helper.h"
+
+int bpf_resolve_jumps(struct bpf_labels *labels,
+ struct sock_filter *filter, size_t count)
+{
+ size_t i;
+
+ if (count < 1 || count > BPF_MAXINSNS)
+ return -1;
+ /*
+ * Walk it once, backwards, to build the label table and do fixups.
+ * Since backward jumps are disallowed by BPF, this is easy.
+ */
+ for (i = 0; i < count; ++i) {
+ size_t offset = count - i - 1;
+ struct sock_filter *instr = &filter[offset];
+ if (instr->code != (BPF_JMP+BPF_JA))
+ continue;
+ switch ((instr->jt<<8)|instr->jf) {
+ case (JUMP_JT<<8)|JUMP_JF:
+ if (labels->labels[instr->k].location == 0xffffffff) {
+ fprintf(stderr, "Unresolved label: '%s'\n",
+ labels->labels[instr->k].label);
+ return 1;
+ }
+ instr->k = labels->labels[instr->k].location -
+ (offset + 1);
+ instr->jt = 0;
+ instr->jf = 0;
+ continue;
+ case (LABEL_JT<<8)|LABEL_JF:
+ if (labels->labels[instr->k].location != 0xffffffff) {
+ fprintf(stderr, "Duplicate label use: '%s'\n",
+ labels->labels[instr->k].label);
+ return 1;
+ }
+ labels->labels[instr->k].location = offset;
+ instr->k = 0; /* fall through */
+ instr->jt = 0;
+ instr->jf = 0;
+ continue;
+ }
+ }
+ return 0;
+}
+
+/* Simple lookup table for labels. */
+__u32 seccomp_bpf_label(struct bpf_labels *labels, const char *label)
+{
+ struct __bpf_label *begin = labels->labels, *end;
+ int id;
+
+ if (labels->count == BPF_LABELS_MAX) {
+ fprintf(stderr, "Too many labels\n");
+ exit(1);
+ }
+ if (labels->count == 0) {
+ begin->label = label;
+ begin->location = 0xffffffff;
+ labels->count++;
+ return 0;
+ }
+ end = begin + labels->count;
+ for (id = 0; begin < end; ++begin, ++id) {
+ if (!strcmp(label, begin->label))
+ return id;
+ }
+ begin->label = label;
+ begin->location = 0xffffffff;
+ labels->count++;
+ return id;
+}
+
+void seccomp_bpf_print(struct sock_filter *filter, size_t count)
+{
+ struct sock_filter *end = filter + count;
+ for ( ; filter < end; ++filter)
+ printf("{ code=%u,jt=%u,jf=%u,k=%u },\n",
+ filter->code, filter->jt, filter->jf, filter->k);
+}
diff --git a/samples/seccomp/bpf-helper.h b/samples/seccomp/bpf-helper.h
new file mode 100644
index 000000000..0cc9816fe
--- /dev/null
+++ b/samples/seccomp/bpf-helper.h
@@ -0,0 +1,263 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Example wrapper around BPF macros.
+ *
+ * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ * Author: Will Drewry <wad@chromium.org>
+ *
+ * The code may be used by anyone for any purpose,
+ * and can serve as a starting point for developing
+ * applications using prctl(PR_SET_SECCOMP, 2, ...).
+ *
+ * No guarantees are provided with respect to the correctness
+ * or functionality of this code.
+ */
+#ifndef __BPF_HELPER_H__
+#define __BPF_HELPER_H__
+
+#include <asm/bitsperlong.h> /* for __BITS_PER_LONG */
+#include <endian.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h> /* for seccomp_data */
+#include <linux/types.h>
+#include <linux/unistd.h>
+#include <stddef.h>
+
+#define BPF_LABELS_MAX 256
+struct bpf_labels {
+ int count;
+ struct __bpf_label {
+ const char *label;
+ __u32 location;
+ } labels[BPF_LABELS_MAX];
+};
+
+int bpf_resolve_jumps(struct bpf_labels *labels,
+ struct sock_filter *filter, size_t count);
+__u32 seccomp_bpf_label(struct bpf_labels *labels, const char *label);
+void seccomp_bpf_print(struct sock_filter *filter, size_t count);
+
+#define JUMP_JT 0xff
+#define JUMP_JF 0xff
+#define LABEL_JT 0xfe
+#define LABEL_JF 0xfe
+
+#define ALLOW \
+ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)
+#define DENY \
+ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL)
+#define JUMP(labels, label) \
+ BPF_JUMP(BPF_JMP+BPF_JA, FIND_LABEL((labels), (label)), \
+ JUMP_JT, JUMP_JF)
+#define LABEL(labels, label) \
+ BPF_JUMP(BPF_JMP+BPF_JA, FIND_LABEL((labels), (label)), \
+ LABEL_JT, LABEL_JF)
+#define SYSCALL(nr, jt) \
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (nr), 0, 1), \
+ jt
+
+/* Lame, but just an example */
+#define FIND_LABEL(labels, label) seccomp_bpf_label((labels), #label)
+
+#define EXPAND(...) __VA_ARGS__
+
+/* Ensure that we load the logically correct offset. */
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define LO_ARG(idx) offsetof(struct seccomp_data, args[(idx)])
+#elif __BYTE_ORDER == __BIG_ENDIAN
+#define LO_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) + sizeof(__u32)
+#else
+#error "Unknown endianness"
+#endif
+
+/* Map all width-sensitive operations */
+#if __BITS_PER_LONG == 32
+
+#define JEQ(x, jt) JEQ32(x, EXPAND(jt))
+#define JNE(x, jt) JNE32(x, EXPAND(jt))
+#define JGT(x, jt) JGT32(x, EXPAND(jt))
+#define JLT(x, jt) JLT32(x, EXPAND(jt))
+#define JGE(x, jt) JGE32(x, EXPAND(jt))
+#define JLE(x, jt) JLE32(x, EXPAND(jt))
+#define JA(x, jt) JA32(x, EXPAND(jt))
+#define ARG(i) ARG_32(i)
+
+#elif __BITS_PER_LONG == 64
+
+/* Ensure that we load the logically correct offset. */
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define ENDIAN(_lo, _hi) _lo, _hi
+#define HI_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) + sizeof(__u32)
+#elif __BYTE_ORDER == __BIG_ENDIAN
+#define ENDIAN(_lo, _hi) _hi, _lo
+#define HI_ARG(idx) offsetof(struct seccomp_data, args[(idx)])
+#endif
+
+union arg64 {
+ struct {
+ __u32 ENDIAN(lo32, hi32);
+ };
+ __u64 u64;
+};
+
+#define JEQ(x, jt) \
+ JEQ64(((union arg64){.u64 = (x)}).lo32, \
+ ((union arg64){.u64 = (x)}).hi32, \
+ EXPAND(jt))
+#define JGT(x, jt) \
+ JGT64(((union arg64){.u64 = (x)}).lo32, \
+ ((union arg64){.u64 = (x)}).hi32, \
+ EXPAND(jt))
+#define JGE(x, jt) \
+ JGE64(((union arg64){.u64 = (x)}).lo32, \
+ ((union arg64){.u64 = (x)}).hi32, \
+ EXPAND(jt))
+#define JNE(x, jt) \
+ JNE64(((union arg64){.u64 = (x)}).lo32, \
+ ((union arg64){.u64 = (x)}).hi32, \
+ EXPAND(jt))
+#define JLT(x, jt) \
+ JLT64(((union arg64){.u64 = (x)}).lo32, \
+ ((union arg64){.u64 = (x)}).hi32, \
+ EXPAND(jt))
+#define JLE(x, jt) \
+ JLE64(((union arg64){.u64 = (x)}).lo32, \
+ ((union arg64){.u64 = (x)}).hi32, \
+ EXPAND(jt))
+
+#define JA(x, jt) \
+ JA64(((union arg64){.u64 = (x)}).lo32, \
+ ((union arg64){.u64 = (x)}).hi32, \
+ EXPAND(jt))
+#define ARG(i) ARG_64(i)
+
+#else
+#error __BITS_PER_LONG value unusable.
+#endif
+
+/* Loads the arg into A */
+#define ARG_32(idx) \
+ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, LO_ARG(idx))
+
+/* Loads lo into M[0] and hi into M[1] and A */
+#define ARG_64(idx) \
+ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, LO_ARG(idx)), \
+ BPF_STMT(BPF_ST, 0), /* lo -> M[0] */ \
+ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, HI_ARG(idx)), \
+ BPF_STMT(BPF_ST, 1) /* hi -> M[1] */
+
+#define JEQ32(value, jt) \
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (value), 0, 1), \
+ jt
+
+#define JNE32(value, jt) \
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (value), 1, 0), \
+ jt
+
+#define JA32(value, jt) \
+ BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (value), 0, 1), \
+ jt
+
+#define JGE32(value, jt) \
+ BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (value), 0, 1), \
+ jt
+
+#define JGT32(value, jt) \
+ BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (value), 0, 1), \
+ jt
+
+#define JLE32(value, jt) \
+ BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (value), 1, 0), \
+ jt
+
+#define JLT32(value, jt) \
+ BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (value), 1, 0), \
+ jt
+
+/*
+ * All the JXX64 checks assume lo is saved in M[0] and hi is saved in both
+ * A and M[1]. This invariant is kept by restoring A if necessary.
+ */
+#define JEQ64(lo, hi, jt) \
+ /* if (hi != arg.hi) goto NOMATCH; */ \
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
+ BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+ /* if (lo != arg.lo) goto NOMATCH; */ \
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (lo), 0, 2), \
+ BPF_STMT(BPF_LD+BPF_MEM, 1), \
+ jt, \
+ BPF_STMT(BPF_LD+BPF_MEM, 1)
+
+#define JNE64(lo, hi, jt) \
+ /* if (hi != arg.hi) goto MATCH; */ \
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 3), \
+ BPF_STMT(BPF_LD+BPF_MEM, 0), \
+ /* if (lo != arg.lo) goto MATCH; */ \
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (lo), 2, 0), \
+ BPF_STMT(BPF_LD+BPF_MEM, 1), \
+ jt, \
+ BPF_STMT(BPF_LD+BPF_MEM, 1)
+
+#define JA64(lo, hi, jt) \
+ /* if (hi & arg.hi) goto MATCH; */ \
+ BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (hi), 3, 0), \
+ BPF_STMT(BPF_LD+BPF_MEM, 0), \
+ /* if (lo & arg.lo) goto MATCH; */ \
+ BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (lo), 0, 2), \
+ BPF_STMT(BPF_LD+BPF_MEM, 1), \
+ jt, \
+ BPF_STMT(BPF_LD+BPF_MEM, 1)
+
+#define JGE64(lo, hi, jt) \
+ /* if (hi > arg.hi) goto MATCH; */ \
+ BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (hi), 4, 0), \
+ /* if (hi != arg.hi) goto NOMATCH; */ \
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
+ BPF_STMT(BPF_LD+BPF_MEM, 0), \
+ /* if (lo >= arg.lo) goto MATCH; */ \
+ BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (lo), 0, 2), \
+ BPF_STMT(BPF_LD+BPF_MEM, 1), \
+ jt, \
+ BPF_STMT(BPF_LD+BPF_MEM, 1)
+
+#define JGT64(lo, hi, jt) \
+ /* if (hi > arg.hi) goto MATCH; */ \
+ BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (hi), 4, 0), \
+ /* if (hi != arg.hi) goto NOMATCH; */ \
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
+ BPF_STMT(BPF_LD+BPF_MEM, 0), \
+ /* if (lo > arg.lo) goto MATCH; */ \
+ BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (lo), 0, 2), \
+ BPF_STMT(BPF_LD+BPF_MEM, 1), \
+ jt, \
+ BPF_STMT(BPF_LD+BPF_MEM, 1)
+
+#define JLE64(lo, hi, jt) \
+ /* if (hi < arg.hi) goto MATCH; */ \
+ BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (hi), 0, 4), \
+ /* if (hi != arg.hi) goto NOMATCH; */ \
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
+ BPF_STMT(BPF_LD+BPF_MEM, 0), \
+ /* if (lo <= arg.lo) goto MATCH; */ \
+ BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (lo), 2, 0), \
+ BPF_STMT(BPF_LD+BPF_MEM, 1), \
+ jt, \
+ BPF_STMT(BPF_LD+BPF_MEM, 1)
+
+#define JLT64(lo, hi, jt) \
+ /* if (hi < arg.hi) goto MATCH; */ \
+ BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (hi), 0, 4), \
+ /* if (hi != arg.hi) goto NOMATCH; */ \
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
+ BPF_STMT(BPF_LD+BPF_MEM, 0), \
+ /* if (lo < arg.lo) goto MATCH; */ \
+ BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (lo), 2, 0), \
+ BPF_STMT(BPF_LD+BPF_MEM, 1), \
+ jt, \
+ BPF_STMT(BPF_LD+BPF_MEM, 1)
+
+#define LOAD_SYSCALL_NR \
+ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, \
+ offsetof(struct seccomp_data, nr))
+
+#endif /* __BPF_HELPER_H__ */
diff --git a/samples/seccomp/dropper.c b/samples/seccomp/dropper.c
new file mode 100644
index 000000000..cc0648eb3
--- /dev/null
+++ b/samples/seccomp/dropper.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Naive system call dropper built on seccomp_filter.
+ *
+ * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ * Author: Will Drewry <wad@chromium.org>
+ *
+ * The code may be used by anyone for any purpose,
+ * and can serve as a starting point for developing
+ * applications using prctl(PR_SET_SECCOMP, 2, ...).
+ *
+ * When run, returns the specified errno for the specified
+ * system call number against the given architecture.
+ *
+ */
+
+#include <errno.h>
+#include <linux/audit.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <linux/unistd.h>
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <sys/prctl.h>
+#include <unistd.h>
+
+static int install_filter(int nr, int arch, int error)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
+ (offsetof(struct seccomp_data, arch))),
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, arch, 0, 3),
+ BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
+ (offsetof(struct seccomp_data, nr))),
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1),
+ BPF_STMT(BPF_RET+BPF_K,
+ SECCOMP_RET_ERRNO|(error & SECCOMP_RET_DATA)),
+ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
+ .filter = filter,
+ };
+ if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+ perror("prctl(NO_NEW_PRIVS)");
+ return 1;
+ }
+ if (prctl(PR_SET_SECCOMP, 2, &prog)) {
+ perror("prctl(PR_SET_SECCOMP)");
+ return 1;
+ }
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ if (argc < 5) {
+ fprintf(stderr, "Usage:\n"
+ "dropper <syscall_nr> <arch> <errno> <prog> [<args>]\n"
+ "Hint: AUDIT_ARCH_I386: 0x%X\n"
+ " AUDIT_ARCH_X86_64: 0x%X\n"
+ "\n", AUDIT_ARCH_I386, AUDIT_ARCH_X86_64);
+ return 1;
+ }
+ if (install_filter(strtol(argv[1], NULL, 0), strtol(argv[2], NULL, 0),
+ strtol(argv[3], NULL, 0)))
+ return 1;
+ execv(argv[4], &argv[4]);
+ printf("Failed to execv\n");
+ return 255;
+}
diff --git a/samples/statx/Makefile b/samples/statx/Makefile
new file mode 100644
index 000000000..59df7c25a
--- /dev/null
+++ b/samples/statx/Makefile
@@ -0,0 +1,7 @@
+# List of programs to build
+hostprogs-$(CONFIG_SAMPLE_STATX) := test-statx
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
+
+HOSTCFLAGS_test-statx.o += -I$(objtree)/usr/include
diff --git a/samples/statx/test-statx.c b/samples/statx/test-statx.c
new file mode 100644
index 000000000..d4d77b094
--- /dev/null
+++ b/samples/statx/test-statx.c
@@ -0,0 +1,258 @@
+/* Test the statx() system call.
+ *
+ * Note that the output of this program is intended to look like the output of
+ * /bin/stat where possible.
+ *
+ * Copyright (C) 2015 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define _GNU_SOURCE
+#define _ATFILE_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <errno.h>
+#include <time.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <linux/stat.h>
+#include <linux/fcntl.h>
+#include <sys/stat.h>
+
+#define AT_STATX_SYNC_TYPE 0x6000
+#define AT_STATX_SYNC_AS_STAT 0x0000
+#define AT_STATX_FORCE_SYNC 0x2000
+#define AT_STATX_DONT_SYNC 0x4000
+
+static __attribute__((unused))
+ssize_t statx(int dfd, const char *filename, unsigned flags,
+ unsigned int mask, struct statx *buffer)
+{
+ return syscall(__NR_statx, dfd, filename, flags, mask, buffer);
+}
+
+static void print_time(const char *field, struct statx_timestamp *ts)
+{
+ struct tm tm;
+ time_t tim;
+ char buffer[100];
+ int len;
+
+ tim = ts->tv_sec;
+ if (!localtime_r(&tim, &tm)) {
+ perror("localtime_r");
+ exit(1);
+ }
+ len = strftime(buffer, 100, "%F %T", &tm);
+ if (len == 0) {
+ perror("strftime");
+ exit(1);
+ }
+ printf("%s", field);
+ fwrite(buffer, 1, len, stdout);
+ printf(".%09u", ts->tv_nsec);
+ len = strftime(buffer, 100, "%z", &tm);
+ if (len == 0) {
+ perror("strftime2");
+ exit(1);
+ }
+ fwrite(buffer, 1, len, stdout);
+ printf("\n");
+}
+
+static void dump_statx(struct statx *stx)
+{
+ char buffer[256], ft = '?';
+
+ printf("results=%x\n", stx->stx_mask);
+
+ printf(" ");
+ if (stx->stx_mask & STATX_SIZE)
+ printf(" Size: %-15llu", (unsigned long long)stx->stx_size);
+ if (stx->stx_mask & STATX_BLOCKS)
+ printf(" Blocks: %-10llu", (unsigned long long)stx->stx_blocks);
+ printf(" IO Block: %-6llu", (unsigned long long)stx->stx_blksize);
+ if (stx->stx_mask & STATX_TYPE) {
+ switch (stx->stx_mode & S_IFMT) {
+ case S_IFIFO: printf(" FIFO\n"); ft = 'p'; break;
+ case S_IFCHR: printf(" character special file\n"); ft = 'c'; break;
+ case S_IFDIR: printf(" directory\n"); ft = 'd'; break;
+ case S_IFBLK: printf(" block special file\n"); ft = 'b'; break;
+ case S_IFREG: printf(" regular file\n"); ft = '-'; break;
+ case S_IFLNK: printf(" symbolic link\n"); ft = 'l'; break;
+ case S_IFSOCK: printf(" socket\n"); ft = 's'; break;
+ default:
+ printf(" unknown type (%o)\n", stx->stx_mode & S_IFMT);
+ break;
+ }
+ } else {
+ printf(" no type\n");
+ }
+
+ sprintf(buffer, "%02x:%02x", stx->stx_dev_major, stx->stx_dev_minor);
+ printf("Device: %-15s", buffer);
+ if (stx->stx_mask & STATX_INO)
+ printf(" Inode: %-11llu", (unsigned long long) stx->stx_ino);
+ if (stx->stx_mask & STATX_NLINK)
+ printf(" Links: %-5u", stx->stx_nlink);
+ if (stx->stx_mask & STATX_TYPE) {
+ switch (stx->stx_mode & S_IFMT) {
+ case S_IFBLK:
+ case S_IFCHR:
+ printf(" Device type: %u,%u",
+ stx->stx_rdev_major, stx->stx_rdev_minor);
+ break;
+ }
+ }
+ printf("\n");
+
+ if (stx->stx_mask & STATX_MODE)
+ printf("Access: (%04o/%c%c%c%c%c%c%c%c%c%c) ",
+ stx->stx_mode & 07777,
+ ft,
+ stx->stx_mode & S_IRUSR ? 'r' : '-',
+ stx->stx_mode & S_IWUSR ? 'w' : '-',
+ stx->stx_mode & S_IXUSR ? 'x' : '-',
+ stx->stx_mode & S_IRGRP ? 'r' : '-',
+ stx->stx_mode & S_IWGRP ? 'w' : '-',
+ stx->stx_mode & S_IXGRP ? 'x' : '-',
+ stx->stx_mode & S_IROTH ? 'r' : '-',
+ stx->stx_mode & S_IWOTH ? 'w' : '-',
+ stx->stx_mode & S_IXOTH ? 'x' : '-');
+ if (stx->stx_mask & STATX_UID)
+ printf("Uid: %5d ", stx->stx_uid);
+ if (stx->stx_mask & STATX_GID)
+ printf("Gid: %5d\n", stx->stx_gid);
+
+ if (stx->stx_mask & STATX_ATIME)
+ print_time("Access: ", &stx->stx_atime);
+ if (stx->stx_mask & STATX_MTIME)
+ print_time("Modify: ", &stx->stx_mtime);
+ if (stx->stx_mask & STATX_CTIME)
+ print_time("Change: ", &stx->stx_ctime);
+ if (stx->stx_mask & STATX_BTIME)
+ print_time(" Birth: ", &stx->stx_btime);
+
+ if (stx->stx_attributes_mask) {
+ unsigned char bits, mbits;
+ int loop, byte;
+
+ static char attr_representation[64 + 1] =
+ /* STATX_ATTR_ flags: */
+ "????????" /* 63-56 */
+ "????????" /* 55-48 */
+ "????????" /* 47-40 */
+ "????????" /* 39-32 */
+ "????????" /* 31-24 0x00000000-ff000000 */
+ "????????" /* 23-16 0x00000000-00ff0000 */
+ "???me???" /* 15- 8 0x00000000-0000ff00 */
+ "?dai?c??" /* 7- 0 0x00000000-000000ff */
+ ;
+
+ printf("Attributes: %016llx (", stx->stx_attributes);
+ for (byte = 64 - 8; byte >= 0; byte -= 8) {
+ bits = stx->stx_attributes >> byte;
+ mbits = stx->stx_attributes_mask >> byte;
+ for (loop = 7; loop >= 0; loop--) {
+ int bit = byte + loop;
+
+ if (!(mbits & 0x80))
+ putchar('.'); /* Not supported */
+ else if (bits & 0x80)
+ putchar(attr_representation[63 - bit]);
+ else
+ putchar('-'); /* Not set */
+ bits <<= 1;
+ mbits <<= 1;
+ }
+ if (byte)
+ putchar(' ');
+ }
+ printf(")\n");
+ }
+}
+
+static void dump_hex(unsigned long long *data, int from, int to)
+{
+ unsigned offset, print_offset = 1, col = 0;
+
+ from /= 8;
+ to = (to + 7) / 8;
+
+ for (offset = from; offset < to; offset++) {
+ if (print_offset) {
+ printf("%04x: ", offset * 8);
+ print_offset = 0;
+ }
+ printf("%016llx", data[offset]);
+ col++;
+ if ((col & 3) == 0) {
+ printf("\n");
+ print_offset = 1;
+ } else {
+ printf(" ");
+ }
+ }
+
+ if (!print_offset)
+ printf("\n");
+}
+
+int main(int argc, char **argv)
+{
+ struct statx stx;
+ int ret, raw = 0, atflag = AT_SYMLINK_NOFOLLOW;
+
+ unsigned int mask = STATX_ALL;
+
+ for (argv++; *argv; argv++) {
+ if (strcmp(*argv, "-F") == 0) {
+ atflag &= ~AT_STATX_SYNC_TYPE;
+ atflag |= AT_STATX_FORCE_SYNC;
+ continue;
+ }
+ if (strcmp(*argv, "-D") == 0) {
+ atflag &= ~AT_STATX_SYNC_TYPE;
+ atflag |= AT_STATX_DONT_SYNC;
+ continue;
+ }
+ if (strcmp(*argv, "-L") == 0) {
+ atflag &= ~AT_SYMLINK_NOFOLLOW;
+ continue;
+ }
+ if (strcmp(*argv, "-O") == 0) {
+ mask &= ~STATX_BASIC_STATS;
+ continue;
+ }
+ if (strcmp(*argv, "-A") == 0) {
+ atflag |= AT_NO_AUTOMOUNT;
+ continue;
+ }
+ if (strcmp(*argv, "-R") == 0) {
+ raw = 1;
+ continue;
+ }
+
+ memset(&stx, 0xbf, sizeof(stx));
+ ret = statx(AT_FDCWD, *argv, atflag, mask, &stx);
+ printf("statx(%s) = %d\n", *argv, ret);
+ if (ret < 0) {
+ perror(*argv);
+ exit(1);
+ }
+
+ if (raw)
+ dump_hex((unsigned long long *)&stx, 0, sizeof(stx));
+
+ dump_statx(&stx);
+ }
+ return 0;
+}
diff --git a/samples/timers/.gitignore b/samples/timers/.gitignore
new file mode 100644
index 000000000..c5c45d7ec
--- /dev/null
+++ b/samples/timers/.gitignore
@@ -0,0 +1 @@
+hpet_example
diff --git a/samples/timers/Makefile b/samples/timers/Makefile
new file mode 100644
index 000000000..f9fa07460
--- /dev/null
+++ b/samples/timers/Makefile
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0
+ifndef CROSS_COMPILE
+uname_M := $(shell uname -m 2>/dev/null || echo not)
+ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/)
+
+ifeq ($(ARCH),x86)
+CC := $(CROSS_COMPILE)gcc
+PROGS := hpet_example
+
+all: $(PROGS)
+
+clean:
+ rm -fr $(PROGS)
+
+endif
+endif
diff --git a/samples/timers/hpet_example.c b/samples/timers/hpet_example.c
new file mode 100644
index 000000000..f1cb622f6
--- /dev/null
+++ b/samples/timers/hpet_example.c
@@ -0,0 +1,295 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <memory.h>
+#include <malloc.h>
+#include <time.h>
+#include <ctype.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <signal.h>
+#include <errno.h>
+#include <sys/time.h>
+#include <linux/hpet.h>
+
+
+extern void hpet_open_close(int, const char **);
+extern void hpet_info(int, const char **);
+extern void hpet_poll(int, const char **);
+extern void hpet_fasync(int, const char **);
+extern void hpet_read(int, const char **);
+
+#include <sys/poll.h>
+#include <sys/ioctl.h>
+
+struct hpet_command {
+ char *command;
+ void (*func)(int argc, const char ** argv);
+} hpet_command[] = {
+ {
+ "open-close",
+ hpet_open_close
+ },
+ {
+ "info",
+ hpet_info
+ },
+ {
+ "poll",
+ hpet_poll
+ },
+ {
+ "fasync",
+ hpet_fasync
+ },
+};
+
+int
+main(int argc, const char ** argv)
+{
+ unsigned int i;
+
+ argc--;
+ argv++;
+
+ if (!argc) {
+ fprintf(stderr, "-hpet: requires command\n");
+ return -1;
+ }
+
+
+ for (i = 0; i < (sizeof (hpet_command) / sizeof (hpet_command[0])); i++)
+ if (!strcmp(argv[0], hpet_command[i].command)) {
+ argc--;
+ argv++;
+ fprintf(stderr, "-hpet: executing %s\n",
+ hpet_command[i].command);
+ hpet_command[i].func(argc, argv);
+ return 0;
+ }
+
+ fprintf(stderr, "do_hpet: command %s not implemented\n", argv[0]);
+
+ return -1;
+}
+
+void
+hpet_open_close(int argc, const char **argv)
+{
+ int fd;
+
+ if (argc != 1) {
+ fprintf(stderr, "hpet_open_close: device-name\n");
+ return;
+ }
+
+ fd = open(argv[0], O_RDONLY);
+ if (fd < 0)
+ fprintf(stderr, "hpet_open_close: open failed\n");
+ else
+ close(fd);
+
+ return;
+}
+
+void
+hpet_info(int argc, const char **argv)
+{
+ struct hpet_info info;
+ int fd;
+
+ if (argc != 1) {
+ fprintf(stderr, "hpet_info: device-name\n");
+ return;
+ }
+
+ fd = open(argv[0], O_RDONLY);
+ if (fd < 0) {
+ fprintf(stderr, "hpet_info: open of %s failed\n", argv[0]);
+ return;
+ }
+
+ if (ioctl(fd, HPET_INFO, &info) < 0) {
+ fprintf(stderr, "hpet_info: failed to get info\n");
+ goto out;
+ }
+
+ fprintf(stderr, "hpet_info: hi_irqfreq 0x%lx hi_flags 0x%lx ",
+ info.hi_ireqfreq, info.hi_flags);
+ fprintf(stderr, "hi_hpet %d hi_timer %d\n",
+ info.hi_hpet, info.hi_timer);
+
+out:
+ close(fd);
+ return;
+}
+
+void
+hpet_poll(int argc, const char **argv)
+{
+ unsigned long freq;
+ int iterations, i, fd;
+ struct pollfd pfd;
+ struct hpet_info info;
+ struct timeval stv, etv;
+ struct timezone tz;
+ long usec;
+
+ if (argc != 3) {
+ fprintf(stderr, "hpet_poll: device-name freq iterations\n");
+ return;
+ }
+
+ freq = atoi(argv[1]);
+ iterations = atoi(argv[2]);
+
+ fd = open(argv[0], O_RDONLY);
+
+ if (fd < 0) {
+ fprintf(stderr, "hpet_poll: open of %s failed\n", argv[0]);
+ return;
+ }
+
+ if (ioctl(fd, HPET_IRQFREQ, freq) < 0) {
+ fprintf(stderr, "hpet_poll: HPET_IRQFREQ failed\n");
+ goto out;
+ }
+
+ if (ioctl(fd, HPET_INFO, &info) < 0) {
+ fprintf(stderr, "hpet_poll: failed to get info\n");
+ goto out;
+ }
+
+ fprintf(stderr, "hpet_poll: info.hi_flags 0x%lx\n", info.hi_flags);
+
+ if (info.hi_flags && (ioctl(fd, HPET_EPI, 0) < 0)) {
+ fprintf(stderr, "hpet_poll: HPET_EPI failed\n");
+ goto out;
+ }
+
+ if (ioctl(fd, HPET_IE_ON, 0) < 0) {
+ fprintf(stderr, "hpet_poll, HPET_IE_ON failed\n");
+ goto out;
+ }
+
+ pfd.fd = fd;
+ pfd.events = POLLIN;
+
+ for (i = 0; i < iterations; i++) {
+ pfd.revents = 0;
+ gettimeofday(&stv, &tz);
+ if (poll(&pfd, 1, -1) < 0)
+ fprintf(stderr, "hpet_poll: poll failed\n");
+ else {
+ long data;
+
+ gettimeofday(&etv, &tz);
+ usec = stv.tv_sec * 1000000 + stv.tv_usec;
+ usec = (etv.tv_sec * 1000000 + etv.tv_usec) - usec;
+
+ fprintf(stderr,
+ "hpet_poll: expired time = 0x%lx\n", usec);
+
+ fprintf(stderr, "hpet_poll: revents = 0x%x\n",
+ pfd.revents);
+
+ if (read(fd, &data, sizeof(data)) != sizeof(data)) {
+ fprintf(stderr, "hpet_poll: read failed\n");
+ }
+ else
+ fprintf(stderr, "hpet_poll: data 0x%lx\n",
+ data);
+ }
+ }
+
+out:
+ close(fd);
+ return;
+}
+
+static int hpet_sigio_count;
+
+static void
+hpet_sigio(int val)
+{
+ fprintf(stderr, "hpet_sigio: called\n");
+ hpet_sigio_count++;
+}
+
+void
+hpet_fasync(int argc, const char **argv)
+{
+ unsigned long freq;
+ int iterations, i, fd, value;
+ sig_t oldsig;
+ struct hpet_info info;
+
+ hpet_sigio_count = 0;
+ fd = -1;
+
+ if ((oldsig = signal(SIGIO, hpet_sigio)) == SIG_ERR) {
+ fprintf(stderr, "hpet_fasync: failed to set signal handler\n");
+ return;
+ }
+
+ if (argc != 3) {
+ fprintf(stderr, "hpet_fasync: device-name freq iterations\n");
+ goto out;
+ }
+
+ fd = open(argv[0], O_RDONLY);
+
+ if (fd < 0) {
+ fprintf(stderr, "hpet_fasync: failed to open %s\n", argv[0]);
+ return;
+ }
+
+
+ if ((fcntl(fd, F_SETOWN, getpid()) == 1) ||
+ ((value = fcntl(fd, F_GETFL)) == 1) ||
+ (fcntl(fd, F_SETFL, value | O_ASYNC) == 1)) {
+ fprintf(stderr, "hpet_fasync: fcntl failed\n");
+ goto out;
+ }
+
+ freq = atoi(argv[1]);
+ iterations = atoi(argv[2]);
+
+ if (ioctl(fd, HPET_IRQFREQ, freq) < 0) {
+ fprintf(stderr, "hpet_fasync: HPET_IRQFREQ failed\n");
+ goto out;
+ }
+
+ if (ioctl(fd, HPET_INFO, &info) < 0) {
+ fprintf(stderr, "hpet_fasync: failed to get info\n");
+ goto out;
+ }
+
+ fprintf(stderr, "hpet_fasync: info.hi_flags 0x%lx\n", info.hi_flags);
+
+ if (info.hi_flags && (ioctl(fd, HPET_EPI, 0) < 0)) {
+ fprintf(stderr, "hpet_fasync: HPET_EPI failed\n");
+ goto out;
+ }
+
+ if (ioctl(fd, HPET_IE_ON, 0) < 0) {
+ fprintf(stderr, "hpet_fasync, HPET_IE_ON failed\n");
+ goto out;
+ }
+
+ for (i = 0; i < iterations; i++) {
+ (void) pause();
+ fprintf(stderr, "hpet_fasync: count = %d\n", hpet_sigio_count);
+ }
+
+out:
+ signal(SIGIO, oldsig);
+
+ if (fd >= 0)
+ close(fd);
+
+ return;
+}
diff --git a/samples/trace_events/Makefile b/samples/trace_events/Makefile
new file mode 100644
index 000000000..0f8d92120
--- /dev/null
+++ b/samples/trace_events/Makefile
@@ -0,0 +1,14 @@
+# builds the trace events example kernel modules;
+# then to use one (as root): insmod <module_name.ko>
+
+# If you include a trace header outside of include/trace/events
+# then the file that does the #define CREATE_TRACE_POINTS must
+# have that tracer file in its main search path. This is because
+# define_trace.h will include it, and must be able to find it from
+# the include/trace directory.
+#
+# Here trace-events-sample.c does the CREATE_TRACE_POINTS.
+#
+CFLAGS_trace-events-sample.o := -I$(src)
+
+obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace-events-sample.o
diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c
new file mode 100644
index 000000000..552269210
--- /dev/null
+++ b/samples/trace_events/trace-events-sample.c
@@ -0,0 +1,139 @@
+#include <linux/module.h>
+#include <linux/kthread.h>
+
+/*
+ * Any file that uses trace points, must include the header.
+ * But only one file, must include the header by defining
+ * CREATE_TRACE_POINTS first. This will make the C code that
+ * creates the handles for the trace points.
+ */
+#define CREATE_TRACE_POINTS
+#include "trace-events-sample.h"
+
+static const char *random_strings[] = {
+ "Mother Goose",
+ "Snoopy",
+ "Gandalf",
+ "Frodo",
+ "One ring to rule them all"
+};
+
+static void simple_thread_func(int cnt)
+{
+ int array[6];
+ int len = cnt % 5;
+ int i;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(HZ);
+
+ for (i = 0; i < len; i++)
+ array[i] = i + 1;
+ array[i] = 0;
+
+ /* Silly tracepoints */
+ trace_foo_bar("hello", cnt, array, random_strings[len],
+ &current->cpus_allowed);
+
+ trace_foo_with_template_simple("HELLO", cnt);
+
+ trace_foo_bar_with_cond("Some times print", cnt);
+
+ trace_foo_with_template_cond("prints other times", cnt);
+
+ trace_foo_with_template_print("I have to be different", cnt);
+}
+
+static int simple_thread(void *arg)
+{
+ int cnt = 0;
+
+ while (!kthread_should_stop())
+ simple_thread_func(cnt++);
+
+ return 0;
+}
+
+static struct task_struct *simple_tsk;
+static struct task_struct *simple_tsk_fn;
+
+static void simple_thread_func_fn(int cnt)
+{
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(HZ);
+
+ /* More silly tracepoints */
+ trace_foo_bar_with_fn("Look at me", cnt);
+ trace_foo_with_template_fn("Look at me too", cnt);
+}
+
+static int simple_thread_fn(void *arg)
+{
+ int cnt = 0;
+
+ while (!kthread_should_stop())
+ simple_thread_func_fn(cnt++);
+
+ return 0;
+}
+
+static DEFINE_MUTEX(thread_mutex);
+static int simple_thread_cnt;
+
+int foo_bar_reg(void)
+{
+ mutex_lock(&thread_mutex);
+ if (simple_thread_cnt++)
+ goto out;
+
+ pr_info("Starting thread for foo_bar_fn\n");
+ /*
+ * We shouldn't be able to start a trace when the module is
+ * unloading (there's other locks to prevent that). But
+ * for consistency sake, we still take the thread_mutex.
+ */
+ simple_tsk_fn = kthread_run(simple_thread_fn, NULL, "event-sample-fn");
+ out:
+ mutex_unlock(&thread_mutex);
+ return 0;
+}
+
+void foo_bar_unreg(void)
+{
+ mutex_lock(&thread_mutex);
+ if (--simple_thread_cnt)
+ goto out;
+
+ pr_info("Killing thread for foo_bar_fn\n");
+ if (simple_tsk_fn)
+ kthread_stop(simple_tsk_fn);
+ simple_tsk_fn = NULL;
+ out:
+ mutex_unlock(&thread_mutex);
+}
+
+static int __init trace_event_init(void)
+{
+ simple_tsk = kthread_run(simple_thread, NULL, "event-sample");
+ if (IS_ERR(simple_tsk))
+ return -1;
+
+ return 0;
+}
+
+static void __exit trace_event_exit(void)
+{
+ kthread_stop(simple_tsk);
+ mutex_lock(&thread_mutex);
+ if (simple_tsk_fn)
+ kthread_stop(simple_tsk_fn);
+ simple_tsk_fn = NULL;
+ mutex_unlock(&thread_mutex);
+}
+
+module_init(trace_event_init);
+module_exit(trace_event_exit);
+
+MODULE_AUTHOR("Steven Rostedt");
+MODULE_DESCRIPTION("trace-events-sample");
+MODULE_LICENSE("GPL");
diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h
new file mode 100644
index 000000000..80b4a7031
--- /dev/null
+++ b/samples/trace_events/trace-events-sample.h
@@ -0,0 +1,524 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * If TRACE_SYSTEM is defined, that will be the directory created
+ * in the ftrace directory under /sys/kernel/tracing/events/<system>
+ *
+ * The define_trace.h below will also look for a file name of
+ * TRACE_SYSTEM.h where TRACE_SYSTEM is what is defined here.
+ * In this case, it would look for sample-trace.h
+ *
+ * If the header name will be different than the system name
+ * (as in this case), then you can override the header name that
+ * define_trace.h will look up by defining TRACE_INCLUDE_FILE
+ *
+ * This file is called trace-events-sample.h but we want the system
+ * to be called "sample-trace". Therefore we must define the name of this
+ * file:
+ *
+ * #define TRACE_INCLUDE_FILE trace-events-sample
+ *
+ * As we do an the bottom of this file.
+ *
+ * Notice that TRACE_SYSTEM should be defined outside of #if
+ * protection, just like TRACE_INCLUDE_FILE.
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM sample-trace
+
+/*
+ * TRACE_SYSTEM is expected to be a C valid variable (alpha-numeric
+ * and underscore), although it may start with numbers. If for some
+ * reason it is not, you need to add the following lines:
+ */
+#undef TRACE_SYSTEM_VAR
+#define TRACE_SYSTEM_VAR sample_trace
+/*
+ * But the above is only needed if TRACE_SYSTEM is not alpha-numeric
+ * and underscored. By default, TRACE_SYSTEM_VAR will be equal to
+ * TRACE_SYSTEM. As TRACE_SYSTEM_VAR must be alpha-numeric, if
+ * TRACE_SYSTEM is not, then TRACE_SYSTEM_VAR must be defined with
+ * only alpha-numeric and underscores.
+ *
+ * The TRACE_SYSTEM_VAR is only used internally and not visible to
+ * user space.
+ */
+
+/*
+ * Notice that this file is not protected like a normal header.
+ * We also must allow for rereading of this file. The
+ *
+ * || defined(TRACE_HEADER_MULTI_READ)
+ *
+ * serves this purpose.
+ */
+#if !defined(_TRACE_EVENT_SAMPLE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_EVENT_SAMPLE_H
+
+/*
+ * All trace headers should include tracepoint.h, until we finally
+ * make it into a standard header.
+ */
+#include <linux/tracepoint.h>
+
+/*
+ * The TRACE_EVENT macro is broken up into 5 parts.
+ *
+ * name: name of the trace point. This is also how to enable the tracepoint.
+ * A function called trace_foo_bar() will be created.
+ *
+ * proto: the prototype of the function trace_foo_bar()
+ * Here it is trace_foo_bar(char *foo, int bar).
+ *
+ * args: must match the arguments in the prototype.
+ * Here it is simply "foo, bar".
+ *
+ * struct: This defines the way the data will be stored in the ring buffer.
+ * The items declared here become part of a special structure
+ * called "__entry", which can be used in the fast_assign part of the
+ * TRACE_EVENT macro.
+ *
+ * Here are the currently defined types you can use:
+ *
+ * __field : Is broken up into type and name. Where type can be any
+ * primitive type (integer, long or pointer).
+ *
+ * __field(int, foo)
+ *
+ * __entry->foo = 5;
+ *
+ * __field_struct : This can be any static complex data type (struct, union
+ * but not an array). Be careful using complex types, as each
+ * event is limited in size, and copying large amounts of data
+ * into the ring buffer can slow things down.
+ *
+ * __field_struct(struct bar, foo)
+ *
+ * __entry->bar.x = y;
+
+ * __array: There are three fields (type, name, size). The type is the
+ * type of elements in the array, the name is the name of the array.
+ * size is the number of items in the array (not the total size).
+ *
+ * __array( char, foo, 10) is the same as saying: char foo[10];
+ *
+ * Assigning arrays can be done like any array:
+ *
+ * __entry->foo[0] = 'a';
+ *
+ * memcpy(__entry->foo, bar, 10);
+ *
+ * __dynamic_array: This is similar to array, but can vary its size from
+ * instance to instance of the tracepoint being called.
+ * Like __array, this too has three elements (type, name, size);
+ * type is the type of the element, name is the name of the array.
+ * The size is different than __array. It is not a static number,
+ * but the algorithm to figure out the length of the array for the
+ * specific instance of tracepoint. Again, size is the number of
+ * items in the array, not the total length in bytes.
+ *
+ * __dynamic_array( int, foo, bar) is similar to: int foo[bar];
+ *
+ * Note, unlike arrays, you must use the __get_dynamic_array() macro
+ * to access the array.
+ *
+ * memcpy(__get_dynamic_array(foo), bar, 10);
+ *
+ * Notice, that "__entry" is not needed here.
+ *
+ * __string: This is a special kind of __dynamic_array. It expects to
+ * have a null terminated character array passed to it (it allows
+ * for NULL too, which would be converted into "(null)"). __string
+ * takes two parameter (name, src), where name is the name of
+ * the string saved, and src is the string to copy into the
+ * ring buffer.
+ *
+ * __string(foo, bar) is similar to: strcpy(foo, bar)
+ *
+ * To assign a string, use the helper macro __assign_str().
+ *
+ * __assign_str(foo, bar);
+ *
+ * In most cases, the __assign_str() macro will take the same
+ * parameters as the __string() macro had to declare the string.
+ *
+ * __bitmask: This is another kind of __dynamic_array, but it expects
+ * an array of longs, and the number of bits to parse. It takes
+ * two parameters (name, nr_bits), where name is the name of the
+ * bitmask to save, and the nr_bits is the number of bits to record.
+ *
+ * __bitmask(target_cpu, nr_cpumask_bits)
+ *
+ * To assign a bitmask, use the __assign_bitmask() helper macro.
+ *
+ * __assign_bitmask(target_cpus, cpumask_bits(bar), nr_cpumask_bits);
+ *
+ *
+ * fast_assign: This is a C like function that is used to store the items
+ * into the ring buffer. A special variable called "__entry" will be the
+ * structure that points into the ring buffer and has the same fields as
+ * described by the struct part of TRACE_EVENT above.
+ *
+ * printk: This is a way to print out the data in pretty print. This is
+ * useful if the system crashes and you are logging via a serial line,
+ * the data can be printed to the console using this "printk" method.
+ * This is also used to print out the data from the trace files.
+ * Again, the __entry macro is used to access the data from the ring buffer.
+ *
+ * Note, __dynamic_array, __string, and __bitmask require special helpers
+ * to access the data.
+ *
+ * For __dynamic_array(int, foo, bar) use __get_dynamic_array(foo)
+ * Use __get_dynamic_array_len(foo) to get the length of the array
+ * saved. Note, __get_dynamic_array_len() returns the total allocated
+ * length of the dynamic array; __print_array() expects the second
+ * parameter to be the number of elements. To get that, the array length
+ * needs to be divided by the element size.
+ *
+ * For __string(foo, bar) use __get_str(foo)
+ *
+ * For __bitmask(target_cpus, nr_cpumask_bits) use __get_bitmask(target_cpus)
+ *
+ *
+ * Note, that for both the assign and the printk, __entry is the handler
+ * to the data structure in the ring buffer, and is defined by the
+ * TP_STRUCT__entry.
+ */
+
+/*
+ * It is OK to have helper functions in the file, but they need to be protected
+ * from being defined more than once. Remember, this file gets included more
+ * than once.
+ */
+#ifndef __TRACE_EVENT_SAMPLE_HELPER_FUNCTIONS
+#define __TRACE_EVENT_SAMPLE_HELPER_FUNCTIONS
+static inline int __length_of(const int *list)
+{
+ int i;
+
+ if (!list)
+ return 0;
+
+ for (i = 0; list[i]; i++)
+ ;
+ return i;
+}
+
+enum {
+ TRACE_SAMPLE_FOO = 2,
+ TRACE_SAMPLE_BAR = 4,
+ TRACE_SAMPLE_ZOO = 8,
+};
+#endif
+
+/*
+ * If enums are used in the TP_printk(), their names will be shown in
+ * format files and not their values. This can cause problems with user
+ * space programs that parse the format files to know how to translate
+ * the raw binary trace output into human readable text.
+ *
+ * To help out user space programs, any enum that is used in the TP_printk()
+ * should be defined by TRACE_DEFINE_ENUM() macro. All that is needed to
+ * be done is to add this macro with the enum within it in the trace
+ * header file, and it will be converted in the output.
+ */
+
+TRACE_DEFINE_ENUM(TRACE_SAMPLE_FOO);
+TRACE_DEFINE_ENUM(TRACE_SAMPLE_BAR);
+TRACE_DEFINE_ENUM(TRACE_SAMPLE_ZOO);
+
+TRACE_EVENT(foo_bar,
+
+ TP_PROTO(const char *foo, int bar, const int *lst,
+ const char *string, const struct cpumask *mask),
+
+ TP_ARGS(foo, bar, lst, string, mask),
+
+ TP_STRUCT__entry(
+ __array( char, foo, 10 )
+ __field( int, bar )
+ __dynamic_array(int, list, __length_of(lst))
+ __string( str, string )
+ __bitmask( cpus, num_possible_cpus() )
+ ),
+
+ TP_fast_assign(
+ strlcpy(__entry->foo, foo, 10);
+ __entry->bar = bar;
+ memcpy(__get_dynamic_array(list), lst,
+ __length_of(lst) * sizeof(int));
+ __assign_str(str, string);
+ __assign_bitmask(cpus, cpumask_bits(mask), num_possible_cpus());
+ ),
+
+ TP_printk("foo %s %d %s %s %s %s (%s)", __entry->foo, __entry->bar,
+
+/*
+ * Notice here the use of some helper functions. This includes:
+ *
+ * __print_symbolic( variable, { value, "string" }, ... ),
+ *
+ * The variable is tested against each value of the { } pair. If
+ * the variable matches one of the values, then it will print the
+ * string in that pair. If non are matched, it returns a string
+ * version of the number (if __entry->bar == 7 then "7" is returned).
+ */
+ __print_symbolic(__entry->bar,
+ { 0, "zero" },
+ { TRACE_SAMPLE_FOO, "TWO" },
+ { TRACE_SAMPLE_BAR, "FOUR" },
+ { TRACE_SAMPLE_ZOO, "EIGHT" },
+ { 10, "TEN" }
+ ),
+
+/*
+ * __print_flags( variable, "delim", { value, "flag" }, ... ),
+ *
+ * This is similar to __print_symbolic, except that it tests the bits
+ * of the value. If ((FLAG & variable) == FLAG) then the string is
+ * printed. If more than one flag matches, then each one that does is
+ * also printed with delim in between them.
+ * If not all bits are accounted for, then the not found bits will be
+ * added in hex format: 0x506 will show BIT2|BIT4|0x500
+ */
+ __print_flags(__entry->bar, "|",
+ { 1, "BIT1" },
+ { 2, "BIT2" },
+ { 4, "BIT3" },
+ { 8, "BIT4" }
+ ),
+/*
+ * __print_array( array, len, element_size )
+ *
+ * This prints out the array that is defined by __array in a nice format.
+ */
+ __print_array(__get_dynamic_array(list),
+ __get_dynamic_array_len(list) / sizeof(int),
+ sizeof(int)),
+ __get_str(str), __get_bitmask(cpus))
+);
+
+/*
+ * There may be a case where a tracepoint should only be called if
+ * some condition is set. Otherwise the tracepoint should not be called.
+ * But to do something like:
+ *
+ * if (cond)
+ * trace_foo();
+ *
+ * Would cause a little overhead when tracing is not enabled, and that
+ * overhead, even if small, is not something we want. As tracepoints
+ * use static branch (aka jump_labels), where no branch is taken to
+ * skip the tracepoint when not enabled, and a jmp is placed to jump
+ * to the tracepoint code when it is enabled, having a if statement
+ * nullifies that optimization. It would be nice to place that
+ * condition within the static branch. This is where TRACE_EVENT_CONDITION
+ * comes in.
+ *
+ * TRACE_EVENT_CONDITION() is just like TRACE_EVENT, except it adds another
+ * parameter just after args. Where TRACE_EVENT has:
+ *
+ * TRACE_EVENT(name, proto, args, struct, assign, printk)
+ *
+ * the CONDITION version has:
+ *
+ * TRACE_EVENT_CONDITION(name, proto, args, cond, struct, assign, printk)
+ *
+ * Everything is the same as TRACE_EVENT except for the new cond. Think
+ * of the cond variable as:
+ *
+ * if (cond)
+ * trace_foo_bar_with_cond();
+ *
+ * Except that the logic for the if branch is placed after the static branch.
+ * That is, the if statement that processes the condition will not be
+ * executed unless that traecpoint is enabled. Otherwise it still remains
+ * a nop.
+ */
+TRACE_EVENT_CONDITION(foo_bar_with_cond,
+
+ TP_PROTO(const char *foo, int bar),
+
+ TP_ARGS(foo, bar),
+
+ TP_CONDITION(!(bar % 10)),
+
+ TP_STRUCT__entry(
+ __string( foo, foo )
+ __field( int, bar )
+ ),
+
+ TP_fast_assign(
+ __assign_str(foo, foo);
+ __entry->bar = bar;
+ ),
+
+ TP_printk("foo %s %d", __get_str(foo), __entry->bar)
+);
+
+int foo_bar_reg(void);
+void foo_bar_unreg(void);
+
+/*
+ * Now in the case that some function needs to be called when the
+ * tracepoint is enabled and/or when it is disabled, the
+ * TRACE_EVENT_FN() serves this purpose. This is just like TRACE_EVENT()
+ * but adds two more parameters at the end:
+ *
+ * TRACE_EVENT_FN( name, proto, args, struct, assign, printk, reg, unreg)
+ *
+ * reg and unreg are functions with the prototype of:
+ *
+ * void reg(void)
+ *
+ * The reg function gets called before the tracepoint is enabled, and
+ * the unreg function gets called after the tracepoint is disabled.
+ *
+ * Note, reg and unreg are allowed to be NULL. If you only need to
+ * call a function before enabling, or after disabling, just set one
+ * function and pass in NULL for the other parameter.
+ */
+TRACE_EVENT_FN(foo_bar_with_fn,
+
+ TP_PROTO(const char *foo, int bar),
+
+ TP_ARGS(foo, bar),
+
+ TP_STRUCT__entry(
+ __string( foo, foo )
+ __field( int, bar )
+ ),
+
+ TP_fast_assign(
+ __assign_str(foo, foo);
+ __entry->bar = bar;
+ ),
+
+ TP_printk("foo %s %d", __get_str(foo), __entry->bar),
+
+ foo_bar_reg, foo_bar_unreg
+);
+
+/*
+ * Each TRACE_EVENT macro creates several helper functions to produce
+ * the code to add the tracepoint, create the files in the trace
+ * directory, hook it to perf, assign the values and to print out
+ * the raw data from the ring buffer. To prevent too much bloat,
+ * if there are more than one tracepoint that uses the same format
+ * for the proto, args, struct, assign and printk, and only the name
+ * is different, it is highly recommended to use the DECLARE_EVENT_CLASS
+ *
+ * DECLARE_EVENT_CLASS() macro creates most of the functions for the
+ * tracepoint. Then DEFINE_EVENT() is use to hook a tracepoint to those
+ * functions. This DEFINE_EVENT() is an instance of the class and can
+ * be enabled and disabled separately from other events (either TRACE_EVENT
+ * or other DEFINE_EVENT()s).
+ *
+ * Note, TRACE_EVENT() itself is simply defined as:
+ *
+ * #define TRACE_EVENT(name, proto, args, tstruct, assign, printk) \
+ * DEFINE_EVENT_CLASS(name, proto, args, tstruct, assign, printk); \
+ * DEFINE_EVENT(name, name, proto, args)
+ *
+ * The DEFINE_EVENT() also can be declared with conditions and reg functions:
+ *
+ * DEFINE_EVENT_CONDITION(template, name, proto, args, cond);
+ * DEFINE_EVENT_FN(template, name, proto, args, reg, unreg);
+ */
+DECLARE_EVENT_CLASS(foo_template,
+
+ TP_PROTO(const char *foo, int bar),
+
+ TP_ARGS(foo, bar),
+
+ TP_STRUCT__entry(
+ __string( foo, foo )
+ __field( int, bar )
+ ),
+
+ TP_fast_assign(
+ __assign_str(foo, foo);
+ __entry->bar = bar;
+ ),
+
+ TP_printk("foo %s %d", __get_str(foo), __entry->bar)
+);
+
+/*
+ * Here's a better way for the previous samples (except, the first
+ * example had more fields and could not be used here).
+ */
+DEFINE_EVENT(foo_template, foo_with_template_simple,
+ TP_PROTO(const char *foo, int bar),
+ TP_ARGS(foo, bar));
+
+DEFINE_EVENT_CONDITION(foo_template, foo_with_template_cond,
+ TP_PROTO(const char *foo, int bar),
+ TP_ARGS(foo, bar),
+ TP_CONDITION(!(bar % 8)));
+
+
+DEFINE_EVENT_FN(foo_template, foo_with_template_fn,
+ TP_PROTO(const char *foo, int bar),
+ TP_ARGS(foo, bar),
+ foo_bar_reg, foo_bar_unreg);
+
+/*
+ * Anytime two events share basically the same values and have
+ * the same output, use the DECLARE_EVENT_CLASS() and DEFINE_EVENT()
+ * when ever possible.
+ */
+
+/*
+ * If the event is similar to the DECLARE_EVENT_CLASS, but you need
+ * to have a different output, then use DEFINE_EVENT_PRINT() which
+ * lets you override the TP_printk() of the class.
+ */
+
+DEFINE_EVENT_PRINT(foo_template, foo_with_template_print,
+ TP_PROTO(const char *foo, int bar),
+ TP_ARGS(foo, bar),
+ TP_printk("bar %s %d", __get_str(foo), __entry->bar));
+
+#endif
+
+/***** NOTICE! The #if protection ends here. *****/
+
+
+/*
+ * There are several ways I could have done this. If I left out the
+ * TRACE_INCLUDE_PATH, then it would default to the kernel source
+ * include/trace/events directory.
+ *
+ * I could specify a path from the define_trace.h file back to this
+ * file.
+ *
+ * #define TRACE_INCLUDE_PATH ../../samples/trace_events
+ *
+ * But the safest and easiest way to simply make it use the directory
+ * that the file is in is to add in the Makefile:
+ *
+ * CFLAGS_trace-events-sample.o := -I$(src)
+ *
+ * This will make sure the current path is part of the include
+ * structure for our file so that define_trace.h can find it.
+ *
+ * I could have made only the top level directory the include:
+ *
+ * CFLAGS_trace-events-sample.o := -I$(PWD)
+ *
+ * And then let the path to this directory be the TRACE_INCLUDE_PATH:
+ *
+ * #define TRACE_INCLUDE_PATH samples/trace_events
+ *
+ * But then if something defines "samples" or "trace_events" as a macro
+ * then we could risk that being converted too, and give us an unexpected
+ * result.
+ */
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+/*
+ * TRACE_INCLUDE_FILE is not needed if the filename and TRACE_SYSTEM are equal
+ */
+#define TRACE_INCLUDE_FILE trace-events-sample
+#include <trace/define_trace.h>
diff --git a/samples/trace_printk/Makefile b/samples/trace_printk/Makefile
new file mode 100644
index 000000000..19900ab2b
--- /dev/null
+++ b/samples/trace_printk/Makefile
@@ -0,0 +1,6 @@
+# builds a module that calls various trace_printk routines
+# then to use one (as root): insmod <module_name.ko>
+
+# This module can also be used to test the trace_printk code.
+
+obj-$(CONFIG_SAMPLE_TRACE_PRINTK) += trace-printk.o
diff --git a/samples/trace_printk/trace-printk.c b/samples/trace_printk/trace-printk.c
new file mode 100644
index 000000000..e9e0040ff
--- /dev/null
+++ b/samples/trace_printk/trace-printk.c
@@ -0,0 +1,56 @@
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/irq_work.h>
+
+/* Must not be static to force gcc to consider these non constant */
+char *trace_printk_test_global_str =
+ "This is a dynamic string that will use trace_puts\n";
+
+char *trace_printk_test_global_str_irq =
+ "(irq) This is a dynamic string that will use trace_puts\n";
+
+char *trace_printk_test_global_str_fmt =
+ "%sThis is a %s that will use trace_printk\n";
+
+static struct irq_work irqwork;
+
+static void trace_printk_irq_work(struct irq_work *work)
+{
+ trace_printk("(irq) This is a static string that will use trace_bputs\n");
+ trace_printk(trace_printk_test_global_str_irq);
+
+ trace_printk("(irq) This is a %s that will use trace_bprintk()\n",
+ "static string");
+
+ trace_printk(trace_printk_test_global_str_fmt,
+ "(irq) ", "dynamic string");
+}
+
+static int __init trace_printk_init(void)
+{
+ init_irq_work(&irqwork, trace_printk_irq_work);
+
+ trace_printk("This is a static string that will use trace_bputs\n");
+ trace_printk(trace_printk_test_global_str);
+
+ /* Kick off printing in irq context */
+ irq_work_queue(&irqwork);
+
+ trace_printk("This is a %s that will use trace_bprintk()\n",
+ "static string");
+
+ trace_printk(trace_printk_test_global_str_fmt, "", "dynamic string");
+
+ return 0;
+}
+
+static void __exit trace_printk_exit(void)
+{
+}
+
+module_init(trace_printk_init);
+module_exit(trace_printk_exit);
+
+MODULE_AUTHOR("Steven Rostedt");
+MODULE_DESCRIPTION("trace-printk");
+MODULE_LICENSE("GPL");
diff --git a/samples/uhid/Makefile b/samples/uhid/Makefile
new file mode 100644
index 000000000..8d7fd6190
--- /dev/null
+++ b/samples/uhid/Makefile
@@ -0,0 +1,7 @@
+# List of programs to build
+hostprogs-y := uhid-example
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
+
+HOSTCFLAGS_uhid-example.o += -I$(objtree)/usr/include
diff --git a/samples/uhid/uhid-example.c b/samples/uhid/uhid-example.c
new file mode 100644
index 000000000..b72d645ce
--- /dev/null
+++ b/samples/uhid/uhid-example.c
@@ -0,0 +1,465 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * UHID Example
+ *
+ * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
+ *
+ * The code may be used by anyone for any purpose,
+ * and can serve as a starting point for developing
+ * applications using uhid.
+ */
+
+/*
+ * UHID Example
+ * This example emulates a basic 3 buttons mouse with wheel over UHID. Run this
+ * program as root and then use the following keys to control the mouse:
+ * q: Quit the application
+ * 1: Toggle left button (down, up, ...)
+ * 2: Toggle right button
+ * 3: Toggle middle button
+ * a: Move mouse left
+ * d: Move mouse right
+ * w: Move mouse up
+ * s: Move mouse down
+ * r: Move wheel up
+ * f: Move wheel down
+ *
+ * Additionally to 3 button mouse, 3 keyboard LEDs are also supported (LED_NUML,
+ * LED_CAPSL and LED_SCROLLL). The device doesn't generate any related keyboard
+ * events, though. You need to manually write the EV_LED/LED_XY/1 activation
+ * input event to the evdev device to see it being sent to this device.
+ *
+ * If uhid is not available as /dev/uhid, then you can pass a different path as
+ * first argument.
+ * If <linux/uhid.h> is not installed in /usr, then compile this with:
+ * gcc -o ./uhid_test -Wall -I./include ./samples/uhid/uhid-example.c
+ * And ignore the warning about kernel headers. However, it is recommended to
+ * use the installed uhid.h if available.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <termios.h>
+#include <unistd.h>
+#include <linux/uhid.h>
+
+/*
+ * HID Report Desciptor
+ * We emulate a basic 3 button mouse with wheel and 3 keyboard LEDs. This is
+ * the report-descriptor as the kernel will parse it:
+ *
+ * INPUT(1)[INPUT]
+ * Field(0)
+ * Physical(GenericDesktop.Pointer)
+ * Application(GenericDesktop.Mouse)
+ * Usage(3)
+ * Button.0001
+ * Button.0002
+ * Button.0003
+ * Logical Minimum(0)
+ * Logical Maximum(1)
+ * Report Size(1)
+ * Report Count(3)
+ * Report Offset(0)
+ * Flags( Variable Absolute )
+ * Field(1)
+ * Physical(GenericDesktop.Pointer)
+ * Application(GenericDesktop.Mouse)
+ * Usage(3)
+ * GenericDesktop.X
+ * GenericDesktop.Y
+ * GenericDesktop.Wheel
+ * Logical Minimum(-128)
+ * Logical Maximum(127)
+ * Report Size(8)
+ * Report Count(3)
+ * Report Offset(8)
+ * Flags( Variable Relative )
+ * OUTPUT(2)[OUTPUT]
+ * Field(0)
+ * Application(GenericDesktop.Keyboard)
+ * Usage(3)
+ * LED.NumLock
+ * LED.CapsLock
+ * LED.ScrollLock
+ * Logical Minimum(0)
+ * Logical Maximum(1)
+ * Report Size(1)
+ * Report Count(3)
+ * Report Offset(0)
+ * Flags( Variable Absolute )
+ *
+ * This is the mapping that we expect:
+ * Button.0001 ---> Key.LeftBtn
+ * Button.0002 ---> Key.RightBtn
+ * Button.0003 ---> Key.MiddleBtn
+ * GenericDesktop.X ---> Relative.X
+ * GenericDesktop.Y ---> Relative.Y
+ * GenericDesktop.Wheel ---> Relative.Wheel
+ * LED.NumLock ---> LED.NumLock
+ * LED.CapsLock ---> LED.CapsLock
+ * LED.ScrollLock ---> LED.ScrollLock
+ *
+ * This information can be verified by reading /sys/kernel/debug/hid/<dev>/rdesc
+ * This file should print the same information as showed above.
+ */
+
+static unsigned char rdesc[] = {
+ 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */
+ 0x09, 0x02, /* USAGE (Mouse) */
+ 0xa1, 0x01, /* COLLECTION (Application) */
+ 0x09, 0x01, /* USAGE (Pointer) */
+ 0xa1, 0x00, /* COLLECTION (Physical) */
+ 0x85, 0x01, /* REPORT_ID (1) */
+ 0x05, 0x09, /* USAGE_PAGE (Button) */
+ 0x19, 0x01, /* USAGE_MINIMUM (Button 1) */
+ 0x29, 0x03, /* USAGE_MAXIMUM (Button 3) */
+ 0x15, 0x00, /* LOGICAL_MINIMUM (0) */
+ 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */
+ 0x95, 0x03, /* REPORT_COUNT (3) */
+ 0x75, 0x01, /* REPORT_SIZE (1) */
+ 0x81, 0x02, /* INPUT (Data,Var,Abs) */
+ 0x95, 0x01, /* REPORT_COUNT (1) */
+ 0x75, 0x05, /* REPORT_SIZE (5) */
+ 0x81, 0x01, /* INPUT (Cnst,Var,Abs) */
+ 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */
+ 0x09, 0x30, /* USAGE (X) */
+ 0x09, 0x31, /* USAGE (Y) */
+ 0x09, 0x38, /* USAGE (WHEEL) */
+ 0x15, 0x81, /* LOGICAL_MINIMUM (-127) */
+ 0x25, 0x7f, /* LOGICAL_MAXIMUM (127) */
+ 0x75, 0x08, /* REPORT_SIZE (8) */
+ 0x95, 0x03, /* REPORT_COUNT (3) */
+ 0x81, 0x06, /* INPUT (Data,Var,Rel) */
+ 0xc0, /* END_COLLECTION */
+ 0xc0, /* END_COLLECTION */
+ 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */
+ 0x09, 0x06, /* USAGE (Keyboard) */
+ 0xa1, 0x01, /* COLLECTION (Application) */
+ 0x85, 0x02, /* REPORT_ID (2) */
+ 0x05, 0x08, /* USAGE_PAGE (Led) */
+ 0x19, 0x01, /* USAGE_MINIMUM (1) */
+ 0x29, 0x03, /* USAGE_MAXIMUM (3) */
+ 0x15, 0x00, /* LOGICAL_MINIMUM (0) */
+ 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */
+ 0x95, 0x03, /* REPORT_COUNT (3) */
+ 0x75, 0x01, /* REPORT_SIZE (1) */
+ 0x91, 0x02, /* Output (Data,Var,Abs) */
+ 0x95, 0x01, /* REPORT_COUNT (1) */
+ 0x75, 0x05, /* REPORT_SIZE (5) */
+ 0x91, 0x01, /* Output (Cnst,Var,Abs) */
+ 0xc0, /* END_COLLECTION */
+};
+
+static int uhid_write(int fd, const struct uhid_event *ev)
+{
+ ssize_t ret;
+
+ ret = write(fd, ev, sizeof(*ev));
+ if (ret < 0) {
+ fprintf(stderr, "Cannot write to uhid: %m\n");
+ return -errno;
+ } else if (ret != sizeof(*ev)) {
+ fprintf(stderr, "Wrong size written to uhid: %ld != %lu\n",
+ ret, sizeof(ev));
+ return -EFAULT;
+ } else {
+ return 0;
+ }
+}
+
+static int create(int fd)
+{
+ struct uhid_event ev;
+
+ memset(&ev, 0, sizeof(ev));
+ ev.type = UHID_CREATE;
+ strcpy((char*)ev.u.create.name, "test-uhid-device");
+ ev.u.create.rd_data = rdesc;
+ ev.u.create.rd_size = sizeof(rdesc);
+ ev.u.create.bus = BUS_USB;
+ ev.u.create.vendor = 0x15d9;
+ ev.u.create.product = 0x0a37;
+ ev.u.create.version = 0;
+ ev.u.create.country = 0;
+
+ return uhid_write(fd, &ev);
+}
+
+static void destroy(int fd)
+{
+ struct uhid_event ev;
+
+ memset(&ev, 0, sizeof(ev));
+ ev.type = UHID_DESTROY;
+
+ uhid_write(fd, &ev);
+}
+
+/* This parses raw output reports sent by the kernel to the device. A normal
+ * uhid program shouldn't do this but instead just forward the raw report.
+ * However, for ducomentational purposes, we try to detect LED events here and
+ * print debug messages for it. */
+static void handle_output(struct uhid_event *ev)
+{
+ /* LED messages are adverised via OUTPUT reports; ignore the rest */
+ if (ev->u.output.rtype != UHID_OUTPUT_REPORT)
+ return;
+ /* LED reports have length 2 bytes */
+ if (ev->u.output.size != 2)
+ return;
+ /* first byte is report-id which is 0x02 for LEDs in our rdesc */
+ if (ev->u.output.data[0] != 0x2)
+ return;
+
+ /* print flags payload */
+ fprintf(stderr, "LED output report received with flags %x\n",
+ ev->u.output.data[1]);
+}
+
+static int event(int fd)
+{
+ struct uhid_event ev;
+ ssize_t ret;
+
+ memset(&ev, 0, sizeof(ev));
+ ret = read(fd, &ev, sizeof(ev));
+ if (ret == 0) {
+ fprintf(stderr, "Read HUP on uhid-cdev\n");
+ return -EFAULT;
+ } else if (ret < 0) {
+ fprintf(stderr, "Cannot read uhid-cdev: %m\n");
+ return -errno;
+ } else if (ret != sizeof(ev)) {
+ fprintf(stderr, "Invalid size read from uhid-dev: %ld != %lu\n",
+ ret, sizeof(ev));
+ return -EFAULT;
+ }
+
+ switch (ev.type) {
+ case UHID_START:
+ fprintf(stderr, "UHID_START from uhid-dev\n");
+ break;
+ case UHID_STOP:
+ fprintf(stderr, "UHID_STOP from uhid-dev\n");
+ break;
+ case UHID_OPEN:
+ fprintf(stderr, "UHID_OPEN from uhid-dev\n");
+ break;
+ case UHID_CLOSE:
+ fprintf(stderr, "UHID_CLOSE from uhid-dev\n");
+ break;
+ case UHID_OUTPUT:
+ fprintf(stderr, "UHID_OUTPUT from uhid-dev\n");
+ handle_output(&ev);
+ break;
+ case UHID_OUTPUT_EV:
+ fprintf(stderr, "UHID_OUTPUT_EV from uhid-dev\n");
+ break;
+ default:
+ fprintf(stderr, "Invalid event from uhid-dev: %u\n", ev.type);
+ }
+
+ return 0;
+}
+
+static bool btn1_down;
+static bool btn2_down;
+static bool btn3_down;
+static signed char abs_hor;
+static signed char abs_ver;
+static signed char wheel;
+
+static int send_event(int fd)
+{
+ struct uhid_event ev;
+
+ memset(&ev, 0, sizeof(ev));
+ ev.type = UHID_INPUT;
+ ev.u.input.size = 5;
+
+ ev.u.input.data[0] = 0x1;
+ if (btn1_down)
+ ev.u.input.data[1] |= 0x1;
+ if (btn2_down)
+ ev.u.input.data[1] |= 0x2;
+ if (btn3_down)
+ ev.u.input.data[1] |= 0x4;
+
+ ev.u.input.data[2] = abs_hor;
+ ev.u.input.data[3] = abs_ver;
+ ev.u.input.data[4] = wheel;
+
+ return uhid_write(fd, &ev);
+}
+
+static int keyboard(int fd)
+{
+ char buf[128];
+ ssize_t ret, i;
+
+ ret = read(STDIN_FILENO, buf, sizeof(buf));
+ if (ret == 0) {
+ fprintf(stderr, "Read HUP on stdin\n");
+ return -EFAULT;
+ } else if (ret < 0) {
+ fprintf(stderr, "Cannot read stdin: %m\n");
+ return -errno;
+ }
+
+ for (i = 0; i < ret; ++i) {
+ switch (buf[i]) {
+ case '1':
+ btn1_down = !btn1_down;
+ ret = send_event(fd);
+ if (ret)
+ return ret;
+ break;
+ case '2':
+ btn2_down = !btn2_down;
+ ret = send_event(fd);
+ if (ret)
+ return ret;
+ break;
+ case '3':
+ btn3_down = !btn3_down;
+ ret = send_event(fd);
+ if (ret)
+ return ret;
+ break;
+ case 'a':
+ abs_hor = -20;
+ ret = send_event(fd);
+ abs_hor = 0;
+ if (ret)
+ return ret;
+ break;
+ case 'd':
+ abs_hor = 20;
+ ret = send_event(fd);
+ abs_hor = 0;
+ if (ret)
+ return ret;
+ break;
+ case 'w':
+ abs_ver = -20;
+ ret = send_event(fd);
+ abs_ver = 0;
+ if (ret)
+ return ret;
+ break;
+ case 's':
+ abs_ver = 20;
+ ret = send_event(fd);
+ abs_ver = 0;
+ if (ret)
+ return ret;
+ break;
+ case 'r':
+ wheel = 1;
+ ret = send_event(fd);
+ wheel = 0;
+ if (ret)
+ return ret;
+ break;
+ case 'f':
+ wheel = -1;
+ ret = send_event(fd);
+ wheel = 0;
+ if (ret)
+ return ret;
+ break;
+ case 'q':
+ return -ECANCELED;
+ default:
+ fprintf(stderr, "Invalid input: %c\n", buf[i]);
+ }
+ }
+
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ int fd;
+ const char *path = "/dev/uhid";
+ struct pollfd pfds[2];
+ int ret;
+ struct termios state;
+
+ ret = tcgetattr(STDIN_FILENO, &state);
+ if (ret) {
+ fprintf(stderr, "Cannot get tty state\n");
+ } else {
+ state.c_lflag &= ~ICANON;
+ state.c_cc[VMIN] = 1;
+ ret = tcsetattr(STDIN_FILENO, TCSANOW, &state);
+ if (ret)
+ fprintf(stderr, "Cannot set tty state\n");
+ }
+
+ if (argc >= 2) {
+ if (!strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) {
+ fprintf(stderr, "Usage: %s [%s]\n", argv[0], path);
+ return EXIT_SUCCESS;
+ } else {
+ path = argv[1];
+ }
+ }
+
+ fprintf(stderr, "Open uhid-cdev %s\n", path);
+ fd = open(path, O_RDWR | O_CLOEXEC);
+ if (fd < 0) {
+ fprintf(stderr, "Cannot open uhid-cdev %s: %m\n", path);
+ return EXIT_FAILURE;
+ }
+
+ fprintf(stderr, "Create uhid device\n");
+ ret = create(fd);
+ if (ret) {
+ close(fd);
+ return EXIT_FAILURE;
+ }
+
+ pfds[0].fd = STDIN_FILENO;
+ pfds[0].events = POLLIN;
+ pfds[1].fd = fd;
+ pfds[1].events = POLLIN;
+
+ fprintf(stderr, "Press 'q' to quit...\n");
+ while (1) {
+ ret = poll(pfds, 2, -1);
+ if (ret < 0) {
+ fprintf(stderr, "Cannot poll for fds: %m\n");
+ break;
+ }
+ if (pfds[0].revents & POLLHUP) {
+ fprintf(stderr, "Received HUP on stdin\n");
+ break;
+ }
+ if (pfds[1].revents & POLLHUP) {
+ fprintf(stderr, "Received HUP on uhid-cdev\n");
+ break;
+ }
+
+ if (pfds[0].revents & POLLIN) {
+ ret = keyboard(fd);
+ if (ret)
+ break;
+ }
+ if (pfds[1].revents & POLLIN) {
+ ret = event(fd);
+ if (ret)
+ break;
+ }
+ }
+
+ fprintf(stderr, "Destroy uhid device\n");
+ destroy(fd);
+ return EXIT_SUCCESS;
+}
diff --git a/samples/v4l/Makefile b/samples/v4l/Makefile
new file mode 100644
index 000000000..65a351d75
--- /dev/null
+++ b/samples/v4l/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_VIDEO_PCI_SKELETON) := v4l2-pci-skeleton.o
diff --git a/samples/v4l/v4l2-pci-skeleton.c b/samples/v4l/v4l2-pci-skeleton.c
new file mode 100644
index 000000000..f520e3aef
--- /dev/null
+++ b/samples/v4l/v4l2-pci-skeleton.c
@@ -0,0 +1,913 @@
+/*
+ * This is a V4L2 PCI Skeleton Driver. It gives an initial skeleton source
+ * for use with other PCI drivers.
+ *
+ * This skeleton PCI driver assumes that the card has an S-Video connector as
+ * input 0 and an HDMI connector as input 1.
+ *
+ * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/videodev2.h>
+#include <linux/v4l2-dv-timings.h>
+#include <media/v4l2-device.h>
+#include <media/v4l2-dev.h>
+#include <media/v4l2-ioctl.h>
+#include <media/v4l2-dv-timings.h>
+#include <media/v4l2-ctrls.h>
+#include <media/v4l2-event.h>
+#include <media/videobuf2-v4l2.h>
+#include <media/videobuf2-dma-contig.h>
+
+MODULE_DESCRIPTION("V4L2 PCI Skeleton Driver");
+MODULE_AUTHOR("Hans Verkuil");
+MODULE_LICENSE("GPL v2");
+
+/**
+ * struct skeleton - All internal data for one instance of device
+ * @pdev: PCI device
+ * @v4l2_dev: top-level v4l2 device struct
+ * @vdev: video node structure
+ * @ctrl_handler: control handler structure
+ * @lock: ioctl serialization mutex
+ * @std: current SDTV standard
+ * @timings: current HDTV timings
+ * @format: current pix format
+ * @input: current video input (0 = SDTV, 1 = HDTV)
+ * @queue: vb2 video capture queue
+ * @qlock: spinlock controlling access to buf_list and sequence
+ * @buf_list: list of buffers queued for DMA
+ * @sequence: frame sequence counter
+ */
+struct skeleton {
+ struct pci_dev *pdev;
+ struct v4l2_device v4l2_dev;
+ struct video_device vdev;
+ struct v4l2_ctrl_handler ctrl_handler;
+ struct mutex lock;
+ v4l2_std_id std;
+ struct v4l2_dv_timings timings;
+ struct v4l2_pix_format format;
+ unsigned input;
+
+ struct vb2_queue queue;
+
+ spinlock_t qlock;
+ struct list_head buf_list;
+ unsigned field;
+ unsigned sequence;
+};
+
+struct skel_buffer {
+ struct vb2_buffer vb;
+ struct list_head list;
+};
+
+static inline struct skel_buffer *to_skel_buffer(struct vb2_buffer *vb2)
+{
+ return container_of(vb2, struct skel_buffer, vb);
+}
+
+static const struct pci_device_id skeleton_pci_tbl[] = {
+ /* { PCI_DEVICE(PCI_VENDOR_ID_, PCI_DEVICE_ID_) }, */
+ { 0, }
+};
+MODULE_DEVICE_TABLE(pci, skeleton_pci_tbl);
+
+/*
+ * HDTV: this structure has the capabilities of the HDTV receiver.
+ * It is used to constrain the huge list of possible formats based
+ * upon the hardware capabilities.
+ */
+static const struct v4l2_dv_timings_cap skel_timings_cap = {
+ .type = V4L2_DV_BT_656_1120,
+ /* keep this initialization for compatibility with GCC < 4.4.6 */
+ .reserved = { 0 },
+ V4L2_INIT_BT_TIMINGS(
+ 720, 1920, /* min/max width */
+ 480, 1080, /* min/max height */
+ 27000000, 74250000, /* min/max pixelclock*/
+ V4L2_DV_BT_STD_CEA861, /* Supported standards */
+ /* capabilities */
+ V4L2_DV_BT_CAP_INTERLACED | V4L2_DV_BT_CAP_PROGRESSIVE
+ )
+};
+
+/*
+ * Supported SDTV standards. This does the same job as skel_timings_cap, but
+ * for standard TV formats.
+ */
+#define SKEL_TVNORMS V4L2_STD_ALL
+
+/*
+ * Interrupt handler: typically interrupts happen after a new frame has been
+ * captured. It is the job of the handler to remove the new frame from the
+ * internal list and give it back to the vb2 framework, updating the sequence
+ * counter, field and timestamp at the same time.
+ */
+static irqreturn_t skeleton_irq(int irq, void *dev_id)
+{
+#ifdef TODO
+ struct skeleton *skel = dev_id;
+
+ /* handle interrupt */
+
+ /* Once a new frame has been captured, mark it as done like this: */
+ if (captured_new_frame) {
+ ...
+ spin_lock(&skel->qlock);
+ list_del(&new_buf->list);
+ spin_unlock(&skel->qlock);
+ v4l2_get_timestamp(&new_buf->vb.v4l2_buf.timestamp);
+ new_buf->vb.v4l2_buf.sequence = skel->sequence++;
+ new_buf->vb.v4l2_buf.field = skel->field;
+ if (skel->format.field == V4L2_FIELD_ALTERNATE) {
+ if (skel->field == V4L2_FIELD_BOTTOM)
+ skel->field = V4L2_FIELD_TOP;
+ else if (skel->field == V4L2_FIELD_TOP)
+ skel->field = V4L2_FIELD_BOTTOM;
+ }
+ vb2_buffer_done(&new_buf->vb, VB2_BUF_STATE_DONE);
+ }
+#endif
+ return IRQ_HANDLED;
+}
+
+/*
+ * Setup the constraints of the queue: besides setting the number of planes
+ * per buffer and the size and allocation context of each plane, it also
+ * checks if sufficient buffers have been allocated. Usually 3 is a good
+ * minimum number: many DMA engines need a minimum of 2 buffers in the
+ * queue and you need to have another available for userspace processing.
+ */
+static int queue_setup(struct vb2_queue *vq,
+ unsigned int *nbuffers, unsigned int *nplanes,
+ unsigned int sizes[], struct device *alloc_devs[])
+{
+ struct skeleton *skel = vb2_get_drv_priv(vq);
+
+ skel->field = skel->format.field;
+ if (skel->field == V4L2_FIELD_ALTERNATE) {
+ /*
+ * You cannot use read() with FIELD_ALTERNATE since the field
+ * information (TOP/BOTTOM) cannot be passed back to the user.
+ */
+ if (vb2_fileio_is_active(vq))
+ return -EINVAL;
+ skel->field = V4L2_FIELD_TOP;
+ }
+
+ if (vq->num_buffers + *nbuffers < 3)
+ *nbuffers = 3 - vq->num_buffers;
+
+ if (*nplanes)
+ return sizes[0] < skel->format.sizeimage ? -EINVAL : 0;
+ *nplanes = 1;
+ sizes[0] = skel->format.sizeimage;
+ return 0;
+}
+
+/*
+ * Prepare the buffer for queueing to the DMA engine: check and set the
+ * payload size.
+ */
+static int buffer_prepare(struct vb2_buffer *vb)
+{
+ struct skeleton *skel = vb2_get_drv_priv(vb->vb2_queue);
+ unsigned long size = skel->format.sizeimage;
+
+ if (vb2_plane_size(vb, 0) < size) {
+ dev_err(&skel->pdev->dev, "buffer too small (%lu < %lu)\n",
+ vb2_plane_size(vb, 0), size);
+ return -EINVAL;
+ }
+
+ vb2_set_plane_payload(vb, 0, size);
+ return 0;
+}
+
+/*
+ * Queue this buffer to the DMA engine.
+ */
+static void buffer_queue(struct vb2_buffer *vb)
+{
+ struct skeleton *skel = vb2_get_drv_priv(vb->vb2_queue);
+ struct skel_buffer *buf = to_skel_buffer(vb);
+ unsigned long flags;
+
+ spin_lock_irqsave(&skel->qlock, flags);
+ list_add_tail(&buf->list, &skel->buf_list);
+
+ /* TODO: Update any DMA pointers if necessary */
+
+ spin_unlock_irqrestore(&skel->qlock, flags);
+}
+
+static void return_all_buffers(struct skeleton *skel,
+ enum vb2_buffer_state state)
+{
+ struct skel_buffer *buf, *node;
+ unsigned long flags;
+
+ spin_lock_irqsave(&skel->qlock, flags);
+ list_for_each_entry_safe(buf, node, &skel->buf_list, list) {
+ vb2_buffer_done(&buf->vb, state);
+ list_del(&buf->list);
+ }
+ spin_unlock_irqrestore(&skel->qlock, flags);
+}
+
+/*
+ * Start streaming. First check if the minimum number of buffers have been
+ * queued. If not, then return -ENOBUFS and the vb2 framework will call
+ * this function again the next time a buffer has been queued until enough
+ * buffers are available to actually start the DMA engine.
+ */
+static int start_streaming(struct vb2_queue *vq, unsigned int count)
+{
+ struct skeleton *skel = vb2_get_drv_priv(vq);
+ int ret = 0;
+
+ skel->sequence = 0;
+
+ /* TODO: start DMA */
+
+ if (ret) {
+ /*
+ * In case of an error, return all active buffers to the
+ * QUEUED state
+ */
+ return_all_buffers(skel, VB2_BUF_STATE_QUEUED);
+ }
+ return ret;
+}
+
+/*
+ * Stop the DMA engine. Any remaining buffers in the DMA queue are dequeued
+ * and passed on to the vb2 framework marked as STATE_ERROR.
+ */
+static void stop_streaming(struct vb2_queue *vq)
+{
+ struct skeleton *skel = vb2_get_drv_priv(vq);
+
+ /* TODO: stop DMA */
+
+ /* Release all active buffers */
+ return_all_buffers(skel, VB2_BUF_STATE_ERROR);
+}
+
+/*
+ * The vb2 queue ops. Note that since q->lock is set we can use the standard
+ * vb2_ops_wait_prepare/finish helper functions. If q->lock would be NULL,
+ * then this driver would have to provide these ops.
+ */
+static const struct vb2_ops skel_qops = {
+ .queue_setup = queue_setup,
+ .buf_prepare = buffer_prepare,
+ .buf_queue = buffer_queue,
+ .start_streaming = start_streaming,
+ .stop_streaming = stop_streaming,
+ .wait_prepare = vb2_ops_wait_prepare,
+ .wait_finish = vb2_ops_wait_finish,
+};
+
+/*
+ * Required ioctl querycap. Note that the version field is prefilled with
+ * the version of the kernel.
+ */
+static int skeleton_querycap(struct file *file, void *priv,
+ struct v4l2_capability *cap)
+{
+ struct skeleton *skel = video_drvdata(file);
+
+ strlcpy(cap->driver, KBUILD_MODNAME, sizeof(cap->driver));
+ strlcpy(cap->card, "V4L2 PCI Skeleton", sizeof(cap->card));
+ snprintf(cap->bus_info, sizeof(cap->bus_info), "PCI:%s",
+ pci_name(skel->pdev));
+ return 0;
+}
+
+/*
+ * Helper function to check and correct struct v4l2_pix_format. It's used
+ * not only in VIDIOC_TRY/S_FMT, but also elsewhere if changes to the SDTV
+ * standard, HDTV timings or the video input would require updating the
+ * current format.
+ */
+static void skeleton_fill_pix_format(struct skeleton *skel,
+ struct v4l2_pix_format *pix)
+{
+ pix->pixelformat = V4L2_PIX_FMT_YUYV;
+ if (skel->input == 0) {
+ /* S-Video input */
+ pix->width = 720;
+ pix->height = (skel->std & V4L2_STD_525_60) ? 480 : 576;
+ pix->field = V4L2_FIELD_INTERLACED;
+ pix->colorspace = V4L2_COLORSPACE_SMPTE170M;
+ } else {
+ /* HDMI input */
+ pix->width = skel->timings.bt.width;
+ pix->height = skel->timings.bt.height;
+ if (skel->timings.bt.interlaced) {
+ pix->field = V4L2_FIELD_ALTERNATE;
+ pix->height /= 2;
+ } else {
+ pix->field = V4L2_FIELD_NONE;
+ }
+ pix->colorspace = V4L2_COLORSPACE_REC709;
+ }
+
+ /*
+ * The YUYV format is four bytes for every two pixels, so bytesperline
+ * is width * 2.
+ */
+ pix->bytesperline = pix->width * 2;
+ pix->sizeimage = pix->bytesperline * pix->height;
+ pix->priv = 0;
+}
+
+static int skeleton_try_fmt_vid_cap(struct file *file, void *priv,
+ struct v4l2_format *f)
+{
+ struct skeleton *skel = video_drvdata(file);
+ struct v4l2_pix_format *pix = &f->fmt.pix;
+
+ /*
+ * Due to historical reasons providing try_fmt with an unsupported
+ * pixelformat will return -EINVAL for video receivers. Webcam drivers,
+ * however, will silently correct the pixelformat. Some video capture
+ * applications rely on this behavior...
+ */
+ if (pix->pixelformat != V4L2_PIX_FMT_YUYV)
+ return -EINVAL;
+ skeleton_fill_pix_format(skel, pix);
+ return 0;
+}
+
+static int skeleton_s_fmt_vid_cap(struct file *file, void *priv,
+ struct v4l2_format *f)
+{
+ struct skeleton *skel = video_drvdata(file);
+ int ret;
+
+ ret = skeleton_try_fmt_vid_cap(file, priv, f);
+ if (ret)
+ return ret;
+
+ /*
+ * It is not allowed to change the format while buffers for use with
+ * streaming have already been allocated.
+ */
+ if (vb2_is_busy(&skel->queue))
+ return -EBUSY;
+
+ /* TODO: change format */
+ skel->format = f->fmt.pix;
+ return 0;
+}
+
+static int skeleton_g_fmt_vid_cap(struct file *file, void *priv,
+ struct v4l2_format *f)
+{
+ struct skeleton *skel = video_drvdata(file);
+
+ f->fmt.pix = skel->format;
+ return 0;
+}
+
+static int skeleton_enum_fmt_vid_cap(struct file *file, void *priv,
+ struct v4l2_fmtdesc *f)
+{
+ if (f->index != 0)
+ return -EINVAL;
+
+ f->pixelformat = V4L2_PIX_FMT_YUYV;
+ return 0;
+}
+
+static int skeleton_s_std(struct file *file, void *priv, v4l2_std_id std)
+{
+ struct skeleton *skel = video_drvdata(file);
+
+ /* S_STD is not supported on the HDMI input */
+ if (skel->input)
+ return -ENODATA;
+
+ /*
+ * No change, so just return. Some applications call S_STD again after
+ * the buffers for streaming have been set up, so we have to allow for
+ * this behavior.
+ */
+ if (std == skel->std)
+ return 0;
+
+ /*
+ * Changing the standard implies a format change, which is not allowed
+ * while buffers for use with streaming have already been allocated.
+ */
+ if (vb2_is_busy(&skel->queue))
+ return -EBUSY;
+
+ /* TODO: handle changing std */
+
+ skel->std = std;
+
+ /* Update the internal format */
+ skeleton_fill_pix_format(skel, &skel->format);
+ return 0;
+}
+
+static int skeleton_g_std(struct file *file, void *priv, v4l2_std_id *std)
+{
+ struct skeleton *skel = video_drvdata(file);
+
+ /* G_STD is not supported on the HDMI input */
+ if (skel->input)
+ return -ENODATA;
+
+ *std = skel->std;
+ return 0;
+}
+
+/*
+ * Query the current standard as seen by the hardware. This function shall
+ * never actually change the standard, it just detects and reports.
+ * The framework will initially set *std to tvnorms (i.e. the set of
+ * supported standards by this input), and this function should just AND
+ * this value. If there is no signal, then *std should be set to 0.
+ */
+static int skeleton_querystd(struct file *file, void *priv, v4l2_std_id *std)
+{
+ struct skeleton *skel = video_drvdata(file);
+
+ /* QUERY_STD is not supported on the HDMI input */
+ if (skel->input)
+ return -ENODATA;
+
+#ifdef TODO
+ /*
+ * Query currently seen standard. Initial value of *std is
+ * V4L2_STD_ALL. This function should look something like this:
+ */
+ get_signal_info();
+ if (no_signal) {
+ *std = 0;
+ return 0;
+ }
+ /* Use signal information to reduce the number of possible standards */
+ if (signal_has_525_lines)
+ *std &= V4L2_STD_525_60;
+ else
+ *std &= V4L2_STD_625_50;
+#endif
+ return 0;
+}
+
+static int skeleton_s_dv_timings(struct file *file, void *_fh,
+ struct v4l2_dv_timings *timings)
+{
+ struct skeleton *skel = video_drvdata(file);
+
+ /* S_DV_TIMINGS is not supported on the S-Video input */
+ if (skel->input == 0)
+ return -ENODATA;
+
+ /* Quick sanity check */
+ if (!v4l2_valid_dv_timings(timings, &skel_timings_cap, NULL, NULL))
+ return -EINVAL;
+
+ /* Check if the timings are part of the CEA-861 timings. */
+ if (!v4l2_find_dv_timings_cap(timings, &skel_timings_cap,
+ 0, NULL, NULL))
+ return -EINVAL;
+
+ /* Return 0 if the new timings are the same as the current timings. */
+ if (v4l2_match_dv_timings(timings, &skel->timings, 0, false))
+ return 0;
+
+ /*
+ * Changing the timings implies a format change, which is not allowed
+ * while buffers for use with streaming have already been allocated.
+ */
+ if (vb2_is_busy(&skel->queue))
+ return -EBUSY;
+
+ /* TODO: Configure new timings */
+
+ /* Save timings */
+ skel->timings = *timings;
+
+ /* Update the internal format */
+ skeleton_fill_pix_format(skel, &skel->format);
+ return 0;
+}
+
+static int skeleton_g_dv_timings(struct file *file, void *_fh,
+ struct v4l2_dv_timings *timings)
+{
+ struct skeleton *skel = video_drvdata(file);
+
+ /* G_DV_TIMINGS is not supported on the S-Video input */
+ if (skel->input == 0)
+ return -ENODATA;
+
+ *timings = skel->timings;
+ return 0;
+}
+
+static int skeleton_enum_dv_timings(struct file *file, void *_fh,
+ struct v4l2_enum_dv_timings *timings)
+{
+ struct skeleton *skel = video_drvdata(file);
+
+ /* ENUM_DV_TIMINGS is not supported on the S-Video input */
+ if (skel->input == 0)
+ return -ENODATA;
+
+ return v4l2_enum_dv_timings_cap(timings, &skel_timings_cap,
+ NULL, NULL);
+}
+
+/*
+ * Query the current timings as seen by the hardware. This function shall
+ * never actually change the timings, it just detects and reports.
+ * If no signal is detected, then return -ENOLINK. If the hardware cannot
+ * lock to the signal, then return -ENOLCK. If the signal is out of range
+ * of the capabilities of the system (e.g., it is possible that the receiver
+ * can lock but that the DMA engine it is connected to cannot handle
+ * pixelclocks above a certain frequency), then -ERANGE is returned.
+ */
+static int skeleton_query_dv_timings(struct file *file, void *_fh,
+ struct v4l2_dv_timings *timings)
+{
+ struct skeleton *skel = video_drvdata(file);
+
+ /* QUERY_DV_TIMINGS is not supported on the S-Video input */
+ if (skel->input == 0)
+ return -ENODATA;
+
+#ifdef TODO
+ /*
+ * Query currently seen timings. This function should look
+ * something like this:
+ */
+ detect_timings();
+ if (no_signal)
+ return -ENOLINK;
+ if (cannot_lock_to_signal)
+ return -ENOLCK;
+ if (signal_out_of_range_of_capabilities)
+ return -ERANGE;
+
+ /* Useful for debugging */
+ v4l2_print_dv_timings(skel->v4l2_dev.name, "query_dv_timings:",
+ timings, true);
+#endif
+ return 0;
+}
+
+static int skeleton_dv_timings_cap(struct file *file, void *fh,
+ struct v4l2_dv_timings_cap *cap)
+{
+ struct skeleton *skel = video_drvdata(file);
+
+ /* DV_TIMINGS_CAP is not supported on the S-Video input */
+ if (skel->input == 0)
+ return -ENODATA;
+ *cap = skel_timings_cap;
+ return 0;
+}
+
+static int skeleton_enum_input(struct file *file, void *priv,
+ struct v4l2_input *i)
+{
+ if (i->index > 1)
+ return -EINVAL;
+
+ i->type = V4L2_INPUT_TYPE_CAMERA;
+ if (i->index == 0) {
+ i->std = SKEL_TVNORMS;
+ strlcpy(i->name, "S-Video", sizeof(i->name));
+ i->capabilities = V4L2_IN_CAP_STD;
+ } else {
+ i->std = 0;
+ strlcpy(i->name, "HDMI", sizeof(i->name));
+ i->capabilities = V4L2_IN_CAP_DV_TIMINGS;
+ }
+ return 0;
+}
+
+static int skeleton_s_input(struct file *file, void *priv, unsigned int i)
+{
+ struct skeleton *skel = video_drvdata(file);
+
+ if (i > 1)
+ return -EINVAL;
+
+ /*
+ * Changing the input implies a format change, which is not allowed
+ * while buffers for use with streaming have already been allocated.
+ */
+ if (vb2_is_busy(&skel->queue))
+ return -EBUSY;
+
+ skel->input = i;
+ /*
+ * Update tvnorms. The tvnorms value is used by the core to implement
+ * VIDIOC_ENUMSTD so it has to be correct. If tvnorms == 0, then
+ * ENUMSTD will return -ENODATA.
+ */
+ skel->vdev.tvnorms = i ? 0 : SKEL_TVNORMS;
+
+ /* Update the internal format */
+ skeleton_fill_pix_format(skel, &skel->format);
+ return 0;
+}
+
+static int skeleton_g_input(struct file *file, void *priv, unsigned int *i)
+{
+ struct skeleton *skel = video_drvdata(file);
+
+ *i = skel->input;
+ return 0;
+}
+
+/* The control handler. */
+static int skeleton_s_ctrl(struct v4l2_ctrl *ctrl)
+{
+ /*struct skeleton *skel =
+ container_of(ctrl->handler, struct skeleton, ctrl_handler);*/
+
+ switch (ctrl->id) {
+ case V4L2_CID_BRIGHTNESS:
+ /* TODO: set brightness to ctrl->val */
+ break;
+ case V4L2_CID_CONTRAST:
+ /* TODO: set contrast to ctrl->val */
+ break;
+ case V4L2_CID_SATURATION:
+ /* TODO: set saturation to ctrl->val */
+ break;
+ case V4L2_CID_HUE:
+ /* TODO: set hue to ctrl->val */
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* ------------------------------------------------------------------
+ File operations for the device
+ ------------------------------------------------------------------*/
+
+static const struct v4l2_ctrl_ops skel_ctrl_ops = {
+ .s_ctrl = skeleton_s_ctrl,
+};
+
+/*
+ * The set of all supported ioctls. Note that all the streaming ioctls
+ * use the vb2 helper functions that take care of all the locking and
+ * that also do ownership tracking (i.e. only the filehandle that requested
+ * the buffers can call the streaming ioctls, all other filehandles will
+ * receive -EBUSY if they attempt to call the same streaming ioctls).
+ *
+ * The last three ioctls also use standard helper functions: these implement
+ * standard behavior for drivers with controls.
+ */
+static const struct v4l2_ioctl_ops skel_ioctl_ops = {
+ .vidioc_querycap = skeleton_querycap,
+ .vidioc_try_fmt_vid_cap = skeleton_try_fmt_vid_cap,
+ .vidioc_s_fmt_vid_cap = skeleton_s_fmt_vid_cap,
+ .vidioc_g_fmt_vid_cap = skeleton_g_fmt_vid_cap,
+ .vidioc_enum_fmt_vid_cap = skeleton_enum_fmt_vid_cap,
+
+ .vidioc_g_std = skeleton_g_std,
+ .vidioc_s_std = skeleton_s_std,
+ .vidioc_querystd = skeleton_querystd,
+
+ .vidioc_s_dv_timings = skeleton_s_dv_timings,
+ .vidioc_g_dv_timings = skeleton_g_dv_timings,
+ .vidioc_enum_dv_timings = skeleton_enum_dv_timings,
+ .vidioc_query_dv_timings = skeleton_query_dv_timings,
+ .vidioc_dv_timings_cap = skeleton_dv_timings_cap,
+
+ .vidioc_enum_input = skeleton_enum_input,
+ .vidioc_g_input = skeleton_g_input,
+ .vidioc_s_input = skeleton_s_input,
+
+ .vidioc_reqbufs = vb2_ioctl_reqbufs,
+ .vidioc_create_bufs = vb2_ioctl_create_bufs,
+ .vidioc_querybuf = vb2_ioctl_querybuf,
+ .vidioc_qbuf = vb2_ioctl_qbuf,
+ .vidioc_dqbuf = vb2_ioctl_dqbuf,
+ .vidioc_expbuf = vb2_ioctl_expbuf,
+ .vidioc_streamon = vb2_ioctl_streamon,
+ .vidioc_streamoff = vb2_ioctl_streamoff,
+
+ .vidioc_log_status = v4l2_ctrl_log_status,
+ .vidioc_subscribe_event = v4l2_ctrl_subscribe_event,
+ .vidioc_unsubscribe_event = v4l2_event_unsubscribe,
+};
+
+/*
+ * The set of file operations. Note that all these ops are standard core
+ * helper functions.
+ */
+static const struct v4l2_file_operations skel_fops = {
+ .owner = THIS_MODULE,
+ .open = v4l2_fh_open,
+ .release = vb2_fop_release,
+ .unlocked_ioctl = video_ioctl2,
+ .read = vb2_fop_read,
+ .mmap = vb2_fop_mmap,
+ .poll = vb2_fop_poll,
+};
+
+/*
+ * The initial setup of this device instance. Note that the initial state of
+ * the driver should be complete. So the initial format, standard, timings
+ * and video input should all be initialized to some reasonable value.
+ */
+static int skeleton_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+ /* The initial timings are chosen to be 720p60. */
+ static const struct v4l2_dv_timings timings_def =
+ V4L2_DV_BT_CEA_1280X720P60;
+ struct skeleton *skel;
+ struct video_device *vdev;
+ struct v4l2_ctrl_handler *hdl;
+ struct vb2_queue *q;
+ int ret;
+
+ /* Enable PCI */
+ ret = pci_enable_device(pdev);
+ if (ret)
+ return ret;
+ ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+ if (ret) {
+ dev_err(&pdev->dev, "no suitable DMA available.\n");
+ goto disable_pci;
+ }
+
+ /* Allocate a new instance */
+ skel = devm_kzalloc(&pdev->dev, sizeof(struct skeleton), GFP_KERNEL);
+ if (!skel) {
+ ret = -ENOMEM;
+ goto disable_pci;
+ }
+
+ /* Allocate the interrupt */
+ ret = devm_request_irq(&pdev->dev, pdev->irq,
+ skeleton_irq, 0, KBUILD_MODNAME, skel);
+ if (ret) {
+ dev_err(&pdev->dev, "request_irq failed\n");
+ goto disable_pci;
+ }
+ skel->pdev = pdev;
+
+ /* Fill in the initial format-related settings */
+ skel->timings = timings_def;
+ skel->std = V4L2_STD_625_50;
+ skeleton_fill_pix_format(skel, &skel->format);
+
+ /* Initialize the top-level structure */
+ ret = v4l2_device_register(&pdev->dev, &skel->v4l2_dev);
+ if (ret)
+ goto disable_pci;
+
+ mutex_init(&skel->lock);
+
+ /* Add the controls */
+ hdl = &skel->ctrl_handler;
+ v4l2_ctrl_handler_init(hdl, 4);
+ v4l2_ctrl_new_std(hdl, &skel_ctrl_ops,
+ V4L2_CID_BRIGHTNESS, 0, 255, 1, 127);
+ v4l2_ctrl_new_std(hdl, &skel_ctrl_ops,
+ V4L2_CID_CONTRAST, 0, 255, 1, 16);
+ v4l2_ctrl_new_std(hdl, &skel_ctrl_ops,
+ V4L2_CID_SATURATION, 0, 255, 1, 127);
+ v4l2_ctrl_new_std(hdl, &skel_ctrl_ops,
+ V4L2_CID_HUE, -128, 127, 1, 0);
+ if (hdl->error) {
+ ret = hdl->error;
+ goto free_hdl;
+ }
+ skel->v4l2_dev.ctrl_handler = hdl;
+
+ /* Initialize the vb2 queue */
+ q = &skel->queue;
+ q->type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+ q->io_modes = VB2_MMAP | VB2_DMABUF | VB2_READ;
+ q->dev = &pdev->dev;
+ q->drv_priv = skel;
+ q->buf_struct_size = sizeof(struct skel_buffer);
+ q->ops = &skel_qops;
+ q->mem_ops = &vb2_dma_contig_memops;
+ q->timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC;
+ /*
+ * Assume that this DMA engine needs to have at least two buffers
+ * available before it can be started. The start_streaming() op
+ * won't be called until at least this many buffers are queued up.
+ */
+ q->min_buffers_needed = 2;
+ /*
+ * The serialization lock for the streaming ioctls. This is the same
+ * as the main serialization lock, but if some of the non-streaming
+ * ioctls could take a long time to execute, then you might want to
+ * have a different lock here to prevent VIDIOC_DQBUF from being
+ * blocked while waiting for another action to finish. This is
+ * generally not needed for PCI devices, but USB devices usually do
+ * want a separate lock here.
+ */
+ q->lock = &skel->lock;
+ /*
+ * Since this driver can only do 32-bit DMA we must make sure that
+ * the vb2 core will allocate the buffers in 32-bit DMA memory.
+ */
+ q->gfp_flags = GFP_DMA32;
+ ret = vb2_queue_init(q);
+ if (ret)
+ goto free_hdl;
+
+ INIT_LIST_HEAD(&skel->buf_list);
+ spin_lock_init(&skel->qlock);
+
+ /* Initialize the video_device structure */
+ vdev = &skel->vdev;
+ strlcpy(vdev->name, KBUILD_MODNAME, sizeof(vdev->name));
+ /*
+ * There is nothing to clean up, so release is set to an empty release
+ * function. The release callback must be non-NULL.
+ */
+ vdev->release = video_device_release_empty;
+ vdev->fops = &skel_fops,
+ vdev->ioctl_ops = &skel_ioctl_ops,
+ vdev->device_caps = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_READWRITE |
+ V4L2_CAP_STREAMING;
+ /*
+ * The main serialization lock. All ioctls are serialized by this
+ * lock. Exception: if q->lock is set, then the streaming ioctls
+ * are serialized by that separate lock.
+ */
+ vdev->lock = &skel->lock;
+ vdev->queue = q;
+ vdev->v4l2_dev = &skel->v4l2_dev;
+ /* Supported SDTV standards, if any */
+ vdev->tvnorms = SKEL_TVNORMS;
+ video_set_drvdata(vdev, skel);
+
+ ret = video_register_device(vdev, VFL_TYPE_GRABBER, -1);
+ if (ret)
+ goto free_hdl;
+
+ dev_info(&pdev->dev, "V4L2 PCI Skeleton Driver loaded\n");
+ return 0;
+
+free_hdl:
+ v4l2_ctrl_handler_free(&skel->ctrl_handler);
+ v4l2_device_unregister(&skel->v4l2_dev);
+disable_pci:
+ pci_disable_device(pdev);
+ return ret;
+}
+
+static void skeleton_remove(struct pci_dev *pdev)
+{
+ struct v4l2_device *v4l2_dev = pci_get_drvdata(pdev);
+ struct skeleton *skel = container_of(v4l2_dev, struct skeleton, v4l2_dev);
+
+ video_unregister_device(&skel->vdev);
+ v4l2_ctrl_handler_free(&skel->ctrl_handler);
+ v4l2_device_unregister(&skel->v4l2_dev);
+ pci_disable_device(skel->pdev);
+}
+
+static struct pci_driver skeleton_driver = {
+ .name = KBUILD_MODNAME,
+ .probe = skeleton_probe,
+ .remove = skeleton_remove,
+ .id_table = skeleton_pci_tbl,
+};
+
+module_pci_driver(skeleton_driver);
diff --git a/samples/vfio-mdev/Makefile b/samples/vfio-mdev/Makefile
new file mode 100644
index 000000000..7db889ca1
--- /dev/null
+++ b/samples/vfio-mdev/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_SAMPLE_VFIO_MDEV_MTTY) += mtty.o
+obj-$(CONFIG_SAMPLE_VFIO_MDEV_MDPY) += mdpy.o
+obj-$(CONFIG_SAMPLE_VFIO_MDEV_MDPY_FB) += mdpy-fb.o
+obj-$(CONFIG_SAMPLE_VFIO_MDEV_MBOCHS) += mbochs.o
diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c
new file mode 100644
index 000000000..2535c3677
--- /dev/null
+++ b/samples/vfio-mdev/mbochs.c
@@ -0,0 +1,1405 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Mediated virtual PCI display host device driver
+ *
+ * Emulate enough of qemu stdvga to make bochs-drm.ko happy. That is
+ * basically the vram memory bar and the bochs dispi interface vbe
+ * registers in the mmio register bar. Specifically it does *not*
+ * include any legacy vga stuff. Device looks a lot like "qemu -device
+ * secondary-vga".
+ *
+ * (c) Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * based on mtty driver which is:
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ * Author: Neo Jia <cjia@nvidia.com>
+ * Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/cdev.h>
+#include <linux/vfio.h>
+#include <linux/iommu.h>
+#include <linux/sysfs.h>
+#include <linux/mdev.h>
+#include <linux/pci.h>
+#include <linux/dma-buf.h>
+#include <linux/highmem.h>
+#include <drm/drm_fourcc.h>
+#include <drm/drm_rect.h>
+#include <drm/drm_modeset_lock.h>
+#include <drm/drm_property.h>
+#include <drm/drm_plane.h>
+
+
+#define VBE_DISPI_INDEX_ID 0x0
+#define VBE_DISPI_INDEX_XRES 0x1
+#define VBE_DISPI_INDEX_YRES 0x2
+#define VBE_DISPI_INDEX_BPP 0x3
+#define VBE_DISPI_INDEX_ENABLE 0x4
+#define VBE_DISPI_INDEX_BANK 0x5
+#define VBE_DISPI_INDEX_VIRT_WIDTH 0x6
+#define VBE_DISPI_INDEX_VIRT_HEIGHT 0x7
+#define VBE_DISPI_INDEX_X_OFFSET 0x8
+#define VBE_DISPI_INDEX_Y_OFFSET 0x9
+#define VBE_DISPI_INDEX_VIDEO_MEMORY_64K 0xa
+#define VBE_DISPI_INDEX_COUNT 0xb
+
+#define VBE_DISPI_ID0 0xB0C0
+#define VBE_DISPI_ID1 0xB0C1
+#define VBE_DISPI_ID2 0xB0C2
+#define VBE_DISPI_ID3 0xB0C3
+#define VBE_DISPI_ID4 0xB0C4
+#define VBE_DISPI_ID5 0xB0C5
+
+#define VBE_DISPI_DISABLED 0x00
+#define VBE_DISPI_ENABLED 0x01
+#define VBE_DISPI_GETCAPS 0x02
+#define VBE_DISPI_8BIT_DAC 0x20
+#define VBE_DISPI_LFB_ENABLED 0x40
+#define VBE_DISPI_NOCLEARMEM 0x80
+
+
+#define MBOCHS_NAME "mbochs"
+#define MBOCHS_CLASS_NAME "mbochs"
+
+#define MBOCHS_CONFIG_SPACE_SIZE 0xff
+#define MBOCHS_MMIO_BAR_OFFSET PAGE_SIZE
+#define MBOCHS_MMIO_BAR_SIZE PAGE_SIZE
+#define MBOCHS_MEMORY_BAR_OFFSET (MBOCHS_MMIO_BAR_OFFSET + \
+ MBOCHS_MMIO_BAR_SIZE)
+
+#define STORE_LE16(addr, val) (*(u16 *)addr = val)
+#define STORE_LE32(addr, val) (*(u32 *)addr = val)
+
+
+MODULE_LICENSE("GPL v2");
+
+static int max_mbytes = 256;
+module_param_named(count, max_mbytes, int, 0444);
+MODULE_PARM_DESC(mem, "megabytes available to " MBOCHS_NAME " devices");
+
+
+#define MBOCHS_TYPE_1 "small"
+#define MBOCHS_TYPE_2 "medium"
+#define MBOCHS_TYPE_3 "large"
+
+static const struct mbochs_type {
+ const char *name;
+ u32 mbytes;
+} mbochs_types[] = {
+ {
+ .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_1,
+ .mbytes = 4,
+ }, {
+ .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_2,
+ .mbytes = 16,
+ }, {
+ .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_3,
+ .mbytes = 64,
+ },
+};
+
+
+static dev_t mbochs_devt;
+static struct class *mbochs_class;
+static struct cdev mbochs_cdev;
+static struct device mbochs_dev;
+static int mbochs_used_mbytes;
+
+struct mbochs_mode {
+ u32 drm_format;
+ u32 bytepp;
+ u32 width;
+ u32 height;
+ u32 stride;
+ u32 __pad;
+ u64 offset;
+ u64 size;
+};
+
+struct mbochs_dmabuf {
+ struct mbochs_mode mode;
+ u32 id;
+ struct page **pages;
+ pgoff_t pagecount;
+ struct dma_buf *buf;
+ struct mdev_state *mdev_state;
+ struct list_head next;
+ bool unlinked;
+};
+
+/* State of each mdev device */
+struct mdev_state {
+ u8 *vconfig;
+ u64 bar_mask[3];
+ u32 memory_bar_mask;
+ struct mutex ops_lock;
+ struct mdev_device *mdev;
+ struct vfio_device_info dev_info;
+
+ const struct mbochs_type *type;
+ u16 vbe[VBE_DISPI_INDEX_COUNT];
+ u64 memsize;
+ struct page **pages;
+ pgoff_t pagecount;
+
+ struct list_head dmabufs;
+ u32 active_id;
+ u32 next_id;
+};
+
+static const char *vbe_name_list[VBE_DISPI_INDEX_COUNT] = {
+ [VBE_DISPI_INDEX_ID] = "id",
+ [VBE_DISPI_INDEX_XRES] = "xres",
+ [VBE_DISPI_INDEX_YRES] = "yres",
+ [VBE_DISPI_INDEX_BPP] = "bpp",
+ [VBE_DISPI_INDEX_ENABLE] = "enable",
+ [VBE_DISPI_INDEX_BANK] = "bank",
+ [VBE_DISPI_INDEX_VIRT_WIDTH] = "virt-width",
+ [VBE_DISPI_INDEX_VIRT_HEIGHT] = "virt-height",
+ [VBE_DISPI_INDEX_X_OFFSET] = "x-offset",
+ [VBE_DISPI_INDEX_Y_OFFSET] = "y-offset",
+ [VBE_DISPI_INDEX_VIDEO_MEMORY_64K] = "video-mem",
+};
+
+static const char *vbe_name(u32 index)
+{
+ if (index < ARRAY_SIZE(vbe_name_list))
+ return vbe_name_list[index];
+ return "(invalid)";
+}
+
+static struct page *__mbochs_get_page(struct mdev_state *mdev_state,
+ pgoff_t pgoff);
+static struct page *mbochs_get_page(struct mdev_state *mdev_state,
+ pgoff_t pgoff);
+
+static const struct mbochs_type *mbochs_find_type(struct kobject *kobj)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(mbochs_types); i++)
+ if (strcmp(mbochs_types[i].name, kobj->name) == 0)
+ return mbochs_types + i;
+ return NULL;
+}
+
+static void mbochs_create_config_space(struct mdev_state *mdev_state)
+{
+ STORE_LE16((u16 *) &mdev_state->vconfig[PCI_VENDOR_ID],
+ 0x1234);
+ STORE_LE16((u16 *) &mdev_state->vconfig[PCI_DEVICE_ID],
+ 0x1111);
+ STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_VENDOR_ID],
+ PCI_SUBVENDOR_ID_REDHAT_QUMRANET);
+ STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_ID],
+ PCI_SUBDEVICE_ID_QEMU);
+
+ STORE_LE16((u16 *) &mdev_state->vconfig[PCI_COMMAND],
+ PCI_COMMAND_IO | PCI_COMMAND_MEMORY);
+ STORE_LE16((u16 *) &mdev_state->vconfig[PCI_CLASS_DEVICE],
+ PCI_CLASS_DISPLAY_OTHER);
+ mdev_state->vconfig[PCI_CLASS_REVISION] = 0x01;
+
+ STORE_LE32((u32 *) &mdev_state->vconfig[PCI_BASE_ADDRESS_0],
+ PCI_BASE_ADDRESS_SPACE_MEMORY |
+ PCI_BASE_ADDRESS_MEM_TYPE_32 |
+ PCI_BASE_ADDRESS_MEM_PREFETCH);
+ mdev_state->bar_mask[0] = ~(mdev_state->memsize) + 1;
+
+ STORE_LE32((u32 *) &mdev_state->vconfig[PCI_BASE_ADDRESS_2],
+ PCI_BASE_ADDRESS_SPACE_MEMORY |
+ PCI_BASE_ADDRESS_MEM_TYPE_32);
+ mdev_state->bar_mask[2] = ~(MBOCHS_MMIO_BAR_SIZE) + 1;
+}
+
+static int mbochs_check_framebuffer(struct mdev_state *mdev_state,
+ struct mbochs_mode *mode)
+{
+ struct device *dev = mdev_dev(mdev_state->mdev);
+ u16 *vbe = mdev_state->vbe;
+ u32 virt_width;
+
+ WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
+
+ if (!(vbe[VBE_DISPI_INDEX_ENABLE] & VBE_DISPI_ENABLED))
+ goto nofb;
+
+ memset(mode, 0, sizeof(*mode));
+ switch (vbe[VBE_DISPI_INDEX_BPP]) {
+ case 32:
+ mode->drm_format = DRM_FORMAT_XRGB8888;
+ mode->bytepp = 4;
+ break;
+ default:
+ dev_info_ratelimited(dev, "%s: bpp %d not supported\n",
+ __func__, vbe[VBE_DISPI_INDEX_BPP]);
+ goto nofb;
+ }
+
+ mode->width = vbe[VBE_DISPI_INDEX_XRES];
+ mode->height = vbe[VBE_DISPI_INDEX_YRES];
+ virt_width = vbe[VBE_DISPI_INDEX_VIRT_WIDTH];
+ if (virt_width < mode->width)
+ virt_width = mode->width;
+ mode->stride = virt_width * mode->bytepp;
+ mode->size = (u64)mode->stride * mode->height;
+ mode->offset = ((u64)vbe[VBE_DISPI_INDEX_X_OFFSET] * mode->bytepp +
+ (u64)vbe[VBE_DISPI_INDEX_Y_OFFSET] * mode->stride);
+
+ if (mode->width < 64 || mode->height < 64) {
+ dev_info_ratelimited(dev, "%s: invalid resolution %dx%d\n",
+ __func__, mode->width, mode->height);
+ goto nofb;
+ }
+ if (mode->offset + mode->size > mdev_state->memsize) {
+ dev_info_ratelimited(dev, "%s: framebuffer memory overflow\n",
+ __func__);
+ goto nofb;
+ }
+
+ return 0;
+
+nofb:
+ memset(mode, 0, sizeof(*mode));
+ return -EINVAL;
+}
+
+static bool mbochs_modes_equal(struct mbochs_mode *mode1,
+ struct mbochs_mode *mode2)
+{
+ return memcmp(mode1, mode2, sizeof(struct mbochs_mode)) == 0;
+}
+
+static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset,
+ char *buf, u32 count)
+{
+ struct device *dev = mdev_dev(mdev_state->mdev);
+ int index = (offset - PCI_BASE_ADDRESS_0) / 0x04;
+ u32 cfg_addr;
+
+ switch (offset) {
+ case PCI_BASE_ADDRESS_0:
+ case PCI_BASE_ADDRESS_2:
+ cfg_addr = *(u32 *)buf;
+
+ if (cfg_addr == 0xffffffff) {
+ cfg_addr = (cfg_addr & mdev_state->bar_mask[index]);
+ } else {
+ cfg_addr &= PCI_BASE_ADDRESS_MEM_MASK;
+ if (cfg_addr)
+ dev_info(dev, "BAR #%d @ 0x%x\n",
+ index, cfg_addr);
+ }
+
+ cfg_addr |= (mdev_state->vconfig[offset] &
+ ~PCI_BASE_ADDRESS_MEM_MASK);
+ STORE_LE32(&mdev_state->vconfig[offset], cfg_addr);
+ break;
+ }
+}
+
+static void handle_mmio_write(struct mdev_state *mdev_state, u16 offset,
+ char *buf, u32 count)
+{
+ struct device *dev = mdev_dev(mdev_state->mdev);
+ int index;
+ u16 reg16;
+
+ switch (offset) {
+ case 0x400 ... 0x41f: /* vga ioports remapped */
+ goto unhandled;
+ case 0x500 ... 0x515: /* bochs dispi interface */
+ if (count != 2)
+ goto unhandled;
+ index = (offset - 0x500) / 2;
+ reg16 = *(u16 *)buf;
+ if (index < ARRAY_SIZE(mdev_state->vbe))
+ mdev_state->vbe[index] = reg16;
+ dev_dbg(dev, "%s: vbe write %d = %d (%s)\n",
+ __func__, index, reg16, vbe_name(index));
+ break;
+ case 0x600 ... 0x607: /* qemu extended regs */
+ goto unhandled;
+ default:
+unhandled:
+ dev_dbg(dev, "%s: @0x%03x, count %d (unhandled)\n",
+ __func__, offset, count);
+ break;
+ }
+}
+
+static void handle_mmio_read(struct mdev_state *mdev_state, u16 offset,
+ char *buf, u32 count)
+{
+ struct device *dev = mdev_dev(mdev_state->mdev);
+ u16 reg16 = 0;
+ int index;
+
+ switch (offset) {
+ case 0x500 ... 0x515: /* bochs dispi interface */
+ if (count != 2)
+ goto unhandled;
+ index = (offset - 0x500) / 2;
+ if (index < ARRAY_SIZE(mdev_state->vbe))
+ reg16 = mdev_state->vbe[index];
+ dev_dbg(dev, "%s: vbe read %d = %d (%s)\n",
+ __func__, index, reg16, vbe_name(index));
+ *(u16 *)buf = reg16;
+ break;
+ default:
+unhandled:
+ dev_dbg(dev, "%s: @0x%03x, count %d (unhandled)\n",
+ __func__, offset, count);
+ memset(buf, 0, count);
+ break;
+ }
+}
+
+static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count,
+ loff_t pos, bool is_write)
+{
+ struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+ struct device *dev = mdev_dev(mdev);
+ struct page *pg;
+ loff_t poff;
+ char *map;
+ int ret = 0;
+
+ mutex_lock(&mdev_state->ops_lock);
+
+ if (pos < MBOCHS_CONFIG_SPACE_SIZE) {
+ if (is_write)
+ handle_pci_cfg_write(mdev_state, pos, buf, count);
+ else
+ memcpy(buf, (mdev_state->vconfig + pos), count);
+
+ } else if (pos >= MBOCHS_MMIO_BAR_OFFSET &&
+ pos + count <= MBOCHS_MEMORY_BAR_OFFSET) {
+ pos -= MBOCHS_MMIO_BAR_OFFSET;
+ if (is_write)
+ handle_mmio_write(mdev_state, pos, buf, count);
+ else
+ handle_mmio_read(mdev_state, pos, buf, count);
+
+ } else if (pos >= MBOCHS_MEMORY_BAR_OFFSET &&
+ pos + count <=
+ MBOCHS_MEMORY_BAR_OFFSET + mdev_state->memsize) {
+ pos -= MBOCHS_MMIO_BAR_OFFSET;
+ poff = pos & ~PAGE_MASK;
+ pg = __mbochs_get_page(mdev_state, pos >> PAGE_SHIFT);
+ map = kmap(pg);
+ if (is_write)
+ memcpy(map + poff, buf, count);
+ else
+ memcpy(buf, map + poff, count);
+ kunmap(pg);
+ put_page(pg);
+
+ } else {
+ dev_dbg(dev, "%s: %s @0x%llx (unhandled)\n",
+ __func__, is_write ? "WR" : "RD", pos);
+ ret = -1;
+ goto accessfailed;
+ }
+
+ ret = count;
+
+
+accessfailed:
+ mutex_unlock(&mdev_state->ops_lock);
+
+ return ret;
+}
+
+static int mbochs_reset(struct mdev_device *mdev)
+{
+ struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+ u32 size64k = mdev_state->memsize / (64 * 1024);
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(mdev_state->vbe); i++)
+ mdev_state->vbe[i] = 0;
+ mdev_state->vbe[VBE_DISPI_INDEX_ID] = VBE_DISPI_ID5;
+ mdev_state->vbe[VBE_DISPI_INDEX_VIDEO_MEMORY_64K] = size64k;
+ return 0;
+}
+
+static int mbochs_create(struct kobject *kobj, struct mdev_device *mdev)
+{
+ const struct mbochs_type *type = mbochs_find_type(kobj);
+ struct device *dev = mdev_dev(mdev);
+ struct mdev_state *mdev_state;
+
+ if (!type)
+ type = &mbochs_types[0];
+ if (type->mbytes + mbochs_used_mbytes > max_mbytes)
+ return -ENOMEM;
+
+ mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL);
+ if (mdev_state == NULL)
+ return -ENOMEM;
+
+ mdev_state->vconfig = kzalloc(MBOCHS_CONFIG_SPACE_SIZE, GFP_KERNEL);
+ if (mdev_state->vconfig == NULL)
+ goto err_mem;
+
+ mdev_state->memsize = type->mbytes * 1024 * 1024;
+ mdev_state->pagecount = mdev_state->memsize >> PAGE_SHIFT;
+ mdev_state->pages = kcalloc(mdev_state->pagecount,
+ sizeof(struct page *),
+ GFP_KERNEL);
+ if (!mdev_state->pages)
+ goto err_mem;
+
+ dev_info(dev, "%s: %s, %d MB, %ld pages\n", __func__,
+ kobj->name, type->mbytes, mdev_state->pagecount);
+
+ mutex_init(&mdev_state->ops_lock);
+ mdev_state->mdev = mdev;
+ mdev_set_drvdata(mdev, mdev_state);
+ INIT_LIST_HEAD(&mdev_state->dmabufs);
+ mdev_state->next_id = 1;
+
+ mdev_state->type = type;
+ mbochs_create_config_space(mdev_state);
+ mbochs_reset(mdev);
+
+ mbochs_used_mbytes += type->mbytes;
+ return 0;
+
+err_mem:
+ kfree(mdev_state->vconfig);
+ kfree(mdev_state);
+ return -ENOMEM;
+}
+
+static int mbochs_remove(struct mdev_device *mdev)
+{
+ struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+
+ mbochs_used_mbytes -= mdev_state->type->mbytes;
+ mdev_set_drvdata(mdev, NULL);
+ kfree(mdev_state->pages);
+ kfree(mdev_state->vconfig);
+ kfree(mdev_state);
+ return 0;
+}
+
+static ssize_t mbochs_read(struct mdev_device *mdev, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned int done = 0;
+ int ret;
+
+ while (count) {
+ size_t filled;
+
+ if (count >= 4 && !(*ppos % 4)) {
+ u32 val;
+
+ ret = mdev_access(mdev, (char *)&val, sizeof(val),
+ *ppos, false);
+ if (ret <= 0)
+ goto read_err;
+
+ if (copy_to_user(buf, &val, sizeof(val)))
+ goto read_err;
+
+ filled = 4;
+ } else if (count >= 2 && !(*ppos % 2)) {
+ u16 val;
+
+ ret = mdev_access(mdev, (char *)&val, sizeof(val),
+ *ppos, false);
+ if (ret <= 0)
+ goto read_err;
+
+ if (copy_to_user(buf, &val, sizeof(val)))
+ goto read_err;
+
+ filled = 2;
+ } else {
+ u8 val;
+
+ ret = mdev_access(mdev, (char *)&val, sizeof(val),
+ *ppos, false);
+ if (ret <= 0)
+ goto read_err;
+
+ if (copy_to_user(buf, &val, sizeof(val)))
+ goto read_err;
+
+ filled = 1;
+ }
+
+ count -= filled;
+ done += filled;
+ *ppos += filled;
+ buf += filled;
+ }
+
+ return done;
+
+read_err:
+ return -EFAULT;
+}
+
+static ssize_t mbochs_write(struct mdev_device *mdev, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned int done = 0;
+ int ret;
+
+ while (count) {
+ size_t filled;
+
+ if (count >= 4 && !(*ppos % 4)) {
+ u32 val;
+
+ if (copy_from_user(&val, buf, sizeof(val)))
+ goto write_err;
+
+ ret = mdev_access(mdev, (char *)&val, sizeof(val),
+ *ppos, true);
+ if (ret <= 0)
+ goto write_err;
+
+ filled = 4;
+ } else if (count >= 2 && !(*ppos % 2)) {
+ u16 val;
+
+ if (copy_from_user(&val, buf, sizeof(val)))
+ goto write_err;
+
+ ret = mdev_access(mdev, (char *)&val, sizeof(val),
+ *ppos, true);
+ if (ret <= 0)
+ goto write_err;
+
+ filled = 2;
+ } else {
+ u8 val;
+
+ if (copy_from_user(&val, buf, sizeof(val)))
+ goto write_err;
+
+ ret = mdev_access(mdev, (char *)&val, sizeof(val),
+ *ppos, true);
+ if (ret <= 0)
+ goto write_err;
+
+ filled = 1;
+ }
+ count -= filled;
+ done += filled;
+ *ppos += filled;
+ buf += filled;
+ }
+
+ return done;
+write_err:
+ return -EFAULT;
+}
+
+static struct page *__mbochs_get_page(struct mdev_state *mdev_state,
+ pgoff_t pgoff)
+{
+ WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
+
+ if (!mdev_state->pages[pgoff]) {
+ mdev_state->pages[pgoff] =
+ alloc_pages(GFP_HIGHUSER | __GFP_ZERO, 0);
+ if (!mdev_state->pages[pgoff])
+ return NULL;
+ }
+
+ get_page(mdev_state->pages[pgoff]);
+ return mdev_state->pages[pgoff];
+}
+
+static struct page *mbochs_get_page(struct mdev_state *mdev_state,
+ pgoff_t pgoff)
+{
+ struct page *page;
+
+ if (WARN_ON(pgoff >= mdev_state->pagecount))
+ return NULL;
+
+ mutex_lock(&mdev_state->ops_lock);
+ page = __mbochs_get_page(mdev_state, pgoff);
+ mutex_unlock(&mdev_state->ops_lock);
+
+ return page;
+}
+
+static void mbochs_put_pages(struct mdev_state *mdev_state)
+{
+ struct device *dev = mdev_dev(mdev_state->mdev);
+ int i, count = 0;
+
+ WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
+
+ for (i = 0; i < mdev_state->pagecount; i++) {
+ if (!mdev_state->pages[i])
+ continue;
+ put_page(mdev_state->pages[i]);
+ mdev_state->pages[i] = NULL;
+ count++;
+ }
+ dev_dbg(dev, "%s: %d pages released\n", __func__, count);
+}
+
+static vm_fault_t mbochs_region_vm_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct mdev_state *mdev_state = vma->vm_private_data;
+ pgoff_t page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
+
+ if (page_offset >= mdev_state->pagecount)
+ return VM_FAULT_SIGBUS;
+
+ vmf->page = mbochs_get_page(mdev_state, page_offset);
+ if (!vmf->page)
+ return VM_FAULT_SIGBUS;
+
+ return 0;
+}
+
+static const struct vm_operations_struct mbochs_region_vm_ops = {
+ .fault = mbochs_region_vm_fault,
+};
+
+static int mbochs_mmap(struct mdev_device *mdev, struct vm_area_struct *vma)
+{
+ struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+
+ if (vma->vm_pgoff != MBOCHS_MEMORY_BAR_OFFSET >> PAGE_SHIFT)
+ return -EINVAL;
+ if (vma->vm_end < vma->vm_start)
+ return -EINVAL;
+ if (vma->vm_end - vma->vm_start > mdev_state->memsize)
+ return -EINVAL;
+ if ((vma->vm_flags & VM_SHARED) == 0)
+ return -EINVAL;
+
+ vma->vm_ops = &mbochs_region_vm_ops;
+ vma->vm_private_data = mdev_state;
+ return 0;
+}
+
+static vm_fault_t mbochs_dmabuf_vm_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct mbochs_dmabuf *dmabuf = vma->vm_private_data;
+
+ if (WARN_ON(vmf->pgoff >= dmabuf->pagecount))
+ return VM_FAULT_SIGBUS;
+
+ vmf->page = dmabuf->pages[vmf->pgoff];
+ get_page(vmf->page);
+ return 0;
+}
+
+static const struct vm_operations_struct mbochs_dmabuf_vm_ops = {
+ .fault = mbochs_dmabuf_vm_fault,
+};
+
+static int mbochs_mmap_dmabuf(struct dma_buf *buf, struct vm_area_struct *vma)
+{
+ struct mbochs_dmabuf *dmabuf = buf->priv;
+ struct device *dev = mdev_dev(dmabuf->mdev_state->mdev);
+
+ dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
+
+ if ((vma->vm_flags & VM_SHARED) == 0)
+ return -EINVAL;
+
+ vma->vm_ops = &mbochs_dmabuf_vm_ops;
+ vma->vm_private_data = dmabuf;
+ return 0;
+}
+
+static void mbochs_print_dmabuf(struct mbochs_dmabuf *dmabuf,
+ const char *prefix)
+{
+ struct device *dev = mdev_dev(dmabuf->mdev_state->mdev);
+ u32 fourcc = dmabuf->mode.drm_format;
+
+ dev_dbg(dev, "%s/%d: %c%c%c%c, %dx%d, stride %d, off 0x%llx, size 0x%llx, pages %ld\n",
+ prefix, dmabuf->id,
+ fourcc ? ((fourcc >> 0) & 0xff) : '-',
+ fourcc ? ((fourcc >> 8) & 0xff) : '-',
+ fourcc ? ((fourcc >> 16) & 0xff) : '-',
+ fourcc ? ((fourcc >> 24) & 0xff) : '-',
+ dmabuf->mode.width, dmabuf->mode.height, dmabuf->mode.stride,
+ dmabuf->mode.offset, dmabuf->mode.size, dmabuf->pagecount);
+}
+
+static struct sg_table *mbochs_map_dmabuf(struct dma_buf_attachment *at,
+ enum dma_data_direction direction)
+{
+ struct mbochs_dmabuf *dmabuf = at->dmabuf->priv;
+ struct device *dev = mdev_dev(dmabuf->mdev_state->mdev);
+ struct sg_table *sg;
+
+ dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
+
+ sg = kzalloc(sizeof(*sg), GFP_KERNEL);
+ if (!sg)
+ goto err1;
+ if (sg_alloc_table_from_pages(sg, dmabuf->pages, dmabuf->pagecount,
+ 0, dmabuf->mode.size, GFP_KERNEL) < 0)
+ goto err2;
+ if (!dma_map_sg(at->dev, sg->sgl, sg->nents, direction))
+ goto err3;
+
+ return sg;
+
+err3:
+ sg_free_table(sg);
+err2:
+ kfree(sg);
+err1:
+ return ERR_PTR(-ENOMEM);
+}
+
+static void mbochs_unmap_dmabuf(struct dma_buf_attachment *at,
+ struct sg_table *sg,
+ enum dma_data_direction direction)
+{
+ struct mbochs_dmabuf *dmabuf = at->dmabuf->priv;
+ struct device *dev = mdev_dev(dmabuf->mdev_state->mdev);
+
+ dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
+
+ sg_free_table(sg);
+ kfree(sg);
+}
+
+static void mbochs_release_dmabuf(struct dma_buf *buf)
+{
+ struct mbochs_dmabuf *dmabuf = buf->priv;
+ struct mdev_state *mdev_state = dmabuf->mdev_state;
+ struct device *dev = mdev_dev(mdev_state->mdev);
+ pgoff_t pg;
+
+ dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
+
+ for (pg = 0; pg < dmabuf->pagecount; pg++)
+ put_page(dmabuf->pages[pg]);
+
+ mutex_lock(&mdev_state->ops_lock);
+ dmabuf->buf = NULL;
+ if (dmabuf->unlinked)
+ kfree(dmabuf);
+ mutex_unlock(&mdev_state->ops_lock);
+}
+
+static void *mbochs_kmap_dmabuf(struct dma_buf *buf, unsigned long page_num)
+{
+ struct mbochs_dmabuf *dmabuf = buf->priv;
+ struct page *page = dmabuf->pages[page_num];
+
+ return kmap(page);
+}
+
+static void mbochs_kunmap_dmabuf(struct dma_buf *buf, unsigned long page_num,
+ void *vaddr)
+{
+ kunmap(vaddr);
+}
+
+static struct dma_buf_ops mbochs_dmabuf_ops = {
+ .map_dma_buf = mbochs_map_dmabuf,
+ .unmap_dma_buf = mbochs_unmap_dmabuf,
+ .release = mbochs_release_dmabuf,
+ .map = mbochs_kmap_dmabuf,
+ .unmap = mbochs_kunmap_dmabuf,
+ .mmap = mbochs_mmap_dmabuf,
+};
+
+static struct mbochs_dmabuf *mbochs_dmabuf_alloc(struct mdev_state *mdev_state,
+ struct mbochs_mode *mode)
+{
+ struct mbochs_dmabuf *dmabuf;
+ pgoff_t page_offset, pg;
+
+ WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
+
+ dmabuf = kzalloc(sizeof(struct mbochs_dmabuf), GFP_KERNEL);
+ if (!dmabuf)
+ return NULL;
+
+ dmabuf->mode = *mode;
+ dmabuf->id = mdev_state->next_id++;
+ dmabuf->pagecount = DIV_ROUND_UP(mode->size, PAGE_SIZE);
+ dmabuf->pages = kcalloc(dmabuf->pagecount, sizeof(struct page *),
+ GFP_KERNEL);
+ if (!dmabuf->pages)
+ goto err_free_dmabuf;
+
+ page_offset = dmabuf->mode.offset >> PAGE_SHIFT;
+ for (pg = 0; pg < dmabuf->pagecount; pg++) {
+ dmabuf->pages[pg] = __mbochs_get_page(mdev_state,
+ page_offset + pg);
+ if (!dmabuf->pages[pg])
+ goto err_free_pages;
+ }
+
+ dmabuf->mdev_state = mdev_state;
+ list_add(&dmabuf->next, &mdev_state->dmabufs);
+
+ mbochs_print_dmabuf(dmabuf, __func__);
+ return dmabuf;
+
+err_free_pages:
+ while (pg > 0)
+ put_page(dmabuf->pages[--pg]);
+ kfree(dmabuf->pages);
+err_free_dmabuf:
+ kfree(dmabuf);
+ return NULL;
+}
+
+static struct mbochs_dmabuf *
+mbochs_dmabuf_find_by_mode(struct mdev_state *mdev_state,
+ struct mbochs_mode *mode)
+{
+ struct mbochs_dmabuf *dmabuf;
+
+ WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
+
+ list_for_each_entry(dmabuf, &mdev_state->dmabufs, next)
+ if (mbochs_modes_equal(&dmabuf->mode, mode))
+ return dmabuf;
+
+ return NULL;
+}
+
+static struct mbochs_dmabuf *
+mbochs_dmabuf_find_by_id(struct mdev_state *mdev_state, u32 id)
+{
+ struct mbochs_dmabuf *dmabuf;
+
+ WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
+
+ list_for_each_entry(dmabuf, &mdev_state->dmabufs, next)
+ if (dmabuf->id == id)
+ return dmabuf;
+
+ return NULL;
+}
+
+static int mbochs_dmabuf_export(struct mbochs_dmabuf *dmabuf)
+{
+ struct mdev_state *mdev_state = dmabuf->mdev_state;
+ struct device *dev = mdev_dev(mdev_state->mdev);
+ DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+ struct dma_buf *buf;
+
+ WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
+
+ if (!IS_ALIGNED(dmabuf->mode.offset, PAGE_SIZE)) {
+ dev_info_ratelimited(dev, "%s: framebuffer not page-aligned\n",
+ __func__);
+ return -EINVAL;
+ }
+
+ exp_info.ops = &mbochs_dmabuf_ops;
+ exp_info.size = dmabuf->mode.size;
+ exp_info.priv = dmabuf;
+
+ buf = dma_buf_export(&exp_info);
+ if (IS_ERR(buf)) {
+ dev_info_ratelimited(dev, "%s: dma_buf_export failed: %ld\n",
+ __func__, PTR_ERR(buf));
+ return PTR_ERR(buf);
+ }
+
+ dmabuf->buf = buf;
+ dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
+ return 0;
+}
+
+static int mbochs_get_region_info(struct mdev_device *mdev,
+ struct vfio_region_info *region_info,
+ u16 *cap_type_id, void **cap_type)
+{
+ struct mdev_state *mdev_state;
+
+ mdev_state = mdev_get_drvdata(mdev);
+ if (!mdev_state)
+ return -EINVAL;
+
+ if (region_info->index >= VFIO_PCI_NUM_REGIONS)
+ return -EINVAL;
+
+ switch (region_info->index) {
+ case VFIO_PCI_CONFIG_REGION_INDEX:
+ region_info->offset = 0;
+ region_info->size = MBOCHS_CONFIG_SPACE_SIZE;
+ region_info->flags = (VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE);
+ break;
+ case VFIO_PCI_BAR0_REGION_INDEX:
+ region_info->offset = MBOCHS_MEMORY_BAR_OFFSET;
+ region_info->size = mdev_state->memsize;
+ region_info->flags = (VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE |
+ VFIO_REGION_INFO_FLAG_MMAP);
+ break;
+ case VFIO_PCI_BAR2_REGION_INDEX:
+ region_info->offset = MBOCHS_MMIO_BAR_OFFSET;
+ region_info->size = MBOCHS_MMIO_BAR_SIZE;
+ region_info->flags = (VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE);
+ break;
+ default:
+ region_info->size = 0;
+ region_info->offset = 0;
+ region_info->flags = 0;
+ }
+
+ return 0;
+}
+
+static int mbochs_get_irq_info(struct mdev_device *mdev,
+ struct vfio_irq_info *irq_info)
+{
+ irq_info->count = 0;
+ return 0;
+}
+
+static int mbochs_get_device_info(struct mdev_device *mdev,
+ struct vfio_device_info *dev_info)
+{
+ dev_info->flags = VFIO_DEVICE_FLAGS_PCI;
+ dev_info->num_regions = VFIO_PCI_NUM_REGIONS;
+ dev_info->num_irqs = VFIO_PCI_NUM_IRQS;
+ return 0;
+}
+
+static int mbochs_query_gfx_plane(struct mdev_device *mdev,
+ struct vfio_device_gfx_plane_info *plane)
+{
+ struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+ struct device *dev = mdev_dev(mdev);
+ struct mbochs_dmabuf *dmabuf;
+ struct mbochs_mode mode;
+ int ret;
+
+ if (plane->flags & VFIO_GFX_PLANE_TYPE_PROBE) {
+ if (plane->flags == (VFIO_GFX_PLANE_TYPE_PROBE |
+ VFIO_GFX_PLANE_TYPE_DMABUF))
+ return 0;
+ return -EINVAL;
+ }
+
+ if (plane->flags != VFIO_GFX_PLANE_TYPE_DMABUF)
+ return -EINVAL;
+
+ plane->drm_format_mod = 0;
+ plane->x_pos = 0;
+ plane->y_pos = 0;
+ plane->x_hot = 0;
+ plane->y_hot = 0;
+
+ mutex_lock(&mdev_state->ops_lock);
+
+ ret = -EINVAL;
+ if (plane->drm_plane_type == DRM_PLANE_TYPE_PRIMARY)
+ ret = mbochs_check_framebuffer(mdev_state, &mode);
+ if (ret < 0) {
+ plane->drm_format = 0;
+ plane->width = 0;
+ plane->height = 0;
+ plane->stride = 0;
+ plane->size = 0;
+ plane->dmabuf_id = 0;
+ goto done;
+ }
+
+ dmabuf = mbochs_dmabuf_find_by_mode(mdev_state, &mode);
+ if (!dmabuf)
+ mbochs_dmabuf_alloc(mdev_state, &mode);
+ if (!dmabuf) {
+ mutex_unlock(&mdev_state->ops_lock);
+ return -ENOMEM;
+ }
+
+ plane->drm_format = dmabuf->mode.drm_format;
+ plane->width = dmabuf->mode.width;
+ plane->height = dmabuf->mode.height;
+ plane->stride = dmabuf->mode.stride;
+ plane->size = dmabuf->mode.size;
+ plane->dmabuf_id = dmabuf->id;
+
+done:
+ if (plane->drm_plane_type == DRM_PLANE_TYPE_PRIMARY &&
+ mdev_state->active_id != plane->dmabuf_id) {
+ dev_dbg(dev, "%s: primary: %d => %d\n", __func__,
+ mdev_state->active_id, plane->dmabuf_id);
+ mdev_state->active_id = plane->dmabuf_id;
+ }
+ mutex_unlock(&mdev_state->ops_lock);
+ return 0;
+}
+
+static int mbochs_get_gfx_dmabuf(struct mdev_device *mdev,
+ u32 id)
+{
+ struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+ struct mbochs_dmabuf *dmabuf;
+
+ mutex_lock(&mdev_state->ops_lock);
+
+ dmabuf = mbochs_dmabuf_find_by_id(mdev_state, id);
+ if (!dmabuf) {
+ mutex_unlock(&mdev_state->ops_lock);
+ return -ENOENT;
+ }
+
+ if (!dmabuf->buf)
+ mbochs_dmabuf_export(dmabuf);
+
+ mutex_unlock(&mdev_state->ops_lock);
+
+ if (!dmabuf->buf)
+ return -EINVAL;
+
+ return dma_buf_fd(dmabuf->buf, 0);
+}
+
+static long mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd,
+ unsigned long arg)
+{
+ int ret = 0;
+ unsigned long minsz;
+ struct mdev_state *mdev_state;
+
+ mdev_state = mdev_get_drvdata(mdev);
+
+ switch (cmd) {
+ case VFIO_DEVICE_GET_INFO:
+ {
+ struct vfio_device_info info;
+
+ minsz = offsetofend(struct vfio_device_info, num_irqs);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ ret = mbochs_get_device_info(mdev, &info);
+ if (ret)
+ return ret;
+
+ memcpy(&mdev_state->dev_info, &info, sizeof(info));
+
+ if (copy_to_user((void __user *)arg, &info, minsz))
+ return -EFAULT;
+
+ return 0;
+ }
+ case VFIO_DEVICE_GET_REGION_INFO:
+ {
+ struct vfio_region_info info;
+ u16 cap_type_id = 0;
+ void *cap_type = NULL;
+
+ minsz = offsetofend(struct vfio_region_info, offset);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ ret = mbochs_get_region_info(mdev, &info, &cap_type_id,
+ &cap_type);
+ if (ret)
+ return ret;
+
+ if (copy_to_user((void __user *)arg, &info, minsz))
+ return -EFAULT;
+
+ return 0;
+ }
+
+ case VFIO_DEVICE_GET_IRQ_INFO:
+ {
+ struct vfio_irq_info info;
+
+ minsz = offsetofend(struct vfio_irq_info, count);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if ((info.argsz < minsz) ||
+ (info.index >= mdev_state->dev_info.num_irqs))
+ return -EINVAL;
+
+ ret = mbochs_get_irq_info(mdev, &info);
+ if (ret)
+ return ret;
+
+ if (copy_to_user((void __user *)arg, &info, minsz))
+ return -EFAULT;
+
+ return 0;
+ }
+
+ case VFIO_DEVICE_QUERY_GFX_PLANE:
+ {
+ struct vfio_device_gfx_plane_info plane;
+
+ minsz = offsetofend(struct vfio_device_gfx_plane_info,
+ region_index);
+
+ if (copy_from_user(&plane, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (plane.argsz < minsz)
+ return -EINVAL;
+
+ ret = mbochs_query_gfx_plane(mdev, &plane);
+ if (ret)
+ return ret;
+
+ if (copy_to_user((void __user *)arg, &plane, minsz))
+ return -EFAULT;
+
+ return 0;
+ }
+
+ case VFIO_DEVICE_GET_GFX_DMABUF:
+ {
+ u32 dmabuf_id;
+
+ if (get_user(dmabuf_id, (__u32 __user *)arg))
+ return -EFAULT;
+
+ return mbochs_get_gfx_dmabuf(mdev, dmabuf_id);
+ }
+
+ case VFIO_DEVICE_SET_IRQS:
+ return -EINVAL;
+
+ case VFIO_DEVICE_RESET:
+ return mbochs_reset(mdev);
+ }
+ return -ENOTTY;
+}
+
+static int mbochs_open(struct mdev_device *mdev)
+{
+ if (!try_module_get(THIS_MODULE))
+ return -ENODEV;
+
+ return 0;
+}
+
+static void mbochs_close(struct mdev_device *mdev)
+{
+ struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+ struct mbochs_dmabuf *dmabuf, *tmp;
+
+ mutex_lock(&mdev_state->ops_lock);
+
+ list_for_each_entry_safe(dmabuf, tmp, &mdev_state->dmabufs, next) {
+ list_del(&dmabuf->next);
+ if (dmabuf->buf) {
+ /* free in mbochs_release_dmabuf() */
+ dmabuf->unlinked = true;
+ } else {
+ kfree(dmabuf);
+ }
+ }
+ mbochs_put_pages(mdev_state);
+
+ mutex_unlock(&mdev_state->ops_lock);
+ module_put(THIS_MODULE);
+}
+
+static ssize_t
+memory_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct mdev_device *mdev = mdev_from_dev(dev);
+ struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+
+ return sprintf(buf, "%d MB\n", mdev_state->type->mbytes);
+}
+static DEVICE_ATTR_RO(memory);
+
+static struct attribute *mdev_dev_attrs[] = {
+ &dev_attr_memory.attr,
+ NULL,
+};
+
+static const struct attribute_group mdev_dev_group = {
+ .name = "vendor",
+ .attrs = mdev_dev_attrs,
+};
+
+const struct attribute_group *mdev_dev_groups[] = {
+ &mdev_dev_group,
+ NULL,
+};
+
+static ssize_t
+name_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+ return sprintf(buf, "%s\n", kobj->name);
+}
+MDEV_TYPE_ATTR_RO(name);
+
+static ssize_t
+description_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+ const struct mbochs_type *type = mbochs_find_type(kobj);
+
+ return sprintf(buf, "virtual display, %d MB video memory\n",
+ type ? type->mbytes : 0);
+}
+MDEV_TYPE_ATTR_RO(description);
+
+static ssize_t
+available_instances_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+ const struct mbochs_type *type = mbochs_find_type(kobj);
+ int count = (max_mbytes - mbochs_used_mbytes) / type->mbytes;
+
+ return sprintf(buf, "%d\n", count);
+}
+MDEV_TYPE_ATTR_RO(available_instances);
+
+static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
+ char *buf)
+{
+ return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
+}
+MDEV_TYPE_ATTR_RO(device_api);
+
+static struct attribute *mdev_types_attrs[] = {
+ &mdev_type_attr_name.attr,
+ &mdev_type_attr_description.attr,
+ &mdev_type_attr_device_api.attr,
+ &mdev_type_attr_available_instances.attr,
+ NULL,
+};
+
+static struct attribute_group mdev_type_group1 = {
+ .name = MBOCHS_TYPE_1,
+ .attrs = mdev_types_attrs,
+};
+
+static struct attribute_group mdev_type_group2 = {
+ .name = MBOCHS_TYPE_2,
+ .attrs = mdev_types_attrs,
+};
+
+static struct attribute_group mdev_type_group3 = {
+ .name = MBOCHS_TYPE_3,
+ .attrs = mdev_types_attrs,
+};
+
+static struct attribute_group *mdev_type_groups[] = {
+ &mdev_type_group1,
+ &mdev_type_group2,
+ &mdev_type_group3,
+ NULL,
+};
+
+static const struct mdev_parent_ops mdev_fops = {
+ .owner = THIS_MODULE,
+ .mdev_attr_groups = mdev_dev_groups,
+ .supported_type_groups = mdev_type_groups,
+ .create = mbochs_create,
+ .remove = mbochs_remove,
+ .open = mbochs_open,
+ .release = mbochs_close,
+ .read = mbochs_read,
+ .write = mbochs_write,
+ .ioctl = mbochs_ioctl,
+ .mmap = mbochs_mmap,
+};
+
+static const struct file_operations vd_fops = {
+ .owner = THIS_MODULE,
+};
+
+static void mbochs_device_release(struct device *dev)
+{
+ /* nothing */
+}
+
+static int __init mbochs_dev_init(void)
+{
+ int ret = 0;
+
+ ret = alloc_chrdev_region(&mbochs_devt, 0, MINORMASK, MBOCHS_NAME);
+ if (ret < 0) {
+ pr_err("Error: failed to register mbochs_dev, err: %d\n", ret);
+ return ret;
+ }
+ cdev_init(&mbochs_cdev, &vd_fops);
+ cdev_add(&mbochs_cdev, mbochs_devt, MINORMASK);
+ pr_info("%s: major %d\n", __func__, MAJOR(mbochs_devt));
+
+ mbochs_class = class_create(THIS_MODULE, MBOCHS_CLASS_NAME);
+ if (IS_ERR(mbochs_class)) {
+ pr_err("Error: failed to register mbochs_dev class\n");
+ ret = PTR_ERR(mbochs_class);
+ goto failed1;
+ }
+ mbochs_dev.class = mbochs_class;
+ mbochs_dev.release = mbochs_device_release;
+ dev_set_name(&mbochs_dev, "%s", MBOCHS_NAME);
+
+ ret = device_register(&mbochs_dev);
+ if (ret)
+ goto failed2;
+
+ ret = mdev_register_device(&mbochs_dev, &mdev_fops);
+ if (ret)
+ goto failed3;
+
+ return 0;
+
+failed3:
+ device_unregister(&mbochs_dev);
+failed2:
+ class_destroy(mbochs_class);
+failed1:
+ cdev_del(&mbochs_cdev);
+ unregister_chrdev_region(mbochs_devt, MINORMASK);
+ return ret;
+}
+
+static void __exit mbochs_dev_exit(void)
+{
+ mbochs_dev.bus = NULL;
+ mdev_unregister_device(&mbochs_dev);
+
+ device_unregister(&mbochs_dev);
+ cdev_del(&mbochs_cdev);
+ unregister_chrdev_region(mbochs_devt, MINORMASK);
+ class_destroy(mbochs_class);
+ mbochs_class = NULL;
+}
+
+module_init(mbochs_dev_init)
+module_exit(mbochs_dev_exit)
diff --git a/samples/vfio-mdev/mdpy-defs.h b/samples/vfio-mdev/mdpy-defs.h
new file mode 100644
index 000000000..96b3b1b49
--- /dev/null
+++ b/samples/vfio-mdev/mdpy-defs.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Simple pci display device.
+ *
+ * Framebuffer memory is pci bar 0.
+ * Configuration (read-only) is in pci config space.
+ * Format field uses drm fourcc codes.
+ * ATM only DRM_FORMAT_XRGB8888 is supported.
+ */
+
+/* pci ids */
+#define MDPY_PCI_VENDOR_ID 0x1b36 /* redhat */
+#define MDPY_PCI_DEVICE_ID 0x000f
+#define MDPY_PCI_SUBVENDOR_ID PCI_SUBVENDOR_ID_REDHAT_QUMRANET
+#define MDPY_PCI_SUBDEVICE_ID PCI_SUBDEVICE_ID_QEMU
+
+/* pci cfg space offsets for fb config (dword) */
+#define MDPY_VENDORCAP_OFFSET 0x40
+#define MDPY_VENDORCAP_SIZE 0x10
+#define MDPY_FORMAT_OFFSET (MDPY_VENDORCAP_OFFSET + 0x04)
+#define MDPY_WIDTH_OFFSET (MDPY_VENDORCAP_OFFSET + 0x08)
+#define MDPY_HEIGHT_OFFSET (MDPY_VENDORCAP_OFFSET + 0x0c)
diff --git a/samples/vfio-mdev/mdpy-fb.c b/samples/vfio-mdev/mdpy-fb.c
new file mode 100644
index 000000000..a760e130b
--- /dev/null
+++ b/samples/vfio-mdev/mdpy-fb.c
@@ -0,0 +1,237 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Framebuffer driver for mdpy (mediated virtual pci display device).
+ *
+ * See mdpy-defs.h for device specs
+ *
+ * (c) Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * Using some code snippets from simplefb and cirrusfb.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <linux/errno.h>
+#include <linux/fb.h>
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <drm/drm_fourcc.h>
+#include "mdpy-defs.h"
+
+static const struct fb_fix_screeninfo mdpy_fb_fix = {
+ .id = "mdpy-fb",
+ .type = FB_TYPE_PACKED_PIXELS,
+ .visual = FB_VISUAL_TRUECOLOR,
+ .accel = FB_ACCEL_NONE,
+};
+
+static const struct fb_var_screeninfo mdpy_fb_var = {
+ .height = -1,
+ .width = -1,
+ .activate = FB_ACTIVATE_NOW,
+ .vmode = FB_VMODE_NONINTERLACED,
+
+ .bits_per_pixel = 32,
+ .transp.offset = 24,
+ .red.offset = 16,
+ .green.offset = 8,
+ .blue.offset = 0,
+ .transp.length = 8,
+ .red.length = 8,
+ .green.length = 8,
+ .blue.length = 8,
+};
+
+#define PSEUDO_PALETTE_SIZE 16
+
+struct mdpy_fb_par {
+ u32 palette[PSEUDO_PALETTE_SIZE];
+};
+
+static int mdpy_fb_setcolreg(u_int regno, u_int red, u_int green, u_int blue,
+ u_int transp, struct fb_info *info)
+{
+ u32 *pal = info->pseudo_palette;
+ u32 cr = red >> (16 - info->var.red.length);
+ u32 cg = green >> (16 - info->var.green.length);
+ u32 cb = blue >> (16 - info->var.blue.length);
+ u32 value, mask;
+
+ if (regno >= PSEUDO_PALETTE_SIZE)
+ return -EINVAL;
+
+ value = (cr << info->var.red.offset) |
+ (cg << info->var.green.offset) |
+ (cb << info->var.blue.offset);
+ if (info->var.transp.length > 0) {
+ mask = (1 << info->var.transp.length) - 1;
+ mask <<= info->var.transp.offset;
+ value |= mask;
+ }
+ pal[regno] = value;
+
+ return 0;
+}
+
+static void mdpy_fb_destroy(struct fb_info *info)
+{
+ if (info->screen_base)
+ iounmap(info->screen_base);
+}
+
+static struct fb_ops mdpy_fb_ops = {
+ .owner = THIS_MODULE,
+ .fb_destroy = mdpy_fb_destroy,
+ .fb_setcolreg = mdpy_fb_setcolreg,
+ .fb_fillrect = cfb_fillrect,
+ .fb_copyarea = cfb_copyarea,
+ .fb_imageblit = cfb_imageblit,
+};
+
+static int mdpy_fb_probe(struct pci_dev *pdev,
+ const struct pci_device_id *ent)
+{
+ struct fb_info *info;
+ struct mdpy_fb_par *par;
+ u32 format, width, height;
+ int ret;
+
+ ret = pci_enable_device(pdev);
+ if (ret < 0)
+ return ret;
+
+ ret = pci_request_regions(pdev, "mdpy-fb");
+ if (ret < 0)
+ return ret;
+
+ pci_read_config_dword(pdev, MDPY_FORMAT_OFFSET, &format);
+ pci_read_config_dword(pdev, MDPY_WIDTH_OFFSET, &width);
+ pci_read_config_dword(pdev, MDPY_HEIGHT_OFFSET, &height);
+ if (format != DRM_FORMAT_XRGB8888) {
+ pci_err(pdev, "format mismatch (0x%x != 0x%x)\n",
+ format, DRM_FORMAT_XRGB8888);
+ ret = -EINVAL;
+ goto err_release_regions;
+ }
+ if (width < 100 || width > 10000) {
+ pci_err(pdev, "width (%d) out of range\n", width);
+ ret = -EINVAL;
+ goto err_release_regions;
+ }
+ if (height < 100 || height > 10000) {
+ pci_err(pdev, "height (%d) out of range\n", height);
+ ret = -EINVAL;
+ goto err_release_regions;
+ }
+ pci_info(pdev, "mdpy found: %dx%d framebuffer\n",
+ width, height);
+
+ info = framebuffer_alloc(sizeof(struct mdpy_fb_par), &pdev->dev);
+ if (!info) {
+ ret = -ENOMEM;
+ goto err_release_regions;
+ }
+ pci_set_drvdata(pdev, info);
+ par = info->par;
+
+ info->fix = mdpy_fb_fix;
+ info->fix.smem_start = pci_resource_start(pdev, 0);
+ info->fix.smem_len = pci_resource_len(pdev, 0);
+ info->fix.line_length = width * 4;
+
+ info->var = mdpy_fb_var;
+ info->var.xres = width;
+ info->var.yres = height;
+ info->var.xres_virtual = width;
+ info->var.yres_virtual = height;
+
+ info->screen_size = info->fix.smem_len;
+ info->screen_base = ioremap(info->fix.smem_start,
+ info->screen_size);
+ if (!info->screen_base) {
+ pci_err(pdev, "ioremap(pcibar) failed\n");
+ ret = -EIO;
+ goto err_release_fb;
+ }
+
+ info->apertures = alloc_apertures(1);
+ if (!info->apertures) {
+ ret = -ENOMEM;
+ goto err_unmap;
+ }
+ info->apertures->ranges[0].base = info->fix.smem_start;
+ info->apertures->ranges[0].size = info->fix.smem_len;
+
+ info->fbops = &mdpy_fb_ops;
+ info->flags = FBINFO_DEFAULT;
+ info->pseudo_palette = par->palette;
+
+ ret = register_framebuffer(info);
+ if (ret < 0) {
+ pci_err(pdev, "mdpy-fb device register failed: %d\n", ret);
+ goto err_unmap;
+ }
+
+ pci_info(pdev, "fb%d registered\n", info->node);
+ return 0;
+
+err_unmap:
+ iounmap(info->screen_base);
+
+err_release_fb:
+ framebuffer_release(info);
+
+err_release_regions:
+ pci_release_regions(pdev);
+
+ return ret;
+}
+
+static void mdpy_fb_remove(struct pci_dev *pdev)
+{
+ struct fb_info *info = pci_get_drvdata(pdev);
+
+ unregister_framebuffer(info);
+ framebuffer_release(info);
+}
+
+static struct pci_device_id mdpy_fb_pci_table[] = {
+ {
+ .vendor = MDPY_PCI_VENDOR_ID,
+ .device = MDPY_PCI_DEVICE_ID,
+ .subvendor = MDPY_PCI_SUBVENDOR_ID,
+ .subdevice = MDPY_PCI_SUBDEVICE_ID,
+ }, {
+ /* end of list */
+ }
+};
+
+static struct pci_driver mdpy_fb_pci_driver = {
+ .name = "mdpy-fb",
+ .id_table = mdpy_fb_pci_table,
+ .probe = mdpy_fb_probe,
+ .remove = mdpy_fb_remove,
+};
+
+static int __init mdpy_fb_init(void)
+{
+ int ret;
+
+ ret = pci_register_driver(&mdpy_fb_pci_driver);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+module_init(mdpy_fb_init);
+
+MODULE_DEVICE_TABLE(pci, mdpy_fb_pci_table);
+MODULE_LICENSE("GPL v2");
diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c
new file mode 100644
index 000000000..d774717cd
--- /dev/null
+++ b/samples/vfio-mdev/mdpy.c
@@ -0,0 +1,807 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Mediated virtual PCI display host device driver
+ *
+ * See mdpy-defs.h for device specs
+ *
+ * (c) Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * based on mtty driver which is:
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ * Author: Neo Jia <cjia@nvidia.com>
+ * Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/cdev.h>
+#include <linux/vfio.h>
+#include <linux/iommu.h>
+#include <linux/sysfs.h>
+#include <linux/mdev.h>
+#include <linux/pci.h>
+#include <drm/drm_fourcc.h>
+#include "mdpy-defs.h"
+
+#define MDPY_NAME "mdpy"
+#define MDPY_CLASS_NAME "mdpy"
+
+#define MDPY_CONFIG_SPACE_SIZE 0xff
+#define MDPY_MEMORY_BAR_OFFSET PAGE_SIZE
+#define MDPY_DISPLAY_REGION 16
+
+#define STORE_LE16(addr, val) (*(u16 *)addr = val)
+#define STORE_LE32(addr, val) (*(u32 *)addr = val)
+
+
+MODULE_LICENSE("GPL v2");
+
+static int max_devices = 4;
+module_param_named(count, max_devices, int, 0444);
+MODULE_PARM_DESC(count, "number of " MDPY_NAME " devices");
+
+
+#define MDPY_TYPE_1 "vga"
+#define MDPY_TYPE_2 "xga"
+#define MDPY_TYPE_3 "hd"
+
+static const struct mdpy_type {
+ const char *name;
+ u32 format;
+ u32 bytepp;
+ u32 width;
+ u32 height;
+} mdpy_types[] = {
+ {
+ .name = MDPY_CLASS_NAME "-" MDPY_TYPE_1,
+ .format = DRM_FORMAT_XRGB8888,
+ .bytepp = 4,
+ .width = 640,
+ .height = 480,
+ }, {
+ .name = MDPY_CLASS_NAME "-" MDPY_TYPE_2,
+ .format = DRM_FORMAT_XRGB8888,
+ .bytepp = 4,
+ .width = 1024,
+ .height = 768,
+ }, {
+ .name = MDPY_CLASS_NAME "-" MDPY_TYPE_3,
+ .format = DRM_FORMAT_XRGB8888,
+ .bytepp = 4,
+ .width = 1920,
+ .height = 1080,
+ },
+};
+
+static dev_t mdpy_devt;
+static struct class *mdpy_class;
+static struct cdev mdpy_cdev;
+static struct device mdpy_dev;
+static u32 mdpy_count;
+
+/* State of each mdev device */
+struct mdev_state {
+ u8 *vconfig;
+ u32 bar_mask;
+ struct mutex ops_lock;
+ struct mdev_device *mdev;
+ struct vfio_device_info dev_info;
+
+ const struct mdpy_type *type;
+ u32 memsize;
+ void *memblk;
+};
+
+static const struct mdpy_type *mdpy_find_type(struct kobject *kobj)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(mdpy_types); i++)
+ if (strcmp(mdpy_types[i].name, kobj->name) == 0)
+ return mdpy_types + i;
+ return NULL;
+}
+
+static void mdpy_create_config_space(struct mdev_state *mdev_state)
+{
+ STORE_LE16((u16 *) &mdev_state->vconfig[PCI_VENDOR_ID],
+ MDPY_PCI_VENDOR_ID);
+ STORE_LE16((u16 *) &mdev_state->vconfig[PCI_DEVICE_ID],
+ MDPY_PCI_DEVICE_ID);
+ STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_VENDOR_ID],
+ MDPY_PCI_SUBVENDOR_ID);
+ STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_ID],
+ MDPY_PCI_SUBDEVICE_ID);
+
+ STORE_LE16((u16 *) &mdev_state->vconfig[PCI_COMMAND],
+ PCI_COMMAND_IO | PCI_COMMAND_MEMORY);
+ STORE_LE16((u16 *) &mdev_state->vconfig[PCI_STATUS],
+ PCI_STATUS_CAP_LIST);
+ STORE_LE16((u16 *) &mdev_state->vconfig[PCI_CLASS_DEVICE],
+ PCI_CLASS_DISPLAY_OTHER);
+ mdev_state->vconfig[PCI_CLASS_REVISION] = 0x01;
+
+ STORE_LE32((u32 *) &mdev_state->vconfig[PCI_BASE_ADDRESS_0],
+ PCI_BASE_ADDRESS_SPACE_MEMORY |
+ PCI_BASE_ADDRESS_MEM_TYPE_32 |
+ PCI_BASE_ADDRESS_MEM_PREFETCH);
+ mdev_state->bar_mask = ~(mdev_state->memsize) + 1;
+
+ /* vendor specific capability for the config registers */
+ mdev_state->vconfig[PCI_CAPABILITY_LIST] = MDPY_VENDORCAP_OFFSET;
+ mdev_state->vconfig[MDPY_VENDORCAP_OFFSET + 0] = 0x09; /* vendor cap */
+ mdev_state->vconfig[MDPY_VENDORCAP_OFFSET + 1] = 0x00; /* next ptr */
+ mdev_state->vconfig[MDPY_VENDORCAP_OFFSET + 2] = MDPY_VENDORCAP_SIZE;
+ STORE_LE32((u32 *) &mdev_state->vconfig[MDPY_FORMAT_OFFSET],
+ mdev_state->type->format);
+ STORE_LE32((u32 *) &mdev_state->vconfig[MDPY_WIDTH_OFFSET],
+ mdev_state->type->width);
+ STORE_LE32((u32 *) &mdev_state->vconfig[MDPY_HEIGHT_OFFSET],
+ mdev_state->type->height);
+}
+
+static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset,
+ char *buf, u32 count)
+{
+ struct device *dev = mdev_dev(mdev_state->mdev);
+ u32 cfg_addr;
+
+ switch (offset) {
+ case PCI_BASE_ADDRESS_0:
+ cfg_addr = *(u32 *)buf;
+
+ if (cfg_addr == 0xffffffff) {
+ cfg_addr = (cfg_addr & mdev_state->bar_mask);
+ } else {
+ cfg_addr &= PCI_BASE_ADDRESS_MEM_MASK;
+ if (cfg_addr)
+ dev_info(dev, "BAR0 @ 0x%x\n", cfg_addr);
+ }
+
+ cfg_addr |= (mdev_state->vconfig[offset] &
+ ~PCI_BASE_ADDRESS_MEM_MASK);
+ STORE_LE32(&mdev_state->vconfig[offset], cfg_addr);
+ break;
+ }
+}
+
+static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count,
+ loff_t pos, bool is_write)
+{
+ struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+ struct device *dev = mdev_dev(mdev);
+ int ret = 0;
+
+ mutex_lock(&mdev_state->ops_lock);
+
+ if (pos < MDPY_CONFIG_SPACE_SIZE) {
+ if (is_write)
+ handle_pci_cfg_write(mdev_state, pos, buf, count);
+ else
+ memcpy(buf, (mdev_state->vconfig + pos), count);
+
+ } else if ((pos >= MDPY_MEMORY_BAR_OFFSET) &&
+ (pos + count <=
+ MDPY_MEMORY_BAR_OFFSET + mdev_state->memsize)) {
+ pos -= MDPY_MEMORY_BAR_OFFSET;
+ if (is_write)
+ memcpy(mdev_state->memblk, buf, count);
+ else
+ memcpy(buf, mdev_state->memblk, count);
+
+ } else {
+ dev_info(dev, "%s: %s @0x%llx (unhandled)\n",
+ __func__, is_write ? "WR" : "RD", pos);
+ ret = -1;
+ goto accessfailed;
+ }
+
+ ret = count;
+
+
+accessfailed:
+ mutex_unlock(&mdev_state->ops_lock);
+
+ return ret;
+}
+
+static int mdpy_reset(struct mdev_device *mdev)
+{
+ struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+ u32 stride, i;
+
+ /* initialize with gray gradient */
+ stride = mdev_state->type->width * mdev_state->type->bytepp;
+ for (i = 0; i < mdev_state->type->height; i++)
+ memset(mdev_state->memblk + i * stride,
+ i * 255 / mdev_state->type->height,
+ stride);
+ return 0;
+}
+
+static int mdpy_create(struct kobject *kobj, struct mdev_device *mdev)
+{
+ const struct mdpy_type *type = mdpy_find_type(kobj);
+ struct device *dev = mdev_dev(mdev);
+ struct mdev_state *mdev_state;
+ u32 fbsize;
+
+ if (mdpy_count >= max_devices)
+ return -ENOMEM;
+
+ mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL);
+ if (mdev_state == NULL)
+ return -ENOMEM;
+
+ mdev_state->vconfig = kzalloc(MDPY_CONFIG_SPACE_SIZE, GFP_KERNEL);
+ if (mdev_state->vconfig == NULL) {
+ kfree(mdev_state);
+ return -ENOMEM;
+ }
+
+ if (!type)
+ type = &mdpy_types[0];
+ fbsize = roundup_pow_of_two(type->width * type->height * type->bytepp);
+
+ mdev_state->memblk = vmalloc_user(fbsize);
+ if (!mdev_state->memblk) {
+ kfree(mdev_state->vconfig);
+ kfree(mdev_state);
+ return -ENOMEM;
+ }
+ dev_info(dev, "%s: %s (%dx%d)\n",
+ __func__, kobj->name, type->width, type->height);
+
+ mutex_init(&mdev_state->ops_lock);
+ mdev_state->mdev = mdev;
+ mdev_set_drvdata(mdev, mdev_state);
+
+ mdev_state->type = type;
+ mdev_state->memsize = fbsize;
+ mdpy_create_config_space(mdev_state);
+ mdpy_reset(mdev);
+
+ mdpy_count++;
+ return 0;
+}
+
+static int mdpy_remove(struct mdev_device *mdev)
+{
+ struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+ struct device *dev = mdev_dev(mdev);
+
+ dev_info(dev, "%s\n", __func__);
+
+ mdev_set_drvdata(mdev, NULL);
+ vfree(mdev_state->memblk);
+ kfree(mdev_state->vconfig);
+ kfree(mdev_state);
+
+ mdpy_count--;
+ return 0;
+}
+
+static ssize_t mdpy_read(struct mdev_device *mdev, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned int done = 0;
+ int ret;
+
+ while (count) {
+ size_t filled;
+
+ if (count >= 4 && !(*ppos % 4)) {
+ u32 val;
+
+ ret = mdev_access(mdev, (char *)&val, sizeof(val),
+ *ppos, false);
+ if (ret <= 0)
+ goto read_err;
+
+ if (copy_to_user(buf, &val, sizeof(val)))
+ goto read_err;
+
+ filled = 4;
+ } else if (count >= 2 && !(*ppos % 2)) {
+ u16 val;
+
+ ret = mdev_access(mdev, (char *)&val, sizeof(val),
+ *ppos, false);
+ if (ret <= 0)
+ goto read_err;
+
+ if (copy_to_user(buf, &val, sizeof(val)))
+ goto read_err;
+
+ filled = 2;
+ } else {
+ u8 val;
+
+ ret = mdev_access(mdev, (char *)&val, sizeof(val),
+ *ppos, false);
+ if (ret <= 0)
+ goto read_err;
+
+ if (copy_to_user(buf, &val, sizeof(val)))
+ goto read_err;
+
+ filled = 1;
+ }
+
+ count -= filled;
+ done += filled;
+ *ppos += filled;
+ buf += filled;
+ }
+
+ return done;
+
+read_err:
+ return -EFAULT;
+}
+
+static ssize_t mdpy_write(struct mdev_device *mdev, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned int done = 0;
+ int ret;
+
+ while (count) {
+ size_t filled;
+
+ if (count >= 4 && !(*ppos % 4)) {
+ u32 val;
+
+ if (copy_from_user(&val, buf, sizeof(val)))
+ goto write_err;
+
+ ret = mdev_access(mdev, (char *)&val, sizeof(val),
+ *ppos, true);
+ if (ret <= 0)
+ goto write_err;
+
+ filled = 4;
+ } else if (count >= 2 && !(*ppos % 2)) {
+ u16 val;
+
+ if (copy_from_user(&val, buf, sizeof(val)))
+ goto write_err;
+
+ ret = mdev_access(mdev, (char *)&val, sizeof(val),
+ *ppos, true);
+ if (ret <= 0)
+ goto write_err;
+
+ filled = 2;
+ } else {
+ u8 val;
+
+ if (copy_from_user(&val, buf, sizeof(val)))
+ goto write_err;
+
+ ret = mdev_access(mdev, (char *)&val, sizeof(val),
+ *ppos, true);
+ if (ret <= 0)
+ goto write_err;
+
+ filled = 1;
+ }
+ count -= filled;
+ done += filled;
+ *ppos += filled;
+ buf += filled;
+ }
+
+ return done;
+write_err:
+ return -EFAULT;
+}
+
+static int mdpy_mmap(struct mdev_device *mdev, struct vm_area_struct *vma)
+{
+ struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+
+ if (vma->vm_pgoff != MDPY_MEMORY_BAR_OFFSET >> PAGE_SHIFT)
+ return -EINVAL;
+ if (vma->vm_end < vma->vm_start)
+ return -EINVAL;
+ if (vma->vm_end - vma->vm_start > mdev_state->memsize)
+ return -EINVAL;
+ if ((vma->vm_flags & VM_SHARED) == 0)
+ return -EINVAL;
+
+ return remap_vmalloc_range_partial(vma, vma->vm_start,
+ mdev_state->memblk, 0,
+ vma->vm_end - vma->vm_start);
+}
+
+static int mdpy_get_region_info(struct mdev_device *mdev,
+ struct vfio_region_info *region_info,
+ u16 *cap_type_id, void **cap_type)
+{
+ struct mdev_state *mdev_state;
+
+ mdev_state = mdev_get_drvdata(mdev);
+ if (!mdev_state)
+ return -EINVAL;
+
+ if (region_info->index >= VFIO_PCI_NUM_REGIONS &&
+ region_info->index != MDPY_DISPLAY_REGION)
+ return -EINVAL;
+
+ switch (region_info->index) {
+ case VFIO_PCI_CONFIG_REGION_INDEX:
+ region_info->offset = 0;
+ region_info->size = MDPY_CONFIG_SPACE_SIZE;
+ region_info->flags = (VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE);
+ break;
+ case VFIO_PCI_BAR0_REGION_INDEX:
+ case MDPY_DISPLAY_REGION:
+ region_info->offset = MDPY_MEMORY_BAR_OFFSET;
+ region_info->size = mdev_state->memsize;
+ region_info->flags = (VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE |
+ VFIO_REGION_INFO_FLAG_MMAP);
+ break;
+ default:
+ region_info->size = 0;
+ region_info->offset = 0;
+ region_info->flags = 0;
+ }
+
+ return 0;
+}
+
+static int mdpy_get_irq_info(struct mdev_device *mdev,
+ struct vfio_irq_info *irq_info)
+{
+ irq_info->count = 0;
+ return 0;
+}
+
+static int mdpy_get_device_info(struct mdev_device *mdev,
+ struct vfio_device_info *dev_info)
+{
+ dev_info->flags = VFIO_DEVICE_FLAGS_PCI;
+ dev_info->num_regions = VFIO_PCI_NUM_REGIONS;
+ dev_info->num_irqs = VFIO_PCI_NUM_IRQS;
+ return 0;
+}
+
+static int mdpy_query_gfx_plane(struct mdev_device *mdev,
+ struct vfio_device_gfx_plane_info *plane)
+{
+ struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+
+ if (plane->flags & VFIO_GFX_PLANE_TYPE_PROBE) {
+ if (plane->flags == (VFIO_GFX_PLANE_TYPE_PROBE |
+ VFIO_GFX_PLANE_TYPE_REGION))
+ return 0;
+ return -EINVAL;
+ }
+
+ if (plane->flags != VFIO_GFX_PLANE_TYPE_REGION)
+ return -EINVAL;
+
+ plane->drm_format = mdev_state->type->format;
+ plane->width = mdev_state->type->width;
+ plane->height = mdev_state->type->height;
+ plane->stride = (mdev_state->type->width *
+ mdev_state->type->bytepp);
+ plane->size = mdev_state->memsize;
+ plane->region_index = MDPY_DISPLAY_REGION;
+
+ /* unused */
+ plane->drm_format_mod = 0;
+ plane->x_pos = 0;
+ plane->y_pos = 0;
+ plane->x_hot = 0;
+ plane->y_hot = 0;
+
+ return 0;
+}
+
+static long mdpy_ioctl(struct mdev_device *mdev, unsigned int cmd,
+ unsigned long arg)
+{
+ int ret = 0;
+ unsigned long minsz;
+ struct mdev_state *mdev_state;
+
+ mdev_state = mdev_get_drvdata(mdev);
+
+ switch (cmd) {
+ case VFIO_DEVICE_GET_INFO:
+ {
+ struct vfio_device_info info;
+
+ minsz = offsetofend(struct vfio_device_info, num_irqs);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ ret = mdpy_get_device_info(mdev, &info);
+ if (ret)
+ return ret;
+
+ memcpy(&mdev_state->dev_info, &info, sizeof(info));
+
+ if (copy_to_user((void __user *)arg, &info, minsz))
+ return -EFAULT;
+
+ return 0;
+ }
+ case VFIO_DEVICE_GET_REGION_INFO:
+ {
+ struct vfio_region_info info;
+ u16 cap_type_id = 0;
+ void *cap_type = NULL;
+
+ minsz = offsetofend(struct vfio_region_info, offset);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ ret = mdpy_get_region_info(mdev, &info, &cap_type_id,
+ &cap_type);
+ if (ret)
+ return ret;
+
+ if (copy_to_user((void __user *)arg, &info, minsz))
+ return -EFAULT;
+
+ return 0;
+ }
+
+ case VFIO_DEVICE_GET_IRQ_INFO:
+ {
+ struct vfio_irq_info info;
+
+ minsz = offsetofend(struct vfio_irq_info, count);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if ((info.argsz < minsz) ||
+ (info.index >= mdev_state->dev_info.num_irqs))
+ return -EINVAL;
+
+ ret = mdpy_get_irq_info(mdev, &info);
+ if (ret)
+ return ret;
+
+ if (copy_to_user((void __user *)arg, &info, minsz))
+ return -EFAULT;
+
+ return 0;
+ }
+
+ case VFIO_DEVICE_QUERY_GFX_PLANE:
+ {
+ struct vfio_device_gfx_plane_info plane;
+
+ minsz = offsetofend(struct vfio_device_gfx_plane_info,
+ region_index);
+
+ if (copy_from_user(&plane, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (plane.argsz < minsz)
+ return -EINVAL;
+
+ ret = mdpy_query_gfx_plane(mdev, &plane);
+ if (ret)
+ return ret;
+
+ if (copy_to_user((void __user *)arg, &plane, minsz))
+ return -EFAULT;
+
+ return 0;
+ }
+
+ case VFIO_DEVICE_SET_IRQS:
+ return -EINVAL;
+
+ case VFIO_DEVICE_RESET:
+ return mdpy_reset(mdev);
+ }
+ return -ENOTTY;
+}
+
+static int mdpy_open(struct mdev_device *mdev)
+{
+ if (!try_module_get(THIS_MODULE))
+ return -ENODEV;
+
+ return 0;
+}
+
+static void mdpy_close(struct mdev_device *mdev)
+{
+ module_put(THIS_MODULE);
+}
+
+static ssize_t
+resolution_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct mdev_device *mdev = mdev_from_dev(dev);
+ struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+
+ return sprintf(buf, "%dx%d\n",
+ mdev_state->type->width,
+ mdev_state->type->height);
+}
+static DEVICE_ATTR_RO(resolution);
+
+static struct attribute *mdev_dev_attrs[] = {
+ &dev_attr_resolution.attr,
+ NULL,
+};
+
+static const struct attribute_group mdev_dev_group = {
+ .name = "vendor",
+ .attrs = mdev_dev_attrs,
+};
+
+const struct attribute_group *mdev_dev_groups[] = {
+ &mdev_dev_group,
+ NULL,
+};
+
+static ssize_t
+name_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+ return sprintf(buf, "%s\n", kobj->name);
+}
+MDEV_TYPE_ATTR_RO(name);
+
+static ssize_t
+description_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+ const struct mdpy_type *type = mdpy_find_type(kobj);
+
+ return sprintf(buf, "virtual display, %dx%d framebuffer\n",
+ type ? type->width : 0,
+ type ? type->height : 0);
+}
+MDEV_TYPE_ATTR_RO(description);
+
+static ssize_t
+available_instances_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+ return sprintf(buf, "%d\n", max_devices - mdpy_count);
+}
+MDEV_TYPE_ATTR_RO(available_instances);
+
+static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
+ char *buf)
+{
+ return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
+}
+MDEV_TYPE_ATTR_RO(device_api);
+
+static struct attribute *mdev_types_attrs[] = {
+ &mdev_type_attr_name.attr,
+ &mdev_type_attr_description.attr,
+ &mdev_type_attr_device_api.attr,
+ &mdev_type_attr_available_instances.attr,
+ NULL,
+};
+
+static struct attribute_group mdev_type_group1 = {
+ .name = MDPY_TYPE_1,
+ .attrs = mdev_types_attrs,
+};
+
+static struct attribute_group mdev_type_group2 = {
+ .name = MDPY_TYPE_2,
+ .attrs = mdev_types_attrs,
+};
+
+static struct attribute_group mdev_type_group3 = {
+ .name = MDPY_TYPE_3,
+ .attrs = mdev_types_attrs,
+};
+
+static struct attribute_group *mdev_type_groups[] = {
+ &mdev_type_group1,
+ &mdev_type_group2,
+ &mdev_type_group3,
+ NULL,
+};
+
+static const struct mdev_parent_ops mdev_fops = {
+ .owner = THIS_MODULE,
+ .mdev_attr_groups = mdev_dev_groups,
+ .supported_type_groups = mdev_type_groups,
+ .create = mdpy_create,
+ .remove = mdpy_remove,
+ .open = mdpy_open,
+ .release = mdpy_close,
+ .read = mdpy_read,
+ .write = mdpy_write,
+ .ioctl = mdpy_ioctl,
+ .mmap = mdpy_mmap,
+};
+
+static const struct file_operations vd_fops = {
+ .owner = THIS_MODULE,
+};
+
+static void mdpy_device_release(struct device *dev)
+{
+ /* nothing */
+}
+
+static int __init mdpy_dev_init(void)
+{
+ int ret = 0;
+
+ ret = alloc_chrdev_region(&mdpy_devt, 0, MINORMASK, MDPY_NAME);
+ if (ret < 0) {
+ pr_err("Error: failed to register mdpy_dev, err: %d\n", ret);
+ return ret;
+ }
+ cdev_init(&mdpy_cdev, &vd_fops);
+ cdev_add(&mdpy_cdev, mdpy_devt, MINORMASK);
+ pr_info("%s: major %d\n", __func__, MAJOR(mdpy_devt));
+
+ mdpy_class = class_create(THIS_MODULE, MDPY_CLASS_NAME);
+ if (IS_ERR(mdpy_class)) {
+ pr_err("Error: failed to register mdpy_dev class\n");
+ ret = PTR_ERR(mdpy_class);
+ goto failed1;
+ }
+ mdpy_dev.class = mdpy_class;
+ mdpy_dev.release = mdpy_device_release;
+ dev_set_name(&mdpy_dev, "%s", MDPY_NAME);
+
+ ret = device_register(&mdpy_dev);
+ if (ret)
+ goto failed2;
+
+ ret = mdev_register_device(&mdpy_dev, &mdev_fops);
+ if (ret)
+ goto failed3;
+
+ return 0;
+
+failed3:
+ device_unregister(&mdpy_dev);
+failed2:
+ class_destroy(mdpy_class);
+failed1:
+ cdev_del(&mdpy_cdev);
+ unregister_chrdev_region(mdpy_devt, MINORMASK);
+ return ret;
+}
+
+static void __exit mdpy_dev_exit(void)
+{
+ mdpy_dev.bus = NULL;
+ mdev_unregister_device(&mdpy_dev);
+
+ device_unregister(&mdpy_dev);
+ cdev_del(&mdpy_cdev);
+ unregister_chrdev_region(mdpy_devt, MINORMASK);
+ class_destroy(mdpy_class);
+ mdpy_class = NULL;
+}
+
+module_init(mdpy_dev_init)
+module_exit(mdpy_dev_exit)
diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c
new file mode 100644
index 000000000..f6732aa16
--- /dev/null
+++ b/samples/vfio-mdev/mtty.c
@@ -0,0 +1,1516 @@
+/*
+ * Mediated virtual PCI serial host device driver
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ * Author: Neo Jia <cjia@nvidia.com>
+ * Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Sample driver that creates mdev device that simulates serial port over PCI
+ * card.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/cdev.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/uuid.h>
+#include <linux/vfio.h>
+#include <linux/iommu.h>
+#include <linux/sysfs.h>
+#include <linux/ctype.h>
+#include <linux/file.h>
+#include <linux/mdev.h>
+#include <linux/pci.h>
+#include <linux/serial.h>
+#include <uapi/linux/serial_reg.h>
+#include <linux/eventfd.h>
+/*
+ * #defines
+ */
+
+#define VERSION_STRING "0.1"
+#define DRIVER_AUTHOR "NVIDIA Corporation"
+
+#define MTTY_CLASS_NAME "mtty"
+
+#define MTTY_NAME "mtty"
+
+#define MTTY_STRING_LEN 16
+
+#define MTTY_CONFIG_SPACE_SIZE 0xff
+#define MTTY_IO_BAR_SIZE 0x8
+#define MTTY_MMIO_BAR_SIZE 0x100000
+
+#define STORE_LE16(addr, val) (*(u16 *)addr = val)
+#define STORE_LE32(addr, val) (*(u32 *)addr = val)
+
+#define MAX_FIFO_SIZE 16
+
+#define CIRCULAR_BUF_INC_IDX(idx) (idx = (idx + 1) & (MAX_FIFO_SIZE - 1))
+
+#define MTTY_VFIO_PCI_OFFSET_SHIFT 40
+
+#define MTTY_VFIO_PCI_OFFSET_TO_INDEX(off) (off >> MTTY_VFIO_PCI_OFFSET_SHIFT)
+#define MTTY_VFIO_PCI_INDEX_TO_OFFSET(index) \
+ ((u64)(index) << MTTY_VFIO_PCI_OFFSET_SHIFT)
+#define MTTY_VFIO_PCI_OFFSET_MASK \
+ (((u64)(1) << MTTY_VFIO_PCI_OFFSET_SHIFT) - 1)
+#define MAX_MTTYS 24
+
+/*
+ * Global Structures
+ */
+
+struct mtty_dev {
+ dev_t vd_devt;
+ struct class *vd_class;
+ struct cdev vd_cdev;
+ struct idr vd_idr;
+ struct device dev;
+} mtty_dev;
+
+struct mdev_region_info {
+ u64 start;
+ u64 phys_start;
+ u32 size;
+ u64 vfio_offset;
+};
+
+#if defined(DEBUG_REGS)
+const char *wr_reg[] = {
+ "TX",
+ "IER",
+ "FCR",
+ "LCR",
+ "MCR",
+ "LSR",
+ "MSR",
+ "SCR"
+};
+
+const char *rd_reg[] = {
+ "RX",
+ "IER",
+ "IIR",
+ "LCR",
+ "MCR",
+ "LSR",
+ "MSR",
+ "SCR"
+};
+#endif
+
+/* loop back buffer */
+struct rxtx {
+ u8 fifo[MAX_FIFO_SIZE];
+ u8 head, tail;
+ u8 count;
+};
+
+struct serial_port {
+ u8 uart_reg[8]; /* 8 registers */
+ struct rxtx rxtx; /* loop back buffer */
+ bool dlab;
+ bool overrun;
+ u16 divisor;
+ u8 fcr; /* FIFO control register */
+ u8 max_fifo_size;
+ u8 intr_trigger_level; /* interrupt trigger level */
+};
+
+/* State of each mdev device */
+struct mdev_state {
+ int irq_fd;
+ struct eventfd_ctx *intx_evtfd;
+ struct eventfd_ctx *msi_evtfd;
+ int irq_index;
+ u8 *vconfig;
+ struct mutex ops_lock;
+ struct mdev_device *mdev;
+ struct mdev_region_info region_info[VFIO_PCI_NUM_REGIONS];
+ u32 bar_mask[VFIO_PCI_NUM_REGIONS];
+ struct list_head next;
+ struct serial_port s[2];
+ struct mutex rxtx_lock;
+ struct vfio_device_info dev_info;
+ int nr_ports;
+};
+
+struct mutex mdev_list_lock;
+struct list_head mdev_devices_list;
+
+static const struct file_operations vd_fops = {
+ .owner = THIS_MODULE,
+};
+
+/* function prototypes */
+
+static int mtty_trigger_interrupt(uuid_le uuid);
+
+/* Helper functions */
+static struct mdev_state *find_mdev_state_by_uuid(uuid_le uuid)
+{
+ struct mdev_state *mds;
+
+ list_for_each_entry(mds, &mdev_devices_list, next) {
+ if (uuid_le_cmp(mdev_uuid(mds->mdev), uuid) == 0)
+ return mds;
+ }
+
+ return NULL;
+}
+
+void dump_buffer(u8 *buf, uint32_t count)
+{
+#if defined(DEBUG)
+ int i;
+
+ pr_info("Buffer:\n");
+ for (i = 0; i < count; i++) {
+ pr_info("%2x ", *(buf + i));
+ if ((i + 1) % 16 == 0)
+ pr_info("\n");
+ }
+#endif
+}
+
+static void mtty_create_config_space(struct mdev_state *mdev_state)
+{
+ /* PCI dev ID */
+ STORE_LE32((u32 *) &mdev_state->vconfig[0x0], 0x32534348);
+
+ /* Control: I/O+, Mem-, BusMaster- */
+ STORE_LE16((u16 *) &mdev_state->vconfig[0x4], 0x0001);
+
+ /* Status: capabilities list absent */
+ STORE_LE16((u16 *) &mdev_state->vconfig[0x6], 0x0200);
+
+ /* Rev ID */
+ mdev_state->vconfig[0x8] = 0x10;
+
+ /* programming interface class : 16550-compatible serial controller */
+ mdev_state->vconfig[0x9] = 0x02;
+
+ /* Sub class : 00 */
+ mdev_state->vconfig[0xa] = 0x00;
+
+ /* Base class : Simple Communication controllers */
+ mdev_state->vconfig[0xb] = 0x07;
+
+ /* base address registers */
+ /* BAR0: IO space */
+ STORE_LE32((u32 *) &mdev_state->vconfig[0x10], 0x000001);
+ mdev_state->bar_mask[0] = ~(MTTY_IO_BAR_SIZE) + 1;
+
+ if (mdev_state->nr_ports == 2) {
+ /* BAR1: IO space */
+ STORE_LE32((u32 *) &mdev_state->vconfig[0x14], 0x000001);
+ mdev_state->bar_mask[1] = ~(MTTY_IO_BAR_SIZE) + 1;
+ }
+
+ /* Subsystem ID */
+ STORE_LE32((u32 *) &mdev_state->vconfig[0x2c], 0x32534348);
+
+ mdev_state->vconfig[0x34] = 0x00; /* Cap Ptr */
+ mdev_state->vconfig[0x3d] = 0x01; /* interrupt pin (INTA#) */
+
+ /* Vendor specific data */
+ mdev_state->vconfig[0x40] = 0x23;
+ mdev_state->vconfig[0x43] = 0x80;
+ mdev_state->vconfig[0x44] = 0x23;
+ mdev_state->vconfig[0x48] = 0x23;
+ mdev_state->vconfig[0x4c] = 0x23;
+
+ mdev_state->vconfig[0x60] = 0x50;
+ mdev_state->vconfig[0x61] = 0x43;
+ mdev_state->vconfig[0x62] = 0x49;
+ mdev_state->vconfig[0x63] = 0x20;
+ mdev_state->vconfig[0x64] = 0x53;
+ mdev_state->vconfig[0x65] = 0x65;
+ mdev_state->vconfig[0x66] = 0x72;
+ mdev_state->vconfig[0x67] = 0x69;
+ mdev_state->vconfig[0x68] = 0x61;
+ mdev_state->vconfig[0x69] = 0x6c;
+ mdev_state->vconfig[0x6a] = 0x2f;
+ mdev_state->vconfig[0x6b] = 0x55;
+ mdev_state->vconfig[0x6c] = 0x41;
+ mdev_state->vconfig[0x6d] = 0x52;
+ mdev_state->vconfig[0x6e] = 0x54;
+}
+
+static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset,
+ u8 *buf, u32 count)
+{
+ u32 cfg_addr, bar_mask, bar_index = 0;
+
+ switch (offset) {
+ case 0x04: /* device control */
+ case 0x06: /* device status */
+ /* do nothing */
+ break;
+ case 0x3c: /* interrupt line */
+ mdev_state->vconfig[0x3c] = buf[0];
+ break;
+ case 0x3d:
+ /*
+ * Interrupt Pin is hardwired to INTA.
+ * This field is write protected by hardware
+ */
+ break;
+ case 0x10: /* BAR0 */
+ case 0x14: /* BAR1 */
+ if (offset == 0x10)
+ bar_index = 0;
+ else if (offset == 0x14)
+ bar_index = 1;
+
+ if ((mdev_state->nr_ports == 1) && (bar_index == 1)) {
+ STORE_LE32(&mdev_state->vconfig[offset], 0);
+ break;
+ }
+
+ cfg_addr = *(u32 *)buf;
+ pr_info("BAR%d addr 0x%x\n", bar_index, cfg_addr);
+
+ if (cfg_addr == 0xffffffff) {
+ bar_mask = mdev_state->bar_mask[bar_index];
+ cfg_addr = (cfg_addr & bar_mask);
+ }
+
+ cfg_addr |= (mdev_state->vconfig[offset] & 0x3ul);
+ STORE_LE32(&mdev_state->vconfig[offset], cfg_addr);
+ break;
+ case 0x18: /* BAR2 */
+ case 0x1c: /* BAR3 */
+ case 0x20: /* BAR4 */
+ STORE_LE32(&mdev_state->vconfig[offset], 0);
+ break;
+ default:
+ pr_info("PCI config write @0x%x of %d bytes not handled\n",
+ offset, count);
+ break;
+ }
+}
+
+static void handle_bar_write(unsigned int index, struct mdev_state *mdev_state,
+ u16 offset, u8 *buf, u32 count)
+{
+ u8 data = *buf;
+
+ /* Handle data written by guest */
+ switch (offset) {
+ case UART_TX:
+ /* if DLAB set, data is LSB of divisor */
+ if (mdev_state->s[index].dlab) {
+ mdev_state->s[index].divisor |= data;
+ break;
+ }
+
+ mutex_lock(&mdev_state->rxtx_lock);
+
+ /* save in TX buffer */
+ if (mdev_state->s[index].rxtx.count <
+ mdev_state->s[index].max_fifo_size) {
+ mdev_state->s[index].rxtx.fifo[
+ mdev_state->s[index].rxtx.head] = data;
+ mdev_state->s[index].rxtx.count++;
+ CIRCULAR_BUF_INC_IDX(mdev_state->s[index].rxtx.head);
+ mdev_state->s[index].overrun = false;
+
+ /*
+ * Trigger interrupt if receive data interrupt is
+ * enabled and fifo reached trigger level
+ */
+ if ((mdev_state->s[index].uart_reg[UART_IER] &
+ UART_IER_RDI) &&
+ (mdev_state->s[index].rxtx.count ==
+ mdev_state->s[index].intr_trigger_level)) {
+ /* trigger interrupt */
+#if defined(DEBUG_INTR)
+ pr_err("Serial port %d: Fifo level trigger\n",
+ index);
+#endif
+ mtty_trigger_interrupt(
+ mdev_uuid(mdev_state->mdev));
+ }
+ } else {
+#if defined(DEBUG_INTR)
+ pr_err("Serial port %d: Buffer Overflow\n", index);
+#endif
+ mdev_state->s[index].overrun = true;
+
+ /*
+ * Trigger interrupt if receiver line status interrupt
+ * is enabled
+ */
+ if (mdev_state->s[index].uart_reg[UART_IER] &
+ UART_IER_RLSI)
+ mtty_trigger_interrupt(
+ mdev_uuid(mdev_state->mdev));
+ }
+ mutex_unlock(&mdev_state->rxtx_lock);
+ break;
+
+ case UART_IER:
+ /* if DLAB set, data is MSB of divisor */
+ if (mdev_state->s[index].dlab)
+ mdev_state->s[index].divisor |= (u16)data << 8;
+ else {
+ mdev_state->s[index].uart_reg[offset] = data;
+ mutex_lock(&mdev_state->rxtx_lock);
+ if ((data & UART_IER_THRI) &&
+ (mdev_state->s[index].rxtx.head ==
+ mdev_state->s[index].rxtx.tail)) {
+#if defined(DEBUG_INTR)
+ pr_err("Serial port %d: IER_THRI write\n",
+ index);
+#endif
+ mtty_trigger_interrupt(
+ mdev_uuid(mdev_state->mdev));
+ }
+
+ mutex_unlock(&mdev_state->rxtx_lock);
+ }
+
+ break;
+
+ case UART_FCR:
+ mdev_state->s[index].fcr = data;
+
+ mutex_lock(&mdev_state->rxtx_lock);
+ if (data & (UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT)) {
+ /* clear loop back FIFO */
+ mdev_state->s[index].rxtx.count = 0;
+ mdev_state->s[index].rxtx.head = 0;
+ mdev_state->s[index].rxtx.tail = 0;
+ }
+ mutex_unlock(&mdev_state->rxtx_lock);
+
+ switch (data & UART_FCR_TRIGGER_MASK) {
+ case UART_FCR_TRIGGER_1:
+ mdev_state->s[index].intr_trigger_level = 1;
+ break;
+
+ case UART_FCR_TRIGGER_4:
+ mdev_state->s[index].intr_trigger_level = 4;
+ break;
+
+ case UART_FCR_TRIGGER_8:
+ mdev_state->s[index].intr_trigger_level = 8;
+ break;
+
+ case UART_FCR_TRIGGER_14:
+ mdev_state->s[index].intr_trigger_level = 14;
+ break;
+ }
+
+ /*
+ * Set trigger level to 1 otherwise or implement timer with
+ * timeout of 4 characters and on expiring that timer set
+ * Recevice data timeout in IIR register
+ */
+ mdev_state->s[index].intr_trigger_level = 1;
+ if (data & UART_FCR_ENABLE_FIFO)
+ mdev_state->s[index].max_fifo_size = MAX_FIFO_SIZE;
+ else {
+ mdev_state->s[index].max_fifo_size = 1;
+ mdev_state->s[index].intr_trigger_level = 1;
+ }
+
+ break;
+
+ case UART_LCR:
+ if (data & UART_LCR_DLAB) {
+ mdev_state->s[index].dlab = true;
+ mdev_state->s[index].divisor = 0;
+ } else
+ mdev_state->s[index].dlab = false;
+
+ mdev_state->s[index].uart_reg[offset] = data;
+ break;
+
+ case UART_MCR:
+ mdev_state->s[index].uart_reg[offset] = data;
+
+ if ((mdev_state->s[index].uart_reg[UART_IER] & UART_IER_MSI) &&
+ (data & UART_MCR_OUT2)) {
+#if defined(DEBUG_INTR)
+ pr_err("Serial port %d: MCR_OUT2 write\n", index);
+#endif
+ mtty_trigger_interrupt(mdev_uuid(mdev_state->mdev));
+ }
+
+ if ((mdev_state->s[index].uart_reg[UART_IER] & UART_IER_MSI) &&
+ (data & (UART_MCR_RTS | UART_MCR_DTR))) {
+#if defined(DEBUG_INTR)
+ pr_err("Serial port %d: MCR RTS/DTR write\n", index);
+#endif
+ mtty_trigger_interrupt(mdev_uuid(mdev_state->mdev));
+ }
+ break;
+
+ case UART_LSR:
+ case UART_MSR:
+ /* do nothing */
+ break;
+
+ case UART_SCR:
+ mdev_state->s[index].uart_reg[offset] = data;
+ break;
+
+ default:
+ break;
+ }
+}
+
+static void handle_bar_read(unsigned int index, struct mdev_state *mdev_state,
+ u16 offset, u8 *buf, u32 count)
+{
+ /* Handle read requests by guest */
+ switch (offset) {
+ case UART_RX:
+ /* if DLAB set, data is LSB of divisor */
+ if (mdev_state->s[index].dlab) {
+ *buf = (u8)mdev_state->s[index].divisor;
+ break;
+ }
+
+ mutex_lock(&mdev_state->rxtx_lock);
+ /* return data in tx buffer */
+ if (mdev_state->s[index].rxtx.head !=
+ mdev_state->s[index].rxtx.tail) {
+ *buf = mdev_state->s[index].rxtx.fifo[
+ mdev_state->s[index].rxtx.tail];
+ mdev_state->s[index].rxtx.count--;
+ CIRCULAR_BUF_INC_IDX(mdev_state->s[index].rxtx.tail);
+ }
+
+ if (mdev_state->s[index].rxtx.head ==
+ mdev_state->s[index].rxtx.tail) {
+ /*
+ * Trigger interrupt if tx buffer empty interrupt is
+ * enabled and fifo is empty
+ */
+#if defined(DEBUG_INTR)
+ pr_err("Serial port %d: Buffer Empty\n", index);
+#endif
+ if (mdev_state->s[index].uart_reg[UART_IER] &
+ UART_IER_THRI)
+ mtty_trigger_interrupt(
+ mdev_uuid(mdev_state->mdev));
+ }
+ mutex_unlock(&mdev_state->rxtx_lock);
+
+ break;
+
+ case UART_IER:
+ if (mdev_state->s[index].dlab) {
+ *buf = (u8)(mdev_state->s[index].divisor >> 8);
+ break;
+ }
+ *buf = mdev_state->s[index].uart_reg[offset] & 0x0f;
+ break;
+
+ case UART_IIR:
+ {
+ u8 ier = mdev_state->s[index].uart_reg[UART_IER];
+ *buf = 0;
+
+ mutex_lock(&mdev_state->rxtx_lock);
+ /* Interrupt priority 1: Parity, overrun, framing or break */
+ if ((ier & UART_IER_RLSI) && mdev_state->s[index].overrun)
+ *buf |= UART_IIR_RLSI;
+
+ /* Interrupt priority 2: Fifo trigger level reached */
+ if ((ier & UART_IER_RDI) &&
+ (mdev_state->s[index].rxtx.count >=
+ mdev_state->s[index].intr_trigger_level))
+ *buf |= UART_IIR_RDI;
+
+ /* Interrupt priotiry 3: transmitter holding register empty */
+ if ((ier & UART_IER_THRI) &&
+ (mdev_state->s[index].rxtx.head ==
+ mdev_state->s[index].rxtx.tail))
+ *buf |= UART_IIR_THRI;
+
+ /* Interrupt priotiry 4: Modem status: CTS, DSR, RI or DCD */
+ if ((ier & UART_IER_MSI) &&
+ (mdev_state->s[index].uart_reg[UART_MCR] &
+ (UART_MCR_RTS | UART_MCR_DTR)))
+ *buf |= UART_IIR_MSI;
+
+ /* bit0: 0=> interrupt pending, 1=> no interrupt is pending */
+ if (*buf == 0)
+ *buf = UART_IIR_NO_INT;
+
+ /* set bit 6 & 7 to be 16550 compatible */
+ *buf |= 0xC0;
+ mutex_unlock(&mdev_state->rxtx_lock);
+ }
+ break;
+
+ case UART_LCR:
+ case UART_MCR:
+ *buf = mdev_state->s[index].uart_reg[offset];
+ break;
+
+ case UART_LSR:
+ {
+ u8 lsr = 0;
+
+ mutex_lock(&mdev_state->rxtx_lock);
+ /* atleast one char in FIFO */
+ if (mdev_state->s[index].rxtx.head !=
+ mdev_state->s[index].rxtx.tail)
+ lsr |= UART_LSR_DR;
+
+ /* if FIFO overrun */
+ if (mdev_state->s[index].overrun)
+ lsr |= UART_LSR_OE;
+
+ /* transmit FIFO empty and tramsitter empty */
+ if (mdev_state->s[index].rxtx.head ==
+ mdev_state->s[index].rxtx.tail)
+ lsr |= UART_LSR_TEMT | UART_LSR_THRE;
+
+ mutex_unlock(&mdev_state->rxtx_lock);
+ *buf = lsr;
+ break;
+ }
+ case UART_MSR:
+ *buf = UART_MSR_DSR | UART_MSR_DDSR | UART_MSR_DCD;
+
+ mutex_lock(&mdev_state->rxtx_lock);
+ /* if AFE is 1 and FIFO have space, set CTS bit */
+ if (mdev_state->s[index].uart_reg[UART_MCR] &
+ UART_MCR_AFE) {
+ if (mdev_state->s[index].rxtx.count <
+ mdev_state->s[index].max_fifo_size)
+ *buf |= UART_MSR_CTS | UART_MSR_DCTS;
+ } else
+ *buf |= UART_MSR_CTS | UART_MSR_DCTS;
+ mutex_unlock(&mdev_state->rxtx_lock);
+
+ break;
+
+ case UART_SCR:
+ *buf = mdev_state->s[index].uart_reg[offset];
+ break;
+
+ default:
+ break;
+ }
+}
+
+static void mdev_read_base(struct mdev_state *mdev_state)
+{
+ int index, pos;
+ u32 start_lo, start_hi;
+ u32 mem_type;
+
+ pos = PCI_BASE_ADDRESS_0;
+
+ for (index = 0; index <= VFIO_PCI_BAR5_REGION_INDEX; index++) {
+
+ if (!mdev_state->region_info[index].size)
+ continue;
+
+ start_lo = (*(u32 *)(mdev_state->vconfig + pos)) &
+ PCI_BASE_ADDRESS_MEM_MASK;
+ mem_type = (*(u32 *)(mdev_state->vconfig + pos)) &
+ PCI_BASE_ADDRESS_MEM_TYPE_MASK;
+
+ switch (mem_type) {
+ case PCI_BASE_ADDRESS_MEM_TYPE_64:
+ start_hi = (*(u32 *)(mdev_state->vconfig + pos + 4));
+ pos += 4;
+ break;
+ case PCI_BASE_ADDRESS_MEM_TYPE_32:
+ case PCI_BASE_ADDRESS_MEM_TYPE_1M:
+ /* 1M mem BAR treated as 32-bit BAR */
+ default:
+ /* mem unknown type treated as 32-bit BAR */
+ start_hi = 0;
+ break;
+ }
+ pos += 4;
+ mdev_state->region_info[index].start = ((u64)start_hi << 32) |
+ start_lo;
+ }
+}
+
+static ssize_t mdev_access(struct mdev_device *mdev, u8 *buf, size_t count,
+ loff_t pos, bool is_write)
+{
+ struct mdev_state *mdev_state;
+ unsigned int index;
+ loff_t offset;
+ int ret = 0;
+
+ if (!mdev || !buf)
+ return -EINVAL;
+
+ mdev_state = mdev_get_drvdata(mdev);
+ if (!mdev_state) {
+ pr_err("%s mdev_state not found\n", __func__);
+ return -EINVAL;
+ }
+
+ mutex_lock(&mdev_state->ops_lock);
+
+ index = MTTY_VFIO_PCI_OFFSET_TO_INDEX(pos);
+ offset = pos & MTTY_VFIO_PCI_OFFSET_MASK;
+ switch (index) {
+ case VFIO_PCI_CONFIG_REGION_INDEX:
+
+#if defined(DEBUG)
+ pr_info("%s: PCI config space %s at offset 0x%llx\n",
+ __func__, is_write ? "write" : "read", offset);
+#endif
+ if (is_write) {
+ dump_buffer(buf, count);
+ handle_pci_cfg_write(mdev_state, offset, buf, count);
+ } else {
+ memcpy(buf, (mdev_state->vconfig + offset), count);
+ dump_buffer(buf, count);
+ }
+
+ break;
+
+ case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
+ if (!mdev_state->region_info[index].start)
+ mdev_read_base(mdev_state);
+
+ if (is_write) {
+ dump_buffer(buf, count);
+
+#if defined(DEBUG_REGS)
+ pr_info("%s: BAR%d WR @0x%llx %s val:0x%02x dlab:%d\n",
+ __func__, index, offset, wr_reg[offset],
+ *buf, mdev_state->s[index].dlab);
+#endif
+ handle_bar_write(index, mdev_state, offset, buf, count);
+ } else {
+ handle_bar_read(index, mdev_state, offset, buf, count);
+ dump_buffer(buf, count);
+
+#if defined(DEBUG_REGS)
+ pr_info("%s: BAR%d RD @0x%llx %s val:0x%02x dlab:%d\n",
+ __func__, index, offset, rd_reg[offset],
+ *buf, mdev_state->s[index].dlab);
+#endif
+ }
+ break;
+
+ default:
+ ret = -1;
+ goto accessfailed;
+ }
+
+ ret = count;
+
+
+accessfailed:
+ mutex_unlock(&mdev_state->ops_lock);
+
+ return ret;
+}
+
+int mtty_create(struct kobject *kobj, struct mdev_device *mdev)
+{
+ struct mdev_state *mdev_state;
+ char name[MTTY_STRING_LEN];
+ int nr_ports = 0, i;
+
+ if (!mdev)
+ return -EINVAL;
+
+ for (i = 0; i < 2; i++) {
+ snprintf(name, MTTY_STRING_LEN, "%s-%d",
+ dev_driver_string(mdev_parent_dev(mdev)), i + 1);
+ if (!strcmp(kobj->name, name)) {
+ nr_ports = i + 1;
+ break;
+ }
+ }
+
+ if (!nr_ports)
+ return -EINVAL;
+
+ mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL);
+ if (mdev_state == NULL)
+ return -ENOMEM;
+
+ mdev_state->nr_ports = nr_ports;
+ mdev_state->irq_index = -1;
+ mdev_state->s[0].max_fifo_size = MAX_FIFO_SIZE;
+ mdev_state->s[1].max_fifo_size = MAX_FIFO_SIZE;
+ mutex_init(&mdev_state->rxtx_lock);
+ mdev_state->vconfig = kzalloc(MTTY_CONFIG_SPACE_SIZE, GFP_KERNEL);
+
+ if (mdev_state->vconfig == NULL) {
+ kfree(mdev_state);
+ return -ENOMEM;
+ }
+
+ mutex_init(&mdev_state->ops_lock);
+ mdev_state->mdev = mdev;
+ mdev_set_drvdata(mdev, mdev_state);
+
+ mtty_create_config_space(mdev_state);
+
+ mutex_lock(&mdev_list_lock);
+ list_add(&mdev_state->next, &mdev_devices_list);
+ mutex_unlock(&mdev_list_lock);
+
+ return 0;
+}
+
+int mtty_remove(struct mdev_device *mdev)
+{
+ struct mdev_state *mds, *tmp_mds;
+ struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
+ int ret = -EINVAL;
+
+ mutex_lock(&mdev_list_lock);
+ list_for_each_entry_safe(mds, tmp_mds, &mdev_devices_list, next) {
+ if (mdev_state == mds) {
+ list_del(&mdev_state->next);
+ mdev_set_drvdata(mdev, NULL);
+ kfree(mdev_state->vconfig);
+ kfree(mdev_state);
+ ret = 0;
+ break;
+ }
+ }
+ mutex_unlock(&mdev_list_lock);
+
+ return ret;
+}
+
+int mtty_reset(struct mdev_device *mdev)
+{
+ struct mdev_state *mdev_state;
+
+ if (!mdev)
+ return -EINVAL;
+
+ mdev_state = mdev_get_drvdata(mdev);
+ if (!mdev_state)
+ return -EINVAL;
+
+ pr_info("%s: called\n", __func__);
+
+ return 0;
+}
+
+ssize_t mtty_read(struct mdev_device *mdev, char __user *buf, size_t count,
+ loff_t *ppos)
+{
+ unsigned int done = 0;
+ int ret;
+
+ while (count) {
+ size_t filled;
+
+ if (count >= 4 && !(*ppos % 4)) {
+ u32 val;
+
+ ret = mdev_access(mdev, (u8 *)&val, sizeof(val),
+ *ppos, false);
+ if (ret <= 0)
+ goto read_err;
+
+ if (copy_to_user(buf, &val, sizeof(val)))
+ goto read_err;
+
+ filled = 4;
+ } else if (count >= 2 && !(*ppos % 2)) {
+ u16 val;
+
+ ret = mdev_access(mdev, (u8 *)&val, sizeof(val),
+ *ppos, false);
+ if (ret <= 0)
+ goto read_err;
+
+ if (copy_to_user(buf, &val, sizeof(val)))
+ goto read_err;
+
+ filled = 2;
+ } else {
+ u8 val;
+
+ ret = mdev_access(mdev, (u8 *)&val, sizeof(val),
+ *ppos, false);
+ if (ret <= 0)
+ goto read_err;
+
+ if (copy_to_user(buf, &val, sizeof(val)))
+ goto read_err;
+
+ filled = 1;
+ }
+
+ count -= filled;
+ done += filled;
+ *ppos += filled;
+ buf += filled;
+ }
+
+ return done;
+
+read_err:
+ return -EFAULT;
+}
+
+ssize_t mtty_write(struct mdev_device *mdev, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned int done = 0;
+ int ret;
+
+ while (count) {
+ size_t filled;
+
+ if (count >= 4 && !(*ppos % 4)) {
+ u32 val;
+
+ if (copy_from_user(&val, buf, sizeof(val)))
+ goto write_err;
+
+ ret = mdev_access(mdev, (u8 *)&val, sizeof(val),
+ *ppos, true);
+ if (ret <= 0)
+ goto write_err;
+
+ filled = 4;
+ } else if (count >= 2 && !(*ppos % 2)) {
+ u16 val;
+
+ if (copy_from_user(&val, buf, sizeof(val)))
+ goto write_err;
+
+ ret = mdev_access(mdev, (u8 *)&val, sizeof(val),
+ *ppos, true);
+ if (ret <= 0)
+ goto write_err;
+
+ filled = 2;
+ } else {
+ u8 val;
+
+ if (copy_from_user(&val, buf, sizeof(val)))
+ goto write_err;
+
+ ret = mdev_access(mdev, (u8 *)&val, sizeof(val),
+ *ppos, true);
+ if (ret <= 0)
+ goto write_err;
+
+ filled = 1;
+ }
+ count -= filled;
+ done += filled;
+ *ppos += filled;
+ buf += filled;
+ }
+
+ return done;
+write_err:
+ return -EFAULT;
+}
+
+static int mtty_set_irqs(struct mdev_device *mdev, uint32_t flags,
+ unsigned int index, unsigned int start,
+ unsigned int count, void *data)
+{
+ int ret = 0;
+ struct mdev_state *mdev_state;
+
+ if (!mdev)
+ return -EINVAL;
+
+ mdev_state = mdev_get_drvdata(mdev);
+ if (!mdev_state)
+ return -EINVAL;
+
+ mutex_lock(&mdev_state->ops_lock);
+ switch (index) {
+ case VFIO_PCI_INTX_IRQ_INDEX:
+ switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+ case VFIO_IRQ_SET_ACTION_MASK:
+ case VFIO_IRQ_SET_ACTION_UNMASK:
+ break;
+ case VFIO_IRQ_SET_ACTION_TRIGGER:
+ {
+ if (flags & VFIO_IRQ_SET_DATA_NONE) {
+ pr_info("%s: disable INTx\n", __func__);
+ if (mdev_state->intx_evtfd)
+ eventfd_ctx_put(mdev_state->intx_evtfd);
+ break;
+ }
+
+ if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+ int fd = *(int *)data;
+
+ if (fd > 0) {
+ struct eventfd_ctx *evt;
+
+ evt = eventfd_ctx_fdget(fd);
+ if (IS_ERR(evt)) {
+ ret = PTR_ERR(evt);
+ break;
+ }
+ mdev_state->intx_evtfd = evt;
+ mdev_state->irq_fd = fd;
+ mdev_state->irq_index = index;
+ break;
+ }
+ }
+ break;
+ }
+ }
+ break;
+ case VFIO_PCI_MSI_IRQ_INDEX:
+ switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+ case VFIO_IRQ_SET_ACTION_MASK:
+ case VFIO_IRQ_SET_ACTION_UNMASK:
+ break;
+ case VFIO_IRQ_SET_ACTION_TRIGGER:
+ if (flags & VFIO_IRQ_SET_DATA_NONE) {
+ if (mdev_state->msi_evtfd)
+ eventfd_ctx_put(mdev_state->msi_evtfd);
+ pr_info("%s: disable MSI\n", __func__);
+ mdev_state->irq_index = VFIO_PCI_INTX_IRQ_INDEX;
+ break;
+ }
+ if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+ int fd = *(int *)data;
+ struct eventfd_ctx *evt;
+
+ if (fd <= 0)
+ break;
+
+ if (mdev_state->msi_evtfd)
+ break;
+
+ evt = eventfd_ctx_fdget(fd);
+ if (IS_ERR(evt)) {
+ ret = PTR_ERR(evt);
+ break;
+ }
+ mdev_state->msi_evtfd = evt;
+ mdev_state->irq_fd = fd;
+ mdev_state->irq_index = index;
+ }
+ break;
+ }
+ break;
+ case VFIO_PCI_MSIX_IRQ_INDEX:
+ pr_info("%s: MSIX_IRQ\n", __func__);
+ break;
+ case VFIO_PCI_ERR_IRQ_INDEX:
+ pr_info("%s: ERR_IRQ\n", __func__);
+ break;
+ case VFIO_PCI_REQ_IRQ_INDEX:
+ pr_info("%s: REQ_IRQ\n", __func__);
+ break;
+ }
+
+ mutex_unlock(&mdev_state->ops_lock);
+ return ret;
+}
+
+static int mtty_trigger_interrupt(uuid_le uuid)
+{
+ int ret = -1;
+ struct mdev_state *mdev_state;
+
+ mdev_state = find_mdev_state_by_uuid(uuid);
+
+ if (!mdev_state) {
+ pr_info("%s: mdev not found\n", __func__);
+ return -EINVAL;
+ }
+
+ if ((mdev_state->irq_index == VFIO_PCI_MSI_IRQ_INDEX) &&
+ (!mdev_state->msi_evtfd))
+ return -EINVAL;
+ else if ((mdev_state->irq_index == VFIO_PCI_INTX_IRQ_INDEX) &&
+ (!mdev_state->intx_evtfd)) {
+ pr_info("%s: Intr eventfd not found\n", __func__);
+ return -EINVAL;
+ }
+
+ if (mdev_state->irq_index == VFIO_PCI_MSI_IRQ_INDEX)
+ ret = eventfd_signal(mdev_state->msi_evtfd, 1);
+ else
+ ret = eventfd_signal(mdev_state->intx_evtfd, 1);
+
+#if defined(DEBUG_INTR)
+ pr_info("Intx triggered\n");
+#endif
+ if (ret != 1)
+ pr_err("%s: eventfd signal failed (%d)\n", __func__, ret);
+
+ return ret;
+}
+
+int mtty_get_region_info(struct mdev_device *mdev,
+ struct vfio_region_info *region_info,
+ u16 *cap_type_id, void **cap_type)
+{
+ unsigned int size = 0;
+ struct mdev_state *mdev_state;
+ u32 bar_index;
+
+ if (!mdev)
+ return -EINVAL;
+
+ mdev_state = mdev_get_drvdata(mdev);
+ if (!mdev_state)
+ return -EINVAL;
+
+ bar_index = region_info->index;
+ if (bar_index >= VFIO_PCI_NUM_REGIONS)
+ return -EINVAL;
+
+ mutex_lock(&mdev_state->ops_lock);
+
+ switch (bar_index) {
+ case VFIO_PCI_CONFIG_REGION_INDEX:
+ size = MTTY_CONFIG_SPACE_SIZE;
+ break;
+ case VFIO_PCI_BAR0_REGION_INDEX:
+ size = MTTY_IO_BAR_SIZE;
+ break;
+ case VFIO_PCI_BAR1_REGION_INDEX:
+ if (mdev_state->nr_ports == 2)
+ size = MTTY_IO_BAR_SIZE;
+ break;
+ default:
+ size = 0;
+ break;
+ }
+
+ mdev_state->region_info[bar_index].size = size;
+ mdev_state->region_info[bar_index].vfio_offset =
+ MTTY_VFIO_PCI_INDEX_TO_OFFSET(bar_index);
+
+ region_info->size = size;
+ region_info->offset = MTTY_VFIO_PCI_INDEX_TO_OFFSET(bar_index);
+ region_info->flags = VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE;
+ mutex_unlock(&mdev_state->ops_lock);
+ return 0;
+}
+
+int mtty_get_irq_info(struct mdev_device *mdev, struct vfio_irq_info *irq_info)
+{
+ switch (irq_info->index) {
+ case VFIO_PCI_INTX_IRQ_INDEX:
+ case VFIO_PCI_MSI_IRQ_INDEX:
+ case VFIO_PCI_REQ_IRQ_INDEX:
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ irq_info->flags = VFIO_IRQ_INFO_EVENTFD;
+ irq_info->count = 1;
+
+ if (irq_info->index == VFIO_PCI_INTX_IRQ_INDEX)
+ irq_info->flags |= (VFIO_IRQ_INFO_MASKABLE |
+ VFIO_IRQ_INFO_AUTOMASKED);
+ else
+ irq_info->flags |= VFIO_IRQ_INFO_NORESIZE;
+
+ return 0;
+}
+
+int mtty_get_device_info(struct mdev_device *mdev,
+ struct vfio_device_info *dev_info)
+{
+ dev_info->flags = VFIO_DEVICE_FLAGS_PCI;
+ dev_info->num_regions = VFIO_PCI_NUM_REGIONS;
+ dev_info->num_irqs = VFIO_PCI_NUM_IRQS;
+
+ return 0;
+}
+
+static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd,
+ unsigned long arg)
+{
+ int ret = 0;
+ unsigned long minsz;
+ struct mdev_state *mdev_state;
+
+ if (!mdev)
+ return -EINVAL;
+
+ mdev_state = mdev_get_drvdata(mdev);
+ if (!mdev_state)
+ return -ENODEV;
+
+ switch (cmd) {
+ case VFIO_DEVICE_GET_INFO:
+ {
+ struct vfio_device_info info;
+
+ minsz = offsetofend(struct vfio_device_info, num_irqs);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ ret = mtty_get_device_info(mdev, &info);
+ if (ret)
+ return ret;
+
+ memcpy(&mdev_state->dev_info, &info, sizeof(info));
+
+ if (copy_to_user((void __user *)arg, &info, minsz))
+ return -EFAULT;
+
+ return 0;
+ }
+ case VFIO_DEVICE_GET_REGION_INFO:
+ {
+ struct vfio_region_info info;
+ u16 cap_type_id = 0;
+ void *cap_type = NULL;
+
+ minsz = offsetofend(struct vfio_region_info, offset);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ ret = mtty_get_region_info(mdev, &info, &cap_type_id,
+ &cap_type);
+ if (ret)
+ return ret;
+
+ if (copy_to_user((void __user *)arg, &info, minsz))
+ return -EFAULT;
+
+ return 0;
+ }
+
+ case VFIO_DEVICE_GET_IRQ_INFO:
+ {
+ struct vfio_irq_info info;
+
+ minsz = offsetofend(struct vfio_irq_info, count);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if ((info.argsz < minsz) ||
+ (info.index >= mdev_state->dev_info.num_irqs))
+ return -EINVAL;
+
+ ret = mtty_get_irq_info(mdev, &info);
+ if (ret)
+ return ret;
+
+ if (copy_to_user((void __user *)arg, &info, minsz))
+ return -EFAULT;
+
+ return 0;
+ }
+ case VFIO_DEVICE_SET_IRQS:
+ {
+ struct vfio_irq_set hdr;
+ u8 *data = NULL, *ptr = NULL;
+ size_t data_size = 0;
+
+ minsz = offsetofend(struct vfio_irq_set, count);
+
+ if (copy_from_user(&hdr, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ ret = vfio_set_irqs_validate_and_prepare(&hdr,
+ mdev_state->dev_info.num_irqs,
+ VFIO_PCI_NUM_IRQS,
+ &data_size);
+ if (ret)
+ return ret;
+
+ if (data_size) {
+ ptr = data = memdup_user((void __user *)(arg + minsz),
+ data_size);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+ }
+
+ ret = mtty_set_irqs(mdev, hdr.flags, hdr.index, hdr.start,
+ hdr.count, data);
+
+ kfree(ptr);
+ return ret;
+ }
+ case VFIO_DEVICE_RESET:
+ return mtty_reset(mdev);
+ }
+ return -ENOTTY;
+}
+
+int mtty_open(struct mdev_device *mdev)
+{
+ pr_info("%s\n", __func__);
+ return 0;
+}
+
+void mtty_close(struct mdev_device *mdev)
+{
+ pr_info("%s\n", __func__);
+}
+
+static ssize_t
+sample_mtty_dev_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "This is phy device\n");
+}
+
+static DEVICE_ATTR_RO(sample_mtty_dev);
+
+static struct attribute *mtty_dev_attrs[] = {
+ &dev_attr_sample_mtty_dev.attr,
+ NULL,
+};
+
+static const struct attribute_group mtty_dev_group = {
+ .name = "mtty_dev",
+ .attrs = mtty_dev_attrs,
+};
+
+const struct attribute_group *mtty_dev_groups[] = {
+ &mtty_dev_group,
+ NULL,
+};
+
+static ssize_t
+sample_mdev_dev_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ if (mdev_from_dev(dev))
+ return sprintf(buf, "This is MDEV %s\n", dev_name(dev));
+
+ return sprintf(buf, "\n");
+}
+
+static DEVICE_ATTR_RO(sample_mdev_dev);
+
+static struct attribute *mdev_dev_attrs[] = {
+ &dev_attr_sample_mdev_dev.attr,
+ NULL,
+};
+
+static const struct attribute_group mdev_dev_group = {
+ .name = "vendor",
+ .attrs = mdev_dev_attrs,
+};
+
+const struct attribute_group *mdev_dev_groups[] = {
+ &mdev_dev_group,
+ NULL,
+};
+
+static ssize_t
+name_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+ char name[MTTY_STRING_LEN];
+ int i;
+ const char *name_str[2] = {"Single port serial", "Dual port serial"};
+
+ for (i = 0; i < 2; i++) {
+ snprintf(name, MTTY_STRING_LEN, "%s-%d",
+ dev_driver_string(dev), i + 1);
+ if (!strcmp(kobj->name, name))
+ return sprintf(buf, "%s\n", name_str[i]);
+ }
+
+ return -EINVAL;
+}
+
+MDEV_TYPE_ATTR_RO(name);
+
+static ssize_t
+available_instances_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+ char name[MTTY_STRING_LEN];
+ int i;
+ struct mdev_state *mds;
+ int ports = 0, used = 0;
+
+ for (i = 0; i < 2; i++) {
+ snprintf(name, MTTY_STRING_LEN, "%s-%d",
+ dev_driver_string(dev), i + 1);
+ if (!strcmp(kobj->name, name)) {
+ ports = i + 1;
+ break;
+ }
+ }
+
+ if (!ports)
+ return -EINVAL;
+
+ list_for_each_entry(mds, &mdev_devices_list, next)
+ used += mds->nr_ports;
+
+ return sprintf(buf, "%d\n", (MAX_MTTYS - used)/ports);
+}
+
+MDEV_TYPE_ATTR_RO(available_instances);
+
+
+static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
+ char *buf)
+{
+ return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
+}
+
+MDEV_TYPE_ATTR_RO(device_api);
+
+static struct attribute *mdev_types_attrs[] = {
+ &mdev_type_attr_name.attr,
+ &mdev_type_attr_device_api.attr,
+ &mdev_type_attr_available_instances.attr,
+ NULL,
+};
+
+static struct attribute_group mdev_type_group1 = {
+ .name = "1",
+ .attrs = mdev_types_attrs,
+};
+
+static struct attribute_group mdev_type_group2 = {
+ .name = "2",
+ .attrs = mdev_types_attrs,
+};
+
+struct attribute_group *mdev_type_groups[] = {
+ &mdev_type_group1,
+ &mdev_type_group2,
+ NULL,
+};
+
+static const struct mdev_parent_ops mdev_fops = {
+ .owner = THIS_MODULE,
+ .dev_attr_groups = mtty_dev_groups,
+ .mdev_attr_groups = mdev_dev_groups,
+ .supported_type_groups = mdev_type_groups,
+ .create = mtty_create,
+ .remove = mtty_remove,
+ .open = mtty_open,
+ .release = mtty_close,
+ .read = mtty_read,
+ .write = mtty_write,
+ .ioctl = mtty_ioctl,
+};
+
+static void mtty_device_release(struct device *dev)
+{
+ dev_dbg(dev, "mtty: released\n");
+}
+
+static int __init mtty_dev_init(void)
+{
+ int ret = 0;
+
+ pr_info("mtty_dev: %s\n", __func__);
+
+ memset(&mtty_dev, 0, sizeof(mtty_dev));
+
+ idr_init(&mtty_dev.vd_idr);
+
+ ret = alloc_chrdev_region(&mtty_dev.vd_devt, 0, MINORMASK, MTTY_NAME);
+
+ if (ret < 0) {
+ pr_err("Error: failed to register mtty_dev, err:%d\n", ret);
+ return ret;
+ }
+
+ cdev_init(&mtty_dev.vd_cdev, &vd_fops);
+ cdev_add(&mtty_dev.vd_cdev, mtty_dev.vd_devt, MINORMASK);
+
+ pr_info("major_number:%d\n", MAJOR(mtty_dev.vd_devt));
+
+ mtty_dev.vd_class = class_create(THIS_MODULE, MTTY_CLASS_NAME);
+
+ if (IS_ERR(mtty_dev.vd_class)) {
+ pr_err("Error: failed to register mtty_dev class\n");
+ ret = PTR_ERR(mtty_dev.vd_class);
+ goto failed1;
+ }
+
+ mtty_dev.dev.class = mtty_dev.vd_class;
+ mtty_dev.dev.release = mtty_device_release;
+ dev_set_name(&mtty_dev.dev, "%s", MTTY_NAME);
+
+ ret = device_register(&mtty_dev.dev);
+ if (ret)
+ goto failed2;
+
+ ret = mdev_register_device(&mtty_dev.dev, &mdev_fops);
+ if (ret)
+ goto failed3;
+
+ mutex_init(&mdev_list_lock);
+ INIT_LIST_HEAD(&mdev_devices_list);
+
+ goto all_done;
+
+failed3:
+
+ device_unregister(&mtty_dev.dev);
+failed2:
+ class_destroy(mtty_dev.vd_class);
+
+failed1:
+ cdev_del(&mtty_dev.vd_cdev);
+ unregister_chrdev_region(mtty_dev.vd_devt, MINORMASK);
+
+all_done:
+ return ret;
+}
+
+static void __exit mtty_dev_exit(void)
+{
+ mtty_dev.dev.bus = NULL;
+ mdev_unregister_device(&mtty_dev.dev);
+
+ device_unregister(&mtty_dev.dev);
+ idr_destroy(&mtty_dev.vd_idr);
+ cdev_del(&mtty_dev.vd_cdev);
+ unregister_chrdev_region(mtty_dev.vd_devt, MINORMASK);
+ class_destroy(mtty_dev.vd_class);
+ mtty_dev.vd_class = NULL;
+ pr_info("mtty_dev: Unloaded!\n");
+}
+
+module_init(mtty_dev_init)
+module_exit(mtty_dev_exit)
+
+MODULE_LICENSE("GPL v2");
+MODULE_INFO(supported, "Test driver that simulate serial port over PCI");
+MODULE_VERSION(VERSION_STRING);
+MODULE_AUTHOR(DRIVER_AUTHOR);
diff --git a/samples/watchdog/.gitignore b/samples/watchdog/.gitignore
new file mode 100644
index 000000000..ff0ebb540
--- /dev/null
+++ b/samples/watchdog/.gitignore
@@ -0,0 +1 @@
+watchdog-simple
diff --git a/samples/watchdog/Makefile b/samples/watchdog/Makefile
new file mode 100644
index 000000000..a9430fa60
--- /dev/null
+++ b/samples/watchdog/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+CC := $(CROSS_COMPILE)gcc
+PROGS := watchdog-simple
+
+all: $(PROGS)
+
+clean:
+ rm -fr $(PROGS)
+
diff --git a/samples/watchdog/watchdog-simple.c b/samples/watchdog/watchdog-simple.c
new file mode 100644
index 000000000..9ce66d2ca
--- /dev/null
+++ b/samples/watchdog/watchdog-simple.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+int main(void)
+{
+ int fd = open("/dev/watchdog", O_WRONLY);
+ int ret = 0;
+ if (fd == -1) {
+ perror("watchdog");
+ exit(EXIT_FAILURE);
+ }
+ while (1) {
+ ret = write(fd, "\0", 1);
+ if (ret != 1) {
+ ret = -1;
+ break;
+ }
+ sleep(10);
+ }
+ close(fd);
+ return ret;
+}