diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 10:05:51 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 10:05:51 +0000 |
commit | 5d1646d90e1f2cceb9f0828f4b28318cd0ec7744 (patch) | |
tree | a94efe259b9009378be6d90eb30d2b019d95c194 /samples | |
parent | Initial commit. (diff) | |
download | linux-upstream/5.10.209.tar.xz linux-upstream/5.10.209.zip |
Adding upstream version 5.10.209.upstream/5.10.209upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'samples')
248 files changed, 41327 insertions, 0 deletions
diff --git a/samples/Kconfig b/samples/Kconfig new file mode 100644 index 000000000..e76cdfc50 --- /dev/null +++ b/samples/Kconfig @@ -0,0 +1,219 @@ +# SPDX-License-Identifier: GPL-2.0-only +menuconfig SAMPLES + bool "Sample kernel code" + help + You can build and test sample kernel code here. + +if SAMPLES + +config SAMPLE_AUXDISPLAY + bool "auxdisplay sample" + depends on CC_CAN_LINK + +config SAMPLE_TRACE_EVENTS + tristate "Build trace_events examples -- loadable modules only" + depends on EVENT_TRACING && m + help + This build trace event example modules. + +config SAMPLE_TRACE_PRINTK + tristate "Build trace_printk module - tests various trace_printk formats" + depends on EVENT_TRACING && m + help + This builds a module that calls trace_printk() and can be used to + test various trace_printk() calls from a module. + +config SAMPLE_FTRACE_DIRECT + tristate "Build register_ftrace_direct() example" + depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS && m + depends on X86_64 # has x86_64 inlined asm + help + This builds an ftrace direct function example + that hooks to wake_up_process and prints the parameters. + +config SAMPLE_TRACE_ARRAY + tristate "Build sample module for kernel access to Ftrace instancess" + depends on EVENT_TRACING && m + help + This builds a module that demonstrates the use of various APIs to + access Ftrace instances from within the kernel. + +config SAMPLE_KOBJECT + tristate "Build kobject examples" + help + This config option will allow you to build a number of + different kobject sample modules showing how to use kobjects, + ksets, and ktypes properly. + + If in doubt, say "N" here. + +config SAMPLE_KPROBES + tristate "Build kprobes examples -- loadable modules only" + depends on KPROBES && m + help + This build several kprobes example modules. 
+ +config SAMPLE_KRETPROBES + tristate "Build kretprobes example -- loadable modules only" + default m + depends on SAMPLE_KPROBES && KRETPROBES + +config SAMPLE_HW_BREAKPOINT + tristate "Build kernel hardware breakpoint examples -- loadable module only" + depends on HAVE_HW_BREAKPOINT && m + help + This builds kernel hardware breakpoint example modules. + +config SAMPLE_KFIFO + tristate "Build kfifo examples -- loadable modules only" + depends on m + help + This config option will allow you to build a number of + different kfifo sample modules showing how to use the + generic kfifo API. + + If in doubt, say "N" here. + +config SAMPLE_KDB + tristate "Build kdb command example -- loadable modules only" + depends on KGDB_KDB && m + help + Build an example of how to dynamically add the hello + command to the kdb shell. + +config SAMPLE_QMI_CLIENT + tristate "Build qmi client sample -- loadable modules only" + depends on m + depends on ARCH_QCOM + depends on NET + select QCOM_QMI_HELPERS + help + Build an QMI client sample driver, which demonstrates how to + communicate with a remote QRTR service, using QMI encoded messages. + +config SAMPLE_RPMSG_CLIENT + tristate "Build rpmsg client sample -- loadable modules only" + depends on RPMSG && m + help + Build an rpmsg client sample driver, which demonstrates how + to communicate with an AMP-configured remote processor over + the rpmsg bus. + +config SAMPLE_LIVEPATCH + tristate "Build live patching samples -- loadable modules only" + depends on LIVEPATCH && m + help + Build sample live patch demonstrations. + +config SAMPLE_CONFIGFS + tristate "Build configfs patching sample -- loadable modules only" + depends on CONFIGFS_FS && m + help + Builds a sample configfs interface. 
+ +config SAMPLE_CONNECTOR + tristate "Build connector sample -- loadable modules only" + depends on CONNECTOR && HEADERS_INSTALL && m + help + When enabled, this builds both a sample kernel module for + the connector interface and a user space tool to communicate + with it. + See also Documentation/driver-api/connector.rst + +config SAMPLE_HIDRAW + bool "hidraw sample" + depends on CC_CAN_LINK && HEADERS_INSTALL + +config SAMPLE_PIDFD + bool "pidfd sample" + depends on CC_CAN_LINK && HEADERS_INSTALL + +config SAMPLE_SECCOMP + bool "Build seccomp sample code" + depends on SECCOMP_FILTER && CC_CAN_LINK && HEADERS_INSTALL + help + Build samples of seccomp filters using various methods of + BPF filter construction. + +config SAMPLE_TIMER + bool "Timer sample" + depends on CC_CAN_LINK && HEADERS_INSTALL + +config SAMPLE_UHID + bool "UHID sample" + depends on CC_CAN_LINK && HEADERS_INSTALL + help + Build UHID sample program. + +config SAMPLE_VFIO_MDEV_MTTY + tristate "Build VFIO mtty example mediated device sample code -- loadable modules only" + depends on VFIO_MDEV_DEVICE && m + help + Build a virtual tty sample driver for use as a VFIO + mediated device + +config SAMPLE_VFIO_MDEV_MDPY + tristate "Build VFIO mdpy example mediated device sample code -- loadable modules only" + depends on VFIO_MDEV_DEVICE && m + help + Build a virtual display sample driver for use as a VFIO + mediated device. It is a simple framebuffer and supports + the region display interface (VFIO_GFX_PLANE_TYPE_REGION). + +config SAMPLE_VFIO_MDEV_MDPY_FB + tristate "Build VFIO mdpy example guest fbdev driver -- loadable module only" + depends on FB && m + select FB_CFB_FILLRECT + select FB_CFB_COPYAREA + select FB_CFB_IMAGEBLIT + help + Guest fbdev driver for the virtual display sample driver. 
+ +config SAMPLE_VFIO_MDEV_MBOCHS + tristate "Build VFIO mdpy example mediated device sample code -- loadable modules only" + depends on VFIO_MDEV_DEVICE && m + select DMA_SHARED_BUFFER + help + Build a virtual display sample driver for use as a VFIO + mediated device. It supports the region display interface + (VFIO_GFX_PLANE_TYPE_DMABUF). + Emulate enough of qemu stdvga to make bochs-drm.ko happy. + That is basically the vram memory bar and the bochs dispi + interface vbe registers in the mmio register bar. + Specifically it does *not* include any legacy vga stuff. + Device looks a lot like "qemu -device secondary-vga". + +config SAMPLE_ANDROID_BINDERFS + bool "Build Android binderfs example" + depends on CC_CAN_LINK && HEADERS_INSTALL + help + Builds a sample program to illustrate the use of the Android binderfs + filesystem. + +config SAMPLE_VFS + bool "Build example programs that use new VFS system calls" + depends on CC_CAN_LINK && HEADERS_INSTALL + help + Build example userspace programs that use new VFS system calls such + as mount API and statx(). Note that this is restricted to the x86 + arch whilst it accesses system calls that aren't yet in all arches. + +config SAMPLE_INTEL_MEI + bool "Build example program working with intel mei driver" + depends on INTEL_MEI + depends on CC_CAN_LINK && HEADERS_INSTALL + help + Build a sample program to work with mei device. + +config SAMPLE_WATCHDOG + bool "watchdog sample" + depends on CC_CAN_LINK + +config SAMPLE_WATCH_QUEUE + bool "Build example watch_queue notification API consumer" + depends on CC_CAN_LINK && HEADERS_INSTALL + help + Build example userspace program to use the new mount_notify(), + sb_notify() syscalls and the KEYCTL_WATCH_KEY keyctl() function. 
+ +endif # SAMPLES diff --git a/samples/Makefile b/samples/Makefile new file mode 100644 index 000000000..c3392a595 --- /dev/null +++ b/samples/Makefile @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: GPL-2.0 +# Makefile for Linux samples code + +subdir-$(CONFIG_SAMPLE_AUXDISPLAY) += auxdisplay +subdir-$(CONFIG_SAMPLE_ANDROID_BINDERFS) += binderfs +obj-$(CONFIG_SAMPLE_CONFIGFS) += configfs/ +obj-$(CONFIG_SAMPLE_CONNECTOR) += connector/ +subdir-$(CONFIG_SAMPLE_HIDRAW) += hidraw +obj-$(CONFIG_SAMPLE_HW_BREAKPOINT) += hw_breakpoint/ +obj-$(CONFIG_SAMPLE_KDB) += kdb/ +obj-$(CONFIG_SAMPLE_KFIFO) += kfifo/ +obj-$(CONFIG_SAMPLE_KOBJECT) += kobject/ +obj-$(CONFIG_SAMPLE_KPROBES) += kprobes/ +obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch/ +subdir-$(CONFIG_SAMPLE_PIDFD) += pidfd +obj-$(CONFIG_SAMPLE_QMI_CLIENT) += qmi/ +obj-$(CONFIG_SAMPLE_RPMSG_CLIENT) += rpmsg/ +subdir-$(CONFIG_SAMPLE_SECCOMP) += seccomp +subdir-$(CONFIG_SAMPLE_TIMER) += timers +obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace_events/ +obj-$(CONFIG_SAMPLE_TRACE_PRINTK) += trace_printk/ +obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace/ +obj-$(CONFIG_SAMPLE_TRACE_ARRAY) += ftrace/ +subdir-$(CONFIG_SAMPLE_UHID) += uhid +obj-$(CONFIG_VIDEO_PCI_SKELETON) += v4l/ +obj-y += vfio-mdev/ +subdir-$(CONFIG_SAMPLE_VFS) += vfs +obj-$(CONFIG_SAMPLE_INTEL_MEI) += mei/ +subdir-$(CONFIG_SAMPLE_WATCHDOG) += watchdog +subdir-$(CONFIG_SAMPLE_WATCH_QUEUE) += watch_queue +obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak/ diff --git a/samples/auxdisplay/.gitignore b/samples/auxdisplay/.gitignore new file mode 100644 index 000000000..2ed744c0e --- /dev/null +++ b/samples/auxdisplay/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +cfag12864b-example diff --git a/samples/auxdisplay/Makefile b/samples/auxdisplay/Makefile new file mode 100644 index 000000000..19d556893 --- /dev/null +++ b/samples/auxdisplay/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +userprogs-always-y += cfag12864b-example diff --git 
a/samples/auxdisplay/cfag12864b-example.c b/samples/auxdisplay/cfag12864b-example.c new file mode 100644 index 000000000..bfeab44f8 --- /dev/null +++ b/samples/auxdisplay/cfag12864b-example.c @@ -0,0 +1,267 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Filename: cfag12864b-example.c + * Version: 0.1.0 + * Description: cfag12864b LCD userspace example program + * + * Author: Copyright (C) Miguel Ojeda Sandonis + * Date: 2006-10-31 + */ + +/* + * ------------------------ + * start of cfag12864b code + * ------------------------ + */ + +#include <string.h> +#include <fcntl.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> + +#define CFAG12864B_WIDTH (128) +#define CFAG12864B_HEIGHT (64) +#define CFAG12864B_SIZE (128 * 64 / 8) +#define CFAG12864B_BPB (8) +#define CFAG12864B_ADDRESS(x, y) ((y) * CFAG12864B_WIDTH / \ + CFAG12864B_BPB + (x) / CFAG12864B_BPB) +#define CFAG12864B_BIT(n) (((unsigned char) 1) << (n)) + +#undef CFAG12864B_DOCHECK +#ifdef CFAG12864B_DOCHECK + #define CFAG12864B_CHECK(x, y) ((x) < CFAG12864B_WIDTH && \ + (y) < CFAG12864B_HEIGHT) +#else + #define CFAG12864B_CHECK(x, y) (1) +#endif + +int cfag12864b_fd; +unsigned char * cfag12864b_mem; +unsigned char cfag12864b_buffer[CFAG12864B_SIZE]; + +/* + * init a cfag12864b framebuffer device + * + * No error: return = 0 + * Unable to open: return = -1 + * Unable to mmap: return = -2 + */ +static int cfag12864b_init(char *path) +{ + cfag12864b_fd = open(path, O_RDWR); + if (cfag12864b_fd == -1) + return -1; + + cfag12864b_mem = mmap(0, CFAG12864B_SIZE, PROT_READ | PROT_WRITE, + MAP_SHARED, cfag12864b_fd, 0); + if (cfag12864b_mem == MAP_FAILED) { + close(cfag12864b_fd); + return -2; + } + + return 0; +} + +/* + * exit a cfag12864b framebuffer device + */ +static void cfag12864b_exit(void) +{ + munmap(cfag12864b_mem, CFAG12864B_SIZE); + close(cfag12864b_fd); +} + +/* + * set (x, y) pixel + */ +static void cfag12864b_set(unsigned char x, unsigned char y) +{ + if 
(CFAG12864B_CHECK(x, y)) + cfag12864b_buffer[CFAG12864B_ADDRESS(x, y)] |= + CFAG12864B_BIT(x % CFAG12864B_BPB); +} + +/* + * unset (x, y) pixel + */ +static void cfag12864b_unset(unsigned char x, unsigned char y) +{ + if (CFAG12864B_CHECK(x, y)) + cfag12864b_buffer[CFAG12864B_ADDRESS(x, y)] &= + ~CFAG12864B_BIT(x % CFAG12864B_BPB); +} + +/* + * is set (x, y) pixel? + * + * Pixel off: return = 0 + * Pixel on: return = 1 + */ +static unsigned char cfag12864b_isset(unsigned char x, unsigned char y) +{ + if (CFAG12864B_CHECK(x, y)) + if (cfag12864b_buffer[CFAG12864B_ADDRESS(x, y)] & + CFAG12864B_BIT(x % CFAG12864B_BPB)) + return 1; + + return 0; +} + +/* + * not (x, y) pixel + */ +static void cfag12864b_not(unsigned char x, unsigned char y) +{ + if (cfag12864b_isset(x, y)) + cfag12864b_unset(x, y); + else + cfag12864b_set(x, y); +} + +/* + * fill (set all pixels) + */ +static void cfag12864b_fill(void) +{ + unsigned short i; + + for (i = 0; i < CFAG12864B_SIZE; i++) + cfag12864b_buffer[i] = 0xFF; +} + +/* + * clear (unset all pixels) + */ +static void cfag12864b_clear(void) +{ + unsigned short i; + + for (i = 0; i < CFAG12864B_SIZE; i++) + cfag12864b_buffer[i] = 0; +} + +/* + * format a [128*64] matrix + * + * Pixel off: src[i] = 0 + * Pixel on: src[i] > 0 + */ +static void cfag12864b_format(unsigned char * matrix) +{ + unsigned char i, j, n; + + for (i = 0; i < CFAG12864B_HEIGHT; i++) + for (j = 0; j < CFAG12864B_WIDTH / CFAG12864B_BPB; j++) { + cfag12864b_buffer[i * CFAG12864B_WIDTH / CFAG12864B_BPB + + j] = 0; + for (n = 0; n < CFAG12864B_BPB; n++) + if (matrix[i * CFAG12864B_WIDTH + + j * CFAG12864B_BPB + n]) + cfag12864b_buffer[i * CFAG12864B_WIDTH / + CFAG12864B_BPB + j] |= + CFAG12864B_BIT(n); + } +} + +/* + * blit buffer to lcd + */ +static void cfag12864b_blit(void) +{ + memcpy(cfag12864b_mem, cfag12864b_buffer, CFAG12864B_SIZE); +} + +/* + * ---------------------- + * end of cfag12864b code + * ---------------------- + */ + +#include <stdio.h> + +#define 
EXAMPLES 6 + +static void example(unsigned char n) +{ + unsigned short i, j; + unsigned char matrix[CFAG12864B_WIDTH * CFAG12864B_HEIGHT]; + + if (n > EXAMPLES) + return; + + printf("Example %i/%i - ", n, EXAMPLES); + + switch (n) { + case 1: + printf("Draw points setting bits"); + cfag12864b_clear(); + for (i = 0; i < CFAG12864B_WIDTH; i += 2) + for (j = 0; j < CFAG12864B_HEIGHT; j += 2) + cfag12864b_set(i, j); + break; + + case 2: + printf("Clear the LCD"); + cfag12864b_clear(); + break; + + case 3: + printf("Draw rows formatting a [128*64] matrix"); + memset(matrix, 0, CFAG12864B_WIDTH * CFAG12864B_HEIGHT); + for (i = 0; i < CFAG12864B_WIDTH; i++) + for (j = 0; j < CFAG12864B_HEIGHT; j += 2) + matrix[j * CFAG12864B_WIDTH + i] = 1; + cfag12864b_format(matrix); + break; + + case 4: + printf("Fill the lcd"); + cfag12864b_fill(); + break; + + case 5: + printf("Draw columns unsetting bits"); + for (i = 0; i < CFAG12864B_WIDTH; i += 2) + for (j = 0; j < CFAG12864B_HEIGHT; j++) + cfag12864b_unset(i, j); + break; + + case 6: + printf("Do negative not-ing all bits"); + for (i = 0; i < CFAG12864B_WIDTH; i++) + for (j = 0; j < CFAG12864B_HEIGHT; j ++) + cfag12864b_not(i, j); + break; + } + + puts(" - [Press Enter]"); +} + +int main(int argc, char *argv[]) +{ + unsigned char n; + + if (argc != 2) { + printf( + "Syntax: %s fbdev\n" + "Usually: /dev/fb0, /dev/fb1...\n", argv[0]); + return -1; + } + + if (cfag12864b_init(argv[1])) { + printf("Can't init %s fbdev\n", argv[1]); + return -2; + } + + for (n = 1; n <= EXAMPLES; n++) { + example(n); + cfag12864b_blit(); + while (getchar() != '\n'); + } + + cfag12864b_exit(); + + return 0; +} diff --git a/samples/binderfs/.gitignore b/samples/binderfs/.gitignore new file mode 100644 index 000000000..eb60241e8 --- /dev/null +++ b/samples/binderfs/.gitignore @@ -0,0 +1 @@ +binderfs_example diff --git a/samples/binderfs/Makefile b/samples/binderfs/Makefile new file mode 100644 index 000000000..629e43b9b --- /dev/null +++ 
b/samples/binderfs/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +userprogs-always-y += binderfs_example + +userccflags += -I usr/include diff --git a/samples/binderfs/binderfs_example.c b/samples/binderfs/binderfs_example.c new file mode 100644 index 000000000..0fd92cdda --- /dev/null +++ b/samples/binderfs/binderfs_example.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/mount.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> +#include <linux/android/binder.h> +#include <linux/android/binderfs.h> + +int main(int argc, char *argv[]) +{ + int fd, ret, saved_errno; + struct binderfs_device device = { 0 }; + + ret = unshare(CLONE_NEWNS); + if (ret < 0) { + fprintf(stderr, "%s - Failed to unshare mount namespace\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + + ret = mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0); + if (ret < 0) { + fprintf(stderr, "%s - Failed to mount / as private\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + + ret = mkdir("/dev/binderfs", 0755); + if (ret < 0 && errno != EEXIST) { + fprintf(stderr, "%s - Failed to create binderfs mountpoint\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + + ret = mount(NULL, "/dev/binderfs", "binder", 0, 0); + if (ret < 0) { + fprintf(stderr, "%s - Failed to mount binderfs\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + + memcpy(device.name, "my-binder", strlen("my-binder")); + + fd = open("/dev/binderfs/binder-control", O_RDONLY | O_CLOEXEC); + if (fd < 0) { + fprintf(stderr, "%s - Failed to open binder-control device\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + + ret = ioctl(fd, BINDER_CTL_ADD, &device); + saved_errno = errno; + close(fd); + errno = saved_errno; + if (ret < 0) { + fprintf(stderr, "%s - Failed to allocate new binder device\n", + strerror(errno)); 
+ exit(EXIT_FAILURE); + } + + printf("Allocated new binder device with major %d, minor %d, and name %s\n", + device.major, device.minor, device.name); + + ret = unlink("/dev/binderfs/my-binder"); + if (ret < 0) { + fprintf(stderr, "%s - Failed to delete binder device\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + + /* Cleanup happens when the mount namespace dies. */ + exit(EXIT_SUCCESS); +} diff --git a/samples/bpf/.gitignore b/samples/bpf/.gitignore new file mode 100644 index 000000000..b2f29bc8d --- /dev/null +++ b/samples/bpf/.gitignore @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: GPL-2.0-only +cpustat +fds_example +hbm +ibumad +lathist +lwt_len_hist +map_perf_test +offwaketime +per_socket_stats_example +sampleip +sock_example +sockex1 +sockex2 +sockex3 +spintest +syscall_nrs.h +syscall_tp +task_fd_query +tc_l2_redirect +test_cgrp2_array_pin +test_cgrp2_attach +test_cgrp2_attach2 +test_cgrp2_sock +test_cgrp2_sock2 +test_current_task_under_cgroup +test_lru_dist +test_map_in_map +test_overhead +test_probe_write_user +trace_event +trace_output +tracex1 +tracex2 +tracex3 +tracex4 +tracex5 +tracex6 +tracex7 +xdp1 +xdp2 +xdp_adjust_tail +xdp_fwd +xdp_monitor +xdp_redirect +xdp_redirect_cpu +xdp_redirect_map +xdp_router_ipv4 +xdp_rxq_info +xdp_sample_pkts +xdp_tx_iptunnel +xdpsock +xsk_fwd +testfile.img diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile new file mode 100644 index 000000000..aeebf5d12 --- /dev/null +++ b/samples/bpf/Makefile @@ -0,0 +1,329 @@ +# SPDX-License-Identifier: GPL-2.0 + +BPF_SAMPLES_PATH ?= $(abspath $(srctree)/$(src)) +TOOLS_PATH := $(BPF_SAMPLES_PATH)/../../tools + +# List of programs to build +tprogs-y := test_lru_dist +tprogs-y += sock_example +tprogs-y += fds_example +tprogs-y += sockex1 +tprogs-y += sockex2 +tprogs-y += sockex3 +tprogs-y += tracex1 +tprogs-y += tracex2 +tprogs-y += tracex3 +tprogs-y += tracex4 +tprogs-y += tracex5 +tprogs-y += tracex6 +tprogs-y += tracex7 +tprogs-y += test_probe_write_user +tprogs-y += 
trace_output +tprogs-y += lathist +tprogs-y += offwaketime +tprogs-y += spintest +tprogs-y += map_perf_test +tprogs-y += test_overhead +tprogs-y += test_cgrp2_array_pin +tprogs-y += test_cgrp2_attach +tprogs-y += test_cgrp2_sock +tprogs-y += test_cgrp2_sock2 +tprogs-y += xdp1 +tprogs-y += xdp2 +tprogs-y += xdp_router_ipv4 +tprogs-y += test_current_task_under_cgroup +tprogs-y += trace_event +tprogs-y += sampleip +tprogs-y += tc_l2_redirect +tprogs-y += lwt_len_hist +tprogs-y += xdp_tx_iptunnel +tprogs-y += test_map_in_map +tprogs-y += per_socket_stats_example +tprogs-y += xdp_redirect +tprogs-y += xdp_redirect_map +tprogs-y += xdp_redirect_cpu +tprogs-y += xdp_monitor +tprogs-y += xdp_rxq_info +tprogs-y += syscall_tp +tprogs-y += cpustat +tprogs-y += xdp_adjust_tail +tprogs-y += xdpsock +tprogs-y += xsk_fwd +tprogs-y += xdp_fwd +tprogs-y += task_fd_query +tprogs-y += xdp_sample_pkts +tprogs-y += ibumad +tprogs-y += hbm + +# Libbpf dependencies +LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a + +CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o +TRACE_HELPERS := ../../tools/testing/selftests/bpf/trace_helpers.o + +fds_example-objs := fds_example.o +sockex1-objs := sockex1_user.o +sockex2-objs := sockex2_user.o +sockex3-objs := sockex3_user.o +tracex1-objs := tracex1_user.o $(TRACE_HELPERS) +tracex2-objs := tracex2_user.o +tracex3-objs := tracex3_user.o +tracex4-objs := tracex4_user.o +tracex5-objs := tracex5_user.o $(TRACE_HELPERS) +tracex6-objs := tracex6_user.o +tracex7-objs := tracex7_user.o +test_probe_write_user-objs := test_probe_write_user_user.o +trace_output-objs := trace_output_user.o $(TRACE_HELPERS) +lathist-objs := lathist_user.o +offwaketime-objs := offwaketime_user.o $(TRACE_HELPERS) +spintest-objs := spintest_user.o $(TRACE_HELPERS) +map_perf_test-objs := map_perf_test_user.o +test_overhead-objs := bpf_load.o test_overhead_user.o +test_cgrp2_array_pin-objs := test_cgrp2_array_pin.o +test_cgrp2_attach-objs := test_cgrp2_attach.o 
+test_cgrp2_sock-objs := test_cgrp2_sock.o +test_cgrp2_sock2-objs := bpf_load.o test_cgrp2_sock2.o +xdp1-objs := xdp1_user.o +# reuse xdp1 source intentionally +xdp2-objs := xdp1_user.o +xdp_router_ipv4-objs := xdp_router_ipv4_user.o +test_current_task_under_cgroup-objs := $(CGROUP_HELPERS) \ + test_current_task_under_cgroup_user.o +trace_event-objs := trace_event_user.o $(TRACE_HELPERS) +sampleip-objs := sampleip_user.o $(TRACE_HELPERS) +tc_l2_redirect-objs := bpf_load.o tc_l2_redirect_user.o +lwt_len_hist-objs := bpf_load.o lwt_len_hist_user.o +xdp_tx_iptunnel-objs := xdp_tx_iptunnel_user.o +test_map_in_map-objs := test_map_in_map_user.o +per_socket_stats_example-objs := cookie_uid_helper_example.o +xdp_redirect-objs := xdp_redirect_user.o +xdp_redirect_map-objs := xdp_redirect_map_user.o +xdp_redirect_cpu-objs := xdp_redirect_cpu_user.o +xdp_monitor-objs := xdp_monitor_user.o +xdp_rxq_info-objs := xdp_rxq_info_user.o +syscall_tp-objs := syscall_tp_user.o +cpustat-objs := cpustat_user.o +xdp_adjust_tail-objs := xdp_adjust_tail_user.o +xdpsock-objs := xdpsock_user.o +xsk_fwd-objs := xsk_fwd.o +xdp_fwd-objs := xdp_fwd_user.o +task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS) +xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS) +ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS) +hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS) + +# Tell kbuild to always build the programs +always-y := $(tprogs-y) +always-y += sockex1_kern.o +always-y += sockex2_kern.o +always-y += sockex3_kern.o +always-y += tracex1_kern.o +always-y += tracex2_kern.o +always-y += tracex3_kern.o +always-y += tracex4_kern.o +always-y += tracex5_kern.o +always-y += tracex6_kern.o +always-y += tracex7_kern.o +always-y += sock_flags_kern.o +always-y += test_probe_write_user_kern.o +always-y += trace_output_kern.o +always-y += tcbpf1_kern.o +always-y += tc_l2_redirect_kern.o +always-y += lathist_kern.o +always-y += offwaketime_kern.o +always-y += spintest_kern.o 
+always-y += map_perf_test_kern.o +always-y += test_overhead_tp_kern.o +always-y += test_overhead_raw_tp_kern.o +always-y += test_overhead_kprobe_kern.o +always-y += parse_varlen.o parse_simple.o parse_ldabs.o +always-y += test_cgrp2_tc_kern.o +always-y += xdp1_kern.o +always-y += xdp2_kern.o +always-y += xdp_router_ipv4_kern.o +always-y += test_current_task_under_cgroup_kern.o +always-y += trace_event_kern.o +always-y += sampleip_kern.o +always-y += lwt_len_hist_kern.o +always-y += xdp_tx_iptunnel_kern.o +always-y += test_map_in_map_kern.o +always-y += tcp_synrto_kern.o +always-y += tcp_rwnd_kern.o +always-y += tcp_bufs_kern.o +always-y += tcp_cong_kern.o +always-y += tcp_iw_kern.o +always-y += tcp_clamp_kern.o +always-y += tcp_basertt_kern.o +always-y += tcp_tos_reflect_kern.o +always-y += tcp_dumpstats_kern.o +always-y += xdp_redirect_kern.o +always-y += xdp_redirect_map_kern.o +always-y += xdp_redirect_cpu_kern.o +always-y += xdp_monitor_kern.o +always-y += xdp_rxq_info_kern.o +always-y += xdp2skb_meta_kern.o +always-y += syscall_tp_kern.o +always-y += cpustat_kern.o +always-y += xdp_adjust_tail_kern.o +always-y += xdp_fwd_kern.o +always-y += task_fd_query_kern.o +always-y += xdp_sample_pkts_kern.o +always-y += ibumad_kern.o +always-y += hbm_out_kern.o +always-y += hbm_edt_kern.o +always-y += xdpsock_kern.o + +ifeq ($(ARCH), arm) +# Strip all except -D__LINUX_ARM_ARCH__ option needed to handle linux +# headers when arm instruction set identification is requested. 
+ARM_ARCH_SELECTOR := $(filter -D__LINUX_ARM_ARCH__%, $(KBUILD_CFLAGS)) +BPF_EXTRA_CFLAGS := $(ARM_ARCH_SELECTOR) +TPROGS_CFLAGS += $(ARM_ARCH_SELECTOR) +endif + +TPROGS_CFLAGS += -Wall -O2 +TPROGS_CFLAGS += -Wmissing-prototypes +TPROGS_CFLAGS += -Wstrict-prototypes + +TPROGS_CFLAGS += -I$(objtree)/usr/include +TPROGS_CFLAGS += -I$(srctree)/tools/testing/selftests/bpf/ +TPROGS_CFLAGS += -I$(srctree)/tools/lib/ +TPROGS_CFLAGS += -I$(srctree)/tools/include +TPROGS_CFLAGS += -I$(srctree)/tools/perf +TPROGS_CFLAGS += -DHAVE_ATTR_TEST=0 + +ifdef SYSROOT +TPROGS_CFLAGS += --sysroot=$(SYSROOT) +TPROGS_LDFLAGS := -L$(SYSROOT)/usr/lib +endif + +TPROGCFLAGS_bpf_load.o += -Wno-unused-variable + +TPROGS_LDLIBS += $(LIBBPF) -lelf -lz +TPROGLDLIBS_tracex4 += -lrt +TPROGLDLIBS_trace_output += -lrt +TPROGLDLIBS_map_perf_test += -lrt +TPROGLDLIBS_test_overhead += -lrt +TPROGLDLIBS_xdpsock += -pthread +TPROGLDLIBS_xsk_fwd += -pthread + +# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: +# make M=samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang +LLC ?= llc +CLANG ?= clang +OPT ?= opt +LLVM_DIS ?= llvm-dis +LLVM_OBJCOPY ?= llvm-objcopy +BTF_PAHOLE ?= pahole + +# Detect that we're cross compiling and use the cross compiler +ifdef CROSS_COMPILE +CLANG_ARCH_ARGS = --target=$(notdir $(CROSS_COMPILE:%-=%)) +endif + +# Don't evaluate probes and warnings if we need to run make recursively +ifneq ($(src),) +HDR_PROBE := $(shell printf "\#include <linux/types.h>\n struct list_head { int a; }; int main() { return 0; }" | \ + $(CC) $(TPROGS_CFLAGS) $(TPROGS_LDFLAGS) -x c - \ + -o /dev/null 2>/dev/null && echo okay) + +ifeq ($(HDR_PROBE),) +$(warning WARNING: Detected possible issues with include path.) +$(warning WARNING: Please install kernel headers locally (make headers_install).) 
+endif + +BTF_LLC_PROBE := $(shell $(LLC) -march=bpf -mattr=help 2>&1 | grep dwarfris) +BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help 2>&1 | grep BTF) +BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --help 2>&1 | grep -i 'usage.*llvm') +BTF_LLVM_PROBE := $(shell echo "int main() { return 0; }" | \ + $(CLANG) -target bpf -O2 -g -c -x c - -o ./llvm_btf_verify.o; \ + readelf -S ./llvm_btf_verify.o | grep BTF; \ + /bin/rm -f ./llvm_btf_verify.o) + +BPF_EXTRA_CFLAGS += -fno-stack-protector +ifneq ($(BTF_LLVM_PROBE),) + BPF_EXTRA_CFLAGS += -g +else +ifneq ($(and $(BTF_LLC_PROBE),$(BTF_PAHOLE_PROBE),$(BTF_OBJCOPY_PROBE)),) + BPF_EXTRA_CFLAGS += -g + LLC_FLAGS += -mattr=dwarfris + DWARF2BTF = y +endif +endif +endif + +# Trick to allow make to be run from this directory +all: + $(MAKE) -C ../../ M=$(CURDIR) BPF_SAMPLES_PATH=$(CURDIR) + +clean: + $(MAKE) -C ../../ M=$(CURDIR) clean + @find $(CURDIR) -type f -name '*~' -delete + +$(LIBBPF): FORCE +# Fix up variables inherited from Kbuild that tools/ build system won't like + $(MAKE) -C $(dir $@) RM='rm -rf' EXTRA_CFLAGS="$(TPROGS_CFLAGS)" \ + LDFLAGS=$(TPROGS_LDFLAGS) srctree=$(BPF_SAMPLES_PATH)/../../ O= + +$(obj)/syscall_nrs.h: $(obj)/syscall_nrs.s FORCE + $(call filechk,offsets,__SYSCALL_NRS_H__) + +targets += syscall_nrs.s +clean-files += syscall_nrs.h + +FORCE: + + +# Verify LLVM compiler tools are available and bpf target is supported by llc +.PHONY: verify_cmds verify_target_bpf $(CLANG) $(LLC) + +verify_cmds: $(CLANG) $(LLC) + @for TOOL in $^ ; do \ + if ! (which -- "$${TOOL}" > /dev/null 2>&1); then \ + echo "*** ERROR: Cannot find LLVM tool $${TOOL}" ;\ + exit 1; \ + else true; fi; \ + done + +verify_target_bpf: verify_cmds + @if ! 
(${LLC} -march=bpf -mattr=help > /dev/null 2>&1); then \ + echo "*** ERROR: LLVM (${LLC}) does not support 'bpf' target" ;\ + echo " NOTICE: LLVM version >= 3.7.1 required" ;\ + exit 2; \ + else true; fi + +$(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF) +$(src)/*.c: verify_target_bpf $(LIBBPF) + +$(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h +$(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h +$(obj)/hbm.o: $(src)/hbm.h +$(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h + +-include $(BPF_SAMPLES_PATH)/Makefile.target + +# asm/sysreg.h - inline assembly used by it is incompatible with llvm. +# But, there is no easy way to fix it, so just exclude it since it is +# useless for BPF samples. +# below we use long chain of commands, clang | opt | llvm-dis | llc, +# to generate final object file. 'clang' compiles the source into IR +# with native target, e.g., x64, arm64, etc. 'opt' does bpf CORE IR builtin +# processing (llvm12) and IR optimizations. 'llvm-dis' converts +# 'opt' output to IR, and finally 'llc' generates bpf byte code. 
+$(obj)/%.o: $(src)/%.c + @echo " CLANG-bpf " $@ + $(Q)$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(BPF_EXTRA_CFLAGS) \ + -I$(obj) -I$(srctree)/tools/testing/selftests/bpf/ \ + -I$(srctree)/tools/lib/ \ + -D__KERNEL__ -D__BPF_TRACING__ -Wno-unused-value -Wno-pointer-sign \ + -D__TARGET_ARCH_$(SRCARCH) -Wno-compare-distinct-pointer-types \ + -Wno-gnu-variable-sized-type-not-at-end \ + -Wno-address-of-packed-member -Wno-tautological-compare \ + -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \ + -I$(srctree)/samples/bpf/ -include asm_goto_workaround.h \ + -O2 -emit-llvm -Xclang -disable-llvm-passes -c $< -o - | \ + $(OPT) -O2 -mtriple=bpf-pc-linux | $(LLVM_DIS) | \ + $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@ +ifeq ($(DWARF2BTF),y) + $(BTF_PAHOLE) -J $@ +endif diff --git a/samples/bpf/Makefile.target b/samples/bpf/Makefile.target new file mode 100644 index 000000000..7621f55e2 --- /dev/null +++ b/samples/bpf/Makefile.target @@ -0,0 +1,75 @@ +# SPDX-License-Identifier: GPL-2.0 +# ========================================================================== +# Building binaries on the host system +# Binaries are not used during the compilation of the kernel, and intended +# to be build for target board, target board can be host of course. Added to +# build binaries to run not on host system. 
+# +# Sample syntax +# tprogs-y := xsk_example +# Will compile xsk_example.c and create an executable named xsk_example +# +# tprogs-y := xdpsock +# xdpsock-objs := xdpsock_1.o xdpsock_2.o +# Will compile xdpsock_1.c and xdpsock_2.c, and then link the executable +# xdpsock, based on xdpsock_1.o and xdpsock_2.o +# +# Derived from scripts/Makefile.host +# +__tprogs := $(sort $(tprogs-y)) + +# C code +# Executables compiled from a single .c file +tprog-csingle := $(foreach m,$(__tprogs), \ + $(if $($(m)-objs),,$(m))) + +# C executables linked based on several .o files +tprog-cmulti := $(foreach m,$(__tprogs),\ + $(if $($(m)-objs),$(m))) + +# Object (.o) files compiled from .c files +tprog-cobjs := $(sort $(foreach m,$(__tprogs),$($(m)-objs))) + +tprog-csingle := $(addprefix $(obj)/,$(tprog-csingle)) +tprog-cmulti := $(addprefix $(obj)/,$(tprog-cmulti)) +tprog-cobjs := $(addprefix $(obj)/,$(tprog-cobjs)) + +##### +# Handle options to gcc. Support building with separate output directory + +_tprogc_flags = $(TPROGS_CFLAGS) \ + $(TPROGCFLAGS_$(basetarget).o) + +# $(objtree)/$(obj) for including generated headers from checkin source files +ifeq ($(KBUILD_EXTMOD),) +ifdef building_out_of_srctree +_tprogc_flags += -I $(objtree)/$(obj) +endif +endif + +tprogc_flags = -Wp,-MD,$(depfile) $(_tprogc_flags) + +# Create executable from a single .c file +# tprog-csingle -> Executable +quiet_cmd_tprog-csingle = CC $@ + cmd_tprog-csingle = $(CC) $(tprogc_flags) $(TPROGS_LDFLAGS) -o $@ $< \ + $(TPROGS_LDLIBS) $(TPROGLDLIBS_$(@F)) +$(tprog-csingle): $(obj)/%: $(src)/%.c FORCE + $(call if_changed_dep,tprog-csingle) + +# Link an executable based on list of .o files, all plain c +# tprog-cmulti -> executable +quiet_cmd_tprog-cmulti = LD $@ + cmd_tprog-cmulti = $(CC) $(tprogc_flags) $(TPROGS_LDFLAGS) -o $@ \ + $(addprefix $(obj)/,$($(@F)-objs)) \ + $(TPROGS_LDLIBS) $(TPROGLDLIBS_$(@F)) +$(tprog-cmulti): $(tprog-cobjs) FORCE + $(call if_changed,tprog-cmulti) +$(call multi_depend, 
$(tprog-cmulti), , -objs) + +# Create .o file from a single .c file +# tprog-cobjs -> .o +quiet_cmd_tprog-cobjs = CC $@ + cmd_tprog-cobjs = $(CC) $(tprogc_flags) -c -o $@ $< +$(tprog-cobjs): $(obj)/%.o: $(src)/%.c FORCE + $(call if_changed_dep,tprog-cobjs) diff --git a/samples/bpf/README.rst b/samples/bpf/README.rst new file mode 100644 index 000000000..dd34b2d26 --- /dev/null +++ b/samples/bpf/README.rst @@ -0,0 +1,105 @@ +eBPF sample programs +==================== + +This directory contains a test stubs, verifier test-suite and examples +for using eBPF. The examples use libbpf from tools/lib/bpf. + +Build dependencies +================== + +Compiling requires having installed: + * clang >= version 3.4.0 + * llvm >= version 3.7.1 + +Note that LLVM's tool 'llc' must support target 'bpf', list version +and supported targets with command: ``llc --version`` + +Clean and configuration +----------------------- + +It can be needed to clean tools, samples or kernel before trying new arch or +after some changes (on demand):: + + make -C tools clean + make -C samples/bpf clean + make clean + +Configure kernel, defconfig for instance:: + + make defconfig + +Kernel headers +-------------- + +There are usually dependencies to header files of the current kernel. +To avoid installing devel kernel headers system wide, as a normal +user, simply call:: + + make headers_install + +This will creates a local "usr/include" directory in the git/build top +level directory, that the make system automatically pickup first. + +Compiling +========= + +For building the BPF samples, issue the below command from the kernel +top level directory:: + + make M=samples/bpf + +It is also possible to call make from this directory. This will just +hide the invocation of make as above. + +Manually compiling LLVM with 'bpf' support +------------------------------------------ + +Since version 3.7.0, LLVM adds a proper LLVM backend target for the +BPF bytecode architecture. 
+
+By default llvm will build all non-experimental backends including bpf.
+To generate a smaller llc binary one can use::
+
+ -DLLVM_TARGETS_TO_BUILD="BPF"
+
+Quick snippet for manually compiling LLVM and clang
+(build dependencies are cmake and gcc-c++)::
+
+ $ git clone http://llvm.org/git/llvm.git
+ $ cd llvm/tools
+ $ git clone --depth 1 http://llvm.org/git/clang.git
+ $ cd ..; mkdir build; cd build
+ $ cmake .. -DLLVM_TARGETS_TO_BUILD="BPF;X86"
+ $ make -j $(getconf _NPROCESSORS_ONLN)
+
+It is also possible to point make to the newly compiled 'llc' or
+'clang' command via redefining LLC or CLANG on the make command line::
+
+ make M=samples/bpf LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
+
+Cross compiling samples
+-----------------------
+In order to cross-compile, say for arm64 targets, export the CROSS_COMPILE and
+ARCH environment variables before calling make. Do this before the clean,
+configuration and header install steps described above. This will direct make
+to build samples for the cross target::
+
+ export ARCH=arm64
+ export CROSS_COMPILE="aarch64-linux-gnu-"
+
+Headers can also be installed on the RFS of the target board if they need to
+be kept in sync (this is not required, and it also creates a local
+"usr/include" directory)::
+
+ make INSTALL_HDR_PATH=~/some_sysroot/usr headers_install
+
+Pointing LLC and CLANG is not necessary if they are installed on the HOST and
+have the appropriate arm64 arch among their targets (usually they have several
+arches).
+Build samples:: + + make M=samples/bpf + +Or build samples with SYSROOT if some header or library is absent in toolchain, +say libelf, providing address to file system containing headers and libs, +can be RFS of target board:: + + make M=samples/bpf SYSROOT=~/some_sysroot diff --git a/samples/bpf/asm_goto_workaround.h b/samples/bpf/asm_goto_workaround.h new file mode 100644 index 000000000..7048bb359 --- /dev/null +++ b/samples/bpf/asm_goto_workaround.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2019 Facebook */ +#ifndef __ASM_GOTO_WORKAROUND_H +#define __ASM_GOTO_WORKAROUND_H + +/* + * This will bring in asm_volatile_goto and asm_inline macro definitions + * if enabled by compiler and config options. + */ +#include <linux/types.h> + +#ifdef asm_volatile_goto +#undef asm_volatile_goto +#define asm_volatile_goto(x...) asm volatile("invalid use of asm_volatile_goto") +#endif + +/* + * asm_inline is defined as asm __inline in "include/linux/compiler_types.h" + * if supported by the kernel's CC (i.e CONFIG_CC_HAS_ASM_INLINE) which is not + * supported by CLANG. + */ +#ifdef asm_inline +#undef asm_inline +#define asm_inline asm +#endif + +#define volatile(x...) 
volatile("") +#endif diff --git a/samples/bpf/bpf_insn.h b/samples/bpf/bpf_insn.h new file mode 100644 index 000000000..544237980 --- /dev/null +++ b/samples/bpf/bpf_insn.h @@ -0,0 +1,217 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* eBPF instruction mini library */ +#ifndef __BPF_INSN_H +#define __BPF_INSN_H + +struct bpf_insn; + +/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ + +#define BPF_ALU64_REG(OP, DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + +#define BPF_ALU32_REG(OP, DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + +/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */ + +#define BPF_ALU64_IMM(OP, DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +#define BPF_ALU32_IMM(OP, DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +/* Short form of mov, dst_reg = src_reg */ + +#define BPF_MOV64_REG(DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + +#define BPF_MOV32_REG(DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + +/* Short form of mov, dst_reg = imm32 */ + +#define BPF_MOV64_IMM(DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +#define BPF_MOV32_IMM(DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_MOV | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +/* BPF_LD_IMM64 macro encodes single 'load 64-bit 
immediate' insn */ +#define BPF_LD_IMM64(DST, IMM) \ + BPF_LD_IMM64_RAW(DST, 0, IMM) + +#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_DW | BPF_IMM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = (__u32) (IMM) }), \ + ((struct bpf_insn) { \ + .code = 0, /* zero is reserved opcode */ \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = ((__u64) (IMM)) >> 32 }) + +#ifndef BPF_PSEUDO_MAP_FD +# define BPF_PSEUDO_MAP_FD 1 +#endif + +/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */ +#define BPF_LD_MAP_FD(DST, MAP_FD) \ + BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD) + + +/* Direct packet access, R0 = *(uint *) (skb->data + imm32) */ + +#define BPF_LD_ABS(SIZE, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +/* Memory load, dst_reg = *(uint *) (src_reg + off16) */ + +#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +/* Memory store, *(uint *) (dst_reg + off16) = src_reg */ + +#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +/* Atomic memory add, *(uint *)(dst_reg + off16) += src_reg */ + +#define BPF_STX_XADD(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +/* Memory store, *(uint *) (dst_reg + off16) = imm32 */ + +#define BPF_ST_MEM(SIZE, DST, OFF, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */ + 
+#define BPF_JMP_REG(OP, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +/* Like BPF_JMP_REG, but with 32-bit wide operands for comparison. */ + +#define BPF_JMP32_REG(OP, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP32 | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */ + +#define BPF_JMP_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +/* Like BPF_JMP_IMM, but with 32-bit wide operands for comparison. */ + +#define BPF_JMP32_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP32 | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +/* Raw code statement block */ + +#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \ + ((struct bpf_insn) { \ + .code = CODE, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = IMM }) + +/* Program exit */ + +#define BPF_EXIT_INSN() \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_EXIT, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = 0 }) + +#endif diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c new file mode 100644 index 000000000..c5ad528f0 --- /dev/null +++ b/samples/bpf/bpf_load.c @@ -0,0 +1,667 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <stdio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <libelf.h> +#include <gelf.h> +#include <errno.h> +#include <unistd.h> +#include <string.h> +#include <stdbool.h> +#include <stdlib.h> +#include <linux/bpf.h> +#include <linux/filter.h> +#include <linux/perf_event.h> +#include <linux/netlink.h> +#include <linux/rtnetlink.h> +#include <linux/types.h> +#include <sys/socket.h> +#include 
<sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <poll.h>
+#include <ctype.h>
+#include <assert.h>
+#include <bpf/bpf.h>
+#include "bpf_load.h"
+#include "perf-sys.h"
+
+#define DEBUGFS "/sys/kernel/debug/tracing/"
+
+static char license[128];
+static int kern_version;
+static bool processed_sec[128];
+char bpf_log_buf[BPF_LOG_BUF_SIZE];
+int map_fd[MAX_MAPS];
+int prog_fd[MAX_PROGS];
+int event_fd[MAX_PROGS];
+int prog_cnt;
+int prog_array_fd = -1;
+
+struct bpf_map_data map_data[MAX_MAPS];
+int map_data_count;
+
+/*
+ * Store @prog_fd in the map referenced by prog_array_fd, at the index given
+ * by the leading decimal number in @event.
+ *
+ * Returns 0 on success, -1 if the map update fails.
+ */
+static int populate_prog_array(const char *event, int prog_fd)
+{
+	int ind = atoi(event), err;
+
+	err = bpf_map_update_elem(prog_array_fd, &ind, &prog_fd, BPF_ANY);
+	if (err < 0) {
+		printf("failed to store prog_fd in prog_array\n");
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * Write @val to the "kprobe_events" control file below DEBUGFS. An empty
+ * string opens the file with O_TRUNC, any other string is appended.
+ *
+ * Returns the number of bytes written, or -1 on failure. When open() or
+ * write() fails, errno is left as set by that call.
+ */
+static int write_kprobe_events(const char *val)
+{
+	int fd, ret, flags, saved_errno;
+
+	if (val == NULL)
+		return -1;
+	else if (val[0] == '\0')
+		flags = O_WRONLY | O_TRUNC;
+	else
+		flags = O_WRONLY | O_APPEND;
+
+	fd = open(DEBUGFS "kprobe_events", flags);
+	if (fd < 0)
+		return -1;
+
+	ret = write(fd, val, strlen(val));
+	/* close() may overwrite errno, which callers print via strerror() */
+	saved_errno = errno;
+	close(fd);
+	errno = saved_errno;
+
+	return ret;
+}
+
+/*
+ * Load one BPF program and attach it according to its ELF section name.
+ *
+ * @event: section name; its prefix ("kprobe/", "tracepoint/", "xdp", ...)
+ *         selects the program type and the way the program is attached
+ * @prog:  instructions of the program
+ * @size:  size of @prog in bytes
+ *
+ * The program fd is appended to prog_fd[]; for kprobe, tracepoint and
+ * raw tracepoint programs the event fd is stored at the same index in
+ * event_fd[].
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
+{
+	bool is_socket = strncmp(event, "socket", 6) == 0;
+	bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
+	bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
+	bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0;
+	bool is_raw_tracepoint = strncmp(event, "raw_tracepoint/", 15) == 0;
+	bool is_xdp = strncmp(event, "xdp", 3) == 0;
+	bool is_perf_event = strncmp(event, "perf_event", 10) == 0;
+	bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0;
+	bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0;
+	bool is_sockops = strncmp(event, "sockops", 7) == 0;
+	bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0;
+	bool is_sk_msg = strncmp(event, "sk_msg", 6) == 0;
+	size_t insns_cnt = size / sizeof(struct bpf_insn);
+	enum bpf_prog_type prog_type;
+	char buf[256];
+	int fd, efd, err, id, len;
+	struct perf_event_attr attr = {};
+
+	attr.type = PERF_TYPE_TRACEPOINT;
+	attr.sample_type = PERF_SAMPLE_RAW;
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+
+	if (is_socket) {
+		prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+	} else if (is_kprobe || is_kretprobe) {
+		prog_type = BPF_PROG_TYPE_KPROBE;
+	} else if (is_tracepoint) {
+		prog_type = BPF_PROG_TYPE_TRACEPOINT;
+	} else if (is_raw_tracepoint) {
+		prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT;
+	} else if (is_xdp) {
+		prog_type = BPF_PROG_TYPE_XDP;
+	} else if (is_perf_event) {
+		prog_type = BPF_PROG_TYPE_PERF_EVENT;
+	} else if (is_cgroup_skb) {
+		prog_type = BPF_PROG_TYPE_CGROUP_SKB;
+	} else if (is_cgroup_sk) {
+		prog_type = BPF_PROG_TYPE_CGROUP_SOCK;
+	} else if (is_sockops) {
+		prog_type = BPF_PROG_TYPE_SOCK_OPS;
+	} else if (is_sk_skb) {
+		prog_type = BPF_PROG_TYPE_SK_SKB;
+	} else if (is_sk_msg) {
+		prog_type = BPF_PROG_TYPE_SK_MSG;
+	} else {
+		printf("Unknown event '%s'\n", event);
+		return -1;
+	}
+
+	if (prog_cnt == MAX_PROGS)
+		return -1;
+
+	fd = bpf_load_program(prog_type, prog, insns_cnt, license, kern_version,
+			      bpf_log_buf, BPF_LOG_BUF_SIZE);
+	if (fd < 0) {
+		printf("bpf_load_program() err=%d\n%s", errno, bpf_log_buf);
+		return -1;
+	}
+
+	prog_fd[prog_cnt++] = fd;
+
+	/* these program types are attached by the caller, not here */
+	if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk)
+		return 0;
+
+	if (is_socket || is_sockops || is_sk_skb || is_sk_msg) {
+		size_t skip = is_socket ? 6 : 7;
+
+		/*
+		 * "sk_skb" and "sk_msg" are only 6 characters long: never
+		 * step past the terminating NUL of a bare section name.
+		 */
+		if (strlen(event) < skip)
+			return 0;
+		event += skip;
+		if (*event != '/')
+			return 0;
+		event++;
+		if (!isdigit((unsigned char)*event)) {
+			printf("invalid prog number\n");
+			return -1;
+		}
+		return populate_prog_array(event, fd);
+	}
+
+	if (is_raw_tracepoint) {
+		efd = bpf_raw_tracepoint_open(event + 15, fd);
+		if (efd < 0) {
+			printf("tracepoint %s %s\n", event + 15, strerror(errno));
+			return -1;
+		}
+		event_fd[prog_cnt - 1] = efd;
+		return 0;
+	}
+
+	if (is_kprobe || is_kretprobe) {
+		bool need_normal_check = true;
+		const char *event_prefix = "";
+
+		if (is_kprobe)
+			event += 7;
+		else
+			event += 10;
+
+		if (*event == 0) {
+			printf("event name cannot be empty\n");
+			return -1;
+		}
+
+		if (isdigit((unsigned char)*event))
+			return populate_prog_array(event, fd);
+
+#ifdef __x86_64__
+		if (strncmp(event, "sys_", 4) == 0) {
+			len = snprintf(buf, sizeof(buf), "%c:__x64_%s __x64_%s",
+				       is_kprobe ? 'p' : 'r', event, event);
+			if (len < 0 || (size_t)len >= sizeof(buf)) {
+				printf("event name '%s' is too long\n", event);
+				return -1;
+			}
+			err = write_kprobe_events(buf);
+			if (err >= 0) {
+				need_normal_check = false;
+				event_prefix = "__x64_";
+			}
+		}
+#endif
+		if (need_normal_check) {
+			len = snprintf(buf, sizeof(buf), "%c:%s %s",
+				       is_kprobe ? 'p' : 'r', event, event);
+			if (len < 0 || (size_t)len >= sizeof(buf)) {
+				printf("event name '%s' is too long\n", event);
+				return -1;
+			}
+			err = write_kprobe_events(buf);
+			if (err < 0) {
+				printf("failed to create kprobe '%s' error '%s'\n",
+				       event, strerror(errno));
+				return -1;
+			}
+		}
+
+		/* bounded replacement for the former strcpy()/strcat() chain */
+		len = snprintf(buf, sizeof(buf), DEBUGFS "events/kprobes/%s%s/id",
+			       event_prefix, event);
+		if (len < 0 || (size_t)len >= sizeof(buf)) {
+			printf("event name '%s' is too long\n", event);
+			return -1;
+		}
+	} else if (is_tracepoint) {
+		event += 11;
+
+		if (*event == 0) {
+			printf("event name cannot be empty\n");
+			return -1;
+		}
+		len = snprintf(buf, sizeof(buf), DEBUGFS "events/%s/id", event);
+		if (len < 0 || (size_t)len >= sizeof(buf)) {
+			printf("event name '%s' is too long\n", event);
+			return -1;
+		}
+	}
+
+	/* buf now holds the path of the event's "id" file */
+	efd = open(buf, O_RDONLY, 0);
+	if (efd < 0) {
+		printf("failed to open event %s\n", event);
+		return -1;
+	}
+
+	err = read(efd, buf, sizeof(buf));
+	if (err < 0 || (size_t)err >= sizeof(buf)) {
+		printf("read from '%s' failed '%s'\n", event, strerror(errno));
+		close(efd);
+		return -1;
+	}
+
+	close(efd);
+
+	buf[err] = 0;
+	id = atoi(buf);
+	attr.config = id;
+
+	efd = sys_perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
+	if (efd < 0) {
+		printf("event %d fd %d err %s\n", id, efd, strerror(errno));
+		return -1;
+	}
+	event_fd[prog_cnt - 1] = efd;
+	err = ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
+	if (err < 0) {
+		printf("ioctl PERF_EVENT_IOC_ENABLE failed err %s\n",
+		       strerror(errno));
+		return -1;
+	}
+	err = ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd);
+	if (err < 0) {
+		printf("ioctl PERF_EVENT_IOC_SET_BPF failed err %s\n",
+		       strerror(errno));
+		return -1;
+	}
+
+	return 0;
+}
+
+static int
load_maps(struct bpf_map_data *maps, int nr_maps,
+		     fixup_map_cb fixup_map)
+{
+	/*
+	 * Create the first @nr_maps maps described by @maps and record their
+	 * fds in map_fd[] and maps[].fd. If @fixup_map is given it is called
+	 * for every map first; a map whose fd it sets is not created again.
+	 * Returns 0 on success, 1 on failure.
+	 */
+	int i, numa_node;
+
+	for (i = 0; i < nr_maps; i++) {
+		if (fixup_map) {
+			fixup_map(&maps[i], i);
+			/* Allow userspace to assign map FD prior to creation */
+			if (maps[i].fd != -1) {
+				map_fd[i] = maps[i].fd;
+				continue;
+			}
+		}
+
+		numa_node = maps[i].def.map_flags & BPF_F_NUMA_NODE ?
+			maps[i].def.numa_node : -1;
+
+		if (maps[i].def.type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
+		    maps[i].def.type == BPF_MAP_TYPE_HASH_OF_MAPS) {
+			unsigned int inner_idx = maps[i].def.inner_map_idx;
+			int inner_map_fd;
+
+			/* the index is copied from the ELF file: keep it inside map_fd[] */
+			if (inner_idx >= MAX_MAPS) {
+				printf("invalid inner map index %u for map %d (%s)\n",
+				       inner_idx, i, maps[i].name);
+				return 1;
+			}
+			inner_map_fd = map_fd[inner_idx];
+
+			map_fd[i] = bpf_create_map_in_map_node(maps[i].def.type,
+							maps[i].name,
+							maps[i].def.key_size,
+							inner_map_fd,
+							maps[i].def.max_entries,
+							maps[i].def.map_flags,
+							numa_node);
+		} else {
+			map_fd[i] = bpf_create_map_node(maps[i].def.type,
+							maps[i].name,
+							maps[i].def.key_size,
+							maps[i].def.value_size,
+							maps[i].def.max_entries,
+							maps[i].def.map_flags,
+							numa_node);
+		}
+		if (map_fd[i] < 0) {
+			printf("failed to create map %d (%s): %d %s\n",
+			       i, maps[i].name, errno, strerror(errno));
+			return 1;
+		}
+		maps[i].fd = map_fd[i];
+
+		if (maps[i].def.type == BPF_MAP_TYPE_PROG_ARRAY)
+			prog_array_fd = map_fd[i];
+	}
+	return 0;
+}
+
+/*
+ * Fetch header, name and data of ELF section @i.
+ *
+ * Returns 0 on success. A non-zero value (1..4) tells which step failed:
+ * section lookup, header read, missing name or empty section, or the
+ * section not consisting of exactly one data block.
+ */
+static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname,
+		   GElf_Shdr *shdr, Elf_Data **data)
+{
+	Elf_Scn *scn;
+
+	scn = elf_getscn(elf, i);
+	if (!scn)
+		return 1;
+
+	if (gelf_getshdr(scn, shdr) != shdr)
+		return 2;
+
+	*shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
+	if (!*shname || !shdr->sh_size)
+		return 3;
+
+	*data = elf_getdata(scn, 0);
+	if (!*data || elf_getdata(scn, *data) != NULL)
+		return 4;
+
+	return 0;
+}
+
+/*
+ * Apply the map relocations in @data to the program @insn (@insn_cnt
+ * instructions): each relocated BPF_LD_IMM64 instruction is marked with
+ * BPF_PSEUDO_MAP_FD and gets the fd of the map whose ELF offset equals the
+ * value of the referenced symbol.
+ *
+ * Returns 0 on success, 1 on a malformed or unmatched relocation.
+ */
+static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols,
+				GElf_Shdr *shdr, struct bpf_insn *insn,
+				size_t insn_cnt,
+				struct bpf_map_data *maps, int nr_maps)
+{
+	int i, nrels;
+
+	if (!shdr->sh_entsize) {
+		printf("invalid relo section: zero entry size\n");
+		return 1;
+	}
+	nrels = shdr->sh_size / shdr->sh_entsize;
+
+	for (i = 0; i < nrels; i++) {
+		GElf_Sym sym;
+		GElf_Rel rel;
+		unsigned int insn_idx;
+		bool match = false;
+		int map_idx;
+
+		if (!gelf_getrel(data, i, &rel)) {
+			printf("failed to read relo %d\n", i);
+			return 1;
+		}
+
+		insn_idx = rel.r_offset / sizeof(struct bpf_insn);
+		/* the offset is read from the ELF file: keep it inside the program */
+		if (insn_idx >= insn_cnt) {
+			printf("invalid relo offset for insn[%u]\n", insn_idx);
+			return 1;
+		}
+
+		if (!gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym)) {
+			printf("failed to read symbol of relo %d\n", i);
+			return 1;
+		}
+
+		if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) {
+			printf("invalid relo for insn[%d].code 0x%x\n",
+			       insn_idx, insn[insn_idx].code);
+			return 1;
+		}
+		insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD;
+
+		/* Match FD relocation against recorded map_data[] offset */
+		for (map_idx = 0; map_idx < nr_maps; map_idx++) {
+			if (maps[map_idx].elf_offset == sym.st_value) {
+				match = true;
+				break;
+			}
+		}
+		if (match) {
+			insn[insn_idx].imm = maps[map_idx].fd;
+		} else {
+			printf("invalid relo for insn[%d] no map_data match\n",
+			       insn_idx);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/* qsort() comparator: order symbols by ascending st_value */
+static int cmp_symbols(const void *l, const void *r)
+{
+	const GElf_Sym *lsym = (const GElf_Sym *)l;
+	const GElf_Sym *rsym = (const GElf_Sym *)r;
+
+	if (lsym->st_value < rsym->st_value)
+		return -1;
+	else if (lsym->st_value > rsym->st_value)
+		return 1;
+	else
+		return 0;
+}
+
+/*
+ * Fill @maps from the ELF "maps" section with index @maps_shndx: one entry
+ * per symbol that lives in that section, ordered by offset.
+ *
+ * Returns the number of maps found, or a negative errno value on failure.
+ */
+static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx,
+				 Elf *elf, Elf_Data *symbols, int strtabidx)
+{
+	int map_sz_elf, map_sz_copy;
+	bool validate_zero = false;
+	Elf_Data *data_maps = NULL;
+	int i, nr_maps, ret;
+	GElf_Sym *sym;
+	Elf_Scn *scn;
+
+	if (maps_shndx < 0)
+		return -EINVAL;
+	if (!symbols)
+		return -EINVAL;
+
+	/* Get data for maps section via elf index */
+	scn = elf_getscn(elf, maps_shndx);
+	if (scn)
+		data_maps = elf_getdata(scn, NULL);
+	if (!scn || !data_maps) {
+		printf("Failed to get Elf_Data from maps section %d\n",
+		       maps_shndx);
+		return -EINVAL;
+	}
+
+	/*
+	 * For each map get the corresponding symbol table entry.
+	 * sym[] has one spare slot that is used as scratch space while
+	 * probing symbols that may not belong to the maps section.
+	 */
+	sym = calloc(MAX_MAPS+1, sizeof(GElf_Sym));
+	if (!sym)
+		return -ENOMEM;
+	for (i = 0, nr_maps = 0;
+	     (size_t)i < symbols->d_size / sizeof(GElf_Sym); i++) {
+		if (!gelf_getsym(symbols, i, &sym[nr_maps]))
+			continue;
+		if (sym[nr_maps].st_shndx != maps_shndx)
+			continue;
+		if (nr_maps == MAX_MAPS) {
+			printf("too many maps, at most %d are supported\n",
+			       MAX_MAPS);
+			ret = -E2BIG;
+			goto out;
+		}
+		/* Only increment iff the symbol is in the maps section */
+		nr_maps++;
+	}
+
+	/* Without symbols the per-map size below cannot be computed */
+	if (nr_maps == 0) {
+		printf("no map symbols found in maps section %d\n",
+		       maps_shndx);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Align to map_fd[] order, via sort on offset in sym.st_value */
+	qsort(sym, nr_maps, sizeof(GElf_Sym), cmp_symbols);
+
+	/* Keeping compatible with ELF maps section changes
+	 * ------------------------------------------------
+	 * The program size of struct bpf_load_map_def is known by loader
+	 * code, but struct stored in ELF file can be different.
+	 *
+	 * Unfortunately sym[i].st_size is zero. To calculate the
+	 * struct size stored in the ELF file, assume all struct have
+	 * the same size, and simply divide with number of map
+	 * symbols.
+	 */
+	map_sz_elf = data_maps->d_size / nr_maps;
+	map_sz_copy = sizeof(struct bpf_load_map_def);
+	if (map_sz_elf < map_sz_copy) {
+		/*
+		 * Backward compat, loading older ELF file with
+		 * smaller struct, keeping remaining bytes zero.
+		 */
+		map_sz_copy = map_sz_elf;
+	} else if (map_sz_elf > map_sz_copy) {
+		/*
+		 * Forward compat, loading newer ELF file with larger
+		 * struct with unknown features. Assume zero means
+		 * feature not used. Thus, validate rest of struct
+		 * data is zero.
+		 */
+		validate_zero = true;
+	}
+
+	/* Memcpy relevant part of ELF maps data to loader maps */
+	for (i = 0; i < nr_maps; i++) {
+		struct bpf_load_map_def *def;
+		unsigned char *addr, *end;
+		const char *map_name;
+		size_t offset;
+
+		map_name = elf_strptr(elf, strtabidx, sym[i].st_name);
+		if (!map_name) {
+			printf("failed to get name of map %d\n", i);
+			ret = -EINVAL;
+			goto out;
+		}
+		maps[i].name = strdup(map_name);
+		if (!maps[i].name) {
+			/* save errno before printf() can change it */
+			ret = -errno;
+			printf("strdup(%s): %s(%d)\n", map_name,
+			       strerror(-ret), -ret);
+			goto out;
+		}
+
+		/* Symbol value is offset into ELF maps section data area */
+		offset = sym[i].st_value;
+		if (offset > data_maps->d_size ||
+		    (size_t)map_sz_elf > data_maps->d_size - offset) {
+			printf("map %d (%s) lies outside of the maps section\n",
+			       i, map_name);
+			ret = -EINVAL;
+			goto out;
+		}
+		def = (struct bpf_load_map_def *)
+			((unsigned char *)data_maps->d_buf + offset);
+		maps[i].elf_offset = offset;
+		memset(&maps[i].def, 0, sizeof(struct bpf_load_map_def));
+		memcpy(&maps[i].def, def, map_sz_copy);
+
+		/* Verify no newer features were requested */
+		if (validate_zero) {
+			addr = (unsigned char *) def + map_sz_copy;
+			end = (unsigned char *) def + map_sz_elf;
+			for (; addr < end; addr++) {
+				if (*addr != 0) {
+					ret = -EFBIG;
+					goto out;
+				}
+			}
+		}
+	}
+
+	ret = nr_maps;
+out:
+	free(sym);
+	return ret;
+}
+
+/*
+ * Load the BPF object file @path: read the "license", "version" and "maps"
+ * sections, create the maps (after the optional @fixup_map callback), apply
+ * the map relocations and load every program section with a known prefix.
+ *
+ * Returns 0 when the last program was loaded and attached successfully,
+ * non-zero otherwise (including when the file contains no program section).
+ */
+static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
+{
+	int fd, i, ret = 1, maps_shndx = -1, strtabidx = -1;
+	Elf *elf;
+	GElf_Ehdr ehdr;
+	GElf_Shdr shdr, shdr_prog;
+	Elf_Data *data, *data_prog, *data_maps = NULL, *symbols = NULL;
+	char *shname, *shname_prog;
+	int nr_maps = 0;
+
+	/* reset global variables */
+	kern_version = 0;
+	memset(license, 0, sizeof(license));
+	memset(processed_sec, 0, sizeof(processed_sec));
+
+	if (elf_version(EV_CURRENT) == EV_NONE)
+		return 1;
+
+	fd = open(path, O_RDONLY, 0);
+	if (fd < 0)
+		return 1;
+
+	elf = elf_begin(fd, ELF_C_READ, NULL);
+
+	if (!elf) {
+		close(fd);
+		return 1;
+	}
+
+	if (gelf_getehdr(elf, &ehdr) != &ehdr)
+		goto done;
+
+	/* processed_sec[] is indexed by section number below */
+	if (ehdr.e_shnum > sizeof(processed_sec) / sizeof(processed_sec[0])) {
+		printf("too many ELF sections (%d)\n", (int) ehdr.e_shnum);
+		goto done;
+	}
+
+	/* clear all kprobes */
+	i = write_kprobe_events("");
+
+	/* scan over all elf sections to get license and map info */
+	for (i = 1; i < ehdr.e_shnum; i++) {
+
+		if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
+			continue;
+
+		if (0) /* helpful for llvm debugging */
+			printf("section %d:%s data %p size %zd link %d flags %d\n",
+			       i, shname, data->d_buf, data->d_size,
+			       shdr.sh_link, (int) shdr.sh_flags);
+
+		if (strcmp(shname, "license") == 0) {
+			processed_sec[i] = true;
+			if (data->d_size > sizeof(license)) {
+				printf("invalid size of license section %zd\n",
+				       data->d_size);
+				goto done;
+			}
+			memcpy(license, data->d_buf, data->d_size);
+			/* the string is handed to the kernel: keep it terminated */
+			license[sizeof(license) - 1] = '\0';
+		} else if (strcmp(shname, "version") == 0) {
+			processed_sec[i] = true;
+			if (data->d_size != sizeof(int)) {
+				printf("invalid size of version section %zd\n",
+				       data->d_size);
+				goto done;
+			}
+			memcpy(&kern_version, data->d_buf, sizeof(int));
+		} else if (strcmp(shname, "maps") == 0) {
+			int j;
+
+			maps_shndx = i;
+			data_maps = data;
+			for (j = 0; j < MAX_MAPS; j++)
+				map_data[j].fd = -1;
+		} else if (shdr.sh_type == SHT_SYMTAB) {
+			strtabidx = shdr.sh_link;
+			symbols = data;
+		}
+	}
+
+	if (!symbols) {
+		printf("missing SHT_SYMTAB section\n");
+		goto done;
+	}
+
+	if (data_maps) {
+		nr_maps = load_elf_maps_section(map_data, maps_shndx,
+						elf, symbols, strtabidx);
+		if (nr_maps < 0) {
+			printf("Error: Failed loading ELF maps (errno:%d):%s\n",
+			       nr_maps, strerror(-nr_maps));
+			goto done;
+		}
+		if (load_maps(map_data, nr_maps, fixup_map))
+			goto done;
+		map_data_count = nr_maps;
+
+		processed_sec[maps_shndx] = true;
+	}
+
+	/* process all relo sections, and rewrite bpf insns for maps */
+	for (i = 1; i < ehdr.e_shnum; i++) {
+		if (processed_sec[i])
+			continue;
+
+		if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
+			continue;
+
+		if (shdr.sh_type == SHT_REL) {
+			struct bpf_insn *insns;
+
+			/* locate prog sec that need map fixup (relocations) */
+			if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog,
+				    &shdr_prog, &data_prog))
+				continue;
+
+			if (shdr_prog.sh_type != SHT_PROGBITS ||
+			    !(shdr_prog.sh_flags & SHF_EXECINSTR))
+				continue;
+
+			insns = (struct bpf_insn *) data_prog->d_buf;
+			processed_sec[i] = true; /* relo section */
+
+			if (parse_relo_and_apply(data, symbols, &shdr, insns,
+						 data_prog->d_size /
+						 sizeof(struct bpf_insn),
+						 map_data, nr_maps))
+				continue;
+		}
+	}
+
+	/* load programs */
+	for (i = 1; i < ehdr.e_shnum; i++) {
+
+		if (processed_sec[i])
+			continue;
+
+		if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
+			continue;
+
+		/* strncmp() stops at the NUL of a short section name */
+		if (strncmp(shname, "kprobe/", 7) == 0 ||
+		    strncmp(shname, "kretprobe/", 10) == 0 ||
+		    strncmp(shname, "tracepoint/", 11) == 0 ||
+		    strncmp(shname, "raw_tracepoint/", 15) == 0 ||
+		    strncmp(shname, "xdp", 3) == 0 ||
+		    strncmp(shname, "perf_event", 10) == 0 ||
+		    strncmp(shname, "socket", 6) == 0 ||
+		    strncmp(shname, "cgroup/", 7) == 0 ||
+		    strncmp(shname, "sockops", 7) == 0 ||
+		    strncmp(shname, "sk_skb", 6) == 0 ||
+		    strncmp(shname, "sk_msg", 6) == 0) {
+			ret = load_and_attach(shname, data->d_buf,
+					      data->d_size);
+			if (ret != 0)
+				goto done;
+		}
+	}
+
+done:
+	/* section names and data are owned by @elf and not used past here */
+	elf_end(elf);
+	close(fd);
+	return ret;
+}
+
+/* Load the BPF object file @path; see do_load_bpf_file() for the result */
+int load_bpf_file(char *path)
+{
+	return do_load_bpf_file(path, NULL);
+}
+
+/* Like load_bpf_file(), with @fixup_map called for each map before creation */
+int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map)
+{
+	return do_load_bpf_file(path, fixup_map);
+}
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
new file mode 100644
index 000000000..4fcd258c6
--- /dev/null
+++ b/samples/bpf/bpf_load.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __BPF_LOAD_H
+#define __BPF_LOAD_H
+
+#include <bpf/bpf.h>
+
+#define MAX_MAPS 32
+#define MAX_PROGS 32
+
+struct bpf_load_map_def {
+	unsigned int type;
+	unsigned int key_size;
+	unsigned int value_size;
+	unsigned int max_entries;
+	unsigned int map_flags;
+	unsigned int inner_map_idx;
+	unsigned int numa_node;
+};
+
+struct bpf_map_data {
+	int fd;
+	char *name;
+	size_t elf_offset;
+	struct bpf_load_map_def def;
+};
+
+typedef void (*fixup_map_cb)(struct bpf_map_data *map, int idx);
+
+extern int prog_fd[MAX_PROGS];
+extern int event_fd[MAX_PROGS];
+extern char bpf_log_buf[BPF_LOG_BUF_SIZE];
+extern int prog_cnt;
+
+/* There is a one-to-one mapping between map_fd[] and map_data[].
+ * The map_data[] just contains more rich info on the given map.
+ */ +extern int map_fd[MAX_MAPS]; +extern struct bpf_map_data map_data[MAX_MAPS]; +extern int map_data_count; + +/* parses elf file compiled by llvm .c->.o + * . parses 'maps' section and creates maps via BPF syscall + * . parses 'license' section and passes it to syscall + * . parses elf relocations for BPF maps and adjusts BPF_LD_IMM64 insns by + * storing map_fd into insn->imm and marking such insns as BPF_PSEUDO_MAP_FD + * . loads eBPF programs via BPF syscall + * + * One ELF file can contain multiple BPF programs which will be loaded + * and their FDs stored stored in prog_fd array + * + * returns zero on success + */ +int load_bpf_file(char *path); +int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map); + +int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags); +#endif diff --git a/samples/bpf/cookie_uid_helper_example.c b/samples/bpf/cookie_uid_helper_example.c new file mode 100644 index 000000000..deb0e3e03 --- /dev/null +++ b/samples/bpf/cookie_uid_helper_example.c @@ -0,0 +1,323 @@ +/* This test is a demo of using get_socket_uid and get_socket_cookie + * helper function to do per socket based network traffic monitoring. + * It requires iptables version higher then 1.6.1. to load pinned eBPF + * program into the xt_bpf match. 
+ *
+ * TEST:
+ * ./run_cookie_uid_helper_example.sh -option
+ * option:
+ *	-t: do traffic monitoring test, the program will continuously
+ * print out the network traffic that happens after the program started. A
+ * sample output is shown below:
+ *
+ * cookie: 877, uid: 0x3e8, Packet Count: 20, Bytes Count: 11058
+ * cookie: 132, uid: 0x0, Packet Count: 2, Bytes Count: 286
+ * cookie: 812, uid: 0x3e8, Packet Count: 3, Bytes Count: 1726
+ * cookie: 802, uid: 0x3e8, Packet Count: 2, Bytes Count: 104
+ * cookie: 877, uid: 0x3e8, Packet Count: 20, Bytes Count: 11058
+ * cookie: 831, uid: 0x3e8, Packet Count: 2, Bytes Count: 104
+ * cookie: 0, uid: 0x0, Packet Count: 6, Bytes Count: 712
+ * cookie: 880, uid: 0xfffe, Packet Count: 1, Bytes Count: 70
+ *
+ *	-s: do getsockopt SO_COOKIE test, the program will set up a pair of
+ * UDP sockets, send packets between them, and read out the traffic data
+ * directly from the ebpf map based on the socket cookie.
+ *
+ * Clean up: if using the shell script, the script file will delete the
+ * iptables rule and unmount the bpf program on exit. Otherwise the iptables
+ * rule needs to be deleted by hand, see run_cookie_uid_helper_example.sh for
+ * details.
+ */ + +#define _GNU_SOURCE + +#define offsetof(type, member) __builtin_offsetof(type, member) +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) + +#include <arpa/inet.h> +#include <errno.h> +#include <error.h> +#include <limits.h> +#include <linux/bpf.h> +#include <linux/if_ether.h> +#include <net/if.h> +#include <signal.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> +#include <bpf/bpf.h> +#include "bpf_insn.h" + +#define PORT 8888 + +struct stats { + uint32_t uid; + uint64_t packets; + uint64_t bytes; +}; + +static int map_fd, prog_fd; + +static bool test_finish; + +static void maps_create(void) +{ + map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(uint32_t), + sizeof(struct stats), 100, 0); + if (map_fd < 0) + error(1, errno, "map create failed!\n"); +} + +static void prog_load(void) +{ + static char log_buf[1 << 16]; + + struct bpf_insn prog[] = { + /* + * Save sk_buff for future usage. value stored in R6 to R10 will + * not be reset after a bpf helper function call. + */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + /* + * pc1: BPF_FUNC_get_socket_cookie takes one parameter, + * R1: sk_buff + */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_socket_cookie), + /* pc2-4: save &socketCookie to r7 for future usage*/ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8), + /* + * pc5-8: set up the registers for BPF_FUNC_map_lookup_elem, + * it takes two parameters (R1: map_fd, R2: &socket_cookie) + */ + BPF_LD_MAP_FD(BPF_REG_1, map_fd), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_7), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + /* + * pc9. if r0 != 0x0, go to pc+14, since we have the cookie + * stored already + * Otherwise do pc10-22 to setup a new data entry. 
+		 */
+		BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 14),
+		BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				BPF_FUNC_get_socket_uid),
+		/*
+		 * Place a struct stats in the R10 stack and sequentially
+		 * place the member value into the memory. Packets value
+		 * is set by directly place a IMM value 1 into the stack.
+		 */
+		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0,
+			    -32 + (__s16)offsetof(struct stats, uid)),
+		BPF_ST_MEM(BPF_DW, BPF_REG_10,
+			   -32 + (__s16)offsetof(struct stats, packets), 1),
+		/*
+		 * __sk_buff is a special struct used for eBPF program to
+		 * directly access some sk_buff field.
+		 */
+		BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6,
+				offsetof(struct __sk_buff, len)),
+		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1,
+			    -32 + (__s16)offsetof(struct stats, bytes)),
+		/*
+		 * add new map entry using BPF_FUNC_map_update_elem, it takes
+		 * 4 parameters (R1: map_fd, R2: &socket_cookie, R3: &stats,
+		 * R4: flags)
+		 */
+		BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+		BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+		BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -32),
+		BPF_MOV64_IMM(BPF_REG_4, 0),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				BPF_FUNC_map_update_elem),
+		BPF_JMP_IMM(BPF_JA, 0, 0, 5),
+		/*
+		 * pc24-30 update the packet info to a exist data entry, it can
+		 * be done by directly write to pointers instead of using
+		 * BPF_FUNC_map_update_elem helper function
+		 */
+		BPF_MOV64_REG(BPF_REG_9, BPF_REG_0),
+		BPF_MOV64_IMM(BPF_REG_1, 1),
+		BPF_STX_XADD(BPF_DW, BPF_REG_9, BPF_REG_1,
+				offsetof(struct stats, packets)),
+		BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6,
+				offsetof(struct __sk_buff, len)),
+		BPF_STX_XADD(BPF_DW, BPF_REG_9, BPF_REG_1,
+				offsetof(struct stats, bytes)),
+		BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6,
+				offsetof(struct __sk_buff, len)),
+		BPF_EXIT_INSN(),
+	};
+	prog_fd = bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, prog,
+					ARRAY_SIZE(prog), "GPL", 0,
+					log_buf, sizeof(log_buf));
+	if (prog_fd < 0)
+		error(1, errno, "failed to load prog\n%s\n", log_buf);
+}
+
+/*
+ * Pin the loaded program at @file and add an iptables OUTPUT rule that
+ * matches on the pinned object. Exits the process on any failure.
+ *
+ * NOTE(review): @file is interpolated into a shell command line run by
+ * system(); it must not contain shell metacharacters.
+ */
+static void prog_attach_iptables(char *file)
+{
+	int ret, len;
+	/* 42 + 50 + 10 characters of rule text plus NUL fit with room to spare */
+	char rules[128];
+
+	/* validate the path before pinning so a rejected path leaves no pin */
+	if (strlen(file) > 50) {
+		printf("file path too long: %s\n", file);
+		exit(1);
+	}
+	if (bpf_obj_pin(prog_fd, file))
+		error(1, errno, "bpf_obj_pin");
+	len = snprintf(rules, sizeof(rules),
+		       "iptables -A OUTPUT -m bpf --object-pinned %s -j ACCEPT",
+		       file);
+	if (len < 0 || (size_t)len >= sizeof(rules)) {
+		printf("file path too long: %s\n", file);
+		exit(1);
+	}
+	ret = system(rules);
+	/* system() returns 0 only when the command ran and exited with 0 */
+	if (ret != 0) {
+		printf("iptables rule update failed: %d\n",
+		       ret == -1 ? ret : WEXITSTATUS(ret));
+		exit(1);
+	}
+}
+
+/* Walk every key of the stats map and print one line per entry */
+static void print_table(void)
+{
+	struct stats curEntry;
+	uint32_t curN = UINT32_MAX;
+	uint32_t nextN;
+	int res;
+
+	while (bpf_map_get_next_key(map_fd, &curN, &nextN) > -1) {
+		curN = nextN;
+		res = bpf_map_lookup_elem(map_fd, &curN, &curEntry);
+		if (res < 0) {
+			error(1, errno, "fail to get entry value of Key: %u\n",
+				curN);
+		} else {
+			printf("cookie: %u, uid: 0x%x, Packet Count: %lu,"
+				" Bytes Count: %lu\n", curN, curEntry.uid,
+				curEntry.packets, curEntry.bytes);
+		}
+	}
+}
+
+static void udp_client(void)
+{
+	struct sockaddr_in si_other = {0};
+	struct sockaddr_in si_me = {0};
+	struct stats dataEntry;
+	int s_rcv, s_send, i, recv_len;
+	char message = 'a';
+	char buf;
+	uint64_t cookie;
+	int res;
+	socklen_t cookie_len = sizeof(cookie);
+	socklen_t slen = sizeof(si_other);
+
+	s_rcv = socket(PF_INET, SOCK_DGRAM, 0);
+	if (s_rcv < 0)
+		error(1, errno, "rcv socket creat failed!\n");
+	si_other.sin_family = AF_INET;
+	si_other.sin_port = htons(PORT);
+	if (inet_aton("127.0.0.1", &si_other.sin_addr) == 0)
+		error(1, errno, "inet_aton\n");
+	if (bind(s_rcv, (struct sockaddr *)&si_other, sizeof(si_other)) == -1)
+		error(1, errno, "bind\n");
+	s_send = socket(PF_INET, SOCK_DGRAM, 0);
+	if (s_send < 0)
+		error(1, errno, "send socket creat failed!\n");
+	res = getsockopt(s_send, SOL_SOCKET, SO_COOKIE, &cookie, &cookie_len);
+	if (res < 0)
+		printf("get cookie failed: %s\n", strerror(errno));
+	res = bpf_map_lookup_elem(map_fd, &cookie, &dataEntry);
+	if (res != -1)
+		error(1,
errno, "socket stat found while flow not active\n"); + for (i = 0; i < 10; i++) { + res = sendto(s_send, &message, sizeof(message), 0, + (struct sockaddr *)&si_other, slen); + if (res == -1) + error(1, errno, "send\n"); + if (res != sizeof(message)) + error(1, 0, "%uB != %luB\n", res, sizeof(message)); + recv_len = recvfrom(s_rcv, &buf, sizeof(buf), 0, + (struct sockaddr *)&si_me, &slen); + if (recv_len < 0) + error(1, errno, "receive\n"); + res = memcmp(&(si_other.sin_addr), &(si_me.sin_addr), + sizeof(si_me.sin_addr)); + if (res != 0) + error(1, EFAULT, "sender addr error: %d\n", res); + printf("Message received: %c\n", buf); + res = bpf_map_lookup_elem(map_fd, &cookie, &dataEntry); + if (res < 0) + error(1, errno, "lookup sk stat failed, cookie: %lu\n", + cookie); + printf("cookie: %lu, uid: 0x%x, Packet Count: %lu," + " Bytes Count: %lu\n\n", cookie, dataEntry.uid, + dataEntry.packets, dataEntry.bytes); + } + close(s_send); + close(s_rcv); +} + +static int usage(void) +{ + printf("Usage: ./run_cookie_uid_helper_example.sh" + " bpfObjName -option\n" + " -t traffic monitor test\n" + " -s getsockopt cookie test\n"); + return 1; +} + +static void finish(int ret) +{ + test_finish = true; +} + +int main(int argc, char *argv[]) +{ + int opt; + bool cfg_test_traffic = false; + bool cfg_test_cookie = false; + + if (argc != 3) + return usage(); + while ((opt = getopt(argc, argv, "ts")) != -1) { + switch (opt) { + case 't': + cfg_test_traffic = true; + break; + case 's': + cfg_test_cookie = true; + break; + + default: + printf("unknown option %c\n", opt); + usage(); + return -1; + } + } + maps_create(); + prog_load(); + prog_attach_iptables(argv[2]); + if (cfg_test_traffic) { + if (signal(SIGINT, finish) == SIG_ERR) + error(1, errno, "register SIGINT handler failed"); + if (signal(SIGTERM, finish) == SIG_ERR) + error(1, errno, "register SIGTERM handler failed"); + while (!test_finish) { + print_table(); + printf("\n"); + sleep(1); + }; + } else if (cfg_test_cookie) { + 
udp_client(); + } + close(prog_fd); + close(map_fd); + return 0; +} diff --git a/samples/bpf/cpustat_kern.c b/samples/bpf/cpustat_kern.c new file mode 100644 index 000000000..5aefd19cd --- /dev/null +++ b/samples/bpf/cpustat_kern.c @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/version.h> +#include <linux/ptrace.h> +#include <uapi/linux/bpf.h> +#include <bpf/bpf_helpers.h> + +/* + * The CPU number, cstate number and pstate number are based + * on 96boards Hikey with octa CA53 CPUs. + * + * Every CPU have three idle states for cstate: + * WFI, CPU_OFF, CLUSTER_OFF + * + * Every CPU have 5 operating points: + * 208MHz, 432MHz, 729MHz, 960MHz, 1200MHz + * + * This code is based on these assumption and other platforms + * need to adjust these definitions. + */ +#define MAX_CPU 8 +#define MAX_PSTATE_ENTRIES 5 +#define MAX_CSTATE_ENTRIES 3 + +static int cpu_opps[] = { 208000, 432000, 729000, 960000, 1200000 }; + +/* + * my_map structure is used to record cstate and pstate index and + * timestamp (Idx, Ts), when new event incoming we need to update + * combination for new state index and timestamp (Idx`, Ts`). + * + * Based on (Idx, Ts) and (Idx`, Ts`) we can calculate the time + * interval for the previous state: Duration(Idx) = Ts` - Ts. 
+ * + * Every CPU has one below array for recording state index and + * timestamp, and record for cstate and pstate separately: + * + * +--------------------------+ + * | cstate timestamp | + * +--------------------------+ + * | cstate index | + * +--------------------------+ + * | pstate timestamp | + * +--------------------------+ + * | pstate index | + * +--------------------------+ + */ +#define MAP_OFF_CSTATE_TIME 0 +#define MAP_OFF_CSTATE_IDX 1 +#define MAP_OFF_PSTATE_TIME 2 +#define MAP_OFF_PSTATE_IDX 3 +#define MAP_OFF_NUM 4 + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, MAX_CPU * MAP_OFF_NUM); +} my_map SEC(".maps"); + +/* cstate_duration records duration time for every idle state per CPU */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, MAX_CPU * MAX_CSTATE_ENTRIES); +} cstate_duration SEC(".maps"); + +/* pstate_duration records duration time for every operating point per CPU */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, MAX_CPU * MAX_PSTATE_ENTRIES); +} pstate_duration SEC(".maps"); + +/* + * The trace events for cpu_idle and cpu_frequency are taken from: + * /sys/kernel/debug/tracing/events/power/cpu_idle/format + * /sys/kernel/debug/tracing/events/power/cpu_frequency/format + * + * These two events have same format, so define one common structure.
+ */ +struct cpu_args { + u64 pad; + u32 state; + u32 cpu_id; +}; + +/* calculate pstate index, returns MAX_PSTATE_ENTRIES for failure */ +static u32 find_cpu_pstate_idx(u32 frequency) +{ + u32 i; + + for (i = 0; i < sizeof(cpu_opps) / sizeof(u32); i++) { + if (frequency == cpu_opps[i]) + return i; + } + + return i; +} + +SEC("tracepoint/power/cpu_idle") +int bpf_prog1(struct cpu_args *ctx) +{ + u64 *cts, *pts, *cstate, *pstate, prev_state, cur_ts, delta; + u32 key, cpu, pstate_idx; + u64 *val; + + /* The maps are sized for CPU ids 0..MAX_CPU-1; ignore anything beyond. */ + if (ctx->cpu_id >= MAX_CPU) + return 0; + + cpu = ctx->cpu_id; + + key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_TIME; + cts = bpf_map_lookup_elem(&my_map, &key); + if (!cts) + return 0; + + key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX; + cstate = bpf_map_lookup_elem(&my_map, &key); + if (!cstate) + return 0; + + key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME; + pts = bpf_map_lookup_elem(&my_map, &key); + if (!pts) + return 0; + + key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX; + pstate = bpf_map_lookup_elem(&my_map, &key); + if (!pstate) + return 0; + + prev_state = *cstate; + *cstate = ctx->state; + + if (!*cts) { + *cts = bpf_ktime_get_ns(); + return 0; + } + + cur_ts = bpf_ktime_get_ns(); + delta = cur_ts - *cts; + *cts = cur_ts; + + /* + * When state doesn't equal to (u32)-1, the cpu will enter + * one idle state; for this case we need to record interval + * for the pstate.
+ * + * OPP2 + * +---------------------+ + * OPP1 | | + * ---------+ | + * | Idle state + * +--------------- + * + * |<- pstate duration ->| + * ^ ^ + * pts cur_ts + */ + if (ctx->state != (u32)-1) { + + /* record pstate after have first cpu_frequency event */ + if (!*pts) + return 0; + + delta = cur_ts - *pts; + + pstate_idx = find_cpu_pstate_idx(*pstate); + if (pstate_idx >= MAX_PSTATE_ENTRIES) + return 0; + + key = cpu * MAX_PSTATE_ENTRIES + pstate_idx; + val = bpf_map_lookup_elem(&pstate_duration, &key); + if (val) + __sync_fetch_and_add((long *)val, delta); + + /* + * When state equal to (u32)-1, the cpu just exits from one + * specific idle state; for this case we need to record + * interval for the pstate. + * + * OPP2 + * -----------+ + * | OPP1 + * | +----------- + * | Idle state | + * +---------------------+ + * + * |<- cstate duration ->| + * ^ ^ + * cts cur_ts + */ + } else { + + key = cpu * MAX_CSTATE_ENTRIES + prev_state; + val = bpf_map_lookup_elem(&cstate_duration, &key); + if (val) + __sync_fetch_and_add((long *)val, delta); + } + + /* Update timestamp for pstate as new start time */ + if (*pts) + *pts = cur_ts; + + return 0; +} + +SEC("tracepoint/power/cpu_frequency") +int bpf_prog2(struct cpu_args *ctx) +{ + u64 *pts, *cstate, *pstate, prev_state, cur_ts, delta; + u32 key, cpu, pstate_idx; + u64 *val; + + cpu = ctx->cpu_id; + + key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME; + pts = bpf_map_lookup_elem(&my_map, &key); + if (!pts) + return 0; + + key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX; + pstate = bpf_map_lookup_elem(&my_map, &key); + if (!pstate) + return 0; + + key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX; + cstate = bpf_map_lookup_elem(&my_map, &key); + if (!cstate) + return 0; + + prev_state = *pstate; + *pstate = ctx->state; + + if (!*pts) { + *pts = bpf_ktime_get_ns(); + return 0; + } + + cur_ts = bpf_ktime_get_ns(); + delta = cur_ts - *pts; + *pts = cur_ts; + + /* When CPU is in idle, bail out to skip pstate statistics */ + if (*cstate 
!= (u32)(-1)) + return 0; + + /* + * The cpu changes to another different OPP (in below diagram + * change frequency from OPP3 to OPP1), need recording interval + * for previous frequency OPP3 and update timestamp as start + * time for new frequency OPP1. + * + * OPP3 + * +---------------------+ + * OPP2 | | + * ---------+ | + * | OPP1 + * +--------------- + * + * |<- pstate duration ->| + * ^ ^ + * pts cur_ts + */ + /* *pstate already holds the new frequency; charge delta to the previous one. */ + pstate_idx = find_cpu_pstate_idx(prev_state); + if (pstate_idx >= MAX_PSTATE_ENTRIES) + return 0; + + key = cpu * MAX_PSTATE_ENTRIES + pstate_idx; + val = bpf_map_lookup_elem(&pstate_duration, &key); + if (val) + __sync_fetch_and_add((long *)val, delta); + + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/cpustat_user.c b/samples/bpf/cpustat_user.c new file mode 100644 index 000000000..96675985e --- /dev/null +++ b/samples/bpf/cpustat_user.c @@ -0,0 +1,252 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <sched.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <locale.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/resource.h> +#include <sys/wait.h> + +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +static int cstate_map_fd, pstate_map_fd; + +#define MAX_CPU 8 +#define MAX_PSTATE_ENTRIES 5 +#define MAX_CSTATE_ENTRIES 3 +#define MAX_STARS 40 + +#define CPUFREQ_MAX_SYSFS_PATH "/sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq" +#define CPUFREQ_LOWEST_FREQ "208000" +/* 1.2GHz expressed in kHz, matching the highest entry of the kernel-side OPP table */ +#define CPUFREQ_HIGHEST_FREQ "1200000" + +struct cpu_stat_data { + unsigned long cstate[MAX_CSTATE_ENTRIES]; + unsigned long pstate[MAX_PSTATE_ENTRIES]; +}; + +static struct cpu_stat_data stat_data[MAX_CPU]; + +static void cpu_stat_print(void) +{ + int i, j; + char state_str[sizeof("cstate-9")]; + struct cpu_stat_data *data; + + /* Clear
screen */ + printf("\033[2J"); + + /* Header */ + printf("\nCPU states statistics:\n"); + printf("%-10s ", "state(ms)"); + + for (i = 0; i < MAX_CSTATE_ENTRIES; i++) { + sprintf(state_str, "cstate-%d", i); + printf("%-11s ", state_str); + } + + for (i = 0; i < MAX_PSTATE_ENTRIES; i++) { + sprintf(state_str, "pstate-%d", i); + printf("%-11s ", state_str); + } + + printf("\n"); + + for (j = 0; j < MAX_CPU; j++) { + data = &stat_data[j]; + + printf("CPU-%-6d ", j); + for (i = 0; i < MAX_CSTATE_ENTRIES; i++) + printf("%-11ld ", data->cstate[i] / 1000000); + + for (i = 0; i < MAX_PSTATE_ENTRIES; i++) + printf("%-11ld ", data->pstate[i] / 1000000); + + printf("\n"); + } +} + +static void cpu_stat_update(int cstate_fd, int pstate_fd) +{ + unsigned long key, value; + int c, i; + + for (c = 0; c < MAX_CPU; c++) { + for (i = 0; i < MAX_CSTATE_ENTRIES; i++) { + key = c * MAX_CSTATE_ENTRIES + i; + bpf_map_lookup_elem(cstate_fd, &key, &value); + stat_data[c].cstate[i] = value; + } + + for (i = 0; i < MAX_PSTATE_ENTRIES; i++) { + key = c * MAX_PSTATE_ENTRIES + i; + bpf_map_lookup_elem(pstate_fd, &key, &value); + stat_data[c].pstate[i] = value; + } + } +} + +/* + * This function is copied from 'idlestat' tool function + * idlestat_wake_all() in idlestate.c. + * + * It sets the self running task affinity to cpus one by one so can wake up + * the specific CPU to handle scheduling; this results in all cpus can be + * waken up once and produce ftrace event 'trace_cpu_idle'. 
+ */ +static int cpu_stat_inject_cpu_idle_event(void) +{ + int rcpu, i, ret; + cpu_set_t cpumask; + cpu_set_t original_cpumask; + + ret = sysconf(_SC_NPROCESSORS_CONF); + if (ret < 0) + return -1; + + rcpu = sched_getcpu(); + if (rcpu < 0) + return -1; + + /* Keep track of the CPUs we will run on */ + sched_getaffinity(0, sizeof(original_cpumask), &original_cpumask); + + for (i = 0; i < ret; i++) { + + /* Pointless to wake up ourself */ + if (i == rcpu) + continue; + + /* Pointless to wake CPUs we will not run on */ + if (!CPU_ISSET(i, &original_cpumask)) + continue; + + CPU_ZERO(&cpumask); + CPU_SET(i, &cpumask); + + sched_setaffinity(0, sizeof(cpumask), &cpumask); + } + + /* Enable all the CPUs of the original mask */ + sched_setaffinity(0, sizeof(original_cpumask), &original_cpumask); + return 0; +} + +/* + * It's possible to have no any frequency change for long time and cannot + * get ftrace event 'trace_cpu_frequency' for long period, this introduces + * big deviation for pstate statistics. + * + * To solve this issue, below code forces to set 'scaling_max_freq' to 208MHz + * for triggering ftrace event 'trace_cpu_frequency' and then recovery back to + * the maximum frequency value 1.2GHz. 
+ */ +static int cpu_stat_inject_cpu_frequency_event(void) +{ + int len, fd; + + fd = open(CPUFREQ_MAX_SYSFS_PATH, O_WRONLY); + if (fd < 0) { + printf("failed to open scaling_max_freq, errno=%d\n", errno); + return fd; + } + + len = write(fd, CPUFREQ_LOWEST_FREQ, strlen(CPUFREQ_LOWEST_FREQ)); + if (len < 0) { + printf("failed to open scaling_max_freq, errno=%d\n", errno); + goto err; + } + + len = write(fd, CPUFREQ_HIGHEST_FREQ, strlen(CPUFREQ_HIGHEST_FREQ)); + if (len < 0) { + printf("failed to open scaling_max_freq, errno=%d\n", errno); + goto err; + } + +err: + close(fd); + return len; +} + +static void int_exit(int sig) +{ + cpu_stat_inject_cpu_idle_event(); + cpu_stat_inject_cpu_frequency_event(); + cpu_stat_update(cstate_map_fd, pstate_map_fd); + cpu_stat_print(); + exit(0); +} + +int main(int argc, char **argv) +{ + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct bpf_object *obj; + char filename[256]; + int ret; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (!prog) { + printf("finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + cstate_map_fd = bpf_object__find_map_fd_by_name(obj, "cstate_duration"); + pstate_map_fd = bpf_object__find_map_fd_by_name(obj, "pstate_duration"); + if (cstate_map_fd < 0 || pstate_map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; + } + + ret = cpu_stat_inject_cpu_idle_event(); + if (ret < 0) + return 1; + + ret = 
cpu_stat_inject_cpu_frequency_event(); + if (ret < 0) + return 1; + + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + while (1) { + cpu_stat_update(cstate_map_fd, pstate_map_fd); + cpu_stat_print(); + sleep(5); + } + +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); + return 0; +} diff --git a/samples/bpf/do_hbm_test.sh b/samples/bpf/do_hbm_test.sh new file mode 100755 index 000000000..ffe4c0607 --- /dev/null +++ b/samples/bpf/do_hbm_test.sh @@ -0,0 +1,442 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (c) 2019 Facebook +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of version 2 of the GNU General Public +# License as published by the Free Software Foundation. + +Usage() { + echo "Script for testing HBM (Host Bandwidth Manager) framework." + echo "It creates a cgroup to use for testing and load a BPF program to limit" + echo "egress or ingress bandwidht. It then uses iperf3 or netperf to create" + echo "loads. The output is the goodput in Mbps (unless -D was used)." + echo "" + echo "USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>]" + echo " [-D] [-d=<delay>|--delay=<delay>] [--debug] [-E] [--edt]" + echo " [-f=<#flows>|--flows=<#flows>] [-h] [-i=<id>|--id=<id >]" + echo " [-l] [-N] [--no_cn] [-p=<port>|--port=<port>] [-P]" + echo " [-q=<qdisc>] [-R] [-s=<server>|--server=<server]" + echo " [-S|--stats] -t=<time>|--time=<time>] [-w] [cubic|dctcp]" + echo " Where:" + echo " out egress (default)" + echo " -b or --bpf BPF program filename to load and attach." + echo " Default is hbm_out_kern.o for egress," + echo " -c or -cc TCP congestion control (cubic or dctcp)" + echo " --debug print BPF trace buffer" + echo " -d or --delay add a delay in ms using netem" + echo " -D In addition to the goodput in Mbps, it also outputs" + echo " other detailed information. This information is" + echo " test dependent (i.e. iperf3 or netperf)." 
+ echo " -E enable ECN (not required for dctcp)" + echo " --edt use fq's Earliest Departure Time (requires fq)" + echo " -f or --flows number of concurrent flows (default=1)" + echo " -i or --id cgroup id (an integer, default is 1)" + echo " -N use netperf instead of iperf3" + echo " --no_cn Do not return CN notifications" + echo " -l do not limit flows using loopback" + echo " -h Help" + echo " -p or --port iperf3 port (default is 5201)" + echo " -P use an iperf3 instance for each flow" + echo " -q use the specified qdisc" + echo " -r or --rate rate in Mbps (default 1s 1Gbps)" + echo " -R Use TCP_RR for netperf. 1st flow has req" + echo " size of 10KB, rest of 1MB. Reply in all" + echo " cases is 1 byte." + echo " More detailed output for each flow can be found" + echo " in the files netperf.<cg>.<flow>, where <cg> is the" + echo " cgroup id as specified with the -i flag, and <flow>" + echo " is the flow id starting at 1 and increasing by 1 for" + echo " flow (as specified by -f)." + echo " -s or --server hostname of netperf server. Used to create netperf" + echo " test traffic between to hosts (default is within host)" + echo " netserver must be running on the host." + echo " -S or --stats whether to update hbm stats (default is yes)." + echo " -t or --time duration of iperf3 in seconds (default=5)" + echo " -w Work conserving flag. cgroup can increase its" + echo " bandwidth beyond the rate limit specified" + echo " while there is available bandwidth. Current" + echo " implementation assumes there is only one NIC" + echo " (eth0), but can be extended to support multiple" + echo " NICs." 
+ echo " cubic or dctcp specify which TCP CC to use" + echo " " + exit +} + +#set -x + +debug_flag=0 +args="$@" +name="$0" +netem=0 +cc=x +dir="-o" +dir_name="out" +dur=5 +flows=1 +id=1 +prog="" +port=5201 +rate=1000 +multi_iperf=0 +flow_cnt=1 +use_netperf=0 +rr=0 +ecn=0 +details=0 +server="" +qdisc="" +flags="" +do_stats=0 + +function start_hbm () { + rm -f hbm.out + echo "./hbm $dir -n $id -r $rate -t $dur $flags $dbg $prog" > hbm.out + echo " " >> hbm.out + ./hbm $dir -n $id -r $rate -t $dur $flags $dbg $prog >> hbm.out 2>&1 & + echo $! +} + +processArgs () { + for i in $args ; do + case $i in + # Support for upcomming ingress rate limiting + #in) # support for upcoming ingress rate limiting + # dir="-i" + # dir_name="in" + # ;; + out) + dir="-o" + dir_name="out" + ;; + -b=*|--bpf=*) + prog="${i#*=}" + ;; + -c=*|--cc=*) + cc="${i#*=}" + ;; + --no_cn) + flags="$flags --no_cn" + ;; + --debug) + flags="$flags -d" + debug_flag=1 + ;; + -d=*|--delay=*) + netem="${i#*=}" + ;; + -D) + details=1 + ;; + -E) + ecn=1 + ;; + --edt) + flags="$flags --edt" + qdisc="fq" + ;; + -f=*|--flows=*) + flows="${i#*=}" + ;; + -i=*|--id=*) + id="${i#*=}" + ;; + -l) + flags="$flags -l" + ;; + -N) + use_netperf=1 + ;; + -p=*|--port=*) + port="${i#*=}" + ;; + -P) + multi_iperf=1 + ;; + -q=*) + qdisc="${i#*=}" + ;; + -r=*|--rate=*) + rate="${i#*=}" + ;; + -R) + rr=1 + ;; + -s=*|--server=*) + server="${i#*=}" + ;; + -S|--stats) + flags="$flags -s" + do_stats=1 + ;; + -t=*|--time=*) + dur="${i#*=}" + ;; + -w) + flags="$flags -w" + ;; + cubic) + cc=cubic + ;; + dctcp) + cc=dctcp + ;; + *) + echo "Unknown arg:$i" + Usage + ;; + esac + done +} + +processArgs + +if [ $debug_flag -eq 1 ] ; then + rm -f hbm_out.log +fi + +hbm_pid=$(start_hbm) +usleep 100000 + +host=`hostname` +cg_base_dir=/sys/fs/cgroup +cg_dir="$cg_base_dir/cgroup-test-work-dir/hbm$id" + +echo $$ >> $cg_dir/cgroup.procs + +ulimit -l unlimited + +rm -f ss.out +rm -f hbm.[0-9]*.$dir_name +if [ $ecn -ne 0 ] ; then + sysctl -w -q -n 
net.ipv4.tcp_ecn=1 +fi + +if [ $use_netperf -eq 0 ] ; then + cur_cc=`sysctl -n net.ipv4.tcp_congestion_control` + if [ "$cc" != "x" ] ; then + sysctl -w -q -n net.ipv4.tcp_congestion_control=$cc + fi +fi + +if [ "$netem" -ne "0" ] ; then + if [ "$qdisc" != "" ] ; then + echo "WARNING: Ignoring -q options because -d option used" + fi + tc qdisc del dev lo root > /dev/null 2>&1 + tc qdisc add dev lo root netem delay $netem\ms > /dev/null 2>&1 +elif [ "$qdisc" != "" ] ; then + tc qdisc del dev eth0 root > /dev/null 2>&1 + tc qdisc add dev eth0 root $qdisc > /dev/null 2>&1 +fi + +n=0 +m=$[$dur * 5] +hn="::1" +if [ $use_netperf -ne 0 ] ; then + if [ "$server" != "" ] ; then + hn=$server + fi +fi + +( ping6 -i 0.2 -c $m $hn > ping.out 2>&1 ) & + +if [ $use_netperf -ne 0 ] ; then + begNetserverPid=`ps ax | grep netserver | grep --invert-match "grep" | \ + awk '{ print $1 }'` + if [ "$begNetserverPid" == "" ] ; then + if [ "$server" == "" ] ; then + ( ./netserver > /dev/null 2>&1) & + usleep 100000 + fi + fi + flow_cnt=1 + if [ "$server" == "" ] ; then + np_server=$host + else + np_server=$server + fi + if [ "$cc" == "x" ] ; then + np_cc="" + else + np_cc="-K $cc,$cc" + fi + replySize=1 + while [ $flow_cnt -le $flows ] ; do + if [ $rr -ne 0 ] ; then + reqSize=1M + if [ $flow_cnt -eq 1 ] ; then + reqSize=10K + fi + if [ "$dir" == "-i" ] ; then + replySize=$reqSize + reqSize=1 + fi + ( ./netperf -H $np_server -l $dur -f m -j -t TCP_RR -- -r $reqSize,$replySize $np_cc -k P50_lATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,REMOTE_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,LOCAL_RECV_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) & + else + if [ "$dir" == "-i" ] ; then + ( ./netperf -H $np_server -l $dur -f m -j -t TCP_RR -- -r 1,10M $np_cc -k P50_LATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,REMOTE_TRANSPORT_RETRANS,REMOTE_SEND_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) & + else + ( ./netperf -H $np_server -l $dur -f m -j -t 
TCP_STREAM -- $np_cc -k P50_lATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) & + fi + fi + flow_cnt=$[flow_cnt+1] + done + +# sleep for duration of test (plus some buffer) + n=$[dur+2] + sleep $n + +# force graceful termination of netperf + pids=`pgrep netperf` + for p in $pids ; do + kill -SIGALRM $p + done + + flow_cnt=1 + rate=0 + if [ $details -ne 0 ] ; then + echo "" + echo "Details for HBM in cgroup $id" + if [ $do_stats -eq 1 ] ; then + if [ -e hbm.$id.$dir_name ] ; then + cat hbm.$id.$dir_name + fi + fi + fi + while [ $flow_cnt -le $flows ] ; do + if [ "$dir" == "-i" ] ; then + r=`cat netperf.$id.$flow_cnt | grep -o "REMOTE_SEND_THROUGHPUT=[0-9]*" | grep -o "[0-9]*"` + else + r=`cat netperf.$id.$flow_cnt | grep -o "LOCAL_SEND_THROUGHPUT=[0-9]*" | grep -o "[0-9]*"` + fi + echo "rate for flow $flow_cnt: $r" + rate=$[rate+r] + if [ $details -ne 0 ] ; then + echo "-----" + echo "Details for cgroup $id, flow $flow_cnt" + cat netperf.$id.$flow_cnt + fi + flow_cnt=$[flow_cnt+1] + done + if [ $details -ne 0 ] ; then + echo "" + delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"` + echo "PING AVG DELAY:$delay" + echo "AGGREGATE_GOODPUT:$rate" + else + echo $rate + fi +elif [ $multi_iperf -eq 0 ] ; then + (iperf3 -s -p $port -1 > /dev/null 2>&1) & + usleep 100000 + iperf3 -c $host -p $port -i 0 -P $flows -f m -t $dur > iperf.$id + rates=`grep receiver iperf.$id | grep -o "[0-9.]* Mbits" | grep -o "^[0-9]*"` + rate=`echo $rates | grep -o "[0-9]*$"` + + if [ $details -ne 0 ] ; then + echo "" + echo "Details for HBM in cgroup $id" + if [ $do_stats -eq 1 ] ; then + if [ -e hbm.$id.$dir_name ] ; then + cat hbm.$id.$dir_name + fi + fi + delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"` + echo "PING AVG DELAY:$delay" + echo "AGGREGATE_GOODPUT:$rate" + else + echo $rate + fi +else + flow_cnt=1 + while [ $flow_cnt -le $flows ] ; do + (iperf3 -s -p 
$port -1 > /dev/null 2>&1) & + ( iperf3 -c $host -p $port -i 0 -P 1 -f m -t $dur | grep receiver | grep -o "[0-9.]* Mbits" | grep -o "^[0-9]*" | grep -o "[0-9]*$" > iperf3.$id.$flow_cnt ) & + port=$[port+1] + flow_cnt=$[flow_cnt+1] + done + n=$[dur+1] + sleep $n + flow_cnt=1 + rate=0 + if [ $details -ne 0 ] ; then + echo "" + echo "Details for HBM in cgroup $id" + if [ $do_stats -eq 1 ] ; then + if [ -e hbm.$id.$dir_name ] ; then + cat hbm.$id.$dir_name + fi + fi + fi + + while [ $flow_cnt -le $flows ] ; do + r=`cat iperf3.$id.$flow_cnt` +# echo "rate for flow $flow_cnt: $r" + if [ $details -ne 0 ] ; then + echo "Rate for cgroup $id, flow $flow_cnt LOCAL_SEND_THROUGHPUT=$r" + fi + rate=$[rate+r] + flow_cnt=$[flow_cnt+1] + done + if [ $details -ne 0 ] ; then + delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"` + echo "PING AVG DELAY:$delay" + echo "AGGREGATE_GOODPUT:$rate" + else + echo $rate + fi +fi + +if [ $use_netperf -eq 0 ] ; then + sysctl -w -q -n net.ipv4.tcp_congestion_control=$cur_cc +fi +if [ $ecn -ne 0 ] ; then + sysctl -w -q -n net.ipv4.tcp_ecn=0 +fi +if [ "$netem" -ne "0" ] ; then + tc qdisc del dev lo root > /dev/null 2>&1 +fi +if [ "$qdisc" != "" ] ; then + tc qdisc del dev eth0 root > /dev/null 2>&1 +fi +sleep 2 + +hbmPid=`ps ax | grep "hbm " | grep --invert-match "grep" | awk '{ print $1 }'` +if [ "$hbmPid" == "$hbm_pid" ] ; then + kill $hbm_pid +fi + +sleep 1 + +# Detach any BPF programs that may have lingered +ttx=`bpftool cgroup tree | grep hbm` +v=2 +for x in $ttx ; do + if [ "${x:0:36}" == "/sys/fs/cgroup/cgroup-test-work-dir/" ] ; then + cg=$x ; v=0 + else + if [ $v -eq 0 ] ; then + id=$x ; v=1 + else + if [ $v -eq 1 ] ; then + type=$x ; bpftool cgroup detach $cg $type id $id + v=0 + fi + fi + fi +done + +if [ $use_netperf -ne 0 ] ; then + if [ "$server" == "" ] ; then + if [ "$begNetserverPid" == "" ] ; then + netserverPid=`ps ax | grep netserver | grep --invert-match "grep" | awk '{ print $1 }'` + if [ 
"$netserverPid" != "" ] ; then + kill $netserverPid + fi + fi + fi +fi +exit diff --git a/samples/bpf/fds_example.c b/samples/bpf/fds_example.c new file mode 100644 index 000000000..59f45fef5 --- /dev/null +++ b/samples/bpf/fds_example.c @@ -0,0 +1,193 @@ +#include <linux/unistd.h> +#include <linux/bpf.h> + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <unistd.h> +#include <string.h> +#include <assert.h> +#include <errno.h> + +#include <sys/types.h> +#include <sys/socket.h> + +#include <bpf/bpf.h> + +#include <bpf/libbpf.h> +#include "bpf_insn.h" +#include "sock_example.h" + +#define BPF_F_PIN (1 << 0) +#define BPF_F_GET (1 << 1) +#define BPF_F_PIN_GET (BPF_F_PIN | BPF_F_GET) + +#define BPF_F_KEY (1 << 2) +#define BPF_F_VAL (1 << 3) +#define BPF_F_KEY_VAL (BPF_F_KEY | BPF_F_VAL) + +#define BPF_M_UNSPEC 0 +#define BPF_M_MAP 1 +#define BPF_M_PROG 2 + +char bpf_log_buf[BPF_LOG_BUF_SIZE]; + +static void usage(void) +{ + printf("Usage: fds_example [...]\n"); + printf(" -F <file> File to pin/get object\n"); + printf(" -P |- pin object\n"); + printf(" -G `- get object\n"); + printf(" -m eBPF map mode\n"); + printf(" -k <key> |- map key\n"); + printf(" -v <value> `- map value\n"); + printf(" -p eBPF prog mode\n"); + printf(" -o <object> `- object file\n"); + printf(" -h Display this help.\n"); +} + +static int bpf_map_create(void) +{ + return bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(uint32_t), + sizeof(uint32_t), 1024, 0); +} + +static int bpf_prog_create(const char *object) +{ + static struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }; + size_t insns_cnt = sizeof(insns) / sizeof(struct bpf_insn); + struct bpf_object *obj; + int prog_fd; + + if (object) { + assert(!bpf_prog_load(object, BPF_PROG_TYPE_UNSPEC, + &obj, &prog_fd)); + return prog_fd; + } else { + return bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, + insns, insns_cnt, "GPL", 0, + bpf_log_buf, BPF_LOG_BUF_SIZE); + } +} + +static int bpf_do_map(const char 
*file, uint32_t flags, uint32_t key, + uint32_t value) +{ + int fd, ret; + + if (flags & BPF_F_PIN) { + fd = bpf_map_create(); + printf("bpf: map fd:%d (%s)\n", fd, strerror(errno)); + assert(fd > 0); + + ret = bpf_obj_pin(fd, file); + printf("bpf: pin ret:(%d,%s)\n", ret, strerror(errno)); + assert(ret == 0); + } else { + fd = bpf_obj_get(file); + printf("bpf: get fd:%d (%s)\n", fd, strerror(errno)); + assert(fd > 0); + } + + if ((flags & BPF_F_KEY_VAL) == BPF_F_KEY_VAL) { + ret = bpf_map_update_elem(fd, &key, &value, 0); + printf("bpf: fd:%d u->(%u:%u) ret:(%d,%s)\n", fd, key, value, + ret, strerror(errno)); + assert(ret == 0); + } else if (flags & BPF_F_KEY) { + ret = bpf_map_lookup_elem(fd, &key, &value); + printf("bpf: fd:%d l->(%u):%u ret:(%d,%s)\n", fd, key, value, + ret, strerror(errno)); + assert(ret == 0); + } + + return 0; +} + +static int bpf_do_prog(const char *file, uint32_t flags, const char *object) +{ + int fd, sock, ret; + + if (flags & BPF_F_PIN) { + fd = bpf_prog_create(object); + printf("bpf: prog fd:%d (%s)\n", fd, strerror(errno)); + assert(fd > 0); + + ret = bpf_obj_pin(fd, file); + printf("bpf: pin ret:(%d,%s)\n", ret, strerror(errno)); + assert(ret == 0); + } else { + fd = bpf_obj_get(file); + printf("bpf: get fd:%d (%s)\n", fd, strerror(errno)); + assert(fd > 0); + } + + sock = open_raw_sock("lo"); + assert(sock > 0); + + ret = setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &fd, sizeof(fd)); + printf("bpf: sock:%d <- fd:%d attached ret:(%d,%s)\n", sock, fd, + ret, strerror(errno)); + assert(ret == 0); + + return 0; +} + +int main(int argc, char **argv) +{ + const char *file = NULL, *object = NULL; + uint32_t key = 0, value = 0, flags = 0; + int opt, mode = BPF_M_UNSPEC; + + while ((opt = getopt(argc, argv, "F:PGmk:v:po:")) != -1) { + switch (opt) { + /* General args */ + case 'F': + file = optarg; + break; + case 'P': + flags |= BPF_F_PIN; + break; + case 'G': + flags |= BPF_F_GET; + break; + /* Map-related args */ + case 'm': + mode = 
BPF_M_MAP; + break; + case 'k': + key = strtoul(optarg, NULL, 0); + flags |= BPF_F_KEY; + break; + case 'v': + value = strtoul(optarg, NULL, 0); + flags |= BPF_F_VAL; + break; + /* Prog-related args */ + case 'p': + mode = BPF_M_PROG; + break; + case 'o': + object = optarg; + break; + default: + goto out; + } + } + + if (!(flags & BPF_F_PIN_GET) || !file) + goto out; + + switch (mode) { + case BPF_M_MAP: + return bpf_do_map(file, flags, key, value); + case BPF_M_PROG: + return bpf_do_prog(file, flags, object); + } +out: + usage(); + return -1; +} diff --git a/samples/bpf/hash_func01.h b/samples/bpf/hash_func01.h new file mode 100644 index 000000000..38255812e --- /dev/null +++ b/samples/bpf/hash_func01.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: LGPL-2.1 + * + * Based on Paul Hsieh's (LGPG 2.1) hash function + * From: http://www.azillionmonkeys.com/qed/hash.html + */ + +#define get16bits(d) (*((const __u16 *) (d))) + +static __always_inline +__u32 SuperFastHash (const char *data, int len, __u32 initval) { + __u32 hash = initval; + __u32 tmp; + int rem; + + if (len <= 0 || data == NULL) return 0; + + rem = len & 3; + len >>= 2; + + /* Main loop */ +#pragma clang loop unroll(full) + for (;len > 0; len--) { + hash += get16bits (data); + tmp = (get16bits (data+2) << 11) ^ hash; + hash = (hash << 16) ^ tmp; + data += 2*sizeof (__u16); + hash += hash >> 11; + } + + /* Handle end cases */ + switch (rem) { + case 3: hash += get16bits (data); + hash ^= hash << 16; + hash ^= ((signed char)data[sizeof (__u16)]) << 18; + hash += hash >> 11; + break; + case 2: hash += get16bits (data); + hash ^= hash << 11; + hash += hash >> 17; + break; + case 1: hash += (signed char)*data; + hash ^= hash << 10; + hash += hash >> 1; + } + + /* Force "avalanching" of final 127 bits */ + hash ^= hash << 3; + hash += hash >> 5; + hash ^= hash << 4; + hash += hash >> 17; + hash ^= hash << 25; + hash += hash >> 6; + + return hash; +} diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c new file 
mode 100644 index 000000000..8e48489b9 --- /dev/null +++ b/samples/bpf/hbm.c @@ -0,0 +1,500 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Example program for Host Bandwidth Managment + * + * This program loads a cgroup skb BPF program to enforce cgroup output + * (egress) or input (ingress) bandwidth limits. + * + * USAGE: hbm [-d] [-l] [-n <id>] [-r <rate>] [-s] [-t <secs>] [-w] [-h] [prog] + * Where: + * -d Print BPF trace debug buffer + * -l Also limit flows doing loopback + * -n <#> To create cgroup \"/hbm#\" and attach prog + * Default is /hbm1 + * --no_cn Do not return cn notifications + * -r <rate> Rate limit in Mbps + * -s Get HBM stats (marked, dropped, etc.) + * -t <time> Exit after specified seconds (default is 0) + * -w Work conserving flag. cgroup can increase its bandwidth + * beyond the rate limit specified while there is available + * bandwidth. Current implementation assumes there is only + * NIC (eth0), but can be extended to support multiple NICs. + * Currrently only supported for egress. + * -h Print this info + * prog BPF program file name. 
Name defaults to hbm_out_kern.o + */ + +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +#include <sys/resource.h> +#include <sys/time.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include <linux/unistd.h> +#include <linux/compiler.h> + +#include <linux/bpf.h> +#include <bpf/bpf.h> +#include <getopt.h> + +#include "bpf_load.h" +#include "bpf_rlimit.h" +#include "cgroup_helpers.h" +#include "hbm.h" +#include "bpf_util.h" +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +bool outFlag = true; +int minRate = 1000; /* cgroup rate limit in Mbps */ +int rate = 1000; /* can grow if rate conserving is enabled */ +int dur = 1; +bool stats_flag; +bool loopback_flag; +bool debugFlag; +bool work_conserving_flag; +bool no_cn_flag; +bool edt_flag; + +static void Usage(void); +static void read_trace_pipe2(void); +static void do_error(char *msg, bool errno_flag); + +#define DEBUGFS "/sys/kernel/debug/tracing/" + +struct bpf_object *obj; +int bpfprog_fd; +int cgroup_storage_fd; + +static void read_trace_pipe2(void) +{ + int trace_fd; + FILE *outf; + char *outFname = "hbm_out.log"; + + trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0); + if (trace_fd < 0) { + printf("Error opening trace_pipe\n"); + return; + } + +// Future support of ingress +// if (!outFlag) +// outFname = "hbm_in.log"; + outf = fopen(outFname, "w"); + + if (outf == NULL) + printf("Error creating %s\n", outFname); + + while (1) { + static char buf[4097]; + ssize_t sz; + + sz = read(trace_fd, buf, sizeof(buf) - 1); + if (sz > 0) { + buf[sz] = 0; + puts(buf); + if (outf != NULL) { + fprintf(outf, "%s\n", buf); + fflush(outf); + } + } + } +} + +static void do_error(char *msg, bool errno_flag) +{ + if (errno_flag) + printf("ERROR: %s, errno: %d\n", msg, errno); + else + printf("ERROR: %s\n", msg); + exit(1); +} + +static int prog_load(char *prog) +{ + struct bpf_prog_load_attr prog_load_attr = { + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .file = prog, + 
.expected_attach_type = BPF_CGROUP_INET_EGRESS, + }; + int map_fd; + struct bpf_map *map; + + int ret = 0; + + if (access(prog, O_RDONLY) < 0) { + printf("Error accessing file %s: %s\n", prog, strerror(errno)); + return 1; + } + if (bpf_prog_load_xattr(&prog_load_attr, &obj, &bpfprog_fd)) + ret = 1; + if (!ret) { + map = bpf_object__find_map_by_name(obj, "queue_stats"); + map_fd = bpf_map__fd(map); + if (map_fd < 0) { + printf("Map not found: %s\n", strerror(map_fd)); + ret = 1; + } + } + + if (ret) { + printf("ERROR: bpf_prog_load_xattr failed for: %s\n", prog); + printf(" Output from verifier:\n%s\n------\n", bpf_log_buf); + ret = -1; + } else { + ret = map_fd; + } + + return ret; +} + +static int run_bpf_prog(char *prog, int cg_id) +{ + int map_fd; + int rc = 0; + int key = 0; + int cg1 = 0; + int type = BPF_CGROUP_INET_EGRESS; + char cg_dir[100]; + struct hbm_queue_stats qstats = {0}; + + sprintf(cg_dir, "/hbm%d", cg_id); + map_fd = prog_load(prog); + if (map_fd == -1) + return 1; + + if (setup_cgroup_environment()) { + printf("ERROR: setting cgroup environment\n"); + goto err; + } + cg1 = create_and_get_cgroup(cg_dir); + if (!cg1) { + printf("ERROR: create_and_get_cgroup\n"); + goto err; + } + if (join_cgroup(cg_dir)) { + printf("ERROR: join_cgroup\n"); + goto err; + } + + qstats.rate = rate; + qstats.stats = stats_flag ? 1 : 0; + qstats.loopback = loopback_flag ? 1 : 0; + qstats.no_cn = no_cn_flag ? 
1 : 0; + if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) { + printf("ERROR: Could not update map element\n"); + goto err; + } + + if (!outFlag) + type = BPF_CGROUP_INET_INGRESS; + if (bpf_prog_attach(bpfprog_fd, cg1, type, 0)) { + printf("ERROR: bpf_prog_attach fails!\n"); + log_err("Attaching prog"); + goto err; + } + + if (work_conserving_flag) { + struct timeval t0, t_last, t_new; + FILE *fin; + unsigned long long last_eth_tx_bytes, new_eth_tx_bytes; + signed long long last_cg_tx_bytes, new_cg_tx_bytes; + signed long long delta_time, delta_bytes, delta_rate; + int delta_ms; +#define DELTA_RATE_CHECK 10000 /* in us */ +#define RATE_THRESHOLD 9500000000 /* 9.5 Gbps */ + + bpf_map_lookup_elem(map_fd, &key, &qstats); + if (gettimeofday(&t0, NULL) < 0) + do_error("gettimeofday failed", true); + t_last = t0; + fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", "r"); + if (fscanf(fin, "%llu", &last_eth_tx_bytes) != 1) + do_error("fscanf fails", false); + fclose(fin); + last_cg_tx_bytes = qstats.bytes_total; + while (true) { + usleep(DELTA_RATE_CHECK); + if (gettimeofday(&t_new, NULL) < 0) + do_error("gettimeofday failed", true); + delta_ms = (t_new.tv_sec - t0.tv_sec) * 1000 + + (t_new.tv_usec - t0.tv_usec)/1000; + if (delta_ms > dur * 1000) + break; + delta_time = (t_new.tv_sec - t_last.tv_sec) * 1000000 + + (t_new.tv_usec - t_last.tv_usec); + if (delta_time == 0) + continue; + t_last = t_new; + fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", + "r"); + if (fscanf(fin, "%llu", &new_eth_tx_bytes) != 1) + do_error("fscanf fails", false); + fclose(fin); + printf(" new_eth_tx_bytes:%llu\n", + new_eth_tx_bytes); + bpf_map_lookup_elem(map_fd, &key, &qstats); + new_cg_tx_bytes = qstats.bytes_total; + delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes; + last_eth_tx_bytes = new_eth_tx_bytes; + delta_rate = (delta_bytes * 8000000) / delta_time; + printf("%5d - eth_rate:%.1fGbps cg_rate:%.3fGbps", + delta_ms, delta_rate/1000000000.0, + rate/1000.0); + if 
(delta_rate < RATE_THRESHOLD) { + /* can increase cgroup rate limit, but first + * check if we are using the current limit. + * Currently increasing by 6.25%, unknown + * if that is the optimal rate. + */ + int rate_diff100; + + delta_bytes = new_cg_tx_bytes - + last_cg_tx_bytes; + last_cg_tx_bytes = new_cg_tx_bytes; + delta_rate = (delta_bytes * 8000000) / + delta_time; + printf(" rate:%.3fGbps", + delta_rate/1000000000.0); + rate_diff100 = (((long long)rate)*1000000 - + delta_rate) * 100 / + (((long long) rate) * 1000000); + printf(" rdiff:%d", rate_diff100); + if (rate_diff100 <= 3) { + rate += (rate >> 4); + if (rate > RATE_THRESHOLD / 1000000) + rate = RATE_THRESHOLD / 1000000; + qstats.rate = rate; + printf(" INC\n"); + } else { + printf("\n"); + } + } else { + /* Need to decrease cgroup rate limit. + * Currently decreasing by 12.5%, unknown + * if that is optimal + */ + printf(" DEC\n"); + rate -= (rate >> 3); + if (rate < minRate) + rate = minRate; + qstats.rate = rate; + } + if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) + do_error("update map element fails", false); + } + } else { + sleep(dur); + } + // Get stats! 
+ if (stats_flag && bpf_map_lookup_elem(map_fd, &key, &qstats)) { + char fname[100]; + FILE *fout; + + if (!outFlag) + sprintf(fname, "hbm.%d.in", cg_id); + else + sprintf(fname, "hbm.%d.out", cg_id); + fout = fopen(fname, "w"); + fprintf(fout, "id:%d\n", cg_id); + fprintf(fout, "ERROR: Could not lookup queue_stats\n"); + fclose(fout); + } else if (stats_flag && qstats.lastPacketTime > + qstats.firstPacketTime) { + long long delta_us = (qstats.lastPacketTime - + qstats.firstPacketTime)/1000; + unsigned int rate_mbps = ((qstats.bytes_total - + qstats.bytes_dropped) * 8 / + delta_us); + double percent_pkts, percent_bytes; + char fname[100]; + FILE *fout; + int k; + static const char *returnValNames[] = { + "DROP_PKT", + "ALLOW_PKT", + "DROP_PKT_CWR", + "ALLOW_PKT_CWR" + }; +#define RET_VAL_COUNT 4 + +// Future support of ingress +// if (!outFlag) +// sprintf(fname, "hbm.%d.in", cg_id); +// else + sprintf(fname, "hbm.%d.out", cg_id); + fout = fopen(fname, "w"); + fprintf(fout, "id:%d\n", cg_id); + fprintf(fout, "rate_mbps:%d\n", rate_mbps); + fprintf(fout, "duration:%.1f secs\n", + (qstats.lastPacketTime - qstats.firstPacketTime) / + 1000000000.0); + fprintf(fout, "packets:%d\n", (int)qstats.pkts_total); + fprintf(fout, "bytes_MB:%d\n", (int)(qstats.bytes_total / + 1000000)); + fprintf(fout, "pkts_dropped:%d\n", (int)qstats.pkts_dropped); + fprintf(fout, "bytes_dropped_MB:%d\n", + (int)(qstats.bytes_dropped / + 1000000)); + // Marked Pkts and Bytes + percent_pkts = (qstats.pkts_marked * 100.0) / + (qstats.pkts_total + 1); + percent_bytes = (qstats.bytes_marked * 100.0) / + (qstats.bytes_total + 1); + fprintf(fout, "pkts_marked_percent:%6.2f\n", percent_pkts); + fprintf(fout, "bytes_marked_percent:%6.2f\n", percent_bytes); + + // Dropped Pkts and Bytes + percent_pkts = (qstats.pkts_dropped * 100.0) / + (qstats.pkts_total + 1); + percent_bytes = (qstats.bytes_dropped * 100.0) / + (qstats.bytes_total + 1); + fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts); + 
fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes); + + // ECN CE markings + percent_pkts = (qstats.pkts_ecn_ce * 100.0) / + (qstats.pkts_total + 1); + fprintf(fout, "pkts_ecn_ce:%6.2f (%d)\n", percent_pkts, + (int)qstats.pkts_ecn_ce); + + // Average cwnd + fprintf(fout, "avg cwnd:%d\n", + (int)(qstats.sum_cwnd / (qstats.sum_cwnd_cnt + 1))); + // Average rtt + fprintf(fout, "avg rtt:%d\n", + (int)(qstats.sum_rtt / (qstats.pkts_total + 1))); + // Average credit + if (edt_flag) + fprintf(fout, "avg credit_ms:%.03f\n", + (qstats.sum_credit / + (qstats.pkts_total + 1.0)) / 1000000.0); + else + fprintf(fout, "avg credit:%d\n", + (int)(qstats.sum_credit / + (1500 * ((int)qstats.pkts_total ) + 1))); + + // Return values stats + for (k = 0; k < RET_VAL_COUNT; k++) { + percent_pkts = (qstats.returnValCount[k] * 100.0) / + (qstats.pkts_total + 1); + fprintf(fout, "%s:%6.2f (%d)\n", returnValNames[k], + percent_pkts, (int)qstats.returnValCount[k]); + } + fclose(fout); + } + + if (debugFlag) + read_trace_pipe2(); + return rc; +err: + rc = 1; + + if (cg1) + close(cg1); + cleanup_cgroup_environment(); + + return rc; +} + +static void Usage(void) +{ + printf("This program loads a cgroup skb BPF program to enforce\n" + "cgroup output (egress) bandwidth limits.\n\n" + "USAGE: hbm [-o] [-d] [-l] [-n <id>] [--no_cn] [-r <rate>]\n" + " [-s] [-t <secs>] [-w] [-h] [prog]\n" + " Where:\n" + " -o indicates egress direction (default)\n" + " -d print BPF trace debug buffer\n" + " --edt use fq's Earliest Departure Time\n" + " -l also limit flows using loopback\n" + " -n <#> to create cgroup \"/hbm#\" and attach prog\n" + " Default is /hbm1\n" + " --no_cn disable CN notifications\n" + " -r <rate> Rate in Mbps\n" + " -s Update HBM stats\n" + " -t <time> Exit after specified seconds (default is 0)\n" + " -w Work conserving flag. cgroup can increase\n" + " bandwidth beyond the rate limit specified\n" + " while there is available bandwidth. 
Current\n" + " implementation assumes there is only eth0\n" + " but can be extended to support multiple NICs\n" + " -h print this info\n" + " prog BPF program file name. Name defaults to\n" + " hbm_out_kern.o\n"); +} + +int main(int argc, char **argv) +{ + char *prog = "hbm_out_kern.o"; + int k; + int cg_id = 1; + char *optstring = "iodln:r:st:wh"; + struct option loptions[] = { + {"no_cn", 0, NULL, 1}, + {"edt", 0, NULL, 2}, + {NULL, 0, NULL, 0} + }; + + while ((k = getopt_long(argc, argv, optstring, loptions, NULL)) != -1) { + switch (k) { + case 1: + no_cn_flag = true; + break; + case 2: + prog = "hbm_edt_kern.o"; + edt_flag = true; + break; + case'o': + break; + case 'd': + debugFlag = true; + break; + case 'l': + loopback_flag = true; + break; + case 'n': + cg_id = atoi(optarg); + break; + case 'r': + minRate = atoi(optarg) * 1.024; + rate = minRate; + break; + case 's': + stats_flag = true; + break; + case 't': + dur = atoi(optarg); + break; + case 'w': + work_conserving_flag = true; + break; + case '?': + if (optopt == 'n' || optopt == 'r' || optopt == 't') + fprintf(stderr, + "Option -%c requires an argument.\n\n", + optopt); + case 'h': + __fallthrough; + default: + Usage(); + return 0; + } + } + + if (optind < argc) + prog = argv[optind]; + printf("HBM prog: %s\n", prog != NULL ? prog : "NULL"); + + return run_bpf_prog(prog, cg_id); +} diff --git a/samples/bpf/hbm.h b/samples/bpf/hbm.h new file mode 100644 index 000000000..f0963ed6a --- /dev/null +++ b/samples/bpf/hbm.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Include file for Host Bandwidth Management (HBM) programs + */ +struct hbm_vqueue { + struct bpf_spin_lock lock; + /* 4 byte hole */ + unsigned long long lasttime; /* In ns */ + int credit; /* In bytes */ + unsigned int rate; /* In bytes per NS << 20 */ +}; + +struct hbm_queue_stats { + unsigned long rate; /* in Mbps*/ + unsigned long stats:1, /* get HBM stats (marked, dropped,..) */ + loopback:1, /* also limit flows using loopback */ + no_cn:1; /* do not use cn flags */ + unsigned long long pkts_marked; + unsigned long long bytes_marked; + unsigned long long pkts_dropped; + unsigned long long bytes_dropped; + unsigned long long pkts_total; + unsigned long long bytes_total; + unsigned long long firstPacketTime; + unsigned long long lastPacketTime; + unsigned long long pkts_ecn_ce; + unsigned long long returnValCount[4]; + unsigned long long sum_cwnd; + unsigned long long sum_rtt; + unsigned long long sum_cwnd_cnt; + long long sum_credit; +}; diff --git a/samples/bpf/hbm_edt_kern.c b/samples/bpf/hbm_edt_kern.c new file mode 100644 index 000000000..a65b677ac --- /dev/null +++ b/samples/bpf/hbm_edt_kern.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Sample Host Bandwidth Manager (HBM) BPF program. + * + * A cgroup skb BPF egress program to limit cgroup output bandwidth. + * It uses a modified virtual token bucket queue to limit average + * egress bandwidth. The implementation uses credits instead of tokens. + * Negative credits imply that queueing would have happened (this is + * a virtual queue, so no queueing is done by it. However, queueing may + * occur at the actual qdisc (which is not used for rate limiting). 
+ * + * This implementation uses 3 thresholds, one to start marking packets and + * the other two to drop packets: + * CREDIT + * - <--------------------------|------------------------> + + * | | | 0 + * | Large pkt | + * | drop thresh | + * Small pkt drop Mark threshold + * thresh + * + * The effect of marking depends on the type of packet: + * a) If the packet is ECN enabled and it is a TCP packet, then the packet + * is ECN marked. + * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr + * to reduce the congestion window. The current implementation uses a linear + * distribution (0% probability at marking threshold, 100% probability + * at drop threshold). + * c) If the packet is not a TCP packet, then it is dropped. + * + * If the credit is below the drop threshold, the packet is dropped. If it + * is a TCP packet, then it also calls tcp_cwr since packets dropped by + * by a cgroup skb BPF program do not automatically trigger a call to + * tcp_cwr in the current kernel code. + * + * This BPF program actually uses 2 drop thresholds, one threshold + * for larger packets (>= 120 bytes) and another for smaller packets. This + * protects smaller packets such as SYNs, ACKs, etc. + * + * The default bandwidth limit is set at 1Gbps but this can be changed by + * a user program through a shared BPF map. In addition, by default this BPF + * program does not limit connections using loopback. This behavior can be + * overwritten by the user program. There is also an option to calculate + * some statistics, such as percent of packets marked or dropped, which + * a user program, such as hbm, can access. 
+ */ + +#include "hbm_kern.h" + +SEC("cgroup_skb/egress") +int _hbm_out_cg(struct __sk_buff *skb) +{ + long long delta = 0, delta_send; + unsigned long long curtime, sendtime; + struct hbm_queue_stats *qsp = NULL; + unsigned int queue_index = 0; + bool congestion_flag = false; + bool ecn_ce_flag = false; + struct hbm_pkt_info pkti = {}; + struct hbm_vqueue *qdp; + bool drop_flag = false; + bool cwr_flag = false; + int len = skb->len; + int rv = ALLOW_PKT; + + qsp = bpf_map_lookup_elem(&queue_stats, &queue_index); + + // Check if we should ignore loopback traffic + if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1)) + return ALLOW_PKT; + + hbm_get_pkt_info(skb, &pkti); + + // We may want to account for the length of headers in len + // calculation, like ETH header + overhead, specially if it + // is a gso packet. But I am not doing it right now. + + qdp = bpf_get_local_storage(&queue_state, 0); + if (!qdp) + return ALLOW_PKT; + if (qdp->lasttime == 0) + hbm_init_edt_vqueue(qdp, 1024); + + curtime = bpf_ktime_get_ns(); + + // Begin critical section + bpf_spin_lock(&qdp->lock); + delta = qdp->lasttime - curtime; + // bound bursts to 100us + if (delta < -BURST_SIZE_NS) { + // negative delta is a credit that allows bursts + qdp->lasttime = curtime - BURST_SIZE_NS; + delta = -BURST_SIZE_NS; + } + sendtime = qdp->lasttime; + delta_send = BYTES_TO_NS(len, qdp->rate); + __sync_add_and_fetch(&(qdp->lasttime), delta_send); + bpf_spin_unlock(&qdp->lock); + // End critical section + + // Set EDT of packet + skb->tstamp = sendtime; + + // Check if we should update rate + if (qsp != NULL && (qsp->rate * 128) != qdp->rate) + qdp->rate = qsp->rate * 128; + + // Set flags (drop, congestion, cwr) + // last packet will be sent in the future, bound latency + if (delta > DROP_THRESH_NS || (delta > LARGE_PKT_DROP_THRESH_NS && + len > LARGE_PKT_THRESH)) { + drop_flag = true; + if (pkti.is_tcp && pkti.ecn == 0) + cwr_flag = true; + } else if (delta > MARK_THRESH_NS) { + if 
(pkti.is_tcp) + congestion_flag = true; + else + drop_flag = true; + } + + if (congestion_flag) { + if (bpf_skb_ecn_set_ce(skb)) { + ecn_ce_flag = true; + } else { + if (pkti.is_tcp) { + unsigned int rand = bpf_get_prandom_u32(); + + if (delta >= MARK_THRESH_NS + + (rand % MARK_REGION_SIZE_NS)) { + // Do congestion control + cwr_flag = true; + } + } else if (len > LARGE_PKT_THRESH) { + // Problem if too many small packets? + drop_flag = true; + congestion_flag = false; + } + } + } + + if (pkti.is_tcp && drop_flag && pkti.packets_out <= 1) { + drop_flag = false; + cwr_flag = true; + congestion_flag = false; + } + + if (qsp != NULL && qsp->no_cn) + cwr_flag = false; + + hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag, + cwr_flag, ecn_ce_flag, &pkti, (int) delta); + + if (drop_flag) { + __sync_add_and_fetch(&(qdp->lasttime), -delta_send); + rv = DROP_PKT; + } + + if (cwr_flag) + rv |= CWR; + return rv; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h new file mode 100644 index 000000000..e00f26f6a --- /dev/null +++ b/samples/bpf/hbm_kern.h @@ -0,0 +1,217 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Include file for sample Host Bandwidth Manager (HBM) BPF programs + */ +#define KBUILD_MODNAME "foo" +#include <stddef.h> +#include <stdbool.h> +#include <uapi/linux/bpf.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/if_packet.h> +#include <uapi/linux/ip.h> +#include <uapi/linux/ipv6.h> +#include <uapi/linux/in.h> +#include <uapi/linux/tcp.h> +#include <uapi/linux/filter.h> +#include <uapi/linux/pkt_cls.h> +#include <net/ipv6.h> +#include <net/inet_ecn.h> +#include <bpf/bpf_endian.h> +#include <bpf/bpf_helpers.h> +#include "hbm.h" + +#define DROP_PKT 0 +#define ALLOW_PKT 1 +#define TCP_ECN_OK 1 +#define CWR 2 + +#ifndef HBM_DEBUG // Define HBM_DEBUG to enable debugging +#undef bpf_printk +#define bpf_printk(fmt, ...) +#endif + +#define INITIAL_CREDIT_PACKETS 100 +#define MAX_BYTES_PER_PACKET 1500 +#define MARK_THRESH (40 * MAX_BYTES_PER_PACKET) +#define DROP_THRESH (80 * 5 * MAX_BYTES_PER_PACKET) +#define LARGE_PKT_DROP_THRESH (DROP_THRESH - (15 * MAX_BYTES_PER_PACKET)) +#define MARK_REGION_SIZE (LARGE_PKT_DROP_THRESH - MARK_THRESH) +#define LARGE_PKT_THRESH 120 +#define MAX_CREDIT (100 * MAX_BYTES_PER_PACKET) +#define INIT_CREDIT (INITIAL_CREDIT_PACKETS * MAX_BYTES_PER_PACKET) + +// Time base accounting for fq's EDT +#define BURST_SIZE_NS 100000 // 100us +#define MARK_THRESH_NS 50000 // 50us +#define DROP_THRESH_NS 500000 // 500us +// Reserve 20us of queuing for small packets (less than 120 bytes) +#define LARGE_PKT_DROP_THRESH_NS (DROP_THRESH_NS - 20000) +#define MARK_REGION_SIZE_NS (LARGE_PKT_DROP_THRESH_NS - MARK_THRESH_NS) + +// rate in bytes per ns << 20 +#define CREDIT_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20) +#define BYTES_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20) +#define BYTES_TO_NS(bytes, rate) div64_u64(((u64)(bytes)) << 20, (u64)(rate)) + +struct { + __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE); + __type(key, struct bpf_cgroup_storage_key); + __type(value, struct hbm_vqueue); +} queue_state SEC(".maps"); + 
+struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct hvm_queue_stats); +} queue_stats SEC(".maps"); + +struct hbm_pkt_info { + int cwnd; + int rtt; + int packets_out; + bool is_ip; + bool is_tcp; + short ecn; +}; + +static int get_tcp_info(struct __sk_buff *skb, struct hbm_pkt_info *pkti) +{ + struct bpf_sock *sk; + struct bpf_tcp_sock *tp; + + sk = skb->sk; + if (sk) { + sk = bpf_sk_fullsock(sk); + if (sk) { + if (sk->protocol == IPPROTO_TCP) { + tp = bpf_tcp_sock(sk); + if (tp) { + pkti->cwnd = tp->snd_cwnd; + pkti->rtt = tp->srtt_us >> 3; + pkti->packets_out = tp->packets_out; + return 0; + } + } + } + } + pkti->cwnd = 0; + pkti->rtt = 0; + pkti->packets_out = 0; + return 1; +} + +static void hbm_get_pkt_info(struct __sk_buff *skb, + struct hbm_pkt_info *pkti) +{ + struct iphdr iph; + struct ipv6hdr *ip6h; + + pkti->cwnd = 0; + pkti->rtt = 0; + bpf_skb_load_bytes(skb, 0, &iph, 12); + if (iph.version == 6) { + ip6h = (struct ipv6hdr *)&iph; + pkti->is_ip = true; + pkti->is_tcp = (ip6h->nexthdr == 6); + pkti->ecn = (ip6h->flow_lbl[0] >> 4) & INET_ECN_MASK; + } else if (iph.version == 4) { + pkti->is_ip = true; + pkti->is_tcp = (iph.protocol == 6); + pkti->ecn = iph.tos & INET_ECN_MASK; + } else { + pkti->is_ip = false; + pkti->is_tcp = false; + pkti->ecn = 0; + } + if (pkti->is_tcp) + get_tcp_info(skb, pkti); +} + +static __always_inline void hbm_init_vqueue(struct hbm_vqueue *qdp, int rate) +{ + bpf_printk("Initializing queue_state, rate:%d\n", rate * 128); + qdp->lasttime = bpf_ktime_get_ns(); + qdp->credit = INIT_CREDIT; + qdp->rate = rate * 128; +} + +static __always_inline void hbm_init_edt_vqueue(struct hbm_vqueue *qdp, + int rate) +{ + unsigned long long curtime; + + curtime = bpf_ktime_get_ns(); + bpf_printk("Initializing queue_state, rate:%d\n", rate * 128); + qdp->lasttime = curtime - BURST_SIZE_NS; // support initial burst + qdp->credit = 0; // not used + qdp->rate = rate * 128; +} + +static 
__always_inline void hbm_update_stats(struct hbm_queue_stats *qsp, + int len, + unsigned long long curtime, + bool congestion_flag, + bool drop_flag, + bool cwr_flag, + bool ecn_ce_flag, + struct hbm_pkt_info *pkti, + int credit) +{ + int rv = ALLOW_PKT; + + if (qsp != NULL) { + // Following is needed for work conserving + __sync_add_and_fetch(&(qsp->bytes_total), len); + if (qsp->stats) { + // Optionally update statistics + if (qsp->firstPacketTime == 0) + qsp->firstPacketTime = curtime; + qsp->lastPacketTime = curtime; + __sync_add_and_fetch(&(qsp->pkts_total), 1); + if (congestion_flag) { + __sync_add_and_fetch(&(qsp->pkts_marked), 1); + __sync_add_and_fetch(&(qsp->bytes_marked), len); + } + if (drop_flag) { + __sync_add_and_fetch(&(qsp->pkts_dropped), 1); + __sync_add_and_fetch(&(qsp->bytes_dropped), + len); + } + if (ecn_ce_flag) + __sync_add_and_fetch(&(qsp->pkts_ecn_ce), 1); + if (pkti->cwnd) { + __sync_add_and_fetch(&(qsp->sum_cwnd), + pkti->cwnd); + __sync_add_and_fetch(&(qsp->sum_cwnd_cnt), 1); + } + if (pkti->rtt) + __sync_add_and_fetch(&(qsp->sum_rtt), + pkti->rtt); + __sync_add_and_fetch(&(qsp->sum_credit), credit); + + if (drop_flag) + rv = DROP_PKT; + if (cwr_flag) + rv |= 2; + if (rv == DROP_PKT) + __sync_add_and_fetch(&(qsp->returnValCount[0]), + 1); + else if (rv == ALLOW_PKT) + __sync_add_and_fetch(&(qsp->returnValCount[1]), + 1); + else if (rv == 2) + __sync_add_and_fetch(&(qsp->returnValCount[2]), + 1); + else if (rv == 3) + __sync_add_and_fetch(&(qsp->returnValCount[3]), + 1); + } + } +} diff --git a/samples/bpf/hbm_out_kern.c b/samples/bpf/hbm_out_kern.c new file mode 100644 index 000000000..829934bd4 --- /dev/null +++ b/samples/bpf/hbm_out_kern.c @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Sample Host Bandwidth Manager (HBM) BPF program. + * + * A cgroup skb BPF egress program to limit cgroup output bandwidth. + * It uses a modified virtual token bucket queue to limit average + * egress bandwidth. The implementation uses credits instead of tokens. + * Negative credits imply that queueing would have happened (this is + * a virtual queue, so no queueing is done by it. However, queueing may + * occur at the actual qdisc (which is not used for rate limiting). + * + * This implementation uses 3 thresholds, one to start marking packets and + * the other two to drop packets: + * CREDIT + * - <--------------------------|------------------------> + + * | | | 0 + * | Large pkt | + * | drop thresh | + * Small pkt drop Mark threshold + * thresh + * + * The effect of marking depends on the type of packet: + * a) If the packet is ECN enabled and it is a TCP packet, then the packet + * is ECN marked. + * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr + * to reduce the congestion window. The current implementation uses a linear + * distribution (0% probability at marking threshold, 100% probability + * at drop threshold). + * c) If the packet is not a TCP packet, then it is dropped. + * + * If the credit is below the drop threshold, the packet is dropped. If it + * is a TCP packet, then it also calls tcp_cwr since packets dropped by + * by a cgroup skb BPF program do not automatically trigger a call to + * tcp_cwr in the current kernel code. + * + * This BPF program actually uses 2 drop thresholds, one threshold + * for larger packets (>= 120 bytes) and another for smaller packets. This + * protects smaller packets such as SYNs, ACKs, etc. + * + * The default bandwidth limit is set at 1Gbps but this can be changed by + * a user program through a shared BPF map. In addition, by default this BPF + * program does not limit connections using loopback. This behavior can be + * overwritten by the user program. 
There is also an option to calculate + * some statistics, such as percent of packets marked or dropped, which + * the user program can access. + * + * A latter patch provides such a program (hbm.c) + */ + +#include "hbm_kern.h" + +SEC("cgroup_skb/egress") +int _hbm_out_cg(struct __sk_buff *skb) +{ + struct hbm_pkt_info pkti; + int len = skb->len; + unsigned int queue_index = 0; + unsigned long long curtime; + int credit; + signed long long delta = 0, new_credit; + int max_credit = MAX_CREDIT; + bool congestion_flag = false; + bool drop_flag = false; + bool cwr_flag = false; + bool ecn_ce_flag = false; + struct hbm_vqueue *qdp; + struct hbm_queue_stats *qsp = NULL; + int rv = ALLOW_PKT; + + qsp = bpf_map_lookup_elem(&queue_stats, &queue_index); + if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1)) + return ALLOW_PKT; + + hbm_get_pkt_info(skb, &pkti); + + // We may want to account for the length of headers in len + // calculation, like ETH header + overhead, specially if it + // is a gso packet. But I am not doing it right now. 
+ + qdp = bpf_get_local_storage(&queue_state, 0); + if (!qdp) + return ALLOW_PKT; + else if (qdp->lasttime == 0) + hbm_init_vqueue(qdp, 1024); + + curtime = bpf_ktime_get_ns(); + + // Begin critical section + bpf_spin_lock(&qdp->lock); + credit = qdp->credit; + delta = curtime - qdp->lasttime; + /* delta < 0 implies that another process with a curtime greater + * than ours beat us to the critical section and already added + * the new credit, so we should not add it ourselves + */ + if (delta > 0) { + qdp->lasttime = curtime; + new_credit = credit + CREDIT_PER_NS(delta, qdp->rate); + if (new_credit > MAX_CREDIT) + credit = MAX_CREDIT; + else + credit = new_credit; + } + credit -= len; + qdp->credit = credit; + bpf_spin_unlock(&qdp->lock); + // End critical section + + // Check if we should update rate + if (qsp != NULL && (qsp->rate * 128) != qdp->rate) { + qdp->rate = qsp->rate * 128; + bpf_printk("Updating rate: %d (1sec:%llu bits)\n", + (int)qdp->rate, + CREDIT_PER_NS(1000000000, qdp->rate) * 8); + } + + // Set flags (drop, congestion, cwr) + // Dropping => we are congested, so ignore congestion flag + if (credit < -DROP_THRESH || + (len > LARGE_PKT_THRESH && credit < -LARGE_PKT_DROP_THRESH)) { + // Very congested, set drop packet + drop_flag = true; + if (pkti.ecn) + congestion_flag = true; + else if (pkti.is_tcp) + cwr_flag = true; + } else if (credit < 0) { + // Congested, set congestion flag + if (pkti.ecn || pkti.is_tcp) { + if (credit < -MARK_THRESH) + congestion_flag = true; + else + congestion_flag = false; + } else { + congestion_flag = true; + } + } + + if (congestion_flag) { + if (bpf_skb_ecn_set_ce(skb)) { + ecn_ce_flag = true; + } else { + if (pkti.is_tcp) { + unsigned int rand = bpf_get_prandom_u32(); + + if (-credit >= MARK_THRESH + + (rand % MARK_REGION_SIZE)) { + // Do congestion control + cwr_flag = true; + } + } else if (len > LARGE_PKT_THRESH) { + // Problem if too many small packets? 
+ drop_flag = true; + } + } + } + + if (qsp != NULL) + if (qsp->no_cn) + cwr_flag = false; + + hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag, + cwr_flag, ecn_ce_flag, &pkti, credit); + + if (drop_flag) { + __sync_add_and_fetch(&(qdp->credit), len); + rv = DROP_PKT; + } + + if (cwr_flag) + rv |= 2; + return rv; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/ibumad_kern.c b/samples/bpf/ibumad_kern.c new file mode 100644 index 000000000..3a91b4c19 --- /dev/null +++ b/samples/bpf/ibumad_kern.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + +/** + * ibumad BPF sample kernel side + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Copyright(c) 2018 Ira Weiny, Intel Corporation + */ + +#define KBUILD_MODNAME "ibumad_count_pkts_by_class" +#include <uapi/linux/bpf.h> + +#include <bpf/bpf_helpers.h> + + +struct bpf_map_def SEC("maps") read_count = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), /* class; u32 required */ + .value_size = sizeof(u64), /* count of mads read */ + .max_entries = 256, /* Room for all Classes */ +}; + +struct bpf_map_def SEC("maps") write_count = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), /* class; u32 required */ + .value_size = sizeof(u64), /* count of mads written */ + .max_entries = 256, /* Room for all Classes */ +}; + +#undef DEBUG +#ifndef DEBUG +#undef bpf_printk +#define bpf_printk(fmt, ...) 
+#endif + +/* Taken from the current format defined in + * include/trace/events/ib_umad.h + * and + * /sys/kernel/debug/tracing/events/ib_umad/ib_umad_read/format + * /sys/kernel/debug/tracing/events/ib_umad/ib_umad_write/format + */ +struct ib_umad_rw_args { + u64 pad; + u8 port_num; + u8 sl; + u8 path_bits; + u8 grh_present; + u32 id; + u32 status; + u32 timeout_ms; + u32 retires; + u32 length; + u32 qpn; + u32 qkey; + u8 gid_index; + u8 hop_limit; + u16 lid; + u16 attr_id; + u16 pkey_index; + u8 base_version; + u8 mgmt_class; + u8 class_version; + u8 method; + u32 flow_label; + u16 mad_status; + u16 class_specific; + u32 attr_mod; + u64 tid; + u8 gid[16]; + u32 dev_index; + u8 traffic_class; +}; + +SEC("tracepoint/ib_umad/ib_umad_read_recv") +int on_ib_umad_read_recv(struct ib_umad_rw_args *ctx) +{ + u64 zero = 0, *val; + u8 class = ctx->mgmt_class; + + bpf_printk("ib_umad read recv : class 0x%x\n", class); + + val = bpf_map_lookup_elem(&read_count, &class); + if (!val) { + bpf_map_update_elem(&read_count, &class, &zero, BPF_NOEXIST); + val = bpf_map_lookup_elem(&read_count, &class); + if (!val) + return 0; + } + + (*val) += 1; + + return 0; +} +SEC("tracepoint/ib_umad/ib_umad_read_send") +int on_ib_umad_read_send(struct ib_umad_rw_args *ctx) +{ + u64 zero = 0, *val; + u8 class = ctx->mgmt_class; + + bpf_printk("ib_umad read send : class 0x%x\n", class); + + val = bpf_map_lookup_elem(&read_count, &class); + if (!val) { + bpf_map_update_elem(&read_count, &class, &zero, BPF_NOEXIST); + val = bpf_map_lookup_elem(&read_count, &class); + if (!val) + return 0; + } + + (*val) += 1; + + return 0; +} +SEC("tracepoint/ib_umad/ib_umad_write") +int on_ib_umad_write(struct ib_umad_rw_args *ctx) +{ + u64 zero = 0, *val; + u8 class = ctx->mgmt_class; + + bpf_printk("ib_umad write : class 0x%x\n", class); + + val = bpf_map_lookup_elem(&write_count, &class); + if (!val) { + bpf_map_update_elem(&write_count, &class, &zero, BPF_NOEXIST); + val = bpf_map_lookup_elem(&write_count, 
&class); + if (!val) + return 0; + } + + (*val) += 1; + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/ibumad_user.c b/samples/bpf/ibumad_user.c new file mode 100644 index 000000000..fa06eef31 --- /dev/null +++ b/samples/bpf/ibumad_user.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + +/** + * ibumad BPF sample user side + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Copyright(c) 2018 Ira Weiny, Intel Corporation + */ + +#include <linux/bpf.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/types.h> +#include <limits.h> + +#include <sys/resource.h> +#include <getopt.h> +#include <net/if.h> + +#include "bpf_load.h" +#include "bpf_util.h" +#include <bpf/libbpf.h> + +static void dump_counts(int fd) +{ + __u32 key; + __u64 value; + + for (key = 0; key < 256; key++) { + if (bpf_map_lookup_elem(fd, &key, &value)) { + printf("failed to read key %u\n", key); + continue; + } + if (value) + printf("0x%02x : %llu\n", key, value); + } +} + +static void dump_all_counts(void) +{ + printf("Read 'Class : count'\n"); + dump_counts(map_fd[0]); + printf("Write 'Class : count'\n"); + dump_counts(map_fd[1]); +} + +static void dump_exit(int sig) +{ + dump_all_counts(); + exit(0); +} + +static const struct option long_options[] = { + {"help", no_argument, NULL, 'h'}, + {"delay", required_argument, NULL, 'd'}, +}; + +static void usage(char *cmd) +{ + printf("eBPF test program to count packets from various IP addresses\n" + "Usage: %s <options>\n" + " --help, -h this menu\n" + " --delay, -d <delay> wait <delay> sec between prints [1 - 1000000]\n" + , cmd + ); +} + +int main(int argc, char **argv) +{ + unsigned long delay = 5; + int longindex = 0; + int opt; + char bpf_file[256]; + + /* Create the eBPF 
kernel code path name. + * This follows the pattern of all of the other bpf samples + */ + snprintf(bpf_file, sizeof(bpf_file), "%s_kern.o", argv[0]); + + /* Do one final dump when exiting */ + signal(SIGINT, dump_exit); + signal(SIGTERM, dump_exit); + + while ((opt = getopt_long(argc, argv, "hd:rSw", + long_options, &longindex)) != -1) { + switch (opt) { + case 'd': + delay = strtoul(optarg, NULL, 0); + if (delay == ULONG_MAX || delay < 0 || + delay > 1000000) { + fprintf(stderr, "ERROR: invalid delay : %s\n", + optarg); + usage(argv[0]); + return 1; + } + break; + default: + case 'h': + usage(argv[0]); + return 1; + } + } + + if (load_bpf_file(bpf_file)) { + fprintf(stderr, "ERROR: failed to load eBPF from file : %s\n", + bpf_file); + return 1; + } + + while (1) { + sleep(delay); + dump_all_counts(); + } + + return 0; +} diff --git a/samples/bpf/lathist_kern.c b/samples/bpf/lathist_kern.c new file mode 100644 index 000000000..4adfcbbe6 --- /dev/null +++ b/samples/bpf/lathist_kern.c @@ -0,0 +1,99 @@ +/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com + * Copyright (c) 2015 BMW Car IT GmbH + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/version.h> +#include <linux/ptrace.h> +#include <uapi/linux/bpf.h> +#include <bpf/bpf_helpers.h> + +#define MAX_ENTRIES 20 +#define MAX_CPU 4 + +/* We need to stick to static allocated memory (an array instead of + * hash table) because managing dynamic memory from the + * trace_preempt_[on|off] tracepoints hooks is not supported. 
+ */ + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, u64); + __uint(max_entries, MAX_CPU); +} my_map SEC(".maps"); + +SEC("kprobe/trace_preempt_off") +int bpf_prog1(struct pt_regs *ctx) +{ + int cpu = bpf_get_smp_processor_id(); + u64 *ts = bpf_map_lookup_elem(&my_map, &cpu); + + if (ts) + *ts = bpf_ktime_get_ns(); + + return 0; +} + +static unsigned int log2(unsigned int v) +{ + unsigned int r; + unsigned int shift; + + r = (v > 0xFFFF) << 4; v >>= r; + shift = (v > 0xFF) << 3; v >>= shift; r |= shift; + shift = (v > 0xF) << 2; v >>= shift; r |= shift; + shift = (v > 0x3) << 1; v >>= shift; r |= shift; + r |= (v >> 1); + + return r; +} + +static unsigned int log2l(unsigned long v) +{ + unsigned int hi = v >> 32; + + if (hi) + return log2(hi) + 32; + else + return log2(v); +} + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, long); + __uint(max_entries, MAX_CPU * MAX_ENTRIES); +} my_lat SEC(".maps"); + +SEC("kprobe/trace_preempt_on") +int bpf_prog2(struct pt_regs *ctx) +{ + u64 *ts, cur_ts, delta; + int key, cpu; + long *val; + + cpu = bpf_get_smp_processor_id(); + ts = bpf_map_lookup_elem(&my_map, &cpu); + if (!ts) + return 0; + + cur_ts = bpf_ktime_get_ns(); + delta = log2l(cur_ts - *ts); + + if (delta > MAX_ENTRIES - 1) + delta = MAX_ENTRIES - 1; + + key = cpu * MAX_ENTRIES + delta; + val = bpf_map_lookup_elem(&my_lat, &key); + if (val) + __sync_fetch_and_add((long *)val, 1); + + return 0; + +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/lathist_user.c b/samples/bpf/lathist_user.c new file mode 100644 index 000000000..7d8ff2418 --- /dev/null +++ b/samples/bpf/lathist_user.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com + * Copyright (c) 2015 BMW Car IT GmbH + */ +#include <stdio.h> +#include <unistd.h> +#include <stdlib.h> +#include <signal.h> 
+#include <bpf/libbpf.h> +#include <bpf/bpf.h> + +#define MAX_ENTRIES 20 +#define MAX_CPU 4 +#define MAX_STARS 40 + +struct cpu_hist { + long data[MAX_ENTRIES]; + long max; +}; + +static struct cpu_hist cpu_hist[MAX_CPU]; + +static void stars(char *str, long val, long max, int width) +{ + int i; + + for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++) + str[i] = '*'; + if (val > max) + str[i - 1] = '+'; + str[i] = '\0'; +} + +static void print_hist(void) +{ + char starstr[MAX_STARS]; + struct cpu_hist *hist; + int i, j; + + /* clear screen */ + printf("\033[2J"); + + for (j = 0; j < MAX_CPU; j++) { + hist = &cpu_hist[j]; + + /* ignore CPUs without data (maybe offline?) */ + if (hist->max == 0) + continue; + + printf("CPU %d\n", j); + printf(" latency : count distribution\n"); + for (i = 1; i <= MAX_ENTRIES; i++) { + stars(starstr, hist->data[i - 1], hist->max, MAX_STARS); + printf("%8ld -> %-8ld : %-8ld |%-*s|\n", + (1l << i) >> 1, (1l << i) - 1, + hist->data[i - 1], MAX_STARS, starstr); + } + } +} + +static void get_data(int fd) +{ + long key, value; + int c, i; + + for (i = 0; i < MAX_CPU; i++) + cpu_hist[i].max = 0; + + for (c = 0; c < MAX_CPU; c++) { + for (i = 0; i < MAX_ENTRIES; i++) { + key = c * MAX_ENTRIES + i; + bpf_map_lookup_elem(fd, &key, &value); + + cpu_hist[c].data[i] = value; + if (value > cpu_hist[c].max) + cpu_hist[c].max = value; + } + } +} + +int main(int argc, char **argv) +{ + struct bpf_link *links[2]; + struct bpf_program *prog; + struct bpf_object *obj; + char filename[256]; + int map_fd, i = 0; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd = bpf_object__find_map_fd_by_name(obj, "my_lat"); + if (map_fd < 0) { + 
fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + links[i] = bpf_program__attach(prog); + if (libbpf_get_error(links[i])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[i] = NULL; + goto cleanup; + } + i++; + } + + while (1) { + get_data(map_fd); + print_hist(); + sleep(5); + } + +cleanup: + for (i--; i >= 0; i--) + bpf_link__destroy(links[i]); + + bpf_object__close(obj); + return 0; +} diff --git a/samples/bpf/lwt_len_hist.sh b/samples/bpf/lwt_len_hist.sh new file mode 100755 index 000000000..0eda9754f --- /dev/null +++ b/samples/bpf/lwt_len_hist.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +NS1=lwt_ns1 +VETH0=tst_lwt1a +VETH1=tst_lwt1b + +TRACE_ROOT=/sys/kernel/debug/tracing + +function cleanup { + # To reset saved histogram, remove pinned map + rm /sys/fs/bpf/tc/globals/lwt_len_hist_map + ip route del 192.168.253.2/32 dev $VETH0 2> /dev/null + ip link del $VETH0 2> /dev/null + ip link del $VETH1 2> /dev/null + ip netns exec $NS1 killall netserver + ip netns delete $NS1 2> /dev/null +} + +cleanup + +ip netns add $NS1 +ip link add $VETH0 type veth peer name $VETH1 +ip link set dev $VETH0 up +ip addr add 192.168.253.1/24 dev $VETH0 +ip link set $VETH1 netns $NS1 +ip netns exec $NS1 ip link set dev $VETH1 up +ip netns exec $NS1 ip addr add 192.168.253.2/24 dev $VETH1 +ip netns exec $NS1 netserver + +echo 1 > ${TRACE_ROOT}/tracing_on +cp /dev/null ${TRACE_ROOT}/trace +ip route add 192.168.253.2/32 encap bpf out obj lwt_len_hist_kern.o section len_hist dev $VETH0 +netperf -H 192.168.253.2 -t TCP_STREAM +cat ${TRACE_ROOT}/trace | grep -v '^#' +./lwt_len_hist +cleanup +echo 0 > ${TRACE_ROOT}/tracing_on + +exit 0 diff --git a/samples/bpf/lwt_len_hist_kern.c b/samples/bpf/lwt_len_hist_kern.c new file mode 100644 index 000000000..9ed63e10e --- /dev/null +++ b/samples/bpf/lwt_len_hist_kern.c @@ -0,0 +1,82 @@ +/* Copyright (c) 2016 Thomas Graf 
<tgraf@tgraf.ch> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <uapi/linux/bpf.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/ip.h> +#include <uapi/linux/in.h> +#include <bpf/bpf_helpers.h> + +# define printk(fmt, ...) \ + ({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ + }) + +struct bpf_elf_map { + __u32 type; + __u32 size_key; + __u32 size_value; + __u32 max_elem; + __u32 flags; + __u32 id; + __u32 pinning; +}; + +struct bpf_elf_map SEC("maps") lwt_len_hist_map = { + .type = BPF_MAP_TYPE_PERCPU_HASH, + .size_key = sizeof(__u64), + .size_value = sizeof(__u64), + .pinning = 2, + .max_elem = 1024, +}; + +static unsigned int log2(unsigned int v) +{ + unsigned int r; + unsigned int shift; + + r = (v > 0xFFFF) << 4; v >>= r; + shift = (v > 0xFF) << 3; v >>= shift; r |= shift; + shift = (v > 0xF) << 2; v >>= shift; r |= shift; + shift = (v > 0x3) << 1; v >>= shift; r |= shift; + r |= (v >> 1); + return r; +} + +static unsigned int log2l(unsigned long v) +{ + unsigned int hi = v >> 32; + if (hi) + return log2(hi) + 32; + else + return log2(v); +} + +SEC("len_hist") +int do_len_hist(struct __sk_buff *skb) +{ + __u64 *value, key, init_val = 1; + + key = log2l(skb->len); + + value = bpf_map_lookup_elem(&lwt_len_hist_map, &key); + if (value) + __sync_fetch_and_add(value, 1); + else + bpf_map_update_elem(&lwt_len_hist_map, &key, &init_val, BPF_ANY); + + return BPF_OK; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/lwt_len_hist_user.c b/samples/bpf/lwt_len_hist_user.c new file 
mode 100644 index 000000000..430a4b7e3 --- /dev/null +++ b/samples/bpf/lwt_len_hist_user.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/unistd.h> +#include <linux/bpf.h> + +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> +#include <arpa/inet.h> + +#include <bpf/bpf.h> +#include "bpf_util.h" + +#define MAX_INDEX 64 +#define MAX_STARS 38 + +static void stars(char *str, long val, long max, int width) +{ + int i; + + for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++) + str[i] = '*'; + if (val > max) + str[i - 1] = '+'; + str[i] = '\0'; +} + +int main(int argc, char **argv) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + const char *map_filename = "/sys/fs/bpf/tc/globals/lwt_len_hist_map"; + uint64_t values[nr_cpus], sum, max_value = 0, data[MAX_INDEX] = {}; + uint64_t key = 0, next_key, max_key = 0; + char starstr[MAX_STARS]; + int i, map_fd; + + map_fd = bpf_obj_get(map_filename); + if (map_fd < 0) { + fprintf(stderr, "bpf_obj_get(%s): %s(%d)\n", + map_filename, strerror(errno), errno); + return -1; + } + + while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0) { + if (next_key >= MAX_INDEX) { + fprintf(stderr, "Key %lu out of bounds\n", next_key); + continue; + } + + bpf_map_lookup_elem(map_fd, &next_key, values); + + sum = 0; + for (i = 0; i < nr_cpus; i++) + sum += values[i]; + + data[next_key] = sum; + if (sum && next_key > max_key) + max_key = next_key; + + if (sum > max_value) + max_value = sum; + + key = next_key; + } + + for (i = 1; i <= max_key + 1; i++) { + stars(starstr, data[i - 1], max_value, MAX_STARS); + printf("%8ld -> %-8ld : %-8ld |%-*s|\n", + (1l << i) >> 1, (1l << i) - 1, data[i - 1], + MAX_STARS, starstr); + } + + close(map_fd); + + return 0; +} diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c new file mode 100644 index 000000000..8773f22b6 --- /dev/null +++ b/samples/bpf/map_perf_test_kern.c @@ -0,0 +1,291 @@ +/* 
Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/version.h> +#include <uapi/linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> +#include "trace_common.h" + +#define MAX_ENTRIES 1000 +#define MAX_NR_CPUS 1024 + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, long); + __uint(max_entries, MAX_ENTRIES); +} hash_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __type(key, u32); + __type(value, long); + __uint(max_entries, 10000); +} lru_hash_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __type(key, u32); + __type(value, long); + __uint(max_entries, 10000); + __uint(map_flags, BPF_F_NO_COMMON_LRU); +} nocommon_lru_hash_map SEC(".maps"); + +struct inner_lru { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __type(key, u32); + __type(value, long); + __uint(max_entries, MAX_ENTRIES); + __uint(map_flags, BPF_F_NUMA_NODE); + __uint(numa_node, 0); +} inner_lru_hash_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, MAX_NR_CPUS); + __uint(key_size, sizeof(u32)); + __array(values, struct inner_lru); /* use inner_lru as inner map */ +} array_of_lru_hashs SEC(".maps") = { + /* statically initialize the first element */ + .values = { &inner_lru_hash_map }, +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(long)); + __uint(max_entries, MAX_ENTRIES); +} percpu_hash_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, long); + __uint(max_entries, MAX_ENTRIES); + __uint(map_flags, BPF_F_NO_PREALLOC); +} hash_map_alloc SEC(".maps"); + +struct { + 
__uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(long)); + __uint(max_entries, MAX_ENTRIES); + __uint(map_flags, BPF_F_NO_PREALLOC); +} percpu_hash_map_alloc SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LPM_TRIE); + __uint(key_size, 8); + __uint(value_size, sizeof(long)); + __uint(max_entries, 10000); + __uint(map_flags, BPF_F_NO_PREALLOC); +} lpm_trie_map_alloc SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, long); + __uint(max_entries, MAX_ENTRIES); +} array_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __type(key, u32); + __type(value, long); + __uint(max_entries, MAX_ENTRIES); +} lru_hash_lookup_map SEC(".maps"); + +SEC("kprobe/" SYSCALL(sys_getuid)) +int stress_hmap(struct pt_regs *ctx) +{ + u32 key = bpf_get_current_pid_tgid(); + long init_val = 1; + long *value; + + bpf_map_update_elem(&hash_map, &key, &init_val, BPF_ANY); + value = bpf_map_lookup_elem(&hash_map, &key); + if (value) + bpf_map_delete_elem(&hash_map, &key); + + return 0; +} + +SEC("kprobe/" SYSCALL(sys_geteuid)) +int stress_percpu_hmap(struct pt_regs *ctx) +{ + u32 key = bpf_get_current_pid_tgid(); + long init_val = 1; + long *value; + + bpf_map_update_elem(&percpu_hash_map, &key, &init_val, BPF_ANY); + value = bpf_map_lookup_elem(&percpu_hash_map, &key); + if (value) + bpf_map_delete_elem(&percpu_hash_map, &key); + return 0; +} + +SEC("kprobe/" SYSCALL(sys_getgid)) +int stress_hmap_alloc(struct pt_regs *ctx) +{ + u32 key = bpf_get_current_pid_tgid(); + long init_val = 1; + long *value; + + bpf_map_update_elem(&hash_map_alloc, &key, &init_val, BPF_ANY); + value = bpf_map_lookup_elem(&hash_map_alloc, &key); + if (value) + bpf_map_delete_elem(&hash_map_alloc, &key); + return 0; +} + +SEC("kprobe/" SYSCALL(sys_getegid)) +int stress_percpu_hmap_alloc(struct pt_regs *ctx) +{ + u32 key = bpf_get_current_pid_tgid(); + long init_val = 1; + long *value; + + 
bpf_map_update_elem(&percpu_hash_map_alloc, &key, &init_val, BPF_ANY); + value = bpf_map_lookup_elem(&percpu_hash_map_alloc, &key); + if (value) + bpf_map_delete_elem(&percpu_hash_map_alloc, &key); + return 0; +} + +SEC("kprobe/" SYSCALL(sys_connect)) +int stress_lru_hmap_alloc(struct pt_regs *ctx) +{ + struct pt_regs *real_regs = (struct pt_regs *)PT_REGS_PARM1_CORE(ctx); + char fmt[] = "Failed at stress_lru_hmap_alloc. ret:%dn"; + union { + u16 dst6[8]; + struct { + u16 magic0; + u16 magic1; + u16 tcase; + u16 unused16; + u32 unused32; + u32 key; + }; + } test_params; + struct sockaddr_in6 *in6; + u16 test_case; + int addrlen, ret; + long val = 1; + u32 key = 0; + + in6 = (struct sockaddr_in6 *)PT_REGS_PARM2_CORE(real_regs); + addrlen = (int)PT_REGS_PARM3_CORE(real_regs); + + if (addrlen != sizeof(*in6)) + return 0; + + ret = bpf_probe_read_user(test_params.dst6, sizeof(test_params.dst6), + &in6->sin6_addr); + if (ret) + goto done; + + if (test_params.magic0 != 0xdead || + test_params.magic1 != 0xbeef) + return 0; + + test_case = test_params.tcase; + if (test_case != 3) + key = bpf_get_prandom_u32(); + + if (test_case == 0) { + ret = bpf_map_update_elem(&lru_hash_map, &key, &val, BPF_ANY); + } else if (test_case == 1) { + ret = bpf_map_update_elem(&nocommon_lru_hash_map, &key, &val, + BPF_ANY); + } else if (test_case == 2) { + void *nolocal_lru_map; + int cpu = bpf_get_smp_processor_id(); + + nolocal_lru_map = bpf_map_lookup_elem(&array_of_lru_hashs, + &cpu); + if (!nolocal_lru_map) { + ret = -ENOENT; + goto done; + } + + ret = bpf_map_update_elem(nolocal_lru_map, &key, &val, + BPF_ANY); + } else if (test_case == 3) { + u32 i; + + key = test_params.key; + +#pragma clang loop unroll(full) + for (i = 0; i < 32; i++) { + bpf_map_lookup_elem(&lru_hash_lookup_map, &key); + key++; + } + } else { + ret = -EINVAL; + } + +done: + if (ret) + bpf_trace_printk(fmt, sizeof(fmt), ret); + + return 0; +} + +SEC("kprobe/" SYSCALL(sys_gettid)) +int stress_lpm_trie_map_alloc(struct 
pt_regs *ctx) +{ + union { + u32 b32[2]; + u8 b8[8]; + } key; + unsigned int i; + + key.b32[0] = 32; + key.b8[4] = 192; + key.b8[5] = 168; + key.b8[6] = 0; + key.b8[7] = 1; + +#pragma clang loop unroll(full) + for (i = 0; i < 32; ++i) + bpf_map_lookup_elem(&lpm_trie_map_alloc, &key); + + return 0; +} + +SEC("kprobe/" SYSCALL(sys_getpgid)) +int stress_hash_map_lookup(struct pt_regs *ctx) +{ + u32 key = 1, i; + long *value; + +#pragma clang loop unroll(full) + for (i = 0; i < 64; ++i) + value = bpf_map_lookup_elem(&hash_map, &key); + + return 0; +} + +SEC("kprobe/" SYSCALL(sys_getppid)) +int stress_array_map_lookup(struct pt_regs *ctx) +{ + u32 key = 1, i; + long *value; + +#pragma clang loop unroll(full) + for (i = 0; i < 64; ++i) + value = bpf_map_lookup_elem(&array_map, &key); + + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c new file mode 100644 index 000000000..8b13230b4 --- /dev/null +++ b/samples/bpf/map_perf_test_user.c @@ -0,0 +1,507 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2016 Facebook + */ +#define _GNU_SOURCE +#include <sched.h> +#include <stdio.h> +#include <sys/types.h> +#include <asm/unistd.h> +#include <unistd.h> +#include <assert.h> +#include <sys/wait.h> +#include <stdlib.h> +#include <signal.h> +#include <string.h> +#include <time.h> +#include <sys/resource.h> +#include <arpa/inet.h> +#include <errno.h> + +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +#define TEST_BIT(t) (1U << (t)) +#define MAX_NR_CPUS 1024 + +static __u64 time_get_ns(void) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec * 1000000000ull + ts.tv_nsec; +} + +enum test_type { + HASH_PREALLOC, + PERCPU_HASH_PREALLOC, + HASH_KMALLOC, + PERCPU_HASH_KMALLOC, + LRU_HASH_PREALLOC, + NOCOMMON_LRU_HASH_PREALLOC, + LPM_KMALLOC, + HASH_LOOKUP, + ARRAY_LOOKUP, + INNER_LRU_HASH_PREALLOC, + 
LRU_HASH_LOOKUP, + NR_TESTS, +}; + +const char *test_map_names[NR_TESTS] = { + [HASH_PREALLOC] = "hash_map", + [PERCPU_HASH_PREALLOC] = "percpu_hash_map", + [HASH_KMALLOC] = "hash_map_alloc", + [PERCPU_HASH_KMALLOC] = "percpu_hash_map_alloc", + [LRU_HASH_PREALLOC] = "lru_hash_map", + [NOCOMMON_LRU_HASH_PREALLOC] = "nocommon_lru_hash_map", + [LPM_KMALLOC] = "lpm_trie_map_alloc", + [HASH_LOOKUP] = "hash_map", + [ARRAY_LOOKUP] = "array_map", + [INNER_LRU_HASH_PREALLOC] = "inner_lru_hash_map", + [LRU_HASH_LOOKUP] = "lru_hash_lookup_map", +}; + +enum map_idx { + array_of_lru_hashs_idx, + hash_map_alloc_idx, + lru_hash_lookup_idx, + NR_IDXES, +}; + +static int map_fd[NR_IDXES]; + +static int test_flags = ~0; +static uint32_t num_map_entries; +static uint32_t inner_lru_hash_size; +static int lru_hash_lookup_test_entries = 32; +static uint32_t max_cnt = 1000000; + +static int check_test_flags(enum test_type t) +{ + return test_flags & TEST_BIT(t); +} + +static void test_hash_prealloc(int cpu) +{ + __u64 start_time; + int i; + + start_time = time_get_ns(); + for (i = 0; i < max_cnt; i++) + syscall(__NR_getuid); + printf("%d:hash_map_perf pre-alloc %lld events per sec\n", + cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time)); +} + +static int pre_test_lru_hash_lookup(int tasks) +{ + int fd = map_fd[lru_hash_lookup_idx]; + uint32_t key; + long val = 1; + int ret; + + if (num_map_entries > lru_hash_lookup_test_entries) + lru_hash_lookup_test_entries = num_map_entries; + + /* Populate the lru_hash_map for LRU_HASH_LOOKUP perf test. + * + * It is fine that the user requests for a map with + * num_map_entries < 32 and some of the later lru hash lookup + * may return not found. For LRU map, we are not interested + * in such small map performance. 
+ */ + for (key = 0; key < lru_hash_lookup_test_entries; key++) { + ret = bpf_map_update_elem(fd, &key, &val, BPF_NOEXIST); + if (ret) + return ret; + } + + return 0; +} + +static void do_test_lru(enum test_type test, int cpu) +{ + static int inner_lru_map_fds[MAX_NR_CPUS]; + + struct sockaddr_in6 in6 = { .sin6_family = AF_INET6 }; + const char *test_name; + __u64 start_time; + int i, ret; + + if (test == INNER_LRU_HASH_PREALLOC && cpu) { + /* If CPU is not 0, create inner_lru hash map and insert the fd + * value into the array_of_lru_hash map. In case of CPU 0, + * 'inner_lru_hash_map' was statically inserted on the map init + */ + int outer_fd = map_fd[array_of_lru_hashs_idx]; + unsigned int mycpu, mynode; + + assert(cpu < MAX_NR_CPUS); + + ret = syscall(__NR_getcpu, &mycpu, &mynode, NULL); + assert(!ret); + + inner_lru_map_fds[cpu] = + bpf_create_map_node(BPF_MAP_TYPE_LRU_HASH, + test_map_names[INNER_LRU_HASH_PREALLOC], + sizeof(uint32_t), + sizeof(long), + inner_lru_hash_size, 0, + mynode); + if (inner_lru_map_fds[cpu] == -1) { + printf("cannot create BPF_MAP_TYPE_LRU_HASH %s(%d)\n", + strerror(errno), errno); + exit(1); + } + + ret = bpf_map_update_elem(outer_fd, &cpu, + &inner_lru_map_fds[cpu], + BPF_ANY); + if (ret) { + printf("cannot update ARRAY_OF_LRU_HASHS with key:%u. 
%s(%d)\n", + cpu, strerror(errno), errno); + exit(1); + } + } + + in6.sin6_addr.s6_addr16[0] = 0xdead; + in6.sin6_addr.s6_addr16[1] = 0xbeef; + + if (test == LRU_HASH_PREALLOC) { + test_name = "lru_hash_map_perf"; + in6.sin6_addr.s6_addr16[2] = 0; + } else if (test == NOCOMMON_LRU_HASH_PREALLOC) { + test_name = "nocommon_lru_hash_map_perf"; + in6.sin6_addr.s6_addr16[2] = 1; + } else if (test == INNER_LRU_HASH_PREALLOC) { + test_name = "inner_lru_hash_map_perf"; + in6.sin6_addr.s6_addr16[2] = 2; + } else if (test == LRU_HASH_LOOKUP) { + test_name = "lru_hash_lookup_perf"; + in6.sin6_addr.s6_addr16[2] = 3; + in6.sin6_addr.s6_addr32[3] = 0; + } else { + assert(0); + } + + start_time = time_get_ns(); + for (i = 0; i < max_cnt; i++) { + ret = connect(-1, (const struct sockaddr *)&in6, sizeof(in6)); + assert(ret == -1 && errno == EBADF); + if (in6.sin6_addr.s6_addr32[3] < + lru_hash_lookup_test_entries - 32) + in6.sin6_addr.s6_addr32[3] += 32; + else + in6.sin6_addr.s6_addr32[3] = 0; + } + printf("%d:%s pre-alloc %lld events per sec\n", + cpu, test_name, + max_cnt * 1000000000ll / (time_get_ns() - start_time)); +} + +static void test_lru_hash_prealloc(int cpu) +{ + do_test_lru(LRU_HASH_PREALLOC, cpu); +} + +static void test_nocommon_lru_hash_prealloc(int cpu) +{ + do_test_lru(NOCOMMON_LRU_HASH_PREALLOC, cpu); +} + +static void test_inner_lru_hash_prealloc(int cpu) +{ + do_test_lru(INNER_LRU_HASH_PREALLOC, cpu); +} + +static void test_lru_hash_lookup(int cpu) +{ + do_test_lru(LRU_HASH_LOOKUP, cpu); +} + +static void test_percpu_hash_prealloc(int cpu) +{ + __u64 start_time; + int i; + + start_time = time_get_ns(); + for (i = 0; i < max_cnt; i++) + syscall(__NR_geteuid); + printf("%d:percpu_hash_map_perf pre-alloc %lld events per sec\n", + cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time)); +} + +static void test_hash_kmalloc(int cpu) +{ + __u64 start_time; + int i; + + start_time = time_get_ns(); + for (i = 0; i < max_cnt; i++) + syscall(__NR_getgid); + 
printf("%d:hash_map_perf kmalloc %lld events per sec\n", + cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time)); +} + +static void test_percpu_hash_kmalloc(int cpu) +{ + __u64 start_time; + int i; + + start_time = time_get_ns(); + for (i = 0; i < max_cnt; i++) + syscall(__NR_getegid); + printf("%d:percpu_hash_map_perf kmalloc %lld events per sec\n", + cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time)); +} + +static void test_lpm_kmalloc(int cpu) +{ + __u64 start_time; + int i; + + start_time = time_get_ns(); + for (i = 0; i < max_cnt; i++) + syscall(__NR_gettid); + printf("%d:lpm_perf kmalloc %lld events per sec\n", + cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time)); +} + +static void test_hash_lookup(int cpu) +{ + __u64 start_time; + int i; + + start_time = time_get_ns(); + for (i = 0; i < max_cnt; i++) + syscall(__NR_getpgid, 0); + printf("%d:hash_lookup %lld lookups per sec\n", + cpu, max_cnt * 1000000000ll * 64 / (time_get_ns() - start_time)); +} + +static void test_array_lookup(int cpu) +{ + __u64 start_time; + int i; + + start_time = time_get_ns(); + for (i = 0; i < max_cnt; i++) + syscall(__NR_getppid, 0); + printf("%d:array_lookup %lld lookups per sec\n", + cpu, max_cnt * 1000000000ll * 64 / (time_get_ns() - start_time)); +} + +typedef int (*pre_test_func)(int tasks); +const pre_test_func pre_test_funcs[] = { + [LRU_HASH_LOOKUP] = pre_test_lru_hash_lookup, +}; + +typedef void (*test_func)(int cpu); +const test_func test_funcs[] = { + [HASH_PREALLOC] = test_hash_prealloc, + [PERCPU_HASH_PREALLOC] = test_percpu_hash_prealloc, + [HASH_KMALLOC] = test_hash_kmalloc, + [PERCPU_HASH_KMALLOC] = test_percpu_hash_kmalloc, + [LRU_HASH_PREALLOC] = test_lru_hash_prealloc, + [NOCOMMON_LRU_HASH_PREALLOC] = test_nocommon_lru_hash_prealloc, + [LPM_KMALLOC] = test_lpm_kmalloc, + [HASH_LOOKUP] = test_hash_lookup, + [ARRAY_LOOKUP] = test_array_lookup, + [INNER_LRU_HASH_PREALLOC] = test_inner_lru_hash_prealloc, + [LRU_HASH_LOOKUP] = 
test_lru_hash_lookup, +}; + +static int pre_test(int tasks) +{ + int i; + + for (i = 0; i < NR_TESTS; i++) { + if (pre_test_funcs[i] && check_test_flags(i)) { + int ret = pre_test_funcs[i](tasks); + + if (ret) + return ret; + } + } + + return 0; +} + +static void loop(int cpu) +{ + cpu_set_t cpuset; + int i; + + CPU_ZERO(&cpuset); + CPU_SET(cpu, &cpuset); + sched_setaffinity(0, sizeof(cpuset), &cpuset); + + for (i = 0; i < NR_TESTS; i++) { + if (check_test_flags(i)) + test_funcs[i](cpu); + } +} + +static void run_perf_test(int tasks) +{ + pid_t pid[tasks]; + int i; + + assert(!pre_test(tasks)); + + for (i = 0; i < tasks; i++) { + pid[i] = fork(); + if (pid[i] == 0) { + loop(i); + exit(0); + } else if (pid[i] == -1) { + printf("couldn't spawn #%d process\n", i); + exit(1); + } + } + for (i = 0; i < tasks; i++) { + int status; + + assert(waitpid(pid[i], &status, 0) == pid[i]); + assert(status == 0); + } +} + +static void fill_lpm_trie(void) +{ + struct bpf_lpm_trie_key *key; + unsigned long value = 0; + unsigned int i; + int r; + + key = alloca(sizeof(*key) + 4); + key->prefixlen = 32; + + for (i = 0; i < 512; ++i) { + key->prefixlen = rand() % 33; + key->data[0] = rand() & 0xff; + key->data[1] = rand() & 0xff; + key->data[2] = rand() & 0xff; + key->data[3] = rand() & 0xff; + r = bpf_map_update_elem(map_fd[hash_map_alloc_idx], + key, &value, 0); + assert(!r); + } + + key->prefixlen = 32; + key->data[0] = 192; + key->data[1] = 168; + key->data[2] = 0; + key->data[3] = 1; + value = 128; + + r = bpf_map_update_elem(map_fd[hash_map_alloc_idx], key, &value, 0); + assert(!r); +} + +static void fixup_map(struct bpf_object *obj) +{ + struct bpf_map *map; + int i; + + bpf_object__for_each_map(map, obj) { + const char *name = bpf_map__name(map); + + /* Only change the max_entries for the enabled test(s) */ + for (i = 0; i < NR_TESTS; i++) { + if (!strcmp(test_map_names[i], name) && + (check_test_flags(i))) { + bpf_map__resize(map, num_map_entries); + continue; + } + } + } + + 
inner_lru_hash_size = num_map_entries; +} + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + int nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + struct bpf_link *links[8]; + struct bpf_program *prog; + struct bpf_object *obj; + struct bpf_map *map; + char filename[256]; + int i = 0; + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + + if (argc > 1) + test_flags = atoi(argv[1]) ? : test_flags; + + if (argc > 2) + nr_cpus = atoi(argv[2]) ? : nr_cpus; + + if (argc > 3) + num_map_entries = atoi(argv[3]); + + if (argc > 4) + max_cnt = atoi(argv[4]); + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + map = bpf_object__find_map_by_name(obj, "inner_lru_hash_map"); + if (libbpf_get_error(map)) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + inner_lru_hash_size = bpf_map__max_entries(map); + if (!inner_lru_hash_size) { + fprintf(stderr, "ERROR: failed to get map attribute\n"); + goto cleanup; + } + + /* resize BPF map prior to loading */ + if (num_map_entries > 0) + fixup_map(obj); + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "array_of_lru_hashs"); + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "hash_map_alloc"); + map_fd[2] = bpf_object__find_map_fd_by_name(obj, "lru_hash_lookup_map"); + if (map_fd[0] < 0 || map_fd[1] < 0 || map_fd[2] < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + links[i] = bpf_program__attach(prog); + if (libbpf_get_error(links[i])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[i] = NULL; + goto 
cleanup; + } + i++; + } + + fill_lpm_trie(); + + run_perf_test(nr_cpus); + +cleanup: + for (i--; i >= 0; i--) + bpf_link__destroy(links[i]); + + bpf_object__close(obj); + return 0; +} diff --git a/samples/bpf/offwaketime_kern.c b/samples/bpf/offwaketime_kern.c new file mode 100644 index 000000000..14b792915 --- /dev/null +++ b/samples/bpf/offwaketime_kern.c @@ -0,0 +1,157 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <uapi/linux/bpf.h> +#include <uapi/linux/ptrace.h> +#include <uapi/linux/perf_event.h> +#include <linux/version.h> +#include <linux/sched.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +#define _(P) \ + ({ \ + typeof(P) val; \ + bpf_probe_read_kernel(&val, sizeof(val), &(P)); \ + val; \ + }) + +#define MINBLOCK_US 1 + +struct key_t { + char waker[TASK_COMM_LEN]; + char target[TASK_COMM_LEN]; + u32 wret; + u32 tret; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct key_t); + __type(value, u64); + __uint(max_entries, 10000); +} counts SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, u64); + __uint(max_entries, 10000); +} start SEC(".maps"); + +struct wokeby_t { + char name[TASK_COMM_LEN]; + u32 ret; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, struct wokeby_t); + __uint(max_entries, 10000); +} wokeby SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_STACK_TRACE); + __uint(key_size, sizeof(u32)); + __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64)); + __uint(max_entries, 10000); +} stackmap SEC(".maps"); + +#define STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP) + +SEC("kprobe/try_to_wake_up") +int waker(struct pt_regs *ctx) +{ + struct task_struct *p = (void *) PT_REGS_PARM1(ctx); + struct wokeby_t woke; + u32 pid; + + pid = 
_(p->pid); + + bpf_get_current_comm(&woke.name, sizeof(woke.name)); + woke.ret = bpf_get_stackid(ctx, &stackmap, STACKID_FLAGS); + + bpf_map_update_elem(&wokeby, &pid, &woke, BPF_ANY); + return 0; +} + +static inline int update_counts(void *ctx, u32 pid, u64 delta) +{ + struct wokeby_t *woke; + u64 zero = 0, *val; + struct key_t key; + + __builtin_memset(&key.waker, 0, sizeof(key.waker)); + bpf_get_current_comm(&key.target, sizeof(key.target)); + key.tret = bpf_get_stackid(ctx, &stackmap, STACKID_FLAGS); + key.wret = 0; + + woke = bpf_map_lookup_elem(&wokeby, &pid); + if (woke) { + key.wret = woke->ret; + __builtin_memcpy(&key.waker, woke->name, sizeof(key.waker)); + bpf_map_delete_elem(&wokeby, &pid); + } + + val = bpf_map_lookup_elem(&counts, &key); + if (!val) { + bpf_map_update_elem(&counts, &key, &zero, BPF_NOEXIST); + val = bpf_map_lookup_elem(&counts, &key); + if (!val) + return 0; + } + (*val) += delta; + return 0; +} + +#if 1 +/* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */ +struct sched_switch_args { + unsigned long long pad; + char prev_comm[16]; + int prev_pid; + int prev_prio; + long long prev_state; + char next_comm[16]; + int next_pid; + int next_prio; +}; +SEC("tracepoint/sched/sched_switch") +int oncpu(struct sched_switch_args *ctx) +{ + /* record previous thread sleep time */ + u32 pid = ctx->prev_pid; +#else +SEC("kprobe/finish_task_switch") +int oncpu(struct pt_regs *ctx) +{ + struct task_struct *p = (void *) PT_REGS_PARM1(ctx); + /* record previous thread sleep time */ + u32 pid = _(p->pid); +#endif + u64 delta, ts, *tsp; + + ts = bpf_ktime_get_ns(); + bpf_map_update_elem(&start, &pid, &ts, BPF_ANY); + + /* calculate current thread's delta time */ + pid = bpf_get_current_pid_tgid(); + tsp = bpf_map_lookup_elem(&start, &pid); + if (!tsp) + /* missed start or filtered */ + return 0; + + delta = bpf_ktime_get_ns() - *tsp; + bpf_map_delete_elem(&start, &pid); + delta = delta / 1000; + if (delta < MINBLOCK_US) + return 0; 
+ + return update_counts(ctx, pid, delta); +} +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/offwaketime_user.c b/samples/bpf/offwaketime_user.c new file mode 100644 index 000000000..5734cfdaa --- /dev/null +++ b/samples/bpf/offwaketime_user.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2016 Facebook + */ +#include <stdio.h> +#include <unistd.h> +#include <stdlib.h> +#include <signal.h> +#include <linux/perf_event.h> +#include <errno.h> +#include <stdbool.h> +#include <sys/resource.h> +#include <bpf/libbpf.h> +#include <bpf/bpf.h> +#include "trace_helpers.h" + +#define PRINT_RAW_ADDR 0 + +/* counts, stackmap */ +static int map_fd[2]; + +static void print_ksym(__u64 addr) +{ + struct ksym *sym; + + if (!addr) + return; + sym = ksym_search(addr); + if (!sym) { + printf("ksym not found. Is kallsyms loaded?\n"); + return; + } + + if (PRINT_RAW_ADDR) + printf("%s/%llx;", sym->name, addr); + else + printf("%s;", sym->name); +} + +#define TASK_COMM_LEN 16 + +struct key_t { + char waker[TASK_COMM_LEN]; + char target[TASK_COMM_LEN]; + __u32 wret; + __u32 tret; +}; + +static void print_stack(struct key_t *key, __u64 count) +{ + __u64 ip[PERF_MAX_STACK_DEPTH] = {}; + static bool warned; + int i; + + printf("%s;", key->target); + if (bpf_map_lookup_elem(map_fd[1], &key->tret, ip) != 0) { + printf("---;"); + } else { + for (i = PERF_MAX_STACK_DEPTH - 1; i >= 0; i--) + print_ksym(ip[i]); + } + printf("-;"); + if (bpf_map_lookup_elem(map_fd[1], &key->wret, ip) != 0) { + printf("---;"); + } else { + for (i = 0; i < PERF_MAX_STACK_DEPTH; i++) + print_ksym(ip[i]); + } + printf(";%s %lld\n", key->waker, count); + + if ((key->tret == -EEXIST || key->wret == -EEXIST) && !warned) { + printf("stackmap collisions seen. 
Consider increasing size\n"); + warned = true; + } else if (((int)(key->tret) < 0 || (int)(key->wret) < 0)) { + printf("err stackid %d %d\n", key->tret, key->wret); + } +} + +static void print_stacks(int fd) +{ + struct key_t key = {}, next_key; + __u64 value; + + while (bpf_map_get_next_key(fd, &key, &next_key) == 0) { + bpf_map_lookup_elem(fd, &next_key, &value); + print_stack(&next_key, value); + key = next_key; + } +} + +static void int_exit(int sig) +{ + print_stacks(map_fd[0]); + exit(0); +} + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_object *obj = NULL; + struct bpf_link *links[2]; + struct bpf_program *prog; + int delay = 1, i = 0; + char filename[256]; + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + + if (load_kallsyms()) { + printf("failed to process /proc/kallsyms\n"); + return 2; + } + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + obj = NULL; + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "counts"); + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "stackmap"); + if (map_fd[0] < 0 || map_fd[1] < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + bpf_object__for_each_program(prog, obj) { + links[i] = bpf_program__attach(prog); + if (libbpf_get_error(links[i])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[i] = NULL; + goto cleanup; + } + i++; + } + + if (argc > 1) + delay = atoi(argv[1]); + sleep(delay); + print_stacks(map_fd[0]); + +cleanup: + for (i--; i >= 0; i--) + bpf_link__destroy(links[i]); + + 
bpf_object__close(obj); + return 0; +} diff --git a/samples/bpf/parse_ldabs.c b/samples/bpf/parse_ldabs.c new file mode 100644 index 000000000..c6f65f90a --- /dev/null +++ b/samples/bpf/parse_ldabs.c @@ -0,0 +1,43 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#define KBUILD_MODNAME "foo" +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/in.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <uapi/linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include "bpf_legacy.h" + +#define DEFAULT_PKTGEN_UDP_PORT 9 +#define IP_MF 0x2000 +#define IP_OFFSET 0x1FFF + +static inline int ip_is_fragment(struct __sk_buff *ctx, __u64 nhoff) +{ + return load_half(ctx, nhoff + offsetof(struct iphdr, frag_off)) + & (IP_MF | IP_OFFSET); +} + +SEC("ldabs") +int handle_ingress(struct __sk_buff *skb) +{ + __u64 troff = ETH_HLEN + sizeof(struct iphdr); + + if (load_half(skb, offsetof(struct ethhdr, h_proto)) != ETH_P_IP) + return 0; + if (load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)) != IPPROTO_UDP || + load_byte(skb, ETH_HLEN) != 0x45) + return 0; + if (ip_is_fragment(skb, ETH_HLEN)) + return 0; + if (load_half(skb, troff + offsetof(struct udphdr, dest)) == DEFAULT_PKTGEN_UDP_PORT) + return TC_ACT_SHOT; + return 0; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/parse_simple.c b/samples/bpf/parse_simple.c new file mode 100644 index 000000000..4a486cb1e --- /dev/null +++ b/samples/bpf/parse_simple.c @@ -0,0 +1,49 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */
+#define KBUILD_MODNAME "foo"
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <uapi/linux/bpf.h>
+#include <net/ip.h>
+#include <bpf/bpf_helpers.h>
+
+#define DEFAULT_PKTGEN_UDP_PORT 9
+
+/* Same members as 'struct ethhdr', declared without __packed. */
+struct eth_hdr {
+	unsigned char   h_dest[ETH_ALEN];
+	unsigned char   h_source[ETH_ALEN];
+	unsigned short  h_proto;
+};
+
+/*
+ * Return TC_ACT_SHOT for an unfragmented IPv4/UDP packet with a 5-word IP
+ * header whose destination port is DEFAULT_PKTGEN_UDP_PORT.  Everything
+ * else, including packets too short to hold the three headers, returns 0.
+ */
+SEC("simple")
+int handle_ingress(struct __sk_buff *skb)
+{
+	void *pkt = (void *)(long)skb->data;
+	void *pkt_end = (void *)(long)skb->data_end;
+	struct eth_hdr *l2 = pkt;
+	struct iphdr *l3 = pkt + sizeof(*l2);
+	struct udphdr *l4 = pkt + sizeof(*l2) + sizeof(*l3);
+
+	/* one bounds check covers all three fixed-size headers */
+	if (pkt + sizeof(*l2) + sizeof(*l3) + sizeof(*l4) > pkt_end)
+		return 0;
+
+	/* evaluated left to right: EtherType, then L4 protocol/IHL, then frag */
+	if (l2->h_proto != htons(ETH_P_IP) ||
+	    l3->protocol != IPPROTO_UDP || l3->ihl != 5 ||
+	    ip_is_fragment(l3))
+		return 0;
+
+	return l4->dest == htons(DEFAULT_PKTGEN_UDP_PORT) ? TC_ACT_SHOT : 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/parse_varlen.c b/samples/bpf/parse_varlen.c
new file mode 100644
index 000000000..d8623846e
--- /dev/null
+++ b/samples/bpf/parse_varlen.c
@@ -0,0 +1,150 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#define KBUILD_MODNAME "foo"
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <uapi/linux/bpf.h>
+#include <net/ip.h>
+#include <bpf/bpf_helpers.h>
+
+#define DEFAULT_PKTGEN_UDP_PORT 9
+#define DEBUG 0
+
+/*
+ * Check the TCP header that starts tp_off bytes into the packet.
+ *
+ * Returns TC_ACT_SHOT when the source or destination port is 80, and 0
+ * otherwise, including when the header does not fit before data_end.
+ */
+static int tcp(void *data, uint64_t tp_off, void *data_end)
+{
+	struct tcphdr *tcp = data + tp_off;
+
+	if (tcp + 1 > data_end)
+		return 0;
+	if (tcp->dest == htons(80) || tcp->source == htons(80))
+		return TC_ACT_SHOT;
+	return 0;
+}
+
+/*
+ * Check the UDP header that starts tp_off bytes into the packet.
+ *
+ * Returns TC_ACT_SHOT when the source or destination port is
+ * DEFAULT_PKTGEN_UDP_PORT, and 0 otherwise, including when the header does
+ * not fit before data_end.  With DEBUG set, a match is also reported
+ * through bpf_trace_printk().
+ */
+static int udp(void *data, uint64_t tp_off, void *data_end)
+{
+	struct udphdr *udp = data + tp_off;
+
+	if (udp + 1 > data_end)
+		return 0;
+	if (udp->dest == htons(DEFAULT_PKTGEN_UDP_PORT) ||
+	    udp->source == htons(DEFAULT_PKTGEN_UDP_PORT)) {
+		if (DEBUG) {
+			char fmt[] = "udp port 9 indeed\n";
+
+			bpf_trace_printk(fmt, sizeof(fmt));
+		}
+		return TC_ACT_SHOT;
+	}
+	return 0;
+}
+
+/*
+ * Parse the IPv4 header at nh_off and pass the transport header on to
+ * tcp() or udp().
+ *
+ * A packet whose outer header is a fragment is ignored.  When the payload
+ * is IPIP, the inner IPv4 header is bounds-checked and its length and
+ * protocol are used to locate the transport header.  Returns the
+ * tcp()/udp() verdict, or 0 for truncated packets and other protocols.
+ */
+static int parse_ipv4(void *data, uint64_t nh_off, void *data_end)
+{
+	struct iphdr *iph;
+	uint64_t ihl_len;
+
+	iph = data + nh_off;
+	if (iph + 1 > data_end)
+		return 0;
+
+	if (ip_is_fragment(iph))
+		return 0;
+	ihl_len = iph->ihl * 4;
+
+	if (iph->protocol == IPPROTO_IPIP) {
+		iph = data + nh_off + ihl_len;
+		if (iph + 1 > data_end)
+			return 0;
+		ihl_len += iph->ihl * 4;
+	}
+
+	if (iph->protocol == IPPROTO_TCP)
+		return tcp(data, nh_off + ihl_len, data_end);
+	else if (iph->protocol == IPPROTO_UDP)
+		return udp(data, nh_off + ihl_len, data_end);
+	return 0;
+}
+
+/*
+ * Parse the IPv6 header at nh_off and pass the transport header on to
+ * tcp() or udp().
+ *
+ * One level of encapsulation is unwrapped: an inner IPv4 header when
+ * nexthdr is IPPROTO_IPIP, or an inner IPv6 header when it is
+ * IPPROTO_IPV6.  Returns the tcp()/udp() verdict, or 0 for truncated
+ * packets and other next-header values.
+ */
+static int parse_ipv6(void *data, uint64_t nh_off, void *data_end)
+{
+	struct ipv6hdr *ip6h;
+	struct iphdr *iph;
+	uint64_t ihl_len = sizeof(struct ipv6hdr);
+	uint64_t nexthdr;
+
+	ip6h = data + nh_off;
+	if (ip6h + 1 > data_end)
+		return 0;
+
+	nexthdr = ip6h->nexthdr;
+
+	if (nexthdr == IPPROTO_IPIP) {
+		iph = data + nh_off + ihl_len;
+		if (iph + 1 > data_end)
+			return 0;
+		ihl_len += iph->ihl * 4;
+		nexthdr = iph->protocol;
+	} else if (nexthdr == IPPROTO_IPV6) {
+		ip6h = data + nh_off + ihl_len;
+		if (ip6h + 1 > data_end)
+			return 0;
+		ihl_len += sizeof(struct ipv6hdr);
+		nexthdr = ip6h->nexthdr;
+	}
+
+	if (nexthdr == IPPROTO_TCP)
+		return tcp(data, nh_off + ihl_len, data_end);
+	else if (nexthdr == IPPROTO_UDP)
+		return udp(data, nh_off + ihl_len, data_end);
+	return 0;
+}
+
+/*
+ * Entry point: skip the Ethernet header and up to two VLAN tags, then
+ * dispatch on the EtherType to parse_ipv4() or parse_ipv6().
+ *
+ * Returns the parser's verdict (TC_ACT_SHOT or 0), or 0 for truncated
+ * frames and EtherTypes other than IPv4/IPv6.
+ */
+SEC("varlen")
+int handle_ingress(struct __sk_buff *skb)
+{
+	void *data = (void *)(long)skb->data;
+	struct ethhdr *eth = data;
+	void *data_end = (void *)(long)skb->data_end;
+	uint64_t h_proto, nh_off;
+
+	nh_off = sizeof(*eth);
+	if (data + nh_off > data_end)
+		return 0;
+
+	h_proto = eth->h_proto;
+
+	/*
+	 * h_proto is read straight from the packet, i.e. in network byte
+	 * order, so every EtherType comparison below goes through htons().
+	 * The VLAN check is written out twice to cover two stacked tags.
+	 */
+	if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
+		struct vlan_hdr *vhdr;
+
+		vhdr = data + nh_off;
+		nh_off += sizeof(struct vlan_hdr);
+		if (data + nh_off > data_end)
+			return 0;
+		h_proto = vhdr->h_vlan_encapsulated_proto;
+	}
+	if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
+		struct vlan_hdr *vhdr;
+
+		vhdr = data + nh_off;
+		nh_off += sizeof(struct vlan_hdr);
+		if (data + nh_off > data_end)
+			return 0;
+		h_proto = vhdr->h_vlan_encapsulated_proto;
+	}
+	if (h_proto == htons(ETH_P_IP))
+		return parse_ipv4(data, nh_off, data_end);
+	else if (h_proto == htons(ETH_P_IPV6))
+		return parse_ipv6(data, nh_off, data_end);
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/run_cookie_uid_helper_example.sh b/samples/bpf/run_cookie_uid_helper_example.sh
new file mode 100755
index 000000000..fc6bc0451
--- /dev/null
+++ b/samples/bpf/run_cookie_uid_helper_example.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+local_dir="$(pwd)"
+root_dir=$local_dir/../..
+mnt_dir=$(mktemp -d --tmp)
+
+on_exit() {
+	iptables -D OUTPUT -m bpf --object-pinned ${mnt_dir}/bpf_prog -j ACCEPT
+	umount ${mnt_dir}
+	rm -r ${mnt_dir}
+}
+
+trap on_exit EXIT
+mount -t bpf bpf ${mnt_dir}
+./per_socket_stats_example ${mnt_dir}/bpf_prog $1
diff --git a/samples/bpf/sampleip_kern.c b/samples/bpf/sampleip_kern.c
new file mode 100644
index 000000000..f24806ac2
--- /dev/null
+++ b/samples/bpf/sampleip_kern.c
@@ -0,0 +1,39 @@
+/* Copyright 2016 Netflix, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/version.h>
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/bpf_perf_event.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#define MAX_IPS		8192
+
+/* sampled instruction pointer -> number of times it was seen */
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, u64);
+	__type(value, u32);
+	__uint(max_entries, MAX_IPS);
+} ip_map SEC(".maps");
+
+/*
+ * Count one sample for the instruction pointer found in ctx->regs.
+ *
+ * An existing entry is incremented; otherwise a new entry is inserted with
+ * a count of 1.  Always returns 0.
+ */
+SEC("perf_event")
+int do_sample(struct bpf_perf_event_data *ctx)
+{
+	u64 ip;
+	u32 *value, init_val = 1;
+
+	ip = PT_REGS_IP(&ctx->regs);
+	value = bpf_map_lookup_elem(&ip_map, &ip);
+	if (value)
+		/*
+		 * The map is shared by every CPU this program is attached
+		 * to, so the increment has to be atomic or concurrent
+		 * samples of the same address can be lost.
+		 */
+		__sync_fetch_and_add(value, 1);
+	else
+		/* E2BIG not tested for this example only */
+		bpf_map_update_elem(&ip_map, &ip, &init_val, BPF_NOEXIST);
+
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/sampleip_user.c b/samples/bpf/sampleip_user.c
new file mode 100644
index 000000000..921c505bb
--- /dev/null
+++ b/samples/bpf/sampleip_user.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * sampleip: sample instruction pointer and frequency count in a BPF map.
+ *
+ * Copyright 2016 Netflix, Inc.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <signal.h>
+#include <string.h>
+#include <linux/perf_event.h>
+#include <linux/ptrace.h>
+#include <linux/bpf.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include "perf-sys.h"
+#include "trace_helpers.h"
+
+#define DEFAULT_FREQ	99
+#define DEFAULT_SECS	5
+#define MAX_IPS		8192
+/* print_ip_map() resolves addresses above this through ksym_search() */
+#define PAGE_OFFSET	0xffff880000000000
+
+static int map_fd;
+static int nr_cpus;
+
+/* Print the command-line synopsis to stdout. */
+static void usage(void)
+{
+	printf("USAGE: sampleip [-F freq] [duration]\n");
+	printf(" -F freq # sample frequency (Hertz), default 99\n");
+	printf(" duration # sampling duration (seconds), default 5\n");
+}
+
+/*
+ * Open one CPU-clock software perf event per CPU (0 .. nr_cpus - 1) and
+ * attach prog to each, storing the resulting link in links[cpu].
+ *
+ * Returns 0 on success and 1 on the first failure.  Links created before
+ * the failure stay in links[]; the caller releases them with
+ * sampling_end().
+ */
+static int sampling_start(int freq, struct bpf_program *prog,
+			  struct bpf_link *links[])
+{
+	int i, pmu_fd;
+
+	struct perf_event_attr pe_sample_attr = {
+		.type = PERF_TYPE_SOFTWARE,
+		.freq = 1,
+		.sample_period = freq,
+		.config = PERF_COUNT_SW_CPU_CLOCK,
+		.inherit = 1,
+	};
+
+	for (i = 0; i < nr_cpus; i++) {
+		pmu_fd = sys_perf_event_open(&pe_sample_attr, -1 /* pid */, i,
+					    -1 /* group_fd */, 0 /* flags */);
+		if (pmu_fd < 0) {
+			fprintf(stderr, "ERROR: Initializing perf sampling\n");
+			return 1;
+		}
+		links[i] = bpf_program__attach_perf_event(prog, pmu_fd);
+		if (libbpf_get_error(links[i])) {
+			fprintf(stderr, "ERROR: Attach perf event\n");
+			links[i] = NULL;
+			close(pmu_fd);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Destroy the links stored by sampling_start().
+ *
+ * links may be NULL: main() reaches its cleanup label with a NULL array
+ * when the allocation of links itself failed.
+ */
+static void sampling_end(struct bpf_link *links[])
+{
+	int i;
+
+	if (!links)
+		return;
+
+	for (i = 0; i < nr_cpus; i++)
+		bpf_link__destroy(links[i]);
+}
+
+struct ipcount {
+	__u64 ip;
+	__u32 count;
+};
+
+/* used for sorting */
+struct ipcount counts[MAX_IPS];
+
+/*
+ * qsort() comparator: orders struct ipcount entries by ascending count.
+ *
+ * The counts are compared rather than subtracted, because the difference
+ * of two __u32 values does not always fit the int return value.
+ */
+static int count_cmp(const void *p1, const void *p2)
+{
+	__u32 a = ((const struct ipcount *)p1)->count;
+	__u32 b = ((const struct ipcount *)p2)->count;
+
+	return (a > b) - (a < b);
+}
+
+/*
+ * Read every entry of the map behind fd into counts[], sort by count and
+ * print one "address symbol count" row per entry.
+ */
+static void print_ip_map(int fd)
+{
+	struct ksym *sym;
+	__u64 key, next_key;
+	__u32 value;
+	int i, max;
+
+	printf("%-19s %-32s %s\n", "ADDR", "KSYM", "COUNT");
+
+	/* fetch IPs and counts; never write past the end of counts[] */
+	key = 0, i = 0;
+	while (i < MAX_IPS &&
+	       bpf_map_get_next_key(fd, &key, &next_key) == 0) {
+		key = next_key;
+		/* a failed lookup leaves value unset, so skip the entry */
+		if (bpf_map_lookup_elem(fd, &next_key, &value))
+			continue;
+		counts[i].ip = next_key;
+		counts[i++].count = value;
+	}
+	max = i;
+
+	/* sort and print */
+	qsort(counts, max, sizeof(struct ipcount), count_cmp);
+	for (i = 0; i < max; i++) {
+		if (counts[i].ip > PAGE_OFFSET) {
+			sym = ksym_search(counts[i].ip);
+			if (!sym) {
+				printf("ksym not found. Is kallsyms loaded?\n");
+				continue;
+			}
+
+			printf("0x%-17llx %-32s %u\n", counts[i].ip, sym->name,
+			       counts[i].count);
+		} else {
+			printf("0x%-17llx %-32s %u\n", counts[i].ip, "(user)",
+			       counts[i].count);
+		}
+	}
+
+	if (max == MAX_IPS) {
+		printf("WARNING: IP hash was full (max %d entries); ", max);
+		printf("may have dropped samples\n");
+	}
+}
+
+/* SIGINT/SIGTERM handler: print what was collected so far and exit. */
+static void int_exit(int sig)
+{
+	printf("\n");
+	print_ip_map(map_fd);
+	exit(0);
+}
+
+/*
+ * Parse "[-F freq] [duration]", load "<argv[0]>_kern.o", sample on every
+ * CPU for the requested time and print the collected counts.
+ *
+ * Returns 0 on success (and for -h or an unknown option), 1 on a usage or
+ * setup error, and 2 when the kernel symbols cannot be loaded.
+ */
+int main(int argc, char **argv)
+{
+	int opt, freq = DEFAULT_FREQ, secs = DEFAULT_SECS, error = 1;
+	struct bpf_object *obj = NULL;
+	struct bpf_program *prog;
+	struct bpf_link **links;
+	char filename[256];
+
+	/* process arguments */
+	while ((opt = getopt(argc, argv, "F:h")) != -1) {
+		switch (opt) {
+		case 'F':
+			freq = atoi(optarg);
+			break;
+		case 'h':
+		default:
+			usage();
+			return 0;
+		}
+	}
+	if (argc - optind == 1)
+		secs = atoi(argv[optind]);
+	if (freq == 0 || secs == 0) {
+		usage();
+		return 1;
+	}
+
+	/* initialize kernel symbol translation */
+	if (load_kallsyms()) {
+		fprintf(stderr, "ERROR: loading /proc/kallsyms\n");
+		return 2;
+	}
+
+	/* create perf FDs for each CPU */
+	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	links = calloc(nr_cpus, sizeof(struct bpf_link *));
+	if (!links) {
+		fprintf(stderr, "ERROR: malloc of links\n");
+		goto cleanup;
+	}
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	obj = bpf_object__open_file(filename, NULL);
+	if (libbpf_get_error(obj)) {
+		fprintf(stderr, "ERROR: opening BPF object file failed\n");
+		obj = NULL;
+		goto cleanup;
+	}
+
+	prog = bpf_object__find_program_by_name(obj, "do_sample");
+	if (!prog) {
+		fprintf(stderr, "ERROR: finding a prog in obj file failed\n");
+		goto cleanup;
+	}
+
+	/* load BPF program */
+	if (bpf_object__load(obj)) {
+		fprintf(stderr, "ERROR: loading BPF object file failed\n");
+		goto cleanup;
+	}
+
+	map_fd = bpf_object__find_map_fd_by_name(obj, "ip_map");
+	if (map_fd < 0) {
+		fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+		goto cleanup;
+	}
+
+	signal(SIGINT, int_exit);
+	signal(SIGTERM, int_exit);
+
+	/* do sampling */
+	printf("Sampling at %d Hertz for %d seconds. Ctrl-C also ends.\n",
+	       freq, secs);
+	if (sampling_start(freq, prog, links) != 0)
+		goto cleanup;
+
+	sleep(secs);
+	error = 0;
+
+cleanup:
+	/* links is NULL here when its allocation failed */
+	sampling_end(links);
+	/* output sample counts */
+	if (!error)
+		print_ip_map(map_fd);
+
+	free(links);
+	bpf_object__close(obj);
+	return error;
+}
diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c
new file mode 100644
index 000000000..00aae1d33
--- /dev/null
+++ b/samples/bpf/sock_example.c
@@ -0,0 +1,106 @@
+/* eBPF example program:
+ * - creates arraymap in kernel with key 4 bytes and value 8 bytes
+ *
+ * - loads eBPF program:
+ *   r0 = skb->data[ETH_HLEN + offsetof(struct iphdr, protocol)];
+ *   *(u32*)(fp - 4) = r0;
+ *   // assuming packet is IPv4, lookup ip->proto in a map
+ *   value = bpf_map_lookup_elem(map_fd, fp - 4);
+ *   if (value)
+ *        (*(u64*)value) += 1;
+ *
+ * - attaches this program to loopback interface "lo" raw socket
+ *
+ * - every second user space reads map[tcp], map[udp], map[icmp] to see
+ *   how many packets of given protocol were seen on "lo"
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <assert.h>
+#include <linux/bpf.h>
+#include <string.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <stddef.h>
+#include <bpf/bpf.h>
+#include "bpf_insn.h"
+#include "sock_example.h"
+
+char
bpf_log_buf[BPF_LOG_BUF_SIZE]; + +static int test_sock(void) +{ + int sock = -1, map_fd, prog_fd, i, key; + long long value = 0, tcp_cnt, udp_cnt, icmp_cnt; + + map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value), + 256, 0); + if (map_fd < 0) { + printf("failed to create map '%s'\n", strerror(errno)); + goto cleanup; + } + + struct bpf_insn prog[] = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_LD_ABS(BPF_B, ETH_HLEN + offsetof(struct iphdr, protocol) /* R0 = ip->proto */), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ + BPF_LD_MAP_FD(BPF_REG_1, map_fd), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */ + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */ + BPF_EXIT_INSN(), + }; + size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); + + prog_fd = bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, prog, insns_cnt, + "GPL", 0, bpf_log_buf, BPF_LOG_BUF_SIZE); + if (prog_fd < 0) { + printf("failed to load prog '%s'\n", strerror(errno)); + goto cleanup; + } + + sock = open_raw_sock("lo"); + + if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, + sizeof(prog_fd)) < 0) { + printf("setsockopt %s\n", strerror(errno)); + goto cleanup; + } + + for (i = 0; i < 10; i++) { + key = IPPROTO_TCP; + assert(bpf_map_lookup_elem(map_fd, &key, &tcp_cnt) == 0); + + key = IPPROTO_UDP; + assert(bpf_map_lookup_elem(map_fd, &key, &udp_cnt) == 0); + + key = IPPROTO_ICMP; + assert(bpf_map_lookup_elem(map_fd, &key, &icmp_cnt) == 0); + + printf("TCP %lld UDP %lld ICMP %lld packets\n", + tcp_cnt, udp_cnt, icmp_cnt); + sleep(1); + } + +cleanup: + /* maps, programs, raw sockets will auto cleanup on process exit */ + return 0; +} + +int main(void) +{ + FILE 
*f; + + f = popen("ping -4 -c5 localhost", "r"); + (void)f; + + return test_sock(); +} diff --git a/samples/bpf/sock_example.h b/samples/bpf/sock_example.h new file mode 100644 index 000000000..a27d7579b --- /dev/null +++ b/samples/bpf/sock_example.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <stdlib.h> +#include <stdio.h> +#include <linux/unistd.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> +#include <linux/if_ether.h> +#include <net/if.h> +#include <linux/if_packet.h> +#include <arpa/inet.h> + +static inline int open_raw_sock(const char *name) +{ + struct sockaddr_ll sll; + int sock; + + sock = socket(PF_PACKET, SOCK_RAW | SOCK_NONBLOCK | SOCK_CLOEXEC, htons(ETH_P_ALL)); + if (sock < 0) { + printf("cannot create raw socket\n"); + return -1; + } + + memset(&sll, 0, sizeof(sll)); + sll.sll_family = AF_PACKET; + sll.sll_ifindex = if_nametoindex(name); + sll.sll_protocol = htons(ETH_P_ALL); + if (bind(sock, (struct sockaddr *)&sll, sizeof(sll)) < 0) { + printf("bind to %s: %s\n", name, strerror(errno)); + close(sock); + return -1; + } + + return sock; +} diff --git a/samples/bpf/sock_flags_kern.c b/samples/bpf/sock_flags_kern.c new file mode 100644 index 000000000..6d0ac7569 --- /dev/null +++ b/samples/bpf/sock_flags_kern.c @@ -0,0 +1,49 @@ +#include <uapi/linux/bpf.h> +#include <linux/socket.h> +#include <linux/net.h> +#include <uapi/linux/in.h> +#include <uapi/linux/in6.h> +#include <bpf/bpf_helpers.h> + +SEC("cgroup/sock1") +int bpf_prog1(struct bpf_sock *sk) +{ + char fmt[] = "socket: family %d type %d protocol %d\n"; + char fmt2[] = "socket: uid %u gid %u\n"; + __u64 gid_uid = bpf_get_current_uid_gid(); + __u32 uid = gid_uid & 0xffffffff; + __u32 gid = gid_uid >> 32; + + bpf_trace_printk(fmt, sizeof(fmt), sk->family, sk->type, sk->protocol); + bpf_trace_printk(fmt2, sizeof(fmt2), uid, gid); + + /* block PF_INET6, SOCK_RAW, IPPROTO_ICMPV6 sockets + * ie., make ping6 fail + */ + if (sk->family == PF_INET6 && + sk->type == 
SOCK_RAW && + sk->protocol == IPPROTO_ICMPV6) + return 0; + + return 1; +} + +SEC("cgroup/sock2") +int bpf_prog2(struct bpf_sock *sk) +{ + char fmt[] = "socket: family %d type %d protocol %d\n"; + + bpf_trace_printk(fmt, sizeof(fmt), sk->family, sk->type, sk->protocol); + + /* block PF_INET, SOCK_RAW, IPPROTO_ICMP sockets + * ie., make ping fail + */ + if (sk->family == PF_INET && + sk->type == SOCK_RAW && + sk->protocol == IPPROTO_ICMP) + return 0; + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/sockex1_kern.c b/samples/bpf/sockex1_kern.c new file mode 100644 index 000000000..431c95646 --- /dev/null +++ b/samples/bpf/sockex1_kern.c @@ -0,0 +1,30 @@ +#include <uapi/linux/bpf.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/if_packet.h> +#include <uapi/linux/ip.h> +#include <bpf/bpf_helpers.h> +#include "bpf_legacy.h" + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, long); + __uint(max_entries, 256); +} my_map SEC(".maps"); + +SEC("socket1") +int bpf_prog1(struct __sk_buff *skb) +{ + int index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)); + long *value; + + if (skb->pkt_type != PACKET_OUTGOING) + return 0; + + value = bpf_map_lookup_elem(&my_map, &index); + if (value) + __sync_fetch_and_add(value, skb->len); + + return 0; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/sockex1_user.c b/samples/bpf/sockex1_user.c new file mode 100644 index 000000000..3c8372287 --- /dev/null +++ b/samples/bpf/sockex1_user.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <stdio.h> +#include <assert.h> +#include <linux/bpf.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> +#include "sock_example.h" +#include <unistd.h> +#include <arpa/inet.h> + +int main(int ac, char **argv) +{ + struct bpf_object *obj; + int map_fd, prog_fd; + char filename[256]; + int i, sock; + FILE *f; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if 
(bpf_prog_load(filename, BPF_PROG_TYPE_SOCKET_FILTER, + &obj, &prog_fd)) + return 1; + + map_fd = bpf_object__find_map_fd_by_name(obj, "my_map"); + + sock = open_raw_sock("lo"); + + assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, + sizeof(prog_fd)) == 0); + + f = popen("ping -4 -c5 localhost", "r"); + (void) f; + + for (i = 0; i < 5; i++) { + long long tcp_cnt, udp_cnt, icmp_cnt; + int key; + + key = IPPROTO_TCP; + assert(bpf_map_lookup_elem(map_fd, &key, &tcp_cnt) == 0); + + key = IPPROTO_UDP; + assert(bpf_map_lookup_elem(map_fd, &key, &udp_cnt) == 0); + + key = IPPROTO_ICMP; + assert(bpf_map_lookup_elem(map_fd, &key, &icmp_cnt) == 0); + + printf("TCP %lld UDP %lld ICMP %lld bytes\n", + tcp_cnt, udp_cnt, icmp_cnt); + sleep(1); + } + + return 0; +} diff --git a/samples/bpf/sockex2_kern.c b/samples/bpf/sockex2_kern.c new file mode 100644 index 000000000..b7997541f --- /dev/null +++ b/samples/bpf/sockex2_kern.c @@ -0,0 +1,223 @@ +#include <uapi/linux/bpf.h> +#include <uapi/linux/in.h> +#include <uapi/linux/if.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/ip.h> +#include <uapi/linux/ipv6.h> +#include <uapi/linux/if_tunnel.h> +#include <bpf/bpf_helpers.h> +#include "bpf_legacy.h" +#define IP_MF 0x2000 +#define IP_OFFSET 0x1FFF + +struct vlan_hdr { + __be16 h_vlan_TCI; + __be16 h_vlan_encapsulated_proto; +}; + +struct flow_key_record { + __be32 src; + __be32 dst; + union { + __be32 ports; + __be16 port16[2]; + }; + __u16 thoff; + __u8 ip_proto; +}; + +static inline int proto_ports_offset(__u64 proto) +{ + switch (proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_DCCP: + case IPPROTO_ESP: + case IPPROTO_SCTP: + case IPPROTO_UDPLITE: + return 0; + case IPPROTO_AH: + return 4; + default: + return 0; + } +} + +static inline int ip_is_fragment(struct __sk_buff *ctx, __u64 nhoff) +{ + return load_half(ctx, nhoff + offsetof(struct iphdr, frag_off)) + & (IP_MF | IP_OFFSET); +} + +static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, 
__u64 off) +{ + __u64 w0 = load_word(ctx, off); + __u64 w1 = load_word(ctx, off + 4); + __u64 w2 = load_word(ctx, off + 8); + __u64 w3 = load_word(ctx, off + 12); + + return (__u32)(w0 ^ w1 ^ w2 ^ w3); +} + +static inline __u64 parse_ip(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto, + struct flow_key_record *flow) +{ + __u64 verlen; + + if (unlikely(ip_is_fragment(skb, nhoff))) + *ip_proto = 0; + else + *ip_proto = load_byte(skb, nhoff + offsetof(struct iphdr, protocol)); + + if (*ip_proto != IPPROTO_GRE) { + flow->src = load_word(skb, nhoff + offsetof(struct iphdr, saddr)); + flow->dst = load_word(skb, nhoff + offsetof(struct iphdr, daddr)); + } + + verlen = load_byte(skb, nhoff + 0/*offsetof(struct iphdr, ihl)*/); + if (likely(verlen == 0x45)) + nhoff += 20; + else + nhoff += (verlen & 0xF) << 2; + + return nhoff; +} + +static inline __u64 parse_ipv6(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto, + struct flow_key_record *flow) +{ + *ip_proto = load_byte(skb, + nhoff + offsetof(struct ipv6hdr, nexthdr)); + flow->src = ipv6_addr_hash(skb, + nhoff + offsetof(struct ipv6hdr, saddr)); + flow->dst = ipv6_addr_hash(skb, + nhoff + offsetof(struct ipv6hdr, daddr)); + nhoff += sizeof(struct ipv6hdr); + + return nhoff; +} + +static inline bool flow_dissector(struct __sk_buff *skb, + struct flow_key_record *flow) +{ + __u64 nhoff = ETH_HLEN; + __u64 ip_proto; + __u64 proto = load_half(skb, 12); + int poff; + + if (proto == ETH_P_8021AD) { + proto = load_half(skb, nhoff + offsetof(struct vlan_hdr, + h_vlan_encapsulated_proto)); + nhoff += sizeof(struct vlan_hdr); + } + + if (proto == ETH_P_8021Q) { + proto = load_half(skb, nhoff + offsetof(struct vlan_hdr, + h_vlan_encapsulated_proto)); + nhoff += sizeof(struct vlan_hdr); + } + + if (likely(proto == ETH_P_IP)) + nhoff = parse_ip(skb, nhoff, &ip_proto, flow); + else if (proto == ETH_P_IPV6) + nhoff = parse_ipv6(skb, nhoff, &ip_proto, flow); + else + return false; + + switch (ip_proto) { + case IPPROTO_GRE: { + 
struct gre_hdr { + __be16 flags; + __be16 proto; + }; + + __u64 gre_flags = load_half(skb, + nhoff + offsetof(struct gre_hdr, flags)); + __u64 gre_proto = load_half(skb, + nhoff + offsetof(struct gre_hdr, proto)); + + if (gre_flags & (GRE_VERSION|GRE_ROUTING)) + break; + + proto = gre_proto; + nhoff += 4; + if (gre_flags & GRE_CSUM) + nhoff += 4; + if (gre_flags & GRE_KEY) + nhoff += 4; + if (gre_flags & GRE_SEQ) + nhoff += 4; + + if (proto == ETH_P_8021Q) { + proto = load_half(skb, + nhoff + offsetof(struct vlan_hdr, + h_vlan_encapsulated_proto)); + nhoff += sizeof(struct vlan_hdr); + } + + if (proto == ETH_P_IP) + nhoff = parse_ip(skb, nhoff, &ip_proto, flow); + else if (proto == ETH_P_IPV6) + nhoff = parse_ipv6(skb, nhoff, &ip_proto, flow); + else + return false; + break; + } + case IPPROTO_IPIP: + nhoff = parse_ip(skb, nhoff, &ip_proto, flow); + break; + case IPPROTO_IPV6: + nhoff = parse_ipv6(skb, nhoff, &ip_proto, flow); + break; + default: + break; + } + + flow->ip_proto = ip_proto; + poff = proto_ports_offset(ip_proto); + if (poff >= 0) { + nhoff += poff; + flow->ports = load_word(skb, nhoff); + } + + flow->thoff = (__u16) nhoff; + + return true; +} + +struct pair { + long packets; + long bytes; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, __be32); + __type(value, struct pair); + __uint(max_entries, 1024); +} hash_map SEC(".maps"); + +SEC("socket2") +int bpf_prog2(struct __sk_buff *skb) +{ + struct flow_key_record flow = {}; + struct pair *value; + u32 key; + + if (!flow_dissector(skb, &flow)) + return 0; + + key = flow.dst; + value = bpf_map_lookup_elem(&hash_map, &key); + if (value) { + __sync_fetch_and_add(&value->packets, 1); + __sync_fetch_and_add(&value->bytes, skb->len); + } else { + struct pair val = {1, skb->len}; + + bpf_map_update_elem(&hash_map, &key, &val, BPF_ANY); + } + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/sockex2_user.c b/samples/bpf/sockex2_user.c new file mode 100644 index 
000000000..af925a5af --- /dev/null +++ b/samples/bpf/sockex2_user.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <stdio.h> +#include <assert.h> +#include <linux/bpf.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> +#include "sock_example.h" +#include <unistd.h> +#include <arpa/inet.h> +#include <sys/resource.h> + +struct pair { + __u64 packets; + __u64 bytes; +}; + +int main(int ac, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_object *obj; + int map_fd, prog_fd; + char filename[256]; + int i, sock; + FILE *f; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + setrlimit(RLIMIT_MEMLOCK, &r); + + if (bpf_prog_load(filename, BPF_PROG_TYPE_SOCKET_FILTER, + &obj, &prog_fd)) + return 1; + + map_fd = bpf_object__find_map_fd_by_name(obj, "hash_map"); + + sock = open_raw_sock("lo"); + + assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, + sizeof(prog_fd)) == 0); + + f = popen("ping -4 -c5 localhost", "r"); + (void) f; + + for (i = 0; i < 5; i++) { + int key = 0, next_key; + struct pair value; + + while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0) { + bpf_map_lookup_elem(map_fd, &next_key, &value); + printf("ip %s bytes %lld packets %lld\n", + inet_ntoa((struct in_addr){htonl(next_key)}), + value.bytes, value.packets); + key = next_key; + } + sleep(1); + } + return 0; +} diff --git a/samples/bpf/sockex3_kern.c b/samples/bpf/sockex3_kern.c new file mode 100644 index 000000000..b36350335 --- /dev/null +++ b/samples/bpf/sockex3_kern.c @@ -0,0 +1,293 @@ +/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#include <uapi/linux/bpf.h> +#include <uapi/linux/in.h> +#include <uapi/linux/if.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/ip.h> +#include <uapi/linux/ipv6.h> +#include <uapi/linux/if_tunnel.h> +#include <uapi/linux/mpls.h> +#include <bpf/bpf_helpers.h> +#include "bpf_legacy.h" +#define IP_MF 0x2000 +#define IP_OFFSET 0x1FFF + +#define PROG(F) SEC("socket/"__stringify(F)) int bpf_func_##F + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u32)); + __uint(max_entries, 8); +} jmp_table SEC(".maps"); + +#define PARSE_VLAN 1 +#define PARSE_MPLS 2 +#define PARSE_IP 3 +#define PARSE_IPV6 4 + +/* Protocol dispatch routine. It tail-calls next BPF program depending + * on eth proto. Note, we could have used ... + * + * bpf_tail_call(skb, &jmp_table, proto); + * + * ... but it would need large prog_array and cannot be optimised given + * the map key is not static. + */ +static inline void parse_eth_proto(struct __sk_buff *skb, u32 proto) +{ + switch (proto) { + case ETH_P_8021Q: + case ETH_P_8021AD: + bpf_tail_call(skb, &jmp_table, PARSE_VLAN); + break; + case ETH_P_MPLS_UC: + case ETH_P_MPLS_MC: + bpf_tail_call(skb, &jmp_table, PARSE_MPLS); + break; + case ETH_P_IP: + bpf_tail_call(skb, &jmp_table, PARSE_IP); + break; + case ETH_P_IPV6: + bpf_tail_call(skb, &jmp_table, PARSE_IPV6); + break; + } +} + +struct vlan_hdr { + __be16 h_vlan_TCI; + __be16 h_vlan_encapsulated_proto; +}; + +struct flow_key_record { + __be32 src; + __be32 dst; + union { + __be32 ports; + __be16 port16[2]; + }; + __u32 ip_proto; +}; + +static inline int ip_is_fragment(struct __sk_buff *ctx, __u64 nhoff) +{ + return load_half(ctx, nhoff + offsetof(struct iphdr, frag_off)) + & (IP_MF | IP_OFFSET); +} + +static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off) +{ + __u64 w0 = load_word(ctx, off); + __u64 w1 = load_word(ctx, off + 4); + __u64 w2 = load_word(ctx, off + 8); + __u64 w3 = load_word(ctx, off + 12); 
+ + return (__u32)(w0 ^ w1 ^ w2 ^ w3); +} + +struct globals { + struct flow_key_record flow; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, struct globals); + __uint(max_entries, 32); +} percpu_map SEC(".maps"); + +/* user poor man's per_cpu until native support is ready */ +static struct globals *this_cpu_globals(void) +{ + u32 key = bpf_get_smp_processor_id(); + + return bpf_map_lookup_elem(&percpu_map, &key); +} + +/* some simple stats for user space consumption */ +struct pair { + __u64 packets; + __u64 bytes; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct flow_key_record); + __type(value, struct pair); + __uint(max_entries, 1024); +} hash_map SEC(".maps"); + +static void update_stats(struct __sk_buff *skb, struct globals *g) +{ + struct flow_key_record key = g->flow; + struct pair *value; + + value = bpf_map_lookup_elem(&hash_map, &key); + if (value) { + __sync_fetch_and_add(&value->packets, 1); + __sync_fetch_and_add(&value->bytes, skb->len); + } else { + struct pair val = {1, skb->len}; + + bpf_map_update_elem(&hash_map, &key, &val, BPF_ANY); + } +} + +static __always_inline void parse_ip_proto(struct __sk_buff *skb, + struct globals *g, __u32 ip_proto) +{ + __u32 nhoff = skb->cb[0]; + int poff; + + switch (ip_proto) { + case IPPROTO_GRE: { + struct gre_hdr { + __be16 flags; + __be16 proto; + }; + + __u32 gre_flags = load_half(skb, + nhoff + offsetof(struct gre_hdr, flags)); + __u32 gre_proto = load_half(skb, + nhoff + offsetof(struct gre_hdr, proto)); + + if (gre_flags & (GRE_VERSION|GRE_ROUTING)) + break; + + nhoff += 4; + if (gre_flags & GRE_CSUM) + nhoff += 4; + if (gre_flags & GRE_KEY) + nhoff += 4; + if (gre_flags & GRE_SEQ) + nhoff += 4; + + skb->cb[0] = nhoff; + parse_eth_proto(skb, gre_proto); + break; + } + case IPPROTO_IPIP: + parse_eth_proto(skb, ETH_P_IP); + break; + case IPPROTO_IPV6: + parse_eth_proto(skb, ETH_P_IPV6); + break; + case IPPROTO_TCP: + case IPPROTO_UDP: + 
g->flow.ports = load_word(skb, nhoff); + case IPPROTO_ICMP: + g->flow.ip_proto = ip_proto; + update_stats(skb, g); + break; + default: + break; + } +} + +PROG(PARSE_IP)(struct __sk_buff *skb) +{ + struct globals *g = this_cpu_globals(); + __u32 nhoff, verlen, ip_proto; + + if (!g) + return 0; + + nhoff = skb->cb[0]; + + if (unlikely(ip_is_fragment(skb, nhoff))) + return 0; + + ip_proto = load_byte(skb, nhoff + offsetof(struct iphdr, protocol)); + + if (ip_proto != IPPROTO_GRE) { + g->flow.src = load_word(skb, nhoff + offsetof(struct iphdr, saddr)); + g->flow.dst = load_word(skb, nhoff + offsetof(struct iphdr, daddr)); + } + + verlen = load_byte(skb, nhoff + 0/*offsetof(struct iphdr, ihl)*/); + nhoff += (verlen & 0xF) << 2; + + skb->cb[0] = nhoff; + parse_ip_proto(skb, g, ip_proto); + return 0; +} + +PROG(PARSE_IPV6)(struct __sk_buff *skb) +{ + struct globals *g = this_cpu_globals(); + __u32 nhoff, ip_proto; + + if (!g) + return 0; + + nhoff = skb->cb[0]; + + ip_proto = load_byte(skb, + nhoff + offsetof(struct ipv6hdr, nexthdr)); + g->flow.src = ipv6_addr_hash(skb, + nhoff + offsetof(struct ipv6hdr, saddr)); + g->flow.dst = ipv6_addr_hash(skb, + nhoff + offsetof(struct ipv6hdr, daddr)); + nhoff += sizeof(struct ipv6hdr); + + skb->cb[0] = nhoff; + parse_ip_proto(skb, g, ip_proto); + return 0; +} + +PROG(PARSE_VLAN)(struct __sk_buff *skb) +{ + __u32 nhoff, proto; + + nhoff = skb->cb[0]; + + proto = load_half(skb, nhoff + offsetof(struct vlan_hdr, + h_vlan_encapsulated_proto)); + nhoff += sizeof(struct vlan_hdr); + skb->cb[0] = nhoff; + + parse_eth_proto(skb, proto); + + return 0; +} + +PROG(PARSE_MPLS)(struct __sk_buff *skb) +{ + __u32 nhoff, label; + + nhoff = skb->cb[0]; + + label = load_word(skb, nhoff); + nhoff += sizeof(struct mpls_label); + skb->cb[0] = nhoff; + + if (label & MPLS_LS_S_MASK) { + __u8 verlen = load_byte(skb, nhoff); + if ((verlen & 0xF0) == 4) + parse_eth_proto(skb, ETH_P_IP); + else + parse_eth_proto(skb, ETH_P_IPV6); + } else { + 
parse_eth_proto(skb, ETH_P_MPLS_UC); + } + + return 0; +} + +SEC("socket/0") +int main_prog(struct __sk_buff *skb) +{ + __u32 nhoff = ETH_HLEN; + __u32 proto = load_half(skb, 12); + + skb->cb[0] = nhoff; + parse_eth_proto(skb, proto); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c new file mode 100644 index 000000000..7793f6a6a --- /dev/null +++ b/samples/bpf/sockex3_user.c @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <stdio.h> +#include <assert.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> +#include "sock_example.h" +#include <unistd.h> +#include <arpa/inet.h> +#include <sys/resource.h> + +struct flow_key_record { + __be32 src; + __be32 dst; + union { + __be32 ports; + __be16 port16[2]; + }; + __u32 ip_proto; +}; + +struct pair { + __u64 packets; + __u64 bytes; +}; + +int main(int argc, char **argv) +{ + int i, sock, key, fd, main_prog_fd, jmp_table_fd, hash_map_fd; + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_program *prog; + struct bpf_object *obj; + const char *section; + char filename[256]; + FILE *f; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + setrlimit(RLIMIT_MEMLOCK, &r); + + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + jmp_table_fd = bpf_object__find_map_fd_by_name(obj, "jmp_table"); + hash_map_fd = bpf_object__find_map_fd_by_name(obj, "hash_map"); + if (jmp_table_fd < 0 || hash_map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + fd = bpf_program__fd(prog); + + section = bpf_program__section_name(prog); + if (sscanf(section, "socket/%d", &key) != 1) { + fprintf(stderr, 
"ERROR: finding prog failed\n"); + goto cleanup; + } + + if (key == 0) + main_prog_fd = fd; + else + bpf_map_update_elem(jmp_table_fd, &key, &fd, BPF_ANY); + } + + sock = open_raw_sock("lo"); + + /* attach BPF program to socket */ + assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &main_prog_fd, + sizeof(__u32)) == 0); + + if (argc > 1) + f = popen("ping -4 -c5 localhost", "r"); + else + f = popen("netperf -l 4 localhost", "r"); + (void) f; + + for (i = 0; i < 5; i++) { + struct flow_key_record key = {}, next_key; + struct pair value; + + sleep(1); + printf("IP src.port -> dst.port bytes packets\n"); + while (bpf_map_get_next_key(hash_map_fd, &key, &next_key) == 0) { + bpf_map_lookup_elem(hash_map_fd, &next_key, &value); + printf("%s.%05d -> %s.%05d %12lld %12lld\n", + inet_ntoa((struct in_addr){htonl(next_key.src)}), + next_key.port16[0], + inet_ntoa((struct in_addr){htonl(next_key.dst)}), + next_key.port16[1], + value.bytes, value.packets); + key = next_key; + } + } + +cleanup: + bpf_object__close(obj); + return 0; +} diff --git a/samples/bpf/spintest_kern.c b/samples/bpf/spintest_kern.c new file mode 100644 index 000000000..455da7731 --- /dev/null +++ b/samples/bpf/spintest_kern.c @@ -0,0 +1,69 @@ +/* Copyright (c) 2016, Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/version.h> +#include <uapi/linux/bpf.h> +#include <uapi/linux/perf_event.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, long); + __type(value, long); + __uint(max_entries, 1024); +} my_map SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(key_size, sizeof(long)); + __uint(value_size, sizeof(long)); + __uint(max_entries, 1024); +} my_map2 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_STACK_TRACE); + __uint(key_size, sizeof(u32)); + __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64)); + __uint(max_entries, 10000); +} stackmap SEC(".maps"); + +#define PROG(foo) \ +int foo(struct pt_regs *ctx) \ +{ \ + long v = PT_REGS_IP(ctx), *val; \ +\ + val = bpf_map_lookup_elem(&my_map, &v); \ + bpf_map_update_elem(&my_map, &v, &v, BPF_ANY); \ + bpf_map_update_elem(&my_map2, &v, &v, BPF_ANY); \ + bpf_map_delete_elem(&my_map2, &v); \ + bpf_get_stackid(ctx, &stackmap, BPF_F_REUSE_STACKID); \ + return 0; \ +} + +/* add kprobes to all possible *spin* functions */ +SEC("kprobe/spin_unlock")PROG(p1) +SEC("kprobe/spin_lock")PROG(p2) +SEC("kprobe/mutex_spin_on_owner")PROG(p3) +SEC("kprobe/rwsem_spin_on_owner")PROG(p4) +SEC("kprobe/spin_unlock_irqrestore")PROG(p5) +SEC("kprobe/_raw_spin_unlock_irqrestore")PROG(p6) +SEC("kprobe/_raw_spin_unlock_bh")PROG(p7) +SEC("kprobe/_raw_spin_unlock")PROG(p8) +SEC("kprobe/_raw_spin_lock_irqsave")PROG(p9) +SEC("kprobe/_raw_spin_trylock_bh")PROG(p10) +SEC("kprobe/_raw_spin_lock_irq")PROG(p11) +SEC("kprobe/_raw_spin_trylock")PROG(p12) +SEC("kprobe/_raw_spin_lock")PROG(p13) +SEC("kprobe/_raw_spin_lock_bh")PROG(p14) +/* and to inner bpf helpers */ +SEC("kprobe/htab_map_update_elem")PROG(p15) +SEC("kprobe/__htab_percpu_map_update_elem")PROG(p16) +SEC("kprobe/htab_map_alloc")PROG(p17) + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = 
LINUX_VERSION_CODE; diff --git a/samples/bpf/spintest_user.c b/samples/bpf/spintest_user.c new file mode 100644 index 000000000..f090d0dc6 --- /dev/null +++ b/samples/bpf/spintest_user.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <stdio.h> +#include <unistd.h> +#include <string.h> +#include <assert.h> +#include <sys/resource.h> +#include <bpf/libbpf.h> +#include <bpf/bpf.h> +#include "trace_helpers.h" + +int main(int ac, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + char filename[256], symbol[256]; + struct bpf_object *obj = NULL; + struct bpf_link *links[20]; + long key, next_key, value; + struct bpf_program *prog; + int map_fd, i, j = 0; + const char *section; + struct ksym *sym; + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + + if (load_kallsyms()) { + printf("failed to process /proc/kallsyms\n"); + return 2; + } + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + obj = NULL; + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd = bpf_object__find_map_fd_by_name(obj, "my_map"); + if (map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + section = bpf_program__section_name(prog); + if (sscanf(section, "kprobe/%s", symbol) != 1) + continue; + + /* Attach prog only when symbol exists */ + if (ksym_get_addr(symbol)) { + links[j] = bpf_program__attach(prog); + if (libbpf_get_error(links[j])) { + fprintf(stderr, "bpf_program__attach failed\n"); + links[j] = NULL; + goto cleanup; + } + j++; + } + } + + for (i = 0; i < 5; i++) { + key = 0; + printf("kprobing funcs:"); + while (bpf_map_get_next_key(map_fd, &key, 
&next_key) == 0) { + bpf_map_lookup_elem(map_fd, &next_key, &value); + assert(next_key == value); + sym = ksym_search(value); + key = next_key; + if (!sym) { + printf("ksym not found. Is kallsyms loaded?\n"); + continue; + } + + printf(" %s", sym->name); + } + if (key) + printf("\n"); + key = 0; + while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0) + bpf_map_delete_elem(map_fd, &next_key); + sleep(1); + } + +cleanup: + for (j--; j >= 0; j--) + bpf_link__destroy(links[j]); + + bpf_object__close(obj); + return 0; +} diff --git a/samples/bpf/syscall_nrs.c b/samples/bpf/syscall_nrs.c new file mode 100644 index 000000000..88f940052 --- /dev/null +++ b/samples/bpf/syscall_nrs.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <uapi/linux/unistd.h> +#include <linux/kbuild.h> + +#define SYSNR(_NR) DEFINE(SYS ## _NR, _NR) + +void syscall_defines(void) +{ + COMMENT("Linux system call numbers."); + SYSNR(__NR_write); + SYSNR(__NR_read); +#ifdef __NR_mmap2 + SYSNR(__NR_mmap2); +#endif +#ifdef __NR_mmap + SYSNR(__NR_mmap); +#endif + +} diff --git a/samples/bpf/syscall_tp_kern.c b/samples/bpf/syscall_tp_kern.c new file mode 100644 index 000000000..50231c2ef --- /dev/null +++ b/samples/bpf/syscall_tp_kern.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2017 Facebook + */ +#include <uapi/linux/bpf.h> +#include <bpf/bpf_helpers.h> + +struct syscalls_enter_open_args { + unsigned long long unused; + long syscall_nr; + long filename_ptr; + long flags; + long mode; +}; + +struct syscalls_exit_open_args { + unsigned long long unused; + long syscall_nr; + long ret; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u32); + __uint(max_entries, 1); +} enter_open_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u32); + __uint(max_entries, 1); +} exit_open_map SEC(".maps"); + +static __always_inline void count(void *map) +{ + u32 key = 0; + u32 
*value, init_val = 1; + + value = bpf_map_lookup_elem(map, &key); + if (value) + *value += 1; + else + bpf_map_update_elem(map, &key, &init_val, BPF_NOEXIST); +} + +SEC("tracepoint/syscalls/sys_enter_open") +int trace_enter_open(struct syscalls_enter_open_args *ctx) +{ + count(&enter_open_map); + return 0; +} + +SEC("tracepoint/syscalls/sys_enter_openat") +int trace_enter_open_at(struct syscalls_enter_open_args *ctx) +{ + count(&enter_open_map); + return 0; +} + +SEC("tracepoint/syscalls/sys_exit_open") +int trace_enter_exit(struct syscalls_exit_open_args *ctx) +{ + count(&exit_open_map); + return 0; +} + +SEC("tracepoint/syscalls/sys_exit_openat") +int trace_enter_exit_at(struct syscalls_exit_open_args *ctx) +{ + count(&exit_open_map); + return 0; +} diff --git a/samples/bpf/syscall_tp_user.c b/samples/bpf/syscall_tp_user.c new file mode 100644 index 000000000..76a1d0012 --- /dev/null +++ b/samples/bpf/syscall_tp_user.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2017 Facebook + */ +#include <stdio.h> +#include <unistd.h> +#include <fcntl.h> +#include <stdlib.h> +#include <string.h> +#include <linux/perf_event.h> +#include <errno.h> +#include <sys/resource.h> +#include <bpf/libbpf.h> +#include <bpf/bpf.h> + +/* This program verifies bpf attachment to tracepoint sys_enter_* and sys_exit_*. + * This requires kernel CONFIG_FTRACE_SYSCALLS to be set. 
+ */ + +static void usage(const char *cmd) +{ + printf("USAGE: %s [-i num_progs] [-h]\n", cmd); + printf(" -i num_progs # number of progs of the test\n"); + printf(" -h # help\n"); +} + +static void verify_map(int map_id) +{ + __u32 key = 0; + __u32 val; + + if (bpf_map_lookup_elem(map_id, &key, &val) != 0) { + fprintf(stderr, "map_lookup failed: %s\n", strerror(errno)); + return; + } + if (val == 0) { + fprintf(stderr, "failed: map #%d returns value 0\n", map_id); + return; + } + val = 0; + if (bpf_map_update_elem(map_id, &key, &val, BPF_ANY) != 0) { + fprintf(stderr, "map_update failed: %s\n", strerror(errno)); + return; + } +} + +static int test(char *filename, int num_progs) +{ + int map0_fds[num_progs], map1_fds[num_progs], fd, i, j = 0; + struct bpf_link *links[num_progs * 4]; + struct bpf_object *objs[num_progs]; + struct bpf_program *prog; + + for (i = 0; i < num_progs; i++) { + objs[i] = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(objs[i])) { + fprintf(stderr, "opening BPF object file failed\n"); + objs[i] = NULL; + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(objs[i])) { + fprintf(stderr, "loading BPF object file failed\n"); + goto cleanup; + } + + map0_fds[i] = bpf_object__find_map_fd_by_name(objs[i], + "enter_open_map"); + map1_fds[i] = bpf_object__find_map_fd_by_name(objs[i], + "exit_open_map"); + if (map0_fds[i] < 0 || map1_fds[i] < 0) { + fprintf(stderr, "finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, objs[i]) { + links[j] = bpf_program__attach(prog); + if (libbpf_get_error(links[j])) { + fprintf(stderr, "bpf_program__attach failed\n"); + links[j] = NULL; + goto cleanup; + } + j++; + } + printf("prog #%d: map ids %d %d\n", i, map0_fds[i], map1_fds[i]); + } + + /* current load_bpf_file has perf_event_open default pid = -1 + * and cpu = 0, which permits attached bpf execution on + * all cpus for all pid's. bpf program execution ignores + * cpu affinity. 
+ */ + /* trigger some "open" operations */ + fd = open(filename, O_RDONLY); + if (fd < 0) { + fprintf(stderr, "open failed: %s\n", strerror(errno)); + return 1; + } + close(fd); + + /* verify the map */ + for (i = 0; i < num_progs; i++) { + verify_map(map0_fds[i]); + verify_map(map1_fds[i]); + } + +cleanup: + for (j--; j >= 0; j--) + bpf_link__destroy(links[j]); + + for (i--; i >= 0; i--) + bpf_object__close(objs[i]); + return 0; +} + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + int opt, num_progs = 1; + char filename[256]; + + while ((opt = getopt(argc, argv, "i:h")) != -1) { + switch (opt) { + case 'i': + num_progs = atoi(optarg); + break; + case 'h': + default: + usage(argv[0]); + return 0; + } + } + + setrlimit(RLIMIT_MEMLOCK, &r); + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + return test(filename, num_progs); +} diff --git a/samples/bpf/task_fd_query_kern.c b/samples/bpf/task_fd_query_kern.c new file mode 100644 index 000000000..c821294e1 --- /dev/null +++ b/samples/bpf/task_fd_query_kern.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/version.h> +#include <linux/ptrace.h> +#include <uapi/linux/bpf.h> +#include <bpf/bpf_helpers.h> + +SEC("kprobe/blk_mq_start_request") +int bpf_prog1(struct pt_regs *ctx) +{ + return 0; +} + +SEC("kretprobe/blk_account_io_done") +int bpf_prog2(struct pt_regs *ctx) +{ + return 0; +} +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c new file mode 100644 index 000000000..b68bd2f8f --- /dev/null +++ b/samples/bpf/task_fd_query_user.c @@ -0,0 +1,383 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <unistd.h> +#include <stdbool.h> +#include <string.h> +#include <stdint.h> +#include <fcntl.h> +#include <linux/bpf.h> +#include <sys/ioctl.h> +#include 
<sys/resource.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <linux/perf_event.h> + +#include <bpf/libbpf.h> +#include "bpf_load.h" +#include "bpf_util.h" +#include "perf-sys.h" +#include "trace_helpers.h" + +#define CHECK_PERROR_RET(condition) ({ \ + int __ret = !!(condition); \ + if (__ret) { \ + printf("FAIL: %s:\n", __func__); \ + perror(" "); \ + return -1; \ + } \ +}) + +#define CHECK_AND_RET(condition) ({ \ + int __ret = !!(condition); \ + if (__ret) \ + return -1; \ +}) + +static __u64 ptr_to_u64(void *ptr) +{ + return (__u64) (unsigned long) ptr; +} + +#define PMU_TYPE_FILE "/sys/bus/event_source/devices/%s/type" +static int bpf_find_probe_type(const char *event_type) +{ + char buf[256]; + int fd, ret; + + ret = snprintf(buf, sizeof(buf), PMU_TYPE_FILE, event_type); + CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf)); + + fd = open(buf, O_RDONLY); + CHECK_PERROR_RET(fd < 0); + + ret = read(fd, buf, sizeof(buf)); + close(fd); + CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf)); + + errno = 0; + ret = (int)strtol(buf, NULL, 10); + CHECK_PERROR_RET(errno); + return ret; +} + +#define PMU_RETPROBE_FILE "/sys/bus/event_source/devices/%s/format/retprobe" +static int bpf_get_retprobe_bit(const char *event_type) +{ + char buf[256]; + int fd, ret; + + ret = snprintf(buf, sizeof(buf), PMU_RETPROBE_FILE, event_type); + CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf)); + + fd = open(buf, O_RDONLY); + CHECK_PERROR_RET(fd < 0); + + ret = read(fd, buf, sizeof(buf)); + close(fd); + CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf)); + CHECK_PERROR_RET(strlen(buf) < strlen("config:")); + + errno = 0; + ret = (int)strtol(buf + strlen("config:"), NULL, 10); + CHECK_PERROR_RET(errno); + return ret; +} + +static int test_debug_fs_kprobe(int prog_fd_idx, const char *fn_name, + __u32 expected_fd_type) +{ + __u64 probe_offset, probe_addr; + __u32 len, prog_id, fd_type; + char buf[256]; + int err; + + len = sizeof(buf); + err = bpf_task_fd_query(getpid(), 
event_fd[prog_fd_idx], 0, buf, &len, + &prog_id, &fd_type, &probe_offset, + &probe_addr); + if (err < 0) { + printf("FAIL: %s, for event_fd idx %d, fn_name %s\n", + __func__, prog_fd_idx, fn_name); + perror(" :"); + return -1; + } + if (strcmp(buf, fn_name) != 0 || + fd_type != expected_fd_type || + probe_offset != 0x0 || probe_addr != 0x0) { + printf("FAIL: bpf_trace_event_query(event_fd[%d]):\n", + prog_fd_idx); + printf("buf: %s, fd_type: %u, probe_offset: 0x%llx," + " probe_addr: 0x%llx\n", + buf, fd_type, probe_offset, probe_addr); + return -1; + } + return 0; +} + +static int test_nondebug_fs_kuprobe_common(const char *event_type, + const char *name, __u64 offset, __u64 addr, bool is_return, + char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type, + __u64 *probe_offset, __u64 *probe_addr) +{ + int is_return_bit = bpf_get_retprobe_bit(event_type); + int type = bpf_find_probe_type(event_type); + struct perf_event_attr attr = {}; + int fd; + + if (type < 0 || is_return_bit < 0) { + printf("FAIL: %s incorrect type (%d) or is_return_bit (%d)\n", + __func__, type, is_return_bit); + return -1; + } + + attr.sample_period = 1; + attr.wakeup_events = 1; + if (is_return) + attr.config |= 1 << is_return_bit; + + if (name) { + attr.config1 = ptr_to_u64((void *)name); + attr.config2 = offset; + } else { + attr.config1 = 0; + attr.config2 = addr; + } + attr.size = sizeof(attr); + attr.type = type; + + fd = sys_perf_event_open(&attr, -1, 0, -1, 0); + CHECK_PERROR_RET(fd < 0); + + CHECK_PERROR_RET(ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) < 0); + CHECK_PERROR_RET(ioctl(fd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0); + CHECK_PERROR_RET(bpf_task_fd_query(getpid(), fd, 0, buf, buf_len, + prog_id, fd_type, probe_offset, probe_addr) < 0); + + return 0; +} + +static int test_nondebug_fs_probe(const char *event_type, const char *name, + __u64 offset, __u64 addr, bool is_return, + __u32 expected_fd_type, + __u32 expected_ret_fd_type, + char *buf, __u32 buf_len) +{ + __u64 probe_offset, 
probe_addr; + __u32 prog_id, fd_type; + int err; + + err = test_nondebug_fs_kuprobe_common(event_type, name, + offset, addr, is_return, + buf, &buf_len, &prog_id, + &fd_type, &probe_offset, + &probe_addr); + if (err < 0) { + printf("FAIL: %s, " + "for name %s, offset 0x%llx, addr 0x%llx, is_return %d\n", + __func__, name ? name : "", offset, addr, is_return); + perror(" :"); + return -1; + } + if ((is_return && fd_type != expected_ret_fd_type) || + (!is_return && fd_type != expected_fd_type)) { + printf("FAIL: %s, incorrect fd_type %u\n", + __func__, fd_type); + return -1; + } + if (name) { + if (strcmp(name, buf) != 0) { + printf("FAIL: %s, incorrect buf %s\n", __func__, buf); + return -1; + } + if (probe_offset != offset) { + printf("FAIL: %s, incorrect probe_offset 0x%llx\n", + __func__, probe_offset); + return -1; + } + } else { + if (buf_len != 0) { + printf("FAIL: %s, incorrect buf %p\n", + __func__, buf); + return -1; + } + + if (probe_addr != addr) { + printf("FAIL: %s, incorrect probe_addr 0x%llx\n", + __func__, probe_addr); + return -1; + } + } + return 0; +} + +static int test_debug_fs_uprobe(char *binary_path, long offset, bool is_return) +{ + const char *event_type = "uprobe"; + struct perf_event_attr attr = {}; + char buf[256], event_alias[sizeof("test_1234567890")]; + __u64 probe_offset, probe_addr; + __u32 len, prog_id, fd_type; + int err, res, kfd, efd; + ssize_t bytes; + + snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", + event_type); + kfd = open(buf, O_WRONLY | O_APPEND, 0); + CHECK_PERROR_RET(kfd < 0); + + res = snprintf(event_alias, sizeof(event_alias), "test_%d", getpid()); + CHECK_PERROR_RET(res < 0 || res >= sizeof(event_alias)); + + res = snprintf(buf, sizeof(buf), "%c:%ss/%s %s:0x%lx", + is_return ? 
'r' : 'p', event_type, event_alias, + binary_path, offset); + CHECK_PERROR_RET(res < 0 || res >= sizeof(buf)); + CHECK_PERROR_RET(write(kfd, buf, strlen(buf)) < 0); + + close(kfd); + kfd = -1; + + snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s/id", + event_type, event_alias); + efd = open(buf, O_RDONLY, 0); + CHECK_PERROR_RET(efd < 0); + + bytes = read(efd, buf, sizeof(buf)); + CHECK_PERROR_RET(bytes <= 0 || bytes >= sizeof(buf)); + close(efd); + buf[bytes] = '\0'; + + attr.config = strtol(buf, NULL, 0); + attr.type = PERF_TYPE_TRACEPOINT; + attr.sample_period = 1; + attr.wakeup_events = 1; + kfd = sys_perf_event_open(&attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); + CHECK_PERROR_RET(kfd < 0); + CHECK_PERROR_RET(ioctl(kfd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0); + CHECK_PERROR_RET(ioctl(kfd, PERF_EVENT_IOC_ENABLE, 0) < 0); + + len = sizeof(buf); + err = bpf_task_fd_query(getpid(), kfd, 0, buf, &len, + &prog_id, &fd_type, &probe_offset, + &probe_addr); + if (err < 0) { + printf("FAIL: %s, binary_path %s\n", __func__, binary_path); + perror(" :"); + return -1; + } + if ((is_return && fd_type != BPF_FD_TYPE_URETPROBE) || + (!is_return && fd_type != BPF_FD_TYPE_UPROBE)) { + printf("FAIL: %s, incorrect fd_type %u\n", __func__, + fd_type); + return -1; + } + if (strcmp(binary_path, buf) != 0) { + printf("FAIL: %s, incorrect buf %s\n", __func__, buf); + return -1; + } + if (probe_offset != offset) { + printf("FAIL: %s, incorrect probe_offset 0x%llx\n", __func__, + probe_offset); + return -1; + } + + close(kfd); + return 0; +} + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + extern char __executable_start; + char filename[256], buf[256]; + __u64 uprobe_file_offset; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + + if (load_kallsyms()) { + printf("failed to process /proc/kallsyms\n"); + return 1; + } + + 
if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + /* test two functions in the corresponding *_kern.c file */ + CHECK_AND_RET(test_debug_fs_kprobe(0, "blk_mq_start_request", + BPF_FD_TYPE_KPROBE)); + CHECK_AND_RET(test_debug_fs_kprobe(1, "blk_account_io_done", + BPF_FD_TYPE_KRETPROBE)); + + /* test nondebug fs kprobe */ + CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x0, 0x0, + false, BPF_FD_TYPE_KPROBE, + BPF_FD_TYPE_KRETPROBE, + buf, sizeof(buf))); +#ifdef __x86_64__ + /* set a kprobe on "bpf_check + 0x5", which is x64 specific */ + CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x5, 0x0, + false, BPF_FD_TYPE_KPROBE, + BPF_FD_TYPE_KRETPROBE, + buf, sizeof(buf))); +#endif + CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x0, 0x0, + true, BPF_FD_TYPE_KPROBE, + BPF_FD_TYPE_KRETPROBE, + buf, sizeof(buf))); + CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0, + ksym_get_addr("bpf_check"), false, + BPF_FD_TYPE_KPROBE, + BPF_FD_TYPE_KRETPROBE, + buf, sizeof(buf))); + CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0, + ksym_get_addr("bpf_check"), false, + BPF_FD_TYPE_KPROBE, + BPF_FD_TYPE_KRETPROBE, + NULL, 0)); + CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0, + ksym_get_addr("bpf_check"), true, + BPF_FD_TYPE_KPROBE, + BPF_FD_TYPE_KRETPROBE, + buf, sizeof(buf))); + CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0, + ksym_get_addr("bpf_check"), true, + BPF_FD_TYPE_KPROBE, + BPF_FD_TYPE_KRETPROBE, + 0, 0)); + + /* test nondebug fs uprobe */ + /* the calculation of uprobe file offset is based on gcc 7.3.1 on x64 + * and the default linker script, which defines __executable_start as + * the start of the .text section. The calculation could be different + * on different systems with different compilers. The right way is + * to parse the ELF file. We took a shortcut here. 
+ */ + uprobe_file_offset = (__u64)main - (__u64)&__executable_start; + CHECK_AND_RET(test_nondebug_fs_probe("uprobe", (char *)argv[0], + uprobe_file_offset, 0x0, false, + BPF_FD_TYPE_UPROBE, + BPF_FD_TYPE_URETPROBE, + buf, sizeof(buf))); + CHECK_AND_RET(test_nondebug_fs_probe("uprobe", (char *)argv[0], + uprobe_file_offset, 0x0, true, + BPF_FD_TYPE_UPROBE, + BPF_FD_TYPE_URETPROBE, + buf, sizeof(buf))); + + /* test debug fs uprobe */ + CHECK_AND_RET(test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset, + false)); + CHECK_AND_RET(test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset, + true)); + + return 0; +} diff --git a/samples/bpf/tc_l2_redirect.sh b/samples/bpf/tc_l2_redirect.sh new file mode 100755 index 000000000..37d95ef3c --- /dev/null +++ b/samples/bpf/tc_l2_redirect.sh @@ -0,0 +1,174 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +[[ -z $TC ]] && TC='tc' +[[ -z $IP ]] && IP='ip' + +REDIRECT_USER='./tc_l2_redirect' +REDIRECT_BPF='./tc_l2_redirect_kern.o' + +RP_FILTER=$(< /proc/sys/net/ipv4/conf/all/rp_filter) +IPV6_FORWARDING=$(< /proc/sys/net/ipv6/conf/all/forwarding) + +function config_common { + local tun_type=$1 + + $IP netns add ns1 + $IP netns add ns2 + $IP link add ve1 type veth peer name vens1 + $IP link add ve2 type veth peer name vens2 + $IP link set dev ve1 up + $IP link set dev ve2 up + $IP link set dev ve1 mtu 1500 + $IP link set dev ve2 mtu 1500 + $IP link set dev vens1 netns ns1 + $IP link set dev vens2 netns ns2 + + $IP -n ns1 link set dev lo up + $IP -n ns1 link set dev vens1 up + $IP -n ns1 addr add 10.1.1.101/24 dev vens1 + $IP -n ns1 addr add 2401:db01::65/64 dev vens1 nodad + $IP -n ns1 route add default via 10.1.1.1 dev vens1 + $IP -n ns1 route add default via 2401:db01::1 dev vens1 + + $IP -n ns2 link set dev lo up + $IP -n ns2 link set dev vens2 up + $IP -n ns2 addr add 10.2.1.102/24 dev vens2 + $IP -n ns2 addr add 2401:db02::66/64 dev vens2 nodad + $IP -n ns2 addr add 10.10.1.102 dev lo + $IP -n ns2 addr add 
2401:face::66/64 dev lo nodad + $IP -n ns2 link add ipt2 type ipip local 10.2.1.102 remote 10.2.1.1 + $IP -n ns2 link add ip6t2 type ip6tnl mode any local 2401:db02::66 remote 2401:db02::1 + $IP -n ns2 link set dev ipt2 up + $IP -n ns2 link set dev ip6t2 up + $IP netns exec ns2 $TC qdisc add dev vens2 clsact + $IP netns exec ns2 $TC filter add dev vens2 ingress bpf da obj $REDIRECT_BPF sec drop_non_tun_vip + if [[ $tun_type == "ipip" ]]; then + $IP -n ns2 route add 10.1.1.0/24 dev ipt2 + $IP netns exec ns2 sysctl -q -w net.ipv4.conf.all.rp_filter=0 + $IP netns exec ns2 sysctl -q -w net.ipv4.conf.ipt2.rp_filter=0 + else + $IP -n ns2 route add 10.1.1.0/24 dev ip6t2 + $IP -n ns2 route add 2401:db01::/64 dev ip6t2 + $IP netns exec ns2 sysctl -q -w net.ipv4.conf.all.rp_filter=0 + $IP netns exec ns2 sysctl -q -w net.ipv4.conf.ip6t2.rp_filter=0 + fi + + $IP addr add 10.1.1.1/24 dev ve1 + $IP addr add 2401:db01::1/64 dev ve1 nodad + $IP addr add 10.2.1.1/24 dev ve2 + $IP addr add 2401:db02::1/64 dev ve2 nodad + + $TC qdisc add dev ve2 clsact + $TC filter add dev ve2 ingress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_forward + + sysctl -q -w net.ipv4.conf.all.rp_filter=0 + sysctl -q -w net.ipv6.conf.all.forwarding=1 +} + +function cleanup { + set +e + [[ -z $DEBUG ]] || set +x + $IP netns delete ns1 >& /dev/null + $IP netns delete ns2 >& /dev/null + $IP link del ve1 >& /dev/null + $IP link del ve2 >& /dev/null + $IP link del ipt >& /dev/null + $IP link del ip6t >& /dev/null + sysctl -q -w net.ipv4.conf.all.rp_filter=$RP_FILTER + sysctl -q -w net.ipv6.conf.all.forwarding=$IPV6_FORWARDING + rm -f /sys/fs/bpf/tc/globals/tun_iface + [[ -z $DEBUG ]] || set -x + set -e +} + +function l2_to_ipip { + echo -n "l2_to_ipip $1: " + + local dir=$1 + + config_common ipip + + $IP link add ipt type ipip external + $IP link set dev ipt up + sysctl -q -w net.ipv4.conf.ipt.rp_filter=0 + sysctl -q -w net.ipv4.conf.ipt.forwarding=1 + + if [[ $dir == "egress" ]]; then + $IP route add 
10.10.1.0/24 via 10.2.1.102 dev ve2 + $TC filter add dev ve2 egress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_redirect + sysctl -q -w net.ipv4.conf.ve1.forwarding=1 + else + $TC qdisc add dev ve1 clsact + $TC filter add dev ve1 ingress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_redirect + fi + + $REDIRECT_USER -U /sys/fs/bpf/tc/globals/tun_iface -i $(< /sys/class/net/ipt/ifindex) + + $IP netns exec ns1 ping -c1 10.10.1.102 >& /dev/null + + if [[ $dir == "egress" ]]; then + # test direct egress to ve2 (i.e. not forwarding from + # ve1 to ve2). + ping -c1 10.10.1.102 >& /dev/null + fi + + cleanup + + echo "OK" +} + +function l2_to_ip6tnl { + echo -n "l2_to_ip6tnl $1: " + + local dir=$1 + + config_common ip6tnl + + $IP link add ip6t type ip6tnl mode any external + $IP link set dev ip6t up + sysctl -q -w net.ipv4.conf.ip6t.rp_filter=0 + sysctl -q -w net.ipv4.conf.ip6t.forwarding=1 + + if [[ $dir == "egress" ]]; then + $IP route add 10.10.1.0/24 via 10.2.1.102 dev ve2 + $IP route add 2401:face::/64 via 2401:db02::66 dev ve2 + $TC filter add dev ve2 egress bpf da obj $REDIRECT_BPF sec l2_to_ip6tun_ingress_redirect + sysctl -q -w net.ipv4.conf.ve1.forwarding=1 + else + $TC qdisc add dev ve1 clsact + $TC filter add dev ve1 ingress bpf da obj $REDIRECT_BPF sec l2_to_ip6tun_ingress_redirect + fi + + $REDIRECT_USER -U /sys/fs/bpf/tc/globals/tun_iface -i $(< /sys/class/net/ip6t/ifindex) + + $IP netns exec ns1 ping -c1 10.10.1.102 >& /dev/null + $IP netns exec ns1 ping -6 -c1 2401:face::66 >& /dev/null + + if [[ $dir == "egress" ]]; then + # test direct egress to ve2 (i.e. not forwarding from + # ve1 to ve2). 
+ ping -c1 10.10.1.102 >& /dev/null + ping -6 -c1 2401:face::66 >& /dev/null + fi + + cleanup + + echo "OK" +} + +cleanup +test_names="l2_to_ipip l2_to_ip6tnl" +test_dirs="ingress egress" +if [[ $# -ge 2 ]]; then + test_names=$1 + test_dirs=$2 +elif [[ $# -ge 1 ]]; then + test_names=$1 +fi + +for t in $test_names; do + for d in $test_dirs; do + $t $d + done +done diff --git a/samples/bpf/tc_l2_redirect_kern.c b/samples/bpf/tc_l2_redirect_kern.c new file mode 100644 index 000000000..fd2fa0004 --- /dev/null +++ b/samples/bpf/tc_l2_redirect_kern.c @@ -0,0 +1,237 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#define KBUILD_MODNAME "foo" +#include <uapi/linux/bpf.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/if_packet.h> +#include <uapi/linux/ip.h> +#include <uapi/linux/ipv6.h> +#include <uapi/linux/in.h> +#include <uapi/linux/tcp.h> +#include <uapi/linux/filter.h> +#include <uapi/linux/pkt_cls.h> +#include <net/ipv6.h> +#include <bpf/bpf_helpers.h> + +#define _htonl __builtin_bswap32 + +#define PIN_GLOBAL_NS 2 +struct bpf_elf_map { + __u32 type; + __u32 size_key; + __u32 size_value; + __u32 max_elem; + __u32 flags; + __u32 id; + __u32 pinning; +}; + +/* copy of 'struct ethhdr' without __packed */ +struct eth_hdr { + unsigned char h_dest[ETH_ALEN]; + unsigned char h_source[ETH_ALEN]; + unsigned short h_proto; +}; + +struct bpf_elf_map SEC("maps") tun_iface = { + .type = BPF_MAP_TYPE_ARRAY, + .size_key = sizeof(int), + .size_value = sizeof(int), + .pinning = PIN_GLOBAL_NS, + .max_elem = 1, +}; + +static __always_inline bool is_vip_addr(__be16 eth_proto, __be32 daddr) +{ + if (eth_proto == htons(ETH_P_IP)) + return (_htonl(0xffffff00) & daddr) == _htonl(0x0a0a0100); + else if (eth_proto == htons(ETH_P_IPV6)) + return (daddr == _htonl(0x2401face)); + + return false; +} + 
+/* Redirect encapsulated packets to the ingress path of the interface whose + * ifindex is stored at key 0 of tun_iface: IPv4 packets carrying IPIP, and + * IPv6 packets whose next header is IPIP or IPv6. Everything else passes. + */ +SEC("l2_to_iptun_ingress_forward") +int _l2_to_iptun_ingress_forward(struct __sk_buff *skb) +{ + void *data = (void *)(long)skb->data; + struct eth_hdr *eth = data; + void *data_end = (void *)(long)skb->data_end; + int key = 0, *ifindex; + + if (data + sizeof(*eth) > data_end) + return TC_ACT_OK; + + ifindex = bpf_map_lookup_elem(&tun_iface, &key); + if (!ifindex) + return TC_ACT_OK; + + if (eth->h_proto == htons(ETH_P_IP)) { + char fmt4[] = "ingress forward to ifindex:%d daddr4:%x\n"; + struct iphdr *iph = data + sizeof(*eth); + + if (data + sizeof(*eth) + sizeof(*iph) > data_end) + return TC_ACT_OK; + + if (iph->protocol != IPPROTO_IPIP) + return TC_ACT_OK; + + bpf_trace_printk(fmt4, sizeof(fmt4), *ifindex, + _htonl(iph->daddr)); + return bpf_redirect(*ifindex, BPF_F_INGRESS); + } else if (eth->h_proto == htons(ETH_P_IPV6)) { + char fmt6[] = "ingress forward to ifindex:%d daddr6:%x::%x\n"; + struct ipv6hdr *ip6h = data + sizeof(*eth); + + if (data + sizeof(*eth) + sizeof(*ip6h) > data_end) + return TC_ACT_OK; + + if (ip6h->nexthdr != IPPROTO_IPIP && + ip6h->nexthdr != IPPROTO_IPV6) + return TC_ACT_OK; + + bpf_trace_printk(fmt6, sizeof(fmt6), *ifindex, + _htonl(ip6h->daddr.s6_addr32[0]), + _htonl(ip6h->daddr.s6_addr32[3])); + return bpf_redirect(*ifindex, BPF_F_INGRESS); + } + + return TC_ACT_OK; +} + +/* For IPv4 packets whose destination matches is_vip_addr(), set an IPv4 + * tunnel key (id 10000, ttl 64, remote 10.2.1.102) and redirect the packet + * to the interface whose ifindex is stored at key 0 of tun_iface. + * All other packets pass through. + */ +SEC("l2_to_iptun_ingress_redirect") +int _l2_to_iptun_ingress_redirect(struct __sk_buff *skb) +{ + struct bpf_tunnel_key tkey = {}; + void *data = (void *)(long)skb->data; + struct eth_hdr *eth = data; + void *data_end = (void *)(long)skb->data_end; + int key = 0, *ifindex; + + if (data + sizeof(*eth) > data_end) + return TC_ACT_OK; + + ifindex = bpf_map_lookup_elem(&tun_iface, &key); + if (!ifindex) + return TC_ACT_OK; + + if (eth->h_proto == htons(ETH_P_IP)) { + char fmt4[] = "e/ingress redirect daddr4:%x to ifindex:%d\n"; + struct iphdr *iph = data + sizeof(*eth); + __be32 daddr; + + if 
(data + sizeof(*eth) + sizeof(*iph) > data_end) + return TC_ACT_OK; + + /* read the header only after the bounds check above */ + daddr = iph->daddr; + + if (!is_vip_addr(eth->h_proto, daddr)) + return TC_ACT_OK; + + bpf_trace_printk(fmt4, sizeof(fmt4), _htonl(daddr), *ifindex); + } else { + return TC_ACT_OK; + } + + tkey.tunnel_id = 10000; + tkey.tunnel_ttl = 64; + tkey.remote_ipv4 = 0x0a020166; /* 10.2.1.102 */ + bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), 0); + return bpf_redirect(*ifindex, 0); +} + +SEC("l2_to_ip6tun_ingress_redirect") +int _l2_to_ip6tun_ingress_redirect(struct __sk_buff *skb) +{ + struct bpf_tunnel_key tkey = {}; + void *data = (void *)(long)skb->data; + struct eth_hdr *eth = data; + void *data_end = (void *)(long)skb->data_end; + int key = 0, *ifindex; + + if (data + sizeof(*eth) > data_end) + return TC_ACT_OK; + + ifindex = bpf_map_lookup_elem(&tun_iface, &key); + if (!ifindex) + return TC_ACT_OK; + + if (eth->h_proto == htons(ETH_P_IP)) { + char fmt4[] = "e/ingress redirect daddr4:%x to ifindex:%d\n"; + struct iphdr *iph = data + sizeof(*eth); + + if (data + sizeof(*eth) + sizeof(*iph) > data_end) + return TC_ACT_OK; + + if (!is_vip_addr(eth->h_proto, iph->daddr)) + return TC_ACT_OK; + + bpf_trace_printk(fmt4, sizeof(fmt4), _htonl(iph->daddr), + *ifindex); + } else if (eth->h_proto == htons(ETH_P_IPV6)) { + char fmt6[] = "e/ingress redirect daddr6:%x to ifindex:%d\n"; + struct ipv6hdr *ip6h = data + sizeof(*eth); + + if (data + sizeof(*eth) + sizeof(*ip6h) > data_end) + return TC_ACT_OK; + + if (!is_vip_addr(eth->h_proto, ip6h->daddr.s6_addr32[0])) + return TC_ACT_OK; + + bpf_trace_printk(fmt6, sizeof(fmt6), + _htonl(ip6h->daddr.s6_addr32[0]), *ifindex); + } else { + return TC_ACT_OK; + } + + tkey.tunnel_id = 10000; + tkey.tunnel_ttl = 64; + /* 2401:db02:0:0:0:0:0:66 */ + tkey.remote_ipv6[0] = _htonl(0x2401db02); + tkey.remote_ipv6[1] = 0; + tkey.remote_ipv6[2] = 0; + tkey.remote_ipv6[3] = _htonl(0x00000066); + bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), BPF_F_TUNINFO_IPV6); + return bpf_redirect(*ifindex, 0); 
+} + +/* Drop (TC_ACT_SHOT) IPv4/IPv6 packets whose destination matches + * is_vip_addr(); every other packet, including truncated ones, passes. + */ +SEC("drop_non_tun_vip") +int _drop_non_tun_vip(struct __sk_buff *skb) +{ + void *data = (void *)(long)skb->data; + struct eth_hdr *eth = data; + void *data_end = (void *)(long)skb->data_end; + + if (data + sizeof(*eth) > data_end) + return TC_ACT_OK; + + if (eth->h_proto == htons(ETH_P_IP)) { + struct iphdr *iph = data + sizeof(*eth); + + if (data + sizeof(*eth) + sizeof(*iph) > data_end) + return TC_ACT_OK; + + if (is_vip_addr(eth->h_proto, iph->daddr)) + return TC_ACT_SHOT; + } else if (eth->h_proto == htons(ETH_P_IPV6)) { + struct ipv6hdr *ip6h = data + sizeof(*eth); + + if (data + sizeof(*eth) + sizeof(*ip6h) > data_end) + return TC_ACT_OK; + + if (is_vip_addr(eth->h_proto, ip6h->daddr.s6_addr32[0])) + return TC_ACT_SHOT; + } + + return TC_ACT_OK; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/tc_l2_redirect_user.c b/samples/bpf/tc_l2_redirect_user.c new file mode 100644 index 000000000..d11a6e1e9 --- /dev/null +++ b/samples/bpf/tc_l2_redirect_user.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2016 Facebook + */ +#include <linux/unistd.h> +#include <linux/bpf.h> + +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> + +#include <bpf/bpf.h> + +static void usage(void) +{ + printf("Usage: tc_l2_redirect [...]\n"); + printf(" -U <file> Update an already pinned BPF array\n"); + printf(" -i <ifindex> Interface index\n"); + printf(" -h Display this help\n"); +} + +/* Store the interface index given with -i at key 0 of the pinned BPF array + * given with -U. Returns 0 on success, non-zero otherwise. + */ +int main(int argc, char **argv) +{ + const char *pinned_file = NULL; + int ifindex = -1; + int array_key = 0; + int array_fd = -1; + int ret = -1; + int opt; + + /* only -U and -i take arguments; -h falls through to usage() */ + while ((opt = getopt(argc, argv, "U:i:h")) != -1) { + switch (opt) { + /* General args */ + case 'U': + pinned_file = optarg; + break; + case 'i': + ifindex = atoi(optarg); + break; + default: + usage(); + goto out; + } + } + + if (ifindex < 0 || !pinned_file) { + usage(); + goto out; + } + + 
array_fd = bpf_obj_get(pinned_file); + if (array_fd < 0) { + fprintf(stderr, "bpf_obj_get(%s): %s(%d)\n", + pinned_file, strerror(errno), errno); + goto out; + } + + /* bpf_tunnel_key.remote_ipv4 expects host byte orders */ + ret = bpf_map_update_elem(array_fd, &array_key, &ifindex, 0); + if (ret) { + perror("bpf_map_update_elem"); + goto out; + } + +out: + if (array_fd != -1) + close(array_fd); + return ret; +} diff --git a/samples/bpf/tcbpf1_kern.c b/samples/bpf/tcbpf1_kern.c new file mode 100644 index 000000000..e9356130f --- /dev/null +++ b/samples/bpf/tcbpf1_kern.c @@ -0,0 +1,91 @@ +#define KBUILD_MODNAME "foo" +#include <uapi/linux/bpf.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/if_packet.h> +#include <uapi/linux/ip.h> +#include <uapi/linux/in.h> +#include <uapi/linux/tcp.h> +#include <uapi/linux/filter.h> +#include <uapi/linux/pkt_cls.h> +#include <bpf/bpf_helpers.h> +#include "bpf_legacy.h" + +/* compiler workaround */ +#define _htonl __builtin_bswap32 + +static inline void set_dst_mac(struct __sk_buff *skb, char *mac) +{ + bpf_skb_store_bytes(skb, 0, mac, ETH_ALEN, 1); +} + +#define IP_CSUM_OFF (ETH_HLEN + offsetof(struct iphdr, check)) +#define TOS_OFF (ETH_HLEN + offsetof(struct iphdr, tos)) + +static inline void set_ip_tos(struct __sk_buff *skb, __u8 new_tos) +{ + __u8 old_tos = load_byte(skb, TOS_OFF); + + bpf_l3_csum_replace(skb, IP_CSUM_OFF, htons(old_tos), htons(new_tos), 2); + bpf_skb_store_bytes(skb, TOS_OFF, &new_tos, sizeof(new_tos), 0); +} + +#define TCP_CSUM_OFF (ETH_HLEN + sizeof(struct iphdr) + offsetof(struct tcphdr, check)) +#define IP_SRC_OFF (ETH_HLEN + offsetof(struct iphdr, saddr)) + +#define IS_PSEUDO 0x10 + +static inline void set_tcp_ip_src(struct __sk_buff *skb, __u32 new_ip) +{ + __u32 old_ip = _htonl(load_word(skb, IP_SRC_OFF)); + + bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_ip, new_ip, IS_PSEUDO | sizeof(new_ip)); + bpf_l3_csum_replace(skb, IP_CSUM_OFF, old_ip, new_ip, sizeof(new_ip)); + bpf_skb_store_bytes(skb, 
IP_SRC_OFF, &new_ip, sizeof(new_ip), 0); +} + +#define TCP_DPORT_OFF (ETH_HLEN + sizeof(struct iphdr) + offsetof(struct tcphdr, dest)) +static inline void set_tcp_dest_port(struct __sk_buff *skb, __u16 new_port) +{ + __u16 old_port = htons(load_half(skb, TCP_DPORT_OFF)); + + bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_port, new_port, sizeof(new_port)); + bpf_skb_store_bytes(skb, TCP_DPORT_OFF, &new_port, sizeof(new_port), 0); +} + +SEC("classifier") +int bpf_prog1(struct __sk_buff *skb) +{ + __u8 proto = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)); + long *value; + + if (proto == IPPROTO_TCP) { + set_ip_tos(skb, 8); + set_tcp_ip_src(skb, 0xA010101); + set_tcp_dest_port(skb, 5001); + } + + return 0; +} +SEC("redirect_xmit") +int _redirect_xmit(struct __sk_buff *skb) +{ + return bpf_redirect(skb->ifindex + 1, 0); +} +SEC("redirect_recv") +int _redirect_recv(struct __sk_buff *skb) +{ + return bpf_redirect(skb->ifindex + 1, 1); +} +SEC("clone_redirect_xmit") +int _clone_redirect_xmit(struct __sk_buff *skb) +{ + bpf_clone_redirect(skb, skb->ifindex + 1, 0); + return TC_ACT_SHOT; +} +SEC("clone_redirect_recv") +int _clone_redirect_recv(struct __sk_buff *skb) +{ + bpf_clone_redirect(skb, skb->ifindex + 1, 1); + return TC_ACT_SHOT; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/tcp_basertt_kern.c b/samples/bpf/tcp_basertt_kern.c new file mode 100644 index 000000000..822b0742b --- /dev/null +++ b/samples/bpf/tcp_basertt_kern.c @@ -0,0 +1,71 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * BPF program to set base_rtt to 80us when host is running TCP-NV and + * both hosts are in the same datacenter (as determined by IPv6 prefix). + * + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. 
+ */ + +#include <uapi/linux/bpf.h> +#include <uapi/linux/tcp.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/if_packet.h> +#include <uapi/linux/ip.h> +#include <linux/socket.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#define DEBUG 1 + +SEC("sockops") +int bpf_basertt(struct bpf_sock_ops *skops) +{ + char cong[20]; + char nv[] = "nv"; + int rv = 0, n; + int op; + + op = (int) skops->op; + +#ifdef DEBUG + bpf_printk("BPF command: %d\n", op); +#endif + + /* Check if both hosts are in the same datacenter. For this + * example they are if the 1st 5.5 bytes in the IPv6 address + * are the same. + */ + if (skops->family == AF_INET6 && + skops->local_ip6[0] == skops->remote_ip6[0] && + (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) == + (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) { + switch (op) { + case BPF_SOCK_OPS_BASE_RTT: + n = bpf_getsockopt(skops, SOL_TCP, TCP_CONGESTION, + cong, sizeof(cong)); + if (!n && !__builtin_memcmp(cong, nv, sizeof(nv))) { + /* Set base_rtt to 80us */ + rv = 80; + } else if (n) { + rv = n; + } else { + rv = -1; + } + break; + default: + rv = -1; + } + } else { + rv = -1; + } +#ifdef DEBUG + bpf_printk("Returning %d\n", rv); +#endif + skops->reply = rv; + return 1; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/tcp_bpf.readme b/samples/bpf/tcp_bpf.readme new file mode 100644 index 000000000..78e247f62 --- /dev/null +++ b/samples/bpf/tcp_bpf.readme @@ -0,0 +1,28 @@ +This file describes how to run the tcp_*_kern.o tcp_bpf (or socket_ops) +programs. These programs attach to a cgroupv2. The following commands create +a cgroupv2 and attach a bash shell to the group. + + mkdir -p /tmp/cgroupv2 + mount -t cgroup2 none /tmp/cgroupv2 + mkdir -p /tmp/cgroupv2/foo + bash + echo $$ >> /tmp/cgroupv2/foo/cgroup.procs + +Anything that runs under this shell belongs to the foo cgroupv2. 
To load +(attach) one of the tcp_*_kern.o programs: + + bpftool prog load tcp_basertt_kern.o /sys/fs/bpf/tcp_prog + bpftool cgroup attach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog + bpftool prog tracelog + +"bpftool prog tracelog" will continue to run printing the BPF log buffer. +The tcp_*_kern.o programs use special print functions to print logging +information (if enabled by the ifdef). + +If using netperf/netserver to create traffic, you need to run them under the +cgroupv2 to which the BPF programs are attached (i.e. under bash shell +attached to the cgroupv2). + +To remove (unattach) a socket_ops BPF program from a cgroupv2: + + bpftool cgroup detach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog diff --git a/samples/bpf/tcp_bufs_kern.c b/samples/bpf/tcp_bufs_kern.c new file mode 100644 index 000000000..6a80d0895 --- /dev/null +++ b/samples/bpf/tcp_bufs_kern.c @@ -0,0 +1,81 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * BPF program to set initial receive window to 40 packets and send + * and receive buffers to 1.5MB. This would usually be done after + * doing appropriate checks that indicate the hosts are far enough + * away (i.e. large RTT). + * + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. 
+ */ + +#include <uapi/linux/bpf.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/if_packet.h> +#include <uapi/linux/ip.h> +#include <linux/socket.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#define DEBUG 1 + +SEC("sockops") +int bpf_bufs(struct bpf_sock_ops *skops) +{ + int bufsize = 1500000; + int rwnd_init = 40; + int rv = 0; + int op; + + /* For testing purposes, only execute the rest of the BPF program + * if either port number is 55601 + */ + if (bpf_ntohl(skops->remote_port) != 55601 && + skops->local_port != 55601) { + skops->reply = -1; + return 1; + } + + op = (int) skops->op; + +#ifdef DEBUG + bpf_printk("BPF command: %d\n", op); +#endif + + /* Usually there would be a check to ensure the hosts are far + * from each other so it makes sense to increase buffer sizes + */ + switch (op) { + case BPF_SOCK_OPS_RWND_INIT: + rv = rwnd_init; + break; + case BPF_SOCK_OPS_TCP_CONNECT_CB: + /* Set sndbuf and rcvbuf of active connections */ + rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize, + sizeof(bufsize)); + rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, + &bufsize, sizeof(bufsize)); + break; + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: + /* Nothing to do */ + break; + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + /* Set sndbuf and rcvbuf of passive connections */ + rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize, + sizeof(bufsize)); + rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, + &bufsize, sizeof(bufsize)); + break; + default: + rv = -1; + } +#ifdef DEBUG + bpf_printk("Returning %d\n", rv); +#endif + skops->reply = rv; + return 1; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/tcp_clamp_kern.c b/samples/bpf/tcp_clamp_kern.c new file mode 100644 index 000000000..e88bd9ab0 --- /dev/null +++ b/samples/bpf/tcp_clamp_kern.c @@ -0,0 +1,97 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU 
General Public + * License as published by the Free Software Foundation. + * + * Sample BPF program to set send and receive buffers to 150KB, sndcwnd clamp + * to 100 packets and SYN and SYN_ACK RTOs to 10ms when both hosts are within + * the same datacenter. For this example, we assume they are within the same + * datacenter when the first 5.5 bytes of their IPv6 addresses are the same. + * + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. + */ + +#include <uapi/linux/bpf.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/if_packet.h> +#include <uapi/linux/ip.h> +#include <linux/socket.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#define DEBUG 1 + +SEC("sockops") +int bpf_clamp(struct bpf_sock_ops *skops) +{ + int bufsize = 150000; + int to_init = 10; + int clamp = 100; + int rv = 0; + int op; + + /* For testing purposes, only execute the rest of the BPF program + * if either port number is 55601 + */ + if (bpf_ntohl(skops->remote_port) != 55601 && skops->local_port != 55601) { + skops->reply = -1; + /* return 1 like the other exit path; the result goes in reply */ + return 1; + } + + op = (int) skops->op; + +#ifdef DEBUG + bpf_printk("BPF command: %d\n", op); +#endif + + /* Check that both hosts are within same datacenter. For this example + * it is the case when the first 5.5 bytes of their IPv6 addresses are + * the same. 
+ */ + if (skops->family == AF_INET6 && + skops->local_ip6[0] == skops->remote_ip6[0] && + (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) == + (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) { + switch (op) { + case BPF_SOCK_OPS_TIMEOUT_INIT: + rv = to_init; + break; + case BPF_SOCK_OPS_TCP_CONNECT_CB: + /* Set sndbuf and rcvbuf of active connections */ + rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, + &bufsize, sizeof(bufsize)); + rv += bpf_setsockopt(skops, SOL_SOCKET, + SO_RCVBUF, &bufsize, + sizeof(bufsize)); + break; + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: + rv = bpf_setsockopt(skops, SOL_TCP, + TCP_BPF_SNDCWND_CLAMP, + &clamp, sizeof(clamp)); + break; + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + /* Set sndbuf and rcvbuf of passive connections */ + rv = bpf_setsockopt(skops, SOL_TCP, + TCP_BPF_SNDCWND_CLAMP, + &clamp, sizeof(clamp)); + rv += bpf_setsockopt(skops, SOL_SOCKET, + SO_SNDBUF, &bufsize, + sizeof(bufsize)); + rv += bpf_setsockopt(skops, SOL_SOCKET, + SO_RCVBUF, &bufsize, + sizeof(bufsize)); + break; + default: + rv = -1; + } + } else { + rv = -1; + } +#ifdef DEBUG + bpf_printk("Returning %d\n", rv); +#endif + skops->reply = rv; + return 1; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/tcp_cong_kern.c b/samples/bpf/tcp_cong_kern.c new file mode 100644 index 000000000..2311fc9dd --- /dev/null +++ b/samples/bpf/tcp_cong_kern.c @@ -0,0 +1,78 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * BPF program to set congestion control to dctcp when both hosts are + * in the same datacenter (as deteremined by IPv6 prefix). + * + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. 
+ */ + +#include <uapi/linux/bpf.h> +#include <uapi/linux/tcp.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/if_packet.h> +#include <uapi/linux/ip.h> +#include <linux/socket.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#define DEBUG 1 + +SEC("sockops") +int bpf_cong(struct bpf_sock_ops *skops) +{ + char cong[] = "dctcp"; + int rv = 0; + int op; + + /* For testing purposes, only execute the rest of the BPF program + * if either port number is 55601 + */ + if (bpf_ntohl(skops->remote_port) != 55601 && + skops->local_port != 55601) { + skops->reply = -1; + return 1; + } + + op = (int) skops->op; + +#ifdef DEBUG + bpf_printk("BPF command: %d\n", op); +#endif + + /* Check if both hosts are in the same datacenter. For this + * example they are if the 1st 5.5 bytes in the IPv6 address + * are the same. + */ + if (skops->family == AF_INET6 && + skops->local_ip6[0] == skops->remote_ip6[0] && + (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) == + (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) { + switch (op) { + case BPF_SOCK_OPS_NEEDS_ECN: + rv = 1; + break; + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: + rv = bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION, + cong, sizeof(cong)); + break; + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + rv = bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION, + cong, sizeof(cong)); + break; + default: + rv = -1; + } + } else { + rv = -1; + } +#ifdef DEBUG + bpf_printk("Returning %d\n", rv); +#endif + skops->reply = rv; + return 1; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/tcp_dumpstats_kern.c b/samples/bpf/tcp_dumpstats_kern.c new file mode 100644 index 000000000..e80d3afd2 --- /dev/null +++ b/samples/bpf/tcp_dumpstats_kern.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Refer to samples/bpf/tcp_bpf.readme for the instructions on + * how to run this sample program. 
+ */ +#include <linux/bpf.h> + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#define INTERVAL 1000000000ULL + +int _version SEC("version") = 1; +char _license[] SEC("license") = "GPL"; + +struct { + __u32 type; + __u32 map_flags; + int *key; + __u64 *value; +} bpf_next_dump SEC(".maps") = { + .type = BPF_MAP_TYPE_SK_STORAGE, + .map_flags = BPF_F_NO_PREALLOC, +}; + +SEC("sockops") +int _sockops(struct bpf_sock_ops *ctx) +{ + struct bpf_tcp_sock *tcp_sk; + struct bpf_sock *sk; + __u64 *next_dump; + __u64 now; + + switch (ctx->op) { + case BPF_SOCK_OPS_TCP_CONNECT_CB: + bpf_sock_ops_cb_flags_set(ctx, BPF_SOCK_OPS_RTT_CB_FLAG); + return 1; + case BPF_SOCK_OPS_RTT_CB: + break; + default: + return 1; + } + + sk = ctx->sk; + if (!sk) + return 1; + + next_dump = bpf_sk_storage_get(&bpf_next_dump, sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + if (!next_dump) + return 1; + + now = bpf_ktime_get_ns(); + if (now < *next_dump) + return 1; + + tcp_sk = bpf_tcp_sock(sk); + if (!tcp_sk) + return 1; + + *next_dump = now + INTERVAL; + + bpf_printk("dsack_dups=%u delivered=%u\n", + tcp_sk->dsack_dups, tcp_sk->delivered); + bpf_printk("delivered_ce=%u icsk_retransmits=%u\n", + tcp_sk->delivered_ce, tcp_sk->icsk_retransmits); + + return 1; +} diff --git a/samples/bpf/tcp_iw_kern.c b/samples/bpf/tcp_iw_kern.c new file mode 100644 index 000000000..d14445573 --- /dev/null +++ b/samples/bpf/tcp_iw_kern.c @@ -0,0 +1,83 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * BPF program to set initial congestion window and initial receive + * window to 40 packets and send and receive buffers to 1.5MB. This + * would usually be done after doing appropriate checks that indicate + * the hosts are far enough away (i.e. large RTT). 
+ * + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. + */ + +#include <uapi/linux/bpf.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/if_packet.h> +#include <uapi/linux/ip.h> +#include <linux/socket.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#define DEBUG 1 + +SEC("sockops") +int bpf_iw(struct bpf_sock_ops *skops) +{ + int bufsize = 1500000; + int rwnd_init = 40; + int iw = 40; + int rv = 0; + int op; + + /* For testing purposes, only execute rest of BPF program + * if either port number is 55601 + */ + if (bpf_ntohl(skops->remote_port) != 55601 && + skops->local_port != 55601) { + skops->reply = -1; + return 1; + } + + op = (int) skops->op; + +#ifdef DEBUG + bpf_printk("BPF command: %d\n", op); +#endif + + /* Usually there would be a check to ensure the hosts are far + * from each other so it makes sense to increase buffer sizes + */ + switch (op) { + case BPF_SOCK_OPS_RWND_INIT: + rv = rwnd_init; + break; + case BPF_SOCK_OPS_TCP_CONNECT_CB: + /* Set sndbuf and rcvbuf of active connections */ + rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize, + sizeof(bufsize)); + rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, + &bufsize, sizeof(bufsize)); + break; + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: + rv = bpf_setsockopt(skops, SOL_TCP, TCP_BPF_IW, &iw, + sizeof(iw)); + break; + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + /* Set sndbuf and rcvbuf of passive connections */ + rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize, + sizeof(bufsize)); + rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, + &bufsize, sizeof(bufsize)); + break; + default: + rv = -1; + } +#ifdef DEBUG + bpf_printk("Returning %d\n", rv); +#endif + skops->reply = rv; + return 1; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/tcp_rwnd_kern.c b/samples/bpf/tcp_rwnd_kern.c new file mode 100644 index 000000000..223d9c23b --- /dev/null +++ b/samples/bpf/tcp_rwnd_kern.c @@ -0,0 +1,64 @@ +/* Copyright (c) 
2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * BPF program to set initial receive window to 40 packets when using IPv6 + * and the first 5.5 bytes of the IPv6 addresses are not the same (in this + * example that means both hosts are not the same datacenter). + * + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. + */ + +#include <uapi/linux/bpf.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/if_packet.h> +#include <uapi/linux/ip.h> +#include <linux/socket.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#define DEBUG 1 + +SEC("sockops") +int bpf_rwnd(struct bpf_sock_ops *skops) +{ + int rv = -1; + int op; + + /* For testing purposes, only execute rest of BPF program + * if either port number is 55601 + */ + if (bpf_ntohl(skops->remote_port) != + 55601 && skops->local_port != 55601) { + skops->reply = -1; + return 1; + } + + op = (int) skops->op; + +#ifdef DEBUG + bpf_printk("BPF command: %d\n", op); +#endif + + /* Check for RWND_INIT operation and IPv6 addresses */ + if (op == BPF_SOCK_OPS_RWND_INIT && + skops->family == AF_INET6) { + + /* If the first 5.5 bytes of the IPv6 address are not the same + * then both hosts are not in the same datacenter + * so use a larger initial advertized window (40 packets) + */ + if (skops->local_ip6[0] != skops->remote_ip6[0] || + (bpf_ntohl(skops->local_ip6[1]) & 0xfffff000) != + (bpf_ntohl(skops->remote_ip6[1]) & 0xfffff000)) + rv = 40; + } +#ifdef DEBUG + bpf_printk("Returning %d\n", rv); +#endif + skops->reply = rv; + return 1; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/tcp_synrto_kern.c b/samples/bpf/tcp_synrto_kern.c new file mode 100644 index 000000000..d58004eef --- /dev/null +++ b/samples/bpf/tcp_synrto_kern.c @@ -0,0 +1,64 @@ +/* Copyright (c) 2017 Facebook + * + * This 
program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * BPF program to set SYN and SYN-ACK RTOs to 10ms when using IPv6 addresses + * and the first 5.5 bytes of the IPv6 addresses are the same (in this example + * that means both hosts are in the same datacenter). + * + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. + */ + +#include <uapi/linux/bpf.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/if_packet.h> +#include <uapi/linux/ip.h> +#include <linux/socket.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#define DEBUG 1 + +SEC("sockops") +int bpf_synrto(struct bpf_sock_ops *skops) +{ + int rv = -1; + int op; + + /* For testing purposes, only execute rest of BPF program + * if either port number is 55601 + */ + if (bpf_ntohl(skops->remote_port) != 55601 && + skops->local_port != 55601) { + skops->reply = -1; + return 1; + } + + op = (int) skops->op; + +#ifdef DEBUG + bpf_printk("BPF command: %d\n", op); +#endif + + /* Check for TIMEOUT_INIT operation and IPv6 addresses */ + if (op == BPF_SOCK_OPS_TIMEOUT_INIT && + skops->family == AF_INET6) { + + /* If the first 5.5 bytes of the IPv6 address are the same + * then both hosts are in the same datacenter + * so use an RTO of 10ms + */ + if (skops->local_ip6[0] == skops->remote_ip6[0] && + (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) == + (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) + rv = 10; + } +#ifdef DEBUG + bpf_printk("Returning %d\n", rv); +#endif + skops->reply = rv; + return 1; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/tcp_tos_reflect_kern.c b/samples/bpf/tcp_tos_reflect_kern.c new file mode 100644 index 000000000..953fedc79 --- /dev/null +++ b/samples/bpf/tcp_tos_reflect_kern.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018 Facebook + * + * BPF program to 
automatically reflect TOS option from received syn packet + * + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. + */ + +#include <uapi/linux/bpf.h> +#include <uapi/linux/tcp.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/if_packet.h> +#include <uapi/linux/ip.h> +#include <uapi/linux/ipv6.h> +#include <uapi/linux/in.h> +#include <linux/socket.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#define DEBUG 1 + +SEC("sockops") +int bpf_basertt(struct bpf_sock_ops *skops) +{ + char header[sizeof(struct ipv6hdr)]; + struct ipv6hdr *hdr6; + struct iphdr *hdr; + int hdr_size = 0; + int save_syn = 1; + int tos = 0; + int rv = 0; + int op; + + op = (int) skops->op; + +#ifdef DEBUG + bpf_printk("BPF command: %d\n", op); +#endif + switch (op) { + case BPF_SOCK_OPS_TCP_LISTEN_CB: + rv = bpf_setsockopt(skops, SOL_TCP, TCP_SAVE_SYN, + &save_syn, sizeof(save_syn)); + break; + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + if (skops->family == AF_INET) + hdr_size = sizeof(struct iphdr); + else + hdr_size = sizeof(struct ipv6hdr); + rv = bpf_getsockopt(skops, SOL_TCP, TCP_SAVED_SYN, + header, hdr_size); + if (!rv) { + if (skops->family == AF_INET) { + hdr = (struct iphdr *) header; + tos = hdr->tos; + if (tos != 0) + bpf_setsockopt(skops, SOL_IP, IP_TOS, + &tos, sizeof(tos)); + } else { + hdr6 = (struct ipv6hdr *) header; + tos = ((hdr6->priority) << 4 | + (hdr6->flow_lbl[0]) >> 4); + if (tos) + bpf_setsockopt(skops, SOL_IPV6, + IPV6_TCLASS, + &tos, sizeof(tos)); + } + rv = 0; + } + break; + default: + rv = -1; + } +#ifdef DEBUG + bpf_printk("Returning %d\n", rv); +#endif + skops->reply = rv; + return 1; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/test_cgrp2_array_pin.c b/samples/bpf/test_cgrp2_array_pin.c new file mode 100644 index 000000000..6d564aa75 --- /dev/null +++ b/samples/bpf/test_cgrp2_array_pin.c @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2016 Facebook + */ 
+#include <linux/unistd.h> +#include <linux/bpf.h> + +#include <stdio.h> +#include <stdint.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> +#include <fcntl.h> + +#include <bpf/bpf.h> + +static void usage(void) +{ + printf("Usage: test_cgrp2_array_pin [...]\n"); + printf(" -F <file> File to pin an BPF cgroup array\n"); + printf(" -U <file> Update an already pinned BPF cgroup array\n"); + printf(" -v <value> Full path of the cgroup2\n"); + printf(" -h Display this help\n"); +} + +int main(int argc, char **argv) +{ + const char *pinned_file = NULL, *cg2 = NULL; + int create_array = 1; + int array_key = 0; + int array_fd = -1; + int cg2_fd = -1; + int ret = -1; + int opt; + + while ((opt = getopt(argc, argv, "F:U:v:")) != -1) { + switch (opt) { + /* General args */ + case 'F': + pinned_file = optarg; + break; + case 'U': + pinned_file = optarg; + create_array = 0; + break; + case 'v': + cg2 = optarg; + break; + default: + usage(); + goto out; + } + } + + if (!cg2 || !pinned_file) { + usage(); + goto out; + } + + cg2_fd = open(cg2, O_RDONLY); + if (cg2_fd < 0) { + fprintf(stderr, "open(%s,...): %s(%d)\n", + cg2, strerror(errno), errno); + goto out; + } + + if (create_array) { + array_fd = bpf_create_map(BPF_MAP_TYPE_CGROUP_ARRAY, + sizeof(uint32_t), sizeof(uint32_t), + 1, 0); + if (array_fd < 0) { + fprintf(stderr, + "bpf_create_map(BPF_MAP_TYPE_CGROUP_ARRAY,...): %s(%d)\n", + strerror(errno), errno); + goto out; + } + } else { + array_fd = bpf_obj_get(pinned_file); + if (array_fd < 0) { + fprintf(stderr, "bpf_obj_get(%s): %s(%d)\n", + pinned_file, strerror(errno), errno); + goto out; + } + } + + ret = bpf_map_update_elem(array_fd, &array_key, &cg2_fd, 0); + if (ret) { + perror("bpf_map_update_elem"); + goto out; + } + + if (create_array) { + ret = bpf_obj_pin(array_fd, pinned_file); + if (ret) { + fprintf(stderr, "bpf_obj_pin(..., %s): %s(%d)\n", + pinned_file, strerror(errno), errno); + goto out; + } + } + +out: + if (array_fd != -1) + 
close(array_fd); + if (cg2_fd != -1) + close(cg2_fd); + return ret; +} diff --git a/samples/bpf/test_cgrp2_attach.c b/samples/bpf/test_cgrp2_attach.c new file mode 100644 index 000000000..20fbd1241 --- /dev/null +++ b/samples/bpf/test_cgrp2_attach.c @@ -0,0 +1,172 @@ +/* eBPF example program: + * + * - Creates arraymap in kernel with 4 bytes keys and 8 byte values + * + * - Loads eBPF program + * + * The eBPF program accesses the map passed in to store two pieces of + * information. The number of invocations of the program, which maps + * to the number of packets received, is stored to key 0. Key 1 is + * incremented on each iteration by the number of bytes stored in + * the skb. + * + * - Attaches the new program to a cgroup using BPF_PROG_ATTACH + * + * - Every second, reads map[0] and map[1] to see how many bytes and + * packets were seen on any socket of tasks in the given cgroup. + */ + +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <string.h> +#include <unistd.h> +#include <assert.h> +#include <errno.h> +#include <fcntl.h> + +#include <linux/bpf.h> +#include <bpf/bpf.h> + +#include "bpf_insn.h" + +enum { + MAP_KEY_PACKETS, + MAP_KEY_BYTES, +}; + +char bpf_log_buf[BPF_LOG_BUF_SIZE]; + +static int prog_load(int map_fd, int verdict) +{ + struct bpf_insn prog[] = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), /* save r6 so it's not clobbered by BPF_CALL */ + + /* Count packets */ + BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */ + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ + BPF_LD_MAP_FD(BPF_REG_1, map_fd), /* load map fd to r1 */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */ + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + + /* 
Count bytes */ + BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */ + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ + BPF_LD_MAP_FD(BPF_REG_1, map_fd), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */ + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + + BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */ + BPF_EXIT_INSN(), + }; + size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); + + return bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB, + prog, insns_cnt, "GPL", 0, + bpf_log_buf, BPF_LOG_BUF_SIZE); +} + +static int usage(const char *argv0) +{ + printf("Usage: %s [-d] [-D] <cg-path> <egress|ingress>\n", argv0); + printf(" -d Drop Traffic\n"); + printf(" -D Detach filter, and exit\n"); + return EXIT_FAILURE; +} + +static int attach_filter(int cg_fd, int type, int verdict) +{ + int prog_fd, map_fd, ret, key; + long long pkt_cnt, byte_cnt; + + map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, + sizeof(key), sizeof(byte_cnt), + 256, 0); + if (map_fd < 0) { + printf("Failed to create map: '%s'\n", strerror(errno)); + return EXIT_FAILURE; + } + + prog_fd = prog_load(map_fd, verdict); + printf("Output from kernel verifier:\n%s\n-------\n", bpf_log_buf); + + if (prog_fd < 0) { + printf("Failed to load prog: '%s'\n", strerror(errno)); + return EXIT_FAILURE; + } + + ret = bpf_prog_attach(prog_fd, cg_fd, type, 0); + if (ret < 0) { + printf("Failed to attach prog to cgroup: '%s'\n", + strerror(errno)); + return EXIT_FAILURE; + } + while (1) { + key = MAP_KEY_PACKETS; + assert(bpf_map_lookup_elem(map_fd, &key, &pkt_cnt) == 0); + + key = MAP_KEY_BYTES; + assert(bpf_map_lookup_elem(map_fd, &key, &byte_cnt) == 0); + + printf("cgroup received %lld 
packets, %lld bytes\n", + pkt_cnt, byte_cnt); + sleep(1); + } + + return EXIT_SUCCESS; +} + +int main(int argc, char **argv) +{ + int detach_only = 0, verdict = 1; + enum bpf_attach_type type; + int opt, cg_fd, ret; + + while ((opt = getopt(argc, argv, "Dd")) != -1) { + switch (opt) { + case 'd': + verdict = 0; + break; + case 'D': + detach_only = 1; + break; + default: + return usage(argv[0]); + } + } + + if (argc - optind < 2) + return usage(argv[0]); + + if (strcmp(argv[optind + 1], "ingress") == 0) + type = BPF_CGROUP_INET_INGRESS; + else if (strcmp(argv[optind + 1], "egress") == 0) + type = BPF_CGROUP_INET_EGRESS; + else + return usage(argv[0]); + + cg_fd = open(argv[optind], O_DIRECTORY | O_RDONLY); + if (cg_fd < 0) { + printf("Failed to open cgroup path: '%s'\n", strerror(errno)); + return EXIT_FAILURE; + } + + if (detach_only) { + ret = bpf_prog_detach(cg_fd, type); + printf("bpf_prog_detach() returned '%s' (%d)\n", + strerror(errno), errno); + } else + ret = attach_filter(cg_fd, type, verdict); + + return ret; +} diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c new file mode 100644 index 000000000..b0811da5a --- /dev/null +++ b/samples/bpf/test_cgrp2_sock.c @@ -0,0 +1,290 @@ +/* eBPF example program: + * + * - Loads eBPF program + * + * The eBPF program sets the sk_bound_dev_if index in new AF_INET{6} + * sockets opened by processes in the cgroup. 
+ * + * - Attaches the new program to a cgroup using BPF_PROG_ATTACH + */ + +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <string.h> +#include <unistd.h> +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <net/if.h> +#include <inttypes.h> +#include <linux/bpf.h> +#include <bpf/bpf.h> + +#include "bpf_insn.h" + +char bpf_log_buf[BPF_LOG_BUF_SIZE]; + +static int prog_load(__u32 idx, __u32 mark, __u32 prio) +{ + /* save pointer to context */ + struct bpf_insn prog_start[] = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + }; + struct bpf_insn prog_end[] = { + BPF_MOV64_IMM(BPF_REG_0, 1), /* r0 = verdict */ + BPF_EXIT_INSN(), + }; + + /* set sk_bound_dev_if on socket */ + struct bpf_insn prog_dev[] = { + BPF_MOV64_IMM(BPF_REG_3, idx), + BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, bound_dev_if)), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, bound_dev_if)), + }; + + /* set mark on socket */ + struct bpf_insn prog_mark[] = { + /* get uid of process */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_current_uid_gid), + BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xffffffff), + + /* if uid is 0, use given mark, else use the uid as the mark */ + BPF_MOV64_REG(BPF_REG_3, BPF_REG_0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_MOV64_IMM(BPF_REG_3, mark), + + /* set the mark on the new socket */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, mark)), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, mark)), + }; + + /* set priority on socket */ + struct bpf_insn prog_prio[] = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_3, prio), + BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, priority)), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, priority)), + }; + + struct bpf_insn *prog; + size_t insns_cnt; + void *p; + int ret; + + insns_cnt = sizeof(prog_start) + sizeof(prog_end); 
+ if (idx) + insns_cnt += sizeof(prog_dev); + + if (mark) + insns_cnt += sizeof(prog_mark); + + if (prio) + insns_cnt += sizeof(prog_prio); + + p = prog = malloc(insns_cnt); + if (!prog) { + fprintf(stderr, "Failed to allocate memory for instructions\n"); + return EXIT_FAILURE; + } + + memcpy(p, prog_start, sizeof(prog_start)); + p += sizeof(prog_start); + + if (idx) { + memcpy(p, prog_dev, sizeof(prog_dev)); + p += sizeof(prog_dev); + } + + if (mark) { + memcpy(p, prog_mark, sizeof(prog_mark)); + p += sizeof(prog_mark); + } + + if (prio) { + memcpy(p, prog_prio, sizeof(prog_prio)); + p += sizeof(prog_prio); + } + + memcpy(p, prog_end, sizeof(prog_end)); + p += sizeof(prog_end); + + insns_cnt /= sizeof(struct bpf_insn); + + ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SOCK, prog, insns_cnt, + "GPL", 0, bpf_log_buf, BPF_LOG_BUF_SIZE); + + free(prog); + + return ret; +} + +static int get_bind_to_device(int sd, char *name, size_t len) +{ + socklen_t optlen = len; + int rc; + + name[0] = '\0'; + rc = getsockopt(sd, SOL_SOCKET, SO_BINDTODEVICE, name, &optlen); + if (rc < 0) + perror("setsockopt(SO_BINDTODEVICE)"); + + return rc; +} + +static unsigned int get_somark(int sd) +{ + unsigned int mark = 0; + socklen_t optlen = sizeof(mark); + int rc; + + rc = getsockopt(sd, SOL_SOCKET, SO_MARK, &mark, &optlen); + if (rc < 0) + perror("getsockopt(SO_MARK)"); + + return mark; +} + +static unsigned int get_priority(int sd) +{ + unsigned int prio = 0; + socklen_t optlen = sizeof(prio); + int rc; + + rc = getsockopt(sd, SOL_SOCKET, SO_PRIORITY, &prio, &optlen); + if (rc < 0) + perror("getsockopt(SO_PRIORITY)"); + + return prio; +} + +static int show_sockopts(int family) +{ + unsigned int mark, prio; + char name[16]; + int sd; + + sd = socket(family, SOCK_DGRAM, 17); + if (sd < 0) { + perror("socket"); + return 1; + } + + if (get_bind_to_device(sd, name, sizeof(name)) < 0) + return 1; + + mark = get_somark(sd); + prio = get_priority(sd); + + close(sd); + + printf("sd %d: dev %s, 
mark %u, priority %u\n", sd, name, mark, prio); + + return 0; +} + +static int usage(const char *argv0) +{ + printf("Usage:\n"); + printf(" Attach a program\n"); + printf(" %s -b bind-to-dev -m mark -p prio cg-path\n", argv0); + printf("\n"); + printf(" Detach a program\n"); + printf(" %s -d cg-path\n", argv0); + printf("\n"); + printf(" Show inherited socket settings (mark, priority, and device)\n"); + printf(" %s [-6]\n", argv0); + return EXIT_FAILURE; +} + +int main(int argc, char **argv) +{ + __u32 idx = 0, mark = 0, prio = 0; + const char *cgrp_path = NULL; + int cg_fd, prog_fd, ret; + int family = PF_INET; + int do_attach = 1; + int rc; + + while ((rc = getopt(argc, argv, "db:m:p:6")) != -1) { + switch (rc) { + case 'd': + do_attach = 0; + break; + case 'b': + idx = if_nametoindex(optarg); + if (!idx) { + idx = strtoumax(optarg, NULL, 0); + if (!idx) { + printf("Invalid device name\n"); + return EXIT_FAILURE; + } + } + break; + case 'm': + mark = strtoumax(optarg, NULL, 0); + break; + case 'p': + prio = strtoumax(optarg, NULL, 0); + break; + case '6': + family = PF_INET6; + break; + default: + return usage(argv[0]); + } + } + + if (optind == argc) + return show_sockopts(family); + + cgrp_path = argv[optind]; + if (!cgrp_path) { + fprintf(stderr, "cgroup path not given\n"); + return EXIT_FAILURE; + } + + if (do_attach && !idx && !mark && !prio) { + fprintf(stderr, + "One of device, mark or priority must be given\n"); + return EXIT_FAILURE; + } + + cg_fd = open(cgrp_path, O_DIRECTORY | O_RDONLY); + if (cg_fd < 0) { + printf("Failed to open cgroup path: '%s'\n", strerror(errno)); + return EXIT_FAILURE; + } + + if (do_attach) { + prog_fd = prog_load(idx, mark, prio); + if (prog_fd < 0) { + printf("Failed to load prog: '%s'\n", strerror(errno)); + printf("Output from kernel verifier:\n%s\n-------\n", + bpf_log_buf); + return EXIT_FAILURE; + } + + ret = bpf_prog_attach(prog_fd, cg_fd, + BPF_CGROUP_INET_SOCK_CREATE, 0); + if (ret < 0) { + printf("Failed to attach 
prog to cgroup: '%s'\n", + strerror(errno)); + return EXIT_FAILURE; + } + } else { + ret = bpf_prog_detach(cg_fd, BPF_CGROUP_INET_SOCK_CREATE); + if (ret < 0) { + printf("Failed to detach prog from cgroup: '%s'\n", + strerror(errno)); + return EXIT_FAILURE; + } + } + + close(cg_fd); + return EXIT_SUCCESS; +} diff --git a/samples/bpf/test_cgrp2_sock.sh b/samples/bpf/test_cgrp2_sock.sh new file mode 100755 index 000000000..9f6174236 --- /dev/null +++ b/samples/bpf/test_cgrp2_sock.sh @@ -0,0 +1,135 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +# Test various socket options that can be set by attaching programs to cgroups. + +CGRP_MNT="/tmp/cgroupv2-test_cgrp2_sock" + +################################################################################ +# +print_result() +{ + local rc=$1 + local status=" OK " + + [ $rc -ne 0 ] && status="FAIL" + + printf "%-50s [%4s]\n" "$2" "$status" +} + +check_sock() +{ + out=$(test_cgrp2_sock) + echo $out | grep -q "$1" + if [ $? -ne 0 ]; then + print_result 1 "IPv4: $2" + echo " expected: $1" + echo " have: $out" + rc=1 + else + print_result 0 "IPv4: $2" + fi +} + +check_sock6() +{ + out=$(test_cgrp2_sock -6) + echo $out | grep -q "$1" + if [ $? 
-ne 0 ]; then + print_result 1 "IPv6: $2" + echo " expected: $1" + echo " have: $out" + rc=1 + else + print_result 0 "IPv6: $2" + fi +} + +################################################################################ +# + +cleanup() +{ + echo $$ >> ${CGRP_MNT}/cgroup.procs + rmdir ${CGRP_MNT}/sockopts +} + +cleanup_and_exit() +{ + local rc=$1 + local msg="$2" + + [ -n "$msg" ] && echo "ERROR: $msg" + + test_cgrp2_sock -d ${CGRP_MNT}/sockopts + ip li del cgrp2_sock + umount ${CGRP_MNT} + + exit $rc +} + + +################################################################################ +# main + +rc=0 + +ip li add cgrp2_sock type dummy 2>/dev/null + +set -e +mkdir -p ${CGRP_MNT} +mount -t cgroup2 none ${CGRP_MNT} +set +e + + +# make sure we have a known start point +cleanup 2>/dev/null + +mkdir -p ${CGRP_MNT}/sockopts +[ $? -ne 0 ] && cleanup_and_exit 1 "Failed to create cgroup hierarchy" + + +# set pid into cgroup +echo $$ > ${CGRP_MNT}/sockopts/cgroup.procs + +# no bpf program attached, so socket should show no settings +check_sock "dev , mark 0, priority 0" "No programs attached" +check_sock6 "dev , mark 0, priority 0" "No programs attached" + +# verify device is set +# +test_cgrp2_sock -b cgrp2_sock ${CGRP_MNT}/sockopts +if [ $? -ne 0 ]; then + cleanup_and_exit 1 "Failed to install program to set device" +fi +check_sock "dev cgrp2_sock, mark 0, priority 0" "Device set" +check_sock6 "dev cgrp2_sock, mark 0, priority 0" "Device set" + +# verify mark is set +# +test_cgrp2_sock -m 666 ${CGRP_MNT}/sockopts +if [ $? -ne 0 ]; then + cleanup_and_exit 1 "Failed to install program to set mark" +fi +check_sock "dev , mark 666, priority 0" "Mark set" +check_sock6 "dev , mark 666, priority 0" "Mark set" + +# verify priority is set +# +test_cgrp2_sock -p 123 ${CGRP_MNT}/sockopts +if [ $? 
-ne 0 ]; then + cleanup_and_exit 1 "Failed to install program to set priority" +fi +check_sock "dev , mark 0, priority 123" "Priority set" +check_sock6 "dev , mark 0, priority 123" "Priority set" + +# all 3 at once +# +test_cgrp2_sock -b cgrp2_sock -m 666 -p 123 ${CGRP_MNT}/sockopts +if [ $? -ne 0 ]; then + cleanup_and_exit 1 "Failed to install program to set device, mark and priority" +fi +check_sock "dev cgrp2_sock, mark 666, priority 123" "Priority set" +check_sock6 "dev cgrp2_sock, mark 666, priority 123" "Priority set" + +cleanup_and_exit $rc diff --git a/samples/bpf/test_cgrp2_sock2.c b/samples/bpf/test_cgrp2_sock2.c new file mode 100644 index 000000000..a9277b118 --- /dev/null +++ b/samples/bpf/test_cgrp2_sock2.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +/* eBPF example program: + * + * - Loads eBPF program + * + * The eBPF program loads a filter from file and attaches the + * program to a cgroup using BPF_PROG_ATTACH + */ + +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <string.h> +#include <unistd.h> +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <net/if.h> +#include <linux/bpf.h> +#include <bpf/bpf.h> + +#include "bpf_insn.h" +#include "bpf_load.h" + +static int usage(const char *argv0) +{ + printf("Usage: %s cg-path filter-path [filter-id]\n", argv0); + return EXIT_FAILURE; +} + +int main(int argc, char **argv) +{ + int cg_fd, ret, filter_id = 0; + + if (argc < 3) + return usage(argv[0]); + + cg_fd = open(argv[1], O_DIRECTORY | O_RDONLY); + if (cg_fd < 0) { + printf("Failed to open cgroup path: '%s'\n", strerror(errno)); + return EXIT_FAILURE; + } + + if (load_bpf_file(argv[2])) + return EXIT_FAILURE; + + printf("Output from kernel verifier:\n%s\n-------\n", bpf_log_buf); + + if (argc > 3) + filter_id = atoi(argv[3]); + + if (filter_id >= prog_cnt) { + printf("Invalid program id; program not found in file\n"); + return EXIT_FAILURE; + } + + ret = 
bpf_prog_attach(prog_fd[filter_id], cg_fd, + BPF_CGROUP_INET_SOCK_CREATE, 0); + if (ret < 0) { + printf("Failed to attach prog to cgroup: '%s'\n", + strerror(errno)); + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} diff --git a/samples/bpf/test_cgrp2_sock2.sh b/samples/bpf/test_cgrp2_sock2.sh new file mode 100755 index 000000000..0f396a86e --- /dev/null +++ b/samples/bpf/test_cgrp2_sock2.sh @@ -0,0 +1,85 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +function config_device { + ip netns add at_ns0 + ip link add veth0 type veth peer name veth0b + ip link set veth0b up + ip link set veth0 netns at_ns0 + ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0 + ip netns exec at_ns0 ip addr add 2401:db00::1/64 dev veth0 nodad + ip netns exec at_ns0 ip link set dev veth0 up + ip addr add 172.16.1.101/24 dev veth0b + ip addr add 2401:db00::2/64 dev veth0b nodad +} + +function config_cgroup { + rm -rf /tmp/cgroupv2 + mkdir -p /tmp/cgroupv2 + mount -t cgroup2 none /tmp/cgroupv2 + mkdir -p /tmp/cgroupv2/foo + echo $$ >> /tmp/cgroupv2/foo/cgroup.procs +} + + +function attach_bpf { + test_cgrp2_sock2 /tmp/cgroupv2/foo sock_flags_kern.o $1 + [ $? -ne 0 ] && exit 1 +} + +function cleanup { + if [ -d /tmp/cgroupv2/foo ]; then + test_cgrp2_sock -d /tmp/cgroupv2/foo + fi + ip link del veth0b + ip netns delete at_ns0 + umount /tmp/cgroupv2 + rm -rf /tmp/cgroupv2 +} + +cleanup 2>/dev/null + +set -e +config_device +config_cgroup +set +e + +# +# Test 1 - fail ping6 +# +attach_bpf 0 +ping -c1 -w1 172.16.1.100 +if [ $? -ne 0 ]; then + echo "ping failed when it should succeed" + cleanup + exit 1 +fi + +ping6 -c1 -w1 2401:db00::1 +if [ $? -eq 0 ]; then + echo "ping6 succeeded when it should not" + cleanup + exit 1 +fi + +# +# Test 2 - fail ping +# +attach_bpf 1 +ping6 -c1 -w1 2401:db00::1 +if [ $? -ne 0 ]; then + echo "ping6 failed when it should succeed" + cleanup + exit 1 +fi + +ping -c1 -w1 172.16.1.100 +if [ $? 
# Create ("start") or tear down (any other argument) the test network:
# a veth pair with one end ($HOST_IFC) in the host and the other end
# ($NS_IFC) moved into the network namespace $NS, plus a clsact qdisc on
# the host end with the BPF program attached on egress.
#
# On "start" the first failing command's exit status is returned.
# Teardown is best effort: failures are ignored so that a partially
# set-up environment can still be cleaned.
setup_net() {
    case $1 in
	start)
	    $IP link add "$HOST_IFC" type veth peer name "$NS_IFC" || return $?
	    $IP link set dev "$HOST_IFC" up || return $?
	    sysctl -q "net.ipv6.conf.$HOST_IFC.accept_dad=0"

	    # Use $NS everywhere: the namespace name used to be spelled
	    # literally here while teardown removed "$NS", so the two only
	    # agreed as long as NS happened to be 'ns'.
	    $IP netns add "$NS" || return $?
	    $IP link set dev "$NS_IFC" netns "$NS" || return $?
	    $IP -n "$NS" link set dev "$NS_IFC" up || return $?
	    $IP netns exec "$NS" sysctl -q "net.ipv6.conf.$NS_IFC.accept_dad=0"
	    $TC qdisc add dev "$HOST_IFC" clsact || return $?
	    $TC filter add dev "$HOST_IFC" egress bpf da obj "$BPF_PROG" sec "$BPF_SECTION" || return $?
	    ;;
	*)
	    $IP netns del "$NS"
	    $IP link del "$HOST_IFC"
	    ;;
    esac
}
/*
 * tc egress classifier.
 *
 * Returns TC_ACT_SHOT only for packets that carry an IPv6 header whose
 * next header is ICMPv6 and for which bpf_skb_under_cgroup() reports 1
 * against slot 0 of test_cgrp2_array_pin.  Every other packet is let
 * through with TC_ACT_OK.  Each decision is logged via
 * bpf_trace_printk().
 */
SEC("filter")
int handle_egress(struct __sk_buff *skb)
{
	void *data = (void *)(long)skb->data;
	struct eth_hdr *eth = data;
	struct ipv6hdr *ip6h = data + sizeof(*eth);
	void *data_end = (void *)(long)skb->data_end;
	/* Format strings live in local arrays and are passed with their size. */
	char dont_care_msg[] = "dont care %04x %d\n";
	char pass_msg[] = "pass\n";
	char reject_msg[] = "reject\n";

	/*
	 * Single length check: make sure both the Ethernet and the IPv6
	 * header lie inside [data, data_end) before either is read.
	 * Packets too short for that are passed untouched.
	 */
	if (data + sizeof(*eth) + sizeof(*ip6h) > data_end)
		return TC_ACT_OK;

	if (eth->h_proto != htons(ETH_P_IPV6) ||
	    ip6h->nexthdr != IPPROTO_ICMPV6) {
		/* Not ICMPv6-over-IPv6: log ethertype and next header, pass. */
		bpf_trace_printk(dont_care_msg, sizeof(dont_care_msg),
				 eth->h_proto, ip6h->nexthdr);
		return TC_ACT_OK;
	} else if (bpf_skb_under_cgroup(skb, &test_cgrp2_array_pin, 0) != 1) {
		/* Any result other than 1 (including errors) lets the packet pass. */
		bpf_trace_printk(pass_msg, sizeof(pass_msg));
		return TC_ACT_OK;
	} else {
		/* ICMPv6 packet for which the cgroup check returned 1: drop it. */
		bpf_trace_printk(reject_msg, sizeof(reject_msg));
		return TC_ACT_SHOT;
	}
}
+ if [ $status -ne 0 ]; then + echo "FAIL" + else + echo "ok" + pktgen + fi + tc qdisc del dev $IFC clsact +} + +IFC=test_veth + +ip link add name $IFC type veth peer name pair_$IFC +ip link set $IFC up +ip link set pair_$IFC up + +test ./parse_simple.o simple +test ./parse_varlen.o varlen +test ./parse_ldabs.o ldabs +ip link del dev $IFC diff --git a/samples/bpf/test_current_task_under_cgroup_kern.c b/samples/bpf/test_current_task_under_cgroup_kern.c new file mode 100644 index 000000000..fbd43e2bb --- /dev/null +++ b/samples/bpf/test_current_task_under_cgroup_kern.c @@ -0,0 +1,44 @@ +/* Copyright (c) 2016 Sargun Dhillon <sargun@sargun.me> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ + +#include <linux/ptrace.h> +#include <uapi/linux/bpf.h> +#include <linux/version.h> +#include <bpf/bpf_helpers.h> +#include <uapi/linux/utsname.h> +#include "trace_common.h" + +struct { + __uint(type, BPF_MAP_TYPE_CGROUP_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u32)); + __uint(max_entries, 1); +} cgroup_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, 1); +} perf_map SEC(".maps"); + +/* Writes the last PID that called sync to a map at index 0 */ +SEC("kprobe/" SYSCALL(sys_sync)) +int bpf_prog1(struct pt_regs *ctx) +{ + u64 pid = bpf_get_current_pid_tgid(); + int idx = 0; + + if (!bpf_current_task_under_cgroup(&cgroup_map, 0)) + return 0; + + bpf_map_update_elem(&perf_map, &idx, &pid, BPF_ANY); + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/test_current_task_under_cgroup_user.c b/samples/bpf/test_current_task_under_cgroup_user.c new file mode 100644 index 000000000..ac251a417 --- /dev/null +++ 
b/samples/bpf/test_current_task_under_cgroup_user.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2016 Sargun Dhillon <sargun@sargun.me> + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <unistd.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> +#include "cgroup_helpers.h" + +#define CGROUP_PATH "/my-cgroup" + +int main(int argc, char **argv) +{ + pid_t remote_pid, local_pid = getpid(); + struct bpf_link *link = NULL; + struct bpf_program *prog; + int cg2, idx = 0, rc = 1; + struct bpf_object *obj; + char filename[256]; + int map_fd[2]; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (!prog) { + printf("finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "cgroup_map"); + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "perf_map"); + if (map_fd[0] < 0 || map_fd[1] < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; + } + + if (setup_cgroup_environment()) + goto err; + + cg2 = create_and_get_cgroup(CGROUP_PATH); + + if (cg2 < 0) + goto err; + + if (bpf_map_update_elem(map_fd[0], &idx, &cg2, BPF_ANY)) { + log_err("Adding target cgroup to map"); + goto err; + } + + if (join_cgroup(CGROUP_PATH)) + goto err; + + /* + * The installed helper program catched the sync call, and should + * write it to the map. 
+ */ + + sync(); + bpf_map_lookup_elem(map_fd[1], &idx, &remote_pid); + + if (local_pid != remote_pid) { + fprintf(stderr, + "BPF Helper didn't write correct PID to map, but: %d\n", + remote_pid); + goto err; + } + + /* Verify the negative scenario; leave the cgroup */ + if (join_cgroup("/")) + goto err; + + remote_pid = 0; + bpf_map_update_elem(map_fd[1], &idx, &remote_pid, BPF_ANY); + + sync(); + bpf_map_lookup_elem(map_fd[1], &idx, &remote_pid); + + if (local_pid == remote_pid) { + fprintf(stderr, "BPF cgroup negative test did not work\n"); + goto err; + } + + rc = 0; + +err: + close(cg2); + cleanup_cgroup_environment(); + +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); + return rc; +} diff --git a/samples/bpf/test_ipip.sh b/samples/bpf/test_ipip.sh new file mode 100755 index 000000000..9e507c305 --- /dev/null +++ b/samples/bpf/test_ipip.sh @@ -0,0 +1,179 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +function config_device { + ip netns add at_ns0 + ip netns add at_ns1 + ip netns add at_ns2 + ip link add veth0 type veth peer name veth0b + ip link add veth1 type veth peer name veth1b + ip link add veth2 type veth peer name veth2b + ip link set veth0b up + ip link set veth1b up + ip link set veth2b up + ip link set dev veth0b mtu 1500 + ip link set dev veth1b mtu 1500 + ip link set dev veth2b mtu 1500 + ip link set veth0 netns at_ns0 + ip link set veth1 netns at_ns1 + ip link set veth2 netns at_ns2 + ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0 + ip netns exec at_ns0 ip addr add 2401:db00::1/64 dev veth0 nodad + ip netns exec at_ns0 ip link set dev veth0 up + ip netns exec at_ns1 ip addr add 172.16.1.101/24 dev veth1 + ip netns exec at_ns1 ip addr add 2401:db00::2/64 dev veth1 nodad + ip netns exec at_ns1 ip link set dev veth1 up + ip netns exec at_ns2 ip addr add 172.16.1.200/24 dev veth2 + ip netns exec at_ns2 ip addr add 2401:db00::3/64 dev veth2 nodad + ip netns exec at_ns2 ip link set dev veth2 up + ip link add br0 type 
bridge + ip link set br0 up + ip link set dev br0 mtu 1500 + ip link set veth0b master br0 + ip link set veth1b master br0 + ip link set veth2b master br0 +} + +function add_ipip_tunnel { + ip netns exec at_ns0 \ + ip link add dev $DEV_NS type ipip local 172.16.1.100 remote 172.16.1.200 + ip netns exec at_ns0 ip link set dev $DEV_NS up + ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 + ip netns exec at_ns1 \ + ip link add dev $DEV_NS type ipip local 172.16.1.101 remote 172.16.1.200 + ip netns exec at_ns1 ip link set dev $DEV_NS up + # same inner IP address in at_ns0 and at_ns1 + ip netns exec at_ns1 ip addr add dev $DEV_NS 10.1.1.100/24 + + ip netns exec at_ns2 ip link add dev $DEV type ipip external + ip netns exec at_ns2 ip link set dev $DEV up + ip netns exec at_ns2 ip addr add dev $DEV 10.1.1.200/24 +} + +function add_ipip6_tunnel { + ip netns exec at_ns0 \ + ip link add dev $DEV_NS type ip6tnl mode ipip6 local 2401:db00::1/64 remote 2401:db00::3/64 + ip netns exec at_ns0 ip link set dev $DEV_NS up + ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 + ip netns exec at_ns1 \ + ip link add dev $DEV_NS type ip6tnl mode ipip6 local 2401:db00::2/64 remote 2401:db00::3/64 + ip netns exec at_ns1 ip link set dev $DEV_NS up + # same inner IP address in at_ns0 and at_ns1 + ip netns exec at_ns1 ip addr add dev $DEV_NS 10.1.1.100/24 + + ip netns exec at_ns2 ip link add dev $DEV type ip6tnl mode ipip6 external + ip netns exec at_ns2 ip link set dev $DEV up + ip netns exec at_ns2 ip addr add dev $DEV 10.1.1.200/24 +} + +function add_ip6ip6_tunnel { + ip netns exec at_ns0 \ + ip link add dev $DEV_NS type ip6tnl mode ip6ip6 local 2401:db00::1/64 remote 2401:db00::3/64 + ip netns exec at_ns0 ip link set dev $DEV_NS up + ip netns exec at_ns0 ip addr add dev $DEV_NS 2601:646::1/64 + ip netns exec at_ns1 \ + ip link add dev $DEV_NS type ip6tnl mode ip6ip6 local 2401:db00::2/64 remote 2401:db00::3/64 + ip netns exec at_ns1 ip link set dev $DEV_NS up + # same 
inner IP address in at_ns0 and at_ns1 + ip netns exec at_ns1 ip addr add dev $DEV_NS 2601:646::1/64 + + ip netns exec at_ns2 ip link add dev $DEV type ip6tnl mode ip6ip6 external + ip netns exec at_ns2 ip link set dev $DEV up + ip netns exec at_ns2 ip addr add dev $DEV 2601:646::2/64 +} + +function attach_bpf { + DEV=$1 + SET_TUNNEL=$2 + GET_TUNNEL=$3 + ip netns exec at_ns2 tc qdisc add dev $DEV clsact + ip netns exec at_ns2 tc filter add dev $DEV egress bpf da obj tcbpf2_kern.o sec $SET_TUNNEL + ip netns exec at_ns2 tc filter add dev $DEV ingress bpf da obj tcbpf2_kern.o sec $GET_TUNNEL +} + +function test_ipip { + DEV_NS=ipip_std + DEV=ipip_bpf + config_device +# tcpdump -nei br0 & + cat /sys/kernel/debug/tracing/trace_pipe & + + add_ipip_tunnel + attach_bpf $DEV ipip_set_tunnel ipip_get_tunnel + + ip netns exec at_ns0 ping -c 1 10.1.1.200 + ip netns exec at_ns2 ping -c 1 10.1.1.100 + ip netns exec at_ns0 iperf -sD -p 5200 > /dev/null + ip netns exec at_ns1 iperf -sD -p 5201 > /dev/null + sleep 0.2 + # tcp check _same_ IP over different tunnels + ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5200 + ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5201 + cleanup +} + +# IPv4 over IPv6 tunnel +function test_ipip6 { + DEV_NS=ipip_std + DEV=ipip_bpf + config_device +# tcpdump -nei br0 & + cat /sys/kernel/debug/tracing/trace_pipe & + + add_ipip6_tunnel + attach_bpf $DEV ipip6_set_tunnel ipip6_get_tunnel + + ip netns exec at_ns0 ping -c 1 10.1.1.200 + ip netns exec at_ns2 ping -c 1 10.1.1.100 + ip netns exec at_ns0 iperf -sD -p 5200 > /dev/null + ip netns exec at_ns1 iperf -sD -p 5201 > /dev/null + sleep 0.2 + # tcp check _same_ IP over different tunnels + ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5200 + ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5201 + cleanup +} + +# IPv6 over IPv6 tunnel +function test_ip6ip6 { + DEV_NS=ipip_std + DEV=ipip_bpf + config_device +# tcpdump -nei br0 & + cat /sys/kernel/debug/tracing/trace_pipe & + + add_ip6ip6_tunnel 
+ attach_bpf $DEV ip6ip6_set_tunnel ip6ip6_get_tunnel + + ip netns exec at_ns0 ping -6 -c 1 2601:646::2 + ip netns exec at_ns2 ping -6 -c 1 2601:646::1 + ip netns exec at_ns0 iperf -6sD -p 5200 > /dev/null + ip netns exec at_ns1 iperf -6sD -p 5201 > /dev/null + sleep 0.2 + # tcp check _same_ IP over different tunnels + ip netns exec at_ns2 iperf -6c 2601:646::1 -n 5k -p 5200 + ip netns exec at_ns2 iperf -6c 2601:646::1 -n 5k -p 5201 + cleanup +} + +function cleanup { + set +ex + pkill iperf + ip netns delete at_ns0 + ip netns delete at_ns1 + ip netns delete at_ns2 + ip link del veth0 + ip link del veth1 + ip link del veth2 + ip link del br0 + pkill tcpdump + pkill cat + set -ex +} + +cleanup +echo "Testing IP tunnels..." +test_ipip +test_ipip6 +test_ip6ip6 +echo "*** PASS ***" diff --git a/samples/bpf/test_lru_dist.c b/samples/bpf/test_lru_dist.c new file mode 100644 index 000000000..b313dba41 --- /dev/null +++ b/samples/bpf/test_lru_dist.c @@ -0,0 +1,540 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2016 Facebook + */ +#define _GNU_SOURCE +#include <linux/types.h> +#include <stdio.h> +#include <unistd.h> +#include <linux/bpf.h> +#include <errno.h> +#include <string.h> +#include <assert.h> +#include <sched.h> +#include <sys/wait.h> +#include <sys/stat.h> +#include <sys/resource.h> +#include <fcntl.h> +#include <stdlib.h> +#include <time.h> + +#include <bpf/bpf.h> +#include "bpf_util.h" + +#define min(a, b) ((a) < (b) ? 
/* Minimal circular doubly-linked list, modelled on the kernel's list.h. */
struct list_head {
	struct list_head *next, *prev;
};

/* Make @list an empty list: both links point back at the node itself. */
static inline void INIT_LIST_HEAD(struct list_head *list)
{
	list->next = list;
	list->prev = list;
}

/* Non-zero when @head has no entries besides itself. */
static inline int list_empty(const struct list_head *head)
{
	return head == head->next;
}

/* Link @new in between the two adjacent nodes @prev and @next. */
static inline void __list_add(struct list_head *new,
			      struct list_head *prev,
			      struct list_head *next)
{
	/* forward links first, then the backward ones */
	new->next = next;
	prev->next = new;
	next->prev = new;
	new->prev = prev;
}

/* Insert @new right after @head, i.e. at the front of the list. */
static inline void list_add(struct list_head *new, struct list_head *head)
{
	struct list_head *first = head->next;

	__list_add(new, head, first);
}

/* Make @prev and @next neighbours, dropping whatever sat between them. */
static inline void __list_del(struct list_head *prev, struct list_head *next)
{
	prev->next = next;
	next->prev = prev;
}

/* Unlink @entry from its list; @entry's own links are left as they were. */
static inline void __list_del_entry(struct list_head *entry)
{
	struct list_head *before = entry->prev;
	struct list_head *after = entry->next;

	__list_del(before, after);
}

/* Take @list out of whatever list it is on and put it at the front of @head. */
static inline void list_move(struct list_head *list, struct list_head *head)
{
	__list_del_entry(list);
	list_add(list, head);
}
/*
 * Read the key distribution file @dist_file: one decimal key per line.
 *
 * On return *@keys points to a malloc()ed array holding the parsed keys
 * in file order; the caller owns it and must free() it.  The return
 * value is the number of keys stored (0 for an empty file, in which case
 * *@keys is still a valid pointer to free()).
 *
 * Any I/O or allocation failure trips an assert(), matching the rest of
 * this test program.
 */
static unsigned int read_keys(const char *dist_file,
			      unsigned long long **keys)
{
	struct stat fst;
	unsigned long long *retkeys;
	unsigned int counts = 0;
	ssize_t nread;
	int dist_fd;
	char *b, *l;
	off_t i;
	int rc;

	dist_fd = open(dist_file, O_RDONLY);
	assert(dist_fd != -1);

	/* Keep calls with side effects outside assert() so NDEBUG cannot drop them */
	rc = fstat(dist_fd, &fst);
	assert(rc == 0);
	(void)rc;

	/* One extra byte: strtok() below needs a NUL-terminated buffer */
	b = malloc((size_t)fst.st_size + 1);
	assert(b);

	nread = read(dist_fd, b, fst.st_size);
	assert(nread == fst.st_size);
	(void)nread;
	b[fst.st_size] = '\0';
	close(dist_fd);

	/* Upper bound on the number of keys: one per newline ... */
	for (i = 0; i < fst.st_size; i++) {
		if (b[i] == '\n')
			counts++;
	}
	counts++; /* in case the last line has no \n */

	retkeys = malloc(counts * sizeof(unsigned long long));
	assert(retkeys);

	/* ... then count again while parsing, since strtok() skips empty lines */
	counts = 0;
	for (l = strtok(b, "\n"); l; l = strtok(NULL, "\n"))
		retkeys[counts++] = strtoull(l, NULL, 10);
	free(b);

	*keys = retkeys;

	return counts;
}
&key, &value)) + continue; + + if (bpf_map_update_elem(lru_map_fd, &key, &value, BPF_NOEXIST)) { + printf("bpf_map_update_elem(lru_map_fd, %llu): errno:%d\n", + key, errno); + assert(0); + } + + nr_misses++; + } + + printf(" task:%d BPF LRU: nr_unique:%u(/%u) nr_misses:%u(/%u)\n", + task, pfect_lru.nr_unique, dist_key_counts, nr_misses, + dist_key_counts); + printf(" task:%d Perfect LRU: nr_unique:%u(/%u) nr_misses:%u(/%u)\n", + task, pfect_lru.nr_unique, pfect_lru.total, + pfect_lru.nr_misses, pfect_lru.total); + + pfect_lru_destroy(&pfect_lru); + close(lru_map_fd); +} + +static void test_parallel_lru_dist(int map_type, int map_flags, + int nr_tasks, unsigned int lru_size) +{ + int child_data[2]; + int lru_map_fd; + + printf("%s (map_type:%d map_flags:0x%X):\n", __func__, map_type, + map_flags); + + if (map_flags & BPF_F_NO_COMMON_LRU) + lru_map_fd = create_map(map_type, map_flags, + nr_cpus * lru_size); + else + lru_map_fd = create_map(map_type, map_flags, + nr_tasks * lru_size); + assert(lru_map_fd != -1); + + child_data[0] = lru_map_fd; + child_data[1] = lru_size; + + run_parallel(nr_tasks, do_test_lru_dist, child_data); + + close(lru_map_fd); +} + +static void test_lru_loss0(int map_type, int map_flags) +{ + unsigned long long key, value[nr_cpus]; + unsigned int old_unused_losses = 0; + unsigned int new_unused_losses = 0; + unsigned int used_losses = 0; + int map_fd; + + printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type, + map_flags); + + assert(sched_next_online(0, 0) != -1); + + if (map_flags & BPF_F_NO_COMMON_LRU) + map_fd = create_map(map_type, map_flags, 900 * nr_cpus); + else + map_fd = create_map(map_type, map_flags, 900); + + assert(map_fd != -1); + + value[0] = 1234; + + for (key = 1; key <= 1000; key++) { + int start_key, end_key; + + assert(bpf_map_update_elem(map_fd, &key, value, BPF_NOEXIST) == 0); + + start_key = 101; + end_key = min(key, 900); + + while (start_key <= end_key) { + bpf_map_lookup_elem(map_fd, &start_key, value); + 
start_key++; + } + } + + for (key = 1; key <= 1000; key++) { + if (bpf_map_lookup_elem(map_fd, &key, value)) { + if (key <= 100) + old_unused_losses++; + else if (key <= 900) + used_losses++; + else + new_unused_losses++; + } + } + + close(map_fd); + + printf("older-elem-losses:%d(/100) active-elem-losses:%d(/800) " + "newer-elem-losses:%d(/100)\n", + old_unused_losses, used_losses, new_unused_losses); +} + +static void test_lru_loss1(int map_type, int map_flags) +{ + unsigned long long key, value[nr_cpus]; + int map_fd; + unsigned int nr_losses = 0; + + printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type, + map_flags); + + assert(sched_next_online(0, 0) != -1); + + if (map_flags & BPF_F_NO_COMMON_LRU) + map_fd = create_map(map_type, map_flags, 1000 * nr_cpus); + else + map_fd = create_map(map_type, map_flags, 1000); + + assert(map_fd != -1); + + value[0] = 1234; + + for (key = 1; key <= 1000; key++) + assert(!bpf_map_update_elem(map_fd, &key, value, BPF_NOEXIST)); + + for (key = 1; key <= 1000; key++) { + if (bpf_map_lookup_elem(map_fd, &key, value)) + nr_losses++; + } + + close(map_fd); + + printf("nr_losses:%d(/1000)\n", nr_losses); +} + +static void do_test_parallel_lru_loss(int task, void *data) +{ + const unsigned int nr_stable_elems = 1000; + const unsigned int nr_repeats = 100000; + + int map_fd = *(int *)data; + unsigned long long stable_base; + unsigned long long key, value[nr_cpus]; + unsigned long long next_ins_key; + unsigned int nr_losses = 0; + unsigned int i; + + stable_base = task * nr_repeats * 2 + 1; + next_ins_key = stable_base; + value[0] = 1234; + for (i = 0; i < nr_stable_elems; i++) { + assert(bpf_map_update_elem(map_fd, &next_ins_key, value, + BPF_NOEXIST) == 0); + next_ins_key++; + } + + for (i = 0; i < nr_repeats; i++) { + int rn; + + rn = rand(); + + if (rn % 10) { + key = rn % nr_stable_elems + stable_base; + bpf_map_lookup_elem(map_fd, &key, value); + } else { + bpf_map_update_elem(map_fd, &next_ins_key, value, + 
BPF_NOEXIST); + next_ins_key++; + } + } + + key = stable_base; + for (i = 0; i < nr_stable_elems; i++) { + if (bpf_map_lookup_elem(map_fd, &key, value)) + nr_losses++; + key++; + } + + printf(" task:%d nr_losses:%u\n", task, nr_losses); +} + +static void test_parallel_lru_loss(int map_type, int map_flags, int nr_tasks) +{ + int map_fd; + + printf("%s (map_type:%d map_flags:0x%X):\n", __func__, map_type, + map_flags); + + /* Give 20% more than the active working set */ + if (map_flags & BPF_F_NO_COMMON_LRU) + map_fd = create_map(map_type, map_flags, + nr_cpus * (1000 + 200)); + else + map_fd = create_map(map_type, map_flags, + nr_tasks * (1000 + 200)); + + assert(map_fd != -1); + + run_parallel(nr_tasks, do_test_parallel_lru_loss, &map_fd); + + close(map_fd); +} + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + int map_flags[] = {0, BPF_F_NO_COMMON_LRU}; + const char *dist_file; + int nr_tasks = 1; + int lru_size; + int f; + + if (argc < 4) { + printf("Usage: %s <dist-file> <lru-size> <nr-tasks>\n", + argv[0]); + return -1; + } + + dist_file = argv[1]; + lru_size = atoi(argv[2]); + nr_tasks = atoi(argv[3]); + + setbuf(stdout, NULL); + + assert(!setrlimit(RLIMIT_MEMLOCK, &r)); + + srand(time(NULL)); + + nr_cpus = bpf_num_possible_cpus(); + assert(nr_cpus != -1); + printf("nr_cpus:%d\n\n", nr_cpus); + + nr_tasks = min(nr_tasks, nr_cpus); + + dist_key_counts = read_keys(dist_file, &dist_keys); + if (!dist_key_counts) { + printf("%s has no key\n", dist_file); + return -1; + } + + for (f = 0; f < sizeof(map_flags) / sizeof(*map_flags); f++) { + test_lru_loss0(BPF_MAP_TYPE_LRU_HASH, map_flags[f]); + test_lru_loss1(BPF_MAP_TYPE_LRU_HASH, map_flags[f]); + test_parallel_lru_loss(BPF_MAP_TYPE_LRU_HASH, map_flags[f], + nr_tasks); + test_parallel_lru_dist(BPF_MAP_TYPE_LRU_HASH, map_flags[f], + nr_tasks, lru_size); + printf("\n"); + } + + free(dist_keys); + + return 0; +} diff --git a/samples/bpf/test_lwt_bpf.c 
b/samples/bpf/test_lwt_bpf.c new file mode 100644 index 000000000..1b568575a --- /dev/null +++ b/samples/bpf/test_lwt_bpf.c @@ -0,0 +1,253 @@ +/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <stdint.h> +#include <stddef.h> +#include <linux/bpf.h> +#include <linux/ip.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/icmpv6.h> +#include <linux/if_ether.h> +#include <bpf/bpf_helpers.h> +#include <string.h> + +# define printk(fmt, ...) \ + ({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ + }) + +#define CB_MAGIC 1234 + +/* Test: Pass all packets through */ +SEC("nop") +int do_nop(struct __sk_buff *skb) +{ + return BPF_OK; +} + +/* Test: Verify context information can be accessed */ +SEC("test_ctx") +int do_test_ctx(struct __sk_buff *skb) +{ + skb->cb[0] = CB_MAGIC; + printk("len %d hash %d protocol %d\n", skb->len, skb->hash, + skb->protocol); + printk("cb %d ingress_ifindex %d ifindex %d\n", skb->cb[0], + skb->ingress_ifindex, skb->ifindex); + + return BPF_OK; +} + +/* Test: Ensure skb->cb[] buffer is cleared */ +SEC("test_cb") +int do_test_cb(struct __sk_buff *skb) +{ + printk("cb0: %x cb1: %x cb2: %x\n", skb->cb[0], skb->cb[1], + skb->cb[2]); + printk("cb3: %x cb4: %x\n", skb->cb[3], skb->cb[4]); + + return BPF_OK; +} + +/* Test: Verify skb data can be read */ +SEC("test_data") +int do_test_data(struct __sk_buff *skb) +{ + void *data = (void *)(long)skb->data; + void *data_end = (void 
*)(long)skb->data_end; + struct iphdr *iph = data; + + if (data + sizeof(*iph) > data_end) { + printk("packet truncated\n"); + return BPF_DROP; + } + + printk("src: %x dst: %x\n", iph->saddr, iph->daddr); + + return BPF_OK; +} + +#define IP_CSUM_OFF offsetof(struct iphdr, check) +#define IP_DST_OFF offsetof(struct iphdr, daddr) +#define IP_SRC_OFF offsetof(struct iphdr, saddr) +#define IP_PROTO_OFF offsetof(struct iphdr, protocol) +#define TCP_CSUM_OFF offsetof(struct tcphdr, check) +#define UDP_CSUM_OFF offsetof(struct udphdr, check) +#define IS_PSEUDO 0x10 + +static inline int rewrite(struct __sk_buff *skb, uint32_t old_ip, + uint32_t new_ip, int rw_daddr) +{ + int ret, off = 0, flags = IS_PSEUDO; + uint8_t proto; + + ret = bpf_skb_load_bytes(skb, IP_PROTO_OFF, &proto, 1); + if (ret < 0) { + printk("bpf_l4_csum_replace failed: %d\n", ret); + return BPF_DROP; + } + + switch (proto) { + case IPPROTO_TCP: + off = TCP_CSUM_OFF; + break; + + case IPPROTO_UDP: + off = UDP_CSUM_OFF; + flags |= BPF_F_MARK_MANGLED_0; + break; + + case IPPROTO_ICMPV6: + off = offsetof(struct icmp6hdr, icmp6_cksum); + break; + } + + if (off) { + ret = bpf_l4_csum_replace(skb, off, old_ip, new_ip, + flags | sizeof(new_ip)); + if (ret < 0) { + printk("bpf_l4_csum_replace failed: %d\n"); + return BPF_DROP; + } + } + + ret = bpf_l3_csum_replace(skb, IP_CSUM_OFF, old_ip, new_ip, sizeof(new_ip)); + if (ret < 0) { + printk("bpf_l3_csum_replace failed: %d\n", ret); + return BPF_DROP; + } + + if (rw_daddr) + ret = bpf_skb_store_bytes(skb, IP_DST_OFF, &new_ip, sizeof(new_ip), 0); + else + ret = bpf_skb_store_bytes(skb, IP_SRC_OFF, &new_ip, sizeof(new_ip), 0); + + if (ret < 0) { + printk("bpf_skb_store_bytes() failed: %d\n", ret); + return BPF_DROP; + } + + return BPF_OK; +} + +/* Test: Verify skb data can be modified */ +SEC("test_rewrite") +int do_test_rewrite(struct __sk_buff *skb) +{ + uint32_t old_ip, new_ip = 0x3fea8c0; + int ret; + + ret = bpf_skb_load_bytes(skb, IP_DST_OFF, &old_ip, 4); + if 
(ret < 0) {
+		printk("bpf_skb_load_bytes failed: %d\n", ret);
+		return BPF_DROP;
+	}
+
+	if (old_ip == 0x2fea8c0) {
+		printk("out: rewriting from %x to %x\n", old_ip, new_ip);
+		return rewrite(skb, old_ip, new_ip, 1);
+	}
+
+	return BPF_OK;
+}
+
+/*
+ * Grow the packet head by 14 bytes, write an Ethernet header built from
+ * SRC_MAC / DST_MAC (supplied with -D at compile time) with protocol
+ * ETH_P_IP at offset 0, then redirect the packet to DST_IFINDEX.
+ *
+ * Returns the result of bpf_redirect() on success, BPF_DROP if either the
+ * head adjustment or the header store fails.
+ */
+static inline int __do_push_ll_and_redirect(struct __sk_buff *skb)
+{
+	uint64_t smac = SRC_MAC, dmac = DST_MAC;
+	int ret, ifindex = DST_IFINDEX;
+	struct ethhdr ehdr;
+
+	ret = bpf_skb_change_head(skb, 14, 0);
+	if (ret < 0) {
+		printk("skb_change_head() failed: %d\n", ret);
+		/*
+		 * Without the extra 14 bytes the store below would
+		 * overwrite the start of the existing packet data, so
+		 * stop here like the other failure paths do.
+		 */
+		return BPF_DROP;
+	}
+
+	ehdr.h_proto = __constant_htons(ETH_P_IP);
+	/* Only the first 6 bytes of each 64-bit value are copied. */
+	memcpy(&ehdr.h_source, &smac, 6);
+	memcpy(&ehdr.h_dest, &dmac, 6);
+
+	ret = bpf_skb_store_bytes(skb, 0, &ehdr, sizeof(ehdr), 0);
+	if (ret < 0) {
+		printk("skb_store_bytes() failed: %d\n", ret);
+		return BPF_DROP;
+	}
+
+	return bpf_redirect(ifindex, 0);
+}
+
+SEC("push_ll_and_redirect_silent")
+int do_push_ll_and_redirect_silent(struct __sk_buff *skb)
+{
+	return __do_push_ll_and_redirect(skb);
+}
+
+SEC("push_ll_and_redirect")
+int do_push_ll_and_redirect(struct __sk_buff *skb)
+{
+	int ret, ifindex = DST_IFINDEX;
+
+	ret = __do_push_ll_and_redirect(skb);
+	if (ret >= 0)
+		printk("redirected to %d\n", ifindex);
+
+	return ret;
+}
+
+static inline void __fill_garbage(struct __sk_buff *skb)
+{
+	uint64_t f = 0xFFFFFFFFFFFFFFFF;
+
+	bpf_skb_store_bytes(skb, 0, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 8, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 16, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 24, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 32, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 40, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 48, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 56, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 64, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 72, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 80, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 88, &f, sizeof(f), 0);
+}
+
+SEC("fill_garbage")
+int do_fill_garbage(struct __sk_buff *skb)
+{
+	
__fill_garbage(skb); + printk("Set initial 96 bytes of header to FF\n"); + return BPF_OK; +} + +SEC("fill_garbage_and_redirect") +int do_fill_garbage_and_redirect(struct __sk_buff *skb) +{ + int ifindex = DST_IFINDEX; + __fill_garbage(skb); + printk("redirected to %d\n", ifindex); + return bpf_redirect(ifindex, 0); +} + +/* Drop all packets */ +SEC("drop_all") +int do_drop_all(struct __sk_buff *skb) +{ + printk("dropping with: %d\n", BPF_DROP); + return BPF_DROP; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/test_lwt_bpf.sh b/samples/bpf/test_lwt_bpf.sh new file mode 100755 index 000000000..65a976058 --- /dev/null +++ b/samples/bpf/test_lwt_bpf.sh @@ -0,0 +1,400 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Uncomment to see generated bytecode +#VERBOSE=verbose + +NS1=lwt_ns1 +NS2=lwt_ns2 +VETH0=tst_lwt1a +VETH1=tst_lwt1b +VETH2=tst_lwt2a +VETH3=tst_lwt2b +IPVETH0="192.168.254.1" +IPVETH1="192.168.254.2" +IPVETH1b="192.168.254.3" + +IPVETH2="192.168.111.1" +IPVETH3="192.168.111.2" + +IP_LOCAL="192.168.99.1" + +TRACE_ROOT=/sys/kernel/debug/tracing + +function lookup_mac() +{ + set +x + if [ ! 
-z "$2" ]; then + MAC=$(ip netns exec $2 ip link show $1 | grep ether | awk '{print $2}') + else + MAC=$(ip link show $1 | grep ether | awk '{print $2}') + fi + MAC="${MAC//:/}" + echo "0x${MAC:10:2}${MAC:8:2}${MAC:6:2}${MAC:4:2}${MAC:2:2}${MAC:0:2}" + set -x +} + +function cleanup { + set +ex + rm test_lwt_bpf.o 2> /dev/null + ip link del $VETH0 2> /dev/null + ip link del $VETH1 2> /dev/null + ip link del $VETH2 2> /dev/null + ip link del $VETH3 2> /dev/null + ip netns exec $NS1 killall netserver + ip netns delete $NS1 2> /dev/null + ip netns delete $NS2 2> /dev/null + set -ex +} + +function setup_one_veth { + ip netns add $1 + ip link add $2 type veth peer name $3 + ip link set dev $2 up + ip addr add $4/24 dev $2 + ip link set $3 netns $1 + ip netns exec $1 ip link set dev $3 up + ip netns exec $1 ip addr add $5/24 dev $3 + + if [ "$6" ]; then + ip netns exec $1 ip addr add $6/32 dev $3 + fi +} + +function get_trace { + set +x + cat ${TRACE_ROOT}/trace | grep -v '^#' + set -x +} + +function cleanup_routes { + ip route del ${IPVETH1}/32 dev $VETH0 2> /dev/null || true + ip route del table local local ${IP_LOCAL}/32 dev lo 2> /dev/null || true +} + +function install_test { + cleanup_routes + cp /dev/null ${TRACE_ROOT}/trace + + OPTS="encap bpf headroom 14 $1 obj test_lwt_bpf.o section $2 $VERBOSE" + + if [ "$1" == "in" ]; then + ip route add table local local ${IP_LOCAL}/32 $OPTS dev lo + else + ip route add ${IPVETH1}/32 $OPTS dev $VETH0 + fi +} + +function remove_prog { + if [ "$1" == "in" ]; then + ip route del table local local ${IP_LOCAL}/32 dev lo + else + ip route del ${IPVETH1}/32 dev $VETH0 + fi +} + +function filter_trace { + # Add newline to allow starting EXPECT= variables on newline + NL=$'\n' + echo "${NL}$*" | sed -e 's/^.*: : //g' +} + +function expect_fail { + set +x + echo "FAIL:" + echo "Expected: $1" + echo "Got: $2" + set -x + exit 1 +} + +function match_trace { + set +x + RET=0 + TRACE=$1 + EXPECT=$2 + GOT="$(filter_trace "$TRACE")" + + [ 
"$GOT" != "$EXPECT" ] && { + expect_fail "$EXPECT" "$GOT" + RET=1 + } + set -x + return $RET +} + +function test_start { + set +x + echo "----------------------------------------------------------------" + echo "Starting test: $*" + echo "----------------------------------------------------------------" + set -x +} + +function failure { + get_trace + echo "FAIL: $*" + exit 1 +} + +function test_ctx_xmit { + test_start "test_ctx on lwt xmit" + install_test xmit test_ctx + ping -c 3 $IPVETH1 || { + failure "test_ctx xmit: packets are dropped" + } + match_trace "$(get_trace)" " +len 84 hash 0 protocol 8 +cb 1234 ingress_ifindex 0 ifindex $DST_IFINDEX +len 84 hash 0 protocol 8 +cb 1234 ingress_ifindex 0 ifindex $DST_IFINDEX +len 84 hash 0 protocol 8 +cb 1234 ingress_ifindex 0 ifindex $DST_IFINDEX" || exit 1 + remove_prog xmit +} + +function test_ctx_out { + test_start "test_ctx on lwt out" + install_test out test_ctx + ping -c 3 $IPVETH1 || { + failure "test_ctx out: packets are dropped" + } + match_trace "$(get_trace)" " +len 84 hash 0 protocol 0 +cb 1234 ingress_ifindex 0 ifindex 0 +len 84 hash 0 protocol 0 +cb 1234 ingress_ifindex 0 ifindex 0 +len 84 hash 0 protocol 0 +cb 1234 ingress_ifindex 0 ifindex 0" || exit 1 + remove_prog out +} + +function test_ctx_in { + test_start "test_ctx on lwt in" + install_test in test_ctx + ping -c 3 $IP_LOCAL || { + failure "test_ctx out: packets are dropped" + } + # We will both request & reply packets as the packets will + # be from $IP_LOCAL => $IP_LOCAL + match_trace "$(get_trace)" " +len 84 hash 0 protocol 8 +cb 1234 ingress_ifindex 1 ifindex 1 +len 84 hash 0 protocol 8 +cb 1234 ingress_ifindex 1 ifindex 1 +len 84 hash 0 protocol 8 +cb 1234 ingress_ifindex 1 ifindex 1 +len 84 hash 0 protocol 8 +cb 1234 ingress_ifindex 1 ifindex 1 +len 84 hash 0 protocol 8 +cb 1234 ingress_ifindex 1 ifindex 1 +len 84 hash 0 protocol 8 +cb 1234 ingress_ifindex 1 ifindex 1" || exit 1 + remove_prog in +} + +function test_data { + test_start 
"test_data on lwt $1" + install_test $1 test_data + ping -c 3 $IPVETH1 || { + failure "test_data ${1}: packets are dropped" + } + match_trace "$(get_trace)" " +src: 1fea8c0 dst: 2fea8c0 +src: 1fea8c0 dst: 2fea8c0 +src: 1fea8c0 dst: 2fea8c0" || exit 1 + remove_prog $1 +} + +function test_data_in { + test_start "test_data on lwt in" + install_test in test_data + ping -c 3 $IP_LOCAL || { + failure "test_data in: packets are dropped" + } + # We will both request & reply packets as the packets will + # be from $IP_LOCAL => $IP_LOCAL + match_trace "$(get_trace)" " +src: 163a8c0 dst: 163a8c0 +src: 163a8c0 dst: 163a8c0 +src: 163a8c0 dst: 163a8c0 +src: 163a8c0 dst: 163a8c0 +src: 163a8c0 dst: 163a8c0 +src: 163a8c0 dst: 163a8c0" || exit 1 + remove_prog in +} + +function test_cb { + test_start "test_cb on lwt $1" + install_test $1 test_cb + ping -c 3 $IPVETH1 || { + failure "test_cb ${1}: packets are dropped" + } + match_trace "$(get_trace)" " +cb0: 0 cb1: 0 cb2: 0 +cb3: 0 cb4: 0 +cb0: 0 cb1: 0 cb2: 0 +cb3: 0 cb4: 0 +cb0: 0 cb1: 0 cb2: 0 +cb3: 0 cb4: 0" || exit 1 + remove_prog $1 +} + +function test_cb_in { + test_start "test_cb on lwt in" + install_test in test_cb + ping -c 3 $IP_LOCAL || { + failure "test_cb in: packets are dropped" + } + # We will both request & reply packets as the packets will + # be from $IP_LOCAL => $IP_LOCAL + match_trace "$(get_trace)" " +cb0: 0 cb1: 0 cb2: 0 +cb3: 0 cb4: 0 +cb0: 0 cb1: 0 cb2: 0 +cb3: 0 cb4: 0 +cb0: 0 cb1: 0 cb2: 0 +cb3: 0 cb4: 0 +cb0: 0 cb1: 0 cb2: 0 +cb3: 0 cb4: 0 +cb0: 0 cb1: 0 cb2: 0 +cb3: 0 cb4: 0 +cb0: 0 cb1: 0 cb2: 0 +cb3: 0 cb4: 0" || exit 1 + remove_prog in +} + +function test_drop_all { + test_start "test_drop_all on lwt $1" + install_test $1 drop_all + ping -c 3 $IPVETH1 && { + failure "test_drop_all ${1}: Unexpected success of ping" + } + match_trace "$(get_trace)" " +dropping with: 2 +dropping with: 2 +dropping with: 2" || exit 1 + remove_prog $1 +} + +function test_drop_all_in { + test_start "test_drop_all on lwt in" + 
install_test in drop_all + ping -c 3 $IP_LOCAL && { + failure "test_drop_all in: Unexpected success of ping" + } + match_trace "$(get_trace)" " +dropping with: 2 +dropping with: 2 +dropping with: 2" || exit 1 + remove_prog in +} + +function test_push_ll_and_redirect { + test_start "test_push_ll_and_redirect on lwt xmit" + install_test xmit push_ll_and_redirect + ping -c 3 $IPVETH1 || { + failure "Redirected packets appear to be dropped" + } + match_trace "$(get_trace)" " +redirected to $DST_IFINDEX +redirected to $DST_IFINDEX +redirected to $DST_IFINDEX" || exit 1 + remove_prog xmit +} + +function test_no_l2_and_redirect { + test_start "test_no_l2_and_redirect on lwt xmit" + install_test xmit fill_garbage_and_redirect + ping -c 3 $IPVETH1 && { + failure "Unexpected success despite lack of L2 header" + } + match_trace "$(get_trace)" " +redirected to $DST_IFINDEX +redirected to $DST_IFINDEX +redirected to $DST_IFINDEX" || exit 1 + remove_prog xmit +} + +function test_rewrite { + test_start "test_rewrite on lwt xmit" + install_test xmit test_rewrite + ping -c 3 $IPVETH1 || { + failure "Rewritten packets appear to be dropped" + } + match_trace "$(get_trace)" " +out: rewriting from 2fea8c0 to 3fea8c0 +out: rewriting from 2fea8c0 to 3fea8c0 +out: rewriting from 2fea8c0 to 3fea8c0" || exit 1 + remove_prog out +} + +function test_fill_garbage { + test_start "test_fill_garbage on lwt xmit" + install_test xmit fill_garbage + ping -c 3 $IPVETH1 && { + failure "test_drop_all ${1}: Unexpected success of ping" + } + match_trace "$(get_trace)" " +Set initial 96 bytes of header to FF +Set initial 96 bytes of header to FF +Set initial 96 bytes of header to FF" || exit 1 + remove_prog xmit +} + +function test_netperf_nop { + test_start "test_netperf_nop on lwt xmit" + install_test xmit nop + netperf -H $IPVETH1 -t TCP_STREAM || { + failure "packets appear to be dropped" + } + match_trace "$(get_trace)" ""|| exit 1 + remove_prog xmit +} + +function test_netperf_redirect { + 
test_start "test_netperf_redirect on lwt xmit" + install_test xmit push_ll_and_redirect_silent + netperf -H $IPVETH1 -t TCP_STREAM || { + failure "Rewritten packets appear to be dropped" + } + match_trace "$(get_trace)" ""|| exit 1 + remove_prog xmit +} + +cleanup +setup_one_veth $NS1 $VETH0 $VETH1 $IPVETH0 $IPVETH1 $IPVETH1b +setup_one_veth $NS2 $VETH2 $VETH3 $IPVETH2 $IPVETH3 +ip netns exec $NS1 netserver +echo 1 > ${TRACE_ROOT}/tracing_on + +DST_MAC=$(lookup_mac $VETH1 $NS1) +SRC_MAC=$(lookup_mac $VETH0) +DST_IFINDEX=$(cat /sys/class/net/$VETH0/ifindex) + +CLANG_OPTS="-O2 -target bpf -I ../include/" +CLANG_OPTS+=" -DSRC_MAC=$SRC_MAC -DDST_MAC=$DST_MAC -DDST_IFINDEX=$DST_IFINDEX" +clang $CLANG_OPTS -c test_lwt_bpf.c -o test_lwt_bpf.o + +test_ctx_xmit +test_ctx_out +test_ctx_in +test_data "xmit" +test_data "out" +test_data_in +test_cb "xmit" +test_cb "out" +test_cb_in +test_drop_all "xmit" +test_drop_all "out" +test_drop_all_in +test_rewrite +test_push_ll_and_redirect +test_no_l2_and_redirect +test_fill_garbage +test_netperf_nop +test_netperf_redirect + +cleanup +echo 0 > ${TRACE_ROOT}/tracing_on +exit 0 diff --git a/samples/bpf/test_map_in_map_kern.c b/samples/bpf/test_map_in_map_kern.c new file mode 100644 index 000000000..b0200c8ea --- /dev/null +++ b/samples/bpf/test_map_in_map_kern.c @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#define KBUILD_MODNAME "foo" +#include <linux/ptrace.h> +#include <linux/version.h> +#include <uapi/linux/bpf.h> +#include <uapi/linux/in6.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> +#include "trace_common.h" + +#define MAX_NR_PORTS 65536 + +/* map #0 */ +struct inner_a { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, int); + __uint(max_entries, MAX_NR_PORTS); +} port_a SEC(".maps"); + +/* map #1 */ +struct inner_h { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, int); + __uint(max_entries, 1); +} port_h SEC(".maps"); + +/* map #2 */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, int); + __uint(max_entries, 1); +} reg_result_h SEC(".maps"); + +/* map #3 */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, int); + __uint(max_entries, 1); +} inline_result_h SEC(".maps"); + +/* map #4 */ /* Test case #0 */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, MAX_NR_PORTS); + __uint(key_size, sizeof(u32)); + __array(values, struct inner_a); /* use inner_a as inner map */ +} a_of_port_a SEC(".maps"); + +/* map #5 */ /* Test case #1 */ +struct { + __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); + __uint(max_entries, 1); + __uint(key_size, sizeof(u32)); + __array(values, struct inner_a); /* use inner_a as inner map */ +} h_of_port_a SEC(".maps"); + +/* map #6 */ /* Test case #2 */ +struct { + __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); + __uint(max_entries, 1); + __uint(key_size, sizeof(u32)); + __array(values, struct inner_h); /* use inner_h as inner map */ +} h_of_port_h SEC(".maps"); + +static __always_inline int do_reg_lookup(void *inner_map, u32 port) +{ + int *result; + + result = bpf_map_lookup_elem(inner_map, &port); + return result ? 
*result : -ENOENT;
+}
+
+static __always_inline int do_inline_array_lookup(void *inner_map, u32 port)
+{
+	int *result;
+
+	if (inner_map != &port_a)
+		return -EINVAL;
+
+	result = bpf_map_lookup_elem(&port_a, &port);
+	return result ? *result : -ENOENT;
+}
+
+static __always_inline int do_inline_hash_lookup(void *inner_map, u32 port)
+{
+	int *result;
+
+	if (inner_map != &port_h)
+		return -EINVAL;
+
+	result = bpf_map_lookup_elem(&port_h, &port);
+	return result ? *result : -ENOENT;
+}
+
+/*
+ * Probe on __sys_connect. Only calls whose address length equals
+ * sizeof(struct sockaddr_in6) and whose address starts with the 16-bit
+ * words 0xdead, 0xbeef are handled. The last 16-bit word of the address
+ * selects the outer map (0, 1 or 2) and the port is used as the lookup key.
+ *
+ * The value found through the looked-up inner map is stored in
+ * reg_result_h and the value found through the directly referenced inner
+ * map is stored in inline_result_h, both under key 0. On failure the
+ * helper error code, or the source line number of the failing step, is
+ * stored instead. Always returns 0.
+ */
+SEC("kprobe/__sys_connect")
+int trace_sys_connect(struct pt_regs *ctx)
+{
+	struct sockaddr_in6 *in6;
+	u16 test_case, port, dst6[8];
+	int addrlen, ret, inline_ret, ret_key = 0;
+	u32 port_key;
+	void *outer_map, *inner_map;
+	bool inline_hash = false;
+
+	in6 = (struct sockaddr_in6 *)PT_REGS_PARM2_CORE(ctx);
+	addrlen = (int)PT_REGS_PARM3_CORE(ctx);
+
+	if (addrlen != sizeof(*in6))
+		return 0;
+
+	ret = bpf_probe_read_user(dst6, sizeof(dst6), &in6->sin6_addr);
+	if (ret) {
+		inline_ret = ret;
+		goto done;
+	}
+
+	if (dst6[0] != 0xdead || dst6[1] != 0xbeef)
+		return 0;
+
+	test_case = dst6[7];
+
+	ret = bpf_probe_read_user(&port, sizeof(port), &in6->sin6_port);
+	if (ret) {
+		inline_ret = ret;
+		goto done;
+	}
+
+	port_key = port;
+
+	ret = -ENOENT;
+	if (test_case == 0) {
+		outer_map = &a_of_port_a;
+	} else if (test_case == 1) {
+		outer_map = &h_of_port_a;
+	} else if (test_case == 2) {
+		outer_map = &h_of_port_h;
+	} else {
+		ret = __LINE__;
+		inline_ret = ret;
+		goto done;
+	}
+
+	inner_map = bpf_map_lookup_elem(outer_map, &port_key);
+	if (!inner_map) {
+		ret = __LINE__;
+		inline_ret = ret;
+		goto done;
+	}
+
+	ret = do_reg_lookup(inner_map, port_key);
+
+	if (test_case == 0 || test_case == 1)
+		inline_ret = do_inline_array_lookup(inner_map, port_key);
+	else
+		inline_ret = do_inline_hash_lookup(inner_map, port_key);
+
+done:
+	/* Publish both results for the user-space side to read back. */
+	bpf_map_update_elem(&reg_result_h, &ret_key, &ret, BPF_ANY);
+	bpf_map_update_elem(&inline_result_h, &ret_key, &inline_ret, BPF_ANY);
+
+	return 0;
+} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/test_map_in_map_user.c b/samples/bpf/test_map_in_map_user.c new file mode 100644 index 000000000..98656de56 --- /dev/null +++ b/samples/bpf/test_map_in_map_user.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2017 Facebook + */ +#include <sys/resource.h> +#include <sys/socket.h> +#include <arpa/inet.h> +#include <stdint.h> +#include <assert.h> +#include <errno.h> +#include <stdlib.h> +#include <stdio.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +static int map_fd[7]; + +#define PORT_A (map_fd[0]) +#define PORT_H (map_fd[1]) +#define REG_RESULT_H (map_fd[2]) +#define INLINE_RESULT_H (map_fd[3]) +#define A_OF_PORT_A (map_fd[4]) /* Test case #0 */ +#define H_OF_PORT_A (map_fd[5]) /* Test case #1 */ +#define H_OF_PORT_H (map_fd[6]) /* Test case #2 */ + +static const char * const test_names[] = { + "Array of Array", + "Hash of Array", + "Hash of Hash", +}; + +#define NR_TESTS (sizeof(test_names) / sizeof(*test_names)) + +static void check_map_id(int inner_map_fd, int map_in_map_fd, uint32_t key) +{ + struct bpf_map_info info = {}; + uint32_t info_len = sizeof(info); + int ret, id; + + ret = bpf_obj_get_info_by_fd(inner_map_fd, &info, &info_len); + assert(!ret); + + ret = bpf_map_lookup_elem(map_in_map_fd, &key, &id); + assert(!ret); + assert(id == info.id); +} + +static void populate_map(uint32_t port_key, int magic_result) +{ + int ret; + + ret = bpf_map_update_elem(PORT_A, &port_key, &magic_result, BPF_ANY); + assert(!ret); + + ret = bpf_map_update_elem(PORT_H, &port_key, &magic_result, + BPF_NOEXIST); + assert(!ret); + + ret = bpf_map_update_elem(A_OF_PORT_A, &port_key, &PORT_A, BPF_ANY); + assert(!ret); + check_map_id(PORT_A, A_OF_PORT_A, port_key); + + ret = bpf_map_update_elem(H_OF_PORT_A, &port_key, &PORT_A, BPF_NOEXIST); + assert(!ret); + check_map_id(PORT_A, H_OF_PORT_A, port_key); + + ret = 
bpf_map_update_elem(H_OF_PORT_H, &port_key, &PORT_H, BPF_NOEXIST); + assert(!ret); + check_map_id(PORT_H, H_OF_PORT_H, port_key); +} + +static void test_map_in_map(void) +{ + struct sockaddr_in6 in6 = { .sin6_family = AF_INET6 }; + uint32_t result_key = 0, port_key; + int result, inline_result; + int magic_result = 0xfaceb00c; + int ret; + int i; + + port_key = rand() & 0x00FF; + populate_map(port_key, magic_result); + + in6.sin6_addr.s6_addr16[0] = 0xdead; + in6.sin6_addr.s6_addr16[1] = 0xbeef; + in6.sin6_port = port_key; + + for (i = 0; i < NR_TESTS; i++) { + printf("%s: ", test_names[i]); + + in6.sin6_addr.s6_addr16[7] = i; + ret = connect(-1, (struct sockaddr *)&in6, sizeof(in6)); + assert(ret == -1 && errno == EBADF); + + ret = bpf_map_lookup_elem(REG_RESULT_H, &result_key, &result); + assert(!ret); + + ret = bpf_map_lookup_elem(INLINE_RESULT_H, &result_key, + &inline_result); + assert(!ret); + + if (result != magic_result || inline_result != magic_result) { + printf("Error. result:%d inline_result:%d\n", + result, inline_result); + exit(1); + } + + bpf_map_delete_elem(REG_RESULT_H, &result_key); + bpf_map_delete_elem(INLINE_RESULT_H, &result_key); + + printf("Pass\n"); + } +} + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct bpf_object *obj; + char filename[256]; + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + prog = bpf_object__find_program_by_name(obj, "trace_sys_connect"); + if (!prog) { + printf("finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto 
cleanup; + } + + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "port_a"); + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "port_h"); + map_fd[2] = bpf_object__find_map_fd_by_name(obj, "reg_result_h"); + map_fd[3] = bpf_object__find_map_fd_by_name(obj, "inline_result_h"); + map_fd[4] = bpf_object__find_map_fd_by_name(obj, "a_of_port_a"); + map_fd[5] = bpf_object__find_map_fd_by_name(obj, "h_of_port_a"); + map_fd[6] = bpf_object__find_map_fd_by_name(obj, "h_of_port_h"); + if (map_fd[0] < 0 || map_fd[1] < 0 || map_fd[2] < 0 || + map_fd[3] < 0 || map_fd[4] < 0 || map_fd[5] < 0 || map_fd[6] < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; + } + + test_map_in_map(); + +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); + return 0; +} diff --git a/samples/bpf/test_overhead_kprobe_kern.c b/samples/bpf/test_overhead_kprobe_kern.c new file mode 100644 index 000000000..f6d593e47 --- /dev/null +++ b/samples/bpf/test_overhead_kprobe_kern.c @@ -0,0 +1,48 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#include <linux/version.h> +#include <linux/ptrace.h> +#include <uapi/linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +#define _(P) \ + ({ \ + typeof(P) val = 0; \ + bpf_probe_read_kernel(&val, sizeof(val), &(P)); \ + val; \ + }) + +SEC("kprobe/__set_task_comm") +int prog(struct pt_regs *ctx) +{ + struct signal_struct *signal; + struct task_struct *tsk; + char oldcomm[16] = {}; + char newcomm[16] = {}; + u16 oom_score_adj; + u32 pid; + + tsk = (void *)PT_REGS_PARM1(ctx); + + pid = _(tsk->pid); + bpf_probe_read_kernel(oldcomm, sizeof(oldcomm), &tsk->comm); + bpf_probe_read_kernel(newcomm, sizeof(newcomm), + (void *)PT_REGS_PARM2(ctx)); + signal = _(tsk->signal); + oom_score_adj = _(signal->oom_score_adj); + return 0; +} + +SEC("kprobe/urandom_read") +int prog2(struct pt_regs *ctx) +{ + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/test_overhead_raw_tp_kern.c b/samples/bpf/test_overhead_raw_tp_kern.c new file mode 100644 index 000000000..8763181a3 --- /dev/null +++ b/samples/bpf/test_overhead_raw_tp_kern.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2018 Facebook */ +#include <uapi/linux/bpf.h> +#include <bpf/bpf_helpers.h> + +SEC("raw_tracepoint/task_rename") +int prog(struct bpf_raw_tracepoint_args *ctx) +{ + return 0; +} + +SEC("raw_tracepoint/urandom_read") +int prog2(struct bpf_raw_tracepoint_args *ctx) +{ + return 0; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/test_overhead_tp_kern.c b/samples/bpf/test_overhead_tp_kern.c new file mode 100644 index 000000000..eaa32693f --- /dev/null +++ b/samples/bpf/test_overhead_tp_kern.c @@ -0,0 +1,36 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#include <uapi/linux/bpf.h> +#include <bpf/bpf_helpers.h> + +/* from /sys/kernel/debug/tracing/events/task/task_rename/format */ +struct task_rename { + __u64 pad; + __u32 pid; + char oldcomm[16]; + char newcomm[16]; + __u16 oom_score_adj; +}; +SEC("tracepoint/task/task_rename") +int prog(struct task_rename *ctx) +{ + return 0; +} + +/* from /sys/kernel/debug/tracing/events/random/urandom_read/format */ +struct urandom_read { + __u64 pad; + int got_bits; + int pool_left; + int input_left; +}; +SEC("tracepoint/random/urandom_read") +int prog2(struct urandom_read *ctx) +{ + return 0; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/test_overhead_user.c b/samples/bpf/test_overhead_user.c new file mode 100644 index 000000000..94f74112a --- /dev/null +++ b/samples/bpf/test_overhead_user.c @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2016 Facebook + */ +#define _GNU_SOURCE +#include <sched.h> +#include <errno.h> +#include <stdio.h> +#include <sys/types.h> +#include <asm/unistd.h> +#include <fcntl.h> +#include <unistd.h> +#include <assert.h> +#include <sys/wait.h> +#include <stdlib.h> +#include <signal.h> +#include <linux/bpf.h> +#include <string.h> +#include <time.h> +#include <sys/resource.h> +#include <bpf/bpf.h> +#include "bpf_load.h" + +#define MAX_CNT 1000000 + +static __u64 time_get_ns(void) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec * 1000000000ull + ts.tv_nsec; +} + +static void test_task_rename(int cpu) +{ + __u64 start_time; + char buf[] = "test\n"; + int i, fd; + + fd = open("/proc/self/comm", O_WRONLY|O_TRUNC); + if (fd < 0) { + printf("couldn't open /proc\n"); + exit(1); + } + start_time = time_get_ns(); + for (i = 0; i < MAX_CNT; i++) { + if (write(fd, buf, sizeof(buf)) < 0) { + printf("task rename failed: %s\n", strerror(errno)); + close(fd); + return; + } + } + printf("task_rename:%d: %lld events per sec\n", + cpu, MAX_CNT * 1000000000ll / (time_get_ns() - 
start_time)); + close(fd); +} + +static void test_urandom_read(int cpu) +{ + __u64 start_time; + char buf[4]; + int i, fd; + + fd = open("/dev/urandom", O_RDONLY); + if (fd < 0) { + printf("couldn't open /dev/urandom\n"); + exit(1); + } + start_time = time_get_ns(); + for (i = 0; i < MAX_CNT; i++) { + if (read(fd, buf, sizeof(buf)) < 0) { + printf("failed to read from /dev/urandom: %s\n", strerror(errno)); + close(fd); + return; + } + } + printf("urandom_read:%d: %lld events per sec\n", + cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time)); + close(fd); +} + +static void loop(int cpu, int flags) +{ + cpu_set_t cpuset; + + CPU_ZERO(&cpuset); + CPU_SET(cpu, &cpuset); + sched_setaffinity(0, sizeof(cpuset), &cpuset); + + if (flags & 1) + test_task_rename(cpu); + if (flags & 2) + test_urandom_read(cpu); +} + +static void run_perf_test(int tasks, int flags) +{ + pid_t pid[tasks]; + int i; + + for (i = 0; i < tasks; i++) { + pid[i] = fork(); + if (pid[i] == 0) { + loop(i, flags); + exit(0); + } else if (pid[i] == -1) { + printf("couldn't spawn #%d process\n", i); + exit(1); + } + } + for (i = 0; i < tasks; i++) { + int status; + + assert(waitpid(pid[i], &status, 0) == pid[i]); + assert(status == 0); + } +} + +static void unload_progs(void) +{ + close(prog_fd[0]); + close(prog_fd[1]); + close(event_fd[0]); + close(event_fd[1]); +} + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + char filename[256]; + int num_cpu = 8; + int test_flags = ~0; + + setrlimit(RLIMIT_MEMLOCK, &r); + + if (argc > 1) + test_flags = atoi(argv[1]) ? : test_flags; + if (argc > 2) + num_cpu = atoi(argv[2]) ? 
: num_cpu; + + if (test_flags & 0x3) { + printf("BASE\n"); + run_perf_test(num_cpu, test_flags); + } + + if (test_flags & 0xC) { + snprintf(filename, sizeof(filename), + "%s_kprobe_kern.o", argv[0]); + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + printf("w/KPROBE\n"); + run_perf_test(num_cpu, test_flags >> 2); + unload_progs(); + } + + if (test_flags & 0x30) { + snprintf(filename, sizeof(filename), + "%s_tp_kern.o", argv[0]); + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + printf("w/TRACEPOINT\n"); + run_perf_test(num_cpu, test_flags >> 4); + unload_progs(); + } + + if (test_flags & 0xC0) { + snprintf(filename, sizeof(filename), + "%s_raw_tp_kern.o", argv[0]); + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + printf("w/RAW_TRACEPOINT\n"); + run_perf_test(num_cpu, test_flags >> 6); + unload_progs(); + } + + return 0; +} diff --git a/samples/bpf/test_override_return.sh b/samples/bpf/test_override_return.sh new file mode 100755 index 000000000..35db26f73 --- /dev/null +++ b/samples/bpf/test_override_return.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +rm -r tmpmnt +rm -f testfile.img +dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1 +DEVICE=$(losetup --show -f testfile.img) +mkfs.btrfs -f $DEVICE +mkdir tmpmnt +./tracex7 $DEVICE +if [ $? -eq 0 ] +then + echo "SUCCESS!" +else + echo "FAILED!" +fi +losetup -d $DEVICE diff --git a/samples/bpf/test_probe_write_user_kern.c b/samples/bpf/test_probe_write_user_kern.c new file mode 100644 index 000000000..220a96438 --- /dev/null +++ b/samples/bpf/test_probe_write_user_kern.c @@ -0,0 +1,56 @@ +/* Copyright (c) 2016 Sargun Dhillon <sargun@sargun.me> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <uapi/linux/bpf.h> +#include <linux/version.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> +#include "trace_common.h" + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct sockaddr_in); + __type(value, struct sockaddr_in); + __uint(max_entries, 256); +} dnat_map SEC(".maps"); + +/* kprobe is NOT a stable ABI + * kernel functions can be removed, renamed or completely change semantics. + * Number of arguments and their positions can change, etc. + * In such case this bpf+kprobe example will no longer be meaningful + * + * This example sits on a syscall, and the syscall ABI is relatively stable + * of course, across platforms, and over time, the ABI may change. + */ +SEC("kprobe/" SYSCALL(sys_connect)) +int bpf_prog1(struct pt_regs *ctx) +{ + struct pt_regs *real_regs = (struct pt_regs *)PT_REGS_PARM1_CORE(ctx); + void *sockaddr_arg = (void *)PT_REGS_PARM2_CORE(real_regs); + int sockaddr_len = (int)PT_REGS_PARM3_CORE(real_regs); + struct sockaddr_in new_addr, orig_addr = {}; + struct sockaddr_in *mapped_addr; + + if (sockaddr_len > sizeof(orig_addr)) + return 0; + + if (bpf_probe_read_user(&orig_addr, sizeof(orig_addr), sockaddr_arg) != 0) + return 0; + + mapped_addr = bpf_map_lookup_elem(&dnat_map, &orig_addr); + if (mapped_addr != NULL) { + memcpy(&new_addr, mapped_addr, sizeof(new_addr)); + bpf_probe_write_user(sockaddr_arg, &new_addr, + sizeof(new_addr)); + } + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/test_probe_write_user_user.c b/samples/bpf/test_probe_write_user_user.c new file mode 100644 index 000000000..00ccfb834 --- /dev/null +++ b/samples/bpf/test_probe_write_user_user.c @@ -0,0 +1,108 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <stdio.h> +#include <assert.h> +#include <unistd.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> 
+#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> + +int main(int ac, char **argv) +{ + struct sockaddr_in *serv_addr_in, *mapped_addr_in, *tmp_addr_in; + struct sockaddr serv_addr, mapped_addr, tmp_addr; + int serverfd, serverconnfd, clientfd, map_fd; + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct bpf_object *obj; + socklen_t sockaddr_len; + char filename[256]; + char *ip; + + serv_addr_in = (struct sockaddr_in *)&serv_addr; + mapped_addr_in = (struct sockaddr_in *)&mapped_addr; + tmp_addr_in = (struct sockaddr_in *)&tmp_addr; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (libbpf_get_error(prog)) { + fprintf(stderr, "ERROR: finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd = bpf_object__find_map_fd_by_name(obj, "dnat_map"); + if (map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; + } + + assert((serverfd = socket(AF_INET, SOCK_STREAM, 0)) > 0); + assert((clientfd = socket(AF_INET, SOCK_STREAM, 0)) > 0); + + /* Bind server to ephemeral port on lo */ + memset(&serv_addr, 0, sizeof(serv_addr)); + serv_addr_in->sin_family = AF_INET; + serv_addr_in->sin_port = 0; + serv_addr_in->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + + assert(bind(serverfd, &serv_addr, sizeof(serv_addr)) == 0); + + sockaddr_len = sizeof(serv_addr); + assert(getsockname(serverfd, &serv_addr, &sockaddr_len) == 0); + ip = inet_ntoa(serv_addr_in->sin_addr); + 
printf("Server bound to: %s:%d\n", ip, ntohs(serv_addr_in->sin_port)); + + memset(&mapped_addr, 0, sizeof(mapped_addr)); + mapped_addr_in->sin_family = AF_INET; + mapped_addr_in->sin_port = htons(5555); + mapped_addr_in->sin_addr.s_addr = inet_addr("255.255.255.255"); + + assert(!bpf_map_update_elem(map_fd, &mapped_addr, &serv_addr, BPF_ANY)); + + assert(listen(serverfd, 5) == 0); + + ip = inet_ntoa(mapped_addr_in->sin_addr); + printf("Client connecting to: %s:%d\n", + ip, ntohs(mapped_addr_in->sin_port)); + assert(connect(clientfd, &mapped_addr, sizeof(mapped_addr)) == 0); + + sockaddr_len = sizeof(tmp_addr); + ip = inet_ntoa(tmp_addr_in->sin_addr); + assert((serverconnfd = accept(serverfd, &tmp_addr, &sockaddr_len)) > 0); + printf("Server received connection from: %s:%d\n", + ip, ntohs(tmp_addr_in->sin_port)); + + sockaddr_len = sizeof(tmp_addr); + assert(getpeername(clientfd, &tmp_addr, &sockaddr_len) == 0); + ip = inet_ntoa(tmp_addr_in->sin_addr); + printf("Client's peer address: %s:%d\n", + ip, ntohs(tmp_addr_in->sin_port)); + + /* Is the server's getsockname = the socket getpeername */ + assert(memcmp(&serv_addr, &tmp_addr, sizeof(struct sockaddr_in)) == 0); + +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); + return 0; +} diff --git a/samples/bpf/trace_common.h b/samples/bpf/trace_common.h new file mode 100644 index 000000000..8cb5400ae --- /dev/null +++ b/samples/bpf/trace_common.h @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef __TRACE_COMMON_H +#define __TRACE_COMMON_H + +#ifdef __x86_64__ +#define SYSCALL(SYS) "__x64_" __stringify(SYS) +#elif defined(__s390x__) +#define SYSCALL(SYS) "__s390x_" __stringify(SYS) +#else +#define SYSCALL(SYS) __stringify(SYS) +#endif + +#endif diff --git a/samples/bpf/trace_event_kern.c b/samples/bpf/trace_event_kern.c new file mode 100644 index 000000000..7d3c66fb3 --- /dev/null +++ b/samples/bpf/trace_event_kern.c @@ -0,0 +1,80 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free 
software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/ptrace.h> +#include <linux/version.h> +#include <uapi/linux/bpf.h> +#include <uapi/linux/bpf_perf_event.h> +#include <uapi/linux/perf_event.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +struct key_t { + char comm[TASK_COMM_LEN]; + u32 kernstack; + u32 userstack; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct key_t); + __type(value, u64); + __uint(max_entries, 10000); +} counts SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_STACK_TRACE); + __uint(key_size, sizeof(u32)); + __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64)); + __uint(max_entries, 10000); +} stackmap SEC(".maps"); + +#define KERN_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP) +#define USER_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK) + +SEC("perf_event") +int bpf_prog1(struct bpf_perf_event_data *ctx) +{ + char time_fmt1[] = "Time Enabled: %llu, Time Running: %llu"; + char time_fmt2[] = "Get Time Failed, ErrCode: %d"; + char addr_fmt[] = "Address recorded on event: %llx"; + char fmt[] = "CPU-%d period %lld ip %llx"; + u32 cpu = bpf_get_smp_processor_id(); + struct bpf_perf_event_value value_buf; + struct key_t key; + u64 *val, one = 1; + int ret; + + if (ctx->sample_period < 10000) + /* ignore warmup */ + return 0; + bpf_get_current_comm(&key.comm, sizeof(key.comm)); + key.kernstack = bpf_get_stackid(ctx, &stackmap, KERN_STACKID_FLAGS); + key.userstack = bpf_get_stackid(ctx, &stackmap, USER_STACKID_FLAGS); + if ((int)key.kernstack < 0 && (int)key.userstack < 0) { + bpf_trace_printk(fmt, sizeof(fmt), cpu, ctx->sample_period, + PT_REGS_IP(&ctx->regs)); + return 0; + } + + ret = bpf_perf_prog_read_value(ctx, (void *)&value_buf, sizeof(struct bpf_perf_event_value)); + if (!ret) + bpf_trace_printk(time_fmt1, sizeof(time_fmt1), value_buf.enabled, 
value_buf.running); + else + bpf_trace_printk(time_fmt2, sizeof(time_fmt2), ret); + + if (ctx->addr != 0) + bpf_trace_printk(addr_fmt, sizeof(addr_fmt), ctx->addr); + + val = bpf_map_lookup_elem(&counts, &key); + if (val) + (*val)++; + else + bpf_map_update_elem(&counts, &key, &one, BPF_NOEXIST); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c new file mode 100644 index 000000000..ac1ba3681 --- /dev/null +++ b/samples/bpf/trace_event_user.c @@ -0,0 +1,354 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2016 Facebook + */ +#include <stdio.h> +#include <unistd.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <linux/perf_event.h> +#include <linux/bpf.h> +#include <signal.h> +#include <errno.h> +#include <sys/resource.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> +#include "perf-sys.h" +#include "trace_helpers.h" + +#define SAMPLE_FREQ 50 + +static int pid; +/* counts, stackmap */ +static int map_fd[2]; +struct bpf_program *prog; +static bool sys_read_seen, sys_write_seen; + +static void print_ksym(__u64 addr) +{ + struct ksym *sym; + + if (!addr) + return; + sym = ksym_search(addr); + if (!sym) { + printf("ksym not found. 
Is kallsyms loaded?\n"); + return; + } + + printf("%s;", sym->name); + if (!strstr(sym->name, "sys_read")) + sys_read_seen = true; + else if (!strstr(sym->name, "sys_write")) + sys_write_seen = true; +} + +static void print_addr(__u64 addr) +{ + if (!addr) + return; + printf("%llx;", addr); +} + +#define TASK_COMM_LEN 16 + +struct key_t { + char comm[TASK_COMM_LEN]; + __u32 kernstack; + __u32 userstack; +}; + +static void print_stack(struct key_t *key, __u64 count) +{ + __u64 ip[PERF_MAX_STACK_DEPTH] = {}; + static bool warned; + int i; + + printf("%3lld %s;", count, key->comm); + if (bpf_map_lookup_elem(map_fd[1], &key->kernstack, ip) != 0) { + printf("---;"); + } else { + for (i = PERF_MAX_STACK_DEPTH - 1; i >= 0; i--) + print_ksym(ip[i]); + } + printf("-;"); + if (bpf_map_lookup_elem(map_fd[1], &key->userstack, ip) != 0) { + printf("---;"); + } else { + for (i = PERF_MAX_STACK_DEPTH - 1; i >= 0; i--) + print_addr(ip[i]); + } + if (count < 6) + printf("\r"); + else + printf("\n"); + + if (key->kernstack == -EEXIST && !warned) { + printf("stackmap collisions seen. 
Consider increasing size\n"); + warned = true; + } else if ((int)key->kernstack < 0 && (int)key->userstack < 0) { + printf("err stackid %d %d\n", key->kernstack, key->userstack); + } +} + +static void err_exit(int err) +{ + kill(pid, SIGKILL); + exit(err); +} + +static void print_stacks(void) +{ + struct key_t key = {}, next_key; + __u64 value; + __u32 stackid = 0, next_id; + int error = 1, fd = map_fd[0], stack_map = map_fd[1]; + + sys_read_seen = sys_write_seen = false; + while (bpf_map_get_next_key(fd, &key, &next_key) == 0) { + bpf_map_lookup_elem(fd, &next_key, &value); + print_stack(&next_key, value); + bpf_map_delete_elem(fd, &next_key); + key = next_key; + } + printf("\n"); + if (!sys_read_seen || !sys_write_seen) { + printf("BUG kernel stack doesn't contain sys_read() and sys_write()\n"); + err_exit(error); + } + + /* clear stack map */ + while (bpf_map_get_next_key(stack_map, &stackid, &next_id) == 0) { + bpf_map_delete_elem(stack_map, &next_id); + stackid = next_id; + } +} + +static inline int generate_load(void) +{ + if (system("dd if=/dev/zero of=/dev/null count=5000k status=none") < 0) { + printf("failed to generate some load with dd: %s\n", strerror(errno)); + return -1; + } + + return 0; +} + +static void test_perf_event_all_cpu(struct perf_event_attr *attr) +{ + int nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + struct bpf_link **links = calloc(nr_cpus, sizeof(struct bpf_link *)); + int i, pmu_fd, error = 1; + + if (!links) { + printf("malloc of links failed\n"); + goto err; + } + + /* system wide perf event, no need to inherit */ + attr->inherit = 0; + + /* open perf_event on all cpus */ + for (i = 0; i < nr_cpus; i++) { + pmu_fd = sys_perf_event_open(attr, -1, i, -1, 0); + if (pmu_fd < 0) { + printf("sys_perf_event_open failed\n"); + goto all_cpu_err; + } + links[i] = bpf_program__attach_perf_event(prog, pmu_fd); + if (libbpf_get_error(links[i])) { + printf("bpf_program__attach_perf_event failed\n"); + links[i] = NULL; + close(pmu_fd); + goto 
all_cpu_err; + } + } + + if (generate_load() < 0) + goto all_cpu_err; + + print_stacks(); + error = 0; +all_cpu_err: + for (i--; i >= 0; i--) + bpf_link__destroy(links[i]); +err: + free(links); + if (error) + err_exit(error); +} + +static void test_perf_event_task(struct perf_event_attr *attr) +{ + struct bpf_link *link = NULL; + int pmu_fd, error = 1; + + /* per task perf event, enable inherit so the "dd ..." command can be traced properly. + * Enabling inherit will cause bpf_perf_prog_read_time helper failure. + */ + attr->inherit = 1; + + /* open task bound event */ + pmu_fd = sys_perf_event_open(attr, 0, -1, -1, 0); + if (pmu_fd < 0) { + printf("sys_perf_event_open failed\n"); + goto err; + } + link = bpf_program__attach_perf_event(prog, pmu_fd); + if (libbpf_get_error(link)) { + printf("bpf_program__attach_perf_event failed\n"); + link = NULL; + close(pmu_fd); + goto err; + } + + if (generate_load() < 0) + goto err; + + print_stacks(); + error = 0; +err: + bpf_link__destroy(link); + if (error) + err_exit(error); +} + +static void test_bpf_perf_event(void) +{ + struct perf_event_attr attr_type_hw = { + .sample_freq = SAMPLE_FREQ, + .freq = 1, + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + }; + struct perf_event_attr attr_type_sw = { + .sample_freq = SAMPLE_FREQ, + .freq = 1, + .type = PERF_TYPE_SOFTWARE, + .config = PERF_COUNT_SW_CPU_CLOCK, + }; + struct perf_event_attr attr_hw_cache_l1d = { + .sample_freq = SAMPLE_FREQ, + .freq = 1, + .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_L1D | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), + }; + struct perf_event_attr attr_hw_cache_branch_miss = { + .sample_freq = SAMPLE_FREQ, + .freq = 1, + .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_BPU | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), + }; + struct perf_event_attr attr_type_raw = { + .sample_freq = SAMPLE_FREQ, + .freq = 1, + .type = 
PERF_TYPE_RAW, + /* Intel Instruction Retired */ + .config = 0xc0, + }; + struct perf_event_attr attr_type_raw_lock_load = { + .sample_freq = SAMPLE_FREQ, + .freq = 1, + .type = PERF_TYPE_RAW, + /* Intel MEM_UOPS_RETIRED.LOCK_LOADS */ + .config = 0x21d0, + /* Request to record lock address from PEBS */ + .sample_type = PERF_SAMPLE_ADDR, + /* Record address value requires precise event */ + .precise_ip = 2, + }; + + printf("Test HW_CPU_CYCLES\n"); + test_perf_event_all_cpu(&attr_type_hw); + test_perf_event_task(&attr_type_hw); + + printf("Test SW_CPU_CLOCK\n"); + test_perf_event_all_cpu(&attr_type_sw); + test_perf_event_task(&attr_type_sw); + + printf("Test HW_CACHE_L1D\n"); + test_perf_event_all_cpu(&attr_hw_cache_l1d); + test_perf_event_task(&attr_hw_cache_l1d); + + printf("Test HW_CACHE_BPU\n"); + test_perf_event_all_cpu(&attr_hw_cache_branch_miss); + test_perf_event_task(&attr_hw_cache_branch_miss); + + printf("Test Instruction Retired\n"); + test_perf_event_all_cpu(&attr_type_raw); + test_perf_event_task(&attr_type_raw); + + printf("Test Lock Load\n"); + test_perf_event_all_cpu(&attr_type_raw_lock_load); + test_perf_event_task(&attr_type_raw_lock_load); + + printf("*** PASS ***\n"); +} + + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_object *obj = NULL; + char filename[256]; + int error = 1; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + setrlimit(RLIMIT_MEMLOCK, &r); + + signal(SIGINT, err_exit); + signal(SIGTERM, err_exit); + + if (load_kallsyms()) { + printf("failed to process /proc/kallsyms\n"); + goto cleanup; + } + + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + printf("opening BPF object file failed\n"); + obj = NULL; + goto cleanup; + } + + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (!prog) { + printf("finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + 
printf("loading BPF object file failed\n"); + goto cleanup; + } + + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "counts"); + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "stackmap"); + if (map_fd[0] < 0 || map_fd[1] < 0) { + printf("finding a counts/stackmap map in obj file failed\n"); + goto cleanup; + } + + pid = fork(); + if (pid == 0) { + read_trace_pipe(); + return 0; + } else if (pid == -1) { + printf("couldn't spawn process\n"); + goto cleanup; + } + + test_bpf_perf_event(); + error = 0; + +cleanup: + bpf_object__close(obj); + err_exit(error); +} diff --git a/samples/bpf/trace_output_kern.c b/samples/bpf/trace_output_kern.c new file mode 100644 index 000000000..b64815af0 --- /dev/null +++ b/samples/bpf/trace_output_kern.c @@ -0,0 +1,31 @@ +#include <linux/ptrace.h> +#include <linux/version.h> +#include <uapi/linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include "trace_common.h" + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(u32)); + __uint(max_entries, 2); +} my_map SEC(".maps"); + +SEC("kprobe/" SYSCALL(sys_write)) +int bpf_prog1(struct pt_regs *ctx) +{ + struct S { + u64 pid; + u64 cookie; + } data; + + data.pid = bpf_get_current_pid_tgid(); + data.cookie = 0x12345678; + + bpf_perf_event_output(ctx, &my_map, 0, &data, sizeof(data)); + + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/trace_output_user.c b/samples/bpf/trace_output_user.c new file mode 100644 index 000000000..364b98764 --- /dev/null +++ b/samples/bpf/trace_output_user.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <stdio.h> +#include <fcntl.h> +#include <poll.h> +#include <time.h> +#include <signal.h> +#include <bpf/libbpf.h> + +static __u64 time_get_ns(void) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec * 1000000000ull + ts.tv_nsec; +} + +static __u64 start_time; 
+static __u64 cnt; + +#define MAX_CNT 100000ll + +static void print_bpf_output(void *ctx, int cpu, void *data, __u32 size) +{ + struct { + __u64 pid; + __u64 cookie; + } *e = data; + + if (e->cookie != 0x12345678) { + printf("BUG pid %llx cookie %llx sized %d\n", + e->pid, e->cookie, size); + return; + } + + cnt++; + + if (cnt == MAX_CNT) { + printf("recv %lld events per sec\n", + MAX_CNT * 1000000000ll / (time_get_ns() - start_time)); + return; + } +} + +int main(int argc, char **argv) +{ + struct perf_buffer_opts pb_opts = {}; + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct perf_buffer *pb; + struct bpf_object *obj; + int map_fd, ret = 0; + char filename[256]; + FILE *f; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd = bpf_object__find_map_fd_by_name(obj, "my_map"); + if (map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (libbpf_get_error(prog)) { + fprintf(stderr, "ERROR: finding a prog in obj file failed\n"); + goto cleanup; + } + + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; + } + + pb_opts.sample_cb = print_bpf_output; + pb = perf_buffer__new(map_fd, 8, &pb_opts); + ret = libbpf_get_error(pb); + if (ret) { + printf("failed to setup perf_buffer: %d\n", ret); + return 1; + } + + f = popen("taskset 1 dd if=/dev/zero of=/dev/null", "r"); + (void) f; + + start_time = time_get_ns(); + while ((ret = perf_buffer__poll(pb, 1000)) >= 0 && cnt < MAX_CNT) { + } + kill(0, SIGINT); + +cleanup: + 
bpf_link__destroy(link); + bpf_object__close(obj); + return ret; +} diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c new file mode 100644 index 000000000..ef30d2b35 --- /dev/null +++ b/samples/bpf/tracex1_kern.c @@ -0,0 +1,54 @@ +/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <uapi/linux/bpf.h> +#include <linux/version.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +#define _(P) \ + ({ \ + typeof(P) val = 0; \ + bpf_probe_read_kernel(&val, sizeof(val), &(P)); \ + val; \ + }) + +/* kprobe is NOT a stable ABI + * kernel functions can be removed, renamed or completely change semantics. + * Number of arguments and their positions can change, etc. + * In such case this bpf+kprobe example will no longer be meaningful + */ +SEC("kprobe/__netif_receive_skb_core") +int bpf_prog1(struct pt_regs *ctx) +{ + /* attaches to kprobe __netif_receive_skb_core, + * looks for packets on loobpack device and prints them + */ + char devname[IFNAMSIZ]; + struct net_device *dev; + struct sk_buff *skb; + int len; + + /* non-portable! 
works for the given kernel only */ + bpf_probe_read_kernel(&skb, sizeof(skb), (void *)PT_REGS_PARM1(ctx)); + dev = _(skb->dev); + len = _(skb->len); + + bpf_probe_read_kernel(devname, sizeof(devname), dev->name); + + if (devname[0] == 'l' && devname[1] == 'o') { + char fmt[] = "skb %p len %d\n"; + /* using bpf_trace_printk() for DEBUG ONLY */ + bpf_trace_printk(fmt, sizeof(fmt), skb, len); + } + + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/tracex1_user.c b/samples/bpf/tracex1_user.c new file mode 100644 index 000000000..9d4adb7fd --- /dev/null +++ b/samples/bpf/tracex1_user.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <stdio.h> +#include <unistd.h> +#include <bpf/libbpf.h> +#include "trace_helpers.h" + +int main(int ac, char **argv) +{ + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct bpf_object *obj; + char filename[256]; + FILE *f; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (!prog) { + fprintf(stderr, "ERROR: finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; + } + + f = popen("taskset 1 ping -c5 localhost", "r"); + (void) f; + + read_trace_pipe(); + +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); + return 0; +} diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c new file mode 100644 index 000000000..5bc696bac --- /dev/null +++ b/samples/bpf/tracex2_kern.c @@ -0,0 
+1,102 @@ +/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/version.h> +#include <uapi/linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "trace_common.h" + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, long); + __type(value, long); + __uint(max_entries, 1024); +} my_map SEC(".maps"); + +/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe + * example will no longer be meaningful + */ +SEC("kprobe/kfree_skb") +int bpf_prog2(struct pt_regs *ctx) +{ + long loc = 0; + long init_val = 1; + long *value; + + /* read ip of kfree_skb caller. + * non-portable version of __builtin_return_address(0) + */ + BPF_KPROBE_READ_RET_IP(loc, ctx); + + value = bpf_map_lookup_elem(&my_map, &loc); + if (value) + *value += 1; + else + bpf_map_update_elem(&my_map, &loc, &init_val, BPF_ANY); + return 0; +} + +static unsigned int log2(unsigned int v) +{ + unsigned int r; + unsigned int shift; + + r = (v > 0xFFFF) << 4; v >>= r; + shift = (v > 0xFF) << 3; v >>= shift; r |= shift; + shift = (v > 0xF) << 2; v >>= shift; r |= shift; + shift = (v > 0x3) << 1; v >>= shift; r |= shift; + r |= (v >> 1); + return r; +} + +static unsigned int log2l(unsigned long v) +{ + unsigned int hi = v >> 32; + if (hi) + return log2(hi) + 32; + else + return log2(v); +} + +struct hist_key { + char comm[16]; + u64 pid_tgid; + u64 uid_gid; + u64 index; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(key_size, sizeof(struct hist_key)); + __uint(value_size, sizeof(long)); + __uint(max_entries, 1024); +} my_hist_map SEC(".maps"); + +SEC("kprobe/" SYSCALL(sys_write)) +int bpf_prog3(struct pt_regs *ctx) +{ + long write_size = PT_REGS_PARM3(ctx); + long 
/*
 * Render a histogram bar into @str: (width * val / max) - 1 '*' characters,
 * capped at width - 1, followed by a NUL. When @val exceeds @max and at
 * least one star was written, the last star becomes '+' to mark the overflow.
 *
 * @str:   output buffer, at least @width bytes (at least 1 byte)
 * @val:   value to plot
 * @max:   value corresponding to a full-width bar
 * @width: size of the bar including the terminating NUL
 *
 * A non-positive @max yields an empty string instead of dividing by zero.
 */
static void stars(char *str, long val, long max, int width)
{
	int i;

	if (max <= 0) {
		str[0] = '\0';
		return;
	}

	for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
		str[i] = '*';
	/* i > 0: never write before the start of the buffer */
	if (val > max && i > 0)
		str[i - 1] = '+';
	str[i] = '\0';
}
(value && ind > max_ind) + max_ind = ind; + if (value > max_value) + max_value = value; + key = next_key; + } + + printf(" syscall write() stats\n"); + printf(" byte_size : count distribution\n"); + for (i = 1; i <= max_ind + 1; i++) { + stars(starstr, data[i - 1], max_value, MAX_STARS); + printf("%8ld -> %-8ld : %-8ld |%-*s|\n", + (1l << i) >> 1, (1l << i) - 1, data[i - 1], + MAX_STARS, starstr); + } +} + +static void print_hist(int fd) +{ + struct hist_key key = {}, next_key; + static struct task tasks[1024]; + int task_cnt = 0; + int i; + + while (bpf_map_get_next_key(fd, &key, &next_key) == 0) { + int found = 0; + + for (i = 0; i < task_cnt; i++) + if (memcmp(&tasks[i], &next_key, SIZE) == 0) + found = 1; + if (!found) + memcpy(&tasks[task_cnt++], &next_key, SIZE); + key = next_key; + } + + for (i = 0; i < task_cnt; i++) { + printf("\npid %d cmd %s uid %d\n", + (__u32) tasks[i].pid_tgid, + tasks[i].comm, + (__u32) tasks[i].uid_gid); + print_hist_for_pid(fd, &tasks[i]); + } + +} + +static void int_exit(int sig) +{ + print_hist(map_fd[1]); + exit(0); +} + +int main(int ac, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + long key, next_key, value; + struct bpf_link *links[2]; + struct bpf_program *prog; + struct bpf_object *obj; + char filename[256]; + int i, j = 0; + FILE *f; + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "my_map"); + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "my_hist_map"); + if (map_fd[0] < 0 || map_fd[1] < 0) { + fprintf(stderr, "ERROR: finding a map in obj file 
failed\n"); + goto cleanup; + } + + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + /* start 'ping' in the background to have some kfree_skb events */ + f = popen("ping -4 -c5 localhost", "r"); + (void) f; + + /* start 'dd' in the background to have plenty of 'write' syscalls */ + f = popen("dd if=/dev/zero of=/dev/null count=5000000", "r"); + (void) f; + + bpf_object__for_each_program(prog, obj) { + links[j] = bpf_program__attach(prog); + if (libbpf_get_error(links[j])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[j] = NULL; + goto cleanup; + } + j++; + } + + for (i = 0; i < 5; i++) { + key = 0; + while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0) { + bpf_map_lookup_elem(map_fd[0], &next_key, &value); + printf("location 0x%lx count %ld\n", next_key, value); + key = next_key; + } + if (key) + printf("\n"); + sleep(1); + } + print_hist(map_fd[1]); + +cleanup: + for (j--; j >= 0; j--) + bpf_link__destroy(links[j]); + + bpf_object__close(obj); + return 0; +} diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c new file mode 100644 index 000000000..710a4410b --- /dev/null +++ b/samples/bpf/tracex3_kern.c @@ -0,0 +1,90 @@ +/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/version.h> +#include <uapi/linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, long); + __type(value, u64); + __uint(max_entries, 4096); +} my_map SEC(".maps"); + +/* kprobe is NOT a stable ABI. 
/*
 * Integer floor(log2(n)) computed by binary search over the bit width.
 *
 * Returns 0 for n == 0. The previous version returned (unsigned int)-1 for
 * that input, which is not a usable shift count for callers that compute
 * "1 << log2l(x)".
 */
static unsigned int log2l(unsigned long long n)
{
#define S(k) if (n >= (1ull << k)) { i += k; n >>= k; }
	unsigned int i = 0;

	S(32); S(16); S(8); S(4); S(2); S(1);
	return i;
#undef S
}
<string.h> +#include <sys/resource.h> + +#include <bpf/bpf.h> +#include <bpf/libbpf.h> +#include "bpf_util.h" + +#define SLOTS 100 + +static void clear_stats(int fd) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + __u64 values[nr_cpus]; + __u32 key; + + memset(values, 0, sizeof(values)); + for (key = 0; key < SLOTS; key++) + bpf_map_update_elem(fd, &key, values, BPF_ANY); +} + +const char *color[] = { + "\033[48;5;255m", + "\033[48;5;252m", + "\033[48;5;250m", + "\033[48;5;248m", + "\033[48;5;246m", + "\033[48;5;244m", + "\033[48;5;242m", + "\033[48;5;240m", + "\033[48;5;238m", + "\033[48;5;236m", + "\033[48;5;234m", + "\033[48;5;232m", +}; +const int num_colors = ARRAY_SIZE(color); + +const char nocolor[] = "\033[00m"; + +const char *sym[] = { + " ", + " ", + ".", + ".", + "*", + "*", + "o", + "o", + "O", + "O", + "#", + "#", +}; + +bool full_range = false; +bool text_only = false; + +static void print_banner(void) +{ + if (full_range) + printf("|1ns |10ns |100ns |1us |10us |100us" + " |1ms |10ms |100ms |1s |10s\n"); + else + printf("|1us |10us |100us |1ms |10ms " + "|100ms |1s |10s\n"); +} + +static void print_hist(int fd) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + __u64 total_events = 0; + long values[nr_cpus]; + __u64 max_cnt = 0; + __u64 cnt[SLOTS]; + __u64 value; + __u32 key; + int i; + + for (key = 0; key < SLOTS; key++) { + bpf_map_lookup_elem(fd, &key, values); + value = 0; + for (i = 0; i < nr_cpus; i++) + value += values[i]; + cnt[key] = value; + total_events += value; + if (value > max_cnt) + max_cnt = value; + } + clear_stats(fd); + for (key = full_range ? 
0 : 29; key < SLOTS; key++) { + int c = num_colors * cnt[key] / (max_cnt + 1); + + if (text_only) + printf("%s", sym[c]); + else + printf("%s %s", color[c], nocolor); + } + printf(" # %lld\n", total_events); +} + +int main(int ac, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_link *links[2]; + struct bpf_program *prog; + struct bpf_object *obj; + char filename[256]; + int map_fd, i, j = 0; + + for (i = 1; i < ac; i++) { + if (strcmp(argv[i], "-a") == 0) { + full_range = true; + } else if (strcmp(argv[i], "-t") == 0) { + text_only = true; + } else if (strcmp(argv[i], "-h") == 0) { + printf("Usage:\n" + " -a display wider latency range\n" + " -t text only\n"); + return 1; + } + } + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd = bpf_object__find_map_fd_by_name(obj, "lat_map"); + if (map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + links[j] = bpf_program__attach(prog); + if (libbpf_get_error(links[j])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[j] = NULL; + goto cleanup; + } + j++; + } + + printf(" heatmap of IO latency\n"); + if (text_only) + printf(" %s", sym[num_colors - 1]); + else + printf(" %s %s", color[num_colors - 1], nocolor); + printf(" - many events with this latency\n"); + + if (text_only) + printf(" %s", sym[0]); + else + printf(" %s %s", color[0], nocolor); + printf(" - few events\n"); + + for (i = 0; ; i++) { + if (i % 20 == 0) + print_banner(); + print_hist(map_fd); + sleep(2); + } + 
+cleanup: + for (j--; j >= 0; j--) + bpf_link__destroy(links[j]); + + bpf_object__close(obj); + return 0; +} diff --git a/samples/bpf/tracex4_kern.c b/samples/bpf/tracex4_kern.c new file mode 100644 index 000000000..eb0f8fdd1 --- /dev/null +++ b/samples/bpf/tracex4_kern.c @@ -0,0 +1,55 @@ +/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/ptrace.h> +#include <linux/version.h> +#include <uapi/linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +struct pair { + u64 val; + u64 ip; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, long); + __type(value, struct pair); + __uint(max_entries, 1000000); +} my_map SEC(".maps"); + +/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe + * example will no longer be meaningful + */ +SEC("kprobe/kmem_cache_free") +int bpf_prog1(struct pt_regs *ctx) +{ + long ptr = PT_REGS_PARM2(ctx); + + bpf_map_delete_elem(&my_map, &ptr); + return 0; +} + +SEC("kretprobe/kmem_cache_alloc_node") +int bpf_prog2(struct pt_regs *ctx) +{ + long ptr = PT_REGS_RC(ctx); + long ip = 0; + + /* get ip address of kmem_cache_alloc_node() caller */ + BPF_KRETPROBE_READ_RET_IP(ip, ctx); + + struct pair v = { + .val = bpf_ktime_get_ns(), + .ip = ip, + }; + + bpf_map_update_elem(&my_map, &ptr, &v, BPF_ANY); + return 0; +} +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c new file mode 100644 index 000000000..e8faf8f18 --- /dev/null +++ b/samples/bpf/tracex4_user.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com + */ +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <unistd.h> +#include 
<stdbool.h> +#include <string.h> +#include <time.h> +#include <sys/resource.h> + +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +struct pair { + long long val; + __u64 ip; +}; + +static __u64 time_get_ns(void) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec * 1000000000ull + ts.tv_nsec; +} + +static void print_old_objects(int fd) +{ + long long val = time_get_ns(); + __u64 key, next_key; + struct pair v; + + key = write(1, "\e[1;1H\e[2J", 12); /* clear screen */ + + key = -1; + while (bpf_map_get_next_key(fd, &key, &next_key) == 0) { + bpf_map_lookup_elem(fd, &next_key, &v); + key = next_key; + if (val - v.val < 1000000000ll) + /* object was allocated more then 1 sec ago */ + continue; + printf("obj 0x%llx is %2lldsec old was allocated at ip %llx\n", + next_key, (val - v.val) / 1000000000ll, v.ip); + } +} + +int main(int ac, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_link *links[2]; + struct bpf_program *prog; + struct bpf_object *obj; + char filename[256]; + int map_fd, i, j = 0; + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)"); + return 1; + } + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd = bpf_object__find_map_fd_by_name(obj, "my_map"); + if (map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + links[j] = bpf_program__attach(prog); + if (libbpf_get_error(links[j])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[j] = NULL; + goto cleanup; + } + j++; + } + + for (i = 0; ; i++) { + print_old_objects(map_fd); + 
sleep(1); + } + +cleanup: + for (j--; j >= 0; j--) + bpf_link__destroy(links[j]); + + bpf_object__close(obj); + return 0; +} diff --git a/samples/bpf/tracex5_kern.c b/samples/bpf/tracex5_kern.c new file mode 100644 index 000000000..64a1f7550 --- /dev/null +++ b/samples/bpf/tracex5_kern.c @@ -0,0 +1,93 @@ +/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/ptrace.h> +#include <linux/version.h> +#include <uapi/linux/bpf.h> +#include <uapi/linux/seccomp.h> +#include <uapi/linux/unistd.h> +#include "syscall_nrs.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +#define PROG(F) SEC("kprobe/"__stringify(F)) int bpf_func_##F + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u32)); +#ifdef __mips__ + __uint(max_entries, 6000); /* MIPS n64 syscalls start at 5000 */ +#else + __uint(max_entries, 1024); +#endif +} progs SEC(".maps"); + +SEC("kprobe/__seccomp_filter") +int bpf_prog1(struct pt_regs *ctx) +{ + int sc_nr = (int)PT_REGS_PARM1(ctx); + + /* dispatch into next BPF program depending on syscall number */ + bpf_tail_call(ctx, &progs, sc_nr); + + /* fall through -> unknown syscall */ + if (sc_nr >= __NR_getuid && sc_nr <= __NR_getsid) { + char fmt[] = "syscall=%d (one of get/set uid/pid/gid)\n"; + bpf_trace_printk(fmt, sizeof(fmt), sc_nr); + } + return 0; +} + +/* we jump here when syscall number == __NR_write */ +PROG(SYS__NR_write)(struct pt_regs *ctx) +{ + struct seccomp_data sd; + + bpf_probe_read_kernel(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx)); + if (sd.args[2] == 512) { + char fmt[] = "write(fd=%d, buf=%p, size=%d)\n"; + bpf_trace_printk(fmt, sizeof(fmt), + sd.args[0], sd.args[1], sd.args[2]); + } + return 0; +} + +PROG(SYS__NR_read)(struct pt_regs *ctx) +{ + struct 
seccomp_data sd; + + bpf_probe_read_kernel(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx)); + if (sd.args[2] > 128 && sd.args[2] <= 1024) { + char fmt[] = "read(fd=%d, buf=%p, size=%d)\n"; + bpf_trace_printk(fmt, sizeof(fmt), + sd.args[0], sd.args[1], sd.args[2]); + } + return 0; +} + +#ifdef __NR_mmap2 +PROG(SYS__NR_mmap2)(struct pt_regs *ctx) +{ + char fmt[] = "mmap2\n"; + + bpf_trace_printk(fmt, sizeof(fmt)); + return 0; +} +#endif + +#ifdef __NR_mmap +PROG(SYS__NR_mmap)(struct pt_regs *ctx) +{ + char fmt[] = "mmap\n"; + + bpf_trace_printk(fmt, sizeof(fmt)); + return 0; +} +#endif + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/tracex5_user.c b/samples/bpf/tracex5_user.c new file mode 100644 index 000000000..c17d3fb5f --- /dev/null +++ b/samples/bpf/tracex5_user.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <linux/filter.h> +#include <linux/seccomp.h> +#include <sys/prctl.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> +#include <sys/resource.h> +#include "trace_helpers.h" + +#ifdef __mips__ +#define MAX_ENTRIES 6000 /* MIPS n64 syscalls start at 5000 */ +#else +#define MAX_ENTRIES 1024 +#endif + +/* install fake seccomp program to enable seccomp code path inside the kernel, + * so that our kprobe attached to seccomp_phase1() can be triggered + */ +static void install_accept_all_seccomp(void) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), + .filter = filter, + }; + if (prctl(PR_SET_SECCOMP, 2, &prog)) + perror("prctl"); +} + +int main(int ac, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct bpf_object *obj; + int key, fd, progs_fd; + const char *section; + char filename[256]; + FILE *f; + + 
setrlimit(RLIMIT_MEMLOCK, &r); + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (!prog) { + printf("finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; + } + + progs_fd = bpf_object__find_map_fd_by_name(obj, "progs"); + if (progs_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + section = bpf_program__section_name(prog); + /* register only syscalls to PROG_ARRAY */ + if (sscanf(section, "kprobe/%d", &key) != 1) + continue; + + fd = bpf_program__fd(prog); + bpf_map_update_elem(progs_fd, &key, &fd, BPF_ANY); + } + + install_accept_all_seccomp(); + + f = popen("dd if=/dev/zero of=/dev/null count=5", "r"); + (void) f; + + read_trace_pipe(); + +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); + return 0; +} diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c new file mode 100644 index 000000000..fd602c277 --- /dev/null +++ b/samples/bpf/tracex6_kern.c @@ -0,0 +1,82 @@ +#include <linux/ptrace.h> +#include <linux/version.h> +#include <uapi/linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(u32)); + __uint(max_entries, 64); +} counters SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, int); + __type(value, u64); + 
__uint(max_entries, 64); +} values SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, int); + __type(value, struct bpf_perf_event_value); + __uint(max_entries, 64); +} values2 SEC(".maps"); + +SEC("kprobe/htab_map_get_next_key") +int bpf_prog1(struct pt_regs *ctx) +{ + u32 key = bpf_get_smp_processor_id(); + u64 count, *val; + s64 error; + + count = bpf_perf_event_read(&counters, key); + error = (s64)count; + if (error <= -2 && error >= -22) + return 0; + + val = bpf_map_lookup_elem(&values, &key); + if (val) + *val = count; + else + bpf_map_update_elem(&values, &key, &count, BPF_NOEXIST); + + return 0; +} + +/* + * Since *_map_lookup_elem can't be expected to trigger bpf programs + * due to potential deadlocks (bpf_disable_instrumentation), this bpf + * program will be attached to bpf_map_copy_value (which is called + * from map_lookup_elem) and will only filter the hashtable type. + */ +SEC("kprobe/bpf_map_copy_value") +int BPF_KPROBE(bpf_prog2, struct bpf_map *map) +{ + u32 key = bpf_get_smp_processor_id(); + struct bpf_perf_event_value *val, buf; + enum bpf_map_type type; + int error; + + type = BPF_CORE_READ(map, map_type); + if (type != BPF_MAP_TYPE_HASH) + return 0; + + error = bpf_perf_event_read_value(&counters, key, &buf, sizeof(buf)); + if (error) + return 0; + + val = bpf_map_lookup_elem(&values2, &key); + if (val) + *val = buf; + else + bpf_map_update_elem(&values2, &key, &buf, BPF_NOEXIST); + + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c new file mode 100644 index 000000000..33df97847 --- /dev/null +++ b/samples/bpf/tracex6_user.c @@ -0,0 +1,226 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE + +#include <assert.h> +#include <fcntl.h> +#include <linux/perf_event.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/ioctl.h> +#include <sys/resource.h> +#include 
<sys/time.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> + +#include <bpf/bpf.h> +#include <bpf/libbpf.h> +#include "perf-sys.h" + +#define SAMPLE_PERIOD 0x7fffffffffffffffULL + +/* counters, values, values2 */ +static int map_fd[3]; + +static void check_on_cpu(int cpu, struct perf_event_attr *attr) +{ + struct bpf_perf_event_value value2; + int pmu_fd, error = 0; + cpu_set_t set; + __u64 value; + + /* Move to target CPU */ + CPU_ZERO(&set); + CPU_SET(cpu, &set); + assert(sched_setaffinity(0, sizeof(set), &set) == 0); + /* Open perf event and attach to the perf_event_array */ + pmu_fd = sys_perf_event_open(attr, -1/*pid*/, cpu/*cpu*/, -1/*group_fd*/, 0); + if (pmu_fd < 0) { + fprintf(stderr, "sys_perf_event_open failed on CPU %d\n", cpu); + error = 1; + goto on_exit; + } + assert(bpf_map_update_elem(map_fd[0], &cpu, &pmu_fd, BPF_ANY) == 0); + assert(ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0) == 0); + /* Trigger the kprobe */ + bpf_map_get_next_key(map_fd[1], &cpu, NULL); + /* Check the value */ + if (bpf_map_lookup_elem(map_fd[1], &cpu, &value)) { + fprintf(stderr, "Value missing for CPU %d\n", cpu); + error = 1; + goto on_exit; + } else { + fprintf(stderr, "CPU %d: %llu\n", cpu, value); + } + /* The above bpf_map_lookup_elem should trigger the second kprobe */ + if (bpf_map_lookup_elem(map_fd[2], &cpu, &value2)) { + fprintf(stderr, "Value2 missing for CPU %d\n", cpu); + error = 1; + goto on_exit; + } else { + fprintf(stderr, "CPU %d: counter: %llu, enabled: %llu, running: %llu\n", cpu, + value2.counter, value2.enabled, value2.running); + } + +on_exit: + assert(bpf_map_delete_elem(map_fd[0], &cpu) == 0 || error); + assert(ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE, 0) == 0 || error); + assert(close(pmu_fd) == 0 || error); + assert(bpf_map_delete_elem(map_fd[1], &cpu) == 0 || error); + exit(error); +} + +static void test_perf_event_array(struct perf_event_attr *attr, + const char *name) +{ + int i, status, nr_cpus = sysconf(_SC_NPROCESSORS_CONF); + 
pid_t pid[nr_cpus]; + int err = 0; + + printf("Test reading %s counters\n", name); + + for (i = 0; i < nr_cpus; i++) { + pid[i] = fork(); + assert(pid[i] >= 0); + if (pid[i] == 0) { + check_on_cpu(i, attr); + exit(1); + } + } + + for (i = 0; i < nr_cpus; i++) { + assert(waitpid(pid[i], &status, 0) == pid[i]); + err |= status; + } + + if (err) + printf("Test: %s FAILED\n", name); +} + +static void test_bpf_perf_event(void) +{ + struct perf_event_attr attr_cycles = { + .freq = 0, + .sample_period = SAMPLE_PERIOD, + .inherit = 0, + .type = PERF_TYPE_HARDWARE, + .read_format = 0, + .sample_type = 0, + .config = PERF_COUNT_HW_CPU_CYCLES, + }; + struct perf_event_attr attr_clock = { + .freq = 0, + .sample_period = SAMPLE_PERIOD, + .inherit = 0, + .type = PERF_TYPE_SOFTWARE, + .read_format = 0, + .sample_type = 0, + .config = PERF_COUNT_SW_CPU_CLOCK, + }; + struct perf_event_attr attr_raw = { + .freq = 0, + .sample_period = SAMPLE_PERIOD, + .inherit = 0, + .type = PERF_TYPE_RAW, + .read_format = 0, + .sample_type = 0, + /* Intel Instruction Retired */ + .config = 0xc0, + }; + struct perf_event_attr attr_l1d_load = { + .freq = 0, + .sample_period = SAMPLE_PERIOD, + .inherit = 0, + .type = PERF_TYPE_HW_CACHE, + .read_format = 0, + .sample_type = 0, + .config = + PERF_COUNT_HW_CACHE_L1D | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), + }; + struct perf_event_attr attr_llc_miss = { + .freq = 0, + .sample_period = SAMPLE_PERIOD, + .inherit = 0, + .type = PERF_TYPE_HW_CACHE, + .read_format = 0, + .sample_type = 0, + .config = + PERF_COUNT_HW_CACHE_LL | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), + }; + struct perf_event_attr attr_msr_tsc = { + .freq = 0, + .sample_period = 0, + .inherit = 0, + /* From /sys/bus/event_source/devices/msr/ */ + .type = 7, + .read_format = 0, + .sample_type = 0, + .config = 0, + }; + + test_perf_event_array(&attr_cycles, "HARDWARE-cycles"); + 
test_perf_event_array(&attr_clock, "SOFTWARE-clock"); + test_perf_event_array(&attr_raw, "RAW-instruction-retired"); + test_perf_event_array(&attr_l1d_load, "HW_CACHE-L1D-load"); + + /* below tests may fail in qemu */ + test_perf_event_array(&attr_llc_miss, "HW_CACHE-LLC-miss"); + test_perf_event_array(&attr_msr_tsc, "Dynamic-msr-tsc"); +} + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_link *links[2]; + struct bpf_program *prog; + struct bpf_object *obj; + char filename[256]; + int i = 0; + + setrlimit(RLIMIT_MEMLOCK, &r); + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "counters"); + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "values"); + map_fd[2] = bpf_object__find_map_fd_by_name(obj, "values2"); + if (map_fd[0] < 0 || map_fd[1] < 0 || map_fd[2] < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + links[i] = bpf_program__attach(prog); + if (libbpf_get_error(links[i])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[i] = NULL; + goto cleanup; + } + i++; + } + + test_bpf_perf_event(); + +cleanup: + for (i--; i >= 0; i--) + bpf_link__destroy(links[i]); + + bpf_object__close(obj); + return 0; +} diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c new file mode 100644 index 000000000..c5a92df8a --- /dev/null +++ b/samples/bpf/tracex7_kern.c @@ -0,0 +1,16 @@ +#include <uapi/linux/ptrace.h> +#include <uapi/linux/bpf.h> +#include <linux/version.h> +#include <bpf/bpf_helpers.h> + +SEC("kprobe/open_ctree") +int bpf_prog1(struct 
pt_regs *ctx) +{ + unsigned long rc = -12; + + bpf_override_return(ctx, rc); + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c new file mode 100644 index 000000000..8be7ce18d --- /dev/null +++ b/samples/bpf/tracex7_user.c @@ -0,0 +1,56 @@ +#define _GNU_SOURCE + +#include <stdio.h> +#include <unistd.h> +#include <bpf/libbpf.h> + +int main(int argc, char **argv) +{ + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct bpf_object *obj; + char filename[256]; + char command[256]; + int ret = 0; + FILE *f; + + if (!argv[1]) { + fprintf(stderr, "ERROR: Run with the btrfs device argument!\n"); + return 0; + } + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (!prog) { + fprintf(stderr, "ERROR: finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; + } + + snprintf(command, 256, "mount %s tmpmnt/", argv[1]); + f = popen(command, "r"); + ret = pclose(f); + +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); + return ret ? 
0 : 1; +} diff --git a/samples/bpf/xdp1_kern.c b/samples/bpf/xdp1_kern.c new file mode 100644 index 000000000..34b64394e --- /dev/null +++ b/samples/bpf/xdp1_kern.c @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PLUMgrid + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#define KBUILD_MODNAME "foo" +#include <uapi/linux/bpf.h> +#include <linux/in.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <linux/if_vlan.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <bpf/bpf_helpers.h> + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, long); + __uint(max_entries, 256); +} rxcnt SEC(".maps"); + +static int parse_ipv4(void *data, u64 nh_off, void *data_end) +{ + struct iphdr *iph = data + nh_off; + + if (iph + 1 > data_end) + return 0; + return iph->protocol; +} + +static int parse_ipv6(void *data, u64 nh_off, void *data_end) +{ + struct ipv6hdr *ip6h = data + nh_off; + + if (ip6h + 1 > data_end) + return 0; + return ip6h->nexthdr; +} + +SEC("xdp1") +int xdp_prog1(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + int rc = XDP_DROP; + long *value; + u16 h_proto; + u64 nh_off; + u32 ipproto; + + nh_off = sizeof(*eth); + if (data + nh_off > data_end) + return rc; + + h_proto = eth->h_proto; + + if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { + struct vlan_hdr *vhdr; + + vhdr = data + nh_off; + nh_off += sizeof(struct vlan_hdr); + if (data + nh_off > data_end) + return rc; + h_proto = vhdr->h_vlan_encapsulated_proto; + } + if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { + struct vlan_hdr *vhdr; + + vhdr = data + nh_off; + nh_off += sizeof(struct vlan_hdr); + if (data + nh_off > data_end) + return rc; + h_proto = 
vhdr->h_vlan_encapsulated_proto; + } + + if (h_proto == htons(ETH_P_IP)) + ipproto = parse_ipv4(data, nh_off, data_end); + else if (h_proto == htons(ETH_P_IPV6)) + ipproto = parse_ipv6(data, nh_off, data_end); + else + ipproto = 0; + + value = bpf_map_lookup_elem(&rxcnt, &ipproto); + if (value) + *value += 1; + + return rc; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c new file mode 100644 index 000000000..c447ad9e3 --- /dev/null +++ b/samples/bpf/xdp1_user.c @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2016 PLUMgrid + */ +#include <linux/bpf.h> +#include <linux/if_link.h> +#include <assert.h> +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <libgen.h> +#include <sys/resource.h> +#include <net/if.h> + +#include "bpf_util.h" +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +static int ifindex; +static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; +static __u32 prog_id; + +static void int_exit(int sig) +{ + __u32 curr_prog_id = 0; + + if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, xdp_flags)) { + printf("bpf_get_link_xdp_id failed\n"); + exit(1); + } + if (prog_id == curr_prog_id) + bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); + else if (!curr_prog_id) + printf("couldn't find a prog id on a given interface\n"); + else + printf("program on interface changed, not removing\n"); + exit(0); +} + +/* simple per-protocol drop counter + */ +static void poll_stats(int map_fd, int interval) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + __u64 values[nr_cpus], prev[UINT8_MAX] = { 0 }; + int i; + + while (1) { + __u32 key = UINT32_MAX; + + sleep(interval); + + while (bpf_map_get_next_key(map_fd, &key, &key) != -1) { + __u64 sum = 0; + + assert(bpf_map_lookup_elem(map_fd, &key, values) == 0); + for (i = 0; i < nr_cpus; i++) + sum += values[i]; + if (sum > prev[key]) + printf("proto %u: %10llu 
pkt/s\n", + key, (sum - prev[key]) / interval); + prev[key] = sum; + } + } +} + +static void usage(const char *prog) +{ + fprintf(stderr, + "usage: %s [OPTS] IFACE\n\n" + "OPTS:\n" + " -S use skb-mode\n" + " -N enforce native mode\n" + " -F force loading prog\n", + prog); +} + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_prog_load_attr prog_load_attr = { + .prog_type = BPF_PROG_TYPE_XDP, + }; + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + const char *optstr = "FSN"; + int prog_fd, map_fd, opt; + struct bpf_object *obj; + struct bpf_map *map; + char filename[256]; + int err; + + while ((opt = getopt(argc, argv, optstr)) != -1) { + switch (opt) { + case 'S': + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; + case 'N': + /* default, set below */ + break; + case 'F': + xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; + break; + default: + usage(basename(argv[0])); + return 1; + } + } + + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + + if (optind == argc) { + usage(basename(argv[0])); + return 1; + } + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + + ifindex = if_nametoindex(argv[optind]); + if (!ifindex) { + perror("if_nametoindex"); + return 1; + } + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + prog_load_attr.file = filename; + + if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) + return 1; + + map = bpf_map__next(NULL, obj); + if (!map) { + printf("finding a map in obj file failed\n"); + return 1; + } + map_fd = bpf_map__fd(map); + + if (!prog_fd) { + printf("bpf_prog_load_xattr: %s\n", strerror(errno)); + return 1; + } + + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) { + printf("link set xdp fd failed\n"); + return 1; + } + + err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); + if (err) { + printf("can't get prog info - 
%s\n", strerror(errno)); + return err; + } + prog_id = info.id; + + poll_stats(map_fd, 2); + + return 0; +} diff --git a/samples/bpf/xdp2_kern.c b/samples/bpf/xdp2_kern.c new file mode 100644 index 000000000..c787f4b49 --- /dev/null +++ b/samples/bpf/xdp2_kern.c @@ -0,0 +1,114 @@ +/* Copyright (c) 2016 PLUMgrid + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#define KBUILD_MODNAME "foo" +#include <uapi/linux/bpf.h> +#include <linux/in.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <linux/if_vlan.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <bpf/bpf_helpers.h> + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, long); + __uint(max_entries, 256); +} rxcnt SEC(".maps"); + +static void swap_src_dst_mac(void *data) +{ + unsigned short *p = data; + unsigned short dst[3]; + + dst[0] = p[0]; + dst[1] = p[1]; + dst[2] = p[2]; + p[0] = p[3]; + p[1] = p[4]; + p[2] = p[5]; + p[3] = dst[0]; + p[4] = dst[1]; + p[5] = dst[2]; +} + +static int parse_ipv4(void *data, u64 nh_off, void *data_end) +{ + struct iphdr *iph = data + nh_off; + + if (iph + 1 > data_end) + return 0; + return iph->protocol; +} + +static int parse_ipv6(void *data, u64 nh_off, void *data_end) +{ + struct ipv6hdr *ip6h = data + nh_off; + + if (ip6h + 1 > data_end) + return 0; + return ip6h->nexthdr; +} + +SEC("xdp1") +int xdp_prog1(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + int rc = XDP_DROP; + long *value; + u16 h_proto; + u64 nh_off; + u32 ipproto; + + nh_off = sizeof(*eth); + if (data + nh_off > data_end) + return rc; + + h_proto = eth->h_proto; + + if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { + struct vlan_hdr *vhdr; + + vhdr = data + nh_off; + nh_off += 
sizeof(struct vlan_hdr); + if (data + nh_off > data_end) + return rc; + h_proto = vhdr->h_vlan_encapsulated_proto; + } + if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { + struct vlan_hdr *vhdr; + + vhdr = data + nh_off; + nh_off += sizeof(struct vlan_hdr); + if (data + nh_off > data_end) + return rc; + h_proto = vhdr->h_vlan_encapsulated_proto; + } + + if (h_proto == htons(ETH_P_IP)) + ipproto = parse_ipv4(data, nh_off, data_end); + else if (h_proto == htons(ETH_P_IPV6)) + ipproto = parse_ipv6(data, nh_off, data_end); + else + ipproto = 0; + + value = bpf_map_lookup_elem(&rxcnt, &ipproto); + if (value) + *value += 1; + + if (ipproto == IPPROTO_UDP) { + swap_src_dst_mac(data); + rc = XDP_TX; + } + + return rc; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp2skb_meta.sh b/samples/bpf/xdp2skb_meta.sh new file mode 100755 index 000000000..4bde9d066 --- /dev/null +++ b/samples/bpf/xdp2skb_meta.sh @@ -0,0 +1,220 @@ +#!/bin/bash +# +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2018 Jesper Dangaard Brouer, Red Hat Inc. +# +# Bash-shell example on using iproute2 tools 'tc' and 'ip' to load +# eBPF programs, both for XDP and clsbpf. Shell script function +# wrappers and even long options parsing is illustrated, for ease of +# use. +# +# Related to sample/bpf/xdp2skb_meta_kern.c, which contains BPF-progs +# that need to collaborate between XDP and TC hooks. Thus, it is +# convenient that the same tool load both programs that need to work +# together. 
+# +BPF_FILE=xdp2skb_meta_kern.o +DIR=$(dirname $0) + +[ -z "$TC" ] && TC=tc +[ -z "$IP" ] && IP=ip + +function usage() { + echo "" + echo "Usage: $0 [-vfh] --dev ethX" + echo " -d | --dev : Network device (required)" + echo " --flush : Cleanup flush TC and XDP progs" + echo " --list : (\$LIST) List TC and XDP progs" + echo " -v | --verbose : (\$VERBOSE) Verbose" + echo " --dry-run : (\$DRYRUN) Dry-run only (echo commands)" + echo "" +} + +## -- General shell logging cmds -- +function err() { + local exitcode=$1 + shift + echo "ERROR: $@" >&2 + exit $exitcode +} + +function info() { + if [[ -n "$VERBOSE" ]]; then + echo "# $@" + fi +} + +## -- Helper function calls -- + +# Wrapper call for TC and IP +# - Will display the offending command on failure +function _call_cmd() { + local cmd="$1" + local allow_fail="$2" + shift 2 + if [[ -n "$VERBOSE" ]]; then + echo "$cmd $@" + fi + if [[ -n "$DRYRUN" ]]; then + return + fi + $cmd "$@" + local status=$? + if (( $status != 0 )); then + if [[ "$allow_fail" == "" ]]; then + err 2 "Exec error($status) occurred cmd: \"$cmd $@\"" + fi + fi +} +function call_tc() { + _call_cmd "$TC" "" "$@" +} +function call_tc_allow_fail() { + _call_cmd "$TC" "allow_fail" "$@" +} +function call_ip() { + _call_cmd "$IP" "" "$@" +} + +## --- Parse command line arguments / parameters --- +# Using external program "getopt" to get --long-options +OPTIONS=$(getopt -o vfhd: \ + --long verbose,flush,help,list,dev:,dry-run -- "$@") +if (( $? 
!= 0 )); then + err 4 "Error calling getopt" +fi +eval set -- "$OPTIONS" + +unset DEV +unset FLUSH +while true; do + case "$1" in + -d | --dev ) # device + DEV=$2 + info "Device set to: DEV=$DEV" >&2 + shift 2 + ;; + -v | --verbose) + VERBOSE=yes + # info "Verbose mode: VERBOSE=$VERBOSE" >&2 + shift + ;; + --dry-run ) + DRYRUN=yes + VERBOSE=yes + info "Dry-run mode: enable VERBOSE and don't call TC+IP" >&2 + shift + ;; + -f | --flush ) + FLUSH=yes + shift + ;; + --list ) + LIST=yes + shift + ;; + -- ) + shift + break + ;; + -h | --help ) + usage; + exit 0 + ;; + * ) + shift + break + ;; + esac +done + +FILE="$DIR/$BPF_FILE" +if [[ ! -e $FILE ]]; then + err 3 "Missing BPF object file ($FILE)" +fi + +if [[ -z $DEV ]]; then + usage + err 2 "Please specify network device -- required option --dev" +fi + +## -- Function calls -- + +function list_tc() +{ + local device="$1" + shift + info "Listing current TC ingress rules" + call_tc filter show dev $device ingress +} + +function list_xdp() +{ + local device="$1" + shift + info "Listing current XDP device($device) setting" + call_ip link show dev $device | grep --color=auto xdp +} + +function flush_tc() +{ + local device="$1" + shift + info "Flush TC on device: $device" + call_tc_allow_fail filter del dev $device ingress + call_tc_allow_fail qdisc del dev $device clsact +} + +function flush_xdp() +{ + local device="$1" + shift + info "Flush XDP on device: $device" + call_ip link set dev $device xdp off +} + +function attach_tc_mark() +{ + local device="$1" + local file="$2" + local prog="tc_mark" + shift 2 + + # Re-attach clsact to clear/flush existing role + call_tc_allow_fail qdisc del dev $device clsact 2> /dev/null + call_tc qdisc add dev $device clsact + + # Attach BPF prog + call_tc filter add dev $device ingress \ + prio 1 handle 1 bpf da obj $file sec $prog +} + +function attach_xdp_mark() +{ + local device="$1" + local file="$2" + local prog="xdp_mark" + shift 2 + + # Remove XDP prog in-case it's already loaded + 
# TODO: Need ip-link option to override/replace existing XDP prog + flush_xdp $device + + # Attach XDP/BPF prog + call_ip link set dev $device xdp obj $file sec $prog +} + +if [[ -n $FLUSH ]]; then + flush_tc $DEV + flush_xdp $DEV + exit 0 +fi + +if [[ -n $LIST ]]; then + list_tc $DEV + list_xdp $DEV + exit 0 +fi + +attach_tc_mark $DEV $FILE +attach_xdp_mark $DEV $FILE diff --git a/samples/bpf/xdp2skb_meta_kern.c b/samples/bpf/xdp2skb_meta_kern.c new file mode 100644 index 000000000..9b783316e --- /dev/null +++ b/samples/bpf/xdp2skb_meta_kern.c @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright (c) 2018 Jesper Dangaard Brouer, Red Hat Inc. + * + * Example howto transfer info from XDP to SKB, e.g. skb->mark + * ----------------------------------------------------------- + * This uses the XDP data_meta infrastructure, and is a cooperation + * between two bpf-programs (1) XDP and (2) clsact at TC-ingress hook. + * + * Notice: This example does not use the BPF C-loader (bpf_load.c), + * but instead rely on the iproute2 TC tool for loading BPF-objects. + */ +#include <uapi/linux/bpf.h> +#include <uapi/linux/pkt_cls.h> + +#include <bpf/bpf_helpers.h> + +/* + * This struct is stored in the XDP 'data_meta' area, which is located + * just in-front-of the raw packet payload data. The meaning is + * specific to these two BPF programs that use it as a communication + * channel. XDP adjust/increase the area via a bpf-helper, and TC use + * boundary checks to see if data have been provided. + * + * The struct must be 4 byte aligned, which here is enforced by the + * struct __attribute__((aligned(4))). + */ +struct meta_info { + __u32 mark; +} __attribute__((aligned(4))); + +SEC("xdp_mark") +int _xdp_mark(struct xdp_md *ctx) +{ + struct meta_info *meta; + void *data, *data_end; + int ret; + + /* Reserve space in-front of data pointer for our meta info. + * (Notice drivers not supporting data_meta will fail here!) 
+ */ + ret = bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta)); + if (ret < 0) + return XDP_ABORTED; + + /* Notice: Kernel-side verifier requires that loading of + * ctx->data MUST happen _after_ helper bpf_xdp_adjust_meta(), + * as pkt-data pointers are invalidated. Helpers that require + * this are determined/marked by bpf_helper_changes_pkt_data() + */ + data = (void *)(unsigned long)ctx->data; + + /* Check data_meta have room for meta_info struct */ + meta = (void *)(unsigned long)ctx->data_meta; + if (meta + 1 > data) + return XDP_ABORTED; + + meta->mark = 42; + + return XDP_PASS; +} + +SEC("tc_mark") +int _tc_mark(struct __sk_buff *ctx) +{ + void *data = (void *)(unsigned long)ctx->data; + void *data_end = (void *)(unsigned long)ctx->data_end; + void *data_meta = (void *)(unsigned long)ctx->data_meta; + struct meta_info *meta = data_meta; + + /* Check XDP gave us some data_meta */ + if (meta + 1 > data) { + ctx->mark = 41; + /* Skip "accept" if no data_meta is avail */ + return TC_ACT_OK; + } + + /* Hint: See func tc_cls_act_is_valid_access() for BPF_WRITE access */ + ctx->mark = meta->mark; /* Transfer XDP-mark to SKB-mark */ + + return TC_ACT_OK; +} + +/* Manually attaching these programs: +export DEV=ixgbe2 +export FILE=xdp2skb_meta_kern.o + +# via TC command +tc qdisc del dev $DEV clsact 2> /dev/null +tc qdisc add dev $DEV clsact +tc filter add dev $DEV ingress prio 1 handle 1 bpf da obj $FILE sec tc_mark +tc filter show dev $DEV ingress + +# XDP via IP command: +ip link set dev $DEV xdp off +ip link set dev $DEV xdp obj $FILE sec xdp_mark + +# Use iptable to "see" if SKBs are marked +iptables -I INPUT -p icmp -m mark --mark 41 # == 0x29 +iptables -I INPUT -p icmp -m mark --mark 42 # == 0x2a + +# Hint: catch XDP_ABORTED errors via +perf record -e xdp:* +perf script + +*/ diff --git a/samples/bpf/xdp_adjust_tail_kern.c b/samples/bpf/xdp_adjust_tail_kern.c new file mode 100644 index 000000000..ffdd54862 --- /dev/null +++ b/samples/bpf/xdp_adjust_tail_kern.c @@ 
-0,0 +1,155 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright (c) 2018 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program shows how to use bpf_xdp_adjust_tail() by + * generating ICMPv4 "packet to big" (unreachable/ df bit set frag needed + * to be more preice in case of v4)" where receiving packets bigger then + * 600 bytes. + */ +#define KBUILD_MODNAME "foo" +#include <uapi/linux/bpf.h> +#include <linux/in.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <linux/if_vlan.h> +#include <linux/ip.h> +#include <linux/icmp.h> +#include <bpf/bpf_helpers.h> + +#define DEFAULT_TTL 64 +#define MAX_PCKT_SIZE 600 +#define ICMP_TOOBIG_SIZE 98 +#define ICMP_TOOBIG_PAYLOAD_SIZE 92 + +/* volatile to prevent compiler optimizations */ +static volatile __u32 max_pcktsz = MAX_PCKT_SIZE; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, __u64); + __uint(max_entries, 1); +} icmpcnt SEC(".maps"); + +static __always_inline void count_icmp(void) +{ + u64 key = 0; + u64 *icmp_count; + + icmp_count = bpf_map_lookup_elem(&icmpcnt, &key); + if (icmp_count) + *icmp_count += 1; +} + +static __always_inline void swap_mac(void *data, struct ethhdr *orig_eth) +{ + struct ethhdr *eth; + + eth = data; + memcpy(eth->h_source, orig_eth->h_dest, ETH_ALEN); + memcpy(eth->h_dest, orig_eth->h_source, ETH_ALEN); + eth->h_proto = orig_eth->h_proto; +} + +static __always_inline __u16 csum_fold_helper(__u32 csum) +{ + return ~((csum & 0xffff) + (csum >> 16)); +} + +static __always_inline void ipv4_csum(void *data_start, int data_size, + __u32 *csum) +{ + *csum = bpf_csum_diff(0, 0, data_start, data_size, *csum); + *csum = csum_fold_helper(*csum); +} + +static __always_inline int send_icmp4_too_big(struct xdp_md *xdp) +{ + int headroom = (int)sizeof(struct iphdr) + 
(int)sizeof(struct icmphdr); + + if (bpf_xdp_adjust_head(xdp, 0 - headroom)) + return XDP_DROP; + void *data = (void *)(long)xdp->data; + void *data_end = (void *)(long)xdp->data_end; + + if (data + (ICMP_TOOBIG_SIZE + headroom) > data_end) + return XDP_DROP; + + struct iphdr *iph, *orig_iph; + struct icmphdr *icmp_hdr; + struct ethhdr *orig_eth; + __u32 csum = 0; + __u64 off = 0; + + orig_eth = data + headroom; + swap_mac(data, orig_eth); + off += sizeof(struct ethhdr); + iph = data + off; + off += sizeof(struct iphdr); + icmp_hdr = data + off; + off += sizeof(struct icmphdr); + orig_iph = data + off; + icmp_hdr->type = ICMP_DEST_UNREACH; + icmp_hdr->code = ICMP_FRAG_NEEDED; + icmp_hdr->un.frag.mtu = htons(max_pcktsz - sizeof(struct ethhdr)); + icmp_hdr->checksum = 0; + ipv4_csum(icmp_hdr, ICMP_TOOBIG_PAYLOAD_SIZE, &csum); + icmp_hdr->checksum = csum; + iph->ttl = DEFAULT_TTL; + iph->daddr = orig_iph->saddr; + iph->saddr = orig_iph->daddr; + iph->version = 4; + iph->ihl = 5; + iph->protocol = IPPROTO_ICMP; + iph->tos = 0; + iph->tot_len = htons( + ICMP_TOOBIG_SIZE + headroom - sizeof(struct ethhdr)); + iph->check = 0; + csum = 0; + ipv4_csum(iph, sizeof(struct iphdr), &csum); + iph->check = csum; + count_icmp(); + return XDP_TX; +} + + +static __always_inline int handle_ipv4(struct xdp_md *xdp) +{ + void *data_end = (void *)(long)xdp->data_end; + void *data = (void *)(long)xdp->data; + int pckt_size = data_end - data; + int offset; + + if (pckt_size > max(max_pcktsz, ICMP_TOOBIG_SIZE)) { + offset = pckt_size - ICMP_TOOBIG_SIZE; + if (bpf_xdp_adjust_tail(xdp, 0 - offset)) + return XDP_PASS; + return send_icmp4_too_big(xdp); + } + return XDP_PASS; +} + +SEC("xdp_icmp") +int _xdp_icmp(struct xdp_md *xdp) +{ + void *data_end = (void *)(long)xdp->data_end; + void *data = (void *)(long)xdp->data; + struct ethhdr *eth = data; + __u16 h_proto; + + if (eth + 1 > data_end) + return XDP_DROP; + + h_proto = eth->h_proto; + + if (h_proto == htons(ETH_P_IP)) + return 
handle_ipv4(xdp); + else + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_adjust_tail_user.c b/samples/bpf/xdp_adjust_tail_user.c new file mode 100644 index 000000000..ba482dc3d --- /dev/null +++ b/samples/bpf/xdp_adjust_tail_user.c @@ -0,0 +1,198 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright (c) 2018 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/bpf.h> +#include <linux/if_link.h> +#include <assert.h> +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <net/if.h> +#include <sys/resource.h> +#include <arpa/inet.h> +#include <netinet/ether.h> +#include <unistd.h> +#include <time.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +#define STATS_INTERVAL_S 2U +#define MAX_PCKT_SIZE 600 + +static int ifindex = -1; +static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; +static __u32 prog_id; + +static void int_exit(int sig) +{ + __u32 curr_prog_id = 0; + + if (ifindex > -1) { + if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, xdp_flags)) { + printf("bpf_get_link_xdp_id failed\n"); + exit(1); + } + if (prog_id == curr_prog_id) + bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); + else if (!curr_prog_id) + printf("couldn't find a prog id on a given iface\n"); + else + printf("program on interface changed, not removing\n"); + } + exit(0); +} + +/* simple "icmp packet too big sent" counter + */ +static void poll_stats(unsigned int map_fd, unsigned int kill_after_s) +{ + time_t started_at = time(NULL); + __u64 value = 0; + int key = 0; + + + while (!kill_after_s || time(NULL) - started_at <= kill_after_s) { + sleep(STATS_INTERVAL_S); + + assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); + + printf("icmp \"packet too big\" sent: %10llu pkts\n", value); + } +} + +static void usage(const 
char *cmd) +{ + printf("Start a XDP prog which send ICMP \"packet too big\" \n" + "messages if ingress packet is bigger then MAX_SIZE bytes\n"); + printf("Usage: %s [...]\n", cmd); + printf(" -i <ifname|ifindex> Interface\n"); + printf(" -T <stop-after-X-seconds> Default: 0 (forever)\n"); + printf(" -P <MAX_PCKT_SIZE> Default: %u\n", MAX_PCKT_SIZE); + printf(" -S use skb-mode\n"); + printf(" -N enforce native mode\n"); + printf(" -F force loading prog\n"); + printf(" -h Display this help\n"); +} + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_prog_load_attr prog_load_attr = { + .prog_type = BPF_PROG_TYPE_XDP, + }; + unsigned char opt_flags[256] = {}; + const char *optstr = "i:T:P:SNFh"; + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + unsigned int kill_after_s = 0; + int i, prog_fd, map_fd, opt; + struct bpf_object *obj; + __u32 max_pckt_size = 0; + __u32 key = 0; + char filename[256]; + int err; + + for (i = 0; i < strlen(optstr); i++) + if (optstr[i] != 'h' && 'a' <= optstr[i] && optstr[i] <= 'z') + opt_flags[(unsigned char)optstr[i]] = 1; + + while ((opt = getopt(argc, argv, optstr)) != -1) { + + switch (opt) { + case 'i': + ifindex = if_nametoindex(optarg); + if (!ifindex) + ifindex = atoi(optarg); + break; + case 'T': + kill_after_s = atoi(optarg); + break; + case 'P': + max_pckt_size = atoi(optarg); + break; + case 'S': + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; + case 'N': + /* default, set below */ + break; + case 'F': + xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; + break; + default: + usage(argv[0]); + return 1; + } + opt_flags[opt] = 0; + } + + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + + for (i = 0; i < strlen(optstr); i++) { + if (opt_flags[(unsigned int)optstr[i]]) { + fprintf(stderr, "Missing argument -%c\n", optstr[i]); + usage(argv[0]); + return 1; + } + } + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK, 
RLIM_INFINITY)"); + return 1; + } + + if (!ifindex) { + fprintf(stderr, "Invalid ifname\n"); + return 1; + } + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + prog_load_attr.file = filename; + + if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) + return 1; + + /* static global var 'max_pcktsz' is accessible from .data section */ + if (max_pckt_size) { + map_fd = bpf_object__find_map_fd_by_name(obj, "xdp_adju.data"); + if (map_fd < 0) { + printf("finding a max_pcktsz map in obj file failed\n"); + return 1; + } + bpf_map_update_elem(map_fd, &key, &max_pckt_size, BPF_ANY); + } + + /* fetch icmpcnt map */ + map_fd = bpf_object__find_map_fd_by_name(obj, "icmpcnt"); + if (map_fd < 0) { + printf("finding a icmpcnt map in obj file failed\n"); + return 1; + } + + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) { + printf("link set xdp fd failed\n"); + return 1; + } + + err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); + if (err) { + printf("can't get prog info - %s\n", strerror(errno)); + return 1; + } + prog_id = info.id; + + poll_stats(map_fd, kill_after_s); + int_exit(0); + + return 0; +} diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c new file mode 100644 index 000000000..54c099cbd --- /dev/null +++ b/samples/bpf/xdp_fwd_kern.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2017-18 David Ahern <dsahern@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#define KBUILD_MODNAME "foo" +#include <uapi/linux/bpf.h> +#include <linux/in.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <linux/if_vlan.h> +#include <linux/ip.h> +#include <linux/ipv6.h> + +#include <bpf/bpf_helpers.h> + +#define IPV6_FLOWINFO_MASK cpu_to_be32(0x0FFFFFFF) + +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); + __uint(max_entries, 64); +} xdp_tx_ports SEC(".maps"); + +/* from include/net/ip.h */ +static __always_inline int ip_decrease_ttl(struct iphdr *iph) +{ + u32 check = (__force u32)iph->check; + + check += (__force u32)htons(0x0100); + iph->check = (__force __sum16)(check + (check >= 0xFFFF)); + return --iph->ttl; +} + +static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct bpf_fib_lookup fib_params; + struct ethhdr *eth = data; + struct ipv6hdr *ip6h; + struct iphdr *iph; + u16 h_proto; + u64 nh_off; + int rc; + + nh_off = sizeof(*eth); + if (data + nh_off > data_end) + return XDP_DROP; + + __builtin_memset(&fib_params, 0, sizeof(fib_params)); + + h_proto = eth->h_proto; + if (h_proto == htons(ETH_P_IP)) { + iph = data + nh_off; + + if (iph + 1 > data_end) + return XDP_DROP; + + if (iph->ttl <= 1) + return XDP_PASS; + + fib_params.family = AF_INET; + fib_params.tos = iph->tos; + fib_params.l4_protocol = iph->protocol; + fib_params.sport = 0; + fib_params.dport = 0; + fib_params.tot_len = ntohs(iph->tot_len); + fib_params.ipv4_src = iph->saddr; + fib_params.ipv4_dst = iph->daddr; + } else if (h_proto == htons(ETH_P_IPV6)) { + struct in6_addr *src = (struct in6_addr *) fib_params.ipv6_src; + struct in6_addr *dst = (struct in6_addr *) fib_params.ipv6_dst; + + ip6h = data + nh_off; + if (ip6h + 1 > data_end) + return XDP_DROP; + + if (ip6h->hop_limit <= 1) + return XDP_PASS; + + fib_params.family = AF_INET6; + fib_params.flowinfo = 
*(__be32 *)ip6h & IPV6_FLOWINFO_MASK; + fib_params.l4_protocol = ip6h->nexthdr; + fib_params.sport = 0; + fib_params.dport = 0; + fib_params.tot_len = ntohs(ip6h->payload_len); + *src = ip6h->saddr; + *dst = ip6h->daddr; + } else { + return XDP_PASS; + } + + fib_params.ifindex = ctx->ingress_ifindex; + + rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags); + /* + * Some rc (return codes) from bpf_fib_lookup() are important, + * to understand how this XDP-prog interacts with network stack. + * + * BPF_FIB_LKUP_RET_NO_NEIGH: + * Even if route lookup was a success, then the MAC-addresses are also + * needed. This is obtained from arp/neighbour table, but if table is + * (still) empty then BPF_FIB_LKUP_RET_NO_NEIGH is returned. To avoid + * doing ARP lookup directly from XDP, then send packet to normal + * network stack via XDP_PASS and expect it will do ARP resolution. + * + * BPF_FIB_LKUP_RET_FWD_DISABLED: + * The bpf_fib_lookup respect sysctl net.ipv{4,6}.conf.all.forwarding + * setting, and will return BPF_FIB_LKUP_RET_FWD_DISABLED if not + * enabled this on ingress device. + */ + if (rc == BPF_FIB_LKUP_RET_SUCCESS) { + /* Verify egress index has been configured as TX-port. + * (Note: User can still have inserted an egress ifindex that + * doesn't support XDP xmit, which will result in packet drops). + * + * Note: lookup in devmap supported since 0cdbb4b09a0. 
+ * If not supported will fail with: + * cannot pass map_type 14 into func bpf_map_lookup_elem#1: + */ + if (!bpf_map_lookup_elem(&xdp_tx_ports, &fib_params.ifindex)) + return XDP_PASS; + + if (h_proto == htons(ETH_P_IP)) + ip_decrease_ttl(iph); + else if (h_proto == htons(ETH_P_IPV6)) + ip6h->hop_limit--; + + memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN); + memcpy(eth->h_source, fib_params.smac, ETH_ALEN); + return bpf_redirect_map(&xdp_tx_ports, fib_params.ifindex, 0); + } + + return XDP_PASS; +} + +SEC("xdp_fwd") +int xdp_fwd_prog(struct xdp_md *ctx) +{ + return xdp_fwd_flags(ctx, 0); +} + +SEC("xdp_fwd_direct") +int xdp_fwd_direct_prog(struct xdp_md *ctx) +{ + return xdp_fwd_flags(ctx, BPF_FIB_LOOKUP_DIRECT); +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_fwd_user.c b/samples/bpf/xdp_fwd_user.c new file mode 100644 index 000000000..74a4583d0 --- /dev/null +++ b/samples/bpf/xdp_fwd_user.c @@ -0,0 +1,170 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2017-18 David Ahern <dsahern@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ + +#include <linux/bpf.h> +#include <linux/if_link.h> +#include <linux/limits.h> +#include <net/if.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <libgen.h> + +#include <bpf/libbpf.h> +#include <bpf/bpf.h> + +static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; + +static int do_attach(int idx, int prog_fd, int map_fd, const char *name) +{ + int err; + + err = bpf_set_link_xdp_fd(idx, prog_fd, xdp_flags); + if (err < 0) { + printf("ERROR: failed to attach program to %s\n", name); + return err; + } + + /* Adding ifindex as a possible egress TX port */ + err = bpf_map_update_elem(map_fd, &idx, &idx, 0); + if (err) + printf("ERROR: failed using device %s as TX-port\n", name); + + return err; +} + +static int do_detach(int idx, const char *name) +{ + int err; + + err = bpf_set_link_xdp_fd(idx, -1, xdp_flags); + if (err < 0) + printf("ERROR: failed to detach program from %s\n", name); + + /* TODO: Remember to cleanup map, when adding use of shared map + * bpf_map_delete_elem((map_fd, &idx); + */ + return err; +} + +static void usage(const char *prog) +{ + fprintf(stderr, + "usage: %s [OPTS] interface-list\n" + "\nOPTS:\n" + " -d detach program\n" + " -D direct table lookups (skip fib rules)\n", + prog); +} + +int main(int argc, char **argv) +{ + struct bpf_prog_load_attr prog_load_attr = { + .prog_type = BPF_PROG_TYPE_XDP, + }; + const char *prog_name = "xdp_fwd"; + struct bpf_program *prog; + int prog_fd, map_fd = -1; + char filename[PATH_MAX]; + struct bpf_object *obj; + int opt, i, idx, err; + int attach = 1; + int ret = 0; + + while ((opt = getopt(argc, argv, ":dDSF")) != -1) { + switch (opt) { + case 'd': + attach = 0; + break; + case 'S': + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; + case 'F': + xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; + break; + case 'D': + prog_name = "xdp_fwd_direct"; + break; + default: + usage(basename(argv[0])); + return 1; + 
} + } + + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + + if (optind == argc) { + usage(basename(argv[0])); + return 1; + } + + if (attach) { + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + prog_load_attr.file = filename; + + if (access(filename, O_RDONLY) < 0) { + printf("error accessing file %s: %s\n", + filename, strerror(errno)); + return 1; + } + + err = bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd); + if (err) { + printf("Does kernel support devmap lookup?\n"); + /* If not, the error message will be: + * "cannot pass map_type 14 into func bpf_map_lookup_elem#1" + */ + return 1; + } + + prog = bpf_object__find_program_by_title(obj, prog_name); + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + printf("program not found: %s\n", strerror(prog_fd)); + return 1; + } + map_fd = bpf_map__fd(bpf_object__find_map_by_name(obj, + "xdp_tx_ports")); + if (map_fd < 0) { + printf("map not found: %s\n", strerror(map_fd)); + return 1; + } + } + + for (i = optind; i < argc; ++i) { + idx = if_nametoindex(argv[i]); + if (!idx) + idx = strtoul(argv[i], NULL, 0); + + if (!idx) { + fprintf(stderr, "Invalid arg\n"); + return 1; + } + if (!attach) { + err = do_detach(idx, argv[i]); + if (err) + ret = err; + } else { + err = do_attach(idx, prog_fd, map_fd, argv[i]); + if (err) + ret = err; + } + } + + return ret; +} diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c new file mode 100644 index 000000000..5c955b812 --- /dev/null +++ b/samples/bpf/xdp_monitor_kern.c @@ -0,0 +1,257 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright(c) 2017-2018 Jesper Dangaard Brouer, Red Hat Inc. 
+ * + * XDP monitor tool, based on tracepoints + */ +#include <uapi/linux/bpf.h> +#include <bpf/bpf_helpers.h> + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, 2); + /* TODO: have entries for all possible errno's */ +} redirect_err_cnt SEC(".maps"); + +#define XDP_UNKNOWN XDP_REDIRECT + 1 +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, XDP_UNKNOWN + 1); +} exception_cnt SEC(".maps"); + +/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct xdp_redirect_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int prog_id; // offset:8; size:4; signed:1; + u32 act; // offset:12 size:4; signed:0; + int ifindex; // offset:16 size:4; signed:1; + int err; // offset:20 size:4; signed:1; + int to_ifindex; // offset:24 size:4; signed:1; + u32 map_id; // offset:28 size:4; signed:0; + int map_index; // offset:32 size:4; signed:1; +}; // offset:36 + +enum { + XDP_REDIRECT_SUCCESS = 0, + XDP_REDIRECT_ERROR = 1 +}; + +static __always_inline +int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx) +{ + u32 key = XDP_REDIRECT_ERROR; + int err = ctx->err; + u64 *cnt; + + if (!err) + key = XDP_REDIRECT_SUCCESS; + + cnt = bpf_map_lookup_elem(&redirect_err_cnt, &key); + if (!cnt) + return 1; + *cnt += 1; + + return 0; /* Indicate event was filtered (no further processing)*/ + /* + * Returning 1 here would allow e.g. a perf-record tracepoint + * to see and record these events, but it doesn't work well + * in-practice as stopping perf-record also unload this + * bpf_prog. Plus, there is additional overhead of doing so. 
+ */ +} + +SEC("tracepoint/xdp/xdp_redirect_err") +int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx) +{ + return xdp_redirect_collect_stat(ctx); +} + + +SEC("tracepoint/xdp/xdp_redirect_map_err") +int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx) +{ + return xdp_redirect_collect_stat(ctx); +} + +/* Likely unloaded when prog starts */ +SEC("tracepoint/xdp/xdp_redirect") +int trace_xdp_redirect(struct xdp_redirect_ctx *ctx) +{ + return xdp_redirect_collect_stat(ctx); +} + +/* Likely unloaded when prog starts */ +SEC("tracepoint/xdp/xdp_redirect_map") +int trace_xdp_redirect_map(struct xdp_redirect_ctx *ctx) +{ + return xdp_redirect_collect_stat(ctx); +} + +/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct xdp_exception_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int prog_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int ifindex; // offset:16; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_exception") +int trace_xdp_exception(struct xdp_exception_ctx *ctx) +{ + u64 *cnt; + u32 key; + + key = ctx->act; + if (key > XDP_REDIRECT) + key = XDP_UNKNOWN; + + cnt = bpf_map_lookup_elem(&exception_cnt, &key); + if (!cnt) + return 1; + *cnt += 1; + + return 0; +} + +/* Common stats data record shared with _user.c */ +struct datarec { + u64 processed; + u64 dropped; + u64 info; + u64 err; +}; +#define MAX_CPUS 64 + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct datarec); + __uint(max_entries, MAX_CPUS); +} cpumap_enqueue_cnt SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct datarec); + __uint(max_entries, 1); +} cpumap_kthread_cnt SEC(".maps"); + +/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct cpumap_enqueue_ctx { + 
u64 __pad; // First 8 bytes are not accessible by bpf code + int map_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int cpu; // offset:16; size:4; signed:1; + unsigned int drops; // offset:20; size:4; signed:0; + unsigned int processed; // offset:24; size:4; signed:0; + int to_cpu; // offset:28; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_cpumap_enqueue") +int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx) +{ + u32 to_cpu = ctx->to_cpu; + struct datarec *rec; + + if (to_cpu >= MAX_CPUS) + return 1; + + rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu); + if (!rec) + return 0; + rec->processed += ctx->processed; + rec->dropped += ctx->drops; + + /* Record bulk events, then userspace can calc average bulk size */ + if (ctx->processed > 0) + rec->info += 1; + + return 0; +} + +/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct cpumap_kthread_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int map_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int cpu; // offset:16; size:4; signed:1; + unsigned int drops; // offset:20; size:4; signed:0; + unsigned int processed; // offset:24; size:4; signed:0; + int sched; // offset:28; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_cpumap_kthread") +int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx) +{ + struct datarec *rec; + u32 key = 0; + + rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key); + if (!rec) + return 0; + rec->processed += ctx->processed; + rec->dropped += ctx->drops; + + /* Count times kthread yielded CPU via schedule call */ + if (ctx->sched) + rec->info++; + + return 0; +} + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct datarec); + __uint(max_entries, 1); +} devmap_xmit_cnt SEC(".maps"); + +/* Tracepoint: 
/sys/kernel/debug/tracing/events/xdp/xdp_devmap_xmit/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct devmap_xmit_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int from_ifindex; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int to_ifindex; // offset:16; size:4; signed:1; + int drops; // offset:20; size:4; signed:1; + int sent; // offset:24; size:4; signed:1; + int err; // offset:28; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_devmap_xmit") +int trace_xdp_devmap_xmit(struct devmap_xmit_ctx *ctx) +{ + struct datarec *rec; + u32 key = 0; + + rec = bpf_map_lookup_elem(&devmap_xmit_cnt, &key); + if (!rec) + return 0; + rec->processed += ctx->sent; + rec->dropped += ctx->drops; + + /* Record bulk events, then userspace can calc average bulk size */ + rec->info += 1; + + /* Record error cases, where no frame were sent */ + if (ctx->err) + rec->err++; + + /* Catch API error of drv ndo_xdp_xmit sent more than count */ + if (ctx->drops < 0) + rec->err++; + + return 1; +} diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c new file mode 100644 index 000000000..03d0a1829 --- /dev/null +++ b/samples/bpf/xdp_monitor_user.c @@ -0,0 +1,792 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. 
+ */ +static const char *__doc__= + "XDP monitor tool, based on tracepoints\n" +; + +static const char *__doc_err_only__= + " NOTICE: Only tracking XDP redirect errors\n" + " Enable TX success stats via '--stats'\n" + " (which comes with a per packet processing overhead)\n" +; + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <stdint.h> +#include <string.h> +#include <ctype.h> +#include <unistd.h> +#include <locale.h> + +#include <sys/resource.h> +#include <getopt.h> +#include <net/if.h> +#include <time.h> + +#include <signal.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> +#include "bpf_util.h" + +enum map_type { + REDIRECT_ERR_CNT, + EXCEPTION_CNT, + CPUMAP_ENQUEUE_CNT, + CPUMAP_KTHREAD_CNT, + DEVMAP_XMIT_CNT, +}; + +static const char *const map_type_strings[] = { + [REDIRECT_ERR_CNT] = "redirect_err_cnt", + [EXCEPTION_CNT] = "exception_cnt", + [CPUMAP_ENQUEUE_CNT] = "cpumap_enqueue_cnt", + [CPUMAP_KTHREAD_CNT] = "cpumap_kthread_cnt", + [DEVMAP_XMIT_CNT] = "devmap_xmit_cnt", +}; + +#define NUM_MAP 5 +#define NUM_TP 8 + +static int tp_cnt; +static int map_cnt; +static int verbose = 1; +static bool debug = false; +struct bpf_map *map_data[NUM_MAP] = {}; +struct bpf_link *tp_links[NUM_TP] = {}; +struct bpf_object *obj; + +static const struct option long_options[] = { + {"help", no_argument, NULL, 'h' }, + {"debug", no_argument, NULL, 'D' }, + {"stats", no_argument, NULL, 'S' }, + {"sec", required_argument, NULL, 's' }, + {0, 0, NULL, 0 } +}; + +static void int_exit(int sig) +{ + /* Detach tracepoints */ + while (tp_cnt) + bpf_link__destroy(tp_links[--tp_cnt]); + + bpf_object__close(obj); + exit(0); +} + +/* C standard specifies two constants, EXIT_SUCCESS(0) and EXIT_FAILURE(1) */ +#define EXIT_FAIL_MEM 5 + +static void usage(char *argv[]) +{ + int i; + printf("\nDOCUMENTATION:\n%s\n", __doc__); + printf("\n"); + printf(" Usage: %s (options-see-below)\n", + argv[0]); + printf(" Listing options:\n"); + for (i = 0; 
long_options[i].name != 0; i++) { + printf(" --%-15s", long_options[i].name); + if (long_options[i].flag != NULL) + printf(" flag (internal value:%d)", + *long_options[i].flag); + else + printf("short-option: -%c", + long_options[i].val); + printf("\n"); + } + printf("\n"); +} + +#define NANOSEC_PER_SEC 1000000000 /* 10^9 */ +static __u64 gettime(void) +{ + struct timespec t; + int res; + + res = clock_gettime(CLOCK_MONOTONIC, &t); + if (res < 0) { + fprintf(stderr, "Error with gettimeofday! (%i)\n", res); + exit(EXIT_FAILURE); + } + return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec; +} + +enum { + REDIR_SUCCESS = 0, + REDIR_ERROR = 1, +}; +#define REDIR_RES_MAX 2 +static const char *redir_names[REDIR_RES_MAX] = { + [REDIR_SUCCESS] = "Success", + [REDIR_ERROR] = "Error", +}; +static const char *err2str(int err) +{ + if (err < REDIR_RES_MAX) + return redir_names[err]; + return NULL; +} +/* enum xdp_action */ +#define XDP_UNKNOWN XDP_REDIRECT + 1 +#define XDP_ACTION_MAX (XDP_UNKNOWN + 1) +static const char *xdp_action_names[XDP_ACTION_MAX] = { + [XDP_ABORTED] = "XDP_ABORTED", + [XDP_DROP] = "XDP_DROP", + [XDP_PASS] = "XDP_PASS", + [XDP_TX] = "XDP_TX", + [XDP_REDIRECT] = "XDP_REDIRECT", + [XDP_UNKNOWN] = "XDP_UNKNOWN", +}; +static const char *action2str(int action) +{ + if (action < XDP_ACTION_MAX) + return xdp_action_names[action]; + return NULL; +} + +/* Common stats data record shared with _kern.c */ +struct datarec { + __u64 processed; + __u64 dropped; + __u64 info; + __u64 err; +}; +#define MAX_CPUS 64 + +/* Userspace structs for collection of stats from maps */ +struct record { + __u64 timestamp; + struct datarec total; + struct datarec *cpu; +}; +struct u64rec { + __u64 processed; +}; +struct record_u64 { + /* record for _kern side __u64 values */ + __u64 timestamp; + struct u64rec total; + struct u64rec *cpu; +}; + +struct stats_record { + struct record_u64 xdp_redirect[REDIR_RES_MAX]; + struct record_u64 xdp_exception[XDP_ACTION_MAX]; + struct record 
xdp_cpumap_kthread; + struct record xdp_cpumap_enqueue[MAX_CPUS]; + struct record xdp_devmap_xmit; +}; + +static bool map_collect_record(int fd, __u32 key, struct record *rec) +{ + /* For percpu maps, userspace gets a value per possible CPU */ + unsigned int nr_cpus = bpf_num_possible_cpus(); + struct datarec values[nr_cpus]; + __u64 sum_processed = 0; + __u64 sum_dropped = 0; + __u64 sum_info = 0; + __u64 sum_err = 0; + int i; + + if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { + fprintf(stderr, + "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); + return false; + } + /* Get time as close as possible to reading map contents */ + rec->timestamp = gettime(); + + /* Record and sum values from each CPU */ + for (i = 0; i < nr_cpus; i++) { + rec->cpu[i].processed = values[i].processed; + sum_processed += values[i].processed; + rec->cpu[i].dropped = values[i].dropped; + sum_dropped += values[i].dropped; + rec->cpu[i].info = values[i].info; + sum_info += values[i].info; + rec->cpu[i].err = values[i].err; + sum_err += values[i].err; + } + rec->total.processed = sum_processed; + rec->total.dropped = sum_dropped; + rec->total.info = sum_info; + rec->total.err = sum_err; + return true; +} + +static bool map_collect_record_u64(int fd, __u32 key, struct record_u64 *rec) +{ + /* For percpu maps, userspace gets a value per possible CPU */ + unsigned int nr_cpus = bpf_num_possible_cpus(); + struct u64rec values[nr_cpus]; + __u64 sum_total = 0; + int i; + + if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { + fprintf(stderr, + "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); + return false; + } + /* Get time as close as possible to reading map contents */ + rec->timestamp = gettime(); + + /* Record and sum values from each CPU */ + for (i = 0; i < nr_cpus; i++) { + rec->cpu[i].processed = values[i].processed; + sum_total += values[i].processed; + } + rec->total.processed = sum_total; + return true; +} + +static double calc_period(struct record *r, struct record *p) +{ + 
double period_ = 0; + __u64 period = 0; + + period = r->timestamp - p->timestamp; + if (period > 0) + period_ = ((double) period / NANOSEC_PER_SEC); + + return period_; +} + +static double calc_period_u64(struct record_u64 *r, struct record_u64 *p) +{ + double period_ = 0; + __u64 period = 0; + + period = r->timestamp - p->timestamp; + if (period > 0) + period_ = ((double) period / NANOSEC_PER_SEC); + + return period_; +} + +static double calc_pps(struct datarec *r, struct datarec *p, double period) +{ + __u64 packets = 0; + double pps = 0; + + if (period > 0) { + packets = r->processed - p->processed; + pps = packets / period; + } + return pps; +} + +static double calc_pps_u64(struct u64rec *r, struct u64rec *p, double period) +{ + __u64 packets = 0; + double pps = 0; + + if (period > 0) { + packets = r->processed - p->processed; + pps = packets / period; + } + return pps; +} + +static double calc_drop(struct datarec *r, struct datarec *p, double period) +{ + __u64 packets = 0; + double pps = 0; + + if (period > 0) { + packets = r->dropped - p->dropped; + pps = packets / period; + } + return pps; +} + +static double calc_info(struct datarec *r, struct datarec *p, double period) +{ + __u64 packets = 0; + double pps = 0; + + if (period > 0) { + packets = r->info - p->info; + pps = packets / period; + } + return pps; +} + +static double calc_err(struct datarec *r, struct datarec *p, double period) +{ + __u64 packets = 0; + double pps = 0; + + if (period > 0) { + packets = r->err - p->err; + pps = packets / period; + } + return pps; +} + +static void stats_print(struct stats_record *stats_rec, + struct stats_record *stats_prev, + bool err_only) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + int rec_i = 0, i, to_cpu; + double t = 0, pps = 0; + + /* Header */ + printf("%-15s %-7s %-12s %-12s %-9s\n", + "XDP-event", "CPU:to", "pps", "drop-pps", "extra-info"); + + /* tracepoint: xdp:xdp_redirect_* */ + if (err_only) + rec_i = REDIR_ERROR; + + for (; rec_i < 
REDIR_RES_MAX; rec_i++) { + struct record_u64 *rec, *prev; + char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n"; + char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n"; + + rec = &stats_rec->xdp_redirect[rec_i]; + prev = &stats_prev->xdp_redirect[rec_i]; + t = calc_period_u64(rec, prev); + + for (i = 0; i < nr_cpus; i++) { + struct u64rec *r = &rec->cpu[i]; + struct u64rec *p = &prev->cpu[i]; + + pps = calc_pps_u64(r, p, t); + if (pps > 0) + printf(fmt1, "XDP_REDIRECT", i, + rec_i ? 0.0: pps, rec_i ? pps : 0.0, + err2str(rec_i)); + } + pps = calc_pps_u64(&rec->total, &prev->total, t); + printf(fmt2, "XDP_REDIRECT", "total", + rec_i ? 0.0: pps, rec_i ? pps : 0.0, err2str(rec_i)); + } + + /* tracepoint: xdp:xdp_exception */ + for (rec_i = 0; rec_i < XDP_ACTION_MAX; rec_i++) { + struct record_u64 *rec, *prev; + char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n"; + char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n"; + + rec = &stats_rec->xdp_exception[rec_i]; + prev = &stats_prev->xdp_exception[rec_i]; + t = calc_period_u64(rec, prev); + + for (i = 0; i < nr_cpus; i++) { + struct u64rec *r = &rec->cpu[i]; + struct u64rec *p = &prev->cpu[i]; + + pps = calc_pps_u64(r, p, t); + if (pps > 0) + printf(fmt1, "Exception", i, + 0.0, pps, action2str(rec_i)); + } + pps = calc_pps_u64(&rec->total, &prev->total, t); + if (pps > 0) + printf(fmt2, "Exception", "total", + 0.0, pps, action2str(rec_i)); + } + + /* cpumap enqueue stats */ + for (to_cpu = 0; to_cpu < MAX_CPUS; to_cpu++) { + char *fmt1 = "%-15s %3d:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n"; + char *fmt2 = "%-15s %3s:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n"; + struct record *rec, *prev; + char *info_str = ""; + double drop, info; + + rec = &stats_rec->xdp_cpumap_enqueue[to_cpu]; + prev = &stats_prev->xdp_cpumap_enqueue[to_cpu]; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + drop = calc_drop(r, p, t); + info = 
calc_info(r, p, t); + if (info > 0) { + info_str = "bulk-average"; + info = pps / info; /* calc average bulk size */ + } + if (pps > 0) + printf(fmt1, "cpumap-enqueue", + i, to_cpu, pps, drop, info, info_str); + } + pps = calc_pps(&rec->total, &prev->total, t); + if (pps > 0) { + drop = calc_drop(&rec->total, &prev->total, t); + info = calc_info(&rec->total, &prev->total, t); + if (info > 0) { + info_str = "bulk-average"; + info = pps / info; /* calc average bulk size */ + } + printf(fmt2, "cpumap-enqueue", + "sum", to_cpu, pps, drop, info, info_str); + } + } + + /* cpumap kthread stats */ + { + char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.0f %s\n"; + char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.0f %s\n"; + struct record *rec, *prev; + double drop, info; + char *i_str = ""; + + rec = &stats_rec->xdp_cpumap_kthread; + prev = &stats_prev->xdp_cpumap_kthread; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + drop = calc_drop(r, p, t); + info = calc_info(r, p, t); + if (info > 0) + i_str = "sched"; + if (pps > 0 || drop > 0) + printf(fmt1, "cpumap-kthread", + i, pps, drop, info, i_str); + } + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop(&rec->total, &prev->total, t); + info = calc_info(&rec->total, &prev->total, t); + if (info > 0) + i_str = "sched-sum"; + printf(fmt2, "cpumap-kthread", "total", pps, drop, info, i_str); + } + + /* devmap ndo_xdp_xmit stats */ + { + char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.2f %s %s\n"; + char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.2f %s %s\n"; + struct record *rec, *prev; + double drop, info, err; + char *i_str = ""; + char *err_str = ""; + + rec = &stats_rec->xdp_devmap_xmit; + prev = &stats_prev->xdp_devmap_xmit; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + 
drop = calc_drop(r, p, t); + info = calc_info(r, p, t); + err = calc_err(r, p, t); + if (info > 0) { + i_str = "bulk-average"; + info = (pps+drop) / info; /* calc avg bulk */ + } + if (err > 0) + err_str = "drv-err"; + if (pps > 0 || drop > 0) + printf(fmt1, "devmap-xmit", + i, pps, drop, info, i_str, err_str); + } + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop(&rec->total, &prev->total, t); + info = calc_info(&rec->total, &prev->total, t); + err = calc_err(&rec->total, &prev->total, t); + if (info > 0) { + i_str = "bulk-average"; + info = (pps+drop) / info; /* calc avg bulk */ + } + if (err > 0) + err_str = "drv-err"; + printf(fmt2, "devmap-xmit", "total", pps, drop, + info, i_str, err_str); + } + + printf("\n"); +} + +static bool stats_collect(struct stats_record *rec) +{ + int fd; + int i; + + /* TODO: Detect if someone unloaded the perf event_fd's, as + * this can happen by someone running perf-record -e + */ + + fd = bpf_map__fd(map_data[REDIRECT_ERR_CNT]); + for (i = 0; i < REDIR_RES_MAX; i++) + map_collect_record_u64(fd, i, &rec->xdp_redirect[i]); + + fd = bpf_map__fd(map_data[EXCEPTION_CNT]); + for (i = 0; i < XDP_ACTION_MAX; i++) { + map_collect_record_u64(fd, i, &rec->xdp_exception[i]); + } + + fd = bpf_map__fd(map_data[CPUMAP_ENQUEUE_CNT]); + for (i = 0; i < MAX_CPUS; i++) + map_collect_record(fd, i, &rec->xdp_cpumap_enqueue[i]); + + fd = bpf_map__fd(map_data[CPUMAP_KTHREAD_CNT]); + map_collect_record(fd, 0, &rec->xdp_cpumap_kthread); + + fd = bpf_map__fd(map_data[DEVMAP_XMIT_CNT]); + map_collect_record(fd, 0, &rec->xdp_devmap_xmit); + + return true; +} + +static void *alloc_rec_per_cpu(int record_size) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + void *array; + + array = calloc(nr_cpus, record_size); + if (!array) { + fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus); + exit(EXIT_FAIL_MEM); + } + return array; +} + +static struct stats_record *alloc_stats_record(void) +{ + struct stats_record *rec; + int rec_sz; + 
int i; + + /* Alloc main stats_record structure */ + rec = calloc(1, sizeof(*rec)); + if (!rec) { + fprintf(stderr, "Mem alloc error\n"); + exit(EXIT_FAIL_MEM); + } + + /* Alloc stats stored per CPU for each record */ + rec_sz = sizeof(struct u64rec); + for (i = 0; i < REDIR_RES_MAX; i++) + rec->xdp_redirect[i].cpu = alloc_rec_per_cpu(rec_sz); + + for (i = 0; i < XDP_ACTION_MAX; i++) + rec->xdp_exception[i].cpu = alloc_rec_per_cpu(rec_sz); + + rec_sz = sizeof(struct datarec); + rec->xdp_cpumap_kthread.cpu = alloc_rec_per_cpu(rec_sz); + rec->xdp_devmap_xmit.cpu = alloc_rec_per_cpu(rec_sz); + + for (i = 0; i < MAX_CPUS; i++) + rec->xdp_cpumap_enqueue[i].cpu = alloc_rec_per_cpu(rec_sz); + + return rec; +} + +static void free_stats_record(struct stats_record *r) +{ + int i; + + for (i = 0; i < REDIR_RES_MAX; i++) + free(r->xdp_redirect[i].cpu); + + for (i = 0; i < XDP_ACTION_MAX; i++) + free(r->xdp_exception[i].cpu); + + free(r->xdp_cpumap_kthread.cpu); + free(r->xdp_devmap_xmit.cpu); + + for (i = 0; i < MAX_CPUS; i++) + free(r->xdp_cpumap_enqueue[i].cpu); + + free(r); +} + +/* Pointer swap trick */ +static inline void swap(struct stats_record **a, struct stats_record **b) +{ + struct stats_record *tmp; + + tmp = *a; + *a = *b; + *b = tmp; +} + +static void stats_poll(int interval, bool err_only) +{ + struct stats_record *rec, *prev; + + rec = alloc_stats_record(); + prev = alloc_stats_record(); + stats_collect(rec); + + if (err_only) + printf("\n%s\n", __doc_err_only__); + + /* Trick to pretty printf with thousands separators use %' */ + setlocale(LC_NUMERIC, "en_US"); + + /* Header */ + if (verbose) + printf("\n%s", __doc__); + + /* TODO Need more advanced stats on error types */ + if (verbose) { + printf(" - Stats map0: %s\n", bpf_map__name(map_data[0])); + printf(" - Stats map1: %s\n", bpf_map__name(map_data[1])); + printf("\n"); + } + fflush(stdout); + + while (1) { + swap(&prev, &rec); + stats_collect(rec); + stats_print(rec, prev, err_only); + fflush(stdout); + 
sleep(interval); + } + + free_stats_record(rec); + free_stats_record(prev); +} + +static void print_bpf_prog_info(void) +{ + struct bpf_program *prog; + struct bpf_map *map; + int i = 0; + + /* Prog info */ + printf("Loaded BPF prog have %d bpf program(s)\n", tp_cnt); + bpf_object__for_each_program(prog, obj) { + printf(" - prog_fd[%d] = fd(%d)\n", i, bpf_program__fd(prog)); + i++; + } + + i = 0; + /* Maps info */ + printf("Loaded BPF prog have %d map(s)\n", map_cnt); + bpf_object__for_each_map(map, obj) { + const char *name = bpf_map__name(map); + int fd = bpf_map__fd(map); + + printf(" - map_data[%d] = fd(%d) name:%s\n", i, fd, name); + i++; + } + + /* Event info */ + printf("Searching for (max:%d) event file descriptor(s)\n", tp_cnt); + for (i = 0; i < tp_cnt; i++) { + int fd = bpf_link__fd(tp_links[i]); + + if (fd != -1) + printf(" - event_fd[%d] = fd(%d)\n", i, fd); + } +} + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_program *prog; + int longindex = 0, opt; + int ret = EXIT_FAILURE; + enum map_type type; + char filename[256]; + + /* Default settings: */ + bool errors_only = true; + int interval = 2; + + /* Parse commands line args */ + while ((opt = getopt_long(argc, argv, "hDSs:", + long_options, &longindex)) != -1) { + switch (opt) { + case 'D': + debug = true; + break; + case 'S': + errors_only = false; + break; + case 's': + interval = atoi(optarg); + break; + case 'h': + default: + usage(argv); + return ret; + } + } + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return ret; + } + + /* Remove tracepoint program when program is interrupted or killed */ + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + printf("ERROR: opening BPF object file failed\n"); + obj = NULL; + goto cleanup; + } + + /* load BPF program */ + if 
(bpf_object__load(obj)) { + printf("ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + for (type = 0; type < NUM_MAP; type++) { + map_data[type] = + bpf_object__find_map_by_name(obj, map_type_strings[type]); + + if (libbpf_get_error(map_data[type])) { + printf("ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + map_cnt++; + } + + bpf_object__for_each_program(prog, obj) { + tp_links[tp_cnt] = bpf_program__attach(prog); + if (libbpf_get_error(tp_links[tp_cnt])) { + printf("ERROR: bpf_program__attach failed\n"); + tp_links[tp_cnt] = NULL; + goto cleanup; + } + tp_cnt++; + } + + if (debug) { + print_bpf_prog_info(); + } + + /* Unload/stop tracepoint event by closing bpf_link's */ + if (errors_only) { + /* The bpf_link[i] depend on the order of + * the functions was defined in _kern.c + */ + bpf_link__destroy(tp_links[2]); /* tracepoint/xdp/xdp_redirect */ + tp_links[2] = NULL; + + bpf_link__destroy(tp_links[3]); /* tracepoint/xdp/xdp_redirect_map */ + tp_links[3] = NULL; + } + + stats_poll(interval, errors_only); + + ret = EXIT_SUCCESS; + +cleanup: + /* Detach tracepoints */ + while (tp_cnt) + bpf_link__destroy(tp_links[--tp_cnt]); + + bpf_object__close(obj); + return ret; +} diff --git a/samples/bpf/xdp_redirect_cpu_kern.c b/samples/bpf/xdp_redirect_cpu_kern.c new file mode 100644 index 000000000..8255025de --- /dev/null +++ b/samples/bpf/xdp_redirect_cpu_kern.c @@ -0,0 +1,730 @@ +/* XDP redirect to CPUs via cpumap (BPF_MAP_TYPE_CPUMAP) + * + * GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. 
+ */ +#include <uapi/linux/if_ether.h> +#include <uapi/linux/if_packet.h> +#include <uapi/linux/if_vlan.h> +#include <uapi/linux/ip.h> +#include <uapi/linux/ipv6.h> +#include <uapi/linux/in.h> +#include <uapi/linux/tcp.h> +#include <uapi/linux/udp.h> + +#include <uapi/linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include "hash_func01.h" + +#define MAX_CPUS NR_CPUS + +/* Special map type that can XDP_REDIRECT frames to another CPU */ +struct { + __uint(type, BPF_MAP_TYPE_CPUMAP); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(struct bpf_cpumap_val)); + __uint(max_entries, MAX_CPUS); +} cpu_map SEC(".maps"); + +/* Common stats data record to keep userspace more simple */ +struct datarec { + __u64 processed; + __u64 dropped; + __u64 issue; + __u64 xdp_pass; + __u64 xdp_drop; + __u64 xdp_redirect; +}; + +/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success + * feedback. Redirect TX errors can be caught via a tracepoint. + */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct datarec); + __uint(max_entries, 1); +} rx_cnt SEC(".maps"); + +/* Used by trace point */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct datarec); + __uint(max_entries, 2); + /* TODO: have entries for all possible errno's */ +} redirect_err_cnt SEC(".maps"); + +/* Used by trace point */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct datarec); + __uint(max_entries, MAX_CPUS); +} cpumap_enqueue_cnt SEC(".maps"); + +/* Used by trace point */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct datarec); + __uint(max_entries, 1); +} cpumap_kthread_cnt SEC(".maps"); + +/* Set of maps controlling available CPU, and for iterating through + * selectable redirect CPUs. 
+ */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u32); + __uint(max_entries, MAX_CPUS); +} cpus_available SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u32); + __uint(max_entries, 1); +} cpus_count SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, u32); + __uint(max_entries, 1); +} cpus_iterator SEC(".maps"); + +/* Used by trace point */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct datarec); + __uint(max_entries, 1); +} exception_cnt SEC(".maps"); + +/* Helper parse functions */ + +/* Parse Ethernet layer 2, extract network layer 3 offset and protocol + * + * Returns false on error and non-supported ether-type + */ +struct vlan_hdr { + __be16 h_vlan_TCI; + __be16 h_vlan_encapsulated_proto; +}; + +static __always_inline +bool parse_eth(struct ethhdr *eth, void *data_end, + u16 *eth_proto, u64 *l3_offset) +{ + u16 eth_type; + u64 offset; + + offset = sizeof(*eth); + if ((void *)eth + offset > data_end) + return false; + + eth_type = eth->h_proto; + + /* Skip non 802.3 Ethertypes */ + if (unlikely(ntohs(eth_type) < ETH_P_802_3_MIN)) + return false; + + /* Handle VLAN tagged packet */ + if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) { + struct vlan_hdr *vlan_hdr; + + vlan_hdr = (void *)eth + offset; + offset += sizeof(*vlan_hdr); + if ((void *)eth + offset > data_end) + return false; + eth_type = vlan_hdr->h_vlan_encapsulated_proto; + } + /* Handle double VLAN tagged packet */ + if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) { + struct vlan_hdr *vlan_hdr; + + vlan_hdr = (void *)eth + offset; + offset += sizeof(*vlan_hdr); + if ((void *)eth + offset > data_end) + return false; + eth_type = vlan_hdr->h_vlan_encapsulated_proto; + } + + *eth_proto = ntohs(eth_type); + *l3_offset = offset; + return true; +} + +static __always_inline +u16 
get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct iphdr *iph = data + nh_off; + struct udphdr *udph; + u16 dport; + + if (iph + 1 > data_end) + return 0; + if (!(iph->protocol == IPPROTO_UDP)) + return 0; + + udph = (void *)(iph + 1); + if (udph + 1 > data_end) + return 0; + + dport = ntohs(udph->dest); + return dport; +} + +static __always_inline +int get_proto_ipv4(struct xdp_md *ctx, u64 nh_off) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct iphdr *iph = data + nh_off; + + if (iph + 1 > data_end) + return 0; + return iph->protocol; +} + +static __always_inline +int get_proto_ipv6(struct xdp_md *ctx, u64 nh_off) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ipv6hdr *ip6h = data + nh_off; + + if (ip6h + 1 > data_end) + return 0; + return ip6h->nexthdr; +} + +SEC("xdp_cpu_map0") +int xdp_prognum0_no_touch(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct datarec *rec; + u32 *cpu_selected; + u32 cpu_dest; + u32 key = 0; + + /* Only use first entry in cpus_available */ + cpu_selected = bpf_map_lookup_elem(&cpus_available, &key); + if (!cpu_selected) + return XDP_ABORTED; + cpu_dest = *cpu_selected; + + /* Count RX packet in map */ + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_ABORTED; + rec->processed++; + + if (cpu_dest >= MAX_CPUS) { + rec->issue++; + return XDP_ABORTED; + } + + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} + +SEC("xdp_cpu_map1_touch_data") +int xdp_prognum1_touch_data(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + struct datarec *rec; + u32 *cpu_selected; + u32 cpu_dest; + u16 eth_type; + u32 key = 0; + + /* Only use first entry in cpus_available */ 
+ cpu_selected = bpf_map_lookup_elem(&cpus_available, &key); + if (!cpu_selected) + return XDP_ABORTED; + cpu_dest = *cpu_selected; + + /* Validate packet length is minimum Eth header size */ + if (eth + 1 > data_end) + return XDP_ABORTED; + + /* Count RX packet in map */ + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_ABORTED; + rec->processed++; + + /* Read packet data, and use it (drop non 802.3 Ethertypes) */ + eth_type = eth->h_proto; + if (ntohs(eth_type) < ETH_P_802_3_MIN) { + rec->dropped++; + return XDP_DROP; + } + + if (cpu_dest >= MAX_CPUS) { + rec->issue++; + return XDP_ABORTED; + } + + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} + +SEC("xdp_cpu_map2_round_robin") +int xdp_prognum2_round_robin(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + struct datarec *rec; + u32 cpu_dest; + u32 *cpu_lookup; + u32 key0 = 0; + + u32 *cpu_selected; + u32 *cpu_iterator; + u32 *cpu_max; + u32 cpu_idx; + + cpu_max = bpf_map_lookup_elem(&cpus_count, &key0); + if (!cpu_max) + return XDP_ABORTED; + + cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key0); + if (!cpu_iterator) + return XDP_ABORTED; + cpu_idx = *cpu_iterator; + + *cpu_iterator += 1; + if (*cpu_iterator == *cpu_max) + *cpu_iterator = 0; + + cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx); + if (!cpu_selected) + return XDP_ABORTED; + cpu_dest = *cpu_selected; + + /* Count RX packet in map */ + rec = bpf_map_lookup_elem(&rx_cnt, &key0); + if (!rec) + return XDP_ABORTED; + rec->processed++; + + if (cpu_dest >= MAX_CPUS) { + rec->issue++; + return XDP_ABORTED; + } + + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} + +SEC("xdp_cpu_map3_proto_separate") +int xdp_prognum3_proto_separate(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + u8 ip_proto = IPPROTO_UDP; + struct datarec *rec; + 
u16 eth_proto = 0; + u64 l3_offset = 0; + u32 cpu_dest = 0; + u32 cpu_idx = 0; + u32 *cpu_lookup; + u32 key = 0; + + /* Count RX packet in map */ + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_ABORTED; + rec->processed++; + + if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset))) + return XDP_PASS; /* Just skip */ + + /* Extract L4 protocol */ + switch (eth_proto) { + case ETH_P_IP: + ip_proto = get_proto_ipv4(ctx, l3_offset); + break; + case ETH_P_IPV6: + ip_proto = get_proto_ipv6(ctx, l3_offset); + break; + case ETH_P_ARP: + cpu_idx = 0; /* ARP packet handled on separate CPU */ + break; + default: + cpu_idx = 0; + } + + /* Choose CPU based on L4 protocol */ + switch (ip_proto) { + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + cpu_idx = 2; + break; + case IPPROTO_TCP: + cpu_idx = 0; + break; + case IPPROTO_UDP: + cpu_idx = 1; + break; + default: + cpu_idx = 0; + } + + cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx); + if (!cpu_lookup) + return XDP_ABORTED; + cpu_dest = *cpu_lookup; + + if (cpu_dest >= MAX_CPUS) { + rec->issue++; + return XDP_ABORTED; + } + + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} + +SEC("xdp_cpu_map4_ddos_filter_pktgen") +int xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + u8 ip_proto = IPPROTO_UDP; + struct datarec *rec; + u16 eth_proto = 0; + u64 l3_offset = 0; + u32 cpu_dest = 0; + u32 cpu_idx = 0; + u16 dest_port; + u32 *cpu_lookup; + u32 key = 0; + + /* Count RX packet in map */ + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_ABORTED; + rec->processed++; + + if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset))) + return XDP_PASS; /* Just skip */ + + /* Extract L4 protocol */ + switch (eth_proto) { + case ETH_P_IP: + ip_proto = get_proto_ipv4(ctx, l3_offset); + break; + case ETH_P_IPV6: + ip_proto = get_proto_ipv6(ctx, l3_offset); + break; + case ETH_P_ARP: 
+ cpu_idx = 0; /* ARP packet handled on separate CPU */ + break; + default: + cpu_idx = 0; + } + + /* Choose CPU based on L4 protocol */ + switch (ip_proto) { + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + cpu_idx = 2; + break; + case IPPROTO_TCP: + cpu_idx = 0; + break; + case IPPROTO_UDP: + cpu_idx = 1; + /* DDoS filter UDP port 9 (pktgen) */ + dest_port = get_dest_port_ipv4_udp(ctx, l3_offset); + if (dest_port == 9) { + if (rec) + rec->dropped++; + return XDP_DROP; + } + break; + default: + cpu_idx = 0; + } + + cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx); + if (!cpu_lookup) + return XDP_ABORTED; + cpu_dest = *cpu_lookup; + + if (cpu_dest >= MAX_CPUS) { + rec->issue++; + return XDP_ABORTED; + } + + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} + +/* Hashing initval */ +#define INITVAL 15485863 + +static __always_inline +u32 get_ipv4_hash_ip_pair(struct xdp_md *ctx, u64 nh_off) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct iphdr *iph = data + nh_off; + u32 cpu_hash; + + if (iph + 1 > data_end) + return 0; + + cpu_hash = iph->saddr + iph->daddr; + cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + iph->protocol); + + return cpu_hash; +} + +static __always_inline +u32 get_ipv6_hash_ip_pair(struct xdp_md *ctx, u64 nh_off) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ipv6hdr *ip6h = data + nh_off; + u32 cpu_hash; + + if (ip6h + 1 > data_end) + return 0; + + cpu_hash = ip6h->saddr.s6_addr32[0] + ip6h->daddr.s6_addr32[0]; + cpu_hash += ip6h->saddr.s6_addr32[1] + ip6h->daddr.s6_addr32[1]; + cpu_hash += ip6h->saddr.s6_addr32[2] + ip6h->daddr.s6_addr32[2]; + cpu_hash += ip6h->saddr.s6_addr32[3] + ip6h->daddr.s6_addr32[3]; + cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + ip6h->nexthdr); + + return cpu_hash; +} + +/* Load-Balance traffic based on hashing IP-addrs + L4-proto. 
The + * hashing scheme is symmetric, meaning swapping IP src/dest still hit + * same CPU. + */ +SEC("xdp_cpu_map5_lb_hash_ip_pairs") +int xdp_prognum5_lb_hash_ip_pairs(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + u8 ip_proto = IPPROTO_UDP; + struct datarec *rec; + u16 eth_proto = 0; + u64 l3_offset = 0; + u32 cpu_dest = 0; + u32 cpu_idx = 0; + u32 *cpu_lookup; + u32 *cpu_max; + u32 cpu_hash; + u32 key = 0; + + /* Count RX packet in map */ + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_ABORTED; + rec->processed++; + + cpu_max = bpf_map_lookup_elem(&cpus_count, &key); + if (!cpu_max) + return XDP_ABORTED; + + if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset))) + return XDP_PASS; /* Just skip */ + + /* Hash for IPv4 and IPv6 */ + switch (eth_proto) { + case ETH_P_IP: + cpu_hash = get_ipv4_hash_ip_pair(ctx, l3_offset); + break; + case ETH_P_IPV6: + cpu_hash = get_ipv6_hash_ip_pair(ctx, l3_offset); + break; + case ETH_P_ARP: /* ARP packet handled on CPU idx 0 */ + default: + cpu_hash = 0; + } + + /* Choose CPU based on hash */ + cpu_idx = cpu_hash % *cpu_max; + + cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx); + if (!cpu_lookup) + return XDP_ABORTED; + cpu_dest = *cpu_lookup; + + if (cpu_dest >= MAX_CPUS) { + rec->issue++; + return XDP_ABORTED; + } + + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} + +char _license[] SEC("license") = "GPL"; + +/*** Trace point code ***/ + +/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct xdp_redirect_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int prog_id; // offset:8; size:4; signed:1; + u32 act; // offset:12 size:4; signed:0; + int ifindex; // offset:16 size:4; signed:1; + int err; // offset:20 size:4; signed:1; + int to_ifindex; // offset:24 size:4; signed:1; + u32 map_id; // offset:28 
size:4; signed:0; + int map_index; // offset:32 size:4; signed:1; +}; // offset:36 + +enum { + XDP_REDIRECT_SUCCESS = 0, + XDP_REDIRECT_ERROR = 1 +}; + +static __always_inline +int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx) +{ + u32 key = XDP_REDIRECT_ERROR; + struct datarec *rec; + int err = ctx->err; + + if (!err) + key = XDP_REDIRECT_SUCCESS; + + rec = bpf_map_lookup_elem(&redirect_err_cnt, &key); + if (!rec) + return 0; + rec->dropped += 1; + + return 0; /* Indicate event was filtered (no further processing)*/ + /* + * Returning 1 here would allow e.g. a perf-record tracepoint + * to see and record these events, but it doesn't work well + * in-practice as stopping perf-record also unload this + * bpf_prog. Plus, there is additional overhead of doing so. + */ +} + +SEC("tracepoint/xdp/xdp_redirect_err") +int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx) +{ + return xdp_redirect_collect_stat(ctx); +} + +SEC("tracepoint/xdp/xdp_redirect_map_err") +int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx) +{ + return xdp_redirect_collect_stat(ctx); +} + +/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct xdp_exception_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int prog_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int ifindex; // offset:16; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_exception") +int trace_xdp_exception(struct xdp_exception_ctx *ctx) +{ + struct datarec *rec; + u32 key = 0; + + rec = bpf_map_lookup_elem(&exception_cnt, &key); + if (!rec) + return 1; + rec->dropped += 1; + + return 0; +} + +/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct cpumap_enqueue_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int map_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; 
size:4; signed:0; + int cpu; // offset:16; size:4; signed:1; + unsigned int drops; // offset:20; size:4; signed:0; + unsigned int processed; // offset:24; size:4; signed:0; + int to_cpu; // offset:28; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_cpumap_enqueue") +int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx) +{ + u32 to_cpu = ctx->to_cpu; + struct datarec *rec; + + if (to_cpu >= MAX_CPUS) + return 1; + + rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu); + if (!rec) + return 0; + rec->processed += ctx->processed; + rec->dropped += ctx->drops; + + /* Record bulk events, then userspace can calc average bulk size */ + if (ctx->processed > 0) + rec->issue += 1; + + /* Inception: It's possible to detect overload situations, via + * this tracepoint. This can be used for creating a feedback + * loop to XDP, which can take appropriate actions to mitigate + * this overload situation. + */ + return 0; +} + +/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct cpumap_kthread_ctx { + u64 __pad; // First 8 bytes are not accessible + int map_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int cpu; // offset:16; size:4; signed:1; + unsigned int drops; // offset:20; size:4; signed:0; + unsigned int processed; // offset:24; size:4; signed:0; + int sched; // offset:28; size:4; signed:1; + unsigned int xdp_pass; // offset:32; size:4; signed:0; + unsigned int xdp_drop; // offset:36; size:4; signed:0; + unsigned int xdp_redirect; // offset:40; size:4; signed:0; +}; + +SEC("tracepoint/xdp/xdp_cpumap_kthread") +int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx) +{ + struct datarec *rec; + u32 key = 0; + + rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key); + if (!rec) + return 0; + rec->processed += ctx->processed; + rec->dropped += ctx->drops; + rec->xdp_pass += ctx->xdp_pass; + rec->xdp_drop += ctx->xdp_drop; + rec->xdp_redirect += 
ctx->xdp_redirect; + + /* Count times kthread yielded CPU via schedule call */ + if (ctx->sched) + rec->issue++; + + return 0; +} diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c new file mode 100644 index 000000000..16eb839e7 --- /dev/null +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -0,0 +1,983 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. + */ +static const char *__doc__ = + " XDP redirect with a CPU-map type \"BPF_MAP_TYPE_CPUMAP\""; + +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <unistd.h> +#include <locale.h> +#include <sys/resource.h> +#include <sys/sysinfo.h> +#include <getopt.h> +#include <net/if.h> +#include <time.h> +#include <linux/limits.h> + +#include <arpa/inet.h> +#include <linux/if_link.h> + +/* How many xdp_progs are defined in _kern.c */ +#define MAX_PROG 6 + +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +#include "bpf_util.h" + +static int ifindex = -1; +static char ifname_buf[IF_NAMESIZE]; +static char *ifname; +static __u32 prog_id; + +static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; +static int n_cpus; + +enum map_type { + CPU_MAP, + RX_CNT, + REDIRECT_ERR_CNT, + CPUMAP_ENQUEUE_CNT, + CPUMAP_KTHREAD_CNT, + CPUS_AVAILABLE, + CPUS_COUNT, + CPUS_ITERATOR, + EXCEPTION_CNT, +}; + +static const char *const map_type_strings[] = { + [CPU_MAP] = "cpu_map", + [RX_CNT] = "rx_cnt", + [REDIRECT_ERR_CNT] = "redirect_err_cnt", + [CPUMAP_ENQUEUE_CNT] = "cpumap_enqueue_cnt", + [CPUMAP_KTHREAD_CNT] = "cpumap_kthread_cnt", + [CPUS_AVAILABLE] = "cpus_available", + [CPUS_COUNT] = "cpus_count", + [CPUS_ITERATOR] = "cpus_iterator", + [EXCEPTION_CNT] = "exception_cnt", +}; + +#define NUM_TP 5 +#define NUM_MAP 9 +struct bpf_link *tp_links[NUM_TP] = {}; +static int map_fds[NUM_MAP]; +static int tp_cnt = 0; + +/* Exit return codes */ +#define EXIT_OK 0 +#define EXIT_FAIL 1 
+#define EXIT_FAIL_OPTION	2
+#define EXIT_FAIL_XDP		3
+#define EXIT_FAIL_BPF		4
+#define EXIT_FAIL_MEM		5
+
+/* Long options; .val is the short option letter handled in main() */
+static const struct option long_options[] = {
+	{"help",	no_argument,		NULL, 'h' },
+	{"dev",		required_argument,	NULL, 'd' },
+	{"skb-mode",	no_argument,		NULL, 'S' },
+	{"sec",		required_argument,	NULL, 's' },
+	{"progname",	required_argument,	NULL, 'p' },
+	{"qsize",	required_argument,	NULL, 'q' },
+	{"cpu",		required_argument,	NULL, 'c' },
+	{"stress-mode", no_argument,		NULL, 'x' },
+	{"no-separators", no_argument,		NULL, 'z' },
+	{"force",	no_argument,		NULL, 'F' },
+	{"mprog-disable", no_argument,		NULL, 'n' },
+	{"mprog-name",	required_argument,	NULL, 'e' },
+	{"mprog-filename", required_argument,	NULL, 'f' },
+	{"redirect-device", required_argument,	NULL, 'r' },
+	{"redirect-map", required_argument,	NULL, 'm' },
+	{0, 0, NULL,  0 }
+};
+
+/* Signal handler: detach the XDP program from ifindex, but only when the
+ * program currently attached there still has the id recorded in prog_id,
+ * then destroy every attached tracepoint link and exit.
+ */
+static void int_exit(int sig)
+{
+	__u32 curr_prog_id = 0;
+
+	if (ifindex > -1) {
+		if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, xdp_flags)) {
+			printf("bpf_get_link_xdp_id failed\n");
+			exit(EXIT_FAIL);
+		}
+		if (prog_id == curr_prog_id) {
+			fprintf(stderr,
+				"Interrupted: Removing XDP program on ifindex:%d device:%s\n",
+				ifindex, ifname);
+			bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
+		} else if (!curr_prog_id) {
+			printf("couldn't find a prog id on a given iface\n");
+		} else {
+			printf("program on interface changed, not removing\n");
+		}
+	}
+	/* Detach tracepoints */
+	while (tp_cnt)
+		bpf_link__destroy(tp_links[--tp_cnt]);
+
+	exit(EXIT_OK);
+}
+
+/* Print the section name of every XDP program found in obj */
+static void print_avail_progs(struct bpf_object *obj)
+{
+	struct bpf_program *pos;
+
+	bpf_object__for_each_program(pos, obj) {
+		if (bpf_program__is_xdp(pos))
+			printf(" %s\n", bpf_program__section_name(pos));
+	}
+}
+
+/* Print __doc__, every entry of long_options[] and the XDP programs in obj */
+static void usage(char *argv[], struct bpf_object *obj)
+{
+	int i;
+
+	printf("\nDOCUMENTATION:\n%s\n", __doc__);
+	printf("\n");
+	printf(" Usage: %s (options-see-below)\n", argv[0]);
+	printf(" Listing options:\n");
+	for (i = 0; long_options[i].name != 0; i++) {
+		printf(" --%-12s", long_options[i].name);
+		if (long_options[i].flag != NULL)
+			printf(" flag (internal value:%d)",
+				*long_options[i].flag);
+		else
+			printf(" short-option: -%c",
+				long_options[i].val);
+		printf("\n");
+	}
+	printf("\n Programs to be used for --progname:\n");
+	print_avail_progs(obj);
+	printf("\n");
+}
+
+/* gettime returns a CLOCK_MONOTONIC timestamp in nanoseconds; it is only
+ * used to compute the period between two samples. Exits on failure.
+ * Cost: clock_gettime (ns) => 26ns (CLOCK_MONOTONIC)
+ *       clock_gettime (ns) =>  9ns (CLOCK_MONOTONIC_COARSE)
+ */
+#define NANOSEC_PER_SEC 1000000000 /* 10^9 */
+static __u64 gettime(void)
+{
+	struct timespec t;
+	int res;
+
+	res = clock_gettime(CLOCK_MONOTONIC, &t);
+	if (res < 0) {
+		/* The failing call is clock_gettime(), so name it as such */
+		fprintf(stderr, "Error with clock_gettime! (%i)\n", res);
+		exit(EXIT_FAIL);
+	}
+	return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec;
+}
+
+/* Common stats data record shared with _kern.c */
+struct datarec {
+	__u64 processed;
+	__u64 dropped;
+	__u64 issue;
+	__u64 xdp_pass;
+	__u64 xdp_drop;
+	__u64 xdp_redirect;
+};
+/* One sample of a per-CPU map entry: when it was read, the sum over all
+ * CPUs, and the individual per-CPU values (array from alloc_record_per_cpu).
+ */
+struct record {
+	__u64 timestamp;
+	struct datarec total;
+	struct datarec *cpu;
+};
+/* One full sample; enq[] holds n_cpus entries (see alloc_stats_record) */
+struct stats_record {
+	struct record rx_cnt;
+	struct record redir_err;
+	struct record kthread;
+	struct record exception;
+	struct record enq[];
+};
+
+/* Read entry 'key' of the per-CPU map 'fd' into rec: store each CPU's
+ * counters in rec->cpu[], their sums in rec->total and the read time in
+ * rec->timestamp. Returns false (rec untouched) when the lookup fails.
+ */
+static bool map_collect_percpu(int fd, __u32 key, struct record *rec)
+{
+	/* For percpu maps, userspace gets a value per possible CPU */
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	struct datarec values[nr_cpus];
+	__u64 sum_xdp_redirect = 0;
+	__u64 sum_xdp_pass = 0;
+	__u64 sum_xdp_drop = 0;
+	__u64 sum_processed = 0;
+	__u64 sum_dropped = 0;
+	__u64 sum_issue = 0;
+	int i;
+
+	if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
+		fprintf(stderr,
+			"ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
+		return false;
+	}
+	/* Get time as close as possible to reading map contents */
+	rec->timestamp = gettime();
+
+	/* Record and sum values from each CPU */
+	for (i = 0; i < nr_cpus; i++) {
+		rec->cpu[i].processed = values[i].processed;
+		sum_processed        += values[i].processed;
+		rec->cpu[i].dropped = values[i].dropped;
+		sum_dropped        += values[i].dropped;
+		rec->cpu[i].issue = values[i].issue;
+		sum_issue        += values[i].issue;
+		rec->cpu[i].xdp_pass = values[i].xdp_pass;
+		sum_xdp_pass += values[i].xdp_pass;
+		rec->cpu[i].xdp_drop = values[i].xdp_drop;
+		sum_xdp_drop += values[i].xdp_drop;
+		rec->cpu[i].xdp_redirect = values[i].xdp_redirect;
+		sum_xdp_redirect += values[i].xdp_redirect;
+	}
+	rec->total.processed = sum_processed;
+	rec->total.dropped   = sum_dropped;
+	rec->total.issue     = sum_issue;
+	rec->total.xdp_pass  = sum_xdp_pass;
+	rec->total.xdp_drop  = sum_xdp_drop;
+	rec->total.xdp_redirect = sum_xdp_redirect;
+	return true;
+}
+
+/* Allocate a zeroed datarec array with one slot per possible CPU.
+ * Exits with EXIT_FAIL_MEM on allocation failure; caller frees.
+ */
+static struct datarec *alloc_record_per_cpu(void)
+{
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	struct datarec *array;
+
+	array = calloc(nr_cpus, sizeof(struct datarec));
+	if (!array) {
+		fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus);
+		exit(EXIT_FAIL_MEM);
+	}
+	return array;
+}
+
+/* Allocate a zeroed stats_record with n_cpus enq[] entries and a per-CPU
+ * array for every record in it. Release with free_stats_record().
+ */
+static struct stats_record *alloc_stats_record(void)
+{
+	struct stats_record *rec;
+	int i, size;
+
+	size = sizeof(*rec) + n_cpus * sizeof(struct record);
+	rec = malloc(size);
+	if (!rec) {
+		fprintf(stderr, "Mem alloc error\n");
+		exit(EXIT_FAIL_MEM);
+	}
+	memset(rec, 0, size);
+	rec->rx_cnt.cpu    = alloc_record_per_cpu();
+	rec->redir_err.cpu = alloc_record_per_cpu();
+	rec->kthread.cpu   = alloc_record_per_cpu();
+	rec->exception.cpu = alloc_record_per_cpu();
+	for (i = 0; i < n_cpus; i++)
+		rec->enq[i].cpu = alloc_record_per_cpu();
+
+	return rec;
+}
+
+/* Free a stats_record and every per-CPU array it owns */
+static void free_stats_record(struct stats_record *r)
+{
+	int i;
+
+	for (i = 0; i < n_cpus; i++)
+		free(r->enq[i].cpu);
+	free(r->exception.cpu);
+	free(r->kthread.cpu);
+	free(r->redir_err.cpu);
+	free(r->rx_cnt.cpu);
+	free(r);
+}
+
+/* Seconds elapsed between sample p (previous) and r; 0 if no time passed */
+static double calc_period(struct record *r, struct record *p)
+{
+	double period_ = 0;
+	__u64 period = 0;
+
+	period = r->timestamp - p->timestamp;
+	if (period > 0)
+		period_ = ((double) period / NANOSEC_PER_SEC);
+
+	return period_;
+}
+
+/* Rate of 'processed' between p and r per second; 0 when period_ <= 0 */
+static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_)
+{
+	__u64 packets = 0;
+	__u64 pps = 0;
+
+	if (period_ > 0) {
+		packets = r->processed - p->processed;
+		pps = packets / period_;
+	}
+	return pps;
+}
+
+/* Rate of 'dropped' between p and r per second; 0 when period_ <= 0 */
+static __u64 calc_drop_pps(struct datarec *r, struct datarec *p, double period_)
+{
+	__u64 packets = 0;
+	__u64 pps = 0;
+
+	if (period_ > 0) {
+		packets = r->dropped - p->dropped;
+		pps = packets / period_;
+	}
+	return pps;
+}
+
+/* Rate of 'issue' between p and r per second; 0 when period_ <= 0 */
+static __u64 calc_errs_pps(struct datarec *r,
+			    struct datarec *p, double period_)
+{
+	__u64 packets = 0;
+	__u64 pps = 0;
+
+	if (period_ > 0) {
+		packets = r->issue - p->issue;
+		pps = packets / period_;
+	}
+	return pps;
+}
+
+/* Per-second rates of the xdp_pass/xdp_drop/xdp_redirect counters between
+ * p and r, written through the out pointers; all 0 when period_ <= 0.
+ */
+static void calc_xdp_pps(struct datarec *r, struct datarec *p,
+			 double *xdp_pass, double *xdp_drop,
+			 double *xdp_redirect, double period_)
+{
+	*xdp_pass = 0, *xdp_drop = 0, *xdp_redirect = 0;
+	if (period_ > 0) {
+		*xdp_redirect = (r->xdp_redirect - p->xdp_redirect) / period_;
+		*xdp_pass = (r->xdp_pass - p->xdp_pass) / period_;
+		*xdp_drop = (r->xdp_drop - p->xdp_drop) / period_;
+	}
+}
+
+/* Print one report computed from the difference between stats_prev and
+ * stats_rec: RX, cpumap enqueue (per destination CPU), cpumap kthread,
+ * redirect-error and exception rates, and, when mprog_fd > 0, the rates
+ * of the XDP program attached to the cpumap entries.
+ */
+static void stats_print(struct stats_record *stats_rec,
+			struct stats_record *stats_prev,
+			char *prog_name, char *mprog_name, int mprog_fd)
+{
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	double pps = 0, drop = 0, err = 0;
+	bool mprog_enabled = false;
+	struct record *rec, *prev;
+	int to_cpu;
+	double t;
+	int i;
+
+	if (mprog_fd > 0)
+		mprog_enabled = true;
+
+	/* Header */
+	printf("Running XDP/eBPF prog_name:%s\n", prog_name);
+	printf("%-15s %-7s %-14s %-11s %-9s\n",
+	       "XDP-cpumap", "CPU:to", "pps", "drop-pps", "extra-info");
+
+	/* XDP rx_cnt */
+	{
+		char *fmt_rx = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f %s\n";
+		char *fm2_rx = "%-15s %-7s %'-14.0f %'-11.0f\n";
+		char *errstr = "";
+
+		rec  = &stats_rec->rx_cnt;
+		prev = &stats_prev->rx_cnt;
+		t = calc_period(rec, prev);
+		for (i = 0; i < nr_cpus; i++) {
+			struct datarec *r = &rec->cpu[i];
+			struct datarec *p = &prev->cpu[i];
+
+			pps = calc_pps(r, p, t);
+			drop = calc_drop_pps(r, p, t);
+			err  = calc_errs_pps(r, p, t);
+			if (err > 0)
+				errstr = "cpu-dest/err";
+			if (pps > 0)
+				printf(fmt_rx, "XDP-RX",
+					i, pps, drop, err, errstr);
+		}
+		pps  = calc_pps(&rec->total, &prev->total, t);
+		drop = calc_drop_pps(&rec->total, &prev->total, t);
+		err  = calc_errs_pps(&rec->total, &prev->total, t);
+		printf(fm2_rx, "XDP-RX", "total", pps, drop);
+	}
+
+	/* cpumap enqueue stats */
+	for (to_cpu = 0; to_cpu < n_cpus; to_cpu++) {
+		char *fmt = "%-15s %3d:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n";
+		char *fm2 = "%-15s %3s:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n";
+		char *errstr = "";
+
+		rec  =  &stats_rec->enq[to_cpu];
+		prev = &stats_prev->enq[to_cpu];
+		t = calc_period(rec, prev);
+		for (i = 0; i < nr_cpus; i++) {
+			struct datarec *r = &rec->cpu[i];
+			struct datarec *p = &prev->cpu[i];
+
+			pps  = calc_pps(r, p, t);
+			drop = calc_drop_pps(r, p, t);
+			err  = calc_errs_pps(r, p, t);
+			if (err > 0) {
+				errstr = "bulk-average";
+				err = pps / err; /* calc average bulk size */
+			}
+			if (pps > 0)
+				printf(fmt, "cpumap-enqueue",
+				       i, to_cpu, pps, drop, err, errstr);
+		}
+		pps = calc_pps(&rec->total, &prev->total, t);
+		if (pps > 0) {
+			drop = calc_drop_pps(&rec->total, &prev->total, t);
+			err  = calc_errs_pps(&rec->total, &prev->total, t);
+			if (err > 0) {
+				errstr = "bulk-average";
+				err = pps / err; /* calc average bulk size */
+			}
+			printf(fm2, "cpumap-enqueue",
+			       "sum", to_cpu, pps, drop, err, errstr);
+		}
+	}
+
+	/* cpumap kthread stats */
+	{
+		char *fmt_k = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f %s\n";
+		char *fm2_k = "%-15s %-7s %'-14.0f %'-11.0f %'-10.0f %s\n";
+		char *e_str = "";
+
+		rec  = &stats_rec->kthread;
+		prev = &stats_prev->kthread;
+		t = calc_period(rec, prev);
+		for (i = 0; i < nr_cpus; i++) {
+			struct datarec *r = &rec->cpu[i];
+			struct datarec *p = &prev->cpu[i];
+
+			pps  = calc_pps(r, p, t);
+			drop = calc_drop_pps(r, p, t);
+			err  = calc_errs_pps(r, p, t);
+			if (err > 0)
+				e_str = "sched";
+			if (pps > 0)
+				printf(fmt_k, "cpumap_kthread",
+				       i, pps, drop, err, e_str);
+		}
+		pps = calc_pps(&rec->total, &prev->total, t);
+		drop = calc_drop_pps(&rec->total, &prev->total, t);
+		err  = calc_errs_pps(&rec->total, &prev->total, t);
+		if (err > 0)
+			e_str = "sched-sum";
+		printf(fm2_k, "cpumap_kthread", "total", pps, drop, err, e_str);
+	}
+
+	/* XDP redirect err tracepoints (very unlikely) */
+	{
+		char *fmt_err = "%-15s %-7d %'-14.0f %'-11.0f\n";
+		char *fm2_err = "%-15s %-7s %'-14.0f %'-11.0f\n";
+
+		rec  = &stats_rec->redir_err;
+		prev = &stats_prev->redir_err;
+		t = calc_period(rec, prev);
+		for (i = 0; i < nr_cpus; i++) {
+			struct datarec *r = &rec->cpu[i];
+			struct datarec *p = &prev->cpu[i];
+
+			pps  = calc_pps(r, p, t);
+			drop = calc_drop_pps(r, p, t);
+			if (pps > 0)
+				printf(fmt_err, "redirect_err", i, pps, drop);
+		}
+		pps = calc_pps(&rec->total, &prev->total, t);
+		drop = calc_drop_pps(&rec->total, &prev->total, t);
+		printf(fm2_err, "redirect_err", "total", pps, drop);
+	}
+
+	/* XDP general exception tracepoints */
+	{
+		char *fmt_err = "%-15s %-7d %'-14.0f %'-11.0f\n";
+		char *fm2_err = "%-15s %-7s %'-14.0f %'-11.0f\n";
+
+		rec  = &stats_rec->exception;
+		prev = &stats_prev->exception;
+		t = calc_period(rec, prev);
+		for (i = 0; i < nr_cpus; i++) {
+			struct datarec *r = &rec->cpu[i];
+			struct datarec *p = &prev->cpu[i];
+
+			pps  = calc_pps(r, p, t);
+			drop = calc_drop_pps(r, p, t);
+			if (pps > 0)
+				printf(fmt_err, "xdp_exception", i, pps, drop);
+		}
+		pps = calc_pps(&rec->total, &prev->total, t);
+		drop = calc_drop_pps(&rec->total, &prev->total, t);
+		printf(fm2_err, "xdp_exception", "total", pps, drop);
+	}
+
+	/* CPUMAP attached XDP program that runs on remote/destination CPU */
+	if (mprog_enabled) {
+		char *fmt_k = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f\n";
+		char *fm2_k = "%-15s %-7s %'-14.0f %'-11.0f %'-10.0f\n";
+		double xdp_pass, xdp_drop, xdp_redirect;
+
+		printf("\n2nd remote XDP/eBPF prog_name: %s\n", mprog_name);
+		printf("%-15s %-7s %-14s %-11s %-9s\n",
+		       "XDP-cpumap", "CPU:to", "xdp-pass", "xdp-drop", "xdp-redir");
+
+		rec  = &stats_rec->kthread;
+		prev = &stats_prev->kthread;
+		t = calc_period(rec, prev);
+		for (i = 0; i < nr_cpus; i++) {
+			struct datarec *r = &rec->cpu[i];
+			struct datarec *p = &prev->cpu[i];
+
+			calc_xdp_pps(r, p, &xdp_pass, &xdp_drop,
+				     &xdp_redirect, t);
+			if (xdp_pass > 0 || xdp_drop > 0 || xdp_redirect > 0)
+				printf(fmt_k, "xdp-in-kthread", i, xdp_pass, xdp_drop,
+				       xdp_redirect);
+		}
+		calc_xdp_pps(&rec->total, &prev->total, &xdp_pass, &xdp_drop,
+			     &xdp_redirect, t);
+		printf(fm2_k, "xdp-in-kthread", "total", xdp_pass, xdp_drop, xdp_redirect);
+	}
+
+	printf("\n");
+	fflush(stdout);
+}
+
+/* Take one sample of every statistics map into rec.
+ * NOTE(review): redirect_err is read at key 1 while the other single-entry
+ * maps use key 0; presumably key 1 is the error slot -- confirm in _kern.c.
+ */
+static void stats_collect(struct stats_record *rec)
+{
+	int fd, i;
+
+	fd = map_fds[RX_CNT];
+	map_collect_percpu(fd, 0, &rec->rx_cnt);
+
+	fd = map_fds[REDIRECT_ERR_CNT];
+	map_collect_percpu(fd, 1, &rec->redir_err);
+
+	fd = map_fds[CPUMAP_ENQUEUE_CNT];
+	for (i = 0; i < n_cpus; i++)
+		map_collect_percpu(fd, i, &rec->enq[i]);
+
+	fd = map_fds[CPUMAP_KTHREAD_CNT];
+	map_collect_percpu(fd, 0, &rec->kthread);
+
+	fd = map_fds[EXCEPTION_CNT];
+	map_collect_percpu(fd, 0, &rec->exception);
+}
+
+
+/* Pointer swap trick */
+static inline void swap(struct stats_record **a, struct stats_record **b)
+{
+	struct stats_record *tmp;
+
+	tmp = *a;
+	*a = *b;
+	*b = tmp;
+}
+
+/* Install 'value' for 'cpu' in the cpumap, publish 'cpu' at slot
+ * avail_idx of cpus_available and, when 'new' is true, increment
+ * cpus_count. Any map failure terminates the process with EXIT_FAIL_BPF.
+ * Always returns 0.
+ */
+static int create_cpu_entry(__u32 cpu, struct bpf_cpumap_val *value,
+			    __u32 avail_idx, bool new)
+{
+	__u32 curr_cpus_count = 0;
+	__u32 key = 0;
+	int ret;
+
+	/* Add a CPU entry to cpumap, as this allocates a cpu entry in
+	 * the kernel for the cpu.
+	 */
+	ret = bpf_map_update_elem(map_fds[CPU_MAP], &cpu, value, 0);
+	if (ret) {
+		fprintf(stderr, "Create CPU entry failed (err:%d)\n", ret);
+		exit(EXIT_FAIL_BPF);
+	}
+
+	/* Inform bpf_prog's that a new CPU is available to select
+	 * from via some control maps.
+	 */
+	ret = bpf_map_update_elem(map_fds[CPUS_AVAILABLE], &avail_idx, &cpu, 0);
+	if (ret) {
+		fprintf(stderr, "Add to avail CPUs failed\n");
+		exit(EXIT_FAIL_BPF);
+	}
+
+	/* When not replacing/updating existing entry, bump the count */
+	ret = bpf_map_lookup_elem(map_fds[CPUS_COUNT], &key, &curr_cpus_count);
+	if (ret) {
+		fprintf(stderr, "Failed reading curr cpus_count\n");
+		exit(EXIT_FAIL_BPF);
+	}
+	if (new) {
+		curr_cpus_count++;
+		ret = bpf_map_update_elem(map_fds[CPUS_COUNT], &key,
+					  &curr_cpus_count, 0);
+		if (ret) {
+			fprintf(stderr, "Failed write curr cpus_count\n");
+			exit(EXIT_FAIL_BPF);
+		}
+	}
+	/* map_fd[7] = cpus_iterator */
+	printf("%s CPU:%u as idx:%u qsize:%d prog_fd: %d (cpus_count:%u)\n",
+	       new ? "Add-new":"Replace", cpu, avail_idx,
+	       value->qsize, value->bpf_prog.fd, curr_cpus_count);
+
+	return 0;
+}
+
+/* CPUs are zero-indexed. Thus, add a special sentinel default value
+ * in map cpus_available to mark CPU indexes not configured
+ */
+static void mark_cpus_unavailable(void)
+{
+	__u32 invalid_cpu = n_cpus;
+	int ret, i;
+
+	for (i = 0; i < n_cpus; i++) {
+		ret = bpf_map_update_elem(map_fds[CPUS_AVAILABLE], &i,
+					  &invalid_cpu, 0);
+		if (ret) {
+			fprintf(stderr, "Failed marking CPU unavailable\n");
+			exit(EXIT_FAIL_BPF);
+		}
+	}
+}
+
+/* Stress cpumap management code by concurrently changing underlying cpumap */
+static void stress_cpumap(struct bpf_cpumap_val *value)
+{
+	/* Changing qsize will cause kernel to free and alloc a new
+	 * bpf_cpu_map_entry, with an associated/complicated tear-down
+	 * procedure.
+	 */
+	value->qsize = 1024;
+	create_cpu_entry(1, value, 0, false);
+	value->qsize = 8;
+	create_cpu_entry(1, value, 0, false);
+	value->qsize = 16000;
+	create_cpu_entry(1, value, 0, false);
+}
+
+/* Sample and print statistics every 'interval' seconds, forever; in
+ * stress_mode the cpumap entry for CPU 1 is also rewritten each round.
+ * The loop has no exit, so the frees below it are never reached; the
+ * process ends via int_exit().
+ */
+static void stats_poll(int interval, bool use_separators, char *prog_name,
+		       char *mprog_name, struct bpf_cpumap_val *value,
+		       bool stress_mode)
+{
+	struct stats_record *record, *prev;
+	int mprog_fd;
+
+	record = alloc_stats_record();
+	prev   = alloc_stats_record();
+	stats_collect(record);
+
+	/* Trick to pretty printf with thousands separators use %' */
+	if (use_separators)
+		setlocale(LC_NUMERIC, "en_US");
+
+	while (1) {
+		swap(&prev, &record);
+		mprog_fd = value->bpf_prog.fd;
+		stats_collect(record);
+		stats_print(record, prev, prog_name, mprog_name, mprog_fd);
+		sleep(interval);
+		if (stress_mode)
+			stress_cpumap(value);
+	}
+
+	free_stats_record(record);
+	free_stats_record(prev);
+}
+
+/* Attach every tracepoint program in obj, recording the links in
+ * tp_links[] and their number in tp_cnt. Returns 0 on success or -EINVAL
+ * when an attach fails or obj holds more than NUM_TP tracepoint programs.
+ */
+static int init_tracepoints(struct bpf_object *obj)
+{
+	struct bpf_program *prog;
+
+	bpf_object__for_each_program(prog, obj) {
+		if (bpf_program__is_tracepoint(prog) != true)
+			continue;
+
+		/* tp_links[] has only NUM_TP slots; never write past it */
+		if (tp_cnt >= NUM_TP) {
+			fprintf(stderr, "ERR: more than %d tracepoint programs\n",
+				NUM_TP);
+			return -EINVAL;
+		}
+
+		tp_links[tp_cnt] = bpf_program__attach(prog);
+		if (libbpf_get_error(tp_links[tp_cnt])) {
+			tp_links[tp_cnt] = NULL;
+			return -EINVAL;
+		}
+		tp_cnt++;
+	}
+
+	return 0;
+}
+
+/* Resolve the fd of every map named in map_type_strings[] into map_fds[].
+ * Returns 0 on success, -ENOENT as soon as one map is not found.
+ */
+static int init_map_fds(struct bpf_object *obj)
+{
+	enum map_type type;
+
+	for (type = 0; type < NUM_MAP; type++) {
+		map_fds[type] =
+			bpf_object__find_map_fd_by_name(obj,
+							map_type_strings[type]);
+
+		if (map_fds[type] < 0)
+			return -ENOENT;
+	}
+
+	return 0;
+}
+
+/* Load file_name as an XDP object with expected attach type
+ * BPF_XDP_CPUMAP and return the fd of the program whose section title is
+ * prog_name. When both redir_interface and redir_map are given, the
+ * ifindex of redir_interface is first stored at key 0 of map redir_map.
+ * Returns a negative value on every failure; the caller only tests < 0.
+ */
+static int load_cpumap_prog(char *file_name, char *prog_name,
+			    char *redir_interface, char *redir_map)
+{
+	struct bpf_prog_load_attr prog_load_attr = {
+		.prog_type		= BPF_PROG_TYPE_XDP,
+		.expected_attach_type	= BPF_XDP_CPUMAP,
+		.file = file_name,
+	};
+	struct bpf_program *prog;
+	struct bpf_object *obj;
+	int fd;
+
+	if (bpf_prog_load_xattr(&prog_load_attr, &obj, &fd))
+		return -1;
+
+	if (fd < 0) {
+		fprintf(stderr, "ERR: bpf_prog_load_xattr: %s\n",
+			strerror(errno));
+		return fd;
+	}
+
+	if (redir_interface && redir_map) {
+		int err, map_fd, ifindex_out, key = 0;
+
+		map_fd = bpf_object__find_map_fd_by_name(obj, redir_map);
+		if (map_fd < 0)
+			return map_fd;
+
+		ifindex_out = if_nametoindex(redir_interface);
+		if (!ifindex_out)
+			return -1;
+
+		err = bpf_map_update_elem(map_fd, &key, &ifindex_out, 0);
+		if (err < 0)
+			return err;
+	}
+
+	prog = bpf_object__find_program_by_title(obj, prog_name);
+	if (!prog) {
+		fprintf(stderr, "bpf_object__find_program_by_title failed\n");
+		/* Must be negative: a positive EXIT_FAIL would be taken
+		 * for a valid program fd by the caller.
+		 */
+		return -1;
+	}
+
+	return bpf_program__fd(prog);
+}
+
+/* Load "<argv[0]>_kern.o", attach its tracepoints, configure the cpumap
+ * from the command line, attach the selected XDP program to --dev and
+ * print statistics until interrupted. Returns one of the EXIT_* codes.
+ */
+int main(int argc, char **argv)
+{
+	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+	char *prog_name = "xdp_cpu_map5_lb_hash_ip_pairs";
+	char *mprog_filename = "xdp_redirect_kern.o";
+	char *redir_interface = NULL, *redir_map = NULL;
+	char *mprog_name = "xdp_redirect_dummy";
+	bool mprog_disable = false;
+	struct bpf_prog_load_attr prog_load_attr = {
+		.prog_type	= BPF_PROG_TYPE_UNSPEC,
+	};
+	struct bpf_prog_info info = {};
+	__u32 info_len = sizeof(info);
+	struct bpf_cpumap_val value;
+	bool use_separators = true;
+	bool stress_mode = false;
+	struct bpf_program *prog;
+	struct bpf_object *obj;
+	int err = EXIT_FAIL;
+	char filename[256];
+	int added_cpus = 0;
+	int longindex = 0;
+	int interval = 2;
+	int add_cpu = -1;
+	int opt, prog_fd;
+	int *cpu, i;
+	__u32 qsize;
+
+	n_cpus = get_nprocs_conf();
+
+	/* Notice: choosing the queue size is very important with the
+	 * ixgbe driver, because its driver page recycling trick is
+	 * dependent on pages being returned quickly.  The number of
+	 * out-standing packets in the system must be less-than 2x
+	 * RX-ring size.
+	 */
+	qsize = 128+64;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	prog_load_attr.file = filename;
+
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		perror("setrlimit(RLIMIT_MEMLOCK)");
+		return 1;
+	}
+
+	if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
+		return err;
+
+	if (prog_fd < 0) {
+		fprintf(stderr, "ERR: bpf_prog_load_xattr: %s\n",
+			strerror(errno));
+		return err;
+	}
+
+	if (init_tracepoints(obj) < 0) {
+		fprintf(stderr, "ERR: bpf_program__attach failed\n");
+		return err;
+	}
+
+	if (init_map_fds(obj) < 0) {
+		fprintf(stderr, "bpf_object__find_map_fd_by_name failed\n");
+		return err;
+	}
+	mark_cpus_unavailable();
+
+	cpu = malloc(n_cpus * sizeof(int));
+	if (!cpu) {
+		fprintf(stderr, "failed to allocate cpu array\n");
+		return err;
+	}
+	memset(cpu, 0, n_cpus * sizeof(int));
+
+	/* Parse commands line args */
+	while ((opt = getopt_long(argc, argv, "hSd:s:p:q:c:xzFf:e:r:m:n",
+				  long_options, &longindex)) != -1) {
+		switch (opt) {
+		case 'd':
+			if (strlen(optarg) >= IF_NAMESIZE) {
+				fprintf(stderr, "ERR: --dev name too long\n");
+				goto error;
+			}
+			ifname = (char *)&ifname_buf;
+			strncpy(ifname, optarg, IF_NAMESIZE);
+			ifindex = if_nametoindex(ifname);
+			if (ifindex == 0) {
+				fprintf(stderr,
+					"ERR: --dev name unknown err(%d):%s\n",
+					errno, strerror(errno));
+				goto error;
+			}
+			break;
+		case 's':
+			interval = atoi(optarg);
+			break;
+		case 'S':
+			xdp_flags |= XDP_FLAGS_SKB_MODE;
+			break;
+		case 'x':
+			stress_mode = true;
+			break;
+		case 'z':
+			use_separators = false;
+			break;
+		case 'p':
+			/* Selecting eBPF prog to load */
+			prog_name = optarg;
+			break;
+		case 'n':
+			mprog_disable = true;
+			break;
+		case 'f':
+			mprog_filename = optarg;
+			break;
+		case 'e':
+			mprog_name = optarg;
+			break;
+		case 'r':
+			redir_interface = optarg;
+			break;
+		case 'm':
+			redir_map = optarg;
+			break;
+		case 'c':
+			/* Add multiple CPUs */
+			add_cpu = strtoul(optarg, NULL, 0);
+			/* add_cpu is signed: also reject values that wrapped
+			 * negative, they are not valid cpumap indexes.
+			 */
+			if (add_cpu < 0 || add_cpu >= n_cpus) {
+				fprintf(stderr,
+				"--cpu nr too large for cpumap err(%d):%s\n",
+					errno, strerror(errno));
+				goto error;
+			}
+			/* cpu[] holds n_cpus slots; repeated --cpu options
+			 * must not write past its end.
+			 */
+			if (added_cpus >= n_cpus) {
+				fprintf(stderr,
+					"--cpu given more than %d times\n",
+					n_cpus);
+				goto error;
+			}
+			cpu[added_cpus++] = add_cpu;
+			break;
+		case 'q':
+			qsize = atoi(optarg);
+			break;
+		case 'F':
+			xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
+			break;
+		case 'h':
+		error:
+		default:
+			free(cpu);
+			usage(argv, obj);
+			return EXIT_FAIL_OPTION;
+		}
+	}
+
+	if (!(xdp_flags & XDP_FLAGS_SKB_MODE))
+		xdp_flags |= XDP_FLAGS_DRV_MODE;
+
+	/* Required option */
+	if (ifindex == -1) {
+		fprintf(stderr, "ERR: required option --dev missing\n");
+		usage(argv, obj);
+		err = EXIT_FAIL_OPTION;
+		goto out;
+	}
+	/* Required option */
+	if (add_cpu == -1) {
+		fprintf(stderr, "ERR: required option --cpu missing\n");
+		fprintf(stderr, " Specify multiple --cpu option to add more\n");
+		usage(argv, obj);
+		err = EXIT_FAIL_OPTION;
+		goto out;
+	}
+
+	value.bpf_prog.fd = 0;
+	if (!mprog_disable)
+		value.bpf_prog.fd = load_cpumap_prog(mprog_filename, mprog_name,
+						     redir_interface, redir_map);
+	if (value.bpf_prog.fd < 0) {
+		/* Report a defined exit code, not the raw negative fd */
+		err = EXIT_FAIL_BPF;
+		goto out;
+	}
+	value.qsize = qsize;
+
+	for (i = 0; i < added_cpus; i++)
+		create_cpu_entry(cpu[i], &value, i, true);
+
+	/* Remove XDP program when program is interrupted or killed */
+	signal(SIGINT, int_exit);
+	signal(SIGTERM, int_exit);
+
+	prog = bpf_object__find_program_by_title(obj, prog_name);
+	if (!prog) {
+		fprintf(stderr, "bpf_object__find_program_by_title failed\n");
+		goto out;
+	}
+
+	prog_fd = bpf_program__fd(prog);
+	if (prog_fd < 0) {
+		fprintf(stderr, "bpf_program__fd failed\n");
+		goto out;
+	}
+
+	if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) {
+		fprintf(stderr, "link set xdp fd failed\n");
+		err = EXIT_FAIL_XDP;
+		goto out;
+	}
+
+	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+	if (err) {
+		printf("can't get prog info - %s\n", strerror(errno));
+		goto out;
+	}
+	prog_id = info.id;
+
+	stats_poll(interval, use_separators, prog_name, mprog_name,
+		   &value, stress_mode);
+
+	err = EXIT_OK;
+out:
+	free(cpu);
+	return err;
+}
diff --git
a/samples/bpf/xdp_redirect_kern.c b/samples/bpf/xdp_redirect_kern.c
new file mode 100644
index 000000000..d26ec3aa2
--- /dev/null
+++ b/samples/bpf/xdp_redirect_kern.c
@@ -0,0 +1,90 @@
+/* Copyright (c) 2016 John Fastabend <john.r.fastabend@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <bpf/bpf_helpers.h>
+
+/* Single-entry array: key 0 holds the ifindex packets are redirected to */
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, int);
+	__type(value, int);
+	__uint(max_entries, 1);
+} tx_port SEC(".maps");
+
+/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success
+ * feedback.  Redirect TX errors can be caught via a tracepoint.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__type(key, u32);
+	__type(value, long);
+	__uint(max_entries, 1);
+} rxcnt SEC(".maps");
+
+/* Exchange the first six bytes at 'data' with the following six, i.e. the
+ * destination and source MAC of an Ethernet header. The caller must have
+ * checked that at least 12 bytes are readable.
+ */
+static void swap_src_dst_mac(void *data)
+{
+	unsigned short *p = data;
+	unsigned short dst[3];
+
+	dst[0] = p[0];
+	dst[1] = p[1];
+	dst[2] = p[2];
+	p[0] = p[3];
+	p[1] = p[4];
+	p[2] = p[5];
+	p[3] = dst[0];
+	p[4] = dst[1];
+	p[5] = dst[2];
+}
+
+/* Count the packet, swap its MAC addresses and redirect it to the ifindex
+ * stored in tx_port[0]. Returns XDP_DROP when the frame is shorter than
+ * an Ethernet header or tx_port has no entry.
+ */
+SEC("xdp_redirect")
+int xdp_redirect_prog(struct xdp_md *ctx)
+{
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data = (void *)(long)ctx->data;
+	struct ethhdr *eth = data;
+	int rc = XDP_DROP;
+	int *ifindex, port = 0;
+	long *value;
+	u32 key = 0;
+	u64 nh_off;
+
+	/* Bounds check before touching the Ethernet header */
+	nh_off = sizeof(*eth);
+	if (data + nh_off > data_end)
+		return rc;
+
+	ifindex = bpf_map_lookup_elem(&tx_port, &port);
+	if (!ifindex)
+		return rc;
+
+	value = bpf_map_lookup_elem(&rxcnt, &key);
+	if (value)
+		*value += 1;
+
+	swap_src_dst_mac(data);
+	return bpf_redirect(*ifindex, 0);
+}
+
+/* Redirect requires an XDP bpf_prog loaded on the TX device */
+SEC("xdp_redirect_dummy")
+int xdp_redirect_dummy_prog(struct xdp_md *ctx)
+{
+	return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_redirect_map_kern.c b/samples/bpf/xdp_redirect_map_kern.c
new file mode 100644
index 000000000..6489352ab
--- /dev/null
+++ b/samples/bpf/xdp_redirect_map_kern.c
@@ -0,0 +1,92 @@
+/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <bpf/bpf_helpers.h>
+
+/* Device map consulted by bpf_redirect_map(); this program uses key 0 */
+struct {
+	__uint(type, BPF_MAP_TYPE_DEVMAP);
+	__uint(key_size, sizeof(int));
+	__uint(value_size, sizeof(int));
+	__uint(max_entries, 100);
+} tx_port SEC(".maps");
+
+/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success
+ * feedback.  Redirect TX errors can be caught via a tracepoint.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__type(key, u32);
+	__type(value, long);
+	__uint(max_entries, 1);
+} rxcnt SEC(".maps");
+
+/* Exchange the first six bytes at 'data' with the following six, i.e. the
+ * destination and source MAC of an Ethernet header. The caller must have
+ * checked that at least 12 bytes are readable.
+ */
+static void swap_src_dst_mac(void *data)
+{
+	unsigned short *p = data;
+	unsigned short dst[3];
+
+	dst[0] = p[0];
+	dst[1] = p[1];
+	dst[2] = p[2];
+	p[0] = p[3];
+	p[1] = p[4];
+	p[2] = p[5];
+	p[3] = dst[0];
+	p[4] = dst[1];
+	p[5] = dst[2];
+}
+
+/* Count the packet, swap its MAC addresses and redirect it through entry
+ * 0 of the tx_port devmap. Returns XDP_DROP when the frame is shorter
+ * than an Ethernet header.
+ */
+SEC("xdp_redirect_map")
+int xdp_redirect_map_prog(struct xdp_md *ctx)
+{
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data = (void *)(long)ctx->data;
+	struct ethhdr *eth = data;
+	int rc = XDP_DROP;
+	int vport;
+	long *value;
+	u32 key = 0;
+	u64 nh_off;
+
+	/* Bounds check before touching the Ethernet header */
+	nh_off = sizeof(*eth);
+	if (data + nh_off > data_end)
+		return rc;
+
+	/* constant virtual port */
+	vport = 0;
+
+	/* count packet in global counter */
+	value = bpf_map_lookup_elem(&rxcnt, &key);
+	if (value)
+		*value += 1;
+
+	swap_src_dst_mac(data);
+
+	/* send packet out physical port */
+	return bpf_redirect_map(&tx_port, vport, 0);
+}
+
+/* Redirect requires an XDP bpf_prog loaded on the TX device */
+SEC("xdp_redirect_dummy")
+int xdp_redirect_dummy_prog(struct xdp_md *ctx)
+{
+	return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c
new file mode 100644
index 000000000..35e16dee6
--- /dev/null
+++ b/samples/bpf/xdp_redirect_map_user.c
@@ -0,0 +1,222 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io + */ +#include <linux/bpf.h> +#include <linux/if_link.h> +#include <assert.h> +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <net/if.h> +#include <unistd.h> +#include <libgen.h> +#include <sys/resource.h> + +#include "bpf_util.h" +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +static int ifindex_in; +static int ifindex_out; +static bool ifindex_out_xdp_dummy_attached = true; +static __u32 prog_id; +static __u32 dummy_prog_id; + +static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; +static int rxcnt_map_fd; + +static void int_exit(int sig) +{ + __u32 curr_prog_id = 0; + + if (bpf_get_link_xdp_id(ifindex_in, &curr_prog_id, xdp_flags)) { + printf("bpf_get_link_xdp_id failed\n"); + exit(1); + } + if (prog_id == curr_prog_id) + bpf_set_link_xdp_fd(ifindex_in, -1, xdp_flags); + else if (!curr_prog_id) + printf("couldn't find a prog id on iface IN\n"); + else + printf("program on iface IN changed, not removing\n"); + + if (ifindex_out_xdp_dummy_attached) { + curr_prog_id = 0; + if (bpf_get_link_xdp_id(ifindex_out, &curr_prog_id, + xdp_flags)) { + printf("bpf_get_link_xdp_id failed\n"); + exit(1); + } + if (dummy_prog_id == curr_prog_id) + bpf_set_link_xdp_fd(ifindex_out, -1, xdp_flags); + else if (!curr_prog_id) + printf("couldn't find a prog id on iface OUT\n"); + else + printf("program on iface OUT changed, not removing\n"); + } + exit(0); +} + +static void poll_stats(int interval, int ifindex) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + __u64 values[nr_cpus], prev[nr_cpus]; + + memset(prev, 0, sizeof(prev)); + + while (1) { + __u64 sum = 0; + __u32 key = 0; + int i; + + sleep(interval); + assert(bpf_map_lookup_elem(rxcnt_map_fd, &key, values) == 0); + for (i = 0; i < nr_cpus; i++) + sum += (values[i] - prev[i]); + if (sum) + printf("ifindex %i: %10llu 
pkt/s\n", + ifindex, sum / interval); + memcpy(prev, values, sizeof(values)); + } +} + +static void usage(const char *prog) +{ + fprintf(stderr, + "usage: %s [OPTS] <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n\n" + "OPTS:\n" + " -S use skb-mode\n" + " -N enforce native mode\n" + " -F force loading prog\n", + prog); +} + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_prog_load_attr prog_load_attr = { + .prog_type = BPF_PROG_TYPE_XDP, + }; + struct bpf_program *prog, *dummy_prog; + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + int prog_fd, dummy_prog_fd; + const char *optstr = "FSN"; + struct bpf_object *obj; + int ret, opt, key = 0; + char filename[256]; + int tx_port_map_fd; + + while ((opt = getopt(argc, argv, optstr)) != -1) { + switch (opt) { + case 'S': + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; + case 'N': + /* default, set below */ + break; + case 'F': + xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; + break; + default: + usage(basename(argv[0])); + return 1; + } + } + + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + + if (optind == argc) { + printf("usage: %s <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n", argv[0]); + return 1; + } + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + + ifindex_in = if_nametoindex(argv[optind]); + if (!ifindex_in) + ifindex_in = strtoul(argv[optind], NULL, 0); + + ifindex_out = if_nametoindex(argv[optind + 1]); + if (!ifindex_out) + ifindex_out = strtoul(argv[optind + 1], NULL, 0); + + printf("input: %d output: %d\n", ifindex_in, ifindex_out); + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + prog_load_attr.file = filename; + + if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) + return 1; + + prog = bpf_program__next(NULL, obj); + dummy_prog = bpf_program__next(prog, obj); + if (!prog || !dummy_prog) { + printf("finding a prog in obj file failed\n"); + return 1; + 
} + /* bpf_prog_load_xattr gives us the pointer to first prog's fd, + * so we're missing only the fd for dummy prog + */ + dummy_prog_fd = bpf_program__fd(dummy_prog); + if (prog_fd < 0 || dummy_prog_fd < 0) { + printf("bpf_prog_load_xattr: %s\n", strerror(errno)); + return 1; + } + + tx_port_map_fd = bpf_object__find_map_fd_by_name(obj, "tx_port"); + rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt"); + if (tx_port_map_fd < 0 || rxcnt_map_fd < 0) { + printf("bpf_object__find_map_fd_by_name failed\n"); + return 1; + } + + if (bpf_set_link_xdp_fd(ifindex_in, prog_fd, xdp_flags) < 0) { + printf("ERROR: link set xdp fd failed on %d\n", ifindex_in); + return 1; + } + + ret = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); + if (ret) { + printf("can't get prog info - %s\n", strerror(errno)); + return ret; + } + prog_id = info.id; + + /* Loading dummy XDP prog on out-device */ + if (bpf_set_link_xdp_fd(ifindex_out, dummy_prog_fd, + (xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST)) < 0) { + printf("WARN: link set xdp fd failed on %d\n", ifindex_out); + ifindex_out_xdp_dummy_attached = false; + } + + memset(&info, 0, sizeof(info)); + ret = bpf_obj_get_info_by_fd(dummy_prog_fd, &info, &info_len); + if (ret) { + printf("can't get prog info - %s\n", strerror(errno)); + return ret; + } + dummy_prog_id = info.id; + + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + /* populate virtual to physical port map */ + ret = bpf_map_update_elem(tx_port_map_fd, &key, &ifindex_out, 0); + if (ret) { + perror("bpf_update_elem"); + goto out; + } + + poll_stats(2, ifindex_out); + +out: + return 0; +} diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c new file mode 100644 index 000000000..3c92adc2a --- /dev/null +++ b/samples/bpf/xdp_redirect_user.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2016 John Fastabend <john.r.fastabend@intel.com> + */ +#include <linux/bpf.h> +#include <linux/if_link.h> +#include <assert.h> 
+#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <net/if.h> +#include <unistd.h> +#include <libgen.h> +#include <sys/resource.h> + +#include "bpf_util.h" +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +static int ifindex_in; +static int ifindex_out; +static bool ifindex_out_xdp_dummy_attached = true; +static __u32 prog_id; +static __u32 dummy_prog_id; + +static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; +static int rxcnt_map_fd; + +static void int_exit(int sig) +{ + __u32 curr_prog_id = 0; + + if (bpf_get_link_xdp_id(ifindex_in, &curr_prog_id, xdp_flags)) { + printf("bpf_get_link_xdp_id failed\n"); + exit(1); + } + if (prog_id == curr_prog_id) + bpf_set_link_xdp_fd(ifindex_in, -1, xdp_flags); + else if (!curr_prog_id) + printf("couldn't find a prog id on iface IN\n"); + else + printf("program on iface IN changed, not removing\n"); + + if (ifindex_out_xdp_dummy_attached) { + curr_prog_id = 0; + if (bpf_get_link_xdp_id(ifindex_out, &curr_prog_id, + xdp_flags)) { + printf("bpf_get_link_xdp_id failed\n"); + exit(1); + } + if (dummy_prog_id == curr_prog_id) + bpf_set_link_xdp_fd(ifindex_out, -1, xdp_flags); + else if (!curr_prog_id) + printf("couldn't find a prog id on iface OUT\n"); + else + printf("program on iface OUT changed, not removing\n"); + } + exit(0); +} + +static void poll_stats(int interval, int ifindex) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + __u64 values[nr_cpus], prev[nr_cpus]; + + memset(prev, 0, sizeof(prev)); + + while (1) { + __u64 sum = 0; + __u32 key = 0; + int i; + + sleep(interval); + assert(bpf_map_lookup_elem(rxcnt_map_fd, &key, values) == 0); + for (i = 0; i < nr_cpus; i++) + sum += (values[i] - prev[i]); + if (sum) + printf("ifindex %i: %10llu pkt/s\n", + ifindex, sum / interval); + memcpy(prev, values, sizeof(values)); + } +} + +static void usage(const char *prog) +{ + fprintf(stderr, + "usage: %s [OPTS] <IFNAME|IFINDEX>_IN 
<IFNAME|IFINDEX>_OUT\n\n" + "OPTS:\n" + " -S use skb-mode\n" + " -N enforce native mode\n" + " -F force loading prog\n", + prog); +} + + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_prog_load_attr prog_load_attr = { + .prog_type = BPF_PROG_TYPE_XDP, + }; + struct bpf_program *prog, *dummy_prog; + int prog_fd, tx_port_map_fd, opt; + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + const char *optstr = "FSN"; + struct bpf_object *obj; + char filename[256]; + int dummy_prog_fd; + int ret, key = 0; + + while ((opt = getopt(argc, argv, optstr)) != -1) { + switch (opt) { + case 'S': + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; + case 'N': + /* default, set below */ + break; + case 'F': + xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; + break; + default: + usage(basename(argv[0])); + return 1; + } + } + + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + + if (optind + 2 != argc) { + printf("usage: %s <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n", argv[0]); + return 1; + } + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + + ifindex_in = if_nametoindex(argv[optind]); + if (!ifindex_in) + ifindex_in = strtoul(argv[optind], NULL, 0); + + ifindex_out = if_nametoindex(argv[optind + 1]); + if (!ifindex_out) + ifindex_out = strtoul(argv[optind + 1], NULL, 0); + + printf("input: %d output: %d\n", ifindex_in, ifindex_out); + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + prog_load_attr.file = filename; + + if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) + return 1; + + prog = bpf_program__next(NULL, obj); + dummy_prog = bpf_program__next(prog, obj); + if (!prog || !dummy_prog) { + printf("finding a prog in obj file failed\n"); + return 1; + } + /* bpf_prog_load_xattr gives us the pointer to first prog's fd, + * so we're missing only the fd for dummy prog + */ + dummy_prog_fd = bpf_program__fd(dummy_prog); + if 
(prog_fd < 0 || dummy_prog_fd < 0) { + printf("bpf_prog_load_xattr: %s\n", strerror(errno)); + return 1; + } + + tx_port_map_fd = bpf_object__find_map_fd_by_name(obj, "tx_port"); + rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt"); + if (tx_port_map_fd < 0 || rxcnt_map_fd < 0) { + printf("bpf_object__find_map_fd_by_name failed\n"); + return 1; + } + + if (bpf_set_link_xdp_fd(ifindex_in, prog_fd, xdp_flags) < 0) { + printf("ERROR: link set xdp fd failed on %d\n", ifindex_in); + return 1; + } + + ret = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); + if (ret) { + printf("can't get prog info - %s\n", strerror(errno)); + return ret; + } + prog_id = info.id; + + /* Loading dummy XDP prog on out-device */ + if (bpf_set_link_xdp_fd(ifindex_out, dummy_prog_fd, + (xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST)) < 0) { + printf("WARN: link set xdp fd failed on %d\n", ifindex_out); + ifindex_out_xdp_dummy_attached = false; + } + + memset(&info, 0, sizeof(info)); + ret = bpf_obj_get_info_by_fd(dummy_prog_fd, &info, &info_len); + if (ret) { + printf("can't get prog info - %s\n", strerror(errno)); + return ret; + } + dummy_prog_id = info.id; + + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + /* bpf redirect port */ + ret = bpf_map_update_elem(tx_port_map_fd, &key, &ifindex_out, 0); + if (ret) { + perror("bpf_update_elem"); + goto out; + } + + poll_stats(2, ifindex_out); + +out: + return ret; +} diff --git a/samples/bpf/xdp_router_ipv4_kern.c b/samples/bpf/xdp_router_ipv4_kern.c new file mode 100644 index 000000000..b37ca2b13 --- /dev/null +++ b/samples/bpf/xdp_router_ipv4_kern.c @@ -0,0 +1,186 @@ +/* Copyright (C) 2017 Cavium, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. 
+ */ +#define KBUILD_MODNAME "foo" +#include <uapi/linux/bpf.h> +#include <linux/in.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <linux/if_vlan.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <bpf/bpf_helpers.h> +#include <linux/slab.h> +#include <net/ip_fib.h> + +struct trie_value { + __u8 prefix[4]; + __be64 value; + int ifindex; + int metric; + __be32 gw; +}; + +/* Key for lpm_trie*/ +union key_4 { + u32 b32[2]; + u8 b8[8]; +}; + +struct arp_entry { + __be64 mac; + __be32 dst; +}; + +struct direct_map { + struct arp_entry arp; + int ifindex; + __be64 mac; +}; + +/* Map for trie implementation*/ +struct { + __uint(type, BPF_MAP_TYPE_LPM_TRIE); + __uint(key_size, 8); + __uint(value_size, sizeof(struct trie_value)); + __uint(max_entries, 50); + __uint(map_flags, BPF_F_NO_PREALLOC); +} lpm_map SEC(".maps"); + +/* Map for counter*/ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, 256); +} rxcnt SEC(".maps"); + +/* Map for ARP table*/ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, __be32); + __type(value, __be64); + __uint(max_entries, 50); +} arp_table SEC(".maps"); + +/* Map to keep the exact match entries in the route table*/ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, __be32); + __type(value, struct direct_map); + __uint(max_entries, 50); +} exact_match SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); + __uint(max_entries, 100); +} tx_port SEC(".maps"); + +/* Function to set source and destination mac of the packet */ +static inline void set_src_dst_mac(void *data, void *src, void *dst) +{ + unsigned short *source = src; + unsigned short *dest = dst; + unsigned short *p = data; + + __builtin_memcpy(p, dest, 6); + __builtin_memcpy(p + 3, source, 6); +} + +/* Parse IPV4 packet to get SRC, DST IP and protocol */ +static inline int parse_ipv4(void *data, 
u64 nh_off, void *data_end, + __be32 *src, __be32 *dest) +{ + struct iphdr *iph = data + nh_off; + + if (iph + 1 > data_end) + return 0; + *src = iph->saddr; + *dest = iph->daddr; + return iph->protocol; +} + +SEC("xdp_router_ipv4") +int xdp_router_ipv4_prog(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + __be64 *dest_mac = NULL, *src_mac = NULL; + void *data = (void *)(long)ctx->data; + struct trie_value *prefix_value; + int rc = XDP_DROP, forward_to; + struct ethhdr *eth = data; + union key_4 key4; + long *value; + u16 h_proto; + u32 ipproto; + u64 nh_off; + + nh_off = sizeof(*eth); + if (data + nh_off > data_end) + return rc; + + h_proto = eth->h_proto; + + if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { + struct vlan_hdr *vhdr; + + vhdr = data + nh_off; + nh_off += sizeof(struct vlan_hdr); + if (data + nh_off > data_end) + return rc; + h_proto = vhdr->h_vlan_encapsulated_proto; + } + if (h_proto == htons(ETH_P_ARP)) { + return XDP_PASS; + } else if (h_proto == htons(ETH_P_IP)) { + struct direct_map *direct_entry; + __be32 src_ip = 0, dest_ip = 0; + + ipproto = parse_ipv4(data, nh_off, data_end, &src_ip, &dest_ip); + direct_entry = bpf_map_lookup_elem(&exact_match, &dest_ip); + /* Check for exact match, this would give a faster lookup*/ + if (direct_entry && direct_entry->mac && direct_entry->arp.mac) { + src_mac = &direct_entry->mac; + dest_mac = &direct_entry->arp.mac; + forward_to = direct_entry->ifindex; + } else { + /* Look up in the trie for lpm*/ + key4.b32[0] = 32; + key4.b8[4] = dest_ip & 0xff; + key4.b8[5] = (dest_ip >> 8) & 0xff; + key4.b8[6] = (dest_ip >> 16) & 0xff; + key4.b8[7] = (dest_ip >> 24) & 0xff; + prefix_value = bpf_map_lookup_elem(&lpm_map, &key4); + if (!prefix_value) + return XDP_DROP; + src_mac = &prefix_value->value; + if (!src_mac) + return XDP_DROP; + dest_mac = bpf_map_lookup_elem(&arp_table, &dest_ip); + if (!dest_mac) { + if (!prefix_value->gw) + return XDP_DROP; + dest_ip = 
prefix_value->gw; + dest_mac = bpf_map_lookup_elem(&arp_table, &dest_ip); + } + forward_to = prefix_value->ifindex; + } + } else { + ipproto = 0; + } + if (src_mac && dest_mac) { + set_src_dst_mac(data, src_mac, dest_mac); + value = bpf_map_lookup_elem(&rxcnt, &ipproto); + if (value) + *value += 1; + return bpf_redirect_map(&tx_port, forward_to, 0); + } + return rc; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c new file mode 100644 index 000000000..c2da1b51f --- /dev/null +++ b/samples/bpf/xdp_router_ipv4_user.c @@ -0,0 +1,741 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2017 Cavium, Inc. + */ +#include <linux/bpf.h> +#include <linux/netlink.h> +#include <linux/rtnetlink.h> +#include <assert.h> +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/socket.h> +#include <unistd.h> +#include <bpf/bpf.h> +#include <arpa/inet.h> +#include <fcntl.h> +#include <poll.h> +#include <net/if.h> +#include <netdb.h> +#include <sys/ioctl.h> +#include <sys/syscall.h> +#include "bpf_util.h" +#include <bpf/libbpf.h> +#include <sys/resource.h> +#include <libgen.h> + +int sock, sock_arp, flags = XDP_FLAGS_UPDATE_IF_NOEXIST; +static int total_ifindex; +static int *ifindex_list; +static __u32 *prog_id_list; +char buf[8192]; +static int lpm_map_fd; +static int rxcnt_map_fd; +static int arp_table_map_fd; +static int exact_match_map_fd; +static int tx_port_map_fd; + +static int get_route_table(int rtm_family); +static void int_exit(int sig) +{ + __u32 prog_id = 0; + int i = 0; + + for (i = 0; i < total_ifindex; i++) { + if (bpf_get_link_xdp_id(ifindex_list[i], &prog_id, flags)) { + printf("bpf_get_link_xdp_id on iface %d failed\n", + ifindex_list[i]); + exit(1); + } + if (prog_id_list[i] == prog_id) + bpf_set_link_xdp_fd(ifindex_list[i], -1, flags); + else if (!prog_id) + printf("couldn't find a prog id on iface %d\n", + 
ifindex_list[i]); + else + printf("program on iface %d changed, not removing\n", + ifindex_list[i]); + prog_id = 0; + } + exit(0); +} + +static void close_and_exit(int sig) +{ + close(sock); + close(sock_arp); + + int_exit(0); +} + +/* Get the mac address of the interface given interface name */ +static __be64 getmac(char *iface) +{ + struct ifreq ifr; + __be64 mac = 0; + int fd, i; + + fd = socket(AF_INET, SOCK_DGRAM, 0); + ifr.ifr_addr.sa_family = AF_INET; + strncpy(ifr.ifr_name, iface, IFNAMSIZ - 1); + if (ioctl(fd, SIOCGIFHWADDR, &ifr) < 0) { + printf("ioctl failed leaving....\n"); + return -1; + } + for (i = 0; i < 6 ; i++) + *((__u8 *)&mac + i) = (__u8)ifr.ifr_hwaddr.sa_data[i]; + close(fd); + return mac; +} + +static int recv_msg(struct sockaddr_nl sock_addr, int sock) +{ + struct nlmsghdr *nh; + int len, nll = 0; + char *buf_ptr; + + buf_ptr = buf; + while (1) { + len = recv(sock, buf_ptr, sizeof(buf) - nll, 0); + if (len < 0) + return len; + + nh = (struct nlmsghdr *)buf_ptr; + + if (nh->nlmsg_type == NLMSG_DONE) + break; + buf_ptr += len; + nll += len; + if ((sock_addr.nl_groups & RTMGRP_NEIGH) == RTMGRP_NEIGH) + break; + + if ((sock_addr.nl_groups & RTMGRP_IPV4_ROUTE) == RTMGRP_IPV4_ROUTE) + break; + } + return nll; +} + +/* Function to parse the route entry returned by netlink + * Updates the route entry related map entries + */ +static void read_route(struct nlmsghdr *nh, int nll) +{ + char dsts[24], gws[24], ifs[16], dsts_len[24], metrics[24]; + struct bpf_lpm_trie_key *prefix_key; + struct rtattr *rt_attr; + struct rtmsg *rt_msg; + int rtm_family; + int rtl; + int i; + struct route_table { + int dst_len, iface, metric; + char *iface_name; + __be32 dst, gw; + __be64 mac; + } route; + struct arp_table { + __be64 mac; + __be32 dst; + }; + + struct direct_map { + struct arp_table arp; + int ifindex; + __be64 mac; + } direct_entry; + + if (nh->nlmsg_type == RTM_DELROUTE) + printf("DELETING Route entry\n"); + else if (nh->nlmsg_type == RTM_GETROUTE) + 
printf("READING Route entry\n"); + else if (nh->nlmsg_type == RTM_NEWROUTE) + printf("NEW Route entry\n"); + else + printf("%d\n", nh->nlmsg_type); + + memset(&route, 0, sizeof(route)); + printf("Destination\t\tGateway\t\tGenmask\t\tMetric\t\tIface\n"); + for (; NLMSG_OK(nh, nll); nh = NLMSG_NEXT(nh, nll)) { + rt_msg = (struct rtmsg *)NLMSG_DATA(nh); + rtm_family = rt_msg->rtm_family; + if (rtm_family == AF_INET) + if (rt_msg->rtm_table != RT_TABLE_MAIN) + continue; + rt_attr = (struct rtattr *)RTM_RTA(rt_msg); + rtl = RTM_PAYLOAD(nh); + + for (; RTA_OK(rt_attr, rtl); rt_attr = RTA_NEXT(rt_attr, rtl)) { + switch (rt_attr->rta_type) { + case NDA_DST: + sprintf(dsts, "%u", + (*((__be32 *)RTA_DATA(rt_attr)))); + break; + case RTA_GATEWAY: + sprintf(gws, "%u", + *((__be32 *)RTA_DATA(rt_attr))); + break; + case RTA_OIF: + sprintf(ifs, "%u", + *((int *)RTA_DATA(rt_attr))); + break; + case RTA_METRICS: + sprintf(metrics, "%u", + *((int *)RTA_DATA(rt_attr))); + default: + break; + } + } + sprintf(dsts_len, "%d", rt_msg->rtm_dst_len); + route.dst = atoi(dsts); + route.dst_len = atoi(dsts_len); + route.gw = atoi(gws); + route.iface = atoi(ifs); + route.metric = atoi(metrics); + route.iface_name = alloca(sizeof(char *) * IFNAMSIZ); + route.iface_name = if_indextoname(route.iface, route.iface_name); + route.mac = getmac(route.iface_name); + if (route.mac == -1) + int_exit(0); + assert(bpf_map_update_elem(tx_port_map_fd, + &route.iface, &route.iface, 0) == 0); + if (rtm_family == AF_INET) { + struct trie_value { + __u8 prefix[4]; + __be64 value; + int ifindex; + int metric; + __be32 gw; + } *prefix_value; + + prefix_key = alloca(sizeof(*prefix_key) + 3); + prefix_value = alloca(sizeof(*prefix_value)); + + prefix_key->prefixlen = 32; + prefix_key->prefixlen = route.dst_len; + direct_entry.mac = route.mac & 0xffffffffffff; + direct_entry.ifindex = route.iface; + direct_entry.arp.mac = 0; + direct_entry.arp.dst = 0; + if (route.dst_len == 32) { + if (nh->nlmsg_type == 
RTM_DELROUTE) { + assert(bpf_map_delete_elem(exact_match_map_fd, + &route.dst) == 0); + } else { + if (bpf_map_lookup_elem(arp_table_map_fd, + &route.dst, + &direct_entry.arp.mac) == 0) + direct_entry.arp.dst = route.dst; + assert(bpf_map_update_elem(exact_match_map_fd, + &route.dst, + &direct_entry, 0) == 0); + } + } + for (i = 0; i < 4; i++) + prefix_key->data[i] = (route.dst >> i * 8) & 0xff; + + printf("%3d.%d.%d.%d\t\t%3x\t\t%d\t\t%d\t\t%s\n", + (int)prefix_key->data[0], + (int)prefix_key->data[1], + (int)prefix_key->data[2], + (int)prefix_key->data[3], + route.gw, route.dst_len, + route.metric, + route.iface_name); + if (bpf_map_lookup_elem(lpm_map_fd, prefix_key, + prefix_value) < 0) { + for (i = 0; i < 4; i++) + prefix_value->prefix[i] = prefix_key->data[i]; + prefix_value->value = route.mac & 0xffffffffffff; + prefix_value->ifindex = route.iface; + prefix_value->gw = route.gw; + prefix_value->metric = route.metric; + + assert(bpf_map_update_elem(lpm_map_fd, + prefix_key, + prefix_value, 0 + ) == 0); + } else { + if (nh->nlmsg_type == RTM_DELROUTE) { + printf("deleting entry\n"); + printf("prefix key=%d.%d.%d.%d/%d", + prefix_key->data[0], + prefix_key->data[1], + prefix_key->data[2], + prefix_key->data[3], + prefix_key->prefixlen); + assert(bpf_map_delete_elem(lpm_map_fd, + prefix_key + ) == 0); + /* Rereading the route table to check if + * there is an entry with the same + * prefix but a different metric as the + * deleted enty. 
+ */ + get_route_table(AF_INET); + } else if (prefix_key->data[0] == + prefix_value->prefix[0] && + prefix_key->data[1] == + prefix_value->prefix[1] && + prefix_key->data[2] == + prefix_value->prefix[2] && + prefix_key->data[3] == + prefix_value->prefix[3] && + route.metric >= prefix_value->metric) { + continue; + } else { + for (i = 0; i < 4; i++) + prefix_value->prefix[i] = + prefix_key->data[i]; + prefix_value->value = + route.mac & 0xffffffffffff; + prefix_value->ifindex = route.iface; + prefix_value->gw = route.gw; + prefix_value->metric = route.metric; + assert(bpf_map_update_elem(lpm_map_fd, + prefix_key, + prefix_value, + 0) == 0); + } + } + } + memset(&route, 0, sizeof(route)); + memset(dsts, 0, sizeof(dsts)); + memset(dsts_len, 0, sizeof(dsts_len)); + memset(gws, 0, sizeof(gws)); + memset(ifs, 0, sizeof(ifs)); + memset(&route, 0, sizeof(route)); + } +} + +/* Function to read the existing route table when the process is launched*/ +static int get_route_table(int rtm_family) +{ + struct sockaddr_nl sa; + struct nlmsghdr *nh; + int sock, seq = 0; + struct msghdr msg; + struct iovec iov; + int ret = 0; + int nll; + + struct { + struct nlmsghdr nl; + struct rtmsg rt; + char buf[8192]; + } req; + + sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sock < 0) { + printf("open netlink socket: %s\n", strerror(errno)); + return -1; + } + memset(&sa, 0, sizeof(sa)); + sa.nl_family = AF_NETLINK; + if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { + printf("bind to netlink: %s\n", strerror(errno)); + ret = -1; + goto cleanup; + } + memset(&req, 0, sizeof(req)); + req.nl.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)); + req.nl.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + req.nl.nlmsg_type = RTM_GETROUTE; + + req.rt.rtm_family = rtm_family; + req.rt.rtm_table = RT_TABLE_MAIN; + req.nl.nlmsg_pid = 0; + req.nl.nlmsg_seq = ++seq; + memset(&msg, 0, sizeof(msg)); + iov.iov_base = (void *)&req.nl; + iov.iov_len = req.nl.nlmsg_len; + msg.msg_iov = &iov; + 
msg.msg_iovlen = 1; + ret = sendmsg(sock, &msg, 0); + if (ret < 0) { + printf("send to netlink: %s\n", strerror(errno)); + ret = -1; + goto cleanup; + } + memset(buf, 0, sizeof(buf)); + nll = recv_msg(sa, sock); + if (nll < 0) { + printf("recv from netlink: %s\n", strerror(nll)); + ret = -1; + goto cleanup; + } + nh = (struct nlmsghdr *)buf; + read_route(nh, nll); +cleanup: + close(sock); + return ret; +} + +/* Function to parse the arp entry returned by netlink + * Updates the arp entry related map entries + */ +static void read_arp(struct nlmsghdr *nh, int nll) +{ + struct rtattr *rt_attr; + char dsts[24], mac[24]; + struct ndmsg *rt_msg; + int rtl, ndm_family; + + struct arp_table { + __be64 mac; + __be32 dst; + } arp_entry; + struct direct_map { + struct arp_table arp; + int ifindex; + __be64 mac; + } direct_entry; + + if (nh->nlmsg_type == RTM_GETNEIGH) + printf("READING arp entry\n"); + printf("Address\tHwAddress\n"); + for (; NLMSG_OK(nh, nll); nh = NLMSG_NEXT(nh, nll)) { + rt_msg = (struct ndmsg *)NLMSG_DATA(nh); + rt_attr = (struct rtattr *)RTM_RTA(rt_msg); + ndm_family = rt_msg->ndm_family; + rtl = RTM_PAYLOAD(nh); + for (; RTA_OK(rt_attr, rtl); rt_attr = RTA_NEXT(rt_attr, rtl)) { + switch (rt_attr->rta_type) { + case NDA_DST: + sprintf(dsts, "%u", + *((__be32 *)RTA_DATA(rt_attr))); + break; + case NDA_LLADDR: + sprintf(mac, "%lld", + *((__be64 *)RTA_DATA(rt_attr))); + break; + default: + break; + } + } + arp_entry.dst = atoi(dsts); + arp_entry.mac = atol(mac); + printf("%x\t\t%llx\n", arp_entry.dst, arp_entry.mac); + if (ndm_family == AF_INET) { + if (bpf_map_lookup_elem(exact_match_map_fd, + &arp_entry.dst, + &direct_entry) == 0) { + if (nh->nlmsg_type == RTM_DELNEIGH) { + direct_entry.arp.dst = 0; + direct_entry.arp.mac = 0; + } else if (nh->nlmsg_type == RTM_NEWNEIGH) { + direct_entry.arp.dst = arp_entry.dst; + direct_entry.arp.mac = arp_entry.mac; + } + assert(bpf_map_update_elem(exact_match_map_fd, + &arp_entry.dst, + &direct_entry, 0 + ) == 0); + 
memset(&direct_entry, 0, sizeof(direct_entry)); + } + if (nh->nlmsg_type == RTM_DELNEIGH) { + assert(bpf_map_delete_elem(arp_table_map_fd, + &arp_entry.dst) == 0); + } else if (nh->nlmsg_type == RTM_NEWNEIGH) { + assert(bpf_map_update_elem(arp_table_map_fd, + &arp_entry.dst, + &arp_entry.mac, 0 + ) == 0); + } + } + memset(&arp_entry, 0, sizeof(arp_entry)); + memset(dsts, 0, sizeof(dsts)); + } +} + +/* Function to read the existing arp table when the process is launched*/ +static int get_arp_table(int rtm_family) +{ + struct sockaddr_nl sa; + struct nlmsghdr *nh; + int sock, seq = 0; + struct msghdr msg; + struct iovec iov; + int ret = 0; + int nll; + struct { + struct nlmsghdr nl; + struct ndmsg rt; + char buf[8192]; + } req; + + sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sock < 0) { + printf("open netlink socket: %s\n", strerror(errno)); + return -1; + } + memset(&sa, 0, sizeof(sa)); + sa.nl_family = AF_NETLINK; + if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { + printf("bind to netlink: %s\n", strerror(errno)); + ret = -1; + goto cleanup; + } + memset(&req, 0, sizeof(req)); + req.nl.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)); + req.nl.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + req.nl.nlmsg_type = RTM_GETNEIGH; + req.rt.ndm_state = NUD_REACHABLE; + req.rt.ndm_family = rtm_family; + req.nl.nlmsg_pid = 0; + req.nl.nlmsg_seq = ++seq; + memset(&msg, 0, sizeof(msg)); + iov.iov_base = (void *)&req.nl; + iov.iov_len = req.nl.nlmsg_len; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + ret = sendmsg(sock, &msg, 0); + if (ret < 0) { + printf("send to netlink: %s\n", strerror(errno)); + ret = -1; + goto cleanup; + } + memset(buf, 0, sizeof(buf)); + nll = recv_msg(sa, sock); + if (nll < 0) { + printf("recv from netlink: %s\n", strerror(nll)); + ret = -1; + goto cleanup; + } + nh = (struct nlmsghdr *)buf; + read_arp(nh, nll); +cleanup: + close(sock); + return ret; +} + +/* Function to keep track and update changes in route and arp table + * Give 
regular statistics of packets forwarded + */ +static int monitor_route(void) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + const unsigned int nr_keys = 256; + struct pollfd fds_route, fds_arp; + __u64 prev[nr_keys][nr_cpus]; + struct sockaddr_nl la, lr; + __u64 values[nr_cpus]; + struct nlmsghdr *nh; + int nll, ret = 0; + int interval = 5; + __u32 key; + int i; + + sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sock < 0) { + printf("open netlink socket: %s\n", strerror(errno)); + return -1; + } + + fcntl(sock, F_SETFL, O_NONBLOCK); + memset(&lr, 0, sizeof(lr)); + lr.nl_family = AF_NETLINK; + lr.nl_groups = RTMGRP_IPV6_ROUTE | RTMGRP_IPV4_ROUTE | RTMGRP_NOTIFY; + if (bind(sock, (struct sockaddr *)&lr, sizeof(lr)) < 0) { + printf("bind to netlink: %s\n", strerror(errno)); + ret = -1; + goto cleanup; + } + fds_route.fd = sock; + fds_route.events = POLL_IN; + + sock_arp = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sock_arp < 0) { + printf("open netlink socket: %s\n", strerror(errno)); + return -1; + } + + fcntl(sock_arp, F_SETFL, O_NONBLOCK); + memset(&la, 0, sizeof(la)); + la.nl_family = AF_NETLINK; + la.nl_groups = RTMGRP_NEIGH | RTMGRP_NOTIFY; + if (bind(sock_arp, (struct sockaddr *)&la, sizeof(la)) < 0) { + printf("bind to netlink: %s\n", strerror(errno)); + ret = -1; + goto cleanup; + } + fds_arp.fd = sock_arp; + fds_arp.events = POLL_IN; + + memset(prev, 0, sizeof(prev)); + do { + signal(SIGINT, close_and_exit); + signal(SIGTERM, close_and_exit); + + sleep(interval); + for (key = 0; key < nr_keys; key++) { + __u64 sum = 0; + + assert(bpf_map_lookup_elem(rxcnt_map_fd, + &key, values) == 0); + for (i = 0; i < nr_cpus; i++) + sum += (values[i] - prev[key][i]); + if (sum) + printf("proto %u: %10llu pkt/s\n", + key, sum / interval); + memcpy(prev[key], values, sizeof(values)); + } + + memset(buf, 0, sizeof(buf)); + if (poll(&fds_route, 1, 3) == POLL_IN) { + nll = recv_msg(lr, sock); + if (nll < 0) { + printf("recv from netlink: %s\n", 
strerror(nll)); + ret = -1; + goto cleanup; + } + + nh = (struct nlmsghdr *)buf; + printf("Routing table updated.\n"); + read_route(nh, nll); + } + memset(buf, 0, sizeof(buf)); + if (poll(&fds_arp, 1, 3) == POLL_IN) { + nll = recv_msg(la, sock_arp); + if (nll < 0) { + printf("recv from netlink: %s\n", strerror(nll)); + ret = -1; + goto cleanup; + } + + nh = (struct nlmsghdr *)buf; + read_arp(nh, nll); + } + + } while (1); +cleanup: + close(sock); + return ret; +} + +static void usage(const char *prog) +{ + fprintf(stderr, + "%s: %s [OPTS] interface name list\n\n" + "OPTS:\n" + " -S use skb-mode\n" + " -F force loading prog\n", + __func__, prog); +} + +int main(int ac, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_prog_load_attr prog_load_attr = { + .prog_type = BPF_PROG_TYPE_XDP, + }; + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + const char *optstr = "SF"; + struct bpf_object *obj; + char filename[256]; + char **ifname_list; + int prog_fd, opt; + int err, i = 1; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + prog_load_attr.file = filename; + + total_ifindex = ac - 1; + ifname_list = (argv + 1); + + while ((opt = getopt(ac, argv, optstr)) != -1) { + switch (opt) { + case 'S': + flags |= XDP_FLAGS_SKB_MODE; + total_ifindex--; + ifname_list++; + break; + case 'F': + flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; + total_ifindex--; + ifname_list++; + break; + default: + usage(basename(argv[0])); + return 1; + } + } + + if (!(flags & XDP_FLAGS_SKB_MODE)) + flags |= XDP_FLAGS_DRV_MODE; + + if (optind == ac) { + usage(basename(argv[0])); + return 1; + } + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + + if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) + return 1; + + printf("\n**************loading bpf file*********************\n\n\n"); + if (!prog_fd) { + printf("bpf_prog_load_xattr: %s\n", strerror(errno)); + return 1; + } + + lpm_map_fd = 
bpf_object__find_map_fd_by_name(obj, "lpm_map"); + rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt"); + arp_table_map_fd = bpf_object__find_map_fd_by_name(obj, "arp_table"); + exact_match_map_fd = bpf_object__find_map_fd_by_name(obj, + "exact_match"); + tx_port_map_fd = bpf_object__find_map_fd_by_name(obj, "tx_port"); + if (lpm_map_fd < 0 || rxcnt_map_fd < 0 || arp_table_map_fd < 0 || + exact_match_map_fd < 0 || tx_port_map_fd < 0) { + printf("bpf_object__find_map_fd_by_name failed\n"); + return 1; + } + + ifindex_list = (int *)calloc(total_ifindex, sizeof(int *)); + for (i = 0; i < total_ifindex; i++) { + ifindex_list[i] = if_nametoindex(ifname_list[i]); + if (!ifindex_list[i]) { + printf("Couldn't translate interface name: %s", + strerror(errno)); + return 1; + } + } + prog_id_list = (__u32 *)calloc(total_ifindex, sizeof(__u32 *)); + for (i = 0; i < total_ifindex; i++) { + if (bpf_set_link_xdp_fd(ifindex_list[i], prog_fd, flags) < 0) { + printf("link set xdp fd failed\n"); + int recovery_index = i; + + for (i = 0; i < recovery_index; i++) + bpf_set_link_xdp_fd(ifindex_list[i], -1, flags); + + return 1; + } + err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); + if (err) { + printf("can't get prog info - %s\n", strerror(errno)); + return err; + } + prog_id_list[i] = info.id; + memset(&info, 0, sizeof(info)); + printf("Attached to %d\n", ifindex_list[i]); + } + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + printf("*******************ROUTE TABLE*************************\n\n\n"); + get_route_table(AF_INET); + printf("*******************ARP TABLE***************************\n\n\n"); + get_arp_table(AF_INET); + if (monitor_route() < 0) { + printf("Error in receiving route update"); + return 1; + } + + return 0; +} diff --git a/samples/bpf/xdp_rxq_info_kern.c b/samples/bpf/xdp_rxq_info_kern.c new file mode 100644 index 000000000..5e7459f9b --- /dev/null +++ b/samples/bpf/xdp_rxq_info_kern.c @@ -0,0 +1,140 @@ +/* SPDX-License-Identifier: 
GPL-2.0 + * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. + * + * Example howto extract XDP RX-queue info + */ +#include <uapi/linux/bpf.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/in.h> +#include <bpf/bpf_helpers.h> + +/* Config setup from with userspace + * + * User-side setup ifindex in config_map, to verify that + * ctx->ingress_ifindex is correct (against configured ifindex) + */ +struct config { + __u32 action; + int ifindex; + __u32 options; +}; +enum cfg_options_flags { + NO_TOUCH = 0x0U, + READ_MEM = 0x1U, + SWAP_MAC = 0x2U, +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, struct config); + __uint(max_entries, 1); +} config_map SEC(".maps"); + +/* Common stats data record (shared with userspace) */ +struct datarec { + __u64 processed; + __u64 issue; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct datarec); + __uint(max_entries, 1); +} stats_global_map SEC(".maps"); + +#define MAX_RXQs 64 + +/* Stats per rx_queue_index (per CPU) */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct datarec); + __uint(max_entries, MAX_RXQs + 1); +} rx_queue_index_map SEC(".maps"); + +static __always_inline +void swap_src_dst_mac(void *data) +{ + unsigned short *p = data; + unsigned short dst[3]; + + dst[0] = p[0]; + dst[1] = p[1]; + dst[2] = p[2]; + p[0] = p[3]; + p[1] = p[4]; + p[2] = p[5]; + p[3] = dst[0]; + p[4] = dst[1]; + p[5] = dst[2]; +} + +SEC("xdp_prog0") +int xdp_prognum0(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct datarec *rec, *rxq_rec; + int ingress_ifindex; + struct config *config; + u32 key = 0; + + /* Global stats record */ + rec = bpf_map_lookup_elem(&stats_global_map, &key); + if (!rec) + return XDP_ABORTED; + rec->processed++; + + /* Accessing ctx->ingress_ifindex, cause BPF to rewrite BPF + * instructions inside kernel 
to access xdp_rxq->dev->ifindex + */ + ingress_ifindex = ctx->ingress_ifindex; + + config = bpf_map_lookup_elem(&config_map, &key); + if (!config) + return XDP_ABORTED; + + /* Simple test: check ctx provided ifindex is as expected */ + if (ingress_ifindex != config->ifindex) { + /* count this error case */ + rec->issue++; + return XDP_ABORTED; + } + + /* Update stats per rx_queue_index. Handle if rx_queue_index + * is larger than stats map can contain info for. + */ + key = ctx->rx_queue_index; + if (key >= MAX_RXQs) + key = MAX_RXQs; + rxq_rec = bpf_map_lookup_elem(&rx_queue_index_map, &key); + if (!rxq_rec) + return XDP_ABORTED; + rxq_rec->processed++; + if (key == MAX_RXQs) + rxq_rec->issue++; + + /* Default: Don't touch packet data, only count packets */ + if (unlikely(config->options & (READ_MEM|SWAP_MAC))) { + struct ethhdr *eth = data; + + if (eth + 1 > data_end) + return XDP_ABORTED; + + /* Avoid compiler removing this: Drop non 802.3 Ethertypes */ + if (ntohs(eth->h_proto) < ETH_P_802_3_MIN) + return XDP_ABORTED; + + /* XDP_TX requires changing MAC-addrs, else HW may drop. + * Can also be enabled with --swapmac (for test purposes) + */ + if (unlikely(config->options & SWAP_MAC)) + swap_src_dst_mac(data); + } + + return config->action; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_rxq_info_user.c b/samples/bpf/xdp_rxq_info_user.c new file mode 100644 index 000000000..93fa1bc54 --- /dev/null +++ b/samples/bpf/xdp_rxq_info_user.c @@ -0,0 +1,605 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. 
+ */ +static const char *__doc__ = " XDP RX-queue info extract example\n\n" + "Monitor how many packets per sec (pps) are received\n" + "per NIC RX queue index and which CPU processed the packet\n" + ; + +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <unistd.h> +#include <locale.h> +#include <sys/resource.h> +#include <getopt.h> +#include <net/if.h> +#include <time.h> + +#include <arpa/inet.h> +#include <linux/if_link.h> + +#include <bpf/bpf.h> +#include <bpf/libbpf.h> +#include "bpf_util.h" + +static int ifindex = -1; +static char ifname_buf[IF_NAMESIZE]; +static char *ifname; +static __u32 prog_id; + +static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; + +static struct bpf_map *stats_global_map; +static struct bpf_map *rx_queue_index_map; + +/* Exit return codes */ +#define EXIT_OK 0 +#define EXIT_FAIL 1 +#define EXIT_FAIL_OPTION 2 +#define EXIT_FAIL_XDP 3 +#define EXIT_FAIL_BPF 4 +#define EXIT_FAIL_MEM 5 + +static const struct option long_options[] = { + {"help", no_argument, NULL, 'h' }, + {"dev", required_argument, NULL, 'd' }, + {"skb-mode", no_argument, NULL, 'S' }, + {"sec", required_argument, NULL, 's' }, + {"no-separators", no_argument, NULL, 'z' }, + {"action", required_argument, NULL, 'a' }, + {"readmem", no_argument, NULL, 'r' }, + {"swapmac", no_argument, NULL, 'm' }, + {"force", no_argument, NULL, 'F' }, + {0, 0, NULL, 0 } +}; + +static void int_exit(int sig) +{ + __u32 curr_prog_id = 0; + + if (ifindex > -1) { + if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, xdp_flags)) { + printf("bpf_get_link_xdp_id failed\n"); + exit(EXIT_FAIL); + } + if (prog_id == curr_prog_id) { + fprintf(stderr, + "Interrupted: Removing XDP program on ifindex:%d device:%s\n", + ifindex, ifname); + bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); + } else if (!curr_prog_id) { + printf("couldn't find a prog id on a given iface\n"); + } else { + printf("program on interface changed, not 
removing\n"); + } + } + exit(EXIT_OK); +} + +struct config { + __u32 action; + int ifindex; + __u32 options; +}; +enum cfg_options_flags { + NO_TOUCH = 0x0U, + READ_MEM = 0x1U, + SWAP_MAC = 0x2U, +}; +#define XDP_ACTION_MAX (XDP_TX + 1) +#define XDP_ACTION_MAX_STRLEN 11 +static const char *xdp_action_names[XDP_ACTION_MAX] = { + [XDP_ABORTED] = "XDP_ABORTED", + [XDP_DROP] = "XDP_DROP", + [XDP_PASS] = "XDP_PASS", + [XDP_TX] = "XDP_TX", +}; + +static const char *action2str(int action) +{ + if (action < XDP_ACTION_MAX) + return xdp_action_names[action]; + return NULL; +} + +static int parse_xdp_action(char *action_str) +{ + size_t maxlen; + __u64 action = -1; + int i; + + for (i = 0; i < XDP_ACTION_MAX; i++) { + maxlen = XDP_ACTION_MAX_STRLEN; + if (strncmp(xdp_action_names[i], action_str, maxlen) == 0) { + action = i; + break; + } + } + return action; +} + +static void list_xdp_actions(void) +{ + int i; + + printf("Available XDP --action <options>\n"); + for (i = 0; i < XDP_ACTION_MAX; i++) + printf("\t%s\n", xdp_action_names[i]); + printf("\n"); +} + +static char* options2str(enum cfg_options_flags flag) +{ + if (flag == NO_TOUCH) + return "no_touch"; + if (flag & SWAP_MAC) + return "swapmac"; + if (flag & READ_MEM) + return "read"; + fprintf(stderr, "ERR: Unknown config option flags"); + exit(EXIT_FAIL); +} + +static void usage(char *argv[]) +{ + int i; + + printf("\nDOCUMENTATION:\n%s\n", __doc__); + printf(" Usage: %s (options-see-below)\n", argv[0]); + printf(" Listing options:\n"); + for (i = 0; long_options[i].name != 0; i++) { + printf(" --%-12s", long_options[i].name); + if (long_options[i].flag != NULL) + printf(" flag (internal value:%d)", + *long_options[i].flag); + else + printf(" short-option: -%c", + long_options[i].val); + printf("\n"); + } + printf("\n"); + list_xdp_actions(); +} + +#define NANOSEC_PER_SEC 1000000000 /* 10^9 */ +static __u64 gettime(void) +{ + struct timespec t; + int res; + + res = clock_gettime(CLOCK_MONOTONIC, &t); + if (res < 0) { 
+ fprintf(stderr, "Error with gettimeofday! (%i)\n", res); + exit(EXIT_FAIL); + } + return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec; +} + +/* Common stats data record shared with _kern.c */ +struct datarec { + __u64 processed; + __u64 issue; +}; +struct record { + __u64 timestamp; + struct datarec total; + struct datarec *cpu; +}; +struct stats_record { + struct record stats; + struct record *rxq; +}; + +static struct datarec *alloc_record_per_cpu(void) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + struct datarec *array; + + array = calloc(nr_cpus, sizeof(struct datarec)); + if (!array) { + fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus); + exit(EXIT_FAIL_MEM); + } + return array; +} + +static struct record *alloc_record_per_rxq(void) +{ + unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries; + struct record *array; + + array = calloc(nr_rxqs, sizeof(struct record)); + if (!array) { + fprintf(stderr, "Mem alloc error (nr_rxqs:%u)\n", nr_rxqs); + exit(EXIT_FAIL_MEM); + } + return array; +} + +static struct stats_record *alloc_stats_record(void) +{ + unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries; + struct stats_record *rec; + int i; + + rec = calloc(1, sizeof(struct stats_record)); + if (!rec) { + fprintf(stderr, "Mem alloc error\n"); + exit(EXIT_FAIL_MEM); + } + rec->rxq = alloc_record_per_rxq(); + for (i = 0; i < nr_rxqs; i++) + rec->rxq[i].cpu = alloc_record_per_cpu(); + + rec->stats.cpu = alloc_record_per_cpu(); + return rec; +} + +static void free_stats_record(struct stats_record *r) +{ + unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries; + int i; + + for (i = 0; i < nr_rxqs; i++) + free(r->rxq[i].cpu); + + free(r->rxq); + free(r->stats.cpu); + free(r); +} + +static bool map_collect_percpu(int fd, __u32 key, struct record *rec) +{ + /* For percpu maps, userspace gets a value per possible CPU */ + unsigned int nr_cpus = bpf_num_possible_cpus(); + struct datarec values[nr_cpus]; + 
__u64 sum_processed = 0; + __u64 sum_issue = 0; + int i; + + if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { + fprintf(stderr, + "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); + return false; + } + /* Get time as close as possible to reading map contents */ + rec->timestamp = gettime(); + + /* Record and sum values from each CPU */ + for (i = 0; i < nr_cpus; i++) { + rec->cpu[i].processed = values[i].processed; + sum_processed += values[i].processed; + rec->cpu[i].issue = values[i].issue; + sum_issue += values[i].issue; + } + rec->total.processed = sum_processed; + rec->total.issue = sum_issue; + return true; +} + +static void stats_collect(struct stats_record *rec) +{ + int fd, i, max_rxqs; + + fd = bpf_map__fd(stats_global_map); + map_collect_percpu(fd, 0, &rec->stats); + + fd = bpf_map__fd(rx_queue_index_map); + max_rxqs = bpf_map__def(rx_queue_index_map)->max_entries; + for (i = 0; i < max_rxqs; i++) + map_collect_percpu(fd, i, &rec->rxq[i]); +} + +static double calc_period(struct record *r, struct record *p) +{ + double period_ = 0; + __u64 period = 0; + + period = r->timestamp - p->timestamp; + if (period > 0) + period_ = ((double) period / NANOSEC_PER_SEC); + + return period_; +} + +static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if (period_ > 0) { + packets = r->processed - p->processed; + pps = packets / period_; + } + return pps; +} + +static __u64 calc_errs_pps(struct datarec *r, + struct datarec *p, double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if (period_ > 0) { + packets = r->issue - p->issue; + pps = packets / period_; + } + return pps; +} + +static void stats_print(struct stats_record *stats_rec, + struct stats_record *stats_prev, + int action, __u32 cfg_opt) +{ + unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries; + unsigned int nr_cpus = bpf_num_possible_cpus(); + double pps = 0, err = 0; + struct record *rec, *prev; + double t; + int 
rxq; + int i; + + /* Header */ + printf("\nRunning XDP on dev:%s (ifindex:%d) action:%s options:%s\n", + ifname, ifindex, action2str(action), options2str(cfg_opt)); + + /* stats_global_map */ + { + char *fmt_rx = "%-15s %-7d %'-11.0f %'-10.0f %s\n"; + char *fm2_rx = "%-15s %-7s %'-11.0f\n"; + char *errstr = ""; + + printf("%-15s %-7s %-11s %-11s\n", + "XDP stats", "CPU", "pps", "issue-pps"); + + rec = &stats_rec->stats; + prev = &stats_prev->stats; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps (r, p, t); + err = calc_errs_pps(r, p, t); + if (err > 0) + errstr = "invalid-ifindex"; + if (pps > 0) + printf(fmt_rx, "XDP-RX CPU", + i, pps, err, errstr); + } + pps = calc_pps (&rec->total, &prev->total, t); + err = calc_errs_pps(&rec->total, &prev->total, t); + printf(fm2_rx, "XDP-RX CPU", "total", pps, err); + } + + /* rx_queue_index_map */ + printf("\n%-15s %-7s %-11s %-11s\n", + "RXQ stats", "RXQ:CPU", "pps", "issue-pps"); + + for (rxq = 0; rxq < nr_rxqs; rxq++) { + char *fmt_rx = "%-15s %3d:%-3d %'-11.0f %'-10.0f %s\n"; + char *fm2_rx = "%-15s %3d:%-3s %'-11.0f\n"; + char *errstr = ""; + int rxq_ = rxq; + + /* Last RXQ in map catch overflows */ + if (rxq_ == nr_rxqs - 1) + rxq_ = -1; + + rec = &stats_rec->rxq[rxq]; + prev = &stats_prev->rxq[rxq]; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps (r, p, t); + err = calc_errs_pps(r, p, t); + if (err > 0) { + if (rxq_ == -1) + errstr = "map-overflow-RXQ"; + else + errstr = "err"; + } + if (pps > 0) + printf(fmt_rx, "rx_queue_index", + rxq_, i, pps, err, errstr); + } + pps = calc_pps (&rec->total, &prev->total, t); + err = calc_errs_pps(&rec->total, &prev->total, t); + if (pps || err) + printf(fm2_rx, "rx_queue_index", rxq_, "sum", pps, err); + } +} + + +/* Pointer swap trick */ +static inline void swap(struct 
stats_record **a, struct stats_record **b) +{ + struct stats_record *tmp; + + tmp = *a; + *a = *b; + *b = tmp; +} + +static void stats_poll(int interval, int action, __u32 cfg_opt) +{ + struct stats_record *record, *prev; + + record = alloc_stats_record(); + prev = alloc_stats_record(); + stats_collect(record); + + while (1) { + swap(&prev, &record); + stats_collect(record); + stats_print(record, prev, action, cfg_opt); + sleep(interval); + } + + free_stats_record(record); + free_stats_record(prev); +} + + +int main(int argc, char **argv) +{ + __u32 cfg_options= NO_TOUCH ; /* Default: Don't touch packet memory */ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_prog_load_attr prog_load_attr = { + .prog_type = BPF_PROG_TYPE_XDP, + }; + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + int prog_fd, map_fd, opt, err; + bool use_separators = true; + struct config cfg = { 0 }; + struct bpf_object *obj; + struct bpf_map *map; + char filename[256]; + int longindex = 0; + int interval = 2; + __u32 key = 0; + + + char action_str_buf[XDP_ACTION_MAX_STRLEN + 1 /* for \0 */] = { 0 }; + int action = XDP_PASS; /* Default action */ + char *action_str = NULL; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + prog_load_attr.file = filename; + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + + if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) + return EXIT_FAIL; + + map = bpf_object__find_map_by_name(obj, "config_map"); + stats_global_map = bpf_object__find_map_by_name(obj, "stats_global_map"); + rx_queue_index_map = bpf_object__find_map_by_name(obj, "rx_queue_index_map"); + if (!map || !stats_global_map || !rx_queue_index_map) { + printf("finding a map in obj file failed\n"); + return EXIT_FAIL; + } + map_fd = bpf_map__fd(map); + + if (!prog_fd) { + fprintf(stderr, "ERR: bpf_prog_load_xattr: %s\n", strerror(errno)); + return EXIT_FAIL; + } + + /* Parse commands line args */ + 
while ((opt = getopt_long(argc, argv, "FhSrmzd:s:a:", + long_options, &longindex)) != -1) { + switch (opt) { + case 'd': + if (strlen(optarg) >= IF_NAMESIZE) { + fprintf(stderr, "ERR: --dev name too long\n"); + goto error; + } + ifname = (char *)&ifname_buf; + strncpy(ifname, optarg, IF_NAMESIZE); + ifindex = if_nametoindex(ifname); + if (ifindex == 0) { + fprintf(stderr, + "ERR: --dev name unknown err(%d):%s\n", + errno, strerror(errno)); + goto error; + } + break; + case 's': + interval = atoi(optarg); + break; + case 'S': + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; + case 'z': + use_separators = false; + break; + case 'a': + action_str = (char *)&action_str_buf; + strncpy(action_str, optarg, XDP_ACTION_MAX_STRLEN); + break; + case 'r': + cfg_options |= READ_MEM; + break; + case 'm': + cfg_options |= SWAP_MAC; + break; + case 'F': + xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; + break; + case 'h': + error: + default: + usage(argv); + return EXIT_FAIL_OPTION; + } + } + + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + + /* Required option */ + if (ifindex == -1) { + fprintf(stderr, "ERR: required option --dev missing\n"); + usage(argv); + return EXIT_FAIL_OPTION; + } + cfg.ifindex = ifindex; + + /* Parse action string */ + if (action_str) { + action = parse_xdp_action(action_str); + if (action < 0) { + fprintf(stderr, "ERR: Invalid XDP --action: %s\n", + action_str); + list_xdp_actions(); + return EXIT_FAIL_OPTION; + } + } + cfg.action = action; + + /* XDP_TX requires changing MAC-addrs, else HW may drop */ + if (action == XDP_TX) + cfg_options |= SWAP_MAC; + cfg.options = cfg_options; + + /* Trick to pretty printf with thousands separators use %' */ + if (use_separators) + setlocale(LC_NUMERIC, "en_US"); + + /* User-side setup ifindex in config_map */ + err = bpf_map_update_elem(map_fd, &key, &cfg, 0); + if (err) { + fprintf(stderr, "Store config failed (err:%d)\n", err); + exit(EXIT_FAIL_BPF); + } + + /* Remove XDP program when program 
is interrupted or killed */ + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) { + fprintf(stderr, "link set xdp fd failed\n"); + return EXIT_FAIL_XDP; + } + + err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); + if (err) { + printf("can't get prog info - %s\n", strerror(errno)); + return err; + } + prog_id = info.id; + + stats_poll(interval, action, cfg_options); + return EXIT_OK; +} diff --git a/samples/bpf/xdp_sample_pkts_kern.c b/samples/bpf/xdp_sample_pkts_kern.c new file mode 100644 index 000000000..9cf76b340 --- /dev/null +++ b/samples/bpf/xdp_sample_pkts_kern.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ptrace.h> +#include <linux/version.h> +#include <uapi/linux/bpf.h> +#include <bpf/bpf_helpers.h> + +#define SAMPLE_SIZE 64ul + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(u32)); +} my_map SEC(".maps"); + +SEC("xdp_sample") +int xdp_sample_prog(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + + /* Metadata will be in the perf event before the packet data. */ + struct S { + u16 cookie; + u16 pkt_len; + } __packed metadata; + + if (data < data_end) { + /* The XDP perf_event_output handler will use the upper 32 bits + * of the flags argument as a number of bytes to include of the + * packet payload in the event data. If the size is too big, the + * call to bpf_perf_event_output will fail and return -EFAULT. + * + * See bpf_xdp_event_output in net/core/filter.c. + * + * The BPF_F_CURRENT_CPU flag means that the event output fd + * will be indexed by the CPU number in the event map. 
+ */ + u64 flags = BPF_F_CURRENT_CPU; + u16 sample_size; + int ret; + + metadata.cookie = 0xdead; + metadata.pkt_len = (u16)(data_end - data); + sample_size = min(metadata.pkt_len, SAMPLE_SIZE); + flags |= (u64)sample_size << 32; + + ret = bpf_perf_event_output(ctx, &my_map, flags, + &metadata, sizeof(metadata)); + if (ret) + bpf_printk("perf_event_output failed: %d\n", ret); + } + + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/xdp_sample_pkts_user.c b/samples/bpf/xdp_sample_pkts_user.c new file mode 100644 index 000000000..4b2a300c7 --- /dev/null +++ b/samples/bpf/xdp_sample_pkts_user.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <linux/perf_event.h> +#include <linux/bpf.h> +#include <net/if.h> +#include <errno.h> +#include <assert.h> +#include <sys/sysinfo.h> +#include <sys/ioctl.h> +#include <signal.h> +#include <bpf/libbpf.h> +#include <bpf/bpf.h> +#include <sys/resource.h> +#include <libgen.h> +#include <linux/if_link.h> + +#include "perf-sys.h" + +static int if_idx; +static char *if_name; +static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; +static __u32 prog_id; +static struct perf_buffer *pb = NULL; + +static int do_attach(int idx, int fd, const char *name) +{ + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + int err; + + err = bpf_set_link_xdp_fd(idx, fd, xdp_flags); + if (err < 0) { + printf("ERROR: failed to attach program to %s\n", name); + return err; + } + + err = bpf_obj_get_info_by_fd(fd, &info, &info_len); + if (err) { + printf("can't get prog info - %s\n", strerror(errno)); + return err; + } + prog_id = info.id; + + return err; +} + +static int do_detach(int idx, const char *name) +{ + __u32 curr_prog_id = 0; + int err = 0; + + err = bpf_get_link_xdp_id(idx, &curr_prog_id, xdp_flags); + if (err) { + printf("bpf_get_link_xdp_id failed\n"); + return err; 
+ } + if (prog_id == curr_prog_id) { + err = bpf_set_link_xdp_fd(idx, -1, xdp_flags); + if (err < 0) + printf("ERROR: failed to detach prog from %s\n", name); + } else if (!curr_prog_id) { + printf("couldn't find a prog id on a %s\n", name); + } else { + printf("program on interface changed, not removing\n"); + } + + return err; +} + +#define SAMPLE_SIZE 64 + +static void print_bpf_output(void *ctx, int cpu, void *data, __u32 size) +{ + struct { + __u16 cookie; + __u16 pkt_len; + __u8 pkt_data[SAMPLE_SIZE]; + } __packed *e = data; + int i; + + if (e->cookie != 0xdead) { + printf("BUG cookie %x sized %d\n", e->cookie, size); + return; + } + + printf("Pkt len: %-5d bytes. Ethernet hdr: ", e->pkt_len); + for (i = 0; i < 14 && i < e->pkt_len; i++) + printf("%02x ", e->pkt_data[i]); + printf("\n"); +} + +static void sig_handler(int signo) +{ + do_detach(if_idx, if_name); + perf_buffer__free(pb); + exit(0); +} + +static void usage(const char *prog) +{ + fprintf(stderr, + "%s: %s [OPTS] <ifname|ifindex>\n\n" + "OPTS:\n" + " -F force loading prog\n", + __func__, prog); +} + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_prog_load_attr prog_load_attr = { + .prog_type = BPF_PROG_TYPE_XDP, + }; + struct perf_buffer_opts pb_opts = {}; + const char *optstr = "FS"; + int prog_fd, map_fd, opt; + struct bpf_object *obj; + struct bpf_map *map; + char filename[256]; + int ret, err; + + while ((opt = getopt(argc, argv, optstr)) != -1) { + switch (opt) { + case 'F': + xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; + break; + case 'S': + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; + default: + usage(basename(argv[0])); + return 1; + } + } + + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + + if (optind == argc) { + usage(basename(argv[0])); + return 1; + } + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + + snprintf(filename, sizeof(filename), "%s_kern.o", 
argv[0]); + prog_load_attr.file = filename; + + if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) + return 1; + + if (!prog_fd) { + printf("bpf_prog_load_xattr: %s\n", strerror(errno)); + return 1; + } + + map = bpf_map__next(NULL, obj); + if (!map) { + printf("finding a map in obj file failed\n"); + return 1; + } + map_fd = bpf_map__fd(map); + + if_idx = if_nametoindex(argv[optind]); + if (!if_idx) + if_idx = strtoul(argv[optind], NULL, 0); + + if (!if_idx) { + fprintf(stderr, "Invalid ifname\n"); + return 1; + } + if_name = argv[optind]; + err = do_attach(if_idx, prog_fd, if_name); + if (err) + return err; + + if (signal(SIGINT, sig_handler) || + signal(SIGHUP, sig_handler) || + signal(SIGTERM, sig_handler)) { + perror("signal"); + return 1; + } + + pb_opts.sample_cb = print_bpf_output; + pb = perf_buffer__new(map_fd, 8, &pb_opts); + err = libbpf_get_error(pb); + if (err) { + perror("perf_buffer setup failed"); + return 1; + } + + while ((ret = perf_buffer__poll(pb, 1000)) >= 0) { + } + + kill(0, SIGINT); + return ret; +} diff --git a/samples/bpf/xdp_tx_iptunnel_common.h b/samples/bpf/xdp_tx_iptunnel_common.h new file mode 100644 index 000000000..be839892c --- /dev/null +++ b/samples/bpf/xdp_tx_iptunnel_common.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2016 Facebook + */ +#ifndef _SAMPLES_BPF_XDP_TX_IPTNL_COMMON_H +#define _SAMPLES_BPF_XDP_TX_IPTNL_COMMON_H + +#include <linux/types.h> + +#define MAX_IPTNL_ENTRIES 256U + +struct vip { + union { + __u32 v6[4]; + __u32 v4; + } daddr; + __u16 dport; + __u16 family; + __u8 protocol; +}; + +struct iptnl_info { + union { + __u32 v6[4]; + __u32 v4; + } saddr; + union { + __u32 v6[4]; + __u32 v4; + } daddr; + __u16 family; + __u8 dmac[6]; +}; + +#endif diff --git a/samples/bpf/xdp_tx_iptunnel_kern.c b/samples/bpf/xdp_tx_iptunnel_kern.c new file mode 100644 index 000000000..575d57e4b --- /dev/null +++ b/samples/bpf/xdp_tx_iptunnel_kern.c @@ -0,0 +1,237 @@ +/* Copyright (c) 
2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program shows how to use bpf_xdp_adjust_head() by + * encapsulating the incoming packet in an IPv4/v6 header + * and then XDP_TX it out. + */ +#define KBUILD_MODNAME "foo" +#include <uapi/linux/bpf.h> +#include <linux/in.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <linux/if_vlan.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <bpf/bpf_helpers.h> +#include "xdp_tx_iptunnel_common.h" + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, __u32); + __type(value, __u64); + __uint(max_entries, 256); +} rxcnt SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct vip); + __type(value, struct iptnl_info); + __uint(max_entries, MAX_IPTNL_ENTRIES); +} vip2tnl SEC(".maps"); + +static __always_inline void count_tx(u32 protocol) +{ + u64 *rxcnt_count; + + rxcnt_count = bpf_map_lookup_elem(&rxcnt, &protocol); + if (rxcnt_count) + *rxcnt_count += 1; +} + +static __always_inline int get_dport(void *trans_data, void *data_end, + u8 protocol) +{ + struct tcphdr *th; + struct udphdr *uh; + + switch (protocol) { + case IPPROTO_TCP: + th = (struct tcphdr *)trans_data; + if (th + 1 > data_end) + return -1; + return th->dest; + case IPPROTO_UDP: + uh = (struct udphdr *)trans_data; + if (uh + 1 > data_end) + return -1; + return uh->dest; + default: + return 0; + } +} + +static __always_inline void set_ethhdr(struct ethhdr *new_eth, + const struct ethhdr *old_eth, + const struct iptnl_info *tnl, + __be16 h_proto) +{ + memcpy(new_eth->h_source, old_eth->h_dest, sizeof(new_eth->h_source)); + memcpy(new_eth->h_dest, tnl->dmac, sizeof(new_eth->h_dest)); + new_eth->h_proto = h_proto; +} + +static __always_inline int handle_ipv4(struct xdp_md *xdp) +{ + void *data_end = (void 
*)(long)xdp->data_end; + void *data = (void *)(long)xdp->data; + struct iptnl_info *tnl; + struct ethhdr *new_eth; + struct ethhdr *old_eth; + struct iphdr *iph = data + sizeof(struct ethhdr); + u16 *next_iph_u16; + u16 payload_len; + struct vip vip = {}; + int dport; + u32 csum = 0; + int i; + + if (iph + 1 > data_end) + return XDP_DROP; + + dport = get_dport(iph + 1, data_end, iph->protocol); + if (dport == -1) + return XDP_DROP; + + vip.protocol = iph->protocol; + vip.family = AF_INET; + vip.daddr.v4 = iph->daddr; + vip.dport = dport; + payload_len = ntohs(iph->tot_len); + + tnl = bpf_map_lookup_elem(&vip2tnl, &vip); + /* It only does v4-in-v4 */ + if (!tnl || tnl->family != AF_INET) + return XDP_PASS; + + /* The vip key is found. Add an IP header and send it out */ + + if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct iphdr))) + return XDP_DROP; + + data = (void *)(long)xdp->data; + data_end = (void *)(long)xdp->data_end; + + new_eth = data; + iph = data + sizeof(*new_eth); + old_eth = data + sizeof(*iph); + + if (new_eth + 1 > data_end || + old_eth + 1 > data_end || + iph + 1 > data_end) + return XDP_DROP; + + set_ethhdr(new_eth, old_eth, tnl, htons(ETH_P_IP)); + + iph->version = 4; + iph->ihl = sizeof(*iph) >> 2; + iph->frag_off = 0; + iph->protocol = IPPROTO_IPIP; + iph->check = 0; + iph->tos = 0; + iph->tot_len = htons(payload_len + sizeof(*iph)); + iph->daddr = tnl->daddr.v4; + iph->saddr = tnl->saddr.v4; + iph->ttl = 8; + + next_iph_u16 = (u16 *)iph; +#pragma clang loop unroll(full) + for (i = 0; i < sizeof(*iph) >> 1; i++) + csum += *next_iph_u16++; + + iph->check = ~((csum & 0xffff) + (csum >> 16)); + + count_tx(vip.protocol); + + return XDP_TX; +} + +static __always_inline int handle_ipv6(struct xdp_md *xdp) +{ + void *data_end = (void *)(long)xdp->data_end; + void *data = (void *)(long)xdp->data; + struct iptnl_info *tnl; + struct ethhdr *new_eth; + struct ethhdr *old_eth; + struct ipv6hdr *ip6h = data + sizeof(struct ethhdr); + __u16 payload_len; + 
struct vip vip = {}; + int dport; + + if (ip6h + 1 > data_end) + return XDP_DROP; + + dport = get_dport(ip6h + 1, data_end, ip6h->nexthdr); + if (dport == -1) + return XDP_DROP; + + vip.protocol = ip6h->nexthdr; + vip.family = AF_INET6; + memcpy(vip.daddr.v6, ip6h->daddr.s6_addr32, sizeof(vip.daddr)); + vip.dport = dport; + payload_len = ip6h->payload_len; + + tnl = bpf_map_lookup_elem(&vip2tnl, &vip); + /* It only does v6-in-v6 */ + if (!tnl || tnl->family != AF_INET6) + return XDP_PASS; + + /* The vip key is found. Add an IP header and send it out */ + + if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct ipv6hdr))) + return XDP_DROP; + + data = (void *)(long)xdp->data; + data_end = (void *)(long)xdp->data_end; + + new_eth = data; + ip6h = data + sizeof(*new_eth); + old_eth = data + sizeof(*ip6h); + + if (new_eth + 1 > data_end || + old_eth + 1 > data_end || + ip6h + 1 > data_end) + return XDP_DROP; + + set_ethhdr(new_eth, old_eth, tnl, htons(ETH_P_IPV6)); + + ip6h->version = 6; + ip6h->priority = 0; + memset(ip6h->flow_lbl, 0, sizeof(ip6h->flow_lbl)); + ip6h->payload_len = htons(ntohs(payload_len) + sizeof(*ip6h)); + ip6h->nexthdr = IPPROTO_IPV6; + ip6h->hop_limit = 8; + memcpy(ip6h->saddr.s6_addr32, tnl->saddr.v6, sizeof(tnl->saddr.v6)); + memcpy(ip6h->daddr.s6_addr32, tnl->daddr.v6, sizeof(tnl->daddr.v6)); + + count_tx(vip.protocol); + + return XDP_TX; +} + +SEC("xdp_tx_iptunnel") +int _xdp_tx_iptunnel(struct xdp_md *xdp) +{ + void *data_end = (void *)(long)xdp->data_end; + void *data = (void *)(long)xdp->data; + struct ethhdr *eth = data; + __u16 h_proto; + + if (eth + 1 > data_end) + return XDP_DROP; + + h_proto = eth->h_proto; + + if (h_proto == htons(ETH_P_IP)) + return handle_ipv4(xdp); + else if (h_proto == htons(ETH_P_IPV6)) + + return handle_ipv6(xdp); + else + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_tx_iptunnel_user.c b/samples/bpf/xdp_tx_iptunnel_user.c new file mode 100644 index 
000000000..a419bee15 --- /dev/null +++ b/samples/bpf/xdp_tx_iptunnel_user.c @@ -0,0 +1,314 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2016 Facebook + */ +#include <linux/bpf.h> +#include <linux/if_link.h> +#include <assert.h> +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <net/if.h> +#include <sys/resource.h> +#include <arpa/inet.h> +#include <netinet/ether.h> +#include <unistd.h> +#include <time.h> +#include <bpf/libbpf.h> +#include <bpf/bpf.h> +#include "bpf_util.h" +#include "xdp_tx_iptunnel_common.h" + +#define STATS_INTERVAL_S 2U + +static int ifindex = -1; +static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; +static int rxcnt_map_fd; +static __u32 prog_id; + +static void int_exit(int sig) +{ + __u32 curr_prog_id = 0; + + if (ifindex > -1) { + if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, xdp_flags)) { + printf("bpf_get_link_xdp_id failed\n"); + exit(1); + } + if (prog_id == curr_prog_id) + bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); + else if (!curr_prog_id) + printf("couldn't find a prog id on a given iface\n"); + else + printf("program on interface changed, not removing\n"); + } + exit(0); +} + +/* simple per-protocol drop counter + */ +static void poll_stats(unsigned int kill_after_s) +{ + const unsigned int nr_protos = 256; + unsigned int nr_cpus = bpf_num_possible_cpus(); + time_t started_at = time(NULL); + __u64 values[nr_cpus], prev[nr_protos][nr_cpus]; + __u32 proto; + int i; + + memset(prev, 0, sizeof(prev)); + + while (!kill_after_s || time(NULL) - started_at <= kill_after_s) { + sleep(STATS_INTERVAL_S); + + for (proto = 0; proto < nr_protos; proto++) { + __u64 sum = 0; + + assert(bpf_map_lookup_elem(rxcnt_map_fd, &proto, + values) == 0); + for (i = 0; i < nr_cpus; i++) + sum += (values[i] - prev[proto][i]); + + if (sum) + printf("proto %u: sum:%10llu pkts, rate:%10llu pkts/s\n", + proto, sum, sum / STATS_INTERVAL_S); + memcpy(prev[proto], values, 
sizeof(values)); + } + } +} + +static void usage(const char *cmd) +{ + printf("Start a XDP prog which encapsulates incoming packets\n" + "in an IPv4/v6 header and XDP_TX it out. The dst <VIP:PORT>\n" + "is used to select packets to encapsulate\n\n"); + printf("Usage: %s [...]\n", cmd); + printf(" -i <ifname|ifindex> Interface\n"); + printf(" -a <vip-service-address> IPv4 or IPv6\n"); + printf(" -p <vip-service-port> A port range (e.g. 433-444) is also allowed\n"); + printf(" -s <source-ip> Used in the IPTunnel header\n"); + printf(" -d <dest-ip> Used in the IPTunnel header\n"); + printf(" -m <dest-MAC> Used in sending the IP Tunneled pkt\n"); + printf(" -T <stop-after-X-seconds> Default: 0 (forever)\n"); + printf(" -P <IP-Protocol> Default is TCP\n"); + printf(" -S use skb-mode\n"); + printf(" -N enforce native mode\n"); + printf(" -F Force loading the XDP prog\n"); + printf(" -h Display this help\n"); +} + +static int parse_ipstr(const char *ipstr, unsigned int *addr) +{ + if (inet_pton(AF_INET6, ipstr, addr) == 1) { + return AF_INET6; + } else if (inet_pton(AF_INET, ipstr, addr) == 1) { + addr[1] = addr[2] = addr[3] = 0; + return AF_INET; + } + + fprintf(stderr, "%s is an invalid IP\n", ipstr); + return AF_UNSPEC; +} + +static int parse_ports(const char *port_str, int *min_port, int *max_port) +{ + char *end; + long tmp_min_port; + long tmp_max_port; + + tmp_min_port = strtol(optarg, &end, 10); + if (tmp_min_port < 1 || tmp_min_port > 65535) { + fprintf(stderr, "Invalid port(s):%s\n", optarg); + return 1; + } + + if (*end == '-') { + end++; + tmp_max_port = strtol(end, NULL, 10); + if (tmp_max_port < 1 || tmp_max_port > 65535) { + fprintf(stderr, "Invalid port(s):%s\n", optarg); + return 1; + } + } else { + tmp_max_port = tmp_min_port; + } + + if (tmp_min_port > tmp_max_port) { + fprintf(stderr, "Invalid port(s):%s\n", optarg); + return 1; + } + + if (tmp_max_port - tmp_min_port + 1 > MAX_IPTNL_ENTRIES) { + fprintf(stderr, "Port range (%s) is larger than %u\n", + 
port_str, MAX_IPTNL_ENTRIES); + return 1; + } + *min_port = tmp_min_port; + *max_port = tmp_max_port; + + return 0; +} + +int main(int argc, char **argv) +{ + struct bpf_prog_load_attr prog_load_attr = { + .prog_type = BPF_PROG_TYPE_XDP, + }; + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + int min_port = 0, max_port = 0, vip2tnl_map_fd; + const char *optstr = "i:a:p:s:d:m:T:P:FSNh"; + unsigned char opt_flags[256] = {}; + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + unsigned int kill_after_s = 0; + struct iptnl_info tnl = {}; + struct bpf_object *obj; + struct vip vip = {}; + char filename[256]; + int opt, prog_fd; + int i, err; + + tnl.family = AF_UNSPEC; + vip.protocol = IPPROTO_TCP; + + for (i = 0; i < strlen(optstr); i++) + if (optstr[i] != 'h' && 'a' <= optstr[i] && optstr[i] <= 'z') + opt_flags[(unsigned char)optstr[i]] = 1; + + while ((opt = getopt(argc, argv, optstr)) != -1) { + unsigned short family; + unsigned int *v6; + + switch (opt) { + case 'i': + ifindex = if_nametoindex(optarg); + if (!ifindex) + ifindex = atoi(optarg); + break; + case 'a': + vip.family = parse_ipstr(optarg, vip.daddr.v6); + if (vip.family == AF_UNSPEC) + return 1; + break; + case 'p': + if (parse_ports(optarg, &min_port, &max_port)) + return 1; + break; + case 'P': + vip.protocol = atoi(optarg); + break; + case 's': + case 'd': + if (opt == 's') + v6 = tnl.saddr.v6; + else + v6 = tnl.daddr.v6; + + family = parse_ipstr(optarg, v6); + if (family == AF_UNSPEC) + return 1; + if (tnl.family == AF_UNSPEC) { + tnl.family = family; + } else if (tnl.family != family) { + fprintf(stderr, + "The IP version of the src and dst addresses used in the IP encapsulation does not match\n"); + return 1; + } + break; + case 'm': + if (!ether_aton_r(optarg, + (struct ether_addr *)tnl.dmac)) { + fprintf(stderr, "Invalid mac address:%s\n", + optarg); + return 1; + } + break; + case 'T': + kill_after_s = atoi(optarg); + break; + case 'S': + xdp_flags |= XDP_FLAGS_SKB_MODE; + 
break; + case 'N': + /* default, set below */ + break; + case 'F': + xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; + break; + default: + usage(argv[0]); + return 1; + } + opt_flags[opt] = 0; + } + + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + + for (i = 0; i < strlen(optstr); i++) { + if (opt_flags[(unsigned int)optstr[i]]) { + fprintf(stderr, "Missing argument -%c\n", optstr[i]); + usage(argv[0]); + return 1; + } + } + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)"); + return 1; + } + + if (!ifindex) { + fprintf(stderr, "Invalid ifname\n"); + return 1; + } + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + prog_load_attr.file = filename; + + if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) + return 1; + + if (!prog_fd) { + printf("bpf_prog_load_xattr: %s\n", strerror(errno)); + return 1; + } + + rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt"); + vip2tnl_map_fd = bpf_object__find_map_fd_by_name(obj, "vip2tnl"); + if (vip2tnl_map_fd < 0 || rxcnt_map_fd < 0) { + printf("bpf_object__find_map_fd_by_name failed\n"); + return 1; + } + + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + while (min_port <= max_port) { + vip.dport = htons(min_port++); + if (bpf_map_update_elem(vip2tnl_map_fd, &vip, &tnl, + BPF_NOEXIST)) { + perror("bpf_map_update_elem(&vip2tnl)"); + return 1; + } + } + + if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) { + printf("link set xdp fd failed\n"); + return 1; + } + + err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); + if (err) { + printf("can't get prog info - %s\n", strerror(errno)); + return err; + } + prog_id = info.id; + + poll_stats(kill_after_s); + + bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); + + return 0; +} diff --git a/samples/bpf/xdpsock.h b/samples/bpf/xdpsock.h new file mode 100644 index 000000000..b7eca15c7 --- /dev/null +++ b/samples/bpf/xdpsock.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: 
GPL-2.0 + * + * Copyright(c) 2019 Intel Corporation. + */ + +#ifndef XDPSOCK_H_ +#define XDPSOCK_H_ + +#define MAX_SOCKS 4 + +#endif /* XDPSOCK_H_ */ diff --git a/samples/bpf/xdpsock_kern.c b/samples/bpf/xdpsock_kern.c new file mode 100644 index 000000000..054304843 --- /dev/null +++ b/samples/bpf/xdpsock_kern.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include "xdpsock.h" + +/* This XDP program is only needed for the XDP_SHARED_UMEM mode. + * If you do not use this mode, libbpf can supply an XDP program for you. + */ + +struct { + __uint(type, BPF_MAP_TYPE_XSKMAP); + __uint(max_entries, MAX_SOCKS); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); +} xsks_map SEC(".maps"); + +static unsigned int rr; + +SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx) +{ + rr = (rr + 1) & (MAX_SOCKS - 1); + + return bpf_redirect_map(&xsks_map, rr, XDP_DROP); +} diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c new file mode 100644 index 000000000..cf5b0a895 --- /dev/null +++ b/samples/bpf/xdpsock_user.c @@ -0,0 +1,1550 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2017 - 2018 Intel Corporation. 
*/ + +#include <asm/barrier.h> +#include <errno.h> +#include <getopt.h> +#include <libgen.h> +#include <linux/bpf.h> +#include <linux/compiler.h> +#include <linux/if_link.h> +#include <linux/if_xdp.h> +#include <linux/if_ether.h> +#include <linux/ip.h> +#include <linux/limits.h> +#include <linux/udp.h> +#include <arpa/inet.h> +#include <locale.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <poll.h> +#include <pthread.h> +#include <signal.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/resource.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <time.h> +#include <unistd.h> + +#include <bpf/libbpf.h> +#include <bpf/xsk.h> +#include <bpf/bpf.h> +#include "xdpsock.h" + +#ifndef SOL_XDP +#define SOL_XDP 283 +#endif + +#ifndef AF_XDP +#define AF_XDP 44 +#endif + +#ifndef PF_XDP +#define PF_XDP AF_XDP +#endif + +#define NUM_FRAMES (4 * 1024) +#define MIN_PKT_SIZE 64 + +#define DEBUG_HEXDUMP 0 + +typedef __u64 u64; +typedef __u32 u32; +typedef __u16 u16; +typedef __u8 u8; + +static unsigned long prev_time; + +enum benchmark_type { + BENCH_RXDROP = 0, + BENCH_TXONLY = 1, + BENCH_L2FWD = 2, +}; + +static enum benchmark_type opt_bench = BENCH_RXDROP; +static u32 opt_xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; +static const char *opt_if = ""; +static int opt_ifindex; +static int opt_queue; +static unsigned long opt_duration; +static unsigned long start_time; +static bool benchmark_done; +static u32 opt_batch_size = 64; +static int opt_pkt_count; +static u16 opt_pkt_size = MIN_PKT_SIZE; +static u32 opt_pkt_fill_pattern = 0x12345678; +static bool opt_extra_stats; +static bool opt_quiet; +static bool opt_app_stats; +static const char *opt_irq_str = ""; +static u32 irq_no; +static int irqs_at_init = -1; +static int opt_poll; +static int opt_interval = 1; +static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; +static u32 opt_umem_flags; +static int opt_unaligned_chunks; +static int 
opt_mmap_flags; +static int opt_xsk_frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; +static int opt_timeout = 1000; +static bool opt_need_wakeup = true; +static u32 opt_num_xsks = 1; +static u32 prog_id; + +struct xsk_ring_stats { + unsigned long rx_npkts; + unsigned long tx_npkts; + unsigned long rx_dropped_npkts; + unsigned long rx_invalid_npkts; + unsigned long tx_invalid_npkts; + unsigned long rx_full_npkts; + unsigned long rx_fill_empty_npkts; + unsigned long tx_empty_npkts; + unsigned long prev_rx_npkts; + unsigned long prev_tx_npkts; + unsigned long prev_rx_dropped_npkts; + unsigned long prev_rx_invalid_npkts; + unsigned long prev_tx_invalid_npkts; + unsigned long prev_rx_full_npkts; + unsigned long prev_rx_fill_empty_npkts; + unsigned long prev_tx_empty_npkts; +}; + +struct xsk_driver_stats { + unsigned long intrs; + unsigned long prev_intrs; +}; + +struct xsk_app_stats { + unsigned long rx_empty_polls; + unsigned long fill_fail_polls; + unsigned long copy_tx_sendtos; + unsigned long tx_wakeup_sendtos; + unsigned long opt_polls; + unsigned long prev_rx_empty_polls; + unsigned long prev_fill_fail_polls; + unsigned long prev_copy_tx_sendtos; + unsigned long prev_tx_wakeup_sendtos; + unsigned long prev_opt_polls; +}; + +struct xsk_umem_info { + struct xsk_ring_prod fq; + struct xsk_ring_cons cq; + struct xsk_umem *umem; + void *buffer; +}; + +struct xsk_socket_info { + struct xsk_ring_cons rx; + struct xsk_ring_prod tx; + struct xsk_umem_info *umem; + struct xsk_socket *xsk; + struct xsk_ring_stats ring_stats; + struct xsk_app_stats app_stats; + struct xsk_driver_stats drv_stats; + u32 outstanding_tx; +}; + +static int num_socks; +struct xsk_socket_info *xsks[MAX_SOCKS]; + +static unsigned long get_nsecs(void) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec * 1000000000UL + ts.tv_nsec; +} + +static void print_benchmark(bool running) +{ + const char *bench_str = "INVALID"; + + if (opt_bench == BENCH_RXDROP) + bench_str = 
"rxdrop"; + else if (opt_bench == BENCH_TXONLY) + bench_str = "txonly"; + else if (opt_bench == BENCH_L2FWD) + bench_str = "l2fwd"; + + printf("%s:%d %s ", opt_if, opt_queue, bench_str); + if (opt_xdp_flags & XDP_FLAGS_SKB_MODE) + printf("xdp-skb "); + else if (opt_xdp_flags & XDP_FLAGS_DRV_MODE) + printf("xdp-drv "); + else + printf(" "); + + if (opt_poll) + printf("poll() "); + + if (running) { + printf("running..."); + fflush(stdout); + } +} + +static int xsk_get_xdp_stats(int fd, struct xsk_socket_info *xsk) +{ + struct xdp_statistics stats; + socklen_t optlen; + int err; + + optlen = sizeof(stats); + err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen); + if (err) + return err; + + if (optlen == sizeof(struct xdp_statistics)) { + xsk->ring_stats.rx_dropped_npkts = stats.rx_dropped; + xsk->ring_stats.rx_invalid_npkts = stats.rx_invalid_descs; + xsk->ring_stats.tx_invalid_npkts = stats.tx_invalid_descs; + xsk->ring_stats.rx_full_npkts = stats.rx_ring_full; + xsk->ring_stats.rx_fill_empty_npkts = stats.rx_fill_ring_empty_descs; + xsk->ring_stats.tx_empty_npkts = stats.tx_ring_empty_descs; + return 0; + } + + return -EINVAL; +} + +static void dump_app_stats(long dt) +{ + int i; + + for (i = 0; i < num_socks && xsks[i]; i++) { + char *fmt = "%-18s %'-14.0f %'-14lu\n"; + double rx_empty_polls_ps, fill_fail_polls_ps, copy_tx_sendtos_ps, + tx_wakeup_sendtos_ps, opt_polls_ps; + + rx_empty_polls_ps = (xsks[i]->app_stats.rx_empty_polls - + xsks[i]->app_stats.prev_rx_empty_polls) * 1000000000. / dt; + fill_fail_polls_ps = (xsks[i]->app_stats.fill_fail_polls - + xsks[i]->app_stats.prev_fill_fail_polls) * 1000000000. / dt; + copy_tx_sendtos_ps = (xsks[i]->app_stats.copy_tx_sendtos - + xsks[i]->app_stats.prev_copy_tx_sendtos) * 1000000000. / dt; + tx_wakeup_sendtos_ps = (xsks[i]->app_stats.tx_wakeup_sendtos - + xsks[i]->app_stats.prev_tx_wakeup_sendtos) + * 1000000000. 
/ dt; + opt_polls_ps = (xsks[i]->app_stats.opt_polls - + xsks[i]->app_stats.prev_opt_polls) * 1000000000. / dt; + + printf("\n%-18s %-14s %-14s\n", "", "calls/s", "count"); + printf(fmt, "rx empty polls", rx_empty_polls_ps, xsks[i]->app_stats.rx_empty_polls); + printf(fmt, "fill fail polls", fill_fail_polls_ps, + xsks[i]->app_stats.fill_fail_polls); + printf(fmt, "copy tx sendtos", copy_tx_sendtos_ps, + xsks[i]->app_stats.copy_tx_sendtos); + printf(fmt, "tx wakeup sendtos", tx_wakeup_sendtos_ps, + xsks[i]->app_stats.tx_wakeup_sendtos); + printf(fmt, "opt polls", opt_polls_ps, xsks[i]->app_stats.opt_polls); + + xsks[i]->app_stats.prev_rx_empty_polls = xsks[i]->app_stats.rx_empty_polls; + xsks[i]->app_stats.prev_fill_fail_polls = xsks[i]->app_stats.fill_fail_polls; + xsks[i]->app_stats.prev_copy_tx_sendtos = xsks[i]->app_stats.copy_tx_sendtos; + xsks[i]->app_stats.prev_tx_wakeup_sendtos = xsks[i]->app_stats.tx_wakeup_sendtos; + xsks[i]->app_stats.prev_opt_polls = xsks[i]->app_stats.opt_polls; + } +} + +static bool get_interrupt_number(void) +{ + FILE *f_int_proc; + char line[4096]; + bool found = false; + + f_int_proc = fopen("/proc/interrupts", "r"); + if (f_int_proc == NULL) { + printf("Failed to open /proc/interrupts.\n"); + return found; + } + + while (!feof(f_int_proc) && !found) { + /* Make sure to read a full line at a time */ + if (fgets(line, sizeof(line), f_int_proc) == NULL || + line[strlen(line) - 1] != '\n') { + printf("Error reading from interrupts file\n"); + break; + } + + /* Extract interrupt number from line */ + if (strstr(line, opt_irq_str) != NULL) { + irq_no = atoi(line); + found = true; + break; + } + } + + fclose(f_int_proc); + + return found; +} + +static int get_irqs(void) +{ + char count_path[PATH_MAX]; + int total_intrs = -1; + FILE *f_count_proc; + char line[4096]; + + snprintf(count_path, sizeof(count_path), + "/sys/kernel/irq/%i/per_cpu_count", irq_no); + f_count_proc = fopen(count_path, "r"); + if (f_count_proc == NULL) { + 
printf("Failed to open %s\n", count_path); + return total_intrs; + } + + if (fgets(line, sizeof(line), f_count_proc) == NULL || + line[strlen(line) - 1] != '\n') { + printf("Error reading from %s\n", count_path); + } else { + static const char com[2] = ","; + char *token; + + total_intrs = 0; + token = strtok(line, com); + while (token != NULL) { + /* sum up interrupts across all cores */ + total_intrs += atoi(token); + token = strtok(NULL, com); + } + } + + fclose(f_count_proc); + + return total_intrs; +} + +static void dump_driver_stats(long dt) +{ + int i; + + for (i = 0; i < num_socks && xsks[i]; i++) { + char *fmt = "%-18s %'-14.0f %'-14lu\n"; + double intrs_ps; + int n_ints = get_irqs(); + + if (n_ints < 0) { + printf("error getting intr info for intr %i\n", irq_no); + return; + } + xsks[i]->drv_stats.intrs = n_ints - irqs_at_init; + + intrs_ps = (xsks[i]->drv_stats.intrs - xsks[i]->drv_stats.prev_intrs) * + 1000000000. / dt; + + printf("\n%-18s %-14s %-14s\n", "", "intrs/s", "count"); + printf(fmt, "irqs", intrs_ps, xsks[i]->drv_stats.intrs); + + xsks[i]->drv_stats.prev_intrs = xsks[i]->drv_stats.intrs; + } +} + +static void dump_stats(void) +{ + unsigned long now = get_nsecs(); + long dt = now - prev_time; + int i; + + prev_time = now; + + for (i = 0; i < num_socks && xsks[i]; i++) { + char *fmt = "%-18s %'-14.0f %'-14lu\n"; + double rx_pps, tx_pps, dropped_pps, rx_invalid_pps, full_pps, fill_empty_pps, + tx_invalid_pps, tx_empty_pps; + + rx_pps = (xsks[i]->ring_stats.rx_npkts - xsks[i]->ring_stats.prev_rx_npkts) * + 1000000000. / dt; + tx_pps = (xsks[i]->ring_stats.tx_npkts - xsks[i]->ring_stats.prev_tx_npkts) * + 1000000000. 
/ dt; + + printf("\n sock%d@", i); + print_benchmark(false); + printf("\n"); + + printf("%-18s %-14s %-14s %-14.2f\n", "", "pps", "pkts", + dt / 1000000000.); + printf(fmt, "rx", rx_pps, xsks[i]->ring_stats.rx_npkts); + printf(fmt, "tx", tx_pps, xsks[i]->ring_stats.tx_npkts); + + xsks[i]->ring_stats.prev_rx_npkts = xsks[i]->ring_stats.rx_npkts; + xsks[i]->ring_stats.prev_tx_npkts = xsks[i]->ring_stats.tx_npkts; + + if (opt_extra_stats) { + if (!xsk_get_xdp_stats(xsk_socket__fd(xsks[i]->xsk), xsks[i])) { + dropped_pps = (xsks[i]->ring_stats.rx_dropped_npkts - + xsks[i]->ring_stats.prev_rx_dropped_npkts) * + 1000000000. / dt; + rx_invalid_pps = (xsks[i]->ring_stats.rx_invalid_npkts - + xsks[i]->ring_stats.prev_rx_invalid_npkts) * + 1000000000. / dt; + tx_invalid_pps = (xsks[i]->ring_stats.tx_invalid_npkts - + xsks[i]->ring_stats.prev_tx_invalid_npkts) * + 1000000000. / dt; + full_pps = (xsks[i]->ring_stats.rx_full_npkts - + xsks[i]->ring_stats.prev_rx_full_npkts) * + 1000000000. / dt; + fill_empty_pps = (xsks[i]->ring_stats.rx_fill_empty_npkts - + xsks[i]->ring_stats.prev_rx_fill_empty_npkts) * + 1000000000. / dt; + tx_empty_pps = (xsks[i]->ring_stats.tx_empty_npkts - + xsks[i]->ring_stats.prev_tx_empty_npkts) * + 1000000000. 
/ dt; + + printf(fmt, "rx dropped", dropped_pps, + xsks[i]->ring_stats.rx_dropped_npkts); + printf(fmt, "rx invalid", rx_invalid_pps, + xsks[i]->ring_stats.rx_invalid_npkts); + printf(fmt, "tx invalid", tx_invalid_pps, + xsks[i]->ring_stats.tx_invalid_npkts); + printf(fmt, "rx queue full", full_pps, + xsks[i]->ring_stats.rx_full_npkts); + printf(fmt, "fill ring empty", fill_empty_pps, + xsks[i]->ring_stats.rx_fill_empty_npkts); + printf(fmt, "tx ring empty", tx_empty_pps, + xsks[i]->ring_stats.tx_empty_npkts); + + xsks[i]->ring_stats.prev_rx_dropped_npkts = + xsks[i]->ring_stats.rx_dropped_npkts; + xsks[i]->ring_stats.prev_rx_invalid_npkts = + xsks[i]->ring_stats.rx_invalid_npkts; + xsks[i]->ring_stats.prev_tx_invalid_npkts = + xsks[i]->ring_stats.tx_invalid_npkts; + xsks[i]->ring_stats.prev_rx_full_npkts = + xsks[i]->ring_stats.rx_full_npkts; + xsks[i]->ring_stats.prev_rx_fill_empty_npkts = + xsks[i]->ring_stats.rx_fill_empty_npkts; + xsks[i]->ring_stats.prev_tx_empty_npkts = + xsks[i]->ring_stats.tx_empty_npkts; + } else { + printf("%-15s\n", "Error retrieving extra stats"); + } + } + } + + if (opt_app_stats) + dump_app_stats(dt); + if (irq_no) + dump_driver_stats(dt); +} + +static bool is_benchmark_done(void) +{ + if (opt_duration > 0) { + unsigned long dt = (get_nsecs() - start_time); + + if (dt >= opt_duration) + benchmark_done = true; + } + return benchmark_done; +} + +static void *poller(void *arg) +{ + (void)arg; + while (!is_benchmark_done()) { + sleep(opt_interval); + dump_stats(); + } + + return NULL; +} + +static void remove_xdp_program(void) +{ + u32 curr_prog_id = 0; + + if (bpf_get_link_xdp_id(opt_ifindex, &curr_prog_id, opt_xdp_flags)) { + printf("bpf_get_link_xdp_id failed\n"); + exit(EXIT_FAILURE); + } + if (prog_id == curr_prog_id) + bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags); + else if (!curr_prog_id) + printf("couldn't find a prog id on a given interface\n"); + else + printf("program on interface changed, not removing\n"); +} + +static 
void int_exit(int sig) +{ + benchmark_done = true; +} + +static void xdpsock_cleanup(void) +{ + struct xsk_umem *umem = xsks[0]->umem->umem; + int i; + + dump_stats(); + for (i = 0; i < num_socks; i++) + xsk_socket__delete(xsks[i]->xsk); + (void)xsk_umem__delete(umem); + remove_xdp_program(); +} + +static void __exit_with_error(int error, const char *file, const char *func, + int line) +{ + fprintf(stderr, "%s:%s:%i: errno: %d/\"%s\"\n", file, func, + line, error, strerror(error)); + dump_stats(); + remove_xdp_program(); + exit(EXIT_FAILURE); +} + +#define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, \ + __LINE__) +static void swap_mac_addresses(void *data) +{ + struct ether_header *eth = (struct ether_header *)data; + struct ether_addr *src_addr = (struct ether_addr *)&eth->ether_shost; + struct ether_addr *dst_addr = (struct ether_addr *)&eth->ether_dhost; + struct ether_addr tmp; + + tmp = *src_addr; + *src_addr = *dst_addr; + *dst_addr = tmp; +} + +static void hex_dump(void *pkt, size_t length, u64 addr) +{ + const unsigned char *address = (unsigned char *)pkt; + const unsigned char *line = address; + size_t line_size = 32; + unsigned char c; + char buf[32]; + int i = 0; + + if (!DEBUG_HEXDUMP) + return; + + sprintf(buf, "addr=%llu", addr); + printf("length = %zu\n", length); + printf("%s | ", buf); + while (length-- > 0) { + printf("%02X ", *address++); + if (!(++i % line_size) || (length == 0 && i % line_size)) { + if (length == 0) { + while (i++ % line_size) + printf("__ "); + } + printf(" | "); /* right close */ + while (line < address) { + c = *line++; + printf("%c", (c < 33 || c == 255) ? 
0x2E : c); + } + printf("\n"); + if (length > 0) + printf("%s | ", buf); + } + } + printf("\n"); +} + +static void *memset32_htonl(void *dest, u32 val, u32 size) +{ + u32 *ptr = (u32 *)dest; + int i; + + val = htonl(val); + + for (i = 0; i < (size & (~0x3)); i += 4) + ptr[i >> 2] = val; + + for (; i < size; i++) + ((char *)dest)[i] = ((char *)&val)[i & 3]; + + return dest; +} + +/* + * This function code has been taken from + * Linux kernel lib/checksum.c + */ +static inline unsigned short from32to16(unsigned int x) +{ + /* add up 16-bit and 16-bit for 16+c bit */ + x = (x & 0xffff) + (x >> 16); + /* add up carry.. */ + x = (x & 0xffff) + (x >> 16); + return x; +} + +/* + * This function code has been taken from + * Linux kernel lib/checksum.c + */ +static unsigned int do_csum(const unsigned char *buff, int len) +{ + unsigned int result = 0; + int odd; + + if (len <= 0) + goto out; + odd = 1 & (unsigned long)buff; + if (odd) { +#ifdef __LITTLE_ENDIAN + result += (*buff << 8); +#else + result = *buff; +#endif + len--; + buff++; + } + if (len >= 2) { + if (2 & (unsigned long)buff) { + result += *(unsigned short *)buff; + len -= 2; + buff += 2; + } + if (len >= 4) { + const unsigned char *end = buff + + ((unsigned int)len & ~3); + unsigned int carry = 0; + + do { + unsigned int w = *(unsigned int *)buff; + + buff += 4; + result += carry; + result += w; + carry = (w > result); + } while (buff < end); + result += carry; + result = (result & 0xffff) + (result >> 16); + } + if (len & 2) { + result += *(unsigned short *)buff; + buff += 2; + } + } + if (len & 1) +#ifdef __LITTLE_ENDIAN + result += *buff; +#else + result += (*buff << 8); +#endif + result = from32to16(result); + if (odd) + result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); +out: + return result; +} + +__sum16 ip_fast_csum(const void *iph, unsigned int ihl); + +/* + * This is a version of ip_compute_csum() optimized for IP headers, + * which always checksum on 4 octet boundaries. 
+ * This function code has been taken from + * Linux kernel lib/checksum.c + */ +__sum16 ip_fast_csum(const void *iph, unsigned int ihl) +{ + return (__force __sum16)~do_csum(iph, ihl * 4); +} + +/* + * Fold a partial checksum + * This function code has been taken from + * Linux kernel include/asm-generic/checksum.h + */ +static inline __sum16 csum_fold(__wsum csum) +{ + u32 sum = (__force u32)csum; + + sum = (sum & 0xffff) + (sum >> 16); + sum = (sum & 0xffff) + (sum >> 16); + return (__force __sum16)~sum; +} + +/* + * This function code has been taken from + * Linux kernel lib/checksum.c + */ +static inline u32 from64to32(u64 x) +{ + /* add up 32-bit and 32-bit for 32+c bit */ + x = (x & 0xffffffff) + (x >> 32); + /* add up carry.. */ + x = (x & 0xffffffff) + (x >> 32); + return (u32)x; +} + +__wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, + __u32 len, __u8 proto, __wsum sum); + +/* + * This function code has been taken from + * Linux kernel lib/checksum.c + */ +__wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, + __u32 len, __u8 proto, __wsum sum) +{ + unsigned long long s = (__force u32)sum; + + s += (__force u32)saddr; + s += (__force u32)daddr; +#ifdef __BIG_ENDIAN__ + s += proto + len; +#else + s += (proto + len) << 8; +#endif + return (__force __wsum)from64to32(s); +} + +/* + * This function has been taken from + * Linux kernel include/asm-generic/checksum.h + */ +static inline __sum16 +csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, + __u8 proto, __wsum sum) +{ + return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); +} + +static inline u16 udp_csum(u32 saddr, u32 daddr, u32 len, + u8 proto, u16 *udp_pkt) +{ + u32 csum = 0; + u32 cnt = 0; + + /* udp hdr and data */ + for (; cnt < len; cnt += 2) + csum += udp_pkt[cnt >> 1]; + + return csum_tcpudp_magic(saddr, daddr, len, proto, csum); +} + +#define ETH_FCS_SIZE 4 + +#define PKT_HDR_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ + sizeof(struct udphdr)) + +#define 
PKT_SIZE (opt_pkt_size - ETH_FCS_SIZE) +#define IP_PKT_SIZE (PKT_SIZE - sizeof(struct ethhdr)) +#define UDP_PKT_SIZE (IP_PKT_SIZE - sizeof(struct iphdr)) +#define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - sizeof(struct udphdr)) + +static u8 pkt_data[XSK_UMEM__DEFAULT_FRAME_SIZE]; + +static void gen_eth_hdr_data(void) +{ + struct udphdr *udp_hdr = (struct udphdr *)(pkt_data + + sizeof(struct ethhdr) + + sizeof(struct iphdr)); + struct iphdr *ip_hdr = (struct iphdr *)(pkt_data + + sizeof(struct ethhdr)); + struct ethhdr *eth_hdr = (struct ethhdr *)pkt_data; + + /* ethernet header */ + memcpy(eth_hdr->h_dest, "\x3c\xfd\xfe\x9e\x7f\x71", ETH_ALEN); + memcpy(eth_hdr->h_source, "\xec\xb1\xd7\x98\x3a\xc0", ETH_ALEN); + eth_hdr->h_proto = htons(ETH_P_IP); + + /* IP header */ + ip_hdr->version = IPVERSION; + ip_hdr->ihl = 0x5; /* 20 byte header */ + ip_hdr->tos = 0x0; + ip_hdr->tot_len = htons(IP_PKT_SIZE); + ip_hdr->id = 0; + ip_hdr->frag_off = 0; + ip_hdr->ttl = IPDEFTTL; + ip_hdr->protocol = IPPROTO_UDP; + ip_hdr->saddr = htonl(0x0a0a0a10); + ip_hdr->daddr = htonl(0x0a0a0a20); + + /* IP header checksum */ + ip_hdr->check = 0; + ip_hdr->check = ip_fast_csum((const void *)ip_hdr, ip_hdr->ihl); + + /* UDP header */ + udp_hdr->source = htons(0x1000); + udp_hdr->dest = htons(0x1000); + udp_hdr->len = htons(UDP_PKT_SIZE); + + /* UDP data */ + memset32_htonl(pkt_data + PKT_HDR_SIZE, opt_pkt_fill_pattern, + UDP_PKT_DATA_SIZE); + + /* UDP header checksum */ + udp_hdr->check = 0; + udp_hdr->check = udp_csum(ip_hdr->saddr, ip_hdr->daddr, UDP_PKT_SIZE, + IPPROTO_UDP, (u16 *)udp_hdr); +} + +static void gen_eth_frame(struct xsk_umem_info *umem, u64 addr) +{ + memcpy(xsk_umem__get_data(umem->buffer, addr), pkt_data, + PKT_SIZE); +} + +static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size) +{ + struct xsk_umem_info *umem; + struct xsk_umem_config cfg = { + /* We recommend that you set the fill ring size >= HW RX ring size + + * AF_XDP RX ring size. 
Make sure you fill up the fill ring + * with buffers at regular intervals, and you will with this setting + * avoid allocation failures in the driver. These are usually quite + * expensive since drivers have not been written to assume that + * allocation failures are common. For regular sockets, kernel + * allocated memory is used that only runs out in OOM situations + * that should be rare. + */ + .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2, + .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, + .frame_size = opt_xsk_frame_size, + .frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM, + .flags = opt_umem_flags + }; + int ret; + + umem = calloc(1, sizeof(*umem)); + if (!umem) + exit_with_error(errno); + + ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq, + &cfg); + if (ret) + exit_with_error(-ret); + + umem->buffer = buffer; + return umem; +} + +static void xsk_populate_fill_ring(struct xsk_umem_info *umem) +{ + int ret, i; + u32 idx; + + ret = xsk_ring_prod__reserve(&umem->fq, + XSK_RING_PROD__DEFAULT_NUM_DESCS * 2, &idx); + if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS * 2) + exit_with_error(-ret); + for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS * 2; i++) + *xsk_ring_prod__fill_addr(&umem->fq, idx++) = + i * opt_xsk_frame_size; + xsk_ring_prod__submit(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS * 2); +} + +static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem, + bool rx, bool tx) +{ + struct xsk_socket_config cfg; + struct xsk_socket_info *xsk; + struct xsk_ring_cons *rxr; + struct xsk_ring_prod *txr; + int ret; + + xsk = calloc(1, sizeof(*xsk)); + if (!xsk) + exit_with_error(errno); + + xsk->umem = umem; + cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; + cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + if (opt_num_xsks > 1) + cfg.libbpf_flags = XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD; + else + cfg.libbpf_flags = 0; + cfg.xdp_flags = opt_xdp_flags; + cfg.bind_flags = opt_xdp_bind_flags; + + rxr = rx ? 
&xsk->rx : NULL; + txr = tx ? &xsk->tx : NULL; + ret = xsk_socket__create(&xsk->xsk, opt_if, opt_queue, umem->umem, + rxr, txr, &cfg); + if (ret) + exit_with_error(-ret); + + ret = bpf_get_link_xdp_id(opt_ifindex, &prog_id, opt_xdp_flags); + if (ret) + exit_with_error(-ret); + + xsk->app_stats.rx_empty_polls = 0; + xsk->app_stats.fill_fail_polls = 0; + xsk->app_stats.copy_tx_sendtos = 0; + xsk->app_stats.tx_wakeup_sendtos = 0; + xsk->app_stats.opt_polls = 0; + xsk->app_stats.prev_rx_empty_polls = 0; + xsk->app_stats.prev_fill_fail_polls = 0; + xsk->app_stats.prev_copy_tx_sendtos = 0; + xsk->app_stats.prev_tx_wakeup_sendtos = 0; + xsk->app_stats.prev_opt_polls = 0; + + return xsk; +} + +/* Long-option table for getopt_long(). Each entry's has_arg must agree with + * the short-option string used in parse_command_line(): a letter followed by + * ':' there takes an argument and must be required_argument here, otherwise + * optarg is NULL when the long form is used. + */ +static struct option long_options[] = { + {"rxdrop", no_argument, 0, 'r'}, + {"txonly", no_argument, 0, 't'}, + {"l2fwd", no_argument, 0, 'l'}, + {"interface", required_argument, 0, 'i'}, + {"queue", required_argument, 0, 'q'}, + {"poll", no_argument, 0, 'p'}, + {"xdp-skb", no_argument, 0, 'S'}, + {"xdp-native", no_argument, 0, 'N'}, + {"interval", required_argument, 0, 'n'}, + {"zero-copy", no_argument, 0, 'z'}, + {"copy", no_argument, 0, 'c'}, + {"frame-size", required_argument, 0, 'f'}, + {"no-need-wakeup", no_argument, 0, 'm'}, + {"unaligned", no_argument, 0, 'u'}, + {"shared-umem", no_argument, 0, 'M'}, + {"force", no_argument, 0, 'F'}, + {"duration", required_argument, 0, 'd'}, + {"batch-size", required_argument, 0, 'b'}, + {"tx-pkt-count", required_argument, 0, 'C'}, + {"tx-pkt-size", required_argument, 0, 's'}, + {"tx-pkt-pattern", required_argument, 0, 'P'}, + {"extra-stats", no_argument, 0, 'x'}, + {"quiet", no_argument, 0, 'Q'}, + {"app-stats", no_argument, 0, 'a'}, + /* 'I' consumes optarg (short string has "I:"), so it needs an argument. */ + {"irq-string", required_argument, 0, 'I'}, + {0, 0, 0, 0} +}; + +static void usage(const char *prog) +{ + const char *str = + " Usage: %s [OPTIONS]\n" + " Options:\n" + " -r, --rxdrop Discard all incoming packets (default)\n" + " -t, --txonly Only send packets\n" + " -l, --l2fwd MAC swap L2 forwarding\n" + " -i, 
--interface=n Run on interface n\n" + " -q, --queue=n Use queue n (default 0)\n" + " -p, --poll Use poll syscall\n" + " -S, --xdp-skb=n Use XDP skb-mod\n" + " -N, --xdp-native=n Enforce XDP native mode\n" + " -n, --interval=n Specify statistics update interval (default 1 sec).\n" + " -z, --zero-copy Force zero-copy mode.\n" + " -c, --copy Force copy mode.\n" + " -m, --no-need-wakeup Turn off use of driver need wakeup flag.\n" + " -f, --frame-size=n Set the frame size (must be a power of two in aligned mode, default is %d).\n" + " -u, --unaligned Enable unaligned chunk placement\n" + " -M, --shared-umem Enable XDP_SHARED_UMEM\n" + " -F, --force Force loading the XDP prog\n" + " -d, --duration=n Duration in secs to run command.\n" + " Default: forever.\n" + " -b, --batch-size=n Batch size for sending or receiving\n" + " packets. Default: %d\n" + " -C, --tx-pkt-count=n Number of packets to send.\n" + " Default: Continuous packets.\n" + " -s, --tx-pkt-size=n Transmit packet size.\n" + " (Default: %d bytes)\n" + " Min size: %d, Max size %d.\n" + " -P, --tx-pkt-pattern=nPacket fill pattern. 
Default: 0x%x\n" + " -x, --extra-stats Display extra statistics.\n" + " -Q, --quiet Do not display any stats.\n" + " -a, --app-stats Display application (syscall) statistics.\n" + " -I, --irq-string Display driver interrupt statistics for interface associated with irq-string.\n" + "\n"; + fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE, + opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE, + XSK_UMEM__DEFAULT_FRAME_SIZE, opt_pkt_fill_pattern); + + exit(EXIT_FAILURE); +} + +static void parse_command_line(int argc, char **argv) +{ + int option_index, c; + + opterr = 0; + + for (;;) { + c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQaI:", + long_options, &option_index); + if (c == -1) + break; + + switch (c) { + case 'r': + opt_bench = BENCH_RXDROP; + break; + case 't': + opt_bench = BENCH_TXONLY; + break; + case 'l': + opt_bench = BENCH_L2FWD; + break; + case 'i': + opt_if = optarg; + break; + case 'q': + opt_queue = atoi(optarg); + break; + case 'p': + opt_poll = 1; + break; + case 'S': + opt_xdp_flags |= XDP_FLAGS_SKB_MODE; + opt_xdp_bind_flags |= XDP_COPY; + break; + case 'N': + /* default, set below */ + break; + case 'n': + opt_interval = atoi(optarg); + break; + case 'z': + opt_xdp_bind_flags |= XDP_ZEROCOPY; + break; + case 'c': + opt_xdp_bind_flags |= XDP_COPY; + break; + case 'u': + opt_umem_flags |= XDP_UMEM_UNALIGNED_CHUNK_FLAG; + opt_unaligned_chunks = 1; + opt_mmap_flags = MAP_HUGETLB; + break; + case 'F': + opt_xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; + break; + case 'f': + opt_xsk_frame_size = atoi(optarg); + break; + case 'm': + opt_need_wakeup = false; + opt_xdp_bind_flags &= ~XDP_USE_NEED_WAKEUP; + break; + case 'M': + opt_num_xsks = MAX_SOCKS; + break; + case 'd': + opt_duration = atoi(optarg); + opt_duration *= 1000000000; + break; + case 'b': + opt_batch_size = atoi(optarg); + break; + case 'C': + opt_pkt_count = atoi(optarg); + break; + case 's': + opt_pkt_size = atoi(optarg); + if (opt_pkt_size > (XSK_UMEM__DEFAULT_FRAME_SIZE) 
|| + opt_pkt_size < MIN_PKT_SIZE) { + fprintf(stderr, + "ERROR: Invalid frame size %d\n", + opt_pkt_size); + usage(basename(argv[0])); + } + break; + case 'P': + opt_pkt_fill_pattern = strtol(optarg, NULL, 16); + break; + case 'x': + opt_extra_stats = 1; + break; + case 'Q': + opt_quiet = 1; + break; + case 'a': + opt_app_stats = 1; + break; + case 'I': + opt_irq_str = optarg; + if (get_interrupt_number()) + irqs_at_init = get_irqs(); + if (irqs_at_init < 0) { + fprintf(stderr, "ERROR: Failed to get irqs for %s\n", opt_irq_str); + usage(basename(argv[0])); + } + + break; + default: + usage(basename(argv[0])); + } + } + + if (!(opt_xdp_flags & XDP_FLAGS_SKB_MODE)) + opt_xdp_flags |= XDP_FLAGS_DRV_MODE; + + opt_ifindex = if_nametoindex(opt_if); + if (!opt_ifindex) { + fprintf(stderr, "ERROR: interface \"%s\" does not exist\n", + opt_if); + usage(basename(argv[0])); + } + + if ((opt_xsk_frame_size & (opt_xsk_frame_size - 1)) && + !opt_unaligned_chunks) { + fprintf(stderr, "--frame-size=%d is not a power of two\n", + opt_xsk_frame_size); + usage(basename(argv[0])); + } +} + +static void kick_tx(struct xsk_socket_info *xsk) +{ + int ret; + + ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); + if (ret >= 0 || errno == ENOBUFS || errno == EAGAIN || + errno == EBUSY || errno == ENETDOWN) + return; + exit_with_error(errno); +} + +static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk, + struct pollfd *fds) +{ + struct xsk_umem_info *umem = xsk->umem; + u32 idx_cq = 0, idx_fq = 0; + unsigned int rcvd; + size_t ndescs; + + if (!xsk->outstanding_tx) + return; + + /* In copy mode, Tx is driven by a syscall so we need to use e.g. sendto() to + * really send the packets. In zero-copy mode we do not have to do this, since Tx + * is driven by the NAPI loop. So as an optimization, we do not have to call + * sendto() all the time in zero-copy mode for l2fwd. 
+ */ + if (opt_xdp_bind_flags & XDP_COPY) { + xsk->app_stats.copy_tx_sendtos++; + kick_tx(xsk); + } + + ndescs = (xsk->outstanding_tx > opt_batch_size) ? opt_batch_size : + xsk->outstanding_tx; + + /* re-add completed Tx buffers */ + rcvd = xsk_ring_cons__peek(&umem->cq, ndescs, &idx_cq); + if (rcvd > 0) { + unsigned int i; + int ret; + + ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); + while (ret != rcvd) { + if (ret < 0) + exit_with_error(-ret); + if (xsk_ring_prod__needs_wakeup(&umem->fq)) { + xsk->app_stats.fill_fail_polls++; + ret = poll(fds, num_socks, opt_timeout); + } + ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); + } + + for (i = 0; i < rcvd; i++) + *xsk_ring_prod__fill_addr(&umem->fq, idx_fq++) = + *xsk_ring_cons__comp_addr(&umem->cq, idx_cq++); + + xsk_ring_prod__submit(&xsk->umem->fq, rcvd); + xsk_ring_cons__release(&xsk->umem->cq, rcvd); + xsk->outstanding_tx -= rcvd; + xsk->ring_stats.tx_npkts += rcvd; + } +} + +static inline void complete_tx_only(struct xsk_socket_info *xsk, + int batch_size) +{ + unsigned int rcvd; + u32 idx; + + if (!xsk->outstanding_tx) + return; + + if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx)) { + xsk->app_stats.tx_wakeup_sendtos++; + kick_tx(xsk); + } + + rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx); + if (rcvd > 0) { + xsk_ring_cons__release(&xsk->umem->cq, rcvd); + xsk->outstanding_tx -= rcvd; + xsk->ring_stats.tx_npkts += rcvd; + } +} + +static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds) +{ + unsigned int rcvd, i; + u32 idx_rx = 0, idx_fq = 0; + int ret; + + rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx); + if (!rcvd) { + if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { + xsk->app_stats.rx_empty_polls++; + ret = poll(fds, num_socks, opt_timeout); + } + return; + } + + ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); + while (ret != rcvd) { + if (ret < 0) + exit_with_error(-ret); + if 
(xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { + xsk->app_stats.fill_fail_polls++; + ret = poll(fds, num_socks, opt_timeout); + } + ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); + } + + for (i = 0; i < rcvd; i++) { + u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr; + u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len; + u64 orig = xsk_umem__extract_addr(addr); + + addr = xsk_umem__add_offset_to_addr(addr); + char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr); + + hex_dump(pkt, len, addr); + *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = orig; + } + + xsk_ring_prod__submit(&xsk->umem->fq, rcvd); + xsk_ring_cons__release(&xsk->rx, rcvd); + xsk->ring_stats.rx_npkts += rcvd; +} + +static void rx_drop_all(void) +{ + struct pollfd fds[MAX_SOCKS] = {}; + int i, ret; + + for (i = 0; i < num_socks; i++) { + fds[i].fd = xsk_socket__fd(xsks[i]->xsk); + fds[i].events = POLLIN; + } + + for (;;) { + if (opt_poll) { + for (i = 0; i < num_socks; i++) + xsks[i]->app_stats.opt_polls++; + ret = poll(fds, num_socks, opt_timeout); + if (ret <= 0) + continue; + } + + for (i = 0; i < num_socks; i++) + rx_drop(xsks[i], fds); + + if (benchmark_done) + break; + } +} + +static void tx_only(struct xsk_socket_info *xsk, u32 *frame_nb, int batch_size) +{ + u32 idx; + unsigned int i; + + while (xsk_ring_prod__reserve(&xsk->tx, batch_size, &idx) < + batch_size) { + complete_tx_only(xsk, batch_size); + if (benchmark_done) + return; + } + + for (i = 0; i < batch_size; i++) { + struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, + idx + i); + tx_desc->addr = (*frame_nb + i) * opt_xsk_frame_size; + tx_desc->len = PKT_SIZE; + } + + xsk_ring_prod__submit(&xsk->tx, batch_size); + xsk->outstanding_tx += batch_size; + *frame_nb += batch_size; + *frame_nb %= NUM_FRAMES; + complete_tx_only(xsk, batch_size); +} + +static inline int get_batch_size(int pkt_cnt) +{ + if (!opt_pkt_count) + return opt_batch_size; + + if (pkt_cnt + opt_batch_size <= 
opt_pkt_count) + return opt_batch_size; + + return opt_pkt_count - pkt_cnt; +} + +/* Reap Tx completions on every socket, repeating until no socket has frames + * outstanding. + */ +static void complete_tx_only_all(void) +{ + bool pending; + int i; + + do { + pending = false; + for (i = 0; i < num_socks; i++) { + if (xsks[i]->outstanding_tx) { + complete_tx_only(xsks[i], opt_batch_size); + /* Accumulate: a socket that just drained must not + * clear the flag set by an earlier, still-busy one. + */ + pending |= !!xsks[i]->outstanding_tx; + } + } + } while (pending); +} + +/* Tx-only benchmark loop: queue one batch per socket per iteration until + * opt_pkt_count packets have been queued (forever when it is 0) or + * benchmark_done is set. For a bounded run, wait for the outstanding + * completions before returning. + */ +static void tx_only_all(void) +{ + struct pollfd fds[MAX_SOCKS] = {}; + u32 frame_nb[MAX_SOCKS] = {}; + int pkt_cnt = 0; + int i, ret; + + /* One pollfd entry per socket; poll() below is passed num_socks entries. */ + for (i = 0; i < num_socks; i++) { + fds[i].fd = xsk_socket__fd(xsks[i]->xsk); + fds[i].events = POLLOUT; + } + + while ((opt_pkt_count && pkt_cnt < opt_pkt_count) || !opt_pkt_count) { + int batch_size = get_batch_size(pkt_cnt); + + if (opt_poll) { + for (i = 0; i < num_socks; i++) + xsks[i]->app_stats.opt_polls++; + ret = poll(fds, num_socks, opt_timeout); + if (ret <= 0) + continue; + + /* NOTE(review): only the first socket's readiness gates the batch. */ + if (!(fds[0].revents & POLLOUT)) + continue; + } + + for (i = 0; i < num_socks; i++) + tx_only(xsks[i], &frame_nb[i], batch_size); + + pkt_cnt += batch_size; + + if (benchmark_done) + break; + } + + if (opt_pkt_count) + complete_tx_only_all(); +} + +static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds) +{ + unsigned int rcvd, i; + u32 idx_rx = 0, idx_tx = 0; + int ret; + + complete_tx_l2fwd(xsk, fds); + + rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx); + if (!rcvd) { + if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { + xsk->app_stats.rx_empty_polls++; + ret = poll(fds, num_socks, opt_timeout); + } + return; + } + + ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); + while (ret != rcvd) { + if (ret < 0) + exit_with_error(-ret); + complete_tx_l2fwd(xsk, fds); + if (xsk_ring_prod__needs_wakeup(&xsk->tx)) { + xsk->app_stats.tx_wakeup_sendtos++; + kick_tx(xsk); + } + ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); + } + + for (i = 0; i < rcvd; i++) { + u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr; + u32 len = 
xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len; + u64 orig = addr; + + addr = xsk_umem__add_offset_to_addr(addr); + char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr); + + swap_mac_addresses(pkt); + + hex_dump(pkt, len, addr); + xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = orig; + xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len; + } + + xsk_ring_prod__submit(&xsk->tx, rcvd); + xsk_ring_cons__release(&xsk->rx, rcvd); + + xsk->ring_stats.rx_npkts += rcvd; + xsk->outstanding_tx += rcvd; +} + +static void l2fwd_all(void) +{ + struct pollfd fds[MAX_SOCKS] = {}; + int i, ret; + + for (i = 0; i < num_socks; i++) { + fds[i].fd = xsk_socket__fd(xsks[i]->xsk); + fds[i].events = POLLOUT | POLLIN; + } + + for (;;) { + if (opt_poll) { + for (i = 0; i < num_socks; i++) + xsks[i]->app_stats.opt_polls++; + ret = poll(fds, num_socks, opt_timeout); + if (ret <= 0) + continue; + } + + for (i = 0; i < num_socks; i++) + l2fwd(xsks[i], fds); + + if (benchmark_done) + break; + } +} + +static void load_xdp_program(char **argv, struct bpf_object **obj) +{ + struct bpf_prog_load_attr prog_load_attr = { + .prog_type = BPF_PROG_TYPE_XDP, + }; + char xdp_filename[256]; + int prog_fd; + + snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv[0]); + prog_load_attr.file = xdp_filename; + + if (bpf_prog_load_xattr(&prog_load_attr, obj, &prog_fd)) + exit(EXIT_FAILURE); + if (prog_fd < 0) { + fprintf(stderr, "ERROR: no program found: %s\n", + strerror(prog_fd)); + exit(EXIT_FAILURE); + } + + if (bpf_set_link_xdp_fd(opt_ifindex, prog_fd, opt_xdp_flags) < 0) { + fprintf(stderr, "ERROR: link set xdp fd failed\n"); + exit(EXIT_FAILURE); + } +} + +static void enter_xsks_into_map(struct bpf_object *obj) +{ + struct bpf_map *map; + int i, xsks_map; + + map = bpf_object__find_map_by_name(obj, "xsks_map"); + xsks_map = bpf_map__fd(map); + if (xsks_map < 0) { + fprintf(stderr, "ERROR: no xsks map found: %s\n", + strerror(xsks_map)); + exit(EXIT_FAILURE); + } + + for (i = 0; i < 
num_socks; i++) { + int fd = xsk_socket__fd(xsks[i]->xsk); + int key, ret; + + key = i; + ret = bpf_map_update_elem(xsks_map, &key, &fd, 0); + if (ret) { + fprintf(stderr, "ERROR: bpf_map_update_elem %d\n", i); + exit(EXIT_FAILURE); + } + } +} + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + bool rx = false, tx = false; + struct xsk_umem_info *umem; + struct bpf_object *obj; + pthread_t pt; + int i, ret; + void *bufs; + + parse_command_line(argc, argv); + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + fprintf(stderr, "ERROR: setrlimit(RLIMIT_MEMLOCK) \"%s\"\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + + if (opt_num_xsks > 1) + load_xdp_program(argv, &obj); + + /* Reserve memory for the umem. Use hugepages if unaligned chunk mode */ + bufs = mmap(NULL, NUM_FRAMES * opt_xsk_frame_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | opt_mmap_flags, -1, 0); + if (bufs == MAP_FAILED) { + printf("ERROR: mmap failed\n"); + exit(EXIT_FAILURE); + } + + /* Create sockets... 
*/ + umem = xsk_configure_umem(bufs, NUM_FRAMES * opt_xsk_frame_size); + if (opt_bench == BENCH_RXDROP || opt_bench == BENCH_L2FWD) { + rx = true; + xsk_populate_fill_ring(umem); + } + if (opt_bench == BENCH_L2FWD || opt_bench == BENCH_TXONLY) + tx = true; + for (i = 0; i < opt_num_xsks; i++) + xsks[num_socks++] = xsk_configure_socket(umem, rx, tx); + + if (opt_bench == BENCH_TXONLY) { + gen_eth_hdr_data(); + + for (i = 0; i < NUM_FRAMES; i++) + gen_eth_frame(umem, i * opt_xsk_frame_size); + } + + if (opt_num_xsks > 1 && opt_bench != BENCH_TXONLY) + enter_xsks_into_map(obj); + + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + signal(SIGABRT, int_exit); + + setlocale(LC_ALL, ""); + + prev_time = get_nsecs(); + start_time = prev_time; + + if (!opt_quiet) { + ret = pthread_create(&pt, NULL, poller, NULL); + if (ret) + exit_with_error(ret); + } + + + if (opt_bench == BENCH_RXDROP) + rx_drop_all(); + else if (opt_bench == BENCH_TXONLY) + tx_only_all(); + else + l2fwd_all(); + + benchmark_done = true; + + if (!opt_quiet) + pthread_join(pt, NULL); + + xdpsock_cleanup(); + + munmap(bufs, NUM_FRAMES * opt_xsk_frame_size); + + return 0; +} diff --git a/samples/bpf/xsk_fwd.c b/samples/bpf/xsk_fwd.c new file mode 100644 index 000000000..1cd97c84c --- /dev/null +++ b/samples/bpf/xsk_fwd.c @@ -0,0 +1,1085 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2020 Intel Corporation. 
*/ + +#define _GNU_SOURCE +#include <poll.h> +#include <pthread.h> +#include <signal.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/resource.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <time.h> +#include <unistd.h> +#include <getopt.h> +#include <netinet/ether.h> +#include <net/if.h> + +#include <linux/bpf.h> +#include <linux/if_link.h> +#include <linux/if_xdp.h> + +#include <bpf/libbpf.h> +#include <bpf/xsk.h> +#include <bpf/bpf.h> + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +typedef __u64 u64; +typedef __u32 u32; +typedef __u16 u16; +typedef __u8 u8; + +/* This program illustrates the packet forwarding between multiple AF_XDP + * sockets in multi-threaded environment. All threads are sharing a common + * buffer pool, with each socket having its own private buffer cache. + * + * Example 1: Single thread handling two sockets. The packets received by socket + * A (interface IFA, queue QA) are forwarded to socket B (interface IFB, queue + * QB), while the packets received by socket B are forwarded to socket A. The + * thread is running on CPU core X: + * + * ./xsk_fwd -i IFA -q QA -i IFB -q QB -c X + * + * Example 2: Two threads, each handling two sockets. The thread running on CPU + * core X forwards all the packets received by socket A to socket B, and all the + * packets received by socket B to socket A. The thread running on CPU core Y is + * performing the same packet forwarding between sockets C and D: + * + * ./xsk_fwd -i IFA -q QA -i IFB -q QB -i IFC -q QC -i IFD -q QD + * -c CX -c CY + */ + +/* + * Buffer pool and buffer cache + * + * For packet forwarding, the packet buffers are typically allocated from the + * pool for packet reception and freed back to the pool for further reuse once + * the packet transmission is completed. + * + * The buffer pool is shared between multiple threads. 
In order to minimize the + * access latency to the shared buffer pool, each thread creates one (or + * several) buffer caches, which, unlike the buffer pool, are private to the + * thread that creates them and therefore cannot be shared with other threads. + * The access to the shared pool is only needed either (A) when the cache gets + * empty due to repeated buffer allocations and it needs to be replenished from + * the pool, or (B) when the cache gets full due to repeated buffer free and it + * needs to be flushed back to the pool. + * + * In a packet forwarding system, a packet received on any input port can + * potentially be transmitted on any output port, depending on the forwarding + * configuration. For AF_XDP sockets, for this to work with zero-copy of the + * packet buffers, it is required that the buffer pool memory fits into the + * UMEM area shared by all the sockets. + */ + +struct bpool_params { + u32 n_buffers; + u32 buffer_size; + int mmap_flags; + + u32 n_users_max; + u32 n_buffers_per_slab; +}; + +/* This buffer pool implementation organizes the buffers into equally sized + * slabs of *n_buffers_per_slab*. Initially, there are *n_slabs* slabs in the + * pool that are completely filled with buffer pointers (full slabs). + * + * Each buffer cache has a slab for buffer allocation and a slab for buffer + * free, with both of these slabs initially empty. When the cache's allocation + * slab goes empty, it is swapped with one of the available full slabs from the + * pool, if any is available. When the cache's free slab goes full, it is + * swapped for one of the empty slabs from the pool, which is guaranteed to + * succeed. + * + * Partially filled slabs never get traded between the cache and the pool + * (except when the cache itself is destroyed), which enables fast operation + * through pointer swapping. 
+ */ +struct bpool { + struct bpool_params params; + pthread_mutex_t lock; + void *addr; + + u64 **slabs; + u64 **slabs_reserved; + u64 *buffers; + u64 *buffers_reserved; + + u64 n_slabs; + u64 n_slabs_reserved; + u64 n_buffers; + + u64 n_slabs_available; + u64 n_slabs_reserved_available; + + struct xsk_umem_config umem_cfg; + struct xsk_ring_prod umem_fq; + struct xsk_ring_cons umem_cq; + struct xsk_umem *umem; +}; + +static struct bpool * +bpool_init(struct bpool_params *params, + struct xsk_umem_config *umem_cfg) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + u64 n_slabs, n_slabs_reserved, n_buffers, n_buffers_reserved; + u64 slabs_size, slabs_reserved_size; + u64 buffers_size, buffers_reserved_size; + u64 total_size, i; + struct bpool *bp; + u8 *p; + int status; + + /* mmap prep. */ + if (setrlimit(RLIMIT_MEMLOCK, &r)) + return NULL; + + /* bpool internals dimensioning. */ + n_slabs = (params->n_buffers + params->n_buffers_per_slab - 1) / + params->n_buffers_per_slab; + n_slabs_reserved = params->n_users_max * 2; + n_buffers = n_slabs * params->n_buffers_per_slab; + n_buffers_reserved = n_slabs_reserved * params->n_buffers_per_slab; + + slabs_size = n_slabs * sizeof(u64 *); + slabs_reserved_size = n_slabs_reserved * sizeof(u64 *); + buffers_size = n_buffers * sizeof(u64); + buffers_reserved_size = n_buffers_reserved * sizeof(u64); + + total_size = sizeof(struct bpool) + + slabs_size + slabs_reserved_size + + buffers_size + buffers_reserved_size; + + /* bpool memory allocation. */ + p = calloc(total_size, sizeof(u8)); + if (!p) + return NULL; + + /* bpool memory initialization. 
*/ + bp = (struct bpool *)p; + memcpy(&bp->params, params, sizeof(*params)); + bp->params.n_buffers = n_buffers; + + bp->slabs = (u64 **)&p[sizeof(struct bpool)]; + bp->slabs_reserved = (u64 **)&p[sizeof(struct bpool) + + slabs_size]; + bp->buffers = (u64 *)&p[sizeof(struct bpool) + + slabs_size + slabs_reserved_size]; + bp->buffers_reserved = (u64 *)&p[sizeof(struct bpool) + + slabs_size + slabs_reserved_size + buffers_size]; + + bp->n_slabs = n_slabs; + bp->n_slabs_reserved = n_slabs_reserved; + bp->n_buffers = n_buffers; + + for (i = 0; i < n_slabs; i++) + bp->slabs[i] = &bp->buffers[i * params->n_buffers_per_slab]; + bp->n_slabs_available = n_slabs; + + for (i = 0; i < n_slabs_reserved; i++) + bp->slabs_reserved[i] = &bp->buffers_reserved[i * + params->n_buffers_per_slab]; + bp->n_slabs_reserved_available = n_slabs_reserved; + + for (i = 0; i < n_buffers; i++) + bp->buffers[i] = i * params->buffer_size; + + /* lock. */ + status = pthread_mutex_init(&bp->lock, NULL); + if (status) { + free(p); + return NULL; + } + + /* mmap. */ + bp->addr = mmap(NULL, + n_buffers * params->buffer_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | params->mmap_flags, + -1, + 0); + if (bp->addr == MAP_FAILED) { + pthread_mutex_destroy(&bp->lock); + free(p); + return NULL; + } + + /* umem. 
*/ + status = xsk_umem__create(&bp->umem, + bp->addr, + bp->params.n_buffers * bp->params.buffer_size, + &bp->umem_fq, + &bp->umem_cq, + umem_cfg); + if (status) { + munmap(bp->addr, bp->params.n_buffers * bp->params.buffer_size); + pthread_mutex_destroy(&bp->lock); + free(p); + return NULL; + } + memcpy(&bp->umem_cfg, umem_cfg, sizeof(*umem_cfg)); + + return bp; +} + +static void +bpool_free(struct bpool *bp) +{ + if (!bp) + return; + + xsk_umem__delete(bp->umem); + munmap(bp->addr, bp->params.n_buffers * bp->params.buffer_size); + pthread_mutex_destroy(&bp->lock); + free(bp); +} + +struct bcache { + struct bpool *bp; + + u64 *slab_cons; + u64 *slab_prod; + + u64 n_buffers_cons; + u64 n_buffers_prod; +}; + +static u32 +bcache_slab_size(struct bcache *bc) +{ + struct bpool *bp = bc->bp; + + return bp->params.n_buffers_per_slab; +} + +static struct bcache * +bcache_init(struct bpool *bp) +{ + struct bcache *bc; + + bc = calloc(1, sizeof(struct bcache)); + if (!bc) + return NULL; + + bc->bp = bp; + bc->n_buffers_cons = 0; + bc->n_buffers_prod = 0; + + pthread_mutex_lock(&bp->lock); + if (bp->n_slabs_reserved_available == 0) { + pthread_mutex_unlock(&bp->lock); + free(bc); + return NULL; + } + + bc->slab_cons = bp->slabs_reserved[bp->n_slabs_reserved_available - 1]; + bc->slab_prod = bp->slabs_reserved[bp->n_slabs_reserved_available - 2]; + bp->n_slabs_reserved_available -= 2; + pthread_mutex_unlock(&bp->lock); + + return bc; +} + +static void +bcache_free(struct bcache *bc) +{ + struct bpool *bp; + + if (!bc) + return; + + /* In order to keep this example simple, the case of freeing any + * existing buffers from the cache back to the pool is ignored. 
+ */ + + bp = bc->bp; + pthread_mutex_lock(&bp->lock); + bp->slabs_reserved[bp->n_slabs_reserved_available] = bc->slab_prod; + bp->slabs_reserved[bp->n_slabs_reserved_available + 1] = bc->slab_cons; + bp->n_slabs_reserved_available += 2; + pthread_mutex_unlock(&bp->lock); + + free(bc); +} + +/* To work correctly, the implementation requires that the *n_buffers* input + * argument is never greater than the buffer pool's *n_buffers_per_slab*. This + * is typically the case, with one exception taking place when a large number of + * buffers are allocated at init time (e.g. for the UMEM fill queue setup). + */ +static inline u32 +bcache_cons_check(struct bcache *bc, u32 n_buffers) +{ + struct bpool *bp = bc->bp; + u64 n_buffers_per_slab = bp->params.n_buffers_per_slab; + u64 n_buffers_cons = bc->n_buffers_cons; + u64 n_slabs_available; + u64 *slab_full; + + /* + * Consumer slab is not empty: Use what's available locally. Do not + * look for more buffers from the pool when the ask can only be + * partially satisfied. + */ + if (n_buffers_cons) + return (n_buffers_cons < n_buffers) ? + n_buffers_cons : + n_buffers; + + /* + * Consumer slab is empty: look to trade the current consumer slab + * (empty) for a full slab from the pool, if any is available. 
+ */ + pthread_mutex_lock(&bp->lock); + n_slabs_available = bp->n_slabs_available; + if (!n_slabs_available) { + pthread_mutex_unlock(&bp->lock); + return 0; + } + + n_slabs_available--; + slab_full = bp->slabs[n_slabs_available]; + bp->slabs[n_slabs_available] = bc->slab_cons; + bp->n_slabs_available = n_slabs_available; + pthread_mutex_unlock(&bp->lock); + + bc->slab_cons = slab_full; + bc->n_buffers_cons = n_buffers_per_slab; + return n_buffers; +} + +static inline u64 +bcache_cons(struct bcache *bc) +{ + u64 n_buffers_cons = bc->n_buffers_cons - 1; + u64 buffer; + + buffer = bc->slab_cons[n_buffers_cons]; + bc->n_buffers_cons = n_buffers_cons; + return buffer; +} + +static inline void +bcache_prod(struct bcache *bc, u64 buffer) +{ + struct bpool *bp = bc->bp; + u64 n_buffers_per_slab = bp->params.n_buffers_per_slab; + u64 n_buffers_prod = bc->n_buffers_prod; + u64 n_slabs_available; + u64 *slab_empty; + + /* + * Producer slab is not yet full: store the current buffer to it. + */ + if (n_buffers_prod < n_buffers_per_slab) { + bc->slab_prod[n_buffers_prod] = buffer; + bc->n_buffers_prod = n_buffers_prod + 1; + return; + } + + /* + * Producer slab is full: trade the cache's current producer slab + * (full) for an empty slab from the pool, then store the current + * buffer to the new producer slab. As one full slab exists in the + * cache, it is guaranteed that there is at least one empty slab + * available in the pool. + */ + pthread_mutex_lock(&bp->lock); + n_slabs_available = bp->n_slabs_available; + slab_empty = bp->slabs[n_slabs_available]; + bp->slabs[n_slabs_available] = bc->slab_prod; + bp->n_slabs_available = n_slabs_available + 1; + pthread_mutex_unlock(&bp->lock); + + slab_empty[0] = buffer; + bc->slab_prod = slab_empty; + bc->n_buffers_prod = 1; +} + +/* + * Port + * + * Each of the forwarding ports sits on top of an AF_XDP socket. 
In order for + * packet forwarding to happen with no packet buffer copy, all the sockets need + * to share the same UMEM area, which is used as the buffer pool memory. + */ +#ifndef MAX_BURST_RX +#define MAX_BURST_RX 64 +#endif + +#ifndef MAX_BURST_TX +#define MAX_BURST_TX 64 +#endif + +struct burst_rx { + u64 addr[MAX_BURST_RX]; + u32 len[MAX_BURST_RX]; +}; + +struct burst_tx { + u64 addr[MAX_BURST_TX]; + u32 len[MAX_BURST_TX]; + u32 n_pkts; +}; + +struct port_params { + struct xsk_socket_config xsk_cfg; + struct bpool *bp; + const char *iface; + u32 iface_queue; +}; + +struct port { + struct port_params params; + + struct bcache *bc; + + struct xsk_ring_cons rxq; + struct xsk_ring_prod txq; + struct xsk_ring_prod umem_fq; + struct xsk_ring_cons umem_cq; + struct xsk_socket *xsk; + int umem_fq_initialized; + + u64 n_pkts_rx; + u64 n_pkts_tx; +}; + +static void +port_free(struct port *p) +{ + if (!p) + return; + + /* To keep this example simple, the code to free the buffers from the + * socket's receive and transmit queues, as well as from the UMEM fill + * and completion queues, is not included. + */ + + if (p->xsk) + xsk_socket__delete(p->xsk); + + bcache_free(p->bc); + + free(p); +} + +static struct port * +port_init(struct port_params *params) +{ + struct port *p; + u32 umem_fq_size, pos = 0; + int status, i; + + /* Memory allocation and initialization. */ + p = calloc(sizeof(struct port), 1); + if (!p) + return NULL; + + memcpy(&p->params, params, sizeof(p->params)); + umem_fq_size = params->bp->umem_cfg.fill_size; + + /* bcache. */ + p->bc = bcache_init(params->bp); + if (!p->bc || + (bcache_slab_size(p->bc) < umem_fq_size) || + (bcache_cons_check(p->bc, umem_fq_size) < umem_fq_size)) { + port_free(p); + return NULL; + } + + /* xsk socket. 
*/ + status = xsk_socket__create_shared(&p->xsk, + params->iface, + params->iface_queue, + params->bp->umem, + &p->rxq, + &p->txq, + &p->umem_fq, + &p->umem_cq, + &params->xsk_cfg); + if (status) { + port_free(p); + return NULL; + } + + /* umem fq. */ + xsk_ring_prod__reserve(&p->umem_fq, umem_fq_size, &pos); + + for (i = 0; i < umem_fq_size; i++) + *xsk_ring_prod__fill_addr(&p->umem_fq, pos + i) = + bcache_cons(p->bc); + + xsk_ring_prod__submit(&p->umem_fq, umem_fq_size); + p->umem_fq_initialized = 1; + + return p; +} + +static inline u32 +port_rx_burst(struct port *p, struct burst_rx *b) +{ + u32 n_pkts, pos, i; + + /* Free buffers for FQ replenish. */ + n_pkts = ARRAY_SIZE(b->addr); + + n_pkts = bcache_cons_check(p->bc, n_pkts); + if (!n_pkts) + return 0; + + /* RXQ. */ + n_pkts = xsk_ring_cons__peek(&p->rxq, n_pkts, &pos); + if (!n_pkts) { + if (xsk_ring_prod__needs_wakeup(&p->umem_fq)) { + struct pollfd pollfd = { + .fd = xsk_socket__fd(p->xsk), + .events = POLLIN, + }; + + poll(&pollfd, 1, 0); + } + return 0; + } + + for (i = 0; i < n_pkts; i++) { + b->addr[i] = xsk_ring_cons__rx_desc(&p->rxq, pos + i)->addr; + b->len[i] = xsk_ring_cons__rx_desc(&p->rxq, pos + i)->len; + } + + xsk_ring_cons__release(&p->rxq, n_pkts); + p->n_pkts_rx += n_pkts; + + /* UMEM FQ. */ + for ( ; ; ) { + int status; + + status = xsk_ring_prod__reserve(&p->umem_fq, n_pkts, &pos); + if (status == n_pkts) + break; + + if (xsk_ring_prod__needs_wakeup(&p->umem_fq)) { + struct pollfd pollfd = { + .fd = xsk_socket__fd(p->xsk), + .events = POLLIN, + }; + + poll(&pollfd, 1, 0); + } + } + + for (i = 0; i < n_pkts; i++) + *xsk_ring_prod__fill_addr(&p->umem_fq, pos + i) = + bcache_cons(p->bc); + + xsk_ring_prod__submit(&p->umem_fq, n_pkts); + + return n_pkts; +} + +static inline void +port_tx_burst(struct port *p, struct burst_tx *b) +{ + u32 n_pkts, pos, i; + int status; + + /* UMEM CQ. 
*/ + n_pkts = p->params.bp->umem_cfg.comp_size; + + n_pkts = xsk_ring_cons__peek(&p->umem_cq, n_pkts, &pos); + + for (i = 0; i < n_pkts; i++) { + u64 addr = *xsk_ring_cons__comp_addr(&p->umem_cq, pos + i); + + bcache_prod(p->bc, addr); + } + + xsk_ring_cons__release(&p->umem_cq, n_pkts); + + /* TXQ. */ + n_pkts = b->n_pkts; + + for ( ; ; ) { + status = xsk_ring_prod__reserve(&p->txq, n_pkts, &pos); + if (status == n_pkts) + break; + + if (xsk_ring_prod__needs_wakeup(&p->txq)) + sendto(xsk_socket__fd(p->xsk), NULL, 0, MSG_DONTWAIT, + NULL, 0); + } + + for (i = 0; i < n_pkts; i++) { + xsk_ring_prod__tx_desc(&p->txq, pos + i)->addr = b->addr[i]; + xsk_ring_prod__tx_desc(&p->txq, pos + i)->len = b->len[i]; + } + + xsk_ring_prod__submit(&p->txq, n_pkts); + if (xsk_ring_prod__needs_wakeup(&p->txq)) + sendto(xsk_socket__fd(p->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); + p->n_pkts_tx += n_pkts; +} + +/* + * Thread + * + * Packet forwarding threads. + */ +#ifndef MAX_PORTS_PER_THREAD +#define MAX_PORTS_PER_THREAD 16 +#endif + +struct thread_data { + struct port *ports_rx[MAX_PORTS_PER_THREAD]; + struct port *ports_tx[MAX_PORTS_PER_THREAD]; + u32 n_ports_rx; + struct burst_rx burst_rx; + struct burst_tx burst_tx[MAX_PORTS_PER_THREAD]; + u32 cpu_core_id; + int quit; +}; + +static void swap_mac_addresses(void *data) +{ + struct ether_header *eth = (struct ether_header *)data; + struct ether_addr *src_addr = (struct ether_addr *)ð->ether_shost; + struct ether_addr *dst_addr = (struct ether_addr *)ð->ether_dhost; + struct ether_addr tmp; + + tmp = *src_addr; + *src_addr = *dst_addr; + *dst_addr = tmp; +} + +static void * +thread_func(void *arg) +{ + struct thread_data *t = arg; + cpu_set_t cpu_cores; + u32 i; + + CPU_ZERO(&cpu_cores); + CPU_SET(t->cpu_core_id, &cpu_cores); + pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpu_cores); + + for (i = 0; !t->quit; i = (i + 1) & (t->n_ports_rx - 1)) { + struct port *port_rx = t->ports_rx[i]; + struct port *port_tx = 
t->ports_tx[i]; + struct burst_rx *brx = &t->burst_rx; + struct burst_tx *btx = &t->burst_tx[i]; + u32 n_pkts, j; + + /* RX. */ + n_pkts = port_rx_burst(port_rx, brx); + if (!n_pkts) + continue; + + /* Process & TX. */ + for (j = 0; j < n_pkts; j++) { + u64 addr = xsk_umem__add_offset_to_addr(brx->addr[j]); + u8 *pkt = xsk_umem__get_data(port_rx->params.bp->addr, + addr); + + swap_mac_addresses(pkt); + + btx->addr[btx->n_pkts] = brx->addr[j]; + btx->len[btx->n_pkts] = brx->len[j]; + btx->n_pkts++; + + if (btx->n_pkts == MAX_BURST_TX) { + port_tx_burst(port_tx, btx); + btx->n_pkts = 0; + } + } + } + + return NULL; +} + +/* + * Process + */ +static const struct bpool_params bpool_params_default = { + .n_buffers = 64 * 1024, + .buffer_size = XSK_UMEM__DEFAULT_FRAME_SIZE, + .mmap_flags = 0, + + .n_users_max = 16, + .n_buffers_per_slab = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2, +}; + +static const struct xsk_umem_config umem_cfg_default = { + .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2, + .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, + .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE, + .frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM, + .flags = 0, +}; + +static const struct port_params port_params_default = { + .xsk_cfg = { + .rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, + .tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, + .libbpf_flags = 0, + .xdp_flags = XDP_FLAGS_DRV_MODE, + .bind_flags = XDP_USE_NEED_WAKEUP | XDP_ZEROCOPY, + }, + + .bp = NULL, + .iface = NULL, + .iface_queue = 0, +}; + +#ifndef MAX_PORTS +#define MAX_PORTS 64 +#endif + +#ifndef MAX_THREADS +#define MAX_THREADS 64 +#endif + +static struct bpool_params bpool_params; +static struct xsk_umem_config umem_cfg; +static struct bpool *bp; + +static struct port_params port_params[MAX_PORTS]; +static struct port *ports[MAX_PORTS]; +static u64 n_pkts_rx[MAX_PORTS]; +static u64 n_pkts_tx[MAX_PORTS]; +static int n_ports; + +static pthread_t threads[MAX_THREADS]; +static struct thread_data thread_data[MAX_THREADS]; 
+static int n_threads; + +static void +print_usage(char *prog_name) +{ + const char *usage = + "Usage:\n" + "\t%s [ -b SIZE ] -c CORE -i INTERFACE [ -q QUEUE ]\n" + "\n" + "-c CORE CPU core to run a packet forwarding thread\n" + " on. May be invoked multiple times.\n" + "\n" + "-b SIZE Number of buffers in the buffer pool shared\n" + " by all the forwarding threads. Default: %u.\n" + "\n" + "-i INTERFACE Network interface. Each (INTERFACE, QUEUE)\n" + " pair specifies one forwarding port. May be\n" + " invoked multiple times.\n" + "\n" + "-q QUEUE Network interface queue for RX and TX. Each\n" + " (INTERFACE, QUEUE) pair specified one\n" + " forwarding port. Default: %u. May be invoked\n" + " multiple times.\n" + "\n"; + printf(usage, + prog_name, + bpool_params_default.n_buffers, + port_params_default.iface_queue); +} + +static int +parse_args(int argc, char **argv) +{ + struct option lgopts[] = { + { NULL, 0, 0, 0 } + }; + int opt, option_index; + + /* Parse the input arguments. */ + for ( ; ;) { + opt = getopt_long(argc, argv, "c:i:q:", lgopts, &option_index); + if (opt == EOF) + break; + + switch (opt) { + case 'b': + bpool_params.n_buffers = atoi(optarg); + break; + + case 'c': + if (n_threads == MAX_THREADS) { + printf("Max number of threads (%d) reached.\n", + MAX_THREADS); + return -1; + } + + thread_data[n_threads].cpu_core_id = atoi(optarg); + n_threads++; + break; + + case 'i': + if (n_ports == MAX_PORTS) { + printf("Max number of ports (%d) reached.\n", + MAX_PORTS); + return -1; + } + + port_params[n_ports].iface = optarg; + port_params[n_ports].iface_queue = 0; + n_ports++; + break; + + case 'q': + if (n_ports == 0) { + printf("No port specified for queue.\n"); + return -1; + } + port_params[n_ports - 1].iface_queue = atoi(optarg); + break; + + default: + printf("Illegal argument.\n"); + return -1; + } + } + + optind = 1; /* reset getopt lib */ + + /* Check the input arguments. 
*/ + if (!n_ports) { + printf("No ports specified.\n"); + return -1; + } + + if (!n_threads) { + printf("No threads specified.\n"); + return -1; + } + + if (n_ports % n_threads) { + printf("Ports cannot be evenly distributed to threads.\n"); + return -1; + } + + return 0; +} + +static void +print_port(u32 port_id) +{ + struct port *port = ports[port_id]; + + printf("Port %u: interface = %s, queue = %u\n", + port_id, port->params.iface, port->params.iface_queue); +} + +static void +print_thread(u32 thread_id) +{ + struct thread_data *t = &thread_data[thread_id]; + u32 i; + + printf("Thread %u (CPU core %u): ", + thread_id, t->cpu_core_id); + + for (i = 0; i < t->n_ports_rx; i++) { + struct port *port_rx = t->ports_rx[i]; + struct port *port_tx = t->ports_tx[i]; + + printf("(%s, %u) -> (%s, %u), ", + port_rx->params.iface, + port_rx->params.iface_queue, + port_tx->params.iface, + port_tx->params.iface_queue); + } + + printf("\n"); +} + +static void +print_port_stats_separator(void) +{ + printf("+-%4s-+-%12s-+-%13s-+-%12s-+-%13s-+\n", + "----", + "------------", + "-------------", + "------------", + "-------------"); +} + +static void +print_port_stats_header(void) +{ + print_port_stats_separator(); + printf("| %4s | %12s | %13s | %12s | %13s |\n", + "Port", + "RX packets", + "RX rate (pps)", + "TX packets", + "TX_rate (pps)"); + print_port_stats_separator(); +} + +static void +print_port_stats_trailer(void) +{ + print_port_stats_separator(); + printf("\n"); +} + +static void +print_port_stats(int port_id, u64 ns_diff) +{ + struct port *p = ports[port_id]; + double rx_pps, tx_pps; + + rx_pps = (p->n_pkts_rx - n_pkts_rx[port_id]) * 1000000000. / ns_diff; + tx_pps = (p->n_pkts_tx - n_pkts_tx[port_id]) * 1000000000. 
/ ns_diff; + + printf("| %4d | %12llu | %13.0f | %12llu | %13.0f |\n", + port_id, + p->n_pkts_rx, + rx_pps, + p->n_pkts_tx, + tx_pps); + + n_pkts_rx[port_id] = p->n_pkts_rx; + n_pkts_tx[port_id] = p->n_pkts_tx; +} + +static void +print_port_stats_all(u64 ns_diff) +{ + int i; + + print_port_stats_header(); + for (i = 0; i < n_ports; i++) + print_port_stats(i, ns_diff); + print_port_stats_trailer(); +} + +static int quit; + +static void +signal_handler(int sig) +{ + quit = 1; +} + +static void remove_xdp_program(void) +{ + int i; + + for (i = 0 ; i < n_ports; i++) + bpf_set_link_xdp_fd(if_nametoindex(port_params[i].iface), -1, + port_params[i].xsk_cfg.xdp_flags); +} + +int main(int argc, char **argv) +{ + struct timespec time; + u64 ns0; + int i; + + /* Parse args. */ + memcpy(&bpool_params, &bpool_params_default, + sizeof(struct bpool_params)); + memcpy(&umem_cfg, &umem_cfg_default, + sizeof(struct xsk_umem_config)); + for (i = 0; i < MAX_PORTS; i++) + memcpy(&port_params[i], &port_params_default, + sizeof(struct port_params)); + + if (parse_args(argc, argv)) { + print_usage(argv[0]); + return -1; + } + + /* Buffer pool initialization. */ + bp = bpool_init(&bpool_params, &umem_cfg); + if (!bp) { + printf("Buffer pool initialization failed.\n"); + return -1; + } + printf("Buffer pool created successfully.\n"); + + /* Ports initialization. */ + for (i = 0; i < MAX_PORTS; i++) + port_params[i].bp = bp; + + for (i = 0; i < n_ports; i++) { + ports[i] = port_init(&port_params[i]); + if (!ports[i]) { + printf("Port %d initialization failed.\n", i); + return -1; + } + print_port(i); + } + printf("All ports created successfully.\n"); + + /* Threads. 
*/ + for (i = 0; i < n_threads; i++) { + struct thread_data *t = &thread_data[i]; + u32 n_ports_per_thread = n_ports / n_threads, j; + + for (j = 0; j < n_ports_per_thread; j++) { + t->ports_rx[j] = ports[i * n_ports_per_thread + j]; + t->ports_tx[j] = ports[i * n_ports_per_thread + + (j + 1) % n_ports_per_thread]; + } + + t->n_ports_rx = n_ports_per_thread; + + print_thread(i); + } + + for (i = 0; i < n_threads; i++) { + int status; + + status = pthread_create(&threads[i], + NULL, + thread_func, + &thread_data[i]); + if (status) { + printf("Thread %d creation failed.\n", i); + return -1; + } + } + printf("All threads created successfully.\n"); + + /* Print statistics. */ + signal(SIGINT, signal_handler); + signal(SIGTERM, signal_handler); + signal(SIGABRT, signal_handler); + + clock_gettime(CLOCK_MONOTONIC, &time); + ns0 = time.tv_sec * 1000000000UL + time.tv_nsec; + for ( ; !quit; ) { + u64 ns1, ns_diff; + + sleep(1); + clock_gettime(CLOCK_MONOTONIC, &time); + ns1 = time.tv_sec * 1000000000UL + time.tv_nsec; + ns_diff = ns1 - ns0; + ns0 = ns1; + + print_port_stats_all(ns_diff); + } + + /* Threads completion. 
*/ + printf("Quit.\n"); + for (i = 0; i < n_threads; i++) + thread_data[i].quit = 1; + + for (i = 0; i < n_threads; i++) + pthread_join(threads[i], NULL); + + for (i = 0; i < n_ports; i++) + port_free(ports[i]); + + bpool_free(bp); + + remove_xdp_program(); + + return 0; +} diff --git a/samples/configfs/Makefile b/samples/configfs/Makefile new file mode 100644 index 000000000..92d661fcb --- /dev/null +++ b/samples/configfs/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_SAMPLE_CONFIGFS) += configfs_sample.o diff --git a/samples/configfs/configfs_sample.c b/samples/configfs/configfs_sample.c new file mode 100644 index 000000000..f9008be7a --- /dev/null +++ b/samples/configfs/configfs_sample.c @@ -0,0 +1,369 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * vim: noexpandtab ts=8 sts=0 sw=8: + * + * configfs_example_macros.c - This file is a demonstration module + * containing a number of configfs subsystems. It uses the helper + * macros defined by configfs.h + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + */ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/configfs.h> + +/* + * 01-childless + * + * This first example is a childless subsystem. It cannot create + * any config_items. It just has attributes. + * + * Note that we are enclosing the configfs_subsystem inside a container. + * This is not necessary if a subsystem has no attributes directly + * on the subsystem. See the next example, 02-simple-children, for + * such a subsystem. 
+ */ + +struct childless { + struct configfs_subsystem subsys; + int showme; + int storeme; +}; + +static inline struct childless *to_childless(struct config_item *item) +{ + return container_of(to_configfs_subsystem(to_config_group(item)), + struct childless, subsys); +} + +static ssize_t childless_showme_show(struct config_item *item, char *page) +{ + struct childless *childless = to_childless(item); + ssize_t pos; + + pos = sprintf(page, "%d\n", childless->showme); + childless->showme++; + + return pos; +} + +static ssize_t childless_storeme_show(struct config_item *item, char *page) +{ + return sprintf(page, "%d\n", to_childless(item)->storeme); +} + +static ssize_t childless_storeme_store(struct config_item *item, + const char *page, size_t count) +{ + struct childless *childless = to_childless(item); + int ret; + + ret = kstrtoint(page, 10, &childless->storeme); + if (ret) + return ret; + + return count; +} + +static ssize_t childless_description_show(struct config_item *item, char *page) +{ + return sprintf(page, +"[01-childless]\n" +"\n" +"The childless subsystem is the simplest possible subsystem in\n" +"configfs. It does not support the creation of child config_items.\n" +"It only has a few attributes. 
In fact, it isn't much different\n" +"than a directory in /proc.\n"); +} + +CONFIGFS_ATTR_RO(childless_, showme); +CONFIGFS_ATTR(childless_, storeme); +CONFIGFS_ATTR_RO(childless_, description); + +static struct configfs_attribute *childless_attrs[] = { + &childless_attr_showme, + &childless_attr_storeme, + &childless_attr_description, + NULL, +}; + +static const struct config_item_type childless_type = { + .ct_attrs = childless_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct childless childless_subsys = { + .subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "01-childless", + .ci_type = &childless_type, + }, + }, + }, +}; + +/* ----------------------------------------------------------------- */ + +/* + * 02-simple-children + * + * This example merely has a simple one-attribute child. Note that + * there is no extra attribute structure, as the child's attribute is + * known from the get-go. Also, there is no container for the + * subsystem, as it has no attributes of its own. 
+ */ + +struct simple_child { + struct config_item item; + int storeme; +}; + +static inline struct simple_child *to_simple_child(struct config_item *item) +{ + return container_of(item, struct simple_child, item); +} + +static ssize_t simple_child_storeme_show(struct config_item *item, char *page) +{ + return sprintf(page, "%d\n", to_simple_child(item)->storeme); +} + +static ssize_t simple_child_storeme_store(struct config_item *item, + const char *page, size_t count) +{ + struct simple_child *simple_child = to_simple_child(item); + int ret; + + ret = kstrtoint(page, 10, &simple_child->storeme); + if (ret) + return ret; + + return count; +} + +CONFIGFS_ATTR(simple_child_, storeme); + +static struct configfs_attribute *simple_child_attrs[] = { + &simple_child_attr_storeme, + NULL, +}; + +static void simple_child_release(struct config_item *item) +{ + kfree(to_simple_child(item)); +} + +static struct configfs_item_operations simple_child_item_ops = { + .release = simple_child_release, +}; + +static const struct config_item_type simple_child_type = { + .ct_item_ops = &simple_child_item_ops, + .ct_attrs = simple_child_attrs, + .ct_owner = THIS_MODULE, +}; + +struct simple_children { + struct config_group group; +}; + +static inline struct simple_children *to_simple_children(struct config_item *item) +{ + return container_of(to_config_group(item), + struct simple_children, group); +} + +static struct config_item *simple_children_make_item(struct config_group *group, + const char *name) +{ + struct simple_child *simple_child; + + simple_child = kzalloc(sizeof(struct simple_child), GFP_KERNEL); + if (!simple_child) + return ERR_PTR(-ENOMEM); + + config_item_init_type_name(&simple_child->item, name, + &simple_child_type); + + return &simple_child->item; +} + +static ssize_t simple_children_description_show(struct config_item *item, + char *page) +{ + return sprintf(page, +"[02-simple-children]\n" +"\n" +"This subsystem allows the creation of child config_items. 
These\n" +"items have only one attribute that is readable and writeable.\n"); +} + +CONFIGFS_ATTR_RO(simple_children_, description); + +static struct configfs_attribute *simple_children_attrs[] = { + &simple_children_attr_description, + NULL, +}; + +static void simple_children_release(struct config_item *item) +{ + kfree(to_simple_children(item)); +} + +static struct configfs_item_operations simple_children_item_ops = { + .release = simple_children_release, +}; + +/* + * Note that, since no extra work is required on ->drop_item(), + * no ->drop_item() is provided. + */ +static struct configfs_group_operations simple_children_group_ops = { + .make_item = simple_children_make_item, +}; + +static const struct config_item_type simple_children_type = { + .ct_item_ops = &simple_children_item_ops, + .ct_group_ops = &simple_children_group_ops, + .ct_attrs = simple_children_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem simple_children_subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "02-simple-children", + .ci_type = &simple_children_type, + }, + }, +}; + +/* ----------------------------------------------------------------- */ + +/* + * 03-group-children + * + * This example reuses the simple_children group from above. However, + * the simple_children group is not the subsystem itself, it is a + * child of the subsystem. Creation of a group in the subsystem creates + * a new simple_children group. That group can then have simple_child + * children of its own. 
+ */ + +static struct config_group *group_children_make_group( + struct config_group *group, const char *name) +{ + struct simple_children *simple_children; + + simple_children = kzalloc(sizeof(struct simple_children), + GFP_KERNEL); + if (!simple_children) + return ERR_PTR(-ENOMEM); + + config_group_init_type_name(&simple_children->group, name, + &simple_children_type); + + return &simple_children->group; +} + +static ssize_t group_children_description_show(struct config_item *item, + char *page) +{ + return sprintf(page, +"[03-group-children]\n" +"\n" +"This subsystem allows the creation of child config_groups. These\n" +"groups are like the subsystem simple-children.\n"); +} + +CONFIGFS_ATTR_RO(group_children_, description); + +static struct configfs_attribute *group_children_attrs[] = { + &group_children_attr_description, + NULL, +}; + +/* + * Note that, since no extra work is required on ->drop_item(), + * no ->drop_item() is provided. + */ +static struct configfs_group_operations group_children_group_ops = { + .make_group = group_children_make_group, +}; + +static const struct config_item_type group_children_type = { + .ct_group_ops = &group_children_group_ops, + .ct_attrs = group_children_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem group_children_subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "03-group-children", + .ci_type = &group_children_type, + }, + }, +}; + +/* ----------------------------------------------------------------- */ + +/* + * We're now done with our subsystem definitions. + * For convenience in this module, here's a list of them all. It + * allows the init function to easily register them. Most modules + * will only have one subsystem, and will only call register_subsystem + * on it directly. 
+ */ +static struct configfs_subsystem *example_subsys[] = { + &childless_subsys.subsys, + &simple_children_subsys, + &group_children_subsys, + NULL, +}; + +static int __init configfs_example_init(void) +{ + struct configfs_subsystem *subsys; + int ret, i; + + for (i = 0; example_subsys[i]; i++) { + subsys = example_subsys[i]; + + config_group_init(&subsys->su_group); + mutex_init(&subsys->su_mutex); + ret = configfs_register_subsystem(subsys); + if (ret) { + pr_err("Error %d while registering subsystem %s\n", + ret, subsys->su_group.cg_item.ci_namebuf); + goto out_unregister; + } + } + + return 0; + +out_unregister: + for (i--; i >= 0; i--) + configfs_unregister_subsystem(example_subsys[i]); + + return ret; +} + +static void __exit configfs_example_exit(void) +{ + int i; + + for (i = 0; example_subsys[i]; i++) + configfs_unregister_subsystem(example_subsys[i]); +} + +module_init(configfs_example_init); +module_exit(configfs_example_exit); +MODULE_LICENSE("GPL"); diff --git a/samples/connector/.gitignore b/samples/connector/.gitignore new file mode 100644 index 000000000..d86f2ff9c --- /dev/null +++ b/samples/connector/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +ucon diff --git a/samples/connector/Makefile b/samples/connector/Makefile new file mode 100644 index 000000000..d98a9e047 --- /dev/null +++ b/samples/connector/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_SAMPLE_CONNECTOR) += cn_test.o + +userprogs-always-$(CONFIG_CC_CAN_LINK) += ucon + +userccflags += -I usr/include diff --git a/samples/connector/cn_test.c b/samples/connector/cn_test.c new file mode 100644 index 000000000..0958a171d --- /dev/null +++ b/samples/connector/cn_test.c @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * cn_test.c + * + * 2004+ Copyright (c) Evgeniy Polyakov <zbr@ioremap.net> + * All rights reserved. 
+ */ + +#define pr_fmt(fmt) "cn_test: " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/timer.h> + +#include <linux/connector.h> + +static struct cb_id cn_test_id = { CN_NETLINK_USERS + 3, 0x456 }; +static char cn_test_name[] = "cn_test"; +static struct sock *nls; +static struct timer_list cn_test_timer; + +static void cn_test_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) +{ + pr_info("%s: %lu: idx=%x, val=%x, seq=%u, ack=%u, len=%d: %s.\n", + __func__, jiffies, msg->id.idx, msg->id.val, + msg->seq, msg->ack, msg->len, + msg->len ? (char *)msg->data : ""); +} + +/* + * Do not remove this function even if no one is using it as + * this is an example of how to get notifications about new + * connector user registration + */ +#if 0 +static int cn_test_want_notify(void) +{ + struct cn_ctl_msg *ctl; + struct cn_notify_req *req; + struct cn_msg *msg = NULL; + int size, size0; + struct sk_buff *skb; + struct nlmsghdr *nlh; + u32 group = 1; + + size0 = sizeof(*msg) + sizeof(*ctl) + 3 * sizeof(*req); + + size = NLMSG_SPACE(size0); + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) { + pr_err("failed to allocate new skb with size=%u\n", size); + return -ENOMEM; + } + + nlh = nlmsg_put(skb, 0, 0x123, NLMSG_DONE, size - sizeof(*nlh), 0); + if (!nlh) { + kfree_skb(skb); + return -EMSGSIZE; + } + + msg = nlmsg_data(nlh); + + memset(msg, 0, size0); + + msg->id.idx = -1; + msg->id.val = -1; + msg->seq = 0x123; + msg->ack = 0x345; + msg->len = size0 - sizeof(*msg); + + ctl = (struct cn_ctl_msg *)(msg + 1); + + ctl->idx_notify_num = 1; + ctl->val_notify_num = 2; + ctl->group = group; + ctl->len = msg->len - sizeof(*ctl); + + req = (struct cn_notify_req *)(ctl + 1); + + /* + * Idx. + */ + req->first = cn_test_id.idx; + req->range = 10; + + /* + * Val 0. + */ + req++; + req->first = cn_test_id.val; + req->range = 10; + + /* + * Val 1. 
+ */ + req++; + req->first = cn_test_id.val + 20; + req->range = 10; + + NETLINK_CB(skb).dst_group = ctl->group; + //netlink_broadcast(nls, skb, 0, ctl->group, GFP_ATOMIC); + netlink_unicast(nls, skb, 0, 0); + + pr_info("request was sent: group=0x%x\n", ctl->group); + + return 0; +} +#endif + +static u32 cn_test_timer_counter; +static void cn_test_timer_func(struct timer_list *unused) +{ + struct cn_msg *m; + char data[32]; + + pr_debug("%s: timer fired\n", __func__); + + m = kzalloc(sizeof(*m) + sizeof(data), GFP_ATOMIC); + if (m) { + + memcpy(&m->id, &cn_test_id, sizeof(m->id)); + m->seq = cn_test_timer_counter; + m->len = sizeof(data); + + m->len = + scnprintf(data, sizeof(data), "counter = %u", + cn_test_timer_counter) + 1; + + memcpy(m + 1, data, m->len); + + cn_netlink_send(m, 0, 0, GFP_ATOMIC); + kfree(m); + } + + cn_test_timer_counter++; + + mod_timer(&cn_test_timer, jiffies + msecs_to_jiffies(1000)); +} + +static int cn_test_init(void) +{ + int err; + + err = cn_add_callback(&cn_test_id, cn_test_name, cn_test_callback); + if (err) + goto err_out; + cn_test_id.val++; + err = cn_add_callback(&cn_test_id, cn_test_name, cn_test_callback); + if (err) { + cn_del_callback(&cn_test_id); + goto err_out; + } + + timer_setup(&cn_test_timer, cn_test_timer_func, 0); + mod_timer(&cn_test_timer, jiffies + msecs_to_jiffies(1000)); + + pr_info("initialized with id={%u.%u}\n", + cn_test_id.idx, cn_test_id.val); + + return 0; + + err_out: + if (nls && nls->sk_socket) + sock_release(nls->sk_socket); + + return err; +} + +static void cn_test_fini(void) +{ + del_timer_sync(&cn_test_timer); + cn_del_callback(&cn_test_id); + cn_test_id.val--; + cn_del_callback(&cn_test_id); + if (nls && nls->sk_socket) + sock_release(nls->sk_socket); +} + +module_init(cn_test_init); +module_exit(cn_test_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>"); +MODULE_DESCRIPTION("Connector's test module"); diff --git a/samples/connector/ucon.c 
b/samples/connector/ucon.c new file mode 100644 index 000000000..fa17f8642 --- /dev/null +++ b/samples/connector/ucon.c @@ -0,0 +1,236 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * ucon.c + * + * Copyright (c) 2004+ Evgeniy Polyakov <zbr@ioremap.net> + */ + +#include <asm/types.h> + +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/poll.h> + +#include <linux/netlink.h> +#include <linux/rtnetlink.h> + +#include <arpa/inet.h> + +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> +#include <time.h> +#include <getopt.h> + +#include <linux/connector.h> + +#define DEBUG +#define NETLINK_CONNECTOR 11 + +/* Hopefully your userspace connector.h matches this kernel */ +#define CN_TEST_IDX CN_NETLINK_USERS + 3 +#define CN_TEST_VAL 0x456 + +#ifdef DEBUG +#define ulog(f, a...) fprintf(stdout, f, ##a) +#else +#define ulog(f, a...) do {} while (0) +#endif + +static int need_exit; +static __u32 seq; + +static int netlink_send(int s, struct cn_msg *msg) +{ + struct nlmsghdr *nlh; + unsigned int size; + int err; + char buf[128]; + struct cn_msg *m; + + size = NLMSG_SPACE(sizeof(struct cn_msg) + msg->len); + + nlh = (struct nlmsghdr *)buf; + nlh->nlmsg_seq = seq++; + nlh->nlmsg_pid = getpid(); + nlh->nlmsg_type = NLMSG_DONE; + nlh->nlmsg_len = size; + nlh->nlmsg_flags = 0; + + m = NLMSG_DATA(nlh); +#if 0 + ulog("%s: [%08x.%08x] len=%u, seq=%u, ack=%u.\n", + __func__, msg->id.idx, msg->id.val, msg->len, msg->seq, msg->ack); +#endif + memcpy(m, msg, sizeof(*m) + msg->len); + + err = send(s, nlh, size, 0); + if (err == -1) + ulog("Failed to send: %s [%d].\n", + strerror(errno), errno); + + return err; +} + +static void usage(void) +{ + printf( + "Usage: ucon [options] [output file]\n" + "\n" + "\t-h\tthis help screen\n" + "\t-s\tsend buffers to the test module\n" + "\n" + "The default behavior of ucon is to subscribe to the test module\n" + "and wait for state messages. 
Any ones received are dumped to the\n" + "specified output file (or stdout). The test module is assumed to\n" + "have an id of {%u.%u}\n" + "\n" + "If you get no output, then verify the cn_test module id matches\n" + "the expected id above.\n" + , CN_TEST_IDX, CN_TEST_VAL + ); +} + +int main(int argc, char *argv[]) +{ + int s; + char buf[1024]; + int len; + struct nlmsghdr *reply; + struct sockaddr_nl l_local; + struct cn_msg *data; + FILE *out; + time_t tm; + struct pollfd pfd; + bool send_msgs = false; + + while ((s = getopt(argc, argv, "hs")) != -1) { + switch (s) { + case 's': + send_msgs = true; + break; + + case 'h': + usage(); + return 0; + + default: + /* getopt() outputs an error for us */ + usage(); + return 1; + } + } + + if (argc != optind) { + out = fopen(argv[optind], "a+"); + if (!out) { + ulog("Unable to open %s for writing: %s\n", + argv[1], strerror(errno)); + out = stdout; + } + } else + out = stdout; + + memset(buf, 0, sizeof(buf)); + + s = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR); + if (s == -1) { + perror("socket"); + return -1; + } + + l_local.nl_family = AF_NETLINK; + l_local.nl_groups = -1; /* bitmask of requested groups */ + l_local.nl_pid = 0; + + ulog("subscribing to %u.%u\n", CN_TEST_IDX, CN_TEST_VAL); + + if (bind(s, (struct sockaddr *)&l_local, sizeof(struct sockaddr_nl)) == -1) { + perror("bind"); + close(s); + return -1; + } + +#if 0 + { + int on = 0x57; /* Additional group number */ + setsockopt(s, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &on, sizeof(on)); + } +#endif + if (send_msgs) { + int i, j; + + memset(buf, 0, sizeof(buf)); + + data = (struct cn_msg *)buf; + + data->id.idx = CN_TEST_IDX; + data->id.val = CN_TEST_VAL; + data->seq = seq++; + data->ack = 0; + data->len = 0; + + for (j=0; j<10; ++j) { + for (i=0; i<1000; ++i) { + len = netlink_send(s, data); + } + + ulog("%d messages have been sent to %08x.%08x.\n", i, data->id.idx, data->id.val); + } + + return 0; + } + + + pfd.fd = s; + + while (!need_exit) { + pfd.events 
= POLLIN; + pfd.revents = 0; + switch (poll(&pfd, 1, -1)) { + case 0: + need_exit = 1; + break; + case -1: + if (errno != EINTR) { + need_exit = 1; + break; + } + continue; + } + if (need_exit) + break; + + memset(buf, 0, sizeof(buf)); + len = recv(s, buf, sizeof(buf), 0); + if (len == -1) { + perror("recv buf"); + close(s); + return -1; + } + reply = (struct nlmsghdr *)buf; + + switch (reply->nlmsg_type) { + case NLMSG_ERROR: + fprintf(out, "Error message received.\n"); + fflush(out); + break; + case NLMSG_DONE: + data = (struct cn_msg *)NLMSG_DATA(reply); + + time(&tm); + fprintf(out, "%.24s : [%x.%x] [%08u.%08u].\n", + ctime(&tm), data->id.idx, data->id.val, data->seq, data->ack); + fflush(out); + break; + default: + break; + } + } + + close(s); + return 0; +} diff --git a/samples/ftrace/Makefile b/samples/ftrace/Makefile new file mode 100644 index 000000000..4ce896e10 --- /dev/null +++ b/samples/ftrace/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct.o +obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-too.o +obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-modify.o + +CFLAGS_sample-trace-array.o := -I$(src) +obj-$(CONFIG_SAMPLE_TRACE_ARRAY) += sample-trace-array.o diff --git a/samples/ftrace/ftrace-direct-modify.c b/samples/ftrace/ftrace-direct-modify.c new file mode 100644 index 000000000..d620f3da0 --- /dev/null +++ b/samples/ftrace/ftrace-direct-modify.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/ftrace.h> + +extern void my_direct_func1(void); +extern void my_direct_func2(void); + +void my_direct_func1(void) +{ + trace_printk("my direct func1\n"); +} + +void my_direct_func2(void) +{ + trace_printk("my direct func2\n"); +} + +extern void my_tramp1(void *); +extern void my_tramp2(void *); + +static unsigned long my_ip = (unsigned long)schedule; + +asm ( +" .pushsection .text, \"ax\", @progbits\n" 
+" .type my_tramp1, @function\n" +" .globl my_tramp1\n" +" my_tramp1:" +" pushq %rbp\n" +" movq %rsp, %rbp\n" +" call my_direct_func1\n" +" leave\n" +" .size my_tramp1, .-my_tramp1\n" + ASM_RET +" .type my_tramp2, @function\n" +" .globl my_tramp2\n" +" my_tramp2:" +" pushq %rbp\n" +" movq %rsp, %rbp\n" +" call my_direct_func2\n" +" leave\n" + ASM_RET +" .size my_tramp2, .-my_tramp2\n" +" .popsection\n" +); + +static unsigned long my_tramp = (unsigned long)my_tramp1; +static unsigned long tramps[2] = { + (unsigned long)my_tramp1, + (unsigned long)my_tramp2, +}; + +static int simple_thread(void *arg) +{ + static int t; + int ret = 0; + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(2 * HZ); + + if (ret) + continue; + t ^= 1; + ret = modify_ftrace_direct(my_ip, my_tramp, tramps[t]); + if (!ret) + my_tramp = tramps[t]; + WARN_ON_ONCE(ret); + } + + return 0; +} + +static struct task_struct *simple_tsk; + +static int __init ftrace_direct_init(void) +{ + int ret; + + ret = register_ftrace_direct(my_ip, my_tramp); + if (!ret) + simple_tsk = kthread_run(simple_thread, NULL, "event-sample-fn"); + return ret; +} + +static void __exit ftrace_direct_exit(void) +{ + kthread_stop(simple_tsk); + unregister_ftrace_direct(my_ip, my_tramp); +} + +module_init(ftrace_direct_init); +module_exit(ftrace_direct_exit); + +MODULE_AUTHOR("Steven Rostedt"); +MODULE_DESCRIPTION("Example use case of using modify_ftrace_direct()"); +MODULE_LICENSE("GPL"); diff --git a/samples/ftrace/ftrace-direct-too.c b/samples/ftrace/ftrace-direct-too.c new file mode 100644 index 000000000..4bdd67916 --- /dev/null +++ b/samples/ftrace/ftrace-direct-too.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/module.h> + +#include <linux/mm.h> /* for handle_mm_fault() */ +#include <linux/ftrace.h> + +extern void my_direct_func(struct vm_area_struct *vma, unsigned long address, + unsigned int flags, struct pt_regs *regs); + +void 
my_direct_func(struct vm_area_struct *vma, unsigned long address, + unsigned int flags, struct pt_regs *regs) +{ + trace_printk("handle mm fault vma=%p address=%lx flags=%x regs=%p\n", + vma, address, flags, regs); +} + +extern void my_tramp(void *); + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:" +" pushq %rbp\n" +" movq %rsp, %rbp\n" +" pushq %rdi\n" +" pushq %rsi\n" +" pushq %rdx\n" +" pushq %rcx\n" +" call my_direct_func\n" +" popq %rcx\n" +" popq %rdx\n" +" popq %rsi\n" +" popq %rdi\n" +" leave\n" + ASM_RET +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + + +static int __init ftrace_direct_init(void) +{ + return register_ftrace_direct((unsigned long)handle_mm_fault, + (unsigned long)my_tramp); +} + +static void __exit ftrace_direct_exit(void) +{ + unregister_ftrace_direct((unsigned long)handle_mm_fault, + (unsigned long)my_tramp); +} + +module_init(ftrace_direct_init); +module_exit(ftrace_direct_exit); + +MODULE_AUTHOR("Steven Rostedt"); +MODULE_DESCRIPTION("Another example use case of using register_ftrace_direct()"); +MODULE_LICENSE("GPL"); diff --git a/samples/ftrace/ftrace-direct.c b/samples/ftrace/ftrace-direct.c new file mode 100644 index 000000000..1e901bb8d --- /dev/null +++ b/samples/ftrace/ftrace-direct.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/module.h> + +#include <linux/sched.h> /* for wake_up_process() */ +#include <linux/ftrace.h> + +extern void my_direct_func(struct task_struct *p); + +void my_direct_func(struct task_struct *p) +{ + trace_printk("waking up %s-%d\n", p->comm, p->pid); +} + +extern void my_tramp(void *); + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:" +" pushq %rbp\n" +" movq %rsp, %rbp\n" +" pushq %rdi\n" +" call my_direct_func\n" +" popq %rdi\n" +" leave\n" + ASM_RET +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + + +static int 
__init ftrace_direct_init(void) +{ + return register_ftrace_direct((unsigned long)wake_up_process, + (unsigned long)my_tramp); +} + +static void __exit ftrace_direct_exit(void) +{ + unregister_ftrace_direct((unsigned long)wake_up_process, + (unsigned long)my_tramp); +} + +module_init(ftrace_direct_init); +module_exit(ftrace_direct_exit); + +MODULE_AUTHOR("Steven Rostedt"); +MODULE_DESCRIPTION("Example use case of using register_ftrace_direct()"); +MODULE_LICENSE("GPL"); diff --git a/samples/ftrace/sample-trace-array.c b/samples/ftrace/sample-trace-array.c new file mode 100644 index 000000000..6aba02a31 --- /dev/null +++ b/samples/ftrace/sample-trace-array.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/trace.h> +#include <linux/trace_events.h> +#include <linux/timer.h> +#include <linux/err.h> +#include <linux/jiffies.h> +#include <linux/workqueue.h> + +/* + * Any file that uses trace points, must include the header. + * But only one file, must include the header by defining + * CREATE_TRACE_POINTS first. This will make the C code that + * creates the handles for the trace points. + */ +#define CREATE_TRACE_POINTS +#include "sample-trace-array.h" + +struct trace_array *tr; +static void mytimer_handler(struct timer_list *unused); +static struct task_struct *simple_tsk; + +static void trace_work_fn(struct work_struct *work) +{ + /* + * Disable tracing for event "sample_event". + */ + trace_array_set_clr_event(tr, "sample-subsystem", "sample_event", + false); +} +static DECLARE_WORK(trace_work, trace_work_fn); + +/* + * mytimer: Timer setup to disable tracing for event "sample_event". This + * timer is only for the purposes of the sample module to demonstrate access of + * Ftrace instances from within kernel. 
+ */ +static DEFINE_TIMER(mytimer, mytimer_handler); + +static void mytimer_handler(struct timer_list *unused) +{ + schedule_work(&trace_work); +} + +static void simple_thread_func(int count) +{ + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + + /* + * Printing count value using trace_array_printk() - trace_printk() + * equivalent for the instance buffers. + */ + trace_array_printk(tr, _THIS_IP_, "trace_array_printk: count=%d\n", + count); + /* + * Tracepoint for event "sample_event". This will print the + * current value of count and current jiffies. + */ + trace_sample_event(count, jiffies); +} + +static int simple_thread(void *arg) +{ + int count = 0; + unsigned long delay = msecs_to_jiffies(5000); + + /* + * Enable tracing for "sample_event". + */ + trace_array_set_clr_event(tr, "sample-subsystem", "sample_event", true); + + /* + * Adding timer - mytimer. This timer will disable tracing after + * "delay" jiffies (5000 ms) have elapsed. + * + */ + add_timer(&mytimer); + mod_timer(&mytimer, jiffies+delay); + + while (!kthread_should_stop()) + simple_thread_func(count++); + + del_timer(&mytimer); + cancel_work_sync(&trace_work); + + /* + * trace_array_put() decrements the reference counter associated with + * the trace array - "tr". We are done using the trace array, hence + * decrement the reference counter so that it can be destroyed using + * trace_array_destroy(). + */ + trace_array_put(tr); + + return 0; +} + +static int __init sample_trace_array_init(void) +{ + /* + * Return a pointer to the trace array with name "sample-instance" if it + * exists, else create a new trace array. + * + * NOTE: This function increments the reference counter + * associated with the trace array - "tr". + */ + tr = trace_array_get_by_name("sample-instance"); + + if (!tr) + return -1; + /* + * If context specific per-cpu buffers haven't already been allocated.
+ */ + trace_printk_init_buffers(); + + simple_tsk = kthread_run(simple_thread, NULL, "sample-instance"); + if (IS_ERR(simple_tsk)) { + trace_array_put(tr); + trace_array_destroy(tr); + return -1; + } + + return 0; +} + +static void __exit sample_trace_array_exit(void) +{ + kthread_stop(simple_tsk); + + /* + * We are unloading our module and no longer require the trace array. + * Remove/destroy "tr" using trace_array_destroy() + */ + trace_array_destroy(tr); +} + +module_init(sample_trace_array_init); +module_exit(sample_trace_array_exit); + +MODULE_AUTHOR("Divya Indi"); +MODULE_DESCRIPTION("Sample module for kernel access to Ftrace instances"); +MODULE_LICENSE("GPL"); diff --git a/samples/ftrace/sample-trace-array.h b/samples/ftrace/sample-trace-array.h new file mode 100644 index 000000000..6f8962428 --- /dev/null +++ b/samples/ftrace/sample-trace-array.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * If TRACE_SYSTEM is defined, that will be the directory created + * in the ftrace directory under /sys/kernel/tracing/events/<system> + * + * The define_trace.h below will also look for a file name of + * TRACE_SYSTEM.h where TRACE_SYSTEM is what is defined here. + * In this case, it would look for sample-subsystem.h + * + * If the header name will be different than the system name + * (as in this case), then you can override the header name that + * define_trace.h will look up by defining TRACE_INCLUDE_FILE + * + * This file is called sample-trace-array.h but we want the system + * to be called "sample-subsystem". Therefore we must define the name of this + * file: + * + * #define TRACE_INCLUDE_FILE sample-trace-array + * + * As we do in the bottom of this file. + * + * Notice that TRACE_SYSTEM should be defined outside of #if + * protection, just like TRACE_INCLUDE_FILE.
+ */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM sample-subsystem + +/* + * TRACE_SYSTEM is expected to be a C valid variable (alpha-numeric + * and underscore), although it may start with numbers. If for some + * reason it is not, you need to add the following lines: + */ +#undef TRACE_SYSTEM_VAR +#define TRACE_SYSTEM_VAR sample_subsystem + +/* + * But the above is only needed if TRACE_SYSTEM is not alpha-numeric + * and underscored. By default, TRACE_SYSTEM_VAR will be equal to + * TRACE_SYSTEM. As TRACE_SYSTEM_VAR must be alpha-numeric, if + * TRACE_SYSTEM is not, then TRACE_SYSTEM_VAR must be defined with + * only alpha-numeric and underscores. + * + * The TRACE_SYSTEM_VAR is only used internally and not visible to + * user space. + */ + +/* + * Notice that this file is not protected like a normal header. + * We also must allow for rereading of this file. The + * + * || defined(TRACE_HEADER_MULTI_READ) + * + * serves this purpose. + */ +#if !defined(_SAMPLE_TRACE_ARRAY_H) || defined(TRACE_HEADER_MULTI_READ) +#define _SAMPLE_TRACE_ARRAY_H + +#include <linux/tracepoint.h> +TRACE_EVENT(sample_event, + + TP_PROTO(int count, unsigned long time), + + TP_ARGS(count, time), + + TP_STRUCT__entry( + __field(int, count) + __field(unsigned long, time) + ), + + TP_fast_assign( + __entry->count = count; + __entry->time = time; + ), + + TP_printk("count value=%d at jiffies=%lu", __entry->count, + __entry->time) + ); +#endif + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
+#define TRACE_INCLUDE_FILE sample-trace-array +#include <trace/define_trace.h> diff --git a/samples/hidraw/.gitignore b/samples/hidraw/.gitignore new file mode 100644 index 000000000..d7a6074eb --- /dev/null +++ b/samples/hidraw/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +hid-example diff --git a/samples/hidraw/Makefile b/samples/hidraw/Makefile new file mode 100644 index 000000000..594d989e5 --- /dev/null +++ b/samples/hidraw/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 +userprogs-always-y += hid-example + +userccflags += -I usr/include diff --git a/samples/hidraw/hid-example.c b/samples/hidraw/hid-example.c new file mode 100644 index 000000000..37a0ffcb4 --- /dev/null +++ b/samples/hidraw/hid-example.c @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hidraw Userspace Example + * + * Copyright (c) 2010 Alan Ott <alan@signal11.us> + * Copyright (c) 2010 Signal 11 Software + * + * The code may be used by anyone for any purpose, + * and can serve as a starting point for developing + * applications using hidraw. + */ + +/* Linux */ +#include <linux/types.h> +#include <linux/input.h> +#include <linux/hidraw.h> + +/* + * Ugly hack to work around failing compilation on systems that don't + * yet populate new version of hidraw.h to userspace. 
+ */ +#ifndef HIDIOCSFEATURE +#warning Please have your distro update the userspace kernel headers +#define HIDIOCSFEATURE(len) _IOC(_IOC_WRITE|_IOC_READ, 'H', 0x06, len) +#define HIDIOCGFEATURE(len) _IOC(_IOC_WRITE|_IOC_READ, 'H', 0x07, len) +#endif + +/* Unix */ +#include <sys/ioctl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> + +/* C */ +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <errno.h> + +const char *bus_str(int bus); + +int main(int argc, char **argv) +{ + int fd; + int i, res, desc_size = 0; + char buf[256]; + struct hidraw_report_descriptor rpt_desc; + struct hidraw_devinfo info; + char *device = "/dev/hidraw0"; + + if (argc > 1) + device = argv[1]; + + /* Open the Device with non-blocking reads. In real life, + don't use a hard coded path; use libudev instead. */ + fd = open(device, O_RDWR|O_NONBLOCK); + + if (fd < 0) { + perror("Unable to open device"); + return 1; + } + + memset(&rpt_desc, 0x0, sizeof(rpt_desc)); + memset(&info, 0x0, sizeof(info)); + memset(buf, 0x0, sizeof(buf)); + + /* Get Report Descriptor Size */ + res = ioctl(fd, HIDIOCGRDESCSIZE, &desc_size); + if (res < 0) + perror("HIDIOCGRDESCSIZE"); + else + printf("Report Descriptor Size: %d\n", desc_size); + + /* Get Report Descriptor */ + rpt_desc.size = desc_size; + res = ioctl(fd, HIDIOCGRDESC, &rpt_desc); + if (res < 0) { + perror("HIDIOCGRDESC"); + } else { + printf("Report Descriptor:\n"); + for (i = 0; i < rpt_desc.size; i++) + printf("%hhx ", rpt_desc.value[i]); + puts("\n"); + } + + /* Get Raw Name */ + res = ioctl(fd, HIDIOCGRAWNAME(256), buf); + if (res < 0) + perror("HIDIOCGRAWNAME"); + else + printf("Raw Name: %s\n", buf); + + /* Get Physical Location */ + res = ioctl(fd, HIDIOCGRAWPHYS(256), buf); + if (res < 0) + perror("HIDIOCGRAWPHYS"); + else + printf("Raw Phys: %s\n", buf); + + /* Get Raw Info */ + res = ioctl(fd, HIDIOCGRAWINFO, &info); + if (res < 0) { + perror("HIDIOCGRAWINFO"); + } else { + 
printf("Raw Info:\n"); + printf("\tbustype: %d (%s)\n", + info.bustype, bus_str(info.bustype)); + printf("\tvendor: 0x%04hx\n", info.vendor); + printf("\tproduct: 0x%04hx\n", info.product); + } + + /* Set Feature */ + buf[0] = 0x9; /* Report Number */ + buf[1] = 0xff; + buf[2] = 0xff; + buf[3] = 0xff; + res = ioctl(fd, HIDIOCSFEATURE(4), buf); + if (res < 0) + perror("HIDIOCSFEATURE"); + else + printf("ioctl HIDIOCSFEATURE returned: %d\n", res); + + /* Get Feature */ + buf[0] = 0x9; /* Report Number */ + res = ioctl(fd, HIDIOCGFEATURE(256), buf); + if (res < 0) { + perror("HIDIOCGFEATURE"); + } else { + printf("ioctl HIDIOCGFEATURE returned: %d\n", res); + printf("Report data (not containing the report number):\n\t"); + for (i = 0; i < res; i++) + printf("%hhx ", buf[i]); + puts("\n"); + } + + /* Send a Report to the Device */ + buf[0] = 0x1; /* Report Number */ + buf[1] = 0x77; + res = write(fd, buf, 2); + if (res < 0) { + printf("Error: %d\n", errno); + perror("write"); + } else { + printf("write() wrote %d bytes\n", res); + } + + /* Get a report from the device */ + res = read(fd, buf, 16); + if (res < 0) { + perror("read"); + } else { + printf("read() read %d bytes:\n\t", res); + for (i = 0; i < res; i++) + printf("%hhx ", buf[i]); + puts("\n"); + } + close(fd); + return 0; +} + +const char * +bus_str(int bus) +{ + switch (bus) { + case BUS_USB: + return "USB"; + break; + case BUS_HIL: + return "HIL"; + break; + case BUS_BLUETOOTH: + return "Bluetooth"; + break; + case BUS_VIRTUAL: + return "Virtual"; + break; + default: + return "Other"; + break; + } +} diff --git a/samples/hw_breakpoint/Makefile b/samples/hw_breakpoint/Makefile new file mode 100644 index 000000000..ef4b6fdd7 --- /dev/null +++ b/samples/hw_breakpoint/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_SAMPLE_HW_BREAKPOINT) += data_breakpoint.o diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c new file mode 100644 index 
000000000..b99322f18 --- /dev/null +++ b/samples/hw_breakpoint/data_breakpoint.c @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * data_breakpoint.c - Sample HW Breakpoint file to watch kernel data address + * + * usage: insmod data_breakpoint.ko ksym=<ksym_name> + * + * This file is a kernel module that places a breakpoint over ksym_name kernel + * variable using Hardware Breakpoint register. The corresponding handler which + * prints a backtrace is invoked every time a write operation is performed on + * that variable. + * + * Copyright (C) IBM Corporation, 2009 + * + * Author: K.Prasad <prasad@linux.vnet.ibm.com> + */ +#include <linux/module.h> /* Needed by all modules */ +#include <linux/kernel.h> /* Needed for KERN_INFO */ +#include <linux/init.h> /* Needed for the macros */ +#include <linux/kallsyms.h> + +#include <linux/perf_event.h> +#include <linux/hw_breakpoint.h> + +struct perf_event * __percpu *sample_hbp; + +static char ksym_name[KSYM_NAME_LEN] = "jiffies"; +module_param_string(ksym, ksym_name, KSYM_NAME_LEN, S_IRUGO); +MODULE_PARM_DESC(ksym, "Kernel symbol to monitor; this module will report any" + " write operations on the kernel symbol"); + +static void sample_hbp_handler(struct perf_event *bp, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + printk(KERN_INFO "%s value is changed\n", ksym_name); + dump_stack(); + printk(KERN_INFO "Dump stack from sample_hbp_handler\n"); +} + +static int __init hw_break_module_init(void) +{ + int ret; + struct perf_event_attr attr; + void *addr = __symbol_get(ksym_name); + + if (!addr) + return -ENXIO; + + hw_breakpoint_init(&attr); + attr.bp_addr = (unsigned long)addr; + attr.bp_len = HW_BREAKPOINT_LEN_4; + attr.bp_type = HW_BREAKPOINT_W; + + sample_hbp = register_wide_hw_breakpoint(&attr, sample_hbp_handler, NULL); + if (IS_ERR((void __force *)sample_hbp)) { + ret = PTR_ERR((void __force *)sample_hbp); + goto fail; + } + + printk(KERN_INFO "HW Breakpoint for %s write installed\n", 
ksym_name); + + return 0; + +fail: + printk(KERN_INFO "Breakpoint registration failed\n"); + + return ret; +} + +static void __exit hw_break_module_exit(void) +{ + unregister_wide_hw_breakpoint(sample_hbp); +#ifdef CONFIG_MODULE_UNLOAD + __symbol_put(ksym_name); +#endif + printk(KERN_INFO "HW Breakpoint for %s write uninstalled\n", ksym_name); +} + +module_init(hw_break_module_init); +module_exit(hw_break_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("K.Prasad"); +MODULE_DESCRIPTION("ksym breakpoint"); diff --git a/samples/kdb/Makefile b/samples/kdb/Makefile new file mode 100644 index 000000000..947cb8522 --- /dev/null +++ b/samples/kdb/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_SAMPLE_KDB) += kdb_hello.o diff --git a/samples/kdb/kdb_hello.c b/samples/kdb/kdb_hello.c new file mode 100644 index 000000000..c1c2fa0f6 --- /dev/null +++ b/samples/kdb/kdb_hello.c @@ -0,0 +1,60 @@ +/* + * Created by: Jason Wessel <jason.wessel@windriver.com> + * + * Copyright (c) 2010 Wind River Systems, Inc. All Rights Reserved. + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. 
+ */ + +#include <linux/module.h> +#include <linux/kdb.h> + +/* + * All kdb shell command call backs receive argc and argv, where + * argv[0] is the command the end user typed + */ +static int kdb_hello_cmd(int argc, const char **argv) +{ + if (argc > 1) + return KDB_ARGCOUNT; + + if (argc) + kdb_printf("Hello %s.\n", argv[1]); + else + kdb_printf("Hello world!\n"); + + return 0; +} + + +static int __init kdb_hello_cmd_init(void) +{ + /* + * Registration of a dynamically added kdb command is done with + * kdb_register() with the arguments being: + * 1: The name of the shell command + * 2: The function that processes the command + * 3: Description of the usage of any arguments + * 4: Descriptive text when you run help + * 5: Number of characters to complete the command + * 0 == type the whole command + * 1 == match both "g" and "go" for example + */ + kdb_register("hello", kdb_hello_cmd, "[string]", + "Say Hello World or Hello [string]", 0); + return 0; +} + +static void __exit kdb_hello_cmd_exit(void) +{ + kdb_unregister("hello"); +} + +module_init(kdb_hello_cmd_init); +module_exit(kdb_hello_cmd_exit); + +MODULE_AUTHOR("WindRiver"); +MODULE_DESCRIPTION("KDB example to add a hello command"); +MODULE_LICENSE("GPL"); diff --git a/samples/kfifo/Makefile b/samples/kfifo/Makefile new file mode 100644 index 000000000..0af5250ad --- /dev/null +++ b/samples/kfifo/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_SAMPLE_KFIFO) += bytestream-example.o dma-example.o inttype-example.o record-example.o diff --git a/samples/kfifo/bytestream-example.c b/samples/kfifo/bytestream-example.c new file mode 100644 index 000000000..5a90aa527 --- /dev/null +++ b/samples/kfifo/bytestream-example.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Sample kfifo byte stream implementation + * + * Copyright (C) 2010 Stefani Seibold <stefani@seibold.net> + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/proc_fs.h> 
+#include <linux/mutex.h> +#include <linux/kfifo.h> + +/* + * This module shows how to create a byte stream fifo. + */ + +/* fifo size in elements (bytes) */ +#define FIFO_SIZE 32 + +/* name of the proc entry */ +#define PROC_FIFO "bytestream-fifo" + +/* lock for procfs read access */ +static DEFINE_MUTEX(read_lock); + +/* lock for procfs write access */ +static DEFINE_MUTEX(write_lock); + +/* + * define DYNAMIC in this example for a dynamically allocated fifo. + * + * Otherwise the fifo storage will be a part of the fifo structure. + */ +#if 0 +#define DYNAMIC +#endif + +#ifdef DYNAMIC +static struct kfifo test; +#else +static DECLARE_KFIFO(test, unsigned char, FIFO_SIZE); +#endif + +static const unsigned char expected_result[FIFO_SIZE] = { + 3, 4, 5, 6, 7, 8, 9, 0, + 1, 20, 21, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31, 32, 33, 34, + 35, 36, 37, 38, 39, 40, 41, 42, +}; + +static int __init testfunc(void) +{ + unsigned char buf[6]; + unsigned char i, j; + unsigned int ret; + + printk(KERN_INFO "byte stream fifo test start\n"); + + /* put string into the fifo */ + kfifo_in(&test, "hello", 5); + + /* put values into the fifo */ + for (i = 0; i != 10; i++) + kfifo_put(&test, i); + + /* show the number of used elements */ + printk(KERN_INFO "fifo len: %u\n", kfifo_len(&test)); + + /* get max of 5 bytes from the fifo */ + i = kfifo_out(&test, buf, 5); + printk(KERN_INFO "buf: %.*s\n", i, buf); + + /* get max of 2 elements from the fifo */ + ret = kfifo_out(&test, buf, 2); + printk(KERN_INFO "ret: %d\n", ret); + /* and put it back to the end of the fifo */ + ret = kfifo_in(&test, buf, ret); + printk(KERN_INFO "ret: %d\n", ret); + + /* skip first element of the fifo */ + printk(KERN_INFO "skip 1st element\n"); + kfifo_skip(&test); + + /* put values into the fifo until is full */ + for (i = 20; kfifo_put(&test, i); i++) + ; + + printk(KERN_INFO "queue len: %u\n", kfifo_len(&test)); + + /* show the first value without removing from the fifo */ + if (kfifo_peek(&test, &i)) + 
printk(KERN_INFO "%d\n", i); + + /* check the correctness of all values in the fifo */ + j = 0; + while (kfifo_get(&test, &i)) { + printk(KERN_INFO "item = %d\n", i); + if (i != expected_result[j++]) { + printk(KERN_WARNING "value mismatch: test failed\n"); + return -EIO; + } + } + if (j != ARRAY_SIZE(expected_result)) { + printk(KERN_WARNING "size mismatch: test failed\n"); + return -EIO; + } + printk(KERN_INFO "test passed\n"); + + return 0; +} + +static ssize_t fifo_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + int ret; + unsigned int copied; + + if (mutex_lock_interruptible(&write_lock)) + return -ERESTARTSYS; + + ret = kfifo_from_user(&test, buf, count, &copied); + + mutex_unlock(&write_lock); + if (ret) + return ret; + + return copied; +} + +static ssize_t fifo_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + int ret; + unsigned int copied; + + if (mutex_lock_interruptible(&read_lock)) + return -ERESTARTSYS; + + ret = kfifo_to_user(&test, buf, count, &copied); + + mutex_unlock(&read_lock); + if (ret) + return ret; + + return copied; +} + +static const struct proc_ops fifo_proc_ops = { + .proc_read = fifo_read, + .proc_write = fifo_write, + .proc_lseek = noop_llseek, +}; + +static int __init example_init(void) +{ +#ifdef DYNAMIC + int ret; + + ret = kfifo_alloc(&test, FIFO_SIZE, GFP_KERNEL); + if (ret) { + printk(KERN_ERR "error kfifo_alloc\n"); + return ret; + } +#else + INIT_KFIFO(test); +#endif + if (testfunc() < 0) { +#ifdef DYNAMIC + kfifo_free(&test); +#endif + return -EIO; + } + + if (proc_create(PROC_FIFO, 0, NULL, &fifo_proc_ops) == NULL) { +#ifdef DYNAMIC + kfifo_free(&test); +#endif + return -ENOMEM; + } + return 0; +} + +static void __exit example_exit(void) +{ + remove_proc_entry(PROC_FIFO, NULL); +#ifdef DYNAMIC + kfifo_free(&test); +#endif +} + +module_init(example_init); +module_exit(example_exit); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Stefani Seibold <stefani@seibold.net>"); 
diff --git a/samples/kfifo/dma-example.c b/samples/kfifo/dma-example.c new file mode 100644 index 000000000..0cf27483c --- /dev/null +++ b/samples/kfifo/dma-example.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Sample fifo dma implementation + * + * Copyright (C) 2010 Stefani Seibold <stefani@seibold.net> + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kfifo.h> + +/* + * This module shows how to handle fifo dma operations. + */ + +/* fifo size in elements (bytes) */ +#define FIFO_SIZE 32 + +static struct kfifo fifo; + +static int __init example_init(void) +{ + int i; + unsigned int ret; + unsigned int nents; + struct scatterlist sg[10]; + + printk(KERN_INFO "DMA fifo test start\n"); + + if (kfifo_alloc(&fifo, FIFO_SIZE, GFP_KERNEL)) { + printk(KERN_WARNING "error kfifo_alloc\n"); + return -ENOMEM; + } + + printk(KERN_INFO "queue size: %u\n", kfifo_size(&fifo)); + + kfifo_in(&fifo, "test", 4); + + for (i = 0; i != 9; i++) + kfifo_put(&fifo, i); + + /* kick away first byte */ + kfifo_skip(&fifo); + + printk(KERN_INFO "queue len: %u\n", kfifo_len(&fifo)); + + /* + * Configure the kfifo buffer to receive data from DMA input. + * + * .--------------------------------------. + * | 0 | 1 | 2 | ... | 12 | 13 | ... | 31 | + * |---|------------------|---------------| + * \_/ \________________/ \_____________/ + * \ \ \ + * \ \_allocated data \ + * \_*free space* \_*free space* + * + * We need two different SG entries: one for the free space area at the + * end of the kfifo buffer (19 bytes) and another for the first free + * byte at the beginning, after the kfifo_skip(). 
+ */ + sg_init_table(sg, ARRAY_SIZE(sg)); + nents = kfifo_dma_in_prepare(&fifo, sg, ARRAY_SIZE(sg), FIFO_SIZE); + printk(KERN_INFO "DMA sgl entries: %d\n", nents); + if (!nents) { + /* fifo is full and no sgl was created */ + printk(KERN_WARNING "error kfifo_dma_in_prepare\n"); + return -EIO; + } + + /* receive data */ + printk(KERN_INFO "scatterlist for receive:\n"); + for (i = 0; i < nents; i++) { + printk(KERN_INFO + "sg[%d] -> " + "page %p offset 0x%.8x length 0x%.8x\n", + i, sg_page(&sg[i]), sg[i].offset, sg[i].length); + + if (sg_is_last(&sg[i])) + break; + } + + /* put your code here to set up and execute the dma operation */ + /* ... */ + + /* example: zero bytes received */ + ret = 0; + + /* finish the dma operation and update the received data */ + kfifo_dma_in_finish(&fifo, ret); + + /* Prepare to transmit data, example: 8 bytes */ + nents = kfifo_dma_out_prepare(&fifo, sg, ARRAY_SIZE(sg), 8); + printk(KERN_INFO "DMA sgl entries: %d\n", nents); + if (!nents) { + /* no data was available and no sgl was created */ + printk(KERN_WARNING "error kfifo_dma_out_prepare\n"); + return -EIO; + } + + printk(KERN_INFO "scatterlist for transmit:\n"); + for (i = 0; i < nents; i++) { + printk(KERN_INFO + "sg[%d] -> " + "page %p offset 0x%.8x length 0x%.8x\n", + i, sg_page(&sg[i]), sg[i].offset, sg[i].length); + + if (sg_is_last(&sg[i])) + break; + } + + /* put your code here to set up and execute the dma operation */ + /* ...
*/ + + /* example: 5 bytes transmitted */ + ret = 5; + + /* finish the dma operation and update the transmitted data */ + kfifo_dma_out_finish(&fifo, ret); + + ret = kfifo_len(&fifo); + printk(KERN_INFO "queue len: %u\n", kfifo_len(&fifo)); + + if (ret != 7) { + printk(KERN_WARNING "size mismatch: test failed"); + return -EIO; + } + printk(KERN_INFO "test passed\n"); + + return 0; +} + +static void __exit example_exit(void) +{ + kfifo_free(&fifo); +} + +module_init(example_init); +module_exit(example_exit); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Stefani Seibold <stefani@seibold.net>"); diff --git a/samples/kfifo/inttype-example.c b/samples/kfifo/inttype-example.c new file mode 100644 index 000000000..e5403d8c9 --- /dev/null +++ b/samples/kfifo/inttype-example.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Sample kfifo int type implementation + * + * Copyright (C) 2010 Stefani Seibold <stefani@seibold.net> + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/mutex.h> +#include <linux/kfifo.h> + +/* + * This module shows how to create a int type fifo. + */ + +/* fifo size in elements (ints) */ +#define FIFO_SIZE 32 + +/* name of the proc entry */ +#define PROC_FIFO "int-fifo" + +/* lock for procfs read access */ +static DEFINE_MUTEX(read_lock); + +/* lock for procfs write access */ +static DEFINE_MUTEX(write_lock); + +/* + * define DYNAMIC in this example for a dynamically allocated fifo. + * + * Otherwise the fifo storage will be a part of the fifo structure. 
+ */ +#if 0 +#define DYNAMIC +#endif + +#ifdef DYNAMIC +static DECLARE_KFIFO_PTR(test, int); +#else +static DEFINE_KFIFO(test, int, FIFO_SIZE); +#endif + +static const int expected_result[FIFO_SIZE] = { + 3, 4, 5, 6, 7, 8, 9, 0, + 1, 20, 21, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31, 32, 33, 34, + 35, 36, 37, 38, 39, 40, 41, 42, +}; + +static int __init testfunc(void) +{ + int buf[6]; + int i, j; + unsigned int ret; + + printk(KERN_INFO "int fifo test start\n"); + + /* put values into the fifo */ + for (i = 0; i != 10; i++) + kfifo_put(&test, i); + + /* show the number of used elements */ + printk(KERN_INFO "fifo len: %u\n", kfifo_len(&test)); + + /* get max of 2 elements from the fifo */ + ret = kfifo_out(&test, buf, 2); + printk(KERN_INFO "ret: %d\n", ret); + /* and put it back to the end of the fifo */ + ret = kfifo_in(&test, buf, ret); + printk(KERN_INFO "ret: %d\n", ret); + + /* skip first element of the fifo */ + printk(KERN_INFO "skip 1st element\n"); + kfifo_skip(&test); + + /* put values into the fifo until is full */ + for (i = 20; kfifo_put(&test, i); i++) + ; + + printk(KERN_INFO "queue len: %u\n", kfifo_len(&test)); + + /* show the first value without removing from the fifo */ + if (kfifo_peek(&test, &i)) + printk(KERN_INFO "%d\n", i); + + /* check the correctness of all values in the fifo */ + j = 0; + while (kfifo_get(&test, &i)) { + printk(KERN_INFO "item = %d\n", i); + if (i != expected_result[j++]) { + printk(KERN_WARNING "value mismatch: test failed\n"); + return -EIO; + } + } + if (j != ARRAY_SIZE(expected_result)) { + printk(KERN_WARNING "size mismatch: test failed\n"); + return -EIO; + } + printk(KERN_INFO "test passed\n"); + + return 0; +} + +static ssize_t fifo_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + int ret; + unsigned int copied; + + if (mutex_lock_interruptible(&write_lock)) + return -ERESTARTSYS; + + ret = kfifo_from_user(&test, buf, count, &copied); + + mutex_unlock(&write_lock); + if (ret) + 
return ret; + + return copied; +} + +static ssize_t fifo_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + int ret; + unsigned int copied; + + if (mutex_lock_interruptible(&read_lock)) + return -ERESTARTSYS; + + ret = kfifo_to_user(&test, buf, count, &copied); + + mutex_unlock(&read_lock); + if (ret) + return ret; + + return copied; +} + +static const struct proc_ops fifo_proc_ops = { + .proc_read = fifo_read, + .proc_write = fifo_write, + .proc_lseek = noop_llseek, +}; + +static int __init example_init(void) +{ +#ifdef DYNAMIC + int ret; + + ret = kfifo_alloc(&test, FIFO_SIZE, GFP_KERNEL); + if (ret) { + printk(KERN_ERR "error kfifo_alloc\n"); + return ret; + } +#endif + if (testfunc() < 0) { +#ifdef DYNAMIC + kfifo_free(&test); +#endif + return -EIO; + } + + if (proc_create(PROC_FIFO, 0, NULL, &fifo_proc_ops) == NULL) { +#ifdef DYNAMIC + kfifo_free(&test); +#endif + return -ENOMEM; + } + return 0; +} + +static void __exit example_exit(void) +{ + remove_proc_entry(PROC_FIFO, NULL); +#ifdef DYNAMIC + kfifo_free(&test); +#endif +} + +module_init(example_init); +module_exit(example_exit); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Stefani Seibold <stefani@seibold.net>"); diff --git a/samples/kfifo/record-example.c b/samples/kfifo/record-example.c new file mode 100644 index 000000000..f64f3d62d --- /dev/null +++ b/samples/kfifo/record-example.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Sample dynamic sized record fifo implementation + * + * Copyright (C) 2010 Stefani Seibold <stefani@seibold.net> + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/mutex.h> +#include <linux/kfifo.h> + +/* + * This module shows how to create a variable sized record fifo. 
+ */ + +/* fifo size in elements (bytes) */ +#define FIFO_SIZE 128 + +/* name of the proc entry */ +#define PROC_FIFO "record-fifo" + +/* lock for procfs read access */ +static DEFINE_MUTEX(read_lock); + +/* lock for procfs write access */ +static DEFINE_MUTEX(write_lock); + +/* + * define DYNAMIC in this example for a dynamically allocated fifo. + * + * Otherwise the fifo storage will be a part of the fifo structure. + */ +#if 0 +#define DYNAMIC +#endif + +/* + * struct kfifo_rec_ptr_1 and STRUCT_KFIFO_REC_1 can handle records of a + * length between 0 and 255 bytes. + * + * struct kfifo_rec_ptr_2 and STRUCT_KFIFO_REC_2 can handle records of a + * length between 0 and 65535 bytes. + */ + +#ifdef DYNAMIC +struct kfifo_rec_ptr_1 test; + +#else +typedef STRUCT_KFIFO_REC_1(FIFO_SIZE) mytest; + +static mytest test; +#endif + +static const char *expected_result[] = { + "a", + "bb", + "ccc", + "dddd", + "eeeee", + "ffffff", + "ggggggg", + "hhhhhhhh", + "iiiiiiiii", + "jjjjjjjjjj", +}; + +static int __init testfunc(void) +{ + char buf[100]; + unsigned int i; + unsigned int ret; + struct { unsigned char buf[6]; } hello = { "hello" }; + + printk(KERN_INFO "record fifo test start\n"); + + kfifo_in(&test, &hello, sizeof(hello)); + + /* show the size of the next record in the fifo */ + printk(KERN_INFO "fifo peek len: %u\n", kfifo_peek_len(&test)); + + /* put in variable length data */ + for (i = 0; i < 10; i++) { + memset(buf, 'a' + i, i + 1); + kfifo_in(&test, buf, i + 1); + } + + /* skip first element of the fifo */ + printk(KERN_INFO "skip 1st element\n"); + kfifo_skip(&test); + + printk(KERN_INFO "fifo len: %u\n", kfifo_len(&test)); + + /* show the first record without removing from the fifo */ + ret = kfifo_out_peek(&test, buf, sizeof(buf)); + if (ret) + printk(KERN_INFO "%.*s\n", ret, buf); + + /* check the correctness of all values in the fifo */ + i = 0; + while (!kfifo_is_empty(&test)) { + ret = kfifo_out(&test, buf, sizeof(buf)); + buf[ret] = '\0'; + 
printk(KERN_INFO "item = %.*s\n", ret, buf); + if (strcmp(buf, expected_result[i++])) { + printk(KERN_WARNING "value mismatch: test failed\n"); + return -EIO; + } + } + if (i != ARRAY_SIZE(expected_result)) { + printk(KERN_WARNING "size mismatch: test failed\n"); + return -EIO; + } + printk(KERN_INFO "test passed\n"); + + return 0; +} + +static ssize_t fifo_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + int ret; + unsigned int copied; + + if (mutex_lock_interruptible(&write_lock)) + return -ERESTARTSYS; + + ret = kfifo_from_user(&test, buf, count, &copied); + + mutex_unlock(&write_lock); + if (ret) + return ret; + + return copied; +} + +static ssize_t fifo_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + int ret; + unsigned int copied; + + if (mutex_lock_interruptible(&read_lock)) + return -ERESTARTSYS; + + ret = kfifo_to_user(&test, buf, count, &copied); + + mutex_unlock(&read_lock); + if (ret) + return ret; + + return copied; +} + +static const struct proc_ops fifo_proc_ops = { + .proc_read = fifo_read, + .proc_write = fifo_write, + .proc_lseek = noop_llseek, +}; + +static int __init example_init(void) +{ +#ifdef DYNAMIC + int ret; + + ret = kfifo_alloc(&test, FIFO_SIZE, GFP_KERNEL); + if (ret) { + printk(KERN_ERR "error kfifo_alloc\n"); + return ret; + } +#else + INIT_KFIFO(test); +#endif + if (testfunc() < 0) { +#ifdef DYNAMIC + kfifo_free(&test); +#endif + return -EIO; + } + + if (proc_create(PROC_FIFO, 0, NULL, &fifo_proc_ops) == NULL) { +#ifdef DYNAMIC + kfifo_free(&test); +#endif + return -ENOMEM; + } + return 0; +} + +static void __exit example_exit(void) +{ + remove_proc_entry(PROC_FIFO, NULL); +#ifdef DYNAMIC + kfifo_free(&test); +#endif +} + +module_init(example_init); +module_exit(example_exit); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Stefani Seibold <stefani@seibold.net>"); diff --git a/samples/kmemleak/Makefile b/samples/kmemleak/Makefile new file mode 100644 index 000000000..16b6132c5 
--- /dev/null +++ b/samples/kmemleak/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o diff --git a/samples/kmemleak/kmemleak-test.c b/samples/kmemleak/kmemleak-test.c new file mode 100644 index 000000000..7b476eb82 --- /dev/null +++ b/samples/kmemleak/kmemleak-test.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * samples/kmemleak/kmemleak-test.c + * + * Copyright (C) 2008 ARM Limited + * Written by Catalin Marinas <catalin.marinas@arm.com> + */ + +#define pr_fmt(fmt) "kmemleak: " fmt + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/list.h> +#include <linux/percpu.h> +#include <linux/fdtable.h> + +#include <linux/kmemleak.h> + +struct test_node { + long header[25]; + struct list_head list; + long footer[25]; +}; + +static LIST_HEAD(test_list); +static DEFINE_PER_CPU(void *, kmemleak_test_pointer); + +/* + * Some very simple testing. This function needs to be extended for + * proper testing. 
+ */ +static int __init kmemleak_test_init(void) +{ + struct test_node *elem; + int i; + + pr_info("Kmemleak testing\n"); + + /* make some orphan objects */ + pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); + pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); + pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); + pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); + pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); + pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); + pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); + pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); +#ifndef CONFIG_MODULES + pr_info("kmem_cache_alloc(files_cachep) = %p\n", + kmem_cache_alloc(files_cachep, GFP_KERNEL)); + pr_info("kmem_cache_alloc(files_cachep) = %p\n", + kmem_cache_alloc(files_cachep, GFP_KERNEL)); +#endif + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + + /* + * Add elements to a list. They should only appear as orphan + * after the module is removed. + */ + for (i = 0; i < 10; i++) { + elem = kzalloc(sizeof(*elem), GFP_KERNEL); + pr_info("kzalloc(sizeof(*elem)) = %p\n", elem); + if (!elem) + return -ENOMEM; + INIT_LIST_HEAD(&elem->list); + list_add_tail(&elem->list, &test_list); + } + + for_each_possible_cpu(i) { + per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL); + pr_info("kmalloc(129) = %p\n", + per_cpu(kmemleak_test_pointer, i)); + } + + return 0; +} +module_init(kmemleak_test_init); + +static void __exit kmemleak_test_exit(void) +{ + struct test_node *elem, *tmp; + + /* + * Remove the list elements without actually freeing the + * memory. 
+ */ + list_for_each_entry_safe(elem, tmp, &test_list, list) + list_del(&elem->list); +} +module_exit(kmemleak_test_exit); + +MODULE_LICENSE("GPL"); diff --git a/samples/kobject/Makefile b/samples/kobject/Makefile new file mode 100644 index 000000000..bb5d21997 --- /dev/null +++ b/samples/kobject/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_SAMPLE_KOBJECT) += kobject-example.o kset-example.o diff --git a/samples/kobject/kobject-example.c b/samples/kobject/kobject-example.c new file mode 100644 index 000000000..9e383fdba --- /dev/null +++ b/samples/kobject/kobject-example.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Sample kobject implementation + * + * Copyright (C) 2004-2007 Greg Kroah-Hartman <greg@kroah.com> + * Copyright (C) 2007 Novell Inc. + */ +#include <linux/kobject.h> +#include <linux/string.h> +#include <linux/sysfs.h> +#include <linux/module.h> +#include <linux/init.h> + +/* + * This module shows how to create a simple subdirectory in sysfs called + * /sys/kernel/kobject_example. In that directory, 3 files are created: + * "foo", "baz", and "bar". If an integer is written to these files, it can be + * later read out of it. + */ + +static int foo; +static int baz; +static int bar; + +/* + * The "foo" file where a static variable is read from and written to. + */ +static ssize_t foo_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", foo); +} + +static ssize_t foo_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int ret; + + ret = kstrtoint(buf, 10, &foo); + if (ret < 0) + return ret; + + return count; +} + +/* Sysfs attributes cannot be world-writable. */ +static struct kobj_attribute foo_attribute = + __ATTR(foo, 0664, foo_show, foo_store); + +/* + * More complex function where we determine which variable is being accessed by + * looking at the attribute for the "baz" and "bar" files.
+ */ +static ssize_t b_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + int var; + + if (strcmp(attr->attr.name, "baz") == 0) + var = baz; + else + var = bar; + return sprintf(buf, "%d\n", var); +} + +static ssize_t b_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int var, ret; + + ret = kstrtoint(buf, 10, &var); + if (ret < 0) + return ret; + + if (strcmp(attr->attr.name, "baz") == 0) + baz = var; + else + bar = var; + return count; +} + +static struct kobj_attribute baz_attribute = + __ATTR(baz, 0664, b_show, b_store); +static struct kobj_attribute bar_attribute = + __ATTR(bar, 0664, b_show, b_store); + + +/* + * Create a group of attributes so that we can create and destroy them all + * at once. + */ +static struct attribute *attrs[] = { + &foo_attribute.attr, + &baz_attribute.attr, + &bar_attribute.attr, + NULL, /* need to NULL terminate the list of attributes */ +}; + +/* + * An unnamed attribute group will put all of the attributes directly in + * the kobject directory. If we specify a name, a subdirectory will be + * created for the attributes with the directory being the name of the + * attribute group. + */ +static struct attribute_group attr_group = { + .attrs = attrs, +}; + +static struct kobject *example_kobj; + +static int __init example_init(void) +{ + int retval; + + /* + * Create a simple kobject with the name of "kobject_example", + * located under /sys/kernel/ + * + * As this is a simple directory, no uevent will be sent to + * userspace. That is why this function should not be used for + * any type of dynamic kobjects, where the name and number are + * not known ahead of time. 
+ */ + example_kobj = kobject_create_and_add("kobject_example", kernel_kobj); + if (!example_kobj) + return -ENOMEM; + + /* Create the files associated with this kobject */ + retval = sysfs_create_group(example_kobj, &attr_group); + if (retval) + kobject_put(example_kobj); + + return retval; +} + +static void __exit example_exit(void) +{ + kobject_put(example_kobj); +} + +module_init(example_init); +module_exit(example_exit); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Greg Kroah-Hartman <greg@kroah.com>"); diff --git a/samples/kobject/kset-example.c b/samples/kobject/kset-example.c new file mode 100644 index 000000000..c8010f126 --- /dev/null +++ b/samples/kobject/kset-example.c @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Sample kset and ktype implementation + * + * Copyright (C) 2004-2007 Greg Kroah-Hartman <greg@kroah.com> + * Copyright (C) 2007 Novell Inc. + */ +#include <linux/kobject.h> +#include <linux/string.h> +#include <linux/sysfs.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/init.h> + +/* + * This module shows how to create a kset in sysfs called + * /sys/kernel/kset_example + * Then three kobjects are created and assigned to this kset, "foo", "baz", + * and "bar". In those kobjects, attributes of the same name are also + * created and if an integer is written to these files, it can be later + * read out of it. + */ + + +/* + * This is our "object" that we will create a few of and register them with + * sysfs. + */ +struct foo_obj { + struct kobject kobj; + int foo; + int baz; + int bar; +}; +#define to_foo_obj(x) container_of(x, struct foo_obj, kobj) + +/* a custom attribute that works just for a struct foo_obj.
*/ +struct foo_attribute { + struct attribute attr; + ssize_t (*show)(struct foo_obj *foo, struct foo_attribute *attr, char *buf); + ssize_t (*store)(struct foo_obj *foo, struct foo_attribute *attr, const char *buf, size_t count); +}; +#define to_foo_attr(x) container_of(x, struct foo_attribute, attr) + +/* + * The default show function that must be passed to sysfs. This will be + * called by sysfs for whenever a show function is called by the user on a + * sysfs file associated with the kobjects we have registered. We need to + * transpose back from a "default" kobject to our custom struct foo_obj and + * then call the show function for that specific object. + */ +static ssize_t foo_attr_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct foo_attribute *attribute; + struct foo_obj *foo; + + attribute = to_foo_attr(attr); + foo = to_foo_obj(kobj); + + if (!attribute->show) + return -EIO; + + return attribute->show(foo, attribute, buf); +} + +/* + * Just like the default show function above, but this one is for when the + * sysfs "store" is requested (when a value is written to a file.) + */ +static ssize_t foo_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct foo_attribute *attribute; + struct foo_obj *foo; + + attribute = to_foo_attr(attr); + foo = to_foo_obj(kobj); + + if (!attribute->store) + return -EIO; + + return attribute->store(foo, attribute, buf, len); +} + +/* Our custom sysfs_ops that we will associate with our ktype later on */ +static const struct sysfs_ops foo_sysfs_ops = { + .show = foo_attr_show, + .store = foo_attr_store, +}; + +/* + * The release function for our object. This is REQUIRED by the kernel to + * have. We free the memory held in our object here. + * + * NEVER try to get away with just a "blank" release function to try to be + * smarter than the kernel. Turns out, no one ever is... 
+ */ +static void foo_release(struct kobject *kobj) +{ + struct foo_obj *foo; + + foo = to_foo_obj(kobj); + kfree(foo); +} + +/* + * The "foo" file where the .foo variable is read from and written to. + */ +static ssize_t foo_show(struct foo_obj *foo_obj, struct foo_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", foo_obj->foo); +} + +static ssize_t foo_store(struct foo_obj *foo_obj, struct foo_attribute *attr, + const char *buf, size_t count) +{ + int ret; + + ret = kstrtoint(buf, 10, &foo_obj->foo); + if (ret < 0) + return ret; + + return count; +} + +/* Sysfs attributes cannot be world-writable. */ +static struct foo_attribute foo_attribute = + __ATTR(foo, 0664, foo_show, foo_store); + +/* + * More complex function where we determine which variable is being accessed by + * looking at the attribute for the "baz" and "bar" files. + */ +static ssize_t b_show(struct foo_obj *foo_obj, struct foo_attribute *attr, + char *buf) +{ + int var; + + if (strcmp(attr->attr.name, "baz") == 0) + var = foo_obj->baz; + else + var = foo_obj->bar; + return sprintf(buf, "%d\n", var); +} + +static ssize_t b_store(struct foo_obj *foo_obj, struct foo_attribute *attr, + const char *buf, size_t count) +{ + int var, ret; + + ret = kstrtoint(buf, 10, &var); + if (ret < 0) + return ret; + + if (strcmp(attr->attr.name, "baz") == 0) + foo_obj->baz = var; + else + foo_obj->bar = var; + return count; +} + +static struct foo_attribute baz_attribute = + __ATTR(baz, 0664, b_show, b_store); +static struct foo_attribute bar_attribute = + __ATTR(bar, 0664, b_show, b_store); + +/* + * Create a group of attributes so that we can create and destroy them all + * at once. + */ +static struct attribute *foo_default_attrs[] = { + &foo_attribute.attr, + &baz_attribute.attr, + &bar_attribute.attr, + NULL, /* need to NULL terminate the list of attributes */ +}; +ATTRIBUTE_GROUPS(foo_default); + +/* + * Our own ktype for our kobjects. 
Here we specify our sysfs ops, the + * release function, and the set of default attributes we want created + * whenever a kobject of this type is registered with the kernel. + */ +static struct kobj_type foo_ktype = { + .sysfs_ops = &foo_sysfs_ops, + .release = foo_release, + .default_groups = foo_default_groups, +}; + +static struct kset *example_kset; +static struct foo_obj *foo_obj; +static struct foo_obj *bar_obj; +static struct foo_obj *baz_obj; + +static struct foo_obj *create_foo_obj(const char *name) +{ + struct foo_obj *foo; + int retval; + + /* allocate the memory for the whole object */ + foo = kzalloc(sizeof(*foo), GFP_KERNEL); + if (!foo) + return NULL; + + /* + * As we have a kset for this kobject, we need to set it before calling + * the kobject core. + */ + foo->kobj.kset = example_kset; + + /* + * Initialize and add the kobject to the kernel. All the default files + * will be created here. As we have already specified a kset for this + * kobject, we don't have to set a parent for the kobject, the kobject + * will be placed beneath that kset automatically. + */ + retval = kobject_init_and_add(&foo->kobj, &foo_ktype, NULL, "%s", name); + if (retval) { + kobject_put(&foo->kobj); + return NULL; + } + + /* + * We are always responsible for sending the uevent that the kobject + * was added to the system. 
+ */ + kobject_uevent(&foo->kobj, KOBJ_ADD); + + return foo; +} + +static void destroy_foo_obj(struct foo_obj *foo) +{ + kobject_put(&foo->kobj); +} + +static int __init example_init(void) +{ + /* + * Create a kset with the name of "kset_example", + * located under /sys/kernel/ + */ + example_kset = kset_create_and_add("kset_example", NULL, kernel_kobj); + if (!example_kset) + return -ENOMEM; + + /* + * Create three objects and register them with our kset + */ + foo_obj = create_foo_obj("foo"); + if (!foo_obj) + goto foo_error; + + bar_obj = create_foo_obj("bar"); + if (!bar_obj) + goto bar_error; + + baz_obj = create_foo_obj("baz"); + if (!baz_obj) + goto baz_error; + + return 0; + +baz_error: + destroy_foo_obj(bar_obj); +bar_error: + destroy_foo_obj(foo_obj); +foo_error: + kset_unregister(example_kset); + return -EINVAL; +} + +static void __exit example_exit(void) +{ + destroy_foo_obj(baz_obj); + destroy_foo_obj(bar_obj); + destroy_foo_obj(foo_obj); + kset_unregister(example_kset); +} + +module_init(example_init); +module_exit(example_exit); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Greg Kroah-Hartman <greg@kroah.com>"); diff --git a/samples/kprobes/Makefile b/samples/kprobes/Makefile new file mode 100644 index 000000000..e77459271 --- /dev/null +++ b/samples/kprobes/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0-only +# builds the kprobes example kernel modules; +# then to use one (as root): insmod <module_name.ko> + +obj-$(CONFIG_SAMPLE_KPROBES) += kprobe_example.o +obj-$(CONFIG_SAMPLE_KRETPROBES) += kretprobe_example.o diff --git a/samples/kprobes/kprobe_example.c b/samples/kprobes/kprobe_example.c new file mode 100644 index 000000000..365905cb2 --- /dev/null +++ b/samples/kprobes/kprobe_example.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * NOTE: This example works on x86 and powerpc.
+ * Here's a sample kernel module showing the use of kprobes to dump a + * stack trace and selected registers when kernel_clone() is called. + * + * For more information on theory of operation of kprobes, see + * Documentation/trace/kprobes.rst + * + * You will see the trace data in /var/log/messages and on the console + * whenever kernel_clone() is invoked to create a new process. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/kprobes.h> + +#define MAX_SYMBOL_LEN 64 +static char symbol[MAX_SYMBOL_LEN] = "kernel_clone"; +module_param_string(symbol, symbol, sizeof(symbol), 0644); + +/* For each probe you need to allocate a kprobe structure */ +static struct kprobe kp = { + .symbol_name = symbol, +}; + +/* kprobe pre_handler: called just before the probed instruction is executed */ +static int __kprobes handler_pre(struct kprobe *p, struct pt_regs *regs) +{ +#ifdef CONFIG_X86 + pr_info("<%s> pre_handler: p->addr = 0x%p, ip = %lx, flags = 0x%lx\n", + p->symbol_name, p->addr, regs->ip, regs->flags); +#endif +#ifdef CONFIG_PPC + pr_info("<%s> pre_handler: p->addr = 0x%p, nip = 0x%lx, msr = 0x%lx\n", + p->symbol_name, p->addr, regs->nip, regs->msr); +#endif +#ifdef CONFIG_MIPS + pr_info("<%s> pre_handler: p->addr = 0x%p, epc = 0x%lx, status = 0x%lx\n", + p->symbol_name, p->addr, regs->cp0_epc, regs->cp0_status); +#endif +#ifdef CONFIG_ARM64 + pr_info("<%s> pre_handler: p->addr = 0x%p, pc = 0x%lx," + " pstate = 0x%lx\n", + p->symbol_name, p->addr, (long)regs->pc, (long)regs->pstate); +#endif +#ifdef CONFIG_S390 + pr_info("<%s> pre_handler: p->addr, 0x%p, ip = 0x%lx, flags = 0x%lx\n", + p->symbol_name, p->addr, regs->psw.addr, regs->flags); +#endif + + /* A dump_stack() here will give a stack backtrace */ + return 0; +} + +/* kprobe post_handler: called after the probed instruction is executed */ +static void __kprobes handler_post(struct kprobe *p, struct pt_regs *regs, + unsigned long flags) +{ +#ifdef CONFIG_X86 + pr_info("<%s> 
post_handler: p->addr = 0x%p, flags = 0x%lx\n", + p->symbol_name, p->addr, regs->flags); +#endif +#ifdef CONFIG_PPC + pr_info("<%s> post_handler: p->addr = 0x%p, msr = 0x%lx\n", + p->symbol_name, p->addr, regs->msr); +#endif +#ifdef CONFIG_MIPS + pr_info("<%s> post_handler: p->addr = 0x%p, status = 0x%lx\n", + p->symbol_name, p->addr, regs->cp0_status); +#endif +#ifdef CONFIG_ARM64 + pr_info("<%s> post_handler: p->addr = 0x%p, pstate = 0x%lx\n", + p->symbol_name, p->addr, (long)regs->pstate); +#endif +#ifdef CONFIG_S390 + pr_info("<%s> post_handler: p->addr, 0x%p, flags = 0x%lx\n", + p->symbol_name, p->addr, regs->flags); +#endif +} + +/* + * fault_handler: this is called if an exception is generated for any + * instruction within the pre- or post-handler, or when Kprobes + * single-steps the probed instruction. + */ +static int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr) +{ + pr_info("fault_handler: p->addr = 0x%p, trap #%d\n", p->addr, trapnr); + /* Return 0 because we don't handle the fault.
*/ + return 0; +} +/* NOKPROBE_SYMBOL() is also available */ +NOKPROBE_SYMBOL(handler_fault); + +static int __init kprobe_init(void) +{ + int ret; + kp.pre_handler = handler_pre; + kp.post_handler = handler_post; + kp.fault_handler = handler_fault; + + ret = register_kprobe(&kp); + if (ret < 0) { + pr_err("register_kprobe failed, returned %d\n", ret); + return ret; + } + pr_info("Planted kprobe at %p\n", kp.addr); + return 0; +} + +static void __exit kprobe_exit(void) +{ + unregister_kprobe(&kp); + pr_info("kprobe at %p unregistered\n", kp.addr); +} + +module_init(kprobe_init) +module_exit(kprobe_exit) +MODULE_LICENSE("GPL"); diff --git a/samples/kprobes/kretprobe_example.c b/samples/kprobes/kretprobe_example.c new file mode 100644 index 000000000..228321ecb --- /dev/null +++ b/samples/kprobes/kretprobe_example.c @@ -0,0 +1,108 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kretprobe_example.c + * + * Here's a sample kernel module showing the use of return probes to + * report the return value and total time taken for probed function + * to run. + * + * usage: insmod kretprobe_example.ko func=<func_name> + * + * If no func_name is specified, kernel_clone is instrumented + * + * For more information on theory of operation of kretprobes, see + * Documentation/trace/kprobes.rst + * + * Build and insert the kernel module as done in the kprobe example. + * You will see the trace data in /var/log/messages and on the console + * whenever the probed function returns. (Some messages may be suppressed + * if syslogd is configured to eliminate duplicate messages.) 
+ */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/kprobes.h> +#include <linux/ktime.h> +#include <linux/limits.h> +#include <linux/sched.h> + +static char func_name[NAME_MAX] = "kernel_clone"; +module_param_string(func, func_name, NAME_MAX, S_IRUGO); +MODULE_PARM_DESC(func, "Function to kretprobe; this module will report the" + " function's execution time"); + +/* per-instance private data */ +struct my_data { + ktime_t entry_stamp; +}; + +/* Here we use the entry_handler to timestamp function entry */ +static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + struct my_data *data; + + if (!current->mm) + return 1; /* Skip kernel threads */ + + data = (struct my_data *)ri->data; + data->entry_stamp = ktime_get(); + return 0; +} +NOKPROBE_SYMBOL(entry_handler); + +/* + * Return-probe handler: Log the return value and duration. Duration may turn + * out to be zero consistently, depending upon the granularity of time + * accounting on the platform. + */ +static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + unsigned long retval = regs_return_value(regs); + struct my_data *data = (struct my_data *)ri->data; + s64 delta; + ktime_t now; + + now = ktime_get(); + delta = ktime_to_ns(ktime_sub(now, data->entry_stamp)); + pr_info("%s returned %lu and took %lld ns to execute\n", + func_name, retval, (long long)delta); + return 0; +} +NOKPROBE_SYMBOL(ret_handler); + +static struct kretprobe my_kretprobe = { + .handler = ret_handler, + .entry_handler = entry_handler, + .data_size = sizeof(struct my_data), + /* Probe up to 20 instances concurrently.
*/ + .maxactive = 20, +}; + +static int __init kretprobe_init(void) +{ + int ret; + + my_kretprobe.kp.symbol_name = func_name; + ret = register_kretprobe(&my_kretprobe); + if (ret < 0) { + pr_err("register_kretprobe failed, returned %d\n", ret); + return ret; + } + pr_info("Planted return probe at %s: %p\n", + my_kretprobe.kp.symbol_name, my_kretprobe.kp.addr); + return 0; +} + +static void __exit kretprobe_exit(void) +{ + unregister_kretprobe(&my_kretprobe); + pr_info("kretprobe at %p unregistered\n", my_kretprobe.kp.addr); + + /* nmissed > 0 suggests that maxactive was set too low. */ + pr_info("Missed probing %d instances of %s\n", + my_kretprobe.nmissed, my_kretprobe.kp.symbol_name); +} + +module_init(kretprobe_init) +module_exit(kretprobe_exit) +MODULE_LICENSE("GPL"); diff --git a/samples/livepatch/Makefile b/samples/livepatch/Makefile new file mode 100644 index 000000000..9f853eeb6 --- /dev/null +++ b/samples/livepatch/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-sample.o +obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-mod.o +obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-fix1.o +obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-fix2.o +obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-demo.o +obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-mod.o +obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-busymod.o diff --git a/samples/livepatch/livepatch-callbacks-busymod.c b/samples/livepatch/livepatch-callbacks-busymod.c new file mode 100644 index 000000000..378e2d402 --- /dev/null +++ b/samples/livepatch/livepatch-callbacks-busymod.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> + */ + +/* + * livepatch-callbacks-busymod.c - (un)patching callbacks demo support module + * + * + * Purpose + * ------- + * + * Simple module to demonstrate livepatch (un)patching callbacks. 
+ * + * + * Usage + * ----- + * + * This module is not intended to be standalone. See the "Usage" + * section of livepatch-callbacks-mod.c. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/workqueue.h> +#include <linux/delay.h> + +static int sleep_secs; +module_param(sleep_secs, int, 0644); +MODULE_PARM_DESC(sleep_secs, "sleep_secs (default=0)"); + +static void busymod_work_func(struct work_struct *work); +static DECLARE_DELAYED_WORK(work, busymod_work_func); + +static void busymod_work_func(struct work_struct *work) +{ + pr_info("%s, sleeping %d seconds ...\n", __func__, sleep_secs); + msleep(sleep_secs * 1000); + pr_info("%s exit\n", __func__); +} + +static int livepatch_callbacks_mod_init(void) +{ + pr_info("%s\n", __func__); + schedule_delayed_work(&work, + msecs_to_jiffies(1000 * 0)); + return 0; +} + +static void livepatch_callbacks_mod_exit(void) +{ + cancel_delayed_work_sync(&work); + pr_info("%s\n", __func__); +} + +module_init(livepatch_callbacks_mod_init); +module_exit(livepatch_callbacks_mod_exit); +MODULE_LICENSE("GPL"); diff --git a/samples/livepatch/livepatch-callbacks-demo.c b/samples/livepatch/livepatch-callbacks-demo.c new file mode 100644 index 000000000..11c3f4357 --- /dev/null +++ b/samples/livepatch/livepatch-callbacks-demo.c @@ -0,0 +1,196 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> + */ + +/* + * livepatch-callbacks-demo.c - (un)patching callbacks livepatch demo + * + * + * Purpose + * ------- + * + * Demonstration of registering livepatch (un)patching callbacks. 
+ * + * + * Usage + * ----- + * + * Step 1 - load the simple module + * + * insmod samples/livepatch/livepatch-callbacks-mod.ko + * + * + * Step 2 - load the demonstration livepatch (with callbacks) + * + * insmod samples/livepatch/livepatch-callbacks-demo.ko + * + * + * Step 3 - cleanup + * + * echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled + * rmmod livepatch_callbacks_demo + * rmmod livepatch_callbacks_mod + * + * Watch dmesg output to see livepatch enablement, callback execution + * and patching operations for both vmlinux and module targets. + * + * NOTE: swap the insmod order of livepatch-callbacks-mod.ko and + * livepatch-callbacks-demo.ko to observe what happens when a + * target module is loaded after a livepatch with callbacks. + * + * NOTE: 'pre_patch_ret' is a module parameter that sets the pre-patch + * callback return status. Try setting up a non-zero status + * such as -19 (-ENODEV): + * + * # Load demo livepatch, vmlinux is patched + * insmod samples/livepatch/livepatch-callbacks-demo.ko + * + * # Setup next pre-patch callback to return -ENODEV + * echo -19 > /sys/module/livepatch_callbacks_demo/parameters/pre_patch_ret + * + * # Module loader refuses to load the target module + * insmod samples/livepatch/livepatch-callbacks-mod.ko + * insmod: ERROR: could not insert module samples/livepatch/livepatch-callbacks-mod.ko: No such device + * + * NOTE: There is a second target module, + * livepatch-callbacks-busymod.ko, available for experimenting + * with livepatch (un)patch callbacks. This module contains + * a 'sleep_secs' parameter that parks the module on one of the + * functions that the livepatch demo module wants to patch. 
+ * Modifying this value and tweaking the order of module loads can + * effectively demonstrate stalled patch transitions: + * + * # Load a target module, let it park on 'busymod_work_func' for + * # thirty seconds + * insmod samples/livepatch/livepatch-callbacks-busymod.ko sleep_secs=30 + * + * # Meanwhile load the livepatch + * insmod samples/livepatch/livepatch-callbacks-demo.ko + * + * # ... then load and unload another target module while the + * # transition is in progress + * insmod samples/livepatch/livepatch-callbacks-mod.ko + * rmmod samples/livepatch/livepatch-callbacks-mod.ko + * + * # Finally cleanup + * echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled + * rmmod samples/livepatch/livepatch-callbacks-demo.ko + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/livepatch.h> + +static int pre_patch_ret; +module_param(pre_patch_ret, int, 0644); +MODULE_PARM_DESC(pre_patch_ret, "pre_patch_ret (default=0)"); + +static const char *const module_state[] = { + [MODULE_STATE_LIVE] = "[MODULE_STATE_LIVE] Normal state", + [MODULE_STATE_COMING] = "[MODULE_STATE_COMING] Full formed, running module_init", + [MODULE_STATE_GOING] = "[MODULE_STATE_GOING] Going away", + [MODULE_STATE_UNFORMED] = "[MODULE_STATE_UNFORMED] Still setting it up", +}; + +static void callback_info(const char *callback, struct klp_object *obj) +{ + if (obj->mod) + pr_info("%s: %s -> %s\n", callback, obj->mod->name, + module_state[obj->mod->state]); + else + pr_info("%s: vmlinux\n", callback); +} + +/* Executed on object patching (ie, patch enablement) */ +static int pre_patch_callback(struct klp_object *obj) +{ + callback_info(__func__, obj); + return pre_patch_ret; +} + +/* Executed on object patching (ie, patch enablement) */ +static void post_patch_callback(struct klp_object *obj) +{ + callback_info(__func__, obj); +} + +/* Executed on object unpatching (ie, patch disablement) */ +static void
pre_unpatch_callback(struct klp_object *obj) +{ + callback_info(__func__, obj); +} + +/* Executed on object unpatching (ie, patch disablement) */ +static void post_unpatch_callback(struct klp_object *obj) +{ + callback_info(__func__, obj); +} + +static void patched_work_func(struct work_struct *work) +{ + pr_info("%s\n", __func__); +} + +static struct klp_func no_funcs[] = { + { } +}; + +static struct klp_func busymod_funcs[] = { + { + .old_name = "busymod_work_func", + .new_func = patched_work_func, + }, { } +}; + +static struct klp_object objs[] = { + { + .name = NULL, /* vmlinux */ + .funcs = no_funcs, + .callbacks = { + .pre_patch = pre_patch_callback, + .post_patch = post_patch_callback, + .pre_unpatch = pre_unpatch_callback, + .post_unpatch = post_unpatch_callback, + }, + }, { + .name = "livepatch_callbacks_mod", + .funcs = no_funcs, + .callbacks = { + .pre_patch = pre_patch_callback, + .post_patch = post_patch_callback, + .pre_unpatch = pre_unpatch_callback, + .post_unpatch = post_unpatch_callback, + }, + }, { + .name = "livepatch_callbacks_busymod", + .funcs = busymod_funcs, + .callbacks = { + .pre_patch = pre_patch_callback, + .post_patch = post_patch_callback, + .pre_unpatch = pre_unpatch_callback, + .post_unpatch = post_unpatch_callback, + }, + }, { } +}; + +static struct klp_patch patch = { + .mod = THIS_MODULE, + .objs = objs, +}; + +static int livepatch_callbacks_demo_init(void) +{ + return klp_enable_patch(&patch); +} + +static void livepatch_callbacks_demo_exit(void) +{ +} + +module_init(livepatch_callbacks_demo_init); +module_exit(livepatch_callbacks_demo_exit); +MODULE_LICENSE("GPL"); +MODULE_INFO(livepatch, "Y"); diff --git a/samples/livepatch/livepatch-callbacks-mod.c b/samples/livepatch/livepatch-callbacks-mod.c new file mode 100644 index 000000000..2a074f422 --- /dev/null +++ b/samples/livepatch/livepatch-callbacks-mod.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017 Joe Lawrence 
<joe.lawrence@redhat.com> + */ + +/* + * livepatch-callbacks-mod.c - (un)patching callbacks demo support module + * + * + * Purpose + * ------- + * + * Simple module to demonstrate livepatch (un)patching callbacks. + * + * + * Usage + * ----- + * + * This module is not intended to be standalone. See the "Usage" + * section of livepatch-callbacks-demo.c. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> + +static int livepatch_callbacks_mod_init(void) +{ + pr_info("%s\n", __func__); + return 0; +} + +static void livepatch_callbacks_mod_exit(void) +{ + pr_info("%s\n", __func__); +} + +module_init(livepatch_callbacks_mod_init); +module_exit(livepatch_callbacks_mod_exit); +MODULE_LICENSE("GPL"); diff --git a/samples/livepatch/livepatch-sample.c b/samples/livepatch/livepatch-sample.c new file mode 100644 index 000000000..cd76d7ebe --- /dev/null +++ b/samples/livepatch/livepatch-sample.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * livepatch-sample.c - Kernel Live Patching Sample Module + * + * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/livepatch.h> + +/* + * This (dumb) live patch overrides the function that prints the + * kernel boot cmdline when /proc/cmdline is read. 
+ * + * Example: + * + * $ cat /proc/cmdline + * <your cmdline> + * + * $ insmod livepatch-sample.ko + * $ cat /proc/cmdline + * this has been live patched + * + * $ echo 0 > /sys/kernel/livepatch/livepatch_sample/enabled + * $ cat /proc/cmdline + * <your cmdline> + */ + +#include <linux/seq_file.h> +static int livepatch_cmdline_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%s\n", "this has been live patched"); + return 0; +} + +static struct klp_func funcs[] = { + { + .old_name = "cmdline_proc_show", + .new_func = livepatch_cmdline_proc_show, + }, { } +}; + +static struct klp_object objs[] = { + { + /* name being NULL means vmlinux */ + .funcs = funcs, + }, { } +}; + +static struct klp_patch patch = { + .mod = THIS_MODULE, + .objs = objs, +}; + +static int livepatch_init(void) +{ + return klp_enable_patch(&patch); +} + +static void livepatch_exit(void) +{ +} + +module_init(livepatch_init); +module_exit(livepatch_exit); +MODULE_LICENSE("GPL"); +MODULE_INFO(livepatch, "Y"); diff --git a/samples/livepatch/livepatch-shadow-fix1.c b/samples/livepatch/livepatch-shadow-fix1.c new file mode 100644 index 000000000..918ce17b4 --- /dev/null +++ b/samples/livepatch/livepatch-shadow-fix1.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> + */ + +/* + * livepatch-shadow-fix1.c - Shadow variables, livepatch demo + * + * Purpose + * ------- + * + * Fixes the memory leak introduced in livepatch-shadow-mod through the + * use of a shadow variable. This fix demonstrates the "extending" of + * short-lived data structures by patching its allocation and release + * functions. + * + * + * Usage + * ----- + * + * This module is not intended to be standalone. See the "Usage" + * section of livepatch-shadow-mod.c. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/livepatch.h> +#include <linux/slab.h> + +/* Shadow variable enums */ +#define SV_LEAK 1 + +/* Allocate new dummies every second */ +#define ALLOC_PERIOD 1 +/* Check for expired dummies after a few new ones have been allocated */ +#define CLEANUP_PERIOD (3 * ALLOC_PERIOD) +/* Dummies expire after a few cleanup instances */ +#define EXPIRE_PERIOD (4 * CLEANUP_PERIOD) + +struct dummy { + struct list_head list; + unsigned long jiffies_expire; +}; + +/* + * The constructor makes more sense together with klp_shadow_get_or_alloc(). + * In this example, it would be safe to assign the pointer also to the shadow + * variable returned by klp_shadow_alloc(). But we wanted to show the more + * complicated use of the API. + */ +static int shadow_leak_ctor(void *obj, void *shadow_data, void *ctor_data) +{ + int **shadow_leak = shadow_data; + int **leak = ctor_data; + + if (!ctor_data) + return -EINVAL; + + *shadow_leak = *leak; + return 0; +} + +static struct dummy *livepatch_fix1_dummy_alloc(void) +{ + struct dummy *d; + int *leak; + int **shadow_leak; + + d = kzalloc(sizeof(*d), GFP_KERNEL); + if (!d) + return NULL; + + d->jiffies_expire = jiffies + + msecs_to_jiffies(1000 * EXPIRE_PERIOD); + + /* + * Patch: save the extra memory location into a SV_LEAK shadow + * variable. A patched dummy_free routine can later fetch this + * pointer to handle resource release. 
+ */ + leak = kzalloc(sizeof(*leak), GFP_KERNEL); + if (!leak) + goto err_leak; + + shadow_leak = klp_shadow_alloc(d, SV_LEAK, sizeof(leak), GFP_KERNEL, + shadow_leak_ctor, &leak); + if (!shadow_leak) { + pr_err("%s: failed to allocate shadow variable for the leaking pointer: dummy @ %p, leak @ %p\n", + __func__, d, leak); + goto err_shadow; + } + + pr_info("%s: dummy @ %p, expires @ %lx\n", + __func__, d, d->jiffies_expire); + + return d; + +err_shadow: + kfree(leak); +err_leak: + kfree(d); + return NULL; +} + +static void livepatch_fix1_dummy_leak_dtor(void *obj, void *shadow_data) +{ + void *d = obj; + int **shadow_leak = shadow_data; + + kfree(*shadow_leak); + pr_info("%s: dummy @ %p, prevented leak @ %p\n", + __func__, d, *shadow_leak); +} + +static void livepatch_fix1_dummy_free(struct dummy *d) +{ + int **shadow_leak; + + /* + * Patch: fetch the saved SV_LEAK shadow variable, detach and + * free it. Note: handle cases where this shadow variable does + * not exist (ie, dummy structures allocated before this livepatch + * was loaded.) 
+ */ + shadow_leak = klp_shadow_get(d, SV_LEAK); + if (shadow_leak) + klp_shadow_free(d, SV_LEAK, livepatch_fix1_dummy_leak_dtor); + else + pr_info("%s: dummy @ %p leaked!\n", __func__, d); + + kfree(d); +} + +static struct klp_func funcs[] = { + { + .old_name = "dummy_alloc", + .new_func = livepatch_fix1_dummy_alloc, + }, + { + .old_name = "dummy_free", + .new_func = livepatch_fix1_dummy_free, + }, { } +}; + +static struct klp_object objs[] = { + { + .name = "livepatch_shadow_mod", + .funcs = funcs, + }, { } +}; + +static struct klp_patch patch = { + .mod = THIS_MODULE, + .objs = objs, +}; + +static int livepatch_shadow_fix1_init(void) +{ + return klp_enable_patch(&patch); +} + +static void livepatch_shadow_fix1_exit(void) +{ + /* Cleanup any existing SV_LEAK shadow variables */ + klp_shadow_free_all(SV_LEAK, livepatch_fix1_dummy_leak_dtor); +} + +module_init(livepatch_shadow_fix1_init); +module_exit(livepatch_shadow_fix1_exit); +MODULE_LICENSE("GPL"); +MODULE_INFO(livepatch, "Y"); diff --git a/samples/livepatch/livepatch-shadow-fix2.c b/samples/livepatch/livepatch-shadow-fix2.c new file mode 100644 index 000000000..29fe5cd42 --- /dev/null +++ b/samples/livepatch/livepatch-shadow-fix2.c @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> + */ + +/* + * livepatch-shadow-fix2.c - Shadow variables, livepatch demo + * + * Purpose + * ------- + * + * Adds functionality to livepatch-shadow-mod's in-flight data + * structures through a shadow variable. The livepatch patches a + * routine that periodically inspects data structures, incrementing a + * per-data-structure counter, creating the counter if needed. + * + * + * Usage + * ----- + * + * This module is not intended to be standalone. See the "Usage" + * section of livepatch-shadow-mod.c. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/livepatch.h> +#include <linux/slab.h> + +/* Shadow variable enums */ +#define SV_LEAK 1 +#define SV_COUNTER 2 + +struct dummy { + struct list_head list; + unsigned long jiffies_expire; +}; + +static bool livepatch_fix2_dummy_check(struct dummy *d, unsigned long jiffies) +{ + int *shadow_count; + + /* + * Patch: handle in-flight dummy structures, if they do not + * already have a SV_COUNTER shadow variable, then attach a + * new one. + */ + shadow_count = klp_shadow_get_or_alloc(d, SV_COUNTER, + sizeof(*shadow_count), GFP_NOWAIT, + NULL, NULL); + if (shadow_count) + *shadow_count += 1; + + return time_after(jiffies, d->jiffies_expire); +} + +static void livepatch_fix2_dummy_leak_dtor(void *obj, void *shadow_data) +{ + void *d = obj; + int **shadow_leak = shadow_data; + + kfree(*shadow_leak); + pr_info("%s: dummy @ %p, prevented leak @ %p\n", + __func__, d, *shadow_leak); +} + +static void livepatch_fix2_dummy_free(struct dummy *d) +{ + int **shadow_leak; + int *shadow_count; + + /* Patch: copy the memory leak patch from the fix1 module. */ + shadow_leak = klp_shadow_get(d, SV_LEAK); + if (shadow_leak) + klp_shadow_free(d, SV_LEAK, livepatch_fix2_dummy_leak_dtor); + else + pr_info("%s: dummy @ %p leaked!\n", __func__, d); + + /* + * Patch: fetch the SV_COUNTER shadow variable and display + * the final count. Detach the shadow variable. 
+ */ + shadow_count = klp_shadow_get(d, SV_COUNTER); + if (shadow_count) { + pr_info("%s: dummy @ %p, check counter = %d\n", + __func__, d, *shadow_count); + klp_shadow_free(d, SV_COUNTER, NULL); + } + + kfree(d); +} + +static struct klp_func funcs[] = { + { + .old_name = "dummy_check", + .new_func = livepatch_fix2_dummy_check, + }, + { + .old_name = "dummy_free", + .new_func = livepatch_fix2_dummy_free, + }, { } +}; + +static struct klp_object objs[] = { + { + .name = "livepatch_shadow_mod", + .funcs = funcs, + }, { } +}; + +static struct klp_patch patch = { + .mod = THIS_MODULE, + .objs = objs, +}; + +static int livepatch_shadow_fix2_init(void) +{ + return klp_enable_patch(&patch); +} + +static void livepatch_shadow_fix2_exit(void) +{ + /* Cleanup any existing SV_COUNTER shadow variables */ + klp_shadow_free_all(SV_COUNTER, NULL); +} + +module_init(livepatch_shadow_fix2_init); +module_exit(livepatch_shadow_fix2_exit); +MODULE_LICENSE("GPL"); +MODULE_INFO(livepatch, "Y"); diff --git a/samples/livepatch/livepatch-shadow-mod.c b/samples/livepatch/livepatch-shadow-mod.c new file mode 100644 index 000000000..7e753b0d2 --- /dev/null +++ b/samples/livepatch/livepatch-shadow-mod.c @@ -0,0 +1,217 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> + */ + +/* + * livepatch-shadow-mod.c - Shadow variables, buggy module demo + * + * Purpose + * ------- + * + * As a demonstration of livepatch shadow variable API, this module + * introduces memory leak behavior that livepatch modules + * livepatch-shadow-fix1.ko and livepatch-shadow-fix2.ko correct and + * enhance. + * + * WARNING - even though the livepatch-shadow-fix modules patch the + * memory leak, please load these modules at your own risk -- some + * amount of memory may leaked before the bug is patched. 
+ * + * + * Usage + * ----- + * + * Step 1 - Load the buggy demonstration module: + * + * insmod samples/livepatch/livepatch-shadow-mod.ko + * + * Watch dmesg output for a few moments to see new dummy being allocated + * and a periodic cleanup check. (Note: a small amount of memory is + * being leaked.) + * + * + * Step 2 - Load livepatch fix1: + * + * insmod samples/livepatch/livepatch-shadow-fix1.ko + * + * Continue watching dmesg and note that now livepatch_fix1_dummy_free() + * and livepatch_fix1_dummy_alloc() are logging messages about leaked + * memory and eventually leaks prevented. + * + * + * Step 3 - Load livepatch fix2 (on top of fix1): + * + * insmod samples/livepatch/livepatch-shadow-fix2.ko + * + * This module extends functionality through shadow variables, as a new + * "check" counter is added to the dummy structure. Periodic dmesg + * messages will log these as dummies are cleaned up. + * + * + * Step 4 - Cleanup + * + * Unwind the demonstration by disabling the livepatch fix modules, then + * removing them and the demo module: + * + * echo 0 > /sys/kernel/livepatch/livepatch_shadow_fix2/enabled + * echo 0 > /sys/kernel/livepatch/livepatch_shadow_fix1/enabled + * rmmod livepatch-shadow-fix2 + * rmmod livepatch-shadow-fix1 + * rmmod livepatch-shadow-mod + */ + + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/stat.h> +#include <linux/workqueue.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Joe Lawrence <joe.lawrence@redhat.com>"); +MODULE_DESCRIPTION("Buggy module for shadow variable demo"); + +/* Allocate new dummies every second */ +#define ALLOC_PERIOD 1 +/* Check for expired dummies after a few new ones have been allocated */ +#define CLEANUP_PERIOD (3 * ALLOC_PERIOD) +/* Dummies expire after a few cleanup instances */ +#define EXPIRE_PERIOD (4 * CLEANUP_PERIOD) + +/* + * Keep a list of all the dummies so we can clean up any residual ones + * on module exit + */ +static 
LIST_HEAD(dummy_list); +static DEFINE_MUTEX(dummy_list_mutex); + +struct dummy { + struct list_head list; + unsigned long jiffies_expire; +}; + +static __used noinline struct dummy *dummy_alloc(void) +{ + struct dummy *d; + int *leak; + + d = kzalloc(sizeof(*d), GFP_KERNEL); + if (!d) + return NULL; + + d->jiffies_expire = jiffies + + msecs_to_jiffies(1000 * EXPIRE_PERIOD); + + /* Oops, forgot to save leak! */ + leak = kzalloc(sizeof(*leak), GFP_KERNEL); + if (!leak) { + kfree(d); + return NULL; + } + + pr_info("%s: dummy @ %p, expires @ %lx\n", + __func__, d, d->jiffies_expire); + + return d; +} + +static __used noinline void dummy_free(struct dummy *d) +{ + pr_info("%s: dummy @ %p, expired = %lx\n", + __func__, d, d->jiffies_expire); + + kfree(d); +} + +static __used noinline bool dummy_check(struct dummy *d, + unsigned long jiffies) +{ + return time_after(jiffies, d->jiffies_expire); +} + +/* + * alloc_work_func: allocates new dummy structures, allocates additional + * memory, aptly named "leak", but doesn't keep + * permanent record of it. + */ + +static void alloc_work_func(struct work_struct *work); +static DECLARE_DELAYED_WORK(alloc_dwork, alloc_work_func); + +static void alloc_work_func(struct work_struct *work) +{ + struct dummy *d; + + d = dummy_alloc(); + if (!d) + return; + + mutex_lock(&dummy_list_mutex); + list_add(&d->list, &dummy_list); + mutex_unlock(&dummy_list_mutex); + + schedule_delayed_work(&alloc_dwork, + msecs_to_jiffies(1000 * ALLOC_PERIOD)); +} + +/* + * cleanup_work_func: frees dummy structures. Without knownledge of + * "leak", it leaks the additional memory that + * alloc_work_func created. 
+ */ + +static void cleanup_work_func(struct work_struct *work); +static DECLARE_DELAYED_WORK(cleanup_dwork, cleanup_work_func); + +static void cleanup_work_func(struct work_struct *work) +{ + struct dummy *d, *tmp; + unsigned long j; + + j = jiffies; + pr_info("%s: jiffies = %lx\n", __func__, j); + + mutex_lock(&dummy_list_mutex); + list_for_each_entry_safe(d, tmp, &dummy_list, list) { + + /* Kick out and free any expired dummies */ + if (dummy_check(d, j)) { + list_del(&d->list); + dummy_free(d); + } + } + mutex_unlock(&dummy_list_mutex); + + schedule_delayed_work(&cleanup_dwork, + msecs_to_jiffies(1000 * CLEANUP_PERIOD)); +} + +static int livepatch_shadow_mod_init(void) +{ + schedule_delayed_work(&alloc_dwork, + msecs_to_jiffies(1000 * ALLOC_PERIOD)); + schedule_delayed_work(&cleanup_dwork, + msecs_to_jiffies(1000 * CLEANUP_PERIOD)); + + return 0; +} + +static void livepatch_shadow_mod_exit(void) +{ + struct dummy *d, *tmp; + + /* Wait for any dummies at work */ + cancel_delayed_work_sync(&alloc_dwork); + cancel_delayed_work_sync(&cleanup_dwork); + + /* Cleanup residual dummies */ + list_for_each_entry_safe(d, tmp, &dummy_list, list) { + list_del(&d->list); + dummy_free(d); + } +} + +module_init(livepatch_shadow_mod_init); +module_exit(livepatch_shadow_mod_exit); diff --git a/samples/mei/.gitignore b/samples/mei/.gitignore new file mode 100644 index 000000000..db5e802f0 --- /dev/null +++ b/samples/mei/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +mei-amt-version diff --git a/samples/mei/Makefile b/samples/mei/Makefile new file mode 100644 index 000000000..c54b8a0ab --- /dev/null +++ b/samples/mei/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2012-2019, Intel Corporation. All rights reserved. 
+userprogs-always-y += mei-amt-version + +userccflags += -I usr/include diff --git a/samples/mei/mei-amt-version.c b/samples/mei/mei-amt-version.c new file mode 100644 index 000000000..ad3e56042 --- /dev/null +++ b/samples/mei/mei-amt-version.c @@ -0,0 +1,479 @@ +/****************************************************************************** + * Intel Management Engine Interface (Intel MEI) Linux driver + * Intel MEI Interface Header + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2012 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110, + * USA + * + * The full GNU General Public License is included in this distribution + * in the file called LICENSE.GPL. + * + * Contact Information: + * Intel Corporation. + * linux-mei@linux.intel.com + * http://www.intel.com + * + * BSD LICENSE + * + * Copyright(c) 2003 - 2012 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + *****************************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <fcntl.h> +#include <sys/ioctl.h> +#include <unistd.h> +#include <errno.h> +#include <stdint.h> +#include <stdbool.h> +#include <bits/wordsize.h> +#include <linux/mei.h> + +/***************************************************************************** + * Intel Management Engine Interface + *****************************************************************************/ + +#define mei_msg(_me, fmt, ARGS...) do { \ + if (_me->verbose) \ + fprintf(stderr, fmt, ##ARGS); \ +} while (0) + +#define mei_err(_me, fmt, ARGS...) 
do { \ + fprintf(stderr, "Error: " fmt, ##ARGS); \ +} while (0) + +struct mei { + uuid_le guid; + bool initialized; + bool verbose; + unsigned int buf_size; + unsigned char prot_ver; + int fd; +}; + +static void mei_deinit(struct mei *cl) +{ + if (cl->fd != -1) + close(cl->fd); + cl->fd = -1; + cl->buf_size = 0; + cl->prot_ver = 0; + cl->initialized = false; +} + +static bool mei_init(struct mei *me, const uuid_le *guid, + unsigned char req_protocol_version, bool verbose) +{ + int result; + struct mei_client *cl; + struct mei_connect_client_data data; + + me->verbose = verbose; + + me->fd = open("/dev/mei0", O_RDWR); + if (me->fd == -1) { + mei_err(me, "Cannot establish a handle to the Intel MEI driver\n"); + goto err; + } + memcpy(&me->guid, guid, sizeof(*guid)); + memset(&data, 0, sizeof(data)); + me->initialized = true; + + memcpy(&data.in_client_uuid, &me->guid, sizeof(me->guid)); + result = ioctl(me->fd, IOCTL_MEI_CONNECT_CLIENT, &data); + if (result) { + mei_err(me, "IOCTL_MEI_CONNECT_CLIENT receive message. 
err=%d\n", result); + goto err; + } + cl = &data.out_client_properties; + mei_msg(me, "max_message_length %d\n", cl->max_msg_length); + mei_msg(me, "protocol_version %d\n", cl->protocol_version); + + if ((req_protocol_version > 0) && + (cl->protocol_version != req_protocol_version)) { + mei_err(me, "Intel MEI protocol version not supported\n"); + goto err; + } + + me->buf_size = cl->max_msg_length; + me->prot_ver = cl->protocol_version; + + return true; +err: + mei_deinit(me); + return false; +} + +static ssize_t mei_recv_msg(struct mei *me, unsigned char *buffer, + ssize_t len, unsigned long timeout) +{ + ssize_t rc; + + mei_msg(me, "call read length = %zd\n", len); + + rc = read(me->fd, buffer, len); + if (rc < 0) { + mei_err(me, "read failed with status %zd %s\n", + rc, strerror(errno)); + mei_deinit(me); + } else { + mei_msg(me, "read succeeded with result %zd\n", rc); + } + return rc; +} + +static ssize_t mei_send_msg(struct mei *me, const unsigned char *buffer, + ssize_t len, unsigned long timeout) +{ + struct timeval tv; + ssize_t written; + ssize_t rc; + fd_set set; + + tv.tv_sec = timeout / 1000; + tv.tv_usec = (timeout % 1000) * 1000000; + + mei_msg(me, "call write length = %zd\n", len); + + written = write(me->fd, buffer, len); + if (written < 0) { + rc = -errno; + mei_err(me, "write failed with status %zd %s\n", + written, strerror(errno)); + goto out; + } + + FD_ZERO(&set); + FD_SET(me->fd, &set); + rc = select(me->fd + 1 , &set, NULL, NULL, &tv); + if (rc > 0 && FD_ISSET(me->fd, &set)) { + mei_msg(me, "write success\n"); + } else if (rc == 0) { + mei_err(me, "write failed on timeout with status\n"); + goto out; + } else { /* rc < 0 */ + mei_err(me, "write failed on select with status %zd\n", rc); + goto out; + } + + rc = written; +out: + if (rc < 0) + mei_deinit(me); + + return rc; +} + +/*************************************************************************** + * Intel Advanced Management Technology ME Client + 
***************************************************************************/ + +#define AMT_MAJOR_VERSION 1 +#define AMT_MINOR_VERSION 1 + +#define AMT_STATUS_SUCCESS 0x0 +#define AMT_STATUS_INTERNAL_ERROR 0x1 +#define AMT_STATUS_NOT_READY 0x2 +#define AMT_STATUS_INVALID_AMT_MODE 0x3 +#define AMT_STATUS_INVALID_MESSAGE_LENGTH 0x4 + +#define AMT_STATUS_HOST_IF_EMPTY_RESPONSE 0x4000 +#define AMT_STATUS_SDK_RESOURCES 0x1004 + + +#define AMT_BIOS_VERSION_LEN 65 +#define AMT_VERSIONS_NUMBER 50 +#define AMT_UNICODE_STRING_LEN 20 + +struct amt_unicode_string { + uint16_t length; + char string[AMT_UNICODE_STRING_LEN]; +} __attribute__((packed)); + +struct amt_version_type { + struct amt_unicode_string description; + struct amt_unicode_string version; +} __attribute__((packed)); + +struct amt_version { + uint8_t major; + uint8_t minor; +} __attribute__((packed)); + +struct amt_code_versions { + uint8_t bios[AMT_BIOS_VERSION_LEN]; + uint32_t count; + struct amt_version_type versions[AMT_VERSIONS_NUMBER]; +} __attribute__((packed)); + +/*************************************************************************** + * Intel Advanced Management Technology Host Interface + ***************************************************************************/ + +struct amt_host_if_msg_header { + struct amt_version version; + uint16_t _reserved; + uint32_t command; + uint32_t length; +} __attribute__((packed)); + +struct amt_host_if_resp_header { + struct amt_host_if_msg_header header; + uint32_t status; + unsigned char data[]; +} __attribute__((packed)); + +const uuid_le MEI_IAMTHIF = UUID_LE(0x12f80028, 0xb4b7, 0x4b2d, \ + 0xac, 0xa8, 0x46, 0xe0, 0xff, 0x65, 0x81, 0x4c); + +#define AMT_HOST_IF_CODE_VERSIONS_REQUEST 0x0400001A +#define AMT_HOST_IF_CODE_VERSIONS_RESPONSE 0x0480001A + +const struct amt_host_if_msg_header CODE_VERSION_REQ = { + .version = {AMT_MAJOR_VERSION, AMT_MINOR_VERSION}, + ._reserved = 0, + .command = AMT_HOST_IF_CODE_VERSIONS_REQUEST, + .length = 0 +}; + + +struct 
amt_host_if { + struct mei mei_cl; + unsigned long send_timeout; + bool initialized; +}; + + +static bool amt_host_if_init(struct amt_host_if *acmd, + unsigned long send_timeout, bool verbose) +{ + acmd->send_timeout = (send_timeout) ? send_timeout : 20000; + acmd->initialized = mei_init(&acmd->mei_cl, &MEI_IAMTHIF, 0, verbose); + return acmd->initialized; +} + +static void amt_host_if_deinit(struct amt_host_if *acmd) +{ + mei_deinit(&acmd->mei_cl); + acmd->initialized = false; +} + +static uint32_t amt_verify_code_versions(const struct amt_host_if_resp_header *resp) +{ + uint32_t status = AMT_STATUS_SUCCESS; + struct amt_code_versions *code_ver; + size_t code_ver_len; + uint32_t ver_type_cnt; + uint32_t len; + uint32_t i; + + code_ver = (struct amt_code_versions *)resp->data; + /* length - sizeof(status) */ + code_ver_len = resp->header.length - sizeof(uint32_t); + ver_type_cnt = code_ver_len - + sizeof(code_ver->bios) - + sizeof(code_ver->count); + if (code_ver->count != ver_type_cnt / sizeof(struct amt_version_type)) { + status = AMT_STATUS_INTERNAL_ERROR; + goto out; + } + + for (i = 0; i < code_ver->count; i++) { + len = code_ver->versions[i].description.length; + + if (len > AMT_UNICODE_STRING_LEN) { + status = AMT_STATUS_INTERNAL_ERROR; + goto out; + } + + len = code_ver->versions[i].version.length; + if (code_ver->versions[i].version.string[len] != '\0' || + len != strlen(code_ver->versions[i].version.string)) { + status = AMT_STATUS_INTERNAL_ERROR; + goto out; + } + } +out: + return status; +} + +static uint32_t amt_verify_response_header(uint32_t command, + const struct amt_host_if_msg_header *resp_hdr, + uint32_t response_size) +{ + if (response_size < sizeof(struct amt_host_if_resp_header)) { + return AMT_STATUS_INTERNAL_ERROR; + } else if (response_size != (resp_hdr->length + + sizeof(struct amt_host_if_msg_header))) { + return AMT_STATUS_INTERNAL_ERROR; + } else if (resp_hdr->command != command) { + return AMT_STATUS_INTERNAL_ERROR; + } else if 
(resp_hdr->_reserved != 0) { + return AMT_STATUS_INTERNAL_ERROR; + } else if (resp_hdr->version.major != AMT_MAJOR_VERSION || + resp_hdr->version.minor < AMT_MINOR_VERSION) { + return AMT_STATUS_INTERNAL_ERROR; + } + return AMT_STATUS_SUCCESS; +} + +static uint32_t amt_host_if_call(struct amt_host_if *acmd, + const unsigned char *command, ssize_t command_sz, + uint8_t **read_buf, uint32_t rcmd, + unsigned int expected_sz) +{ + uint32_t in_buf_sz; + ssize_t out_buf_sz; + ssize_t written; + uint32_t status; + struct amt_host_if_resp_header *msg_hdr; + + in_buf_sz = acmd->mei_cl.buf_size; + *read_buf = (uint8_t *)malloc(sizeof(uint8_t) * in_buf_sz); + if (*read_buf == NULL) + return AMT_STATUS_SDK_RESOURCES; + memset(*read_buf, 0, in_buf_sz); + msg_hdr = (struct amt_host_if_resp_header *)*read_buf; + + written = mei_send_msg(&acmd->mei_cl, + command, command_sz, acmd->send_timeout); + if (written != command_sz) + return AMT_STATUS_INTERNAL_ERROR; + + out_buf_sz = mei_recv_msg(&acmd->mei_cl, *read_buf, in_buf_sz, 2000); + if (out_buf_sz <= 0) + return AMT_STATUS_HOST_IF_EMPTY_RESPONSE; + + status = msg_hdr->status; + if (status != AMT_STATUS_SUCCESS) + return status; + + status = amt_verify_response_header(rcmd, + &msg_hdr->header, out_buf_sz); + if (status != AMT_STATUS_SUCCESS) + return status; + + if (expected_sz && expected_sz != out_buf_sz) + return AMT_STATUS_INTERNAL_ERROR; + + return AMT_STATUS_SUCCESS; +} + + +static uint32_t amt_get_code_versions(struct amt_host_if *cmd, + struct amt_code_versions *versions) +{ + struct amt_host_if_resp_header *response = NULL; + uint32_t status; + + status = amt_host_if_call(cmd, + (const unsigned char *)&CODE_VERSION_REQ, + sizeof(CODE_VERSION_REQ), + (uint8_t **)&response, + AMT_HOST_IF_CODE_VERSIONS_RESPONSE, 0); + + if (status != AMT_STATUS_SUCCESS) + goto out; + + status = amt_verify_code_versions(response); + if (status != AMT_STATUS_SUCCESS) + goto out; + + memcpy(versions, response->data, sizeof(struct 
amt_code_versions)); +out: + if (response != NULL) + free(response); + + return status; +} + +/************************** end of amt_host_if_command ***********************/ +int main(int argc, char **argv) +{ + struct amt_code_versions ver; + struct amt_host_if acmd; + unsigned int i; + uint32_t status; + int ret; + bool verbose; + + verbose = (argc > 1 && strcmp(argv[1], "-v") == 0); + + if (!amt_host_if_init(&acmd, 5000, verbose)) { + ret = 1; + goto out; + } + + status = amt_get_code_versions(&acmd, &ver); + + amt_host_if_deinit(&acmd); + + switch (status) { + case AMT_STATUS_HOST_IF_EMPTY_RESPONSE: + printf("Intel AMT: DISABLED\n"); + ret = 0; + break; + case AMT_STATUS_SUCCESS: + printf("Intel AMT: ENABLED\n"); + for (i = 0; i < ver.count; i++) { + printf("%s:\t%s\n", ver.versions[i].description.string, + ver.versions[i].version.string); + } + ret = 0; + break; + default: + printf("An error has occurred\n"); + ret = 1; + break; + } + +out: + return ret; +} diff --git a/samples/nitro_enclaves/.gitignore b/samples/nitro_enclaves/.gitignore new file mode 100644 index 000000000..827934129 --- /dev/null +++ b/samples/nitro_enclaves/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +ne_ioctl_sample diff --git a/samples/nitro_enclaves/Makefile b/samples/nitro_enclaves/Makefile new file mode 100644 index 000000000..a3ec78fef --- /dev/null +++ b/samples/nitro_enclaves/Makefile @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. + +# Enclave lifetime management support for Nitro Enclaves (NE) - ioctl sample +# usage. 
+ +.PHONY: all clean + +CFLAGS += -Wall + +all: + $(CC) $(CFLAGS) -o ne_ioctl_sample ne_ioctl_sample.c -lpthread + +clean: + rm -f ne_ioctl_sample diff --git a/samples/nitro_enclaves/ne_ioctl_sample.c b/samples/nitro_enclaves/ne_ioctl_sample.c new file mode 100644 index 000000000..480b76314 --- /dev/null +++ b/samples/nitro_enclaves/ne_ioctl_sample.c @@ -0,0 +1,883 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. + */ + +/** + * DOC: Sample flow of using the ioctl interface provided by the Nitro Enclaves (NE) + * kernel driver. + * + * Usage + * ----- + * + * Load the nitro_enclaves module, setting also the enclave CPU pool. The + * enclave CPUs need to be full cores from the same NUMA node. CPU 0 and its + * siblings have to remain available for the primary / parent VM, so they + * cannot be included in the enclave CPU pool. + * + * See the cpu list section from the kernel documentation. + * https://www.kernel.org/doc/html/latest/admin-guide/kernel-parameters.html#cpu-lists + * + * insmod drivers/virt/nitro_enclaves/nitro_enclaves.ko + * lsmod + * + * The CPU pool can be set at runtime, after the kernel module is loaded. + * + * echo <cpu-list> > /sys/module/nitro_enclaves/parameters/ne_cpus + * + * NUMA and CPU siblings information can be found using: + * + * lscpu + * /proc/cpuinfo + * + * Check the online / offline CPU list. The CPUs from the pool should be + * offlined. + * + * lscpu + * + * Check dmesg for any warnings / errors through the NE driver lifetime / usage. + * The NE logs contain the "nitro_enclaves" or "pci 0000:00:02.0" pattern. + * + * dmesg + * + * Setup hugetlbfs huge pages. The memory needs to be from the same NUMA node as + * the enclave CPUs. + * + * https://www.kernel.org/doc/html/latest/admin-guide/mm/hugetlbpage.html + * + * By default, the allocation of hugetlb pages are distributed on all possible + * NUMA nodes. 
Use the following configuration files to set the number of huge + * pages from a NUMA node: + * + * /sys/devices/system/node/node<X>/hugepages/hugepages-2048kB/nr_hugepages + * /sys/devices/system/node/node<X>/hugepages/hugepages-1048576kB/nr_hugepages + * + * or, if not on a system with multiple NUMA nodes, can also set the number + * of 2 MiB / 1 GiB huge pages using + * + * /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages + * /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages + * + * In this example 256 hugepages of 2 MiB are used. + * + * Build and run the NE sample. + * + * make -C samples/nitro_enclaves clean + * make -C samples/nitro_enclaves + * ./samples/nitro_enclaves/ne_ioctl_sample <path_to_enclave_image> + * + * Unload the nitro_enclaves module. + * + * rmmod nitro_enclaves + * lsmod + */ + +#include <stdio.h> +#include <stdlib.h> +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <poll.h> +#include <pthread.h> +#include <string.h> +#include <sys/eventfd.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> + +#include <linux/mman.h> +#include <linux/nitro_enclaves.h> +#include <linux/vm_sockets.h> + +/** + * NE_DEV_NAME - Nitro Enclaves (NE) misc device that provides the ioctl interface. + */ +#define NE_DEV_NAME "/dev/nitro_enclaves" + +/** + * NE_POLL_WAIT_TIME - Timeout in seconds for each poll event. + */ +#define NE_POLL_WAIT_TIME (60) +/** + * NE_POLL_WAIT_TIME_MS - Timeout in milliseconds for each poll event. + */ +#define NE_POLL_WAIT_TIME_MS (NE_POLL_WAIT_TIME * 1000) + +/** + * NE_SLEEP_TIME - Amount of time in seconds for the process to keep the enclave alive. + */ +#define NE_SLEEP_TIME (300) + +/** + * NE_DEFAULT_NR_VCPUS - Default number of vCPUs set for an enclave. + */ +#define NE_DEFAULT_NR_VCPUS (2) + +/** + * NE_MIN_MEM_REGION_SIZE - Minimum size of a memory region - 2 MiB. 
+ */ +#define NE_MIN_MEM_REGION_SIZE (2 * 1024 * 1024) + +/** + * NE_DEFAULT_NR_MEM_REGIONS - Default number of memory regions of 2 MiB set for + * an enclave. + */ +#define NE_DEFAULT_NR_MEM_REGIONS (256) + +/** + * NE_IMAGE_LOAD_HEARTBEAT_CID - Vsock CID for enclave image loading heartbeat logic. + */ +#define NE_IMAGE_LOAD_HEARTBEAT_CID (3) +/** + * NE_IMAGE_LOAD_HEARTBEAT_PORT - Vsock port for enclave image loading heartbeat logic. + */ +#define NE_IMAGE_LOAD_HEARTBEAT_PORT (9000) +/** + * NE_IMAGE_LOAD_HEARTBEAT_VALUE - Heartbeat value for enclave image loading. + */ +#define NE_IMAGE_LOAD_HEARTBEAT_VALUE (0xb7) + +/** + * struct ne_user_mem_region - User space memory region set for an enclave. + * @userspace_addr: Address of the user space memory region. + * @memory_size: Size of the user space memory region. + */ +struct ne_user_mem_region { + void *userspace_addr; + size_t memory_size; +}; + +/** + * ne_create_vm() - Create a slot for the enclave VM. + * @ne_dev_fd: The file descriptor of the NE misc device. + * @slot_uid: The generated slot uid for the enclave. + * @enclave_fd : The generated file descriptor for the enclave. + * + * Context: Process context. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_create_vm(int ne_dev_fd, unsigned long *slot_uid, int *enclave_fd) +{ + int rc = -EINVAL; + *enclave_fd = ioctl(ne_dev_fd, NE_CREATE_VM, slot_uid); + + if (*enclave_fd < 0) { + rc = *enclave_fd; + switch (errno) { + case NE_ERR_NO_CPUS_AVAIL_IN_POOL: { + printf("Error in create VM, no CPUs available in the NE CPU pool\n"); + + break; + } + + default: + printf("Error in create VM [%m]\n"); + } + + return rc; + } + + return 0; +} + + +/** + * ne_poll_enclave_fd() - Thread function for polling the enclave fd. + * @data: Argument provided for the polling function. + * + * Context: Process context. + * Return: + * * NULL on success / failure. 
+ */ +void *ne_poll_enclave_fd(void *data) +{ + int enclave_fd = *(int *)data; + struct pollfd fds[1] = {}; + int i = 0; + int rc = -EINVAL; + + printf("Running from poll thread, enclave fd %d\n", enclave_fd); + + fds[0].fd = enclave_fd; + fds[0].events = POLLIN | POLLERR | POLLHUP; + + /* Keep on polling until the current process is terminated. */ + while (1) { + printf("[iter %d] Polling ...\n", i); + + rc = poll(fds, 1, NE_POLL_WAIT_TIME_MS); + if (rc < 0) { + printf("Error in poll [%m]\n"); + + return NULL; + } + + i++; + + if (!rc) { + printf("Poll: %d seconds elapsed\n", + i * NE_POLL_WAIT_TIME); + + continue; + } + + printf("Poll received value 0x%x\n", fds[0].revents); + + if (fds[0].revents & POLLHUP) { + printf("Received POLLHUP\n"); + + return NULL; + } + + if (fds[0].revents & POLLNVAL) { + printf("Received POLLNVAL\n"); + + return NULL; + } + } + + return NULL; +} + +/** + * ne_alloc_user_mem_region() - Allocate a user space memory region for an enclave. + * @ne_user_mem_region: User space memory region allocated using hugetlbfs. + * + * Context: Process context. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_alloc_user_mem_region(struct ne_user_mem_region *ne_user_mem_region) +{ + /** + * Check available hugetlb encodings for different huge page sizes in + * include/uapi/linux/mman.h. + */ + ne_user_mem_region->userspace_addr = mmap(NULL, ne_user_mem_region->memory_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | + MAP_HUGETLB | MAP_HUGE_2MB, -1, 0); + if (ne_user_mem_region->userspace_addr == MAP_FAILED) { + printf("Error in mmap memory [%m]\n"); + + return -1; + } + + return 0; +} + +/** + * ne_load_enclave_image() - Place the enclave image in the enclave memory. + * @enclave_fd : The file descriptor associated with the enclave. + * @ne_user_mem_regions: User space memory regions allocated for the enclave. + * @enclave_image_path : The file path of the enclave image. 
+ * + * Context: Process context. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_load_enclave_image(int enclave_fd, struct ne_user_mem_region ne_user_mem_regions[], + char *enclave_image_path) +{ + unsigned char *enclave_image = NULL; + int enclave_image_fd = -1; + size_t enclave_image_size = 0; + size_t enclave_memory_size = 0; + unsigned long i = 0; + size_t image_written_bytes = 0; + struct ne_image_load_info image_load_info = { + .flags = NE_EIF_IMAGE, + }; + struct stat image_stat_buf = {}; + int rc = -EINVAL; + size_t temp_image_offset = 0; + + for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) + enclave_memory_size += ne_user_mem_regions[i].memory_size; + + rc = stat(enclave_image_path, &image_stat_buf); + if (rc < 0) { + printf("Error in get image stat info [%m]\n"); + + return rc; + } + + enclave_image_size = image_stat_buf.st_size; + + if (enclave_memory_size < enclave_image_size) { + printf("The enclave memory is smaller than the enclave image size\n"); + + return -ENOMEM; + } + + rc = ioctl(enclave_fd, NE_GET_IMAGE_LOAD_INFO, &image_load_info); + if (rc < 0) { + switch (errno) { + case NE_ERR_NOT_IN_INIT_STATE: { + printf("Error in get image load info, enclave not in init state\n"); + + break; + } + + case NE_ERR_INVALID_FLAG_VALUE: { + printf("Error in get image load info, provided invalid flag\n"); + + break; + } + + default: + printf("Error in get image load info [%m]\n"); + } + + return rc; + } + + printf("Enclave image offset in enclave memory is %lld\n", + image_load_info.memory_offset); + + enclave_image_fd = open(enclave_image_path, O_RDONLY); + if (enclave_image_fd < 0) { + printf("Error in open enclave image file [%m]\n"); + + return enclave_image_fd; + } + + enclave_image = mmap(NULL, enclave_image_size, PROT_READ, + MAP_PRIVATE, enclave_image_fd, 0); + if (enclave_image == MAP_FAILED) { + printf("Error in mmap enclave image [%m]\n"); + + return -1; + } + + temp_image_offset = 
image_load_info.memory_offset; + + for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) { + size_t bytes_to_write = 0; + size_t memory_offset = 0; + size_t memory_size = ne_user_mem_regions[i].memory_size; + size_t remaining_bytes = 0; + void *userspace_addr = ne_user_mem_regions[i].userspace_addr; + + if (temp_image_offset >= memory_size) { + temp_image_offset -= memory_size; + + continue; + } else if (temp_image_offset != 0) { + memory_offset = temp_image_offset; + memory_size -= temp_image_offset; + temp_image_offset = 0; + } + + remaining_bytes = enclave_image_size - image_written_bytes; + bytes_to_write = memory_size < remaining_bytes ? + memory_size : remaining_bytes; + + memcpy(userspace_addr + memory_offset, + enclave_image + image_written_bytes, bytes_to_write); + + image_written_bytes += bytes_to_write; + + if (image_written_bytes == enclave_image_size) + break; + } + + munmap(enclave_image, enclave_image_size); + + close(enclave_image_fd); + + return 0; +} + +/** + * ne_set_user_mem_region() - Set a user space memory region for the given enclave. + * @enclave_fd : The file descriptor associated with the enclave. + * @ne_user_mem_region : User space memory region to be set for the enclave. + * + * Context: Process context. + * Return: + * * 0 on success. + * * Negative return value on failure. 
+ */ +static int ne_set_user_mem_region(int enclave_fd, struct ne_user_mem_region ne_user_mem_region) +{ + struct ne_user_memory_region mem_region = { + .flags = NE_DEFAULT_MEMORY_REGION, + .memory_size = ne_user_mem_region.memory_size, + .userspace_addr = (__u64)ne_user_mem_region.userspace_addr, + }; + int rc = -EINVAL; + + rc = ioctl(enclave_fd, NE_SET_USER_MEMORY_REGION, &mem_region); + if (rc < 0) { + switch (errno) { + case NE_ERR_NOT_IN_INIT_STATE: { + printf("Error in set user memory region, enclave not in init state\n"); + + break; + } + + case NE_ERR_INVALID_MEM_REGION_SIZE: { + printf("Error in set user memory region, mem size not multiple of 2 MiB\n"); + + break; + } + + case NE_ERR_INVALID_MEM_REGION_ADDR: { + printf("Error in set user memory region, invalid user space address\n"); + + break; + } + + case NE_ERR_UNALIGNED_MEM_REGION_ADDR: { + printf("Error in set user memory region, unaligned user space address\n"); + + break; + } + + case NE_ERR_MEM_REGION_ALREADY_USED: { + printf("Error in set user memory region, memory region already used\n"); + + break; + } + + case NE_ERR_MEM_NOT_HUGE_PAGE: { + printf("Error in set user memory region, not backed by huge pages\n"); + + break; + } + + case NE_ERR_MEM_DIFFERENT_NUMA_NODE: { + printf("Error in set user memory region, different NUMA node than CPUs\n"); + + break; + } + + case NE_ERR_MEM_MAX_REGIONS: { + printf("Error in set user memory region, max memory regions reached\n"); + + break; + } + + case NE_ERR_INVALID_PAGE_SIZE: { + printf("Error in set user memory region, has page not multiple of 2 MiB\n"); + + break; + } + + case NE_ERR_INVALID_FLAG_VALUE: { + printf("Error in set user memory region, provided invalid flag\n"); + + break; + } + + default: + printf("Error in set user memory region [%m]\n"); + } + + return rc; + } + + return 0; +} + +/** + * ne_free_mem_regions() - Unmap all the user space memory regions that were set + * aside for the enclave. 
+ * @ne_user_mem_regions: The user space memory regions associated with an enclave. + * + * Context: Process context. + */ +static void ne_free_mem_regions(struct ne_user_mem_region ne_user_mem_regions[]) +{ + unsigned int i = 0; + + for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) + munmap(ne_user_mem_regions[i].userspace_addr, + ne_user_mem_regions[i].memory_size); +} + +/** + * ne_add_vcpu() - Add a vCPU to the given enclave. + * @enclave_fd : The file descriptor associated with the enclave. + * @vcpu_id: vCPU id to be set for the enclave, either provided or + * auto-generated (if provided vCPU id is 0). + * + * Context: Process context. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_add_vcpu(int enclave_fd, unsigned int *vcpu_id) +{ + int rc = -EINVAL; + + rc = ioctl(enclave_fd, NE_ADD_VCPU, vcpu_id); + if (rc < 0) { + switch (errno) { + case NE_ERR_NO_CPUS_AVAIL_IN_POOL: { + printf("Error in add vcpu, no CPUs available in the NE CPU pool\n"); + + break; + } + + case NE_ERR_VCPU_ALREADY_USED: { + printf("Error in add vcpu, the provided vCPU is already used\n"); + + break; + } + + case NE_ERR_VCPU_NOT_IN_CPU_POOL: { + printf("Error in add vcpu, the provided vCPU is not in the NE CPU pool\n"); + + break; + } + + case NE_ERR_VCPU_INVALID_CPU_CORE: { + printf("Error in add vcpu, the core id of the provided vCPU is invalid\n"); + + break; + } + + case NE_ERR_NOT_IN_INIT_STATE: { + printf("Error in add vcpu, enclave not in init state\n"); + + break; + } + + case NE_ERR_INVALID_VCPU: { + printf("Error in add vcpu, the provided vCPU is out of avail CPUs range\n"); + + break; + } + + default: + printf("Error in add vcpu [%m]\n"); + + } + return rc; + } + + return 0; +} + +/** + * ne_start_enclave() - Start the given enclave. + * @enclave_fd : The file descriptor associated with the enclave. + * @enclave_start_info : Enclave metadata used for starting e.g. vsock CID. + * + * Context: Process context. 
+ * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_start_enclave(int enclave_fd, struct ne_enclave_start_info *enclave_start_info) +{ + int rc = -EINVAL; + + rc = ioctl(enclave_fd, NE_START_ENCLAVE, enclave_start_info); + if (rc < 0) { + switch (errno) { + case NE_ERR_NOT_IN_INIT_STATE: { + printf("Error in start enclave, enclave not in init state\n"); + + break; + } + + case NE_ERR_NO_MEM_REGIONS_ADDED: { + printf("Error in start enclave, no memory regions have been added\n"); + + break; + } + + case NE_ERR_NO_VCPUS_ADDED: { + printf("Error in start enclave, no vCPUs have been added\n"); + + break; + } + + case NE_ERR_FULL_CORES_NOT_USED: { + printf("Error in start enclave, enclave has no full cores set\n"); + + break; + } + + case NE_ERR_ENCLAVE_MEM_MIN_SIZE: { + printf("Error in start enclave, enclave memory is less than min size\n"); + + break; + } + + case NE_ERR_INVALID_FLAG_VALUE: { + printf("Error in start enclave, provided invalid flag\n"); + + break; + } + + case NE_ERR_INVALID_ENCLAVE_CID: { + printf("Error in start enclave, provided invalid enclave CID\n"); + + break; + } + + default: + printf("Error in start enclave [%m]\n"); + } + + return rc; + } + + return 0; +} + +/** + * ne_start_enclave_check_booted() - Start the enclave and wait for a hearbeat + * from it, on a newly created vsock channel, + * to check it has booted. + * @enclave_fd : The file descriptor associated with the enclave. + * + * Context: Process context. + * Return: + * * 0 on success. + * * Negative return value on failure. 
+ */ +static int ne_start_enclave_check_booted(int enclave_fd) +{ + struct sockaddr_vm client_vsock_addr = {}; + int client_vsock_fd = -1; + socklen_t client_vsock_len = sizeof(client_vsock_addr); + struct ne_enclave_start_info enclave_start_info = {}; + struct pollfd fds[1] = {}; + int rc = -EINVAL; + unsigned char recv_buf = 0; + struct sockaddr_vm server_vsock_addr = { + .svm_family = AF_VSOCK, + .svm_cid = NE_IMAGE_LOAD_HEARTBEAT_CID, + .svm_port = NE_IMAGE_LOAD_HEARTBEAT_PORT, + }; + int server_vsock_fd = -1; + + server_vsock_fd = socket(AF_VSOCK, SOCK_STREAM, 0); + if (server_vsock_fd < 0) { + rc = server_vsock_fd; + + printf("Error in socket [%m]\n"); + + return rc; + } + + rc = bind(server_vsock_fd, (struct sockaddr *)&server_vsock_addr, + sizeof(server_vsock_addr)); + if (rc < 0) { + printf("Error in bind [%m]\n"); + + goto out; + } + + rc = listen(server_vsock_fd, 1); + if (rc < 0) { + printf("Error in listen [%m]\n"); + + goto out; + } + + rc = ne_start_enclave(enclave_fd, &enclave_start_info); + if (rc < 0) + goto out; + + printf("Enclave started, CID %llu\n", enclave_start_info.enclave_cid); + + fds[0].fd = server_vsock_fd; + fds[0].events = POLLIN; + + rc = poll(fds, 1, NE_POLL_WAIT_TIME_MS); + if (rc < 0) { + printf("Error in poll [%m]\n"); + + goto out; + } + + if (!rc) { + printf("Poll timeout, %d seconds elapsed\n", NE_POLL_WAIT_TIME); + + rc = -ETIMEDOUT; + + goto out; + } + + if ((fds[0].revents & POLLIN) == 0) { + printf("Poll received value %d\n", fds[0].revents); + + rc = -EINVAL; + + goto out; + } + + rc = accept(server_vsock_fd, (struct sockaddr *)&client_vsock_addr, + &client_vsock_len); + if (rc < 0) { + printf("Error in accept [%m]\n"); + + goto out; + } + + client_vsock_fd = rc; + + /* + * Read the heartbeat value that the init process in the enclave sends + * after vsock connect. 
+ */ + rc = read(client_vsock_fd, &recv_buf, sizeof(recv_buf)); + if (rc < 0) { + printf("Error in read [%m]\n"); + + goto out; + } + + if (rc != sizeof(recv_buf) || recv_buf != NE_IMAGE_LOAD_HEARTBEAT_VALUE) { + printf("Read %d instead of %d\n", recv_buf, + NE_IMAGE_LOAD_HEARTBEAT_VALUE); + + goto out; + } + + /* Write the heartbeat value back. */ + rc = write(client_vsock_fd, &recv_buf, sizeof(recv_buf)); + if (rc < 0) { + printf("Error in write [%m]\n"); + + goto out; + } + + rc = 0; + +out: + close(server_vsock_fd); + + return rc; +} + +int main(int argc, char *argv[]) +{ + int enclave_fd = -1; + unsigned int i = 0; + int ne_dev_fd = -1; + struct ne_user_mem_region ne_user_mem_regions[NE_DEFAULT_NR_MEM_REGIONS] = {}; + unsigned int ne_vcpus[NE_DEFAULT_NR_VCPUS] = {}; + int rc = -EINVAL; + pthread_t thread_id = 0; + unsigned long slot_uid = 0; + + if (argc != 2) { + printf("Usage: %s <path_to_enclave_image>\n", argv[0]); + + exit(EXIT_FAILURE); + } + + if (strlen(argv[1]) >= PATH_MAX) { + printf("The size of the path to enclave image is higher than max path\n"); + + exit(EXIT_FAILURE); + } + + ne_dev_fd = open(NE_DEV_NAME, O_RDWR | O_CLOEXEC); + if (ne_dev_fd < 0) { + printf("Error in open NE device [%m]\n"); + + exit(EXIT_FAILURE); + } + + printf("Creating enclave slot ...\n"); + + rc = ne_create_vm(ne_dev_fd, &slot_uid, &enclave_fd); + + close(ne_dev_fd); + + if (rc < 0) + exit(EXIT_FAILURE); + + printf("Enclave fd %d\n", enclave_fd); + + rc = pthread_create(&thread_id, NULL, ne_poll_enclave_fd, (void *)&enclave_fd); + if (rc < 0) { + printf("Error in thread create [%m]\n"); + + close(enclave_fd); + + exit(EXIT_FAILURE); + } + + for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) { + ne_user_mem_regions[i].memory_size = NE_MIN_MEM_REGION_SIZE; + + rc = ne_alloc_user_mem_region(&ne_user_mem_regions[i]); + if (rc < 0) { + printf("Error in alloc userspace memory region, iter %d\n", i); + + goto release_enclave_fd; + } + } + + rc = ne_load_enclave_image(enclave_fd, 
ne_user_mem_regions, argv[1]); + if (rc < 0) + goto release_enclave_fd; + + for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) { + rc = ne_set_user_mem_region(enclave_fd, ne_user_mem_regions[i]); + if (rc < 0) { + printf("Error in set memory region, iter %d\n", i); + + goto release_enclave_fd; + } + } + + printf("Enclave memory regions were added\n"); + + for (i = 0; i < NE_DEFAULT_NR_VCPUS; i++) { + /* + * The vCPU is chosen from the enclave vCPU pool, if the value + * of the vcpu_id is 0. + */ + ne_vcpus[i] = 0; + rc = ne_add_vcpu(enclave_fd, &ne_vcpus[i]); + if (rc < 0) { + printf("Error in add vcpu, iter %d\n", i); + + goto release_enclave_fd; + } + + printf("Added vCPU %d to the enclave\n", ne_vcpus[i]); + } + + printf("Enclave vCPUs were added\n"); + + rc = ne_start_enclave_check_booted(enclave_fd); + if (rc < 0) { + printf("Error in the enclave start / image loading heartbeat logic [rc=%d]\n", rc); + + goto release_enclave_fd; + } + + printf("Entering sleep for %d seconds ...\n", NE_SLEEP_TIME); + + sleep(NE_SLEEP_TIME); + + close(enclave_fd); + + ne_free_mem_regions(ne_user_mem_regions); + + exit(EXIT_SUCCESS); + +release_enclave_fd: + close(enclave_fd); + ne_free_mem_regions(ne_user_mem_regions); + + exit(EXIT_FAILURE); +} diff --git a/samples/pidfd/.gitignore b/samples/pidfd/.gitignore new file mode 100644 index 000000000..eea857fca --- /dev/null +++ b/samples/pidfd/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +pidfd-metadata diff --git a/samples/pidfd/Makefile b/samples/pidfd/Makefile new file mode 100644 index 000000000..9754e2d81 --- /dev/null +++ b/samples/pidfd/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 +usertprogs-always-y += pidfd-metadata + +userccflags += -I usr/include diff --git a/samples/pidfd/pidfd-metadata.c b/samples/pidfd/pidfd-metadata.c new file mode 100644 index 000000000..c459155da --- /dev/null +++ b/samples/pidfd/pidfd-metadata.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define 
_GNU_SOURCE +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <inttypes.h> +#include <limits.h> +#include <sched.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> + +#ifndef CLONE_PIDFD +#define CLONE_PIDFD 0x00001000 +#endif + +#ifndef __NR_pidfd_send_signal +#define __NR_pidfd_send_signal -1 +#endif + +static int do_child(void *args) +{ + printf("%d\n", getpid()); + _exit(EXIT_SUCCESS); +} + +static pid_t pidfd_clone(int flags, int *pidfd) +{ + size_t stack_size = 1024; + char *stack[1024] = { 0 }; + +#ifdef __ia64__ + return __clone2(do_child, stack, stack_size, flags | SIGCHLD, NULL, pidfd); +#else + return clone(do_child, stack + stack_size, flags | SIGCHLD, NULL, pidfd); +#endif +} + +static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info, + unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static int pidfd_metadata_fd(pid_t pid, int pidfd) +{ + int procfd, ret; + char path[100]; + + snprintf(path, sizeof(path), "/proc/%d", pid); + procfd = open(path, O_DIRECTORY | O_RDONLY | O_CLOEXEC); + if (procfd < 0) { + warn("Failed to open %s\n", path); + return -1; + } + + /* + * Verify that the pid has not been recycled and our /proc/<pid> handle + * is still valid. + */ + ret = sys_pidfd_send_signal(pidfd, 0, NULL, 0); + if (ret < 0) { + switch (errno) { + case EPERM: + /* Process exists, just not allowed to signal it. 
*/ + break; + default: + warn("Failed to signal process\n"); + close(procfd); + procfd = -1; + } + } + + return procfd; +} + +int main(int argc, char *argv[]) +{ + int pidfd = -1, ret = EXIT_FAILURE; + char buf[4096] = { 0 }; + pid_t pid; + int procfd, statusfd; + ssize_t bytes; + + pid = pidfd_clone(CLONE_PIDFD, &pidfd); + if (pid < 0) + err(ret, "CLONE_PIDFD"); + if (pidfd == -1) { + warnx("CLONE_PIDFD is not supported by the kernel"); + goto out; + } + + procfd = pidfd_metadata_fd(pid, pidfd); + close(pidfd); + if (procfd < 0) + goto out; + + statusfd = openat(procfd, "status", O_RDONLY | O_CLOEXEC); + close(procfd); + if (statusfd < 0) + goto out; + + bytes = read(statusfd, buf, sizeof(buf)); + if (bytes > 0) + bytes = write(STDOUT_FILENO, buf, bytes); + close(statusfd); + ret = EXIT_SUCCESS; + +out: + (void)wait(NULL); + + exit(ret); +} diff --git a/samples/pktgen/README.rst b/samples/pktgen/README.rst new file mode 100644 index 000000000..f9c53ca5c --- /dev/null +++ b/samples/pktgen/README.rst @@ -0,0 +1,46 @@ +Sample and benchmark scripts for pktgen (packet generator) +========================================================== +This directory contains some pktgen sample and benchmark scripts, that +can easily be copied and adjusted for your own use-case. + +General doc is located in kernel: Documentation/networking/pktgen.rst + +Helper include files +==================== +This directory contains two helper shell files, that can be "included" +by shell source'ing. Namely "functions.sh" and "parameters.sh". + +Common parameters +----------------- +The parameters.sh file support easy and consistant parameter parsing +across the sample scripts. Usage example is printed on errors:: + + Usage: ./pktgen_sample01_simple.sh [-vx] -i ethX + -i : ($DEV) output interface/device (required) + -s : ($PKT_SIZE) packet size + -d : ($DEST_IP) destination IP. CIDR (e.g. 
198.18.0.0/15) is also allowed + -m : ($DST_MAC) destination MAC-addr + -p : ($DST_PORT) destination PORT range (e.g. 433-444) is also allowed + -t : ($THREADS) threads to start + -f : ($F_THREAD) index of first thread (zero indexed CPU number) + -c : ($SKB_CLONE) SKB clones send before alloc new SKB + -n : ($COUNT) num messages to send per thread, 0 means indefinitely + -b : ($BURST) HW level bursting of SKBs + -v : ($VERBOSE) verbose + -x : ($DEBUG) debug + +The global variable being set is also listed. E.g. the required +interface/device parameter "-i" sets variable $DEV. + +Common functions +---------------- +The functions.sh file provides; Three different shell functions for +configuring the different components of pktgen: pg_ctrl(), pg_thread() +and pg_set(). + +These functions correspond to pktgens different components. + * pg_ctrl() control "pgctrl" (/proc/net/pktgen/pgctrl) + * pg_thread() control the kernel threads and binding to devices + * pg_set() control setup of individual devices + +See sample scripts for usage examples. diff --git a/samples/pktgen/functions.sh b/samples/pktgen/functions.sh new file mode 100644 index 000000000..dae06d5b3 --- /dev/null +++ b/samples/pktgen/functions.sh @@ -0,0 +1,334 @@ +# +# Common functions used by pktgen scripts +# - Depending on bash 3 (or higher) syntax +# +# Author: Jesper Dangaaard Brouer +# License: GPL + +set -o errexit + +## -- General shell logging cmds -- +function err() { + local exitcode=$1 + shift + echo "ERROR: $@" >&2 + exit $exitcode +} + +function warn() { + echo "WARN : $@" >&2 +} + +function info() { + if [[ -n "$VERBOSE" ]]; then + echo "INFO : $@" >&2 + fi +} + +## -- Pktgen proc config commands -- ## +export PROC_DIR=/proc/net/pktgen +# +# Three different shell functions for configuring the different +# components of pktgen: +# pg_ctrl(), pg_thread() and pg_set(). +# +# These functions correspond to pktgens different components. 
+# * pg_ctrl() control "pgctrl" (/proc/net/pktgen/pgctrl) +# * pg_thread() control the kernel threads and binding to devices +# * pg_set() control setup of individual devices +function pg_ctrl() { + local proc_file="pgctrl" + proc_cmd ${proc_file} "$@" +} + +function pg_thread() { + local thread=$1 + local proc_file="kpktgend_${thread}" + shift + proc_cmd ${proc_file} "$@" +} + +function pg_set() { + local dev=$1 + local proc_file="$dev" + shift + proc_cmd ${proc_file} "$@" +} + +# More generic replacement for pgset(), that does not depend on global +# variable for proc file. +function proc_cmd() { + local result + local proc_file=$1 + local status=0 + # after shift, the remaining args are contained in $@ + shift + local proc_ctrl=${PROC_DIR}/$proc_file + if [[ ! -e "$proc_ctrl" ]]; then + err 3 "proc file:$proc_ctrl does not exists (dev added to thread?)" + else + if [[ ! -w "$proc_ctrl" ]]; then + err 4 "proc file:$proc_ctrl not writable, not root?!" + fi + fi + + if [[ "$DEBUG" == "yes" ]]; then + echo "cmd: $@ > $proc_ctrl" + fi + # Quoting of "$@" is important for space expansion + echo "$@" > "$proc_ctrl" || status=$? + + if [[ "$proc_file" != "pgctrl" ]]; then + result=$(grep "Result: OK:" $proc_ctrl) || true + if [[ "$result" == "" ]]; then + grep "Result:" $proc_ctrl >&2 + fi + fi + if (( $status != 0 )); then + err 5 "Write error($status) occurred cmd: \"$@ > $proc_ctrl\"" + fi +} + +# Old obsolete "pgset" function, with slightly improved err handling +function pgset() { + local result + + if [[ "$DEBUG" == "yes" ]]; then + echo "cmd: $1 > $PGDEV" + fi + echo $1 > $PGDEV + local status=$? 
+ + result=`cat $PGDEV | fgrep "Result: OK:"` + if [[ "$result" == "" ]]; then + cat $PGDEV | fgrep Result: + fi + if (( $status != 0 )); then + err 5 "Write error($status) occurred cmd: \"$1 > $PGDEV\"" + fi +} + +[[ $EUID -eq 0 ]] && trap 'pg_ctrl "reset"' EXIT + +## -- General shell tricks -- + +function root_check_run_with_sudo() { + # Trick so, program can be run as normal user, will just use "sudo" + # call as root_check_run_as_sudo "$@" + if [ "$EUID" -ne 0 ]; then + if [ -x $0 ]; then # Directly executable use sudo + info "Not root, running with sudo" + sudo "$0" "$@" + exit $? + fi + err 4 "cannot perform sudo run of $0" + fi +} + +# Exact input device's NUMA node info +function get_iface_node() +{ + local node=$(</sys/class/net/$1/device/numa_node) + if [[ $node == -1 ]]; then + echo 0 + else + echo $node + fi +} + +# Given an Dev/iface, get its queues' irq numbers +function get_iface_irqs() +{ + local IFACE=$1 + local queues="${IFACE}-.*TxRx" + + irqs=$(grep "$queues" /proc/interrupts | cut -f1 -d:) + [ -z "$irqs" ] && irqs=$(grep $IFACE /proc/interrupts | cut -f1 -d:) + [ -z "$irqs" ] && irqs=$(for i in `ls -Ux /sys/class/net/$IFACE/device/msi_irqs` ;\ + do grep "$i:.*TxRx" /proc/interrupts | grep -v fdir | cut -f 1 -d : ;\ + done) + [ -z "$irqs" ] && err 3 "Could not find interrupts for $IFACE" + + echo $irqs +} + +# Given a NUMA node, return cpu ids belonging to it. +function get_node_cpus() +{ + local node=$1 + local node_cpu_list + local node_cpu_range_list=`cut -f1- -d, --output-delimiter=" " \ + /sys/devices/system/node/node$node/cpulist` + + for cpu_range in $node_cpu_range_list + do + node_cpu_list="$node_cpu_list "`seq -s " " ${cpu_range//-/ }` + done + + echo $node_cpu_list +} + +# Check $1 is in between $2, $3 ($2 <= $1 <= $3) +function in_between() { [[ ($1 -ge $2) && ($1 -le $3) ]] ; } + +# Extend shrunken IPv6 address. 
+# fe80::42:bcff:fe84:e10a => fe80:0:0:0:42:bcff:fe84:e10a +function extend_addr6() +{ + local addr=$1 + local sep=: sep2=:: + local sep_cnt=$(tr -cd $sep <<< $1 | wc -c) + local shrink + + # separator count should be (2 <= $sep_cnt <= 7) + if ! (in_between $sep_cnt 2 7); then + err 5 "Invalid IP6 address: $1" + fi + + # if shrink '::' occurs multiple, it's malformed. + shrink=( $(egrep -o "$sep{2,}" <<< $addr) ) + if [[ ${#shrink[@]} -ne 0 ]]; then + if [[ ${#shrink[@]} -gt 1 || ( ${shrink[0]} != $sep2 ) ]]; then + err 5 "Invalid IP6 address: $1" + fi + fi + + # add 0 at begin & end, and extend addr by adding :0 + [[ ${addr:0:1} == $sep ]] && addr=0${addr} + [[ ${addr: -1} == $sep ]] && addr=${addr}0 + echo "${addr/$sep2/$(printf ':0%.s' $(seq $[8-sep_cnt])):}" +} + +# Given a single IP(v4/v6) address, whether it is valid. +function validate_addr() +{ + # check function is called with (funcname)6 + [[ ${FUNCNAME[1]: -1} == 6 ]] && local IP6=6 + local bitlen=$[ IP6 ? 128 : 32 ] + local len=$[ IP6 ? 8 : 4 ] + local max=$[ 2**(len*2)-1 ] + local net prefix + local addr sep + + IFS='/' read net prefix <<< $1 + [[ $IP6 ]] && net=$(extend_addr6 $net) + + # if prefix exists, check (0 <= $prefix <= $bitlen) + if [[ -n $prefix ]]; then + if ! (in_between $prefix 0 $bitlen); then + err 5 "Invalid prefix: /$prefix" + fi + fi + + # set separator for each IP(v4/v6) + [[ $IP6 ]] && sep=: || sep=. + IFS=$sep read -a addr <<< $net + + # array length + if [[ ${#addr[@]} != $len ]]; then + err 5 "Invalid IP$IP6 address: $1" + fi + + # check each digit (0 <= $digit <= $max) + for digit in "${addr[@]}"; do + [[ $IP6 ]] && digit=$[ 16#$digit ] + if ! (in_between $digit 0 $max); then + err 5 "Invalid IP$IP6 address: $1" + fi + done + + return 0 +} + +function validate_addr6() { validate_addr $@ ; } + +# Given a single IP(v4/v6) or CIDR, return minimum and maximum IP addr. 
+function parse_addr() +{ + # check function is called with (funcname)6 + [[ ${FUNCNAME[1]: -1} == 6 ]] && local IP6=6 + local net prefix + local min_ip max_ip + + IFS='/' read net prefix <<< $1 + [[ $IP6 ]] && net=$(extend_addr6 $net) + + if [[ -z $prefix ]]; then + min_ip=$net + max_ip=$net + else + # defining array for converting Decimal 2 Binary + # 00000000 00000001 00000010 00000011 00000100 ... + local d2b='{0..1}{0..1}{0..1}{0..1}{0..1}{0..1}{0..1}{0..1}' + [[ $IP6 ]] && d2b+=$d2b + eval local D2B=($d2b) + + local bitlen=$[ IP6 ? 128 : 32 ] + local remain=$[ bitlen-prefix ] + local octet=$[ IP6 ? 16 : 8 ] + local min_mask max_mask + local min max + local ip_bit + local ip sep + + # set separator for each IP(v4/v6) + [[ $IP6 ]] && sep=: || sep=. + IFS=$sep read -ra ip <<< $net + + min_mask="$(printf '1%.s' $(seq $prefix))$(printf '0%.s' $(seq $remain))" + max_mask="$(printf '0%.s' $(seq $prefix))$(printf '1%.s' $(seq $remain))" + + # calculate min/max ip with &,| operator + for i in "${!ip[@]}"; do + digit=$[ IP6 ? 16#${ip[$i]} : ${ip[$i]} ] + ip_bit=${D2B[$digit]} + + idx=$[ octet*i ] + min[$i]=$[ 2#$ip_bit & 2#${min_mask:$idx:$octet} ] + max[$i]=$[ 2#$ip_bit | 2#${max_mask:$idx:$octet} ] + [[ $IP6 ]] && { min[$i]=$(printf '%X' ${min[$i]}); + max[$i]=$(printf '%X' ${max[$i]}); } + done + + min_ip=$(IFS=$sep; echo "${min[*]}") + max_ip=$(IFS=$sep; echo "${max[*]}") + fi + + echo $min_ip $max_ip +} + +function parse_addr6() { parse_addr $@ ; } + +# Given a single or range of port(s), return minimum and maximum port number. +function parse_ports() +{ + local port_str=$1 + local port_list + local min_port + local max_port + + IFS="-" read -ra port_list <<< $port_str + + min_port=${port_list[0]} + max_port=${port_list[1]:-$min_port} + + echo $min_port $max_port +} + +# Given a minimum and maximum port, verify port number. 
+function validate_ports() +{ + local min_port=$1 + local max_port=$2 + + # 1 <= port <= 65535 + if (in_between $min_port 1 65535); then + if (in_between $max_port 1 65535); then + if [[ $min_port -le $max_port ]]; then + return 0 + fi + fi + fi + + err 5 "Invalid port(s): $min_port-$max_port" +} diff --git a/samples/pktgen/parameters.sh b/samples/pktgen/parameters.sh new file mode 100644 index 000000000..ff0ed474f --- /dev/null +++ b/samples/pktgen/parameters.sh @@ -0,0 +1,121 @@ +# +# SPDX-License-Identifier: GPL-2.0 +# Common parameter parsing for pktgen scripts +# + +function usage() { + echo "" + echo "Usage: $0 [-vx] -i ethX" + echo " -i : (\$DEV) output interface/device (required)" + echo " -s : (\$PKT_SIZE) packet size" + echo " -d : (\$DEST_IP) destination IP. CIDR (e.g. 198.18.0.0/15) is also allowed" + echo " -m : (\$DST_MAC) destination MAC-addr" + echo " -p : (\$DST_PORT) destination PORT range (e.g. 433-444) is also allowed" + echo " -t : (\$THREADS) threads to start" + echo " -f : (\$F_THREAD) index of first thread (zero indexed CPU number)" + echo " -c : (\$SKB_CLONE) SKB clones send before alloc new SKB" + echo " -n : (\$COUNT) num messages to send per thread, 0 means indefinitely" + echo " -b : (\$BURST) HW level bursting of SKBs" + echo " -v : (\$VERBOSE) verbose" + echo " -x : (\$DEBUG) debug" + echo " -6 : (\$IP6) IPv6" + echo "" +} + +## --- Parse command line arguments / parameters --- +## echo "Commandline options:" +while getopts "s:i:d:m:p:f:t:c:n:b:vxh6" option; do + case $option in + i) # interface + export DEV=$OPTARG + info "Output device set to: DEV=$DEV" + ;; + s) + export PKT_SIZE=$OPTARG + info "Packet size set to: PKT_SIZE=$PKT_SIZE bytes" + ;; + d) # destination IP + export DEST_IP=$OPTARG + info "Destination IP set to: DEST_IP=$DEST_IP" + ;; + m) # MAC + export DST_MAC=$OPTARG + info "Destination MAC set to: DST_MAC=$DST_MAC" + ;; + p) # PORT + export DST_PORT=$OPTARG + info "Destination PORT set to: DST_PORT=$DST_PORT" + ;; + 
f) + export F_THREAD=$OPTARG + info "Index of first thread (zero indexed CPU number): $F_THREAD" + ;; + t) + export THREADS=$OPTARG + info "Number of threads to start: $THREADS" + ;; + c) + export CLONE_SKB=$OPTARG + info "CLONE_SKB=$CLONE_SKB" + ;; + n) + export COUNT=$OPTARG + info "COUNT=$COUNT" + ;; + b) + export BURST=$OPTARG + info "SKB bursting: BURST=$BURST" + ;; + v) + export VERBOSE=yes + info "Verbose mode: VERBOSE=$VERBOSE" + ;; + x) + export DEBUG=yes + info "Debug mode: DEBUG=$DEBUG" + ;; + 6) + export IP6=6 + info "IP6: IP6=$IP6" + ;; + h|?|*) + usage; + err 2 "[ERROR] Unknown parameters!!!" + esac +done +shift $(( $OPTIND - 1 )) + +if [ -z "$PKT_SIZE" ]; then + # NIC adds 4 bytes CRC + export PKT_SIZE=60 + info "Default packet size set to: set to: $PKT_SIZE bytes" +fi + +if [ -z "$F_THREAD" ]; then + # First thread (F_THREAD) reference the zero indexed CPU number + export F_THREAD=0 +fi + +if [ -z "$THREADS" ]; then + export THREADS=1 +fi + +export L_THREAD=$(( THREADS + F_THREAD - 1 )) + +if [ -z "$DEV" ]; then + usage + err 2 "Please specify output device" +fi + +if [ -z "$DST_MAC" ]; then + warn "Missing destination MAC address" +fi + +if [ -z "$DEST_IP" ]; then + warn "Missing destination IP address" +fi + +if [ ! -d /proc/net/pktgen ]; then + info "Loading kernel module: pktgen" + modprobe pktgen +fi diff --git a/samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh b/samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh new file mode 100755 index 000000000..1b6204125 --- /dev/null +++ b/samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh @@ -0,0 +1,105 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Benchmark script: +# - developed for benchmarking ingress qdisc path +# +# Script for injecting packets into RX path of the stack with pktgen +# "xmit_mode netif_receive". With an invalid dst_mac this will only +# measure the ingress code path as packets gets dropped in ip_rcv(). +# +# This script don't really need any hardware. 
It benchmarks software +# RX path just after NIC driver level. With bursting is also +# "removes" the SKB alloc/free overhead. +# +# Setup scenarios for measuring ingress qdisc (with invalid dst_mac): +# ------------------------------------------------------------------ +# (1) no ingress (uses static_key_false(&ingress_needed)) +# +# (2) ingress on other dev (change ingress_needed and calls +# handle_ing() but exit early) +# +# config: tc qdisc add dev $SOMEDEV handle ffff: ingress +# +# (3) ingress on this dev, handle_ing() -> tc_classify() +# +# config: tc qdisc add dev $DEV handle ffff: ingress +# +# (4) ingress on this dev + drop at u32 classifier/action. +# +basedir=`dirname $0` +source ${basedir}/functions.sh +root_check_run_with_sudo "$@" + +# Parameter parsing via include +source ${basedir}/parameters.sh +# Using invalid DST_MAC will cause the packets to get dropped in +# ip_rcv() which is part of the test +if [ -z "$DEST_IP" ]; then + [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1" +fi +[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" +[ -z "$BURST" ] && BURST=1024 +[ -z "$COUNT" ] && COUNT="10000000" # Zero means indefinitely +if [ -n "$DEST_IP" ]; then + validate_addr${IP6} $DEST_IP + read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP) +fi +if [ -n "$DST_PORT" ]; then + read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT) + validate_ports $UDP_DST_MIN $UDP_DST_MAX +fi + +# Base Config +DELAY="0" # Zero means max speed + +# General cleanup everything since last run +pg_ctrl "reset" + +# Threads are specified with parameter -t value in $THREADS +for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do + # The device name is extended with @name, using thread number to + # make then unique, but any name will do. 
+ dev=${DEV}@${thread} + + # Add remove all other devices and add_device $dev to thread + pg_thread $thread "rem_device_all" + pg_thread $thread "add_device" $dev + + # Base config of dev + pg_set $dev "flag QUEUE_MAP_CPU" + pg_set $dev "count $COUNT" + pg_set $dev "pkt_size $PKT_SIZE" + pg_set $dev "delay $DELAY" + pg_set $dev "flag NO_TIMESTAMP" + + # Destination + pg_set $dev "dst_mac $DST_MAC" + pg_set $dev "dst${IP6}_min $DST_MIN" + pg_set $dev "dst${IP6}_max $DST_MAX" + + if [ -n "$DST_PORT" ]; then + # Single destination port or random port range + pg_set $dev "flag UDPDST_RND" + pg_set $dev "udp_dst_min $UDP_DST_MIN" + pg_set $dev "udp_dst_max $UDP_DST_MAX" + fi + + # Inject packet into RX path of stack + pg_set $dev "xmit_mode netif_receive" + + # Burst allow us to avoid measuring SKB alloc/free overhead + pg_set $dev "burst $BURST" +done + +# start_run +echo "Running... ctrl^C to stop" >&2 +pg_ctrl "start" +echo "Done" >&2 + +# Print results +for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do + dev=${DEV}@${thread} + echo "Device: $dev" + cat /proc/net/pktgen/$dev | grep -A2 "Result:" +done diff --git a/samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh b/samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh new file mode 100755 index 000000000..e607cb369 --- /dev/null +++ b/samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh @@ -0,0 +1,85 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Benchmark script: +# - developed for benchmarking egress qdisc path, derived (more +# like cut'n'pasted) from ingress benchmark script. +# +# Script for injecting packets into egress qdisc path of the stack +# with pktgen "xmit_mode queue_xmit". 
+# +basedir=`dirname $0` +source ${basedir}/functions.sh +root_check_run_with_sudo "$@" + +# Parameter parsing via include +source ${basedir}/parameters.sh +if [ -z "$DEST_IP" ]; then + [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1" +fi +[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" + +# Burst greater than 1 are invalid for queue_xmit mode +if [[ -n "$BURST" ]]; then + err 1 "Bursting not supported for this mode" +fi +[ -z "$COUNT" ] && COUNT="10000000" # Zero means indefinitely +if [ -n "$DEST_IP" ]; then + validate_addr${IP6} $DEST_IP + read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP) +fi +if [ -n "$DST_PORT" ]; then + read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT) + validate_ports $UDP_DST_MIN $UDP_DST_MAX +fi + +# Base Config +DELAY="0" # Zero means max speed + +# General cleanup everything since last run +pg_ctrl "reset" + +# Threads are specified with parameter -t value in $THREADS +for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do + # The device name is extended with @name, using thread number to + # make then unique, but any name will do. + dev=${DEV}@${thread} + + # Add remove all other devices and add_device $dev to thread + pg_thread $thread "rem_device_all" + pg_thread $thread "add_device" $dev + + # Base config of dev + pg_set $dev "flag QUEUE_MAP_CPU" + pg_set $dev "count $COUNT" + pg_set $dev "pkt_size $PKT_SIZE" + pg_set $dev "delay $DELAY" + pg_set $dev "flag NO_TIMESTAMP" + + # Destination + pg_set $dev "dst_mac $DST_MAC" + pg_set $dev "dst${IP6}_min $DST_MIN" + pg_set $dev "dst${IP6}_max $DST_MAX" + + if [ -n "$DST_PORT" ]; then + # Single destination port or random port range + pg_set $dev "flag UDPDST_RND" + pg_set $dev "udp_dst_min $UDP_DST_MIN" + pg_set $dev "udp_dst_max $UDP_DST_MAX" + fi + + # Inject packet into TX qdisc egress path of stack + pg_set $dev "xmit_mode queue_xmit" +done + +# start_run +echo "Running... 
ctrl^C to stop" >&2 +pg_ctrl "start" +echo "Done" >&2 + +# Print results +for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do + dev=${DEV}@${thread} + echo "Device: $dev" + cat /proc/net/pktgen/$dev | grep -A2 "Result:" +done diff --git a/samples/pktgen/pktgen_sample01_simple.sh b/samples/pktgen/pktgen_sample01_simple.sh new file mode 100755 index 000000000..a4e250b45 --- /dev/null +++ b/samples/pktgen/pktgen_sample01_simple.sh @@ -0,0 +1,90 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Simple example: +# * pktgen sending with single thread and single interface +# * flow variation via random UDP source port +# +basedir=`dirname $0` +source ${basedir}/functions.sh +root_check_run_with_sudo "$@" + +# Parameter parsing via include +# - go look in parameters.sh to see which setting are avail +# - required param is the interface "-i" stored in $DEV +source ${basedir}/parameters.sh +# +# Set some default params, if they didn't get set +if [ -z "$DEST_IP" ]; then + [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1" +fi +[ -z "$CLONE_SKB" ] && CLONE_SKB="0" +# Example enforce param "-m" for dst_mac +[ -z "$DST_MAC" ] && usage && err 2 "Must specify -m dst_mac" +[ -z "$COUNT" ] && COUNT="100000" # Zero means indefinitely +if [ -n "$DEST_IP" ]; then + validate_addr${IP6} $DEST_IP + read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP) +fi +if [ -n "$DST_PORT" ]; then + read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT) + validate_ports $UDP_DST_MIN $UDP_DST_MAX +fi + +# Base Config +DELAY="0" # Zero means max speed + +# Flow variation random source port between min and max +UDP_SRC_MIN=9 +UDP_SRC_MAX=109 + +# General cleanup everything since last run +# (especially important if other threads were configured by other scripts) +pg_ctrl "reset" + +# Add remove all other devices and add_device $DEV to thread 0 +thread=0 +pg_thread $thread "rem_device_all" +pg_thread $thread "add_device" $DEV + +# How many packets to send (zero means 
indefinitely) +pg_set $DEV "count $COUNT" + +# Reduce alloc cost by sending same SKB many times +# - this obviously affects the randomness within the packet +pg_set $DEV "clone_skb $CLONE_SKB" + +# Set packet size +pg_set $DEV "pkt_size $PKT_SIZE" + +# Delay between packets (zero means max speed) +pg_set $DEV "delay $DELAY" + +# Flag example disabling timestamping +pg_set $DEV "flag NO_TIMESTAMP" + +# Destination +pg_set $DEV "dst_mac $DST_MAC" +pg_set $DEV "dst${IP6}_min $DST_MIN" +pg_set $DEV "dst${IP6}_max $DST_MAX" + +if [ -n "$DST_PORT" ]; then + # Single destination port or random port range + pg_set $DEV "flag UDPDST_RND" + pg_set $DEV "udp_dst_min $UDP_DST_MIN" + pg_set $DEV "udp_dst_max $UDP_DST_MAX" +fi + +# Setup random UDP port src range +pg_set $DEV "flag UDPSRC_RND" +pg_set $DEV "udp_src_min $UDP_SRC_MIN" +pg_set $DEV "udp_src_max $UDP_SRC_MAX" + +# start_run +echo "Running... ctrl^C to stop" >&2 +pg_ctrl "start" +echo "Done" >&2 + +# Print results +echo "Result device: $DEV" +cat /proc/net/pktgen/$DEV diff --git a/samples/pktgen/pktgen_sample02_multiqueue.sh b/samples/pktgen/pktgen_sample02_multiqueue.sh new file mode 100755 index 000000000..cb2495fcd --- /dev/null +++ b/samples/pktgen/pktgen_sample02_multiqueue.sh @@ -0,0 +1,95 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Multiqueue: Using pktgen threads for sending on multiple CPUs +# * adding devices to kernel threads +# * notice the naming scheme for keeping device names unique +# * nameing scheme: dev@thread_number +# * flow variation via random UDP source port +# +basedir=`dirname $0` +source ${basedir}/functions.sh +root_check_run_with_sudo "$@" +# +# Required param: -i dev in $DEV +source ${basedir}/parameters.sh + +[ -z "$COUNT" ] && COUNT="100000" # Zero means indefinitely + +# Base Config +DELAY="0" # Zero means max speed +[ -z "$CLONE_SKB" ] && CLONE_SKB="0" + +# Flow variation random source port between min and max +UDP_SRC_MIN=9 +UDP_SRC_MAX=109 + +# (example of setting 
default params in your script) +if [ -z "$DEST_IP" ]; then + [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1" +fi +[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" +if [ -n "$DEST_IP" ]; then + validate_addr${IP6} $DEST_IP + read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP) +fi +if [ -n "$DST_PORT" ]; then + read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT) + validate_ports $UDP_DST_MIN $UDP_DST_MAX +fi + +# General cleanup everything since last run +pg_ctrl "reset" + +# Threads are specified with parameter -t value in $THREADS +for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do + # The device name is extended with @name, using thread number to + # make then unique, but any name will do. + dev=${DEV}@${thread} + + # Add remove all other devices and add_device $dev to thread + pg_thread $thread "rem_device_all" + pg_thread $thread "add_device" $dev + + # Notice config queue to map to cpu (mirrors smp_processor_id()) + # It is beneficial to map IRQ /proc/irq/*/smp_affinity 1:1 to CPU number + pg_set $dev "flag QUEUE_MAP_CPU" + + # Base config of dev + pg_set $dev "count $COUNT" + pg_set $dev "clone_skb $CLONE_SKB" + pg_set $dev "pkt_size $PKT_SIZE" + pg_set $dev "delay $DELAY" + + # Flag example disabling timestamping + pg_set $dev "flag NO_TIMESTAMP" + + # Destination + pg_set $dev "dst_mac $DST_MAC" + pg_set $dev "dst${IP6}_min $DST_MIN" + pg_set $dev "dst${IP6}_max $DST_MAX" + + if [ -n "$DST_PORT" ]; then + # Single destination port or random port range + pg_set $dev "flag UDPDST_RND" + pg_set $dev "udp_dst_min $UDP_DST_MIN" + pg_set $dev "udp_dst_max $UDP_DST_MAX" + fi + + # Setup random UDP port src range + pg_set $dev "flag UDPSRC_RND" + pg_set $dev "udp_src_min $UDP_SRC_MIN" + pg_set $dev "udp_src_max $UDP_SRC_MAX" +done + +# start_run +echo "Running... 
ctrl^C to stop" >&2 +pg_ctrl "start" +echo "Done" >&2 + +# Print results +for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do + dev=${DEV}@${thread} + echo "Device: $dev" + cat /proc/net/pktgen/$dev | grep -A2 "Result:" +done diff --git a/samples/pktgen/pktgen_sample03_burst_single_flow.sh b/samples/pktgen/pktgen_sample03_burst_single_flow.sh new file mode 100755 index 000000000..fff50765a --- /dev/null +++ b/samples/pktgen/pktgen_sample03_burst_single_flow.sh @@ -0,0 +1,101 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Script for max single flow performance +# - If correctly tuned[1], single CPU 10G wirespeed small pkts is possible[2] +# +# Using pktgen "burst" option (use -b $N) +# - To boost max performance +# - Avail since: kernel v3.18 +# * commit 38b2cf2982dc73 ("net: pktgen: packet bursting via skb->xmit_more") +# - This avoids writing the HW tailptr on every driver xmit +# - The performance boost is impressive, see commit and blog [2] +# +# Notice: On purpose generates a single (UDP) flow towards target, +# reason behind this is to only overload/activate a single CPU on +# target host. And no randomness for pktgen also makes it faster. 
+# +# Tuning see: +# [1] http://netoptimizer.blogspot.dk/2014/06/pktgen-for-network-overload-testing.html +# [2] http://netoptimizer.blogspot.dk/2014/10/unlocked-10gbps-tx-wirespeed-smallest.html +# +basedir=`dirname $0` +source ${basedir}/functions.sh +root_check_run_with_sudo "$@" + +# Parameter parsing via include +source ${basedir}/parameters.sh +# Set some default params, if they didn't get set +if [ -z "$DEST_IP" ]; then + [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1" +fi +[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" +[ -z "$BURST" ] && BURST=32 +[ -z "$CLONE_SKB" ] && CLONE_SKB="0" # No need for clones when bursting +[ -z "$COUNT" ] && COUNT="0" # Zero means indefinitely +if [ -n "$DEST_IP" ]; then + validate_addr${IP6} $DEST_IP + read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP) +fi +if [ -n "$DST_PORT" ]; then + read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT) + validate_ports $UDP_DST_MIN $UDP_DST_MAX +fi + +# Base Config +DELAY="0" # Zero means max speed + +# General cleanup everything since last run +pg_ctrl "reset" + +# Threads are specified with parameter -t value in $THREADS +for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do + dev=${DEV}@${thread} + + # Add remove all other devices and add_device $dev to thread + pg_thread $thread "rem_device_all" + pg_thread $thread "add_device" $dev + + # Base config + pg_set $dev "flag QUEUE_MAP_CPU" + pg_set $dev "count $COUNT" + pg_set $dev "clone_skb $CLONE_SKB" + pg_set $dev "pkt_size $PKT_SIZE" + pg_set $dev "delay $DELAY" + pg_set $dev "flag NO_TIMESTAMP" + + # Destination + pg_set $dev "dst_mac $DST_MAC" + pg_set $dev "dst${IP6}_min $DST_MIN" + pg_set $dev "dst${IP6}_max $DST_MAX" + + if [ -n "$DST_PORT" ]; then + # Single destination port or random port range + pg_set $dev "flag UDPDST_RND" + pg_set $dev "udp_dst_min $UDP_DST_MIN" + pg_set $dev "udp_dst_max $UDP_DST_MAX" + fi + + # Setup burst, for easy testing -b 0 disable bursting + # (internally in pktgen 
default and minimum burst=1) + if [[ ${BURST} -ne 0 ]]; then + pg_set $dev "burst $BURST" + else + info "$dev: Not using burst" + fi +done + +# Run if user hits control-c +function control_c() { + # Print results + for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do + dev=${DEV}@${thread} + echo "Device: $dev" + cat /proc/net/pktgen/$dev | grep -A2 "Result:" + done +} +# trap keyboard interrupt (Ctrl-C) +trap control_c SIGINT + +echo "Running... ctrl^C to stop" >&2 +pg_ctrl "start" diff --git a/samples/pktgen/pktgen_sample04_many_flows.sh b/samples/pktgen/pktgen_sample04_many_flows.sh new file mode 100755 index 000000000..9db1ecf8d --- /dev/null +++ b/samples/pktgen/pktgen_sample04_many_flows.sh @@ -0,0 +1,115 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Script example for many flows testing +# +# Number of simultaneous flows limited by variable $FLOWS +# and number of packets per flow controlled by variable $FLOWLEN +# +basedir=`dirname $0` +source ${basedir}/functions.sh +root_check_run_with_sudo "$@" + +# Parameter parsing via include +source ${basedir}/parameters.sh +# Set some default params, if they didn't get set +if [ -z "$DEST_IP" ]; then + [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1" +fi +[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" +[ -z "$CLONE_SKB" ] && CLONE_SKB="0" +[ -z "$COUNT" ] && COUNT="0" # Zero means indefinitely +if [ -n "$DEST_IP" ]; then + validate_addr${IP6} $DEST_IP + read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP) +fi +if [ -n "$DST_PORT" ]; then + read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT) + validate_ports $UDP_DST_MIN $UDP_DST_MAX +fi + +# NOTICE: Script specific settings +# ======= +# Limiting the number of concurrent flows ($FLOWS) +# and also set how many packets each flow contains ($FLOWLEN) +# +[ -z "$FLOWS" ] && FLOWS="8000" +[ -z "$FLOWLEN" ] && FLOWLEN="10" + +# Base Config +DELAY="0" # Zero means max speed + +if [[ -n "$BURST" ]]; then + err 1 "Bursting not 
supported for this mode" +fi + +# 198.18.0.0 / 198.19.255.255 +read -r SRC_MIN SRC_MAX <<< $(parse_addr 198.18.0.0/15) + +# General cleanup everything since last run +pg_ctrl "reset" + +# Threads are specified with parameter -t value in $THREADS +for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do + dev=${DEV}@${thread} + + # Add remove all other devices and add_device $dev to thread + pg_thread $thread "rem_device_all" + pg_thread $thread "add_device" $dev + + # Base config + pg_set $dev "flag QUEUE_MAP_CPU" + pg_set $dev "count $COUNT" + pg_set $dev "clone_skb $CLONE_SKB" + pg_set $dev "pkt_size $PKT_SIZE" + pg_set $dev "delay $DELAY" + pg_set $dev "flag NO_TIMESTAMP" + + # Single destination + pg_set $dev "dst_mac $DST_MAC" + pg_set $dev "dst${IP6}_min $DST_MIN" + pg_set $dev "dst${IP6}_max $DST_MAX" + + if [ -n "$DST_PORT" ]; then + # Single destination port or random port range + pg_set $dev "flag UDPDST_RND" + pg_set $dev "udp_dst_min $UDP_DST_MIN" + pg_set $dev "udp_dst_max $UDP_DST_MAX" + fi + + # Randomize source IP-addresses + pg_set $dev "flag IPSRC_RND" + pg_set $dev "src_min $SRC_MIN" + pg_set $dev "src_max $SRC_MAX" + + # Limit number of flows (max 65535) + pg_set $dev "flows $FLOWS" + # + # How many packets a flow will send, before flow "entry" is + # re-generated/setup. + pg_set $dev "flowlen $FLOWLEN" + # + # Flag FLOW_SEQ will cause $FLOWLEN packets from the same flow + # being send back-to-back, before next flow is selected + # incrementally. This helps lookup caches, and is more realistic. + # + pg_set $dev "flag FLOW_SEQ" + +done + +# Run if user hits control-c +function print_result() { + # Print results + for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do + dev=${DEV}@${thread} + echo "Device: $dev" + cat /proc/net/pktgen/$dev | grep -A2 "Result:" + done +} +# trap keyboard interrupt (Ctrl-C) +trap true SIGINT + +echo "Running... 
ctrl^C to stop" >&2 +pg_ctrl "start" + +print_result diff --git a/samples/pktgen/pktgen_sample05_flow_per_thread.sh b/samples/pktgen/pktgen_sample05_flow_per_thread.sh new file mode 100755 index 000000000..9fc6c6da0 --- /dev/null +++ b/samples/pktgen/pktgen_sample05_flow_per_thread.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Script will generate one flow per thread (-t N) +# - Same destination IP +# - Fake source IPs for each flow (fixed based on thread number) +# +# Useful for scale testing on receiver, to see whether silo'ing flows +# works and scales. For optimal scalability (on receiver) each +# separate-flow should not access shared variables/data. This script +# helps magnify any of these scaling issues by overloading the receiver. +# +basedir=`dirname $0` +source ${basedir}/functions.sh +root_check_run_with_sudo "$@" + +# Parameter parsing via include +source ${basedir}/parameters.sh +# Set some default params, if they didn't get set +if [ -z "$DEST_IP" ]; then + [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1" +fi +[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" +[ -z "$CLONE_SKB" ] && CLONE_SKB="0" +[ -z "$BURST" ] && BURST=32 +[ -z "$COUNT" ] && COUNT="0" # Zero means indefinitely +if [ -n "$DEST_IP" ]; then + validate_addr${IP6} $DEST_IP + read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP) +fi +if [ -n "$DST_PORT" ]; then + read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT) + validate_ports $UDP_DST_MIN $UDP_DST_MAX +fi + +# Base Config +DELAY="0" # Zero means max speed + +# General cleanup everything since last run +pg_ctrl "reset" + +# Threads are specified with parameter -t value in $THREADS +for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do + dev=${DEV}@${thread} + + # Add remove all other devices and add_device $dev to thread + pg_thread $thread "rem_device_all" + pg_thread $thread "add_device" $dev + + # Base config + pg_set $dev "flag QUEUE_MAP_CPU" + pg_set $dev "count $COUNT" 
+ pg_set $dev "clone_skb $CLONE_SKB" + pg_set $dev "pkt_size $PKT_SIZE" + pg_set $dev "delay $DELAY" + pg_set $dev "flag NO_TIMESTAMP" + + # Single destination + pg_set $dev "dst_mac $DST_MAC" + pg_set $dev "dst${IP6}_min $DST_MIN" + pg_set $dev "dst${IP6}_max $DST_MAX" + + if [ -n "$DST_PORT" ]; then + # Single destination port or random port range + pg_set $dev "flag UDPDST_RND" + pg_set $dev "udp_dst_min $UDP_DST_MIN" + pg_set $dev "udp_dst_max $UDP_DST_MAX" + fi + + # Setup source IP-addresses based on thread number + pg_set $dev "src_min 198.18.$((thread+1)).1" + pg_set $dev "src_max 198.18.$((thread+1)).1" + + # Setup burst, for easy testing -b 0 disable bursting + # (internally in pktgen default and minimum burst=1) + if [[ ${BURST} -ne 0 ]]; then + pg_set $dev "burst $BURST" + else + info "$dev: Not using burst" + fi + +done + +# Run if user hits control-c +function print_result() { + # Print results + for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do + dev=${DEV}@${thread} + echo "Device: $dev" + cat /proc/net/pktgen/$dev | grep -A2 "Result:" + done +} +# trap keyboard interrupt (Ctrl-C) +trap true SIGINT + +echo "Running... 
ctrl^C to stop" >&2 +pg_ctrl "start" + +print_result diff --git a/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh b/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh new file mode 100755 index 000000000..728106060 --- /dev/null +++ b/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh @@ -0,0 +1,113 @@ +#!/bin/bash +# +# Multiqueue: Using pktgen threads for sending on multiple CPUs +# * adding devices to kernel threads which are in the same NUMA node +# * bound devices queue's irq affinity to the threads, 1:1 mapping +# * notice the naming scheme for keeping device names unique +# * nameing scheme: dev@thread_number +# * flow variation via random UDP source port +# +basedir=`dirname $0` +source ${basedir}/functions.sh +root_check_run_with_sudo "$@" +# +# Required param: -i dev in $DEV +source ${basedir}/parameters.sh + +# Base Config +DELAY="0" # Zero means max speed +[ -z "$COUNT" ] && COUNT="20000000" # Zero means indefinitely +[ -z "$CLONE_SKB" ] && CLONE_SKB="0" + +# Flow variation random source port between min and max +UDP_SRC_MIN=9 +UDP_SRC_MAX=109 + +node=`get_iface_node $DEV` +irq_array=(`get_iface_irqs $DEV`) +cpu_array=(`get_node_cpus $node`) + +[ $THREADS -gt ${#irq_array[*]} -o $THREADS -gt ${#cpu_array[*]} ] && \ + err 1 "Thread number $THREADS exceeds: min (${#irq_array[*]},${#cpu_array[*]})" + +# (example of setting default params in your script) +if [ -z "$DEST_IP" ]; then + [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1" +fi +[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" +if [ -n "$DEST_IP" ]; then + validate_addr${IP6} $DEST_IP + read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP) +fi +if [ -n "$DST_PORT" ]; then + read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT) + validate_ports $UDP_DST_MIN $UDP_DST_MAX +fi + +# General cleanup everything since last run +pg_ctrl "reset" + +# Threads are specified with parameter -t value in $THREADS +for ((i = 0; i < $THREADS; i++)); do 
+ # The device name is extended with @name, using thread number to + # make then unique, but any name will do. + # Set the queue's irq affinity to this $thread (processor) + # if '-f' is designated, offset cpu id + thread=${cpu_array[$((i+F_THREAD))]} + dev=${DEV}@${thread} + echo $thread > /proc/irq/${irq_array[$i]}/smp_affinity_list + info "irq ${irq_array[$i]} is set affinity to `cat /proc/irq/${irq_array[$i]}/smp_affinity_list`" + + # Add remove all other devices and add_device $dev to thread + pg_thread $thread "rem_device_all" + pg_thread $thread "add_device" $dev + + # select queue and bind the queue and $dev in 1:1 relationship + queue_num=$i + info "queue number is $queue_num" + pg_set $dev "queue_map_min $queue_num" + pg_set $dev "queue_map_max $queue_num" + + # Notice config queue to map to cpu (mirrors smp_processor_id()) + # It is beneficial to map IRQ /proc/irq/*/smp_affinity 1:1 to CPU number + pg_set $dev "flag QUEUE_MAP_CPU" + + # Base config of dev + pg_set $dev "count $COUNT" + pg_set $dev "clone_skb $CLONE_SKB" + pg_set $dev "pkt_size $PKT_SIZE" + pg_set $dev "delay $DELAY" + + # Flag example disabling timestamping + pg_set $dev "flag NO_TIMESTAMP" + + # Destination + pg_set $dev "dst_mac $DST_MAC" + pg_set $dev "dst${IP6}_min $DST_MIN" + pg_set $dev "dst${IP6}_max $DST_MAX" + + if [ -n "$DST_PORT" ]; then + # Single destination port or random port range + pg_set $dev "flag UDPDST_RND" + pg_set $dev "udp_dst_min $UDP_DST_MIN" + pg_set $dev "udp_dst_max $UDP_DST_MAX" + fi + + # Setup random UDP port src range + pg_set $dev "flag UDPSRC_RND" + pg_set $dev "udp_src_min $UDP_SRC_MIN" + pg_set $dev "udp_src_max $UDP_SRC_MAX" +done + +# start_run +echo "Running... 
ctrl^C to stop" >&2 +pg_ctrl "start" +echo "Done" >&2 + +# Print results +for ((i = 0; i < $THREADS; i++)); do + thread=${cpu_array[$((i+F_THREAD))]} + dev=${DEV}@${thread} + echo "Device: $dev" + cat /proc/net/pktgen/$dev | grep -A2 "Result:" +done diff --git a/samples/qmi/Makefile b/samples/qmi/Makefile new file mode 100644 index 000000000..641943d40 --- /dev/null +++ b/samples/qmi/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_SAMPLE_QMI_CLIENT) += qmi_sample_client.o diff --git a/samples/qmi/qmi_sample_client.c b/samples/qmi/qmi_sample_client.c new file mode 100644 index 000000000..c9e7276c3 --- /dev/null +++ b/samples/qmi/qmi_sample_client.c @@ -0,0 +1,622 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Sample in-kernel QMI client driver + * + * Copyright (c) 2013-2014, The Linux Foundation. All rights reserved. + * Copyright (C) 2017 Linaro Ltd. + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/debugfs.h> +#include <linux/device.h> +#include <linux/platform_device.h> +#include <linux/qrtr.h> +#include <linux/net.h> +#include <linux/completion.h> +#include <linux/idr.h> +#include <linux/string.h> +#include <net/sock.h> +#include <linux/soc/qcom/qmi.h> + +#define PING_REQ1_TLV_TYPE 0x1 +#define PING_RESP1_TLV_TYPE 0x2 +#define PING_OPT1_TLV_TYPE 0x10 +#define PING_OPT2_TLV_TYPE 0x11 + +#define DATA_REQ1_TLV_TYPE 0x1 +#define DATA_RESP1_TLV_TYPE 0x2 +#define DATA_OPT1_TLV_TYPE 0x10 +#define DATA_OPT2_TLV_TYPE 0x11 + +#define TEST_MED_DATA_SIZE_V01 8192 +#define TEST_MAX_NAME_SIZE_V01 255 + +#define TEST_PING_REQ_MSG_ID_V01 0x20 +#define TEST_DATA_REQ_MSG_ID_V01 0x21 + +#define TEST_PING_REQ_MAX_MSG_LEN_V01 266 +#define TEST_DATA_REQ_MAX_MSG_LEN_V01 8456 + +struct test_name_type_v01 { + u32 name_len; + char name[TEST_MAX_NAME_SIZE_V01]; +}; + +static struct qmi_elem_info test_name_type_v01_ei[] = { + { + .data_type = QMI_DATA_LEN, + .elem_len = 1, + .elem_size = sizeof(u8), + .array_type = NO_ARRAY, + 
.tlv_type = QMI_COMMON_TLV_TYPE, + .offset = offsetof(struct test_name_type_v01, + name_len), + }, + { + .data_type = QMI_UNSIGNED_1_BYTE, + .elem_len = TEST_MAX_NAME_SIZE_V01, + .elem_size = sizeof(char), + .array_type = VAR_LEN_ARRAY, + .tlv_type = QMI_COMMON_TLV_TYPE, + .offset = offsetof(struct test_name_type_v01, + name), + }, + {} +}; + +struct test_ping_req_msg_v01 { + char ping[4]; + + u8 client_name_valid; + struct test_name_type_v01 client_name; +}; + +static struct qmi_elem_info test_ping_req_msg_v01_ei[] = { + { + .data_type = QMI_UNSIGNED_1_BYTE, + .elem_len = 4, + .elem_size = sizeof(char), + .array_type = STATIC_ARRAY, + .tlv_type = PING_REQ1_TLV_TYPE, + .offset = offsetof(struct test_ping_req_msg_v01, + ping), + }, + { + .data_type = QMI_OPT_FLAG, + .elem_len = 1, + .elem_size = sizeof(u8), + .array_type = NO_ARRAY, + .tlv_type = PING_OPT1_TLV_TYPE, + .offset = offsetof(struct test_ping_req_msg_v01, + client_name_valid), + }, + { + .data_type = QMI_STRUCT, + .elem_len = 1, + .elem_size = sizeof(struct test_name_type_v01), + .array_type = NO_ARRAY, + .tlv_type = PING_OPT1_TLV_TYPE, + .offset = offsetof(struct test_ping_req_msg_v01, + client_name), + .ei_array = test_name_type_v01_ei, + }, + {} +}; + +struct test_ping_resp_msg_v01 { + struct qmi_response_type_v01 resp; + + u8 pong_valid; + char pong[4]; + + u8 service_name_valid; + struct test_name_type_v01 service_name; +}; + +static struct qmi_elem_info test_ping_resp_msg_v01_ei[] = { + { + .data_type = QMI_STRUCT, + .elem_len = 1, + .elem_size = sizeof(struct qmi_response_type_v01), + .array_type = NO_ARRAY, + .tlv_type = PING_RESP1_TLV_TYPE, + .offset = offsetof(struct test_ping_resp_msg_v01, + resp), + .ei_array = qmi_response_type_v01_ei, + }, + { + .data_type = QMI_OPT_FLAG, + .elem_len = 1, + .elem_size = sizeof(u8), + .array_type = NO_ARRAY, + .tlv_type = PING_OPT1_TLV_TYPE, + .offset = offsetof(struct test_ping_resp_msg_v01, + pong_valid), + }, + { + .data_type = QMI_UNSIGNED_1_BYTE, + 
.elem_len = 4, + .elem_size = sizeof(char), + .array_type = STATIC_ARRAY, + .tlv_type = PING_OPT1_TLV_TYPE, + .offset = offsetof(struct test_ping_resp_msg_v01, + pong), + }, + { + .data_type = QMI_OPT_FLAG, + .elem_len = 1, + .elem_size = sizeof(u8), + .array_type = NO_ARRAY, + .tlv_type = PING_OPT2_TLV_TYPE, + .offset = offsetof(struct test_ping_resp_msg_v01, + service_name_valid), + }, + { + .data_type = QMI_STRUCT, + .elem_len = 1, + .elem_size = sizeof(struct test_name_type_v01), + .array_type = NO_ARRAY, + .tlv_type = PING_OPT2_TLV_TYPE, + .offset = offsetof(struct test_ping_resp_msg_v01, + service_name), + .ei_array = test_name_type_v01_ei, + }, + {} +}; + +struct test_data_req_msg_v01 { + u32 data_len; + u8 data[TEST_MED_DATA_SIZE_V01]; + + u8 client_name_valid; + struct test_name_type_v01 client_name; +}; + +static struct qmi_elem_info test_data_req_msg_v01_ei[] = { + { + .data_type = QMI_DATA_LEN, + .elem_len = 1, + .elem_size = sizeof(u32), + .array_type = NO_ARRAY, + .tlv_type = DATA_REQ1_TLV_TYPE, + .offset = offsetof(struct test_data_req_msg_v01, + data_len), + }, + { + .data_type = QMI_UNSIGNED_1_BYTE, + .elem_len = TEST_MED_DATA_SIZE_V01, + .elem_size = sizeof(u8), + .array_type = VAR_LEN_ARRAY, + .tlv_type = DATA_REQ1_TLV_TYPE, + .offset = offsetof(struct test_data_req_msg_v01, + data), + }, + { + .data_type = QMI_OPT_FLAG, + .elem_len = 1, + .elem_size = sizeof(u8), + .array_type = NO_ARRAY, + .tlv_type = DATA_OPT1_TLV_TYPE, + .offset = offsetof(struct test_data_req_msg_v01, + client_name_valid), + }, + { + .data_type = QMI_STRUCT, + .elem_len = 1, + .elem_size = sizeof(struct test_name_type_v01), + .array_type = NO_ARRAY, + .tlv_type = DATA_OPT1_TLV_TYPE, + .offset = offsetof(struct test_data_req_msg_v01, + client_name), + .ei_array = test_name_type_v01_ei, + }, + {} +}; + +struct test_data_resp_msg_v01 { + struct qmi_response_type_v01 resp; + + u8 data_valid; + u32 data_len; + u8 data[TEST_MED_DATA_SIZE_V01]; + + u8 service_name_valid; + struct 
test_name_type_v01 service_name; +}; + +static struct qmi_elem_info test_data_resp_msg_v01_ei[] = { + { + .data_type = QMI_STRUCT, + .elem_len = 1, + .elem_size = sizeof(struct qmi_response_type_v01), + .array_type = NO_ARRAY, + .tlv_type = DATA_RESP1_TLV_TYPE, + .offset = offsetof(struct test_data_resp_msg_v01, + resp), + .ei_array = qmi_response_type_v01_ei, + }, + { + .data_type = QMI_OPT_FLAG, + .elem_len = 1, + .elem_size = sizeof(u8), + .array_type = NO_ARRAY, + .tlv_type = DATA_OPT1_TLV_TYPE, + .offset = offsetof(struct test_data_resp_msg_v01, + data_valid), + }, + { + .data_type = QMI_DATA_LEN, + .elem_len = 1, + .elem_size = sizeof(u32), + .array_type = NO_ARRAY, + .tlv_type = DATA_OPT1_TLV_TYPE, + .offset = offsetof(struct test_data_resp_msg_v01, + data_len), + }, + { + .data_type = QMI_UNSIGNED_1_BYTE, + .elem_len = TEST_MED_DATA_SIZE_V01, + .elem_size = sizeof(u8), + .array_type = VAR_LEN_ARRAY, + .tlv_type = DATA_OPT1_TLV_TYPE, + .offset = offsetof(struct test_data_resp_msg_v01, + data), + }, + { + .data_type = QMI_OPT_FLAG, + .elem_len = 1, + .elem_size = sizeof(u8), + .array_type = NO_ARRAY, + .tlv_type = DATA_OPT2_TLV_TYPE, + .offset = offsetof(struct test_data_resp_msg_v01, + service_name_valid), + }, + { + .data_type = QMI_STRUCT, + .elem_len = 1, + .elem_size = sizeof(struct test_name_type_v01), + .array_type = NO_ARRAY, + .tlv_type = DATA_OPT2_TLV_TYPE, + .offset = offsetof(struct test_data_resp_msg_v01, + service_name), + .ei_array = test_name_type_v01_ei, + }, + {} +}; + +/* + * ping_write() - ping_pong debugfs file write handler + * @file: debugfs file context + * @user_buf: reference to the user data (ignored) + * @count: number of bytes in @user_buf + * @ppos: offset in @file to write + * + * This function allows user space to send out a ping_pong QMI encoded message + * to the associated remote test service and will return with the result of the + * transaction. It serves as an example of how to provide a custom response + * handler. 
+ * + * Return: @count, or negative errno on failure. + */ +static ssize_t ping_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct qmi_handle *qmi = file->private_data; + struct test_ping_req_msg_v01 req = {}; + struct qmi_txn txn; + int ret; + + memcpy(req.ping, "ping", sizeof(req.ping)); + + ret = qmi_txn_init(qmi, &txn, NULL, NULL); + if (ret < 0) + return ret; + + ret = qmi_send_request(qmi, NULL, &txn, + TEST_PING_REQ_MSG_ID_V01, + TEST_PING_REQ_MAX_MSG_LEN_V01, + test_ping_req_msg_v01_ei, &req); + if (ret < 0) { + qmi_txn_cancel(&txn); + return ret; + } + + ret = qmi_txn_wait(&txn, 5 * HZ); + if (ret < 0) + count = ret; + + return count; +} + +static const struct file_operations ping_fops = { + .open = simple_open, + .write = ping_write, +}; + +static void ping_pong_cb(struct qmi_handle *qmi, struct sockaddr_qrtr *sq, + struct qmi_txn *txn, const void *data) +{ + const struct test_ping_resp_msg_v01 *resp = data; + + if (!txn) { + pr_err("spurious ping response\n"); + return; + } + + if (resp->resp.result == QMI_RESULT_FAILURE_V01) + txn->result = -ENXIO; + else if (!resp->pong_valid || memcmp(resp->pong, "pong", 4)) + txn->result = -EINVAL; + + complete(&txn->completion); +} + +/* + * data_write() - data debugfs file write handler + * @file: debugfs file context + * @user_buf: reference to the user data + * @count: number of bytes in @user_buf + * @ppos: offset in @file to write + * + * This function allows user space to send out a data QMI encoded message to + * the associated remote test service and will return with the result of the + * transaction. It serves as an example of how to have the QMI helpers decode a + * transaction response into a provided object automatically. + * + * Return: @count, or negative errno on failure. 
+ */ +static ssize_t data_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) + +{ + struct qmi_handle *qmi = file->private_data; + struct test_data_resp_msg_v01 *resp; + struct test_data_req_msg_v01 *req; + struct qmi_txn txn; + int ret; + + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + resp = kzalloc(sizeof(*resp), GFP_KERNEL); + if (!resp) { + kfree(req); + return -ENOMEM; + } + + req->data_len = min_t(size_t, sizeof(req->data), count); + if (copy_from_user(req->data, user_buf, req->data_len)) { + ret = -EFAULT; + goto out; + } + + ret = qmi_txn_init(qmi, &txn, test_data_resp_msg_v01_ei, resp); + if (ret < 0) + goto out; + + ret = qmi_send_request(qmi, NULL, &txn, + TEST_DATA_REQ_MSG_ID_V01, + TEST_DATA_REQ_MAX_MSG_LEN_V01, + test_data_req_msg_v01_ei, req); + if (ret < 0) { + qmi_txn_cancel(&txn); + goto out; + } + + ret = qmi_txn_wait(&txn, 5 * HZ); + if (ret < 0) { + goto out; + } else if (!resp->data_valid || + resp->data_len != req->data_len || + memcmp(resp->data, req->data, req->data_len)) { + pr_err("response data doesn't match expectation\n"); + ret = -EINVAL; + goto out; + } + + ret = count; + +out: + kfree(resp); + kfree(req); + + return ret; +} + +static const struct file_operations data_fops = { + .open = simple_open, + .write = data_write, +}; + +static struct qmi_msg_handler qmi_sample_handlers[] = { + { + .type = QMI_RESPONSE, + .msg_id = TEST_PING_REQ_MSG_ID_V01, + .ei = test_ping_resp_msg_v01_ei, + .decoded_size = sizeof(struct test_ping_req_msg_v01), + .fn = ping_pong_cb + }, + {} +}; + +struct qmi_sample { + struct qmi_handle qmi; + + struct dentry *de_dir; + struct dentry *de_data; + struct dentry *de_ping; +}; + +static struct dentry *qmi_debug_dir; + +static int qmi_sample_probe(struct platform_device *pdev) +{ + struct sockaddr_qrtr *sq; + struct qmi_sample *sample; + char path[20]; + int ret; + + sample = devm_kzalloc(&pdev->dev, sizeof(*sample), GFP_KERNEL); + if (!sample) + 
return -ENOMEM; + + ret = qmi_handle_init(&sample->qmi, TEST_DATA_REQ_MAX_MSG_LEN_V01, + NULL, + qmi_sample_handlers); + if (ret < 0) + return ret; + + sq = dev_get_platdata(&pdev->dev); + ret = kernel_connect(sample->qmi.sock, (struct sockaddr *)sq, + sizeof(*sq), 0); + if (ret < 0) { + pr_err("failed to connect to remote service port\n"); + goto err_release_qmi_handle; + } + + snprintf(path, sizeof(path), "%d:%d", sq->sq_node, sq->sq_port); + + sample->de_dir = debugfs_create_dir(path, qmi_debug_dir); + if (IS_ERR(sample->de_dir)) { + ret = PTR_ERR(sample->de_dir); + goto err_release_qmi_handle; + } + + sample->de_data = debugfs_create_file("data", 0600, sample->de_dir, + sample, &data_fops); + if (IS_ERR(sample->de_data)) { + ret = PTR_ERR(sample->de_data); + goto err_remove_de_dir; + } + + sample->de_ping = debugfs_create_file("ping", 0600, sample->de_dir, + sample, &ping_fops); + if (IS_ERR(sample->de_ping)) { + ret = PTR_ERR(sample->de_ping); + goto err_remove_de_data; + } + + platform_set_drvdata(pdev, sample); + + return 0; + +err_remove_de_data: + debugfs_remove(sample->de_data); +err_remove_de_dir: + debugfs_remove(sample->de_dir); +err_release_qmi_handle: + qmi_handle_release(&sample->qmi); + + return ret; +} + +static int qmi_sample_remove(struct platform_device *pdev) +{ + struct qmi_sample *sample = platform_get_drvdata(pdev); + + debugfs_remove(sample->de_ping); + debugfs_remove(sample->de_data); + debugfs_remove(sample->de_dir); + + qmi_handle_release(&sample->qmi); + + return 0; +} + +static struct platform_driver qmi_sample_driver = { + .probe = qmi_sample_probe, + .remove = qmi_sample_remove, + .driver = { + .name = "qmi_sample_client", + }, +}; + +static int qmi_sample_new_server(struct qmi_handle *qmi, + struct qmi_service *service) +{ + struct platform_device *pdev; + struct sockaddr_qrtr sq = { AF_QIPCRTR, service->node, service->port }; + int ret; + + pdev = platform_device_alloc("qmi_sample_client", PLATFORM_DEVID_AUTO); + if (!pdev) + 
return -ENOMEM; + + ret = platform_device_add_data(pdev, &sq, sizeof(sq)); + if (ret) + goto err_put_device; + + ret = platform_device_add(pdev); + if (ret) + goto err_put_device; + + service->priv = pdev; + + return 0; + +err_put_device: + platform_device_put(pdev); + + return ret; +} + +static void qmi_sample_del_server(struct qmi_handle *qmi, + struct qmi_service *service) +{ + struct platform_device *pdev = service->priv; + + platform_device_unregister(pdev); +} + +static struct qmi_handle lookup_client; + +static struct qmi_ops lookup_ops = { + .new_server = qmi_sample_new_server, + .del_server = qmi_sample_del_server, +}; + +static int qmi_sample_init(void) +{ + int ret; + + qmi_debug_dir = debugfs_create_dir("qmi_sample", NULL); + if (IS_ERR(qmi_debug_dir)) { + pr_err("failed to create qmi_sample dir\n"); + return PTR_ERR(qmi_debug_dir); + } + + ret = platform_driver_register(&qmi_sample_driver); + if (ret) + goto err_remove_debug_dir; + + ret = qmi_handle_init(&lookup_client, 0, &lookup_ops, NULL); + if (ret < 0) + goto err_unregister_driver; + + qmi_add_lookup(&lookup_client, 15, 0, 0); + + return 0; + +err_unregister_driver: + platform_driver_unregister(&qmi_sample_driver); +err_remove_debug_dir: + debugfs_remove(qmi_debug_dir); + + return ret; +} + +static void qmi_sample_exit(void) +{ + qmi_handle_release(&lookup_client); + + platform_driver_unregister(&qmi_sample_driver); + + debugfs_remove(qmi_debug_dir); +} + +module_init(qmi_sample_init); +module_exit(qmi_sample_exit); + +MODULE_DESCRIPTION("Sample QMI client driver"); +MODULE_LICENSE("GPL v2"); diff --git a/samples/rpmsg/Makefile b/samples/rpmsg/Makefile new file mode 100644 index 000000000..ddf9a5d13 --- /dev/null +++ b/samples/rpmsg/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_SAMPLE_RPMSG_CLIENT) += rpmsg_client_sample.o diff --git a/samples/rpmsg/rpmsg_client_sample.c b/samples/rpmsg/rpmsg_client_sample.c new file mode 100644 index 000000000..ae5081662 --- 
/dev/null +++ b/samples/rpmsg/rpmsg_client_sample.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Remote processor messaging - sample client driver + * + * Copyright (C) 2011 Texas Instruments, Inc. + * Copyright (C) 2011 Google, Inc. + * + * Ohad Ben-Cohen <ohad@wizery.com> + * Brian Swetland <swetland@google.com> + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/rpmsg.h> + +#define MSG "hello world!" + +static int count = 100; +module_param(count, int, 0644); + +struct instance_data { + int rx_count; +}; + +static int rpmsg_sample_cb(struct rpmsg_device *rpdev, void *data, int len, + void *priv, u32 src) +{ + int ret; + struct instance_data *idata = dev_get_drvdata(&rpdev->dev); + + dev_info(&rpdev->dev, "incoming msg %d (src: 0x%x)\n", + ++idata->rx_count, src); + + print_hex_dump_debug(__func__, DUMP_PREFIX_NONE, 16, 1, data, len, + true); + + /* samples should not live forever */ + if (idata->rx_count >= count) { + dev_info(&rpdev->dev, "goodbye!\n"); + return 0; + } + + /* send a new message now */ + ret = rpmsg_send(rpdev->ept, MSG, strlen(MSG)); + if (ret) + dev_err(&rpdev->dev, "rpmsg_send failed: %d\n", ret); + + return 0; +} + +static int rpmsg_sample_probe(struct rpmsg_device *rpdev) +{ + int ret; + struct instance_data *idata; + + dev_info(&rpdev->dev, "new channel: 0x%x -> 0x%x!\n", + rpdev->src, rpdev->dst); + + idata = devm_kzalloc(&rpdev->dev, sizeof(*idata), GFP_KERNEL); + if (!idata) + return -ENOMEM; + + dev_set_drvdata(&rpdev->dev, idata); + + /* send a message to our remote processor */ + ret = rpmsg_send(rpdev->ept, MSG, strlen(MSG)); + if (ret) { + dev_err(&rpdev->dev, "rpmsg_send failed: %d\n", ret); + return ret; + } + + return 0; +} + +static void rpmsg_sample_remove(struct rpmsg_device *rpdev) +{ + dev_info(&rpdev->dev, "rpmsg sample client driver is removed\n"); +} + +static struct rpmsg_device_id rpmsg_driver_sample_id_table[] = { + { .name = "rpmsg-client-sample" }, + { }, +}; 
+MODULE_DEVICE_TABLE(rpmsg, rpmsg_driver_sample_id_table); + +static struct rpmsg_driver rpmsg_sample_client = { + .drv.name = KBUILD_MODNAME, + .id_table = rpmsg_driver_sample_id_table, + .probe = rpmsg_sample_probe, + .callback = rpmsg_sample_cb, + .remove = rpmsg_sample_remove, +}; +module_rpmsg_driver(rpmsg_sample_client); + +MODULE_DESCRIPTION("Remote processor messaging sample client driver"); +MODULE_LICENSE("GPL v2"); diff --git a/samples/seccomp/.gitignore b/samples/seccomp/.gitignore new file mode 100644 index 000000000..4a5a5b7db --- /dev/null +++ b/samples/seccomp/.gitignore @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only +bpf-direct +bpf-fancy +dropper +user-trap diff --git a/samples/seccomp/Makefile b/samples/seccomp/Makefile new file mode 100644 index 000000000..c85ae0ed8 --- /dev/null +++ b/samples/seccomp/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +userprogs-always-y += bpf-fancy dropper bpf-direct user-trap + +bpf-fancy-objs := bpf-fancy.o bpf-helper.o + +userccflags += -I usr/include diff --git a/samples/seccomp/bpf-direct.c b/samples/seccomp/bpf-direct.c new file mode 100644 index 000000000..c09e4a17a --- /dev/null +++ b/samples/seccomp/bpf-direct.c @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Seccomp filter example for x86 (32-bit and 64-bit) with BPF macros + * + * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org> + * Author: Will Drewry <wad@chromium.org> + * + * The code may be used by anyone for any purpose, + * and can serve as a starting point for developing + * applications using prctl(PR_SET_SECCOMP, 2, ...). 
+ */ +#if defined(__i386__) || defined(__x86_64__) +#define SUPPORTED_ARCH 1 +#endif + +#if defined(SUPPORTED_ARCH) +#define __USE_GNU 1 +#define _GNU_SOURCE 1 + +#include <linux/types.h> +#include <linux/filter.h> +#include <linux/seccomp.h> +#include <linux/unistd.h> +#include <signal.h> +#include <stdio.h> +#include <stddef.h> +#include <string.h> +#include <sys/prctl.h> +#include <unistd.h> + +#define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n])) +#define syscall_nr (offsetof(struct seccomp_data, nr)) + +#if defined(__i386__) +#define REG_RESULT REG_EAX +#define REG_SYSCALL REG_EAX +#define REG_ARG0 REG_EBX +#define REG_ARG1 REG_ECX +#define REG_ARG2 REG_EDX +#define REG_ARG3 REG_ESI +#define REG_ARG4 REG_EDI +#define REG_ARG5 REG_EBP +#elif defined(__x86_64__) +#define REG_RESULT REG_RAX +#define REG_SYSCALL REG_RAX +#define REG_ARG0 REG_RDI +#define REG_ARG1 REG_RSI +#define REG_ARG2 REG_RDX +#define REG_ARG3 REG_R10 +#define REG_ARG4 REG_R8 +#define REG_ARG5 REG_R9 +#endif + +#ifndef PR_SET_NO_NEW_PRIVS +#define PR_SET_NO_NEW_PRIVS 38 +#endif + +#ifndef SYS_SECCOMP +#define SYS_SECCOMP 1 +#endif + +static void emulator(int nr, siginfo_t *info, void *void_context) +{ + ucontext_t *ctx = (ucontext_t *)(void_context); + int syscall; + char *buf; + ssize_t bytes; + size_t len; + if (info->si_code != SYS_SECCOMP) + return; + if (!ctx) + return; + syscall = ctx->uc_mcontext.gregs[REG_SYSCALL]; + buf = (char *) ctx->uc_mcontext.gregs[REG_ARG1]; + len = (size_t) ctx->uc_mcontext.gregs[REG_ARG2]; + + if (syscall != __NR_write) + return; + if (ctx->uc_mcontext.gregs[REG_ARG0] != STDERR_FILENO) + return; + /* Redirect stderr messages to stdout. 
Doesn't handle EINTR, etc */ + ctx->uc_mcontext.gregs[REG_RESULT] = -1; + if (write(STDOUT_FILENO, "[ERR] ", 6) > 0) { + bytes = write(STDOUT_FILENO, buf, len); + ctx->uc_mcontext.gregs[REG_RESULT] = bytes; + } + return; +} + +static int install_emulator(void) +{ + struct sigaction act; + sigset_t mask; + memset(&act, 0, sizeof(act)); + sigemptyset(&mask); + sigaddset(&mask, SIGSYS); + + act.sa_sigaction = &emulator; + act.sa_flags = SA_SIGINFO; + if (sigaction(SIGSYS, &act, NULL) < 0) { + perror("sigaction"); + return -1; + } + if (sigprocmask(SIG_UNBLOCK, &mask, NULL)) { + perror("sigprocmask"); + return -1; + } + return 0; +} + +static int install_filter(void) +{ + struct sock_filter filter[] = { + /* Grab the system call number */ + BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_nr), + /* Jump table for the allowed syscalls */ + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_rt_sigreturn, 0, 1), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), +#ifdef __NR_sigreturn + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_sigreturn, 0, 1), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), +#endif + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_exit_group, 0, 1), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_exit, 0, 1), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_read, 1, 0), + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_write, 3, 2), + + /* Check that read is only using stdin. 
*/ + BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_arg(0)), + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDIN_FILENO, 4, 0), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), + + /* Check that write is only using stdout */ + BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_arg(0)), + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDOUT_FILENO, 1, 0), + /* Trap attempts to write to stderr */ + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDERR_FILENO, 1, 2), + + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_TRAP), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), + }; + struct sock_fprog prog = { + .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), + .filter = filter, + }; + + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { + perror("prctl(NO_NEW_PRIVS)"); + return 1; + } + + + if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) { + perror("prctl"); + return 1; + } + return 0; +} + +#define payload(_c) (_c), sizeof((_c)) +int main(int argc, char **argv) +{ + char buf[4096]; + ssize_t bytes = 0; + if (install_emulator()) + return 1; + if (install_filter()) + return 1; + syscall(__NR_write, STDOUT_FILENO, + payload("OHAI! WHAT IS YOUR NAME? ")); + bytes = syscall(__NR_read, STDIN_FILENO, buf, sizeof(buf)); + syscall(__NR_write, STDOUT_FILENO, payload("HELLO, ")); + syscall(__NR_write, STDOUT_FILENO, buf, bytes); + syscall(__NR_write, STDERR_FILENO, + payload("Error message going to STDERR\n")); + return 0; +} +#else /* SUPPORTED_ARCH */ +/* + * This sample is x86-only. Since kernel samples are compiled with the + * host toolchain, a non-x86 host will result in using only the main() + * below. + */ +int main(void) +{ + return 1; +} +#endif /* SUPPORTED_ARCH */ diff --git a/samples/seccomp/bpf-fancy.c b/samples/seccomp/bpf-fancy.c new file mode 100644 index 000000000..1ccb43502 --- /dev/null +++ b/samples/seccomp/bpf-fancy.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Seccomp BPF example using a macro-based generator. 
+ * + * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org> + * Author: Will Drewry <wad@chromium.org> + * + * The code may be used by anyone for any purpose, + * and can serve as a starting point for developing + * applications using prctl(PR_ATTACH_SECCOMP_FILTER). + */ + +#include <linux/filter.h> +#include <linux/seccomp.h> +#include <linux/unistd.h> +#include <stdio.h> +#include <string.h> +#include <sys/prctl.h> +#include <unistd.h> + +#include "bpf-helper.h" + +#ifndef PR_SET_NO_NEW_PRIVS +#define PR_SET_NO_NEW_PRIVS 38 +#endif + +int main(int argc, char **argv) +{ + struct bpf_labels l = { + .count = 0, + }; + static const char msg1[] = "Please type something: "; + static const char msg2[] = "You typed: "; + char buf[256]; + struct sock_filter filter[] = { + /* TODO: LOAD_SYSCALL_NR(arch) and enforce an arch */ + LOAD_SYSCALL_NR, + SYSCALL(__NR_exit, ALLOW), + SYSCALL(__NR_exit_group, ALLOW), + SYSCALL(__NR_write, JUMP(&l, write_fd)), + SYSCALL(__NR_read, JUMP(&l, read)), + DENY, /* Don't passthrough into a label */ + + LABEL(&l, read), + ARG(0), + JNE(STDIN_FILENO, DENY), + ARG(1), + JNE((unsigned long)buf, DENY), + ARG(2), + JGE(sizeof(buf), DENY), + ALLOW, + + LABEL(&l, write_fd), + ARG(0), + JEQ(STDOUT_FILENO, JUMP(&l, write_buf)), + JEQ(STDERR_FILENO, JUMP(&l, write_buf)), + DENY, + + LABEL(&l, write_buf), + ARG(1), + JEQ((unsigned long)msg1, JUMP(&l, msg1_len)), + JEQ((unsigned long)msg2, JUMP(&l, msg2_len)), + JEQ((unsigned long)buf, JUMP(&l, buf_len)), + DENY, + + LABEL(&l, msg1_len), + ARG(2), + JLT(sizeof(msg1), ALLOW), + DENY, + + LABEL(&l, msg2_len), + ARG(2), + JLT(sizeof(msg2), ALLOW), + DENY, + + LABEL(&l, buf_len), + ARG(2), + JLT(sizeof(buf), ALLOW), + DENY, + }; + struct sock_fprog prog = { + .filter = filter, + .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), + }; + ssize_t bytes; + bpf_resolve_jumps(&l, filter, sizeof(filter)/sizeof(*filter)); + + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { + 
perror("prctl(NO_NEW_PRIVS)"); + return 1; + } + + if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) { + perror("prctl(SECCOMP)"); + return 1; + } + syscall(__NR_write, STDOUT_FILENO, msg1, strlen(msg1)); + bytes = syscall(__NR_read, STDIN_FILENO, buf, sizeof(buf)-1); + bytes = (bytes > 0 ? bytes : 0); + syscall(__NR_write, STDERR_FILENO, msg2, strlen(msg2)); + syscall(__NR_write, STDERR_FILENO, buf, bytes); + /* Now get killed */ + syscall(__NR_write, STDERR_FILENO, msg2, strlen(msg2)+2); + return 0; +} diff --git a/samples/seccomp/bpf-helper.c b/samples/seccomp/bpf-helper.c new file mode 100644 index 000000000..ae260d77a --- /dev/null +++ b/samples/seccomp/bpf-helper.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Seccomp BPF helper functions + * + * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org> + * Author: Will Drewry <wad@chromium.org> + * + * The code may be used by anyone for any purpose, + * and can serve as a starting point for developing + * applications using prctl(PR_ATTACH_SECCOMP_FILTER). + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "bpf-helper.h" + +int bpf_resolve_jumps(struct bpf_labels *labels, + struct sock_filter *filter, size_t count) +{ + size_t i; + + if (count < 1 || count > BPF_MAXINSNS) + return -1; + /* + * Walk it once, backwards, to build the label table and do fixups. + * Since backward jumps are disallowed by BPF, this is easy. 
+ */ + for (i = 0; i < count; ++i) { + size_t offset = count - i - 1; + struct sock_filter *instr = &filter[offset]; + if (instr->code != (BPF_JMP+BPF_JA)) + continue; + switch ((instr->jt<<8)|instr->jf) { + case (JUMP_JT<<8)|JUMP_JF: + if (labels->labels[instr->k].location == 0xffffffff) { + fprintf(stderr, "Unresolved label: '%s'\n", + labels->labels[instr->k].label); + return 1; + } + instr->k = labels->labels[instr->k].location - + (offset + 1); + instr->jt = 0; + instr->jf = 0; + continue; + case (LABEL_JT<<8)|LABEL_JF: + if (labels->labels[instr->k].location != 0xffffffff) { + fprintf(stderr, "Duplicate label use: '%s'\n", + labels->labels[instr->k].label); + return 1; + } + labels->labels[instr->k].location = offset; + instr->k = 0; /* fall through */ + instr->jt = 0; + instr->jf = 0; + continue; + } + } + return 0; +} + +/* Simple lookup table for labels. */ +__u32 seccomp_bpf_label(struct bpf_labels *labels, const char *label) +{ + struct __bpf_label *begin = labels->labels, *end; + int id; + + if (labels->count == BPF_LABELS_MAX) { + fprintf(stderr, "Too many labels\n"); + exit(1); + } + if (labels->count == 0) { + begin->label = label; + begin->location = 0xffffffff; + labels->count++; + return 0; + } + end = begin + labels->count; + for (id = 0; begin < end; ++begin, ++id) { + if (!strcmp(label, begin->label)) + return id; + } + begin->label = label; + begin->location = 0xffffffff; + labels->count++; + return id; +} + +void seccomp_bpf_print(struct sock_filter *filter, size_t count) +{ + struct sock_filter *end = filter + count; + for ( ; filter < end; ++filter) + printf("{ code=%u,jt=%u,jf=%u,k=%u },\n", + filter->code, filter->jt, filter->jf, filter->k); +} diff --git a/samples/seccomp/bpf-helper.h b/samples/seccomp/bpf-helper.h new file mode 100644 index 000000000..0cc9816fe --- /dev/null +++ b/samples/seccomp/bpf-helper.h @@ -0,0 +1,263 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Example wrapper around BPF macros. 
+ * + * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org> + * Author: Will Drewry <wad@chromium.org> + * + * The code may be used by anyone for any purpose, + * and can serve as a starting point for developing + * applications using prctl(PR_SET_SECCOMP, 2, ...). + * + * No guarantees are provided with respect to the correctness + * or functionality of this code. + */ +#ifndef __BPF_HELPER_H__ +#define __BPF_HELPER_H__ + +#include <asm/bitsperlong.h> /* for __BITS_PER_LONG */ +#include <endian.h> +#include <linux/filter.h> +#include <linux/seccomp.h> /* for seccomp_data */ +#include <linux/types.h> +#include <linux/unistd.h> +#include <stddef.h> + +#define BPF_LABELS_MAX 256 +struct bpf_labels { + int count; + struct __bpf_label { + const char *label; + __u32 location; + } labels[BPF_LABELS_MAX]; +}; + +int bpf_resolve_jumps(struct bpf_labels *labels, + struct sock_filter *filter, size_t count); +__u32 seccomp_bpf_label(struct bpf_labels *labels, const char *label); +void seccomp_bpf_print(struct sock_filter *filter, size_t count); + +#define JUMP_JT 0xff +#define JUMP_JF 0xff +#define LABEL_JT 0xfe +#define LABEL_JF 0xfe + +#define ALLOW \ + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW) +#define DENY \ + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL) +#define JUMP(labels, label) \ + BPF_JUMP(BPF_JMP+BPF_JA, FIND_LABEL((labels), (label)), \ + JUMP_JT, JUMP_JF) +#define LABEL(labels, label) \ + BPF_JUMP(BPF_JMP+BPF_JA, FIND_LABEL((labels), (label)), \ + LABEL_JT, LABEL_JF) +#define SYSCALL(nr, jt) \ + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (nr), 0, 1), \ + jt + +/* Lame, but just an example */ +#define FIND_LABEL(labels, label) seccomp_bpf_label((labels), #label) + +#define EXPAND(...) __VA_ARGS__ + +/* Ensure that we load the logically correct offset. 
*/ +#if __BYTE_ORDER == __LITTLE_ENDIAN +#define LO_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) +#elif __BYTE_ORDER == __BIG_ENDIAN +#define LO_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) + sizeof(__u32) +#else +#error "Unknown endianness" +#endif + +/* Map all width-sensitive operations */ +#if __BITS_PER_LONG == 32 + +#define JEQ(x, jt) JEQ32(x, EXPAND(jt)) +#define JNE(x, jt) JNE32(x, EXPAND(jt)) +#define JGT(x, jt) JGT32(x, EXPAND(jt)) +#define JLT(x, jt) JLT32(x, EXPAND(jt)) +#define JGE(x, jt) JGE32(x, EXPAND(jt)) +#define JLE(x, jt) JLE32(x, EXPAND(jt)) +#define JA(x, jt) JA32(x, EXPAND(jt)) +#define ARG(i) ARG_32(i) + +#elif __BITS_PER_LONG == 64 + +/* Ensure that we load the logically correct offset. */ +#if __BYTE_ORDER == __LITTLE_ENDIAN +#define ENDIAN(_lo, _hi) _lo, _hi +#define HI_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) + sizeof(__u32) +#elif __BYTE_ORDER == __BIG_ENDIAN +#define ENDIAN(_lo, _hi) _hi, _lo +#define HI_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) +#endif + +union arg64 { + struct { + __u32 ENDIAN(lo32, hi32); + }; + __u64 u64; +}; + +#define JEQ(x, jt) \ + JEQ64(((union arg64){.u64 = (x)}).lo32, \ + ((union arg64){.u64 = (x)}).hi32, \ + EXPAND(jt)) +#define JGT(x, jt) \ + JGT64(((union arg64){.u64 = (x)}).lo32, \ + ((union arg64){.u64 = (x)}).hi32, \ + EXPAND(jt)) +#define JGE(x, jt) \ + JGE64(((union arg64){.u64 = (x)}).lo32, \ + ((union arg64){.u64 = (x)}).hi32, \ + EXPAND(jt)) +#define JNE(x, jt) \ + JNE64(((union arg64){.u64 = (x)}).lo32, \ + ((union arg64){.u64 = (x)}).hi32, \ + EXPAND(jt)) +#define JLT(x, jt) \ + JLT64(((union arg64){.u64 = (x)}).lo32, \ + ((union arg64){.u64 = (x)}).hi32, \ + EXPAND(jt)) +#define JLE(x, jt) \ + JLE64(((union arg64){.u64 = (x)}).lo32, \ + ((union arg64){.u64 = (x)}).hi32, \ + EXPAND(jt)) + +#define JA(x, jt) \ + JA64(((union arg64){.u64 = (x)}).lo32, \ + ((union arg64){.u64 = (x)}).hi32, \ + EXPAND(jt)) +#define ARG(i) ARG_64(i) + +#else +#error __BITS_PER_LONG 
value unusable. +#endif + +/* Loads the arg into A */ +#define ARG_32(idx) \ + BPF_STMT(BPF_LD+BPF_W+BPF_ABS, LO_ARG(idx)) + +/* Loads lo into M[0] and hi into M[1] and A */ +#define ARG_64(idx) \ + BPF_STMT(BPF_LD+BPF_W+BPF_ABS, LO_ARG(idx)), \ + BPF_STMT(BPF_ST, 0), /* lo -> M[0] */ \ + BPF_STMT(BPF_LD+BPF_W+BPF_ABS, HI_ARG(idx)), \ + BPF_STMT(BPF_ST, 1) /* hi -> M[1] */ + +#define JEQ32(value, jt) \ + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (value), 0, 1), \ + jt + +#define JNE32(value, jt) \ + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (value), 1, 0), \ + jt + +#define JA32(value, jt) \ + BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (value), 0, 1), \ + jt + +#define JGE32(value, jt) \ + BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (value), 0, 1), \ + jt + +#define JGT32(value, jt) \ + BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (value), 0, 1), \ + jt + +#define JLE32(value, jt) \ + BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (value), 1, 0), \ + jt + +#define JLT32(value, jt) \ + BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (value), 1, 0), \ + jt + +/* + * All the JXX64 checks assume lo is saved in M[0] and hi is saved in both + * A and M[1]. This invariant is kept by restoring A if necessary. 
+ */ +#define JEQ64(lo, hi, jt) \ + /* if (hi != arg.hi) goto NOMATCH; */ \ + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \ + BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \ + /* if (lo != arg.lo) goto NOMATCH; */ \ + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (lo), 0, 2), \ + BPF_STMT(BPF_LD+BPF_MEM, 1), \ + jt, \ + BPF_STMT(BPF_LD+BPF_MEM, 1) + +#define JNE64(lo, hi, jt) \ + /* if (hi != arg.hi) goto MATCH; */ \ + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 3), \ + BPF_STMT(BPF_LD+BPF_MEM, 0), \ + /* if (lo != arg.lo) goto MATCH; */ \ + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (lo), 2, 0), \ + BPF_STMT(BPF_LD+BPF_MEM, 1), \ + jt, \ + BPF_STMT(BPF_LD+BPF_MEM, 1) + +#define JA64(lo, hi, jt) \ + /* if (hi & arg.hi) goto MATCH; */ \ + BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (hi), 3, 0), \ + BPF_STMT(BPF_LD+BPF_MEM, 0), \ + /* if (lo & arg.lo) goto MATCH; */ \ + BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (lo), 0, 2), \ + BPF_STMT(BPF_LD+BPF_MEM, 1), \ + jt, \ + BPF_STMT(BPF_LD+BPF_MEM, 1) + +#define JGE64(lo, hi, jt) \ + /* if (hi > arg.hi) goto MATCH; */ \ + BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (hi), 4, 0), \ + /* if (hi != arg.hi) goto NOMATCH; */ \ + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \ + BPF_STMT(BPF_LD+BPF_MEM, 0), \ + /* if (lo >= arg.lo) goto MATCH; */ \ + BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (lo), 0, 2), \ + BPF_STMT(BPF_LD+BPF_MEM, 1), \ + jt, \ + BPF_STMT(BPF_LD+BPF_MEM, 1) + +#define JGT64(lo, hi, jt) \ + /* if (hi > arg.hi) goto MATCH; */ \ + BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (hi), 4, 0), \ + /* if (hi != arg.hi) goto NOMATCH; */ \ + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \ + BPF_STMT(BPF_LD+BPF_MEM, 0), \ + /* if (lo > arg.lo) goto MATCH; */ \ + BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (lo), 0, 2), \ + BPF_STMT(BPF_LD+BPF_MEM, 1), \ + jt, \ + BPF_STMT(BPF_LD+BPF_MEM, 1) + +#define JLE64(lo, hi, jt) \ + /* if (hi < arg.hi) goto MATCH; */ \ + BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (hi), 0, 4), \ + /* if (hi != arg.hi) goto NOMATCH; */ \ + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \ + 
BPF_STMT(BPF_LD+BPF_MEM, 0), \ + /* if (lo <= arg.lo) goto MATCH; */ \ + BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (lo), 2, 0), \ + BPF_STMT(BPF_LD+BPF_MEM, 1), \ + jt, \ + BPF_STMT(BPF_LD+BPF_MEM, 1) + +#define JLT64(lo, hi, jt) \ + /* if (hi < arg.hi) goto MATCH; */ \ + BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (hi), 0, 4), \ + /* if (hi != arg.hi) goto NOMATCH; */ \ + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \ + BPF_STMT(BPF_LD+BPF_MEM, 0), \ + /* if (lo < arg.lo) goto MATCH; */ \ + BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (lo), 2, 0), \ + BPF_STMT(BPF_LD+BPF_MEM, 1), \ + jt, \ + BPF_STMT(BPF_LD+BPF_MEM, 1) + +#define LOAD_SYSCALL_NR \ + BPF_STMT(BPF_LD+BPF_W+BPF_ABS, \ + offsetof(struct seccomp_data, nr)) + +#endif /* __BPF_HELPER_H__ */ diff --git a/samples/seccomp/dropper.c b/samples/seccomp/dropper.c new file mode 100644 index 000000000..cc0648eb3 --- /dev/null +++ b/samples/seccomp/dropper.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Naive system call dropper built on seccomp_filter. + * + * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org> + * Author: Will Drewry <wad@chromium.org> + * + * The code may be used by anyone for any purpose, + * and can serve as a starting point for developing + * applications using prctl(PR_SET_SECCOMP, 2, ...). + * + * When run, returns the specified errno for the specified + * system call number against the given architecture. 
+ * + */ + +#include <errno.h> +#include <linux/audit.h> +#include <linux/filter.h> +#include <linux/seccomp.h> +#include <linux/unistd.h> +#include <stdio.h> +#include <stddef.h> +#include <stdlib.h> +#include <sys/prctl.h> +#include <unistd.h> + +static int install_filter(int nr, int arch, int error) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD+BPF_W+BPF_ABS, + (offsetof(struct seccomp_data, arch))), + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, arch, 0, 3), + BPF_STMT(BPF_LD+BPF_W+BPF_ABS, + (offsetof(struct seccomp_data, nr))), + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1), + BPF_STMT(BPF_RET+BPF_K, + SECCOMP_RET_ERRNO|(error & SECCOMP_RET_DATA)), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), + .filter = filter, + }; + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { + perror("prctl(NO_NEW_PRIVS)"); + return 1; + } + if (prctl(PR_SET_SECCOMP, 2, &prog)) { + perror("prctl(PR_SET_SECCOMP)"); + return 1; + } + return 0; +} + +int main(int argc, char **argv) +{ + if (argc < 5) { + fprintf(stderr, "Usage:\n" + "dropper <syscall_nr> <arch> <errno> <prog> [<args>]\n" + "Hint: AUDIT_ARCH_I386: 0x%X\n" + " AUDIT_ARCH_X86_64: 0x%X\n" + "\n", AUDIT_ARCH_I386, AUDIT_ARCH_X86_64); + return 1; + } + if (install_filter(strtol(argv[1], NULL, 0), strtol(argv[2], NULL, 0), + strtol(argv[3], NULL, 0))) + return 1; + execv(argv[4], &argv[4]); + printf("Failed to execv\n"); + return 255; +} diff --git a/samples/seccomp/user-trap.c b/samples/seccomp/user-trap.c new file mode 100644 index 000000000..20291ec64 --- /dev/null +++ b/samples/seccomp/user-trap.c @@ -0,0 +1,375 @@ +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include <string.h> +#include <stddef.h> +#include <sys/sysmacros.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include 
<sys/syscall.h> +#include <sys/user.h> +#include <sys/ioctl.h> +#include <sys/ptrace.h> +#include <sys/mount.h> +#include <linux/limits.h> +#include <linux/filter.h> +#include <linux/seccomp.h> + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) + +static int seccomp(unsigned int op, unsigned int flags, void *args) +{ + errno = 0; + return syscall(__NR_seccomp, op, flags, args); +} + +static int send_fd(int sock, int fd) +{ + struct msghdr msg = {}; + struct cmsghdr *cmsg; + char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c'; + struct iovec io = { + .iov_base = &c, + .iov_len = 1, + }; + + msg.msg_iov = &io; + msg.msg_iovlen = 1; + msg.msg_control = buf; + msg.msg_controllen = sizeof(buf); + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + *((int *)CMSG_DATA(cmsg)) = fd; + msg.msg_controllen = cmsg->cmsg_len; + + if (sendmsg(sock, &msg, 0) < 0) { + perror("sendmsg"); + return -1; + } + + return 0; +} + +static int recv_fd(int sock) +{ + struct msghdr msg = {}; + struct cmsghdr *cmsg; + char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c'; + struct iovec io = { + .iov_base = &c, + .iov_len = 1, + }; + + msg.msg_iov = &io; + msg.msg_iovlen = 1; + msg.msg_control = buf; + msg.msg_controllen = sizeof(buf); + + if (recvmsg(sock, &msg, 0) < 0) { + perror("recvmsg"); + return -1; + } + + cmsg = CMSG_FIRSTHDR(&msg); + + return *((int *)CMSG_DATA(cmsg)); +} + +static int user_trap_syscall(int nr, unsigned int flags) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD+BPF_W+BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_USER_NOTIF), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), + }; + + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + + return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog); +} + +static int handle_req(struct seccomp_notif *req, + struct 
seccomp_notif_resp *resp, int listener) +{ + char path[PATH_MAX], source[PATH_MAX], target[PATH_MAX]; + int ret = -1, mem; + + resp->id = req->id; + resp->error = -EPERM; + resp->val = 0; + + if (req->data.nr != __NR_mount) { + fprintf(stderr, "huh? trapped something besides mount? %d\n", req->data.nr); + return -1; + } + + /* Only allow bind mounts. */ + if (!(req->data.args[3] & MS_BIND)) + return 0; + + /* + * Ok, let's read the task's memory to see where they wanted their + * mount to go. + */ + snprintf(path, sizeof(path), "/proc/%d/mem", req->pid); + mem = open(path, O_RDONLY); + if (mem < 0) { + perror("open mem"); + return -1; + } + + /* + * Now we avoid a TOCTOU: we referred to a pid by its pid, but since + * the pid that made the syscall may have died, we need to confirm that + * the pid is still valid after we open its /proc/pid/mem file. We can + * ask the listener fd this as follows. + * + * Note that this check should occur *after* any task-specific + * resources are opened, to make sure that the task has not died and + * we're not wrongly reading someone else's state in order to make + * decisions. + */ + if (ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req->id) < 0) { + fprintf(stderr, "task died before we could map its memory\n"); + goto out; + } + + /* + * Phew, we've got the right /proc/pid/mem. Now we can read it. Note + * that to avoid another TOCTOU, we should read all of the pointer args + * before we decide to allow the syscall. + */ + if (lseek(mem, req->data.args[0], SEEK_SET) < 0) { + perror("seek"); + goto out; + } + + ret = read(mem, source, sizeof(source)); + if (ret < 0) { + perror("read"); + goto out; + } + + if (lseek(mem, req->data.args[1], SEEK_SET) < 0) { + perror("seek"); + goto out; + } + + ret = read(mem, target, sizeof(target)); + if (ret < 0) { + perror("read"); + goto out; + } + + /* + * Our policy is to only allow bind mounts inside /tmp. 
This isn't very + * interesting, because we could do unprivileged bind mounts with user + * namespaces already, but you get the idea. + */ + if (!strncmp(source, "/tmp/", 5) && !strncmp(target, "/tmp/", 5)) { + if (mount(source, target, NULL, req->data.args[3], NULL) < 0) { + ret = -1; + perror("actual mount"); + goto out; + } + resp->error = 0; + } + + /* Even if we didn't allow it because of policy, generating the + * response was a success, because we want to tell the worker EPERM. + */ + ret = 0; + +out: + close(mem); + return ret; +} + +int main(void) +{ + int sk_pair[2], ret = 1, status, listener; + pid_t worker = 0 , tracer = 0; + + if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair) < 0) { + perror("socketpair"); + return 1; + } + + worker = fork(); + if (worker < 0) { + perror("fork"); + goto close_pair; + } + + if (worker == 0) { + listener = user_trap_syscall(__NR_mount, + SECCOMP_FILTER_FLAG_NEW_LISTENER); + if (listener < 0) { + perror("seccomp"); + exit(1); + } + + /* + * Drop privileges. We definitely can't mount as uid 1000. + */ + if (setuid(1000) < 0) { + perror("setuid"); + exit(1); + } + + /* + * Send the listener to the parent; also serves as + * synchronization. + */ + if (send_fd(sk_pair[1], listener) < 0) + exit(1); + close(listener); + + if (mkdir("/tmp/foo", 0755) < 0) { + perror("mkdir"); + exit(1); + } + + /* + * Try a bad mount just for grins. + */ + if (mount("/dev/sda", "/tmp/foo", NULL, 0, NULL) != -1) { + fprintf(stderr, "huh? mounted /dev/sda?\n"); + exit(1); + } + + if (errno != EPERM) { + perror("bad error from mount"); + exit(1); + } + + /* + * Ok, we expect this one to succeed. + */ + if (mount("/tmp/foo", "/tmp/foo", NULL, MS_BIND, NULL) < 0) { + perror("mount"); + exit(1); + } + + exit(0); + } + + /* + * Get the listener from the child. + */ + listener = recv_fd(sk_pair[0]); + if (listener < 0) + goto out_kill; + + /* + * Fork a task to handle the requests. 
This isn't strictly necessary, + * but it makes the particular writing of this sample easier, since we + * can just wait for the tracee to exit and kill the tracer. + */ + tracer = fork(); + if (tracer < 0) { + perror("fork"); + goto out_kill; + } + + if (tracer == 0) { + struct seccomp_notif *req; + struct seccomp_notif_resp *resp; + struct seccomp_notif_sizes sizes; + + if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) < 0) { + perror("seccomp(GET_NOTIF_SIZES)"); + goto out_close; + } + + req = malloc(sizes.seccomp_notif); + if (!req) + goto out_close; + + resp = malloc(sizes.seccomp_notif_resp); + if (!resp) + goto out_req; + memset(resp, 0, sizes.seccomp_notif_resp); + + while (1) { + memset(req, 0, sizes.seccomp_notif); + if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) { + perror("ioctl recv"); + goto out_resp; + } + + if (handle_req(req, resp, listener) < 0) + goto out_resp; + + /* + * ENOENT here means that the task may have gotten a + * signal and restarted the syscall. It's up to the + * handler to decide what to do in this case, but for + * the sample code, we just ignore it. Probably + * something better should happen, like undoing the + * mount, or keeping track of the args to make sure we + * don't do it again. 
+ */ + if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, resp) < 0 && + errno != ENOENT) { + perror("ioctl send"); + goto out_resp; + } + } +out_resp: + free(resp); +out_req: + free(req); +out_close: + close(listener); + exit(1); + } + + close(listener); + + if (waitpid(worker, &status, 0) != worker) { + perror("waitpid"); + goto out_kill; + } + + if (umount2("/tmp/foo", MNT_DETACH) < 0 && errno != EINVAL) { + perror("umount2"); + goto out_kill; + } + + if (remove("/tmp/foo") < 0 && errno != ENOENT) { + perror("remove"); + exit(1); + } + + if (!WIFEXITED(status) || WEXITSTATUS(status)) { + fprintf(stderr, "worker exited nonzero\n"); + goto out_kill; + } + + ret = 0; + +out_kill: + if (tracer > 0) + kill(tracer, SIGKILL); + if (worker > 0) + kill(worker, SIGKILL); + +close_pair: + close(sk_pair[0]); + close(sk_pair[1]); + return ret; +} diff --git a/samples/timers/.gitignore b/samples/timers/.gitignore new file mode 100644 index 000000000..40510c33c --- /dev/null +++ b/samples/timers/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +hpet_example diff --git a/samples/timers/Makefile b/samples/timers/Makefile new file mode 100644 index 000000000..e6836cdea --- /dev/null +++ b/samples/timers/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 +userprogs-always-y += hpet_example + +userccflags += -I usr/include diff --git a/samples/timers/hpet_example.c b/samples/timers/hpet_example.c new file mode 100644 index 000000000..f1cb622f6 --- /dev/null +++ b/samples/timers/hpet_example.c @@ -0,0 +1,295 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <string.h> +#include <memory.h> +#include <malloc.h> +#include <time.h> +#include <ctype.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <signal.h> +#include <errno.h> +#include <sys/time.h> +#include <linux/hpet.h> + + +extern void hpet_open_close(int, const char **); +extern void hpet_info(int, const char **); 
+extern void hpet_poll(int, const char **); +extern void hpet_fasync(int, const char **); +extern void hpet_read(int, const char **); + +#include <sys/poll.h> +#include <sys/ioctl.h> + +struct hpet_command { + char *command; + void (*func)(int argc, const char ** argv); +} hpet_command[] = { + { + "open-close", + hpet_open_close + }, + { + "info", + hpet_info + }, + { + "poll", + hpet_poll + }, + { + "fasync", + hpet_fasync + }, +}; + +int +main(int argc, const char ** argv) +{ + unsigned int i; + + argc--; + argv++; + + if (!argc) { + fprintf(stderr, "-hpet: requires command\n"); + return -1; + } + + + for (i = 0; i < (sizeof (hpet_command) / sizeof (hpet_command[0])); i++) + if (!strcmp(argv[0], hpet_command[i].command)) { + argc--; + argv++; + fprintf(stderr, "-hpet: executing %s\n", + hpet_command[i].command); + hpet_command[i].func(argc, argv); + return 0; + } + + fprintf(stderr, "do_hpet: command %s not implemented\n", argv[0]); + + return -1; +} + +void +hpet_open_close(int argc, const char **argv) +{ + int fd; + + if (argc != 1) { + fprintf(stderr, "hpet_open_close: device-name\n"); + return; + } + + fd = open(argv[0], O_RDONLY); + if (fd < 0) + fprintf(stderr, "hpet_open_close: open failed\n"); + else + close(fd); + + return; +} + +void +hpet_info(int argc, const char **argv) +{ + struct hpet_info info; + int fd; + + if (argc != 1) { + fprintf(stderr, "hpet_info: device-name\n"); + return; + } + + fd = open(argv[0], O_RDONLY); + if (fd < 0) { + fprintf(stderr, "hpet_info: open of %s failed\n", argv[0]); + return; + } + + if (ioctl(fd, HPET_INFO, &info) < 0) { + fprintf(stderr, "hpet_info: failed to get info\n"); + goto out; + } + + fprintf(stderr, "hpet_info: hi_irqfreq 0x%lx hi_flags 0x%lx ", + info.hi_ireqfreq, info.hi_flags); + fprintf(stderr, "hi_hpet %d hi_timer %d\n", + info.hi_hpet, info.hi_timer); + +out: + close(fd); + return; +} + +void +hpet_poll(int argc, const char **argv) +{ + unsigned long freq; + int iterations, i, fd; + struct pollfd pfd; 
+ struct hpet_info info; + struct timeval stv, etv; + struct timezone tz; + long usec; + + if (argc != 3) { + fprintf(stderr, "hpet_poll: device-name freq iterations\n"); + return; + } + + freq = atoi(argv[1]); + iterations = atoi(argv[2]); + + fd = open(argv[0], O_RDONLY); + + if (fd < 0) { + fprintf(stderr, "hpet_poll: open of %s failed\n", argv[0]); + return; + } + + if (ioctl(fd, HPET_IRQFREQ, freq) < 0) { + fprintf(stderr, "hpet_poll: HPET_IRQFREQ failed\n"); + goto out; + } + + if (ioctl(fd, HPET_INFO, &info) < 0) { + fprintf(stderr, "hpet_poll: failed to get info\n"); + goto out; + } + + fprintf(stderr, "hpet_poll: info.hi_flags 0x%lx\n", info.hi_flags); + + if (info.hi_flags && (ioctl(fd, HPET_EPI, 0) < 0)) { + fprintf(stderr, "hpet_poll: HPET_EPI failed\n"); + goto out; + } + + if (ioctl(fd, HPET_IE_ON, 0) < 0) { + fprintf(stderr, "hpet_poll, HPET_IE_ON failed\n"); + goto out; + } + + pfd.fd = fd; + pfd.events = POLLIN; + + for (i = 0; i < iterations; i++) { + pfd.revents = 0; + gettimeofday(&stv, &tz); + if (poll(&pfd, 1, -1) < 0) + fprintf(stderr, "hpet_poll: poll failed\n"); + else { + long data; + + gettimeofday(&etv, &tz); + usec = stv.tv_sec * 1000000 + stv.tv_usec; + usec = (etv.tv_sec * 1000000 + etv.tv_usec) - usec; + + fprintf(stderr, + "hpet_poll: expired time = 0x%lx\n", usec); + + fprintf(stderr, "hpet_poll: revents = 0x%x\n", + pfd.revents); + + if (read(fd, &data, sizeof(data)) != sizeof(data)) { + fprintf(stderr, "hpet_poll: read failed\n"); + } + else + fprintf(stderr, "hpet_poll: data 0x%lx\n", + data); + } + } + +out: + close(fd); + return; +} + +static int hpet_sigio_count; + +static void +hpet_sigio(int val) +{ + fprintf(stderr, "hpet_sigio: called\n"); + hpet_sigio_count++; +} + +void +hpet_fasync(int argc, const char **argv) +{ + unsigned long freq; + int iterations, i, fd, value; + sig_t oldsig; + struct hpet_info info; + + hpet_sigio_count = 0; + fd = -1; + + if ((oldsig = signal(SIGIO, hpet_sigio)) == SIG_ERR) { + fprintf(stderr, 
"hpet_fasync: failed to set signal handler\n"); + return; + } + + if (argc != 3) { + fprintf(stderr, "hpet_fasync: device-name freq iterations\n"); + goto out; + } + + fd = open(argv[0], O_RDONLY); + + if (fd < 0) { + fprintf(stderr, "hpet_fasync: failed to open %s\n", argv[0]); + goto out; + } + + + if ((fcntl(fd, F_SETOWN, getpid()) == -1) || + ((value = fcntl(fd, F_GETFL)) == -1) || + (fcntl(fd, F_SETFL, value | O_ASYNC) == -1)) { + fprintf(stderr, "hpet_fasync: fcntl failed\n"); + goto out; + } + + freq = atoi(argv[1]); + iterations = atoi(argv[2]); + + if (ioctl(fd, HPET_IRQFREQ, freq) < 0) { + fprintf(stderr, "hpet_fasync: HPET_IRQFREQ failed\n"); + goto out; + } + + if (ioctl(fd, HPET_INFO, &info) < 0) { + fprintf(stderr, "hpet_fasync: failed to get info\n"); + goto out; + } + + fprintf(stderr, "hpet_fasync: info.hi_flags 0x%lx\n", info.hi_flags); + + if (info.hi_flags && (ioctl(fd, HPET_EPI, 0) < 0)) { + fprintf(stderr, "hpet_fasync: HPET_EPI failed\n"); + goto out; + } + + if (ioctl(fd, HPET_IE_ON, 0) < 0) { + fprintf(stderr, "hpet_fasync, HPET_IE_ON failed\n"); + goto out; + } + + for (i = 0; i < iterations; i++) { + (void) pause(); + fprintf(stderr, "hpet_fasync: count = %d\n", hpet_sigio_count); + } + +out: + signal(SIGIO, oldsig); + + if (fd >= 0) + close(fd); + + return; +} diff --git a/samples/trace_events/Makefile b/samples/trace_events/Makefile new file mode 100644 index 000000000..b78344e7b --- /dev/null +++ b/samples/trace_events/Makefile @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only +# builds the trace events example kernel modules; +# then to use one (as root): insmod <module_name.ko> + +# If you include a trace header outside of include/trace/events +# then the file that does the #define CREATE_TRACE_POINTS must +# have that tracer file in its main search path. This is because +# define_trace.h will include it, and must be able to find it from +# the include/trace directory. +# +# Here trace-events-sample.c does the CREATE_TRACE_POINTS. 
+# +CFLAGS_trace-events-sample.o := -I$(src) + +obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace-events-sample.o diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c new file mode 100644 index 000000000..1a72b7d95 --- /dev/null +++ b/samples/trace_events/trace-events-sample.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/module.h> +#include <linux/kthread.h> + +/* + * Any file that uses trace points, must include the header. + * But only one file, must include the header by defining + * CREATE_TRACE_POINTS first. This will make the C code that + * creates the handles for the trace points. + */ +#define CREATE_TRACE_POINTS +#include "trace-events-sample.h" + +static const char *random_strings[] = { + "Mother Goose", + "Snoopy", + "Gandalf", + "Frodo", + "One ring to rule them all" +}; + +static void simple_thread_func(int cnt) +{ + int array[6]; + int len = cnt % 5; + int i; + + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + + for (i = 0; i < len; i++) + array[i] = i + 1; + array[i] = 0; + + /* Silly tracepoints */ + trace_foo_bar("hello", cnt, array, random_strings[len], + current->cpus_ptr); + + trace_foo_with_template_simple("HELLO", cnt); + + trace_foo_bar_with_cond("Some times print", cnt); + + trace_foo_with_template_cond("prints other times", cnt); + + trace_foo_with_template_print("I have to be different", cnt); +} + +static int simple_thread(void *arg) +{ + int cnt = 0; + + while (!kthread_should_stop()) + simple_thread_func(cnt++); + + return 0; +} + +static struct task_struct *simple_tsk; +static struct task_struct *simple_tsk_fn; + +static void simple_thread_func_fn(int cnt) +{ + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + + /* More silly tracepoints */ + trace_foo_bar_with_fn("Look at me", cnt); + trace_foo_with_template_fn("Look at me too", cnt); +} + +static int simple_thread_fn(void *arg) +{ + int cnt = 0; + + while (!kthread_should_stop()) 
+ simple_thread_func_fn(cnt++); + + return 0; +} + +static DEFINE_MUTEX(thread_mutex); +static int simple_thread_cnt; + +int foo_bar_reg(void) +{ + mutex_lock(&thread_mutex); + if (simple_thread_cnt++) + goto out; + + pr_info("Starting thread for foo_bar_fn\n"); + /* + * We shouldn't be able to start a trace when the module is + * unloading (there's other locks to prevent that). But + * for consistency sake, we still take the thread_mutex. + */ + simple_tsk_fn = kthread_run(simple_thread_fn, NULL, "event-sample-fn"); + out: + mutex_unlock(&thread_mutex); + return 0; +} + +void foo_bar_unreg(void) +{ + mutex_lock(&thread_mutex); + if (--simple_thread_cnt) + goto out; + + pr_info("Killing thread for foo_bar_fn\n"); + if (!IS_ERR_OR_NULL(simple_tsk_fn)) + kthread_stop(simple_tsk_fn); + simple_tsk_fn = NULL; + out: + mutex_unlock(&thread_mutex); +} + +static int __init trace_event_init(void) +{ + simple_tsk = kthread_run(simple_thread, NULL, "event-sample"); + if (IS_ERR(simple_tsk)) + return PTR_ERR(simple_tsk); + + return 0; +} + +static void __exit trace_event_exit(void) +{ + kthread_stop(simple_tsk); + mutex_lock(&thread_mutex); + if (!IS_ERR_OR_NULL(simple_tsk_fn)) + kthread_stop(simple_tsk_fn); + simple_tsk_fn = NULL; + mutex_unlock(&thread_mutex); +} + +module_init(trace_event_init); +module_exit(trace_event_exit); + +MODULE_AUTHOR("Steven Rostedt"); +MODULE_DESCRIPTION("trace-events-sample"); +MODULE_LICENSE("GPL"); diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h new file mode 100644 index 000000000..13a35f7cb --- /dev/null +++ b/samples/trace_events/trace-events-sample.h @@ -0,0 +1,524 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * If TRACE_SYSTEM is defined, that will be the directory created + * in the ftrace directory under /sys/kernel/tracing/events/<system> + * + * The define_trace.h below will also look for a file name of + * TRACE_SYSTEM.h where TRACE_SYSTEM is what is defined here. 
+ * In this case, it would look for sample-trace.h + * + * If the header name will be different than the system name + * (as in this case), then you can override the header name that + * define_trace.h will look up by defining TRACE_INCLUDE_FILE + * + * This file is called trace-events-sample.h but we want the system + * to be called "sample-trace". Therefore we must define the name of this + * file: + * + * #define TRACE_INCLUDE_FILE trace-events-sample + * + * As we do at the bottom of this file. + * + * Notice that TRACE_SYSTEM should be defined outside of #if + * protection, just like TRACE_INCLUDE_FILE. + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM sample-trace + +/* + * TRACE_SYSTEM is expected to be a valid C variable (alpha-numeric + * and underscore), although it may start with numbers. If for some + * reason it is not, you need to add the following lines: + */ +#undef TRACE_SYSTEM_VAR +#define TRACE_SYSTEM_VAR sample_trace +/* + * But the above is only needed if TRACE_SYSTEM is not alpha-numeric + * and underscored. By default, TRACE_SYSTEM_VAR will be equal to + * TRACE_SYSTEM. As TRACE_SYSTEM_VAR must be alpha-numeric, if + * TRACE_SYSTEM is not, then TRACE_SYSTEM_VAR must be defined with + * only alpha-numeric and underscores. + * + * The TRACE_SYSTEM_VAR is only used internally and not visible to + * user space. + */ + +/* + * Notice that this file is not protected like a normal header. + * We also must allow for rereading of this file. The + * + * || defined(TRACE_HEADER_MULTI_READ) + * + * serves this purpose. + */ +#if !defined(_TRACE_EVENT_SAMPLE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_EVENT_SAMPLE_H + +/* + * All trace headers should include tracepoint.h, until we finally + * make it into a standard header. + */ +#include <linux/tracepoint.h> + +/* + * The TRACE_EVENT macro is broken up into 6 parts. + * + * name: name of the trace point. This is also how to enable the tracepoint. 
+ * A function called trace_foo_bar() will be created. + * + * proto: the prototype of the function trace_foo_bar() + * Here it is trace_foo_bar(char *foo, int bar). + * + * args: must match the arguments in the prototype. + * Here it is simply "foo, bar". + * + * struct: This defines the way the data will be stored in the ring buffer. + * The items declared here become part of a special structure + * called "__entry", which can be used in the fast_assign part of the + * TRACE_EVENT macro. + * + * Here are the currently defined types you can use: + * + * __field : Is broken up into type and name. Where type can be any + * primitive type (integer, long or pointer). + * + * __field(int, foo) + * + * __entry->foo = 5; + * + * __field_struct : This can be any static complex data type (struct, union + * but not an array). Be careful using complex types, as each + * event is limited in size, and copying large amounts of data + * into the ring buffer can slow things down. + * + * __field_struct(struct bar, foo) + * + * __entry->foo.x = y; + * + * __array: There are three fields (type, name, size). The type is the + * type of elements in the array, the name is the name of the array. + * size is the number of items in the array (not the total size). + * + * __array( char, foo, 10) is the same as saying: char foo[10]; + * + * Assigning arrays can be done like any array: + * + * __entry->foo[0] = 'a'; + * + * memcpy(__entry->foo, bar, 10); + * + * __dynamic_array: This is similar to array, but can vary its size from + * instance to instance of the tracepoint being called. + * Like __array, this too has three elements (type, name, size); + * type is the type of the element, name is the name of the array. + * The size is different than __array. It is not a static number, + * but the algorithm to figure out the length of the array for the + * specific instance of tracepoint. Again, size is the number of + * items in the array, not the total length in bytes. 
+ * + * __dynamic_array( int, foo, bar) is similar to: int foo[bar]; + * + * Note, unlike arrays, you must use the __get_dynamic_array() macro + * to access the array. + * + * memcpy(__get_dynamic_array(foo), bar, 10); + * + * Notice, that "__entry" is not needed here. + * + * __string: This is a special kind of __dynamic_array. It expects to + * have a null terminated character array passed to it (it allows + * for NULL too, which would be converted into "(null)"). __string + * takes two parameters (name, src), where name is the name of + * the string saved, and src is the string to copy into the + * ring buffer. + * + * __string(foo, bar) is similar to: strcpy(foo, bar) + * + * To assign a string, use the helper macro __assign_str(). + * + * __assign_str(foo, bar); + * + * In most cases, the __assign_str() macro will take the same + * parameters as the __string() macro had to declare the string. + * + * __bitmask: This is another kind of __dynamic_array, but it expects + * an array of longs, and the number of bits to parse. It takes + * two parameters (name, nr_bits), where name is the name of the + * bitmask to save, and the nr_bits is the number of bits to record. + * + * __bitmask(target_cpus, nr_cpumask_bits) + * + * To assign a bitmask, use the __assign_bitmask() helper macro. + * + * __assign_bitmask(target_cpus, cpumask_bits(bar), nr_cpumask_bits); + * + * + * fast_assign: This is a C like function that is used to store the items + * into the ring buffer. A special variable called "__entry" will be the + * structure that points into the ring buffer and has the same fields as + * described by the struct part of TRACE_EVENT above. + * + * printk: This is a way to print out the data in pretty print. This is + * useful if the system crashes and you are logging via a serial line, + * the data can be printed to the console using this "printk" method. + * This is also used to print out the data from the trace files. 
+ * Again, the __entry macro is used to access the data from the ring buffer. + * + * Note, __dynamic_array, __string, and __bitmask require special helpers + * to access the data. + * + * For __dynamic_array(int, foo, bar) use __get_dynamic_array(foo) + * Use __get_dynamic_array_len(foo) to get the length of the array + * saved. Note, __get_dynamic_array_len() returns the total allocated + * length of the dynamic array; __print_array() expects the second + * parameter to be the number of elements. To get that, the array length + * needs to be divided by the element size. + * + * For __string(foo, bar) use __get_str(foo) + * + * For __bitmask(target_cpus, nr_cpumask_bits) use __get_bitmask(target_cpus) + * + * + * Note, that for both the assign and the printk, __entry is the handler + * to the data structure in the ring buffer, and is defined by the + * TP_STRUCT__entry. + */ + +/* + * It is OK to have helper functions in the file, but they need to be protected + * from being defined more than once. Remember, this file gets included more + * than once. + */ +#ifndef __TRACE_EVENT_SAMPLE_HELPER_FUNCTIONS +#define __TRACE_EVENT_SAMPLE_HELPER_FUNCTIONS +static inline int __length_of(const int *list) +{ + int i; + + if (!list) + return 0; + + for (i = 0; list[i]; i++) + ; + return i; +} + +enum { + TRACE_SAMPLE_FOO = 2, + TRACE_SAMPLE_BAR = 4, + TRACE_SAMPLE_ZOO = 8, +}; +#endif + +/* + * If enums are used in the TP_printk(), their names will be shown in + * format files and not their values. This can cause problems with user + * space programs that parse the format files to know how to translate + * the raw binary trace output into human readable text. + * + * To help out user space programs, any enum that is used in the TP_printk() + * should be defined by TRACE_DEFINE_ENUM() macro. All that is needed to + * be done is to add this macro with the enum within it in the trace + * header file, and it will be converted in the output. 
+ */ + +TRACE_DEFINE_ENUM(TRACE_SAMPLE_FOO); +TRACE_DEFINE_ENUM(TRACE_SAMPLE_BAR); +TRACE_DEFINE_ENUM(TRACE_SAMPLE_ZOO); + +TRACE_EVENT(foo_bar, + + TP_PROTO(const char *foo, int bar, const int *lst, + const char *string, const struct cpumask *mask), + + TP_ARGS(foo, bar, lst, string, mask), + + TP_STRUCT__entry( + __array( char, foo, 10 ) + __field( int, bar ) + __dynamic_array(int, list, __length_of(lst)) + __string( str, string ) + __bitmask( cpus, num_possible_cpus() ) + ), + + TP_fast_assign( + strlcpy(__entry->foo, foo, 10); + __entry->bar = bar; + memcpy(__get_dynamic_array(list), lst, + __length_of(lst) * sizeof(int)); + __assign_str(str, string); + __assign_bitmask(cpus, cpumask_bits(mask), num_possible_cpus()); + ), + + TP_printk("foo %s %d %s %s %s %s (%s)", __entry->foo, __entry->bar, + +/* + * Notice here the use of some helper functions. This includes: + * + * __print_symbolic( variable, { value, "string" }, ... ), + * + * The variable is tested against each value of the { } pair. If + * the variable matches one of the values, then it will print the + * string in that pair. If none are matched, it returns a string + * version of the number (if __entry->bar == 7 then "7" is returned). + */ + __print_symbolic(__entry->bar, + { 0, "zero" }, + { TRACE_SAMPLE_FOO, "TWO" }, + { TRACE_SAMPLE_BAR, "FOUR" }, + { TRACE_SAMPLE_ZOO, "EIGHT" }, + { 10, "TEN" } + ), + +/* + * __print_flags( variable, "delim", { value, "flag" }, ... ), + * + * This is similar to __print_symbolic, except that it tests the bits + * of the value. If ((FLAG & variable) == FLAG) then the string is + * printed. If more than one flag matches, then each one that does is + * also printed with delim in between them. 
+ * If not all bits are accounted for, then the not found bits will be + * added in hex format: 0x506 will show BIT2|BIT3|0x500 + */ + __print_flags(__entry->bar, "|", + { 1, "BIT1" }, + { 2, "BIT2" }, + { 4, "BIT3" }, + { 8, "BIT4" } + ), +/* + * __print_array( array, len, element_size ) + * + * This prints out the array that is defined by __array in a nice format. + */ + __print_array(__get_dynamic_array(list), + __get_dynamic_array_len(list) / sizeof(int), + sizeof(int)), + __get_str(str), __get_bitmask(cpus)) +); + +/* + * There may be a case where a tracepoint should only be called if + * some condition is set. Otherwise the tracepoint should not be called. + * But to do something like: + * + * if (cond) + * trace_foo(); + * + * Would cause a little overhead when tracing is not enabled, and that + * overhead, even if small, is not something we want. As tracepoints + * use static branch (aka jump_labels), where no branch is taken to + * skip the tracepoint when not enabled, and a jmp is placed to jump + * to the tracepoint code when it is enabled, having an if statement + * nullifies that optimization. It would be nice to place that + * condition within the static branch. This is where TRACE_EVENT_CONDITION + * comes in. + * + * TRACE_EVENT_CONDITION() is just like TRACE_EVENT, except it adds another + * parameter just after args. Where TRACE_EVENT has: + * + * TRACE_EVENT(name, proto, args, struct, assign, printk) + * + * the CONDITION version has: + * + * TRACE_EVENT_CONDITION(name, proto, args, cond, struct, assign, printk) + * + * Everything is the same as TRACE_EVENT except for the new cond. Think + * of the cond variable as: + * + * if (cond) + * trace_foo_bar_with_cond(); + * + * Except that the logic for the if branch is placed after the static branch. + * That is, the if statement that processes the condition will not be + * executed unless that tracepoint is enabled. Otherwise it still remains + * a nop. 
+ */ +TRACE_EVENT_CONDITION(foo_bar_with_cond, + + TP_PROTO(const char *foo, int bar), + + TP_ARGS(foo, bar), + + TP_CONDITION(!(bar % 10)), + + TP_STRUCT__entry( + __string( foo, foo ) + __field( int, bar ) + ), + + TP_fast_assign( + __assign_str(foo, foo); + __entry->bar = bar; + ), + + TP_printk("foo %s %d", __get_str(foo), __entry->bar) +); + +int foo_bar_reg(void); +void foo_bar_unreg(void); + +/* + * Now in the case that some function needs to be called when the + * tracepoint is enabled and/or when it is disabled, the + * TRACE_EVENT_FN() serves this purpose. This is just like TRACE_EVENT() + * but adds two more parameters at the end: + * + * TRACE_EVENT_FN( name, proto, args, struct, assign, printk, reg, unreg) + * + * reg and unreg are functions with the prototype of: + * + * void reg(void) + * + * The reg function gets called before the tracepoint is enabled, and + * the unreg function gets called after the tracepoint is disabled. + * + * Note, reg and unreg are allowed to be NULL. If you only need to + * call a function before enabling, or after disabling, just set one + * function and pass in NULL for the other parameter. + */ +TRACE_EVENT_FN(foo_bar_with_fn, + + TP_PROTO(const char *foo, int bar), + + TP_ARGS(foo, bar), + + TP_STRUCT__entry( + __string( foo, foo ) + __field( int, bar ) + ), + + TP_fast_assign( + __assign_str(foo, foo); + __entry->bar = bar; + ), + + TP_printk("foo %s %d", __get_str(foo), __entry->bar), + + foo_bar_reg, foo_bar_unreg +); + +/* + * Each TRACE_EVENT macro creates several helper functions to produce + * the code to add the tracepoint, create the files in the trace + * directory, hook it to perf, assign the values and to print out + * the raw data from the ring buffer. 
To prevent too much bloat, + * if there is more than one tracepoint that uses the same format + * for the proto, args, struct, assign and printk, and only the name + * is different, it is highly recommended to use the DECLARE_EVENT_CLASS + * + * DECLARE_EVENT_CLASS() macro creates most of the functions for the + * tracepoint. Then DEFINE_EVENT() is used to hook a tracepoint to those + * functions. This DEFINE_EVENT() is an instance of the class and can + * be enabled and disabled separately from other events (either TRACE_EVENT + * or other DEFINE_EVENT()s). + * + * Note, TRACE_EVENT() itself is simply defined as: + * + * #define TRACE_EVENT(name, proto, args, tstruct, assign, printk) \ + * DECLARE_EVENT_CLASS(name, proto, args, tstruct, assign, printk); \ + * DEFINE_EVENT(name, name, proto, args) + * + * The DEFINE_EVENT() also can be declared with conditions and reg functions: + * + * DEFINE_EVENT_CONDITION(template, name, proto, args, cond); + * DEFINE_EVENT_FN(template, name, proto, args, reg, unreg); + */ +DECLARE_EVENT_CLASS(foo_template, + + TP_PROTO(const char *foo, int bar), + + TP_ARGS(foo, bar), + + TP_STRUCT__entry( + __string( foo, foo ) + __field( int, bar ) + ), + + TP_fast_assign( + __assign_str(foo, foo); + __entry->bar = bar; + ), + + TP_printk("foo %s %d", __get_str(foo), __entry->bar) +); + +/* + * Here's a better way for the previous samples (except, the first + * example had more fields and could not be used here). 
+ */ +DEFINE_EVENT(foo_template, foo_with_template_simple, + TP_PROTO(const char *foo, int bar), + TP_ARGS(foo, bar)); + +DEFINE_EVENT_CONDITION(foo_template, foo_with_template_cond, + TP_PROTO(const char *foo, int bar), + TP_ARGS(foo, bar), + TP_CONDITION(!(bar % 8))); + + +DEFINE_EVENT_FN(foo_template, foo_with_template_fn, + TP_PROTO(const char *foo, int bar), + TP_ARGS(foo, bar), + foo_bar_reg, foo_bar_unreg); + +/* + * Anytime two events share basically the same values and have + * the same output, use the DECLARE_EVENT_CLASS() and DEFINE_EVENT() + * when ever possible. + */ + +/* + * If the event is similar to the DECLARE_EVENT_CLASS, but you need + * to have a different output, then use DEFINE_EVENT_PRINT() which + * lets you override the TP_printk() of the class. + */ + +DEFINE_EVENT_PRINT(foo_template, foo_with_template_print, + TP_PROTO(const char *foo, int bar), + TP_ARGS(foo, bar), + TP_printk("bar %s %d", __get_str(foo), __entry->bar)); + +#endif + +/***** NOTICE! The #if protection ends here. *****/ + + +/* + * There are several ways I could have done this. If I left out the + * TRACE_INCLUDE_PATH, then it would default to the kernel source + * include/trace/events directory. + * + * I could specify a path from the define_trace.h file back to this + * file. + * + * #define TRACE_INCLUDE_PATH ../../samples/trace_events + * + * But the safest and easiest way to simply make it use the directory + * that the file is in is to add in the Makefile: + * + * CFLAGS_trace-events-sample.o := -I$(src) + * + * This will make sure the current path is part of the include + * structure for our file so that define_trace.h can find it. 
+ * + * I could have made only the top level directory the include: + * + * CFLAGS_trace-events-sample.o := -I$(PWD) + * + * And then let the path to this directory be the TRACE_INCLUDE_PATH: + * + * #define TRACE_INCLUDE_PATH samples/trace_events + * + * But then if something defines "samples" or "trace_events" as a macro + * then we could risk that being converted too, and give us an unexpected + * result. + */ +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +/* + * TRACE_INCLUDE_FILE is not needed if the filename and TRACE_SYSTEM are equal + */ +#define TRACE_INCLUDE_FILE trace-events-sample +#include <trace/define_trace.h> diff --git a/samples/trace_printk/Makefile b/samples/trace_printk/Makefile new file mode 100644 index 000000000..c0df36167 --- /dev/null +++ b/samples/trace_printk/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +# builds a module that calls various trace_printk routines +# then to use one (as root): insmod <module_name.ko> + +# This module can also be used to test the trace_printk code. 
+ +obj-$(CONFIG_SAMPLE_TRACE_PRINTK) += trace-printk.o diff --git a/samples/trace_printk/trace-printk.c b/samples/trace_printk/trace-printk.c new file mode 100644 index 000000000..cfc159580 --- /dev/null +++ b/samples/trace_printk/trace-printk.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/irq_work.h> + +/* Must not be static to force gcc to consider these non constant */ +char *trace_printk_test_global_str = + "This is a dynamic string that will use trace_puts\n"; + +char *trace_printk_test_global_str_irq = + "(irq) This is a dynamic string that will use trace_puts\n"; + +char *trace_printk_test_global_str_fmt = + "%sThis is a %s that will use trace_printk\n"; + +static struct irq_work irqwork; + +static void trace_printk_irq_work(struct irq_work *work) +{ + trace_printk("(irq) This is a static string that will use trace_bputs\n"); + trace_printk(trace_printk_test_global_str_irq); + + trace_printk("(irq) This is a %s that will use trace_bprintk()\n", + "static string"); + + trace_printk(trace_printk_test_global_str_fmt, + "(irq) ", "dynamic string"); +} + +static int __init trace_printk_init(void) +{ + init_irq_work(&irqwork, trace_printk_irq_work); + + trace_printk("This is a static string that will use trace_bputs\n"); + trace_printk(trace_printk_test_global_str); + + /* Kick off printing in irq context */ + irq_work_queue(&irqwork); + irq_work_sync(&irqwork); + + trace_printk("This is a %s that will use trace_bprintk()\n", + "static string"); + + trace_printk(trace_printk_test_global_str_fmt, "", "dynamic string"); + + return 0; +} + +static void __exit trace_printk_exit(void) +{ +} + +module_init(trace_printk_init); +module_exit(trace_printk_exit); + +MODULE_AUTHOR("Steven Rostedt"); +MODULE_DESCRIPTION("trace-printk"); +MODULE_LICENSE("GPL"); diff --git a/samples/uhid/.gitignore b/samples/uhid/.gitignore new file mode 100644 index 000000000..0e0a5a929 --- /dev/null +++ 
b/samples/uhid/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +/uhid-example diff --git a/samples/uhid/Makefile b/samples/uhid/Makefile new file mode 100644 index 000000000..0aa424ec4 --- /dev/null +++ b/samples/uhid/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +userprogs-always-y += uhid-example + +userccflags += -I usr/include diff --git a/samples/uhid/uhid-example.c b/samples/uhid/uhid-example.c new file mode 100644 index 000000000..015cb06a2 --- /dev/null +++ b/samples/uhid/uhid-example.c @@ -0,0 +1,465 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * UHID Example + * + * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com> + * + * The code may be used by anyone for any purpose, + * and can serve as a starting point for developing + * applications using uhid. + */ + +/* + * UHID Example + * This example emulates a basic 3 buttons mouse with wheel over UHID. Run this + * program as root and then use the following keys to control the mouse: + * q: Quit the application + * 1: Toggle left button (down, up, ...) + * 2: Toggle right button + * 3: Toggle middle button + * a: Move mouse left + * d: Move mouse right + * w: Move mouse up + * s: Move mouse down + * r: Move wheel up + * f: Move wheel down + * + * Additionally to 3 button mouse, 3 keyboard LEDs are also supported (LED_NUML, + * LED_CAPSL and LED_SCROLLL). The device doesn't generate any related keyboard + * events, though. You need to manually write the EV_LED/LED_XY/1 activation + * input event to the evdev device to see it being sent to this device. + * + * If uhid is not available as /dev/uhid, then you can pass a different path as + * first argument. + * If <linux/uhid.h> is not installed in /usr, then compile this with: + * gcc -o ./uhid_test -Wall -I./include ./samples/uhid/uhid-example.c + * And ignore the warning about kernel headers. However, it is recommended to + * use the installed uhid.h if available. 
+ */ + +#include <errno.h> +#include <fcntl.h> +#include <poll.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <termios.h> +#include <unistd.h> +#include <linux/uhid.h> + +/* + * HID Report Descriptor + * We emulate a basic 3 button mouse with wheel and 3 keyboard LEDs. This is + * the report-descriptor as the kernel will parse it: + * + * INPUT(1)[INPUT] + * Field(0) + * Physical(GenericDesktop.Pointer) + * Application(GenericDesktop.Mouse) + * Usage(3) + * Button.0001 + * Button.0002 + * Button.0003 + * Logical Minimum(0) + * Logical Maximum(1) + * Report Size(1) + * Report Count(3) + * Report Offset(0) + * Flags( Variable Absolute ) + * Field(1) + * Physical(GenericDesktop.Pointer) + * Application(GenericDesktop.Mouse) + * Usage(3) + * GenericDesktop.X + * GenericDesktop.Y + * GenericDesktop.Wheel + * Logical Minimum(-127) + * Logical Maximum(127) + * Report Size(8) + * Report Count(3) + * Report Offset(8) + * Flags( Variable Relative ) + * OUTPUT(2)[OUTPUT] + * Field(0) + * Application(GenericDesktop.Keyboard) + * Usage(3) + * LED.NumLock + * LED.CapsLock + * LED.ScrollLock + * Logical Minimum(0) + * Logical Maximum(1) + * Report Size(1) + * Report Count(3) + * Report Offset(0) + * Flags( Variable Absolute ) + * + * This is the mapping that we expect: + * Button.0001 ---> Key.LeftBtn + * Button.0002 ---> Key.RightBtn + * Button.0003 ---> Key.MiddleBtn + * GenericDesktop.X ---> Relative.X + * GenericDesktop.Y ---> Relative.Y + * GenericDesktop.Wheel ---> Relative.Wheel + * LED.NumLock ---> LED.NumLock + * LED.CapsLock ---> LED.CapsLock + * LED.ScrollLock ---> LED.ScrollLock + * + * This information can be verified by reading /sys/kernel/debug/hid/<dev>/rdesc + * This file should print the same information as shown above. 
+ */ + +static unsigned char rdesc[] = { + 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ + 0x09, 0x02, /* USAGE (Mouse) */ + 0xa1, 0x01, /* COLLECTION (Application) */ + 0x09, 0x01, /* USAGE (Pointer) */ + 0xa1, 0x00, /* COLLECTION (Physical) */ + 0x85, 0x01, /* REPORT_ID (1) */ + 0x05, 0x09, /* USAGE_PAGE (Button) */ + 0x19, 0x01, /* USAGE_MINIMUM (Button 1) */ + 0x29, 0x03, /* USAGE_MAXIMUM (Button 3) */ + 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ + 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */ + 0x95, 0x03, /* REPORT_COUNT (3) */ + 0x75, 0x01, /* REPORT_SIZE (1) */ + 0x81, 0x02, /* INPUT (Data,Var,Abs) */ + 0x95, 0x01, /* REPORT_COUNT (1) */ + 0x75, 0x05, /* REPORT_SIZE (5) */ + 0x81, 0x01, /* INPUT (Cnst,Var,Abs) */ + 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ + 0x09, 0x30, /* USAGE (X) */ + 0x09, 0x31, /* USAGE (Y) */ + 0x09, 0x38, /* USAGE (WHEEL) */ + 0x15, 0x81, /* LOGICAL_MINIMUM (-127) */ + 0x25, 0x7f, /* LOGICAL_MAXIMUM (127) */ + 0x75, 0x08, /* REPORT_SIZE (8) */ + 0x95, 0x03, /* REPORT_COUNT (3) */ + 0x81, 0x06, /* INPUT (Data,Var,Rel) */ + 0xc0, /* END_COLLECTION */ + 0xc0, /* END_COLLECTION */ + 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ + 0x09, 0x06, /* USAGE (Keyboard) */ + 0xa1, 0x01, /* COLLECTION (Application) */ + 0x85, 0x02, /* REPORT_ID (2) */ + 0x05, 0x08, /* USAGE_PAGE (Led) */ + 0x19, 0x01, /* USAGE_MINIMUM (1) */ + 0x29, 0x03, /* USAGE_MAXIMUM (3) */ + 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ + 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */ + 0x95, 0x03, /* REPORT_COUNT (3) */ + 0x75, 0x01, /* REPORT_SIZE (1) */ + 0x91, 0x02, /* Output (Data,Var,Abs) */ + 0x95, 0x01, /* REPORT_COUNT (1) */ + 0x75, 0x05, /* REPORT_SIZE (5) */ + 0x91, 0x01, /* Output (Cnst,Var,Abs) */ + 0xc0, /* END_COLLECTION */ +}; + +static int uhid_write(int fd, const struct uhid_event *ev) +{ + ssize_t ret; + + ret = write(fd, ev, sizeof(*ev)); + if (ret < 0) { + fprintf(stderr, "Cannot write to uhid: %m\n"); + return -errno; + } else if (ret != sizeof(*ev)) { + fprintf(stderr, 
"Wrong size written to uhid: %zd != %zu\n", + ret, sizeof(*ev)); + return -EFAULT; + } else { + return 0; + } +} + +static int create(int fd) +{ + struct uhid_event ev; + + memset(&ev, 0, sizeof(ev)); + ev.type = UHID_CREATE; + strcpy((char*)ev.u.create.name, "test-uhid-device"); + ev.u.create.rd_data = rdesc; + ev.u.create.rd_size = sizeof(rdesc); + ev.u.create.bus = BUS_USB; + ev.u.create.vendor = 0x15d9; + ev.u.create.product = 0x0a37; + ev.u.create.version = 0; + ev.u.create.country = 0; + + return uhid_write(fd, &ev); +} + +static void destroy(int fd) +{ + struct uhid_event ev; + + memset(&ev, 0, sizeof(ev)); + ev.type = UHID_DESTROY; + + uhid_write(fd, &ev); +} + +/* This parses raw output reports sent by the kernel to the device. A normal + * uhid program shouldn't do this but instead just forward the raw report. + * However, for documentation purposes, we try to detect LED events here and + * print debug messages for it. */ +static void handle_output(struct uhid_event *ev) +{ + /* LED messages are advertised via OUTPUT reports; ignore the rest */ + if (ev->u.output.rtype != UHID_OUTPUT_REPORT) + return; + /* LED reports have length 2 bytes */ + if (ev->u.output.size != 2) + return; + /* first byte is report-id which is 0x02 for LEDs in our rdesc */ + if (ev->u.output.data[0] != 0x2) + return; + + /* print flags payload */ + fprintf(stderr, "LED output report received with flags %x\n", + ev->u.output.data[1]); +} + +static int event(int fd) +{ + struct uhid_event ev; + ssize_t ret; + + memset(&ev, 0, sizeof(ev)); + ret = read(fd, &ev, sizeof(ev)); + if (ret == 0) { + fprintf(stderr, "Read HUP on uhid-cdev\n"); + return -EFAULT; + } else if (ret < 0) { + fprintf(stderr, "Cannot read uhid-cdev: %m\n"); + return -errno; + } else if (ret != sizeof(ev)) { + fprintf(stderr, "Invalid size read from uhid-dev: %zd != %zu\n", + ret, sizeof(ev)); + return -EFAULT; + } + + switch (ev.type) { + case UHID_START: + fprintf(stderr, "UHID_START from uhid-dev\n"); + break; + 
case UHID_STOP: + fprintf(stderr, "UHID_STOP from uhid-dev\n"); + break; + case UHID_OPEN: + fprintf(stderr, "UHID_OPEN from uhid-dev\n"); + break; + case UHID_CLOSE: + fprintf(stderr, "UHID_CLOSE from uhid-dev\n"); + break; + case UHID_OUTPUT: + fprintf(stderr, "UHID_OUTPUT from uhid-dev\n"); + handle_output(&ev); + break; + case UHID_OUTPUT_EV: + fprintf(stderr, "UHID_OUTPUT_EV from uhid-dev\n"); + break; + default: + fprintf(stderr, "Invalid event from uhid-dev: %u\n", ev.type); + } + + return 0; +} + +static bool btn1_down; +static bool btn2_down; +static bool btn3_down; +static signed char abs_hor; +static signed char abs_ver; +static signed char wheel; + +static int send_event(int fd) +{ + struct uhid_event ev; + + memset(&ev, 0, sizeof(ev)); + ev.type = UHID_INPUT; + ev.u.input.size = 5; + + ev.u.input.data[0] = 0x1; + if (btn1_down) + ev.u.input.data[1] |= 0x1; + if (btn2_down) + ev.u.input.data[1] |= 0x2; + if (btn3_down) + ev.u.input.data[1] |= 0x4; + + ev.u.input.data[2] = abs_hor; + ev.u.input.data[3] = abs_ver; + ev.u.input.data[4] = wheel; + + return uhid_write(fd, &ev); +} + +static int keyboard(int fd) +{ + char buf[128]; + ssize_t len, ret, i; /* len: bytes read (loop bound); ret: send_event() status */ + + len = read(STDIN_FILENO, buf, sizeof(buf)); + if (len == 0) { + fprintf(stderr, "Read HUP on stdin\n"); + return -EFAULT; + } else if (len < 0) { + fprintf(stderr, "Cannot read stdin: %m\n"); + return -errno; + } + + for (i = 0; i < len; ++i) { + switch (buf[i]) { + case '1': + btn1_down = !btn1_down; + ret = send_event(fd); + if (ret) + return ret; + break; + case '2': + btn2_down = !btn2_down; + ret = send_event(fd); + if (ret) + return ret; + break; + case '3': + btn3_down = !btn3_down; + ret = send_event(fd); + if (ret) + return ret; + break; + case 'a': + abs_hor = -20; + ret = send_event(fd); + abs_hor = 0; + if (ret) + return ret; + break; + case 'd': + abs_hor = 20; + ret = send_event(fd); + abs_hor = 0; + if (ret) + return ret; + break; + case 'w': + abs_ver = -20; + ret = send_event(fd); + abs_ver = 
0; + if (ret) + return ret; + break; + case 's': + abs_ver = 20; + ret = send_event(fd); + abs_ver = 0; + if (ret) + return ret; + break; + case 'r': + wheel = 1; + ret = send_event(fd); + wheel = 0; + if (ret) + return ret; + break; + case 'f': + wheel = -1; + ret = send_event(fd); + wheel = 0; + if (ret) + return ret; + break; + case 'q': + return -ECANCELED; + default: + fprintf(stderr, "Invalid input: %c\n", buf[i]); + } + } + + return 0; +} + +int main(int argc, char **argv) +{ + int fd; + const char *path = "/dev/uhid"; + struct pollfd pfds[2]; + int ret; + struct termios state; + + ret = tcgetattr(STDIN_FILENO, &state); + if (ret) { + fprintf(stderr, "Cannot get tty state\n"); + } else { + state.c_lflag &= ~ICANON; + state.c_cc[VMIN] = 1; + ret = tcsetattr(STDIN_FILENO, TCSANOW, &state); + if (ret) + fprintf(stderr, "Cannot set tty state\n"); + } + + if (argc >= 2) { + if (!strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) { + fprintf(stderr, "Usage: %s [%s]\n", argv[0], path); + return EXIT_SUCCESS; + } else { + path = argv[1]; + } + } + + fprintf(stderr, "Open uhid-cdev %s\n", path); + fd = open(path, O_RDWR | O_CLOEXEC); + if (fd < 0) { + fprintf(stderr, "Cannot open uhid-cdev %s: %m\n", path); + return EXIT_FAILURE; + } + + fprintf(stderr, "Create uhid device\n"); + ret = create(fd); + if (ret) { + close(fd); + return EXIT_FAILURE; + } + + pfds[0].fd = STDIN_FILENO; + pfds[0].events = POLLIN; + pfds[1].fd = fd; + pfds[1].events = POLLIN; + + fprintf(stderr, "Press 'q' to quit...\n"); + while (1) { + ret = poll(pfds, 2, -1); + if (ret < 0) { + fprintf(stderr, "Cannot poll for fds: %m\n"); + break; + } + if (pfds[0].revents & POLLHUP) { + fprintf(stderr, "Received HUP on stdin\n"); + break; + } + if (pfds[1].revents & POLLHUP) { + fprintf(stderr, "Received HUP on uhid-cdev\n"); + break; + } + + if (pfds[0].revents & POLLIN) { + ret = keyboard(fd); + if (ret) + break; + } + if (pfds[1].revents & POLLIN) { + ret = event(fd); + if (ret) + break; + } + } + 
+ fprintf(stderr, "Destroy uhid device\n"); + destroy(fd); + return EXIT_SUCCESS; +} diff --git a/samples/v4l/Makefile b/samples/v4l/Makefile new file mode 100644 index 000000000..f86ab1245 --- /dev/null +++ b/samples/v4l/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_VIDEO_PCI_SKELETON) := v4l2-pci-skeleton.o diff --git a/samples/v4l/v4l2-pci-skeleton.c b/samples/v4l/v4l2-pci-skeleton.c new file mode 100644 index 000000000..3fa6582b4 --- /dev/null +++ b/samples/v4l/v4l2-pci-skeleton.c @@ -0,0 +1,915 @@ +/* + * This is a V4L2 PCI Skeleton Driver. It gives an initial skeleton source + * for use with other PCI drivers. + * + * This skeleton PCI driver assumes that the card has an S-Video connector as + * input 0 and an HDMI connector as input 1. + * + * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/kmod.h> +#include <linux/mutex.h> +#include <linux/pci.h> +#include <linux/interrupt.h> +#include <linux/videodev2.h> +#include <linux/v4l2-dv-timings.h> +#include <media/v4l2-device.h> +#include <media/v4l2-dev.h> +#include <media/v4l2-ioctl.h> +#include <media/v4l2-dv-timings.h> +#include <media/v4l2-ctrls.h> +#include <media/v4l2-event.h> +#include <media/videobuf2-v4l2.h> +#include <media/videobuf2-dma-contig.h> + +MODULE_DESCRIPTION("V4L2 PCI Skeleton Driver"); +MODULE_AUTHOR("Hans Verkuil"); +MODULE_LICENSE("GPL v2"); + +/** + * struct skeleton - All internal data for one instance of device + * @pdev: PCI device + * @v4l2_dev: top-level v4l2 device struct + * @vdev: video node structure + * @ctrl_handler: control handler structure + * @lock: ioctl serialization mutex + * @std: current SDTV standard + * @timings: current HDTV timings + * @format: current pix format + * @input: current video input (0 = SDTV, 1 = HDTV) + * @queue: vb2 video capture queue + * @qlock: spinlock controlling access to buf_list and sequence + * @buf_list: list of buffers queued for DMA + * @field: the field (TOP/BOTTOM/other) of the current buffer + * @sequence: frame sequence counter + */ +struct skeleton { + struct pci_dev *pdev; + struct v4l2_device v4l2_dev; + struct video_device vdev; + struct v4l2_ctrl_handler ctrl_handler; + struct mutex lock; + v4l2_std_id std; + struct v4l2_dv_timings timings; + struct v4l2_pix_format format; + unsigned input; + + struct vb2_queue queue; + + spinlock_t qlock; + struct list_head buf_list; + unsigned field; + unsigned sequence; +}; + +struct skel_buffer { + struct vb2_v4l2_buffer vb; + struct list_head list; +}; + +static inline struct skel_buffer *to_skel_buffer(struct vb2_v4l2_buffer *vbuf) +{ + return container_of(vbuf, struct skel_buffer, vb); +} + +static const struct pci_device_id skeleton_pci_tbl[] = { + /* { 
PCI_DEVICE(PCI_VENDOR_ID_, PCI_DEVICE_ID_) }, */ + { 0, } +}; +MODULE_DEVICE_TABLE(pci, skeleton_pci_tbl); + +/* + * HDTV: this structure has the capabilities of the HDTV receiver. + * It is used to constrain the huge list of possible formats based + * upon the hardware capabilities. + */ +static const struct v4l2_dv_timings_cap skel_timings_cap = { + .type = V4L2_DV_BT_656_1120, + /* keep this initialization for compatibility with GCC < 4.4.6 */ + .reserved = { 0 }, + V4L2_INIT_BT_TIMINGS( + 720, 1920, /* min/max width */ + 480, 1080, /* min/max height */ + 27000000, 74250000, /* min/max pixelclock*/ + V4L2_DV_BT_STD_CEA861, /* Supported standards */ + /* capabilities */ + V4L2_DV_BT_CAP_INTERLACED | V4L2_DV_BT_CAP_PROGRESSIVE + ) +}; + +/* + * Supported SDTV standards. This does the same job as skel_timings_cap, but + * for standard TV formats. + */ +#define SKEL_TVNORMS V4L2_STD_ALL + +/* + * Interrupt handler: typically interrupts happen after a new frame has been + * captured. It is the job of the handler to remove the new frame from the + * internal list and give it back to the vb2 framework, updating the sequence + * counter, field and timestamp at the same time. + */ +static irqreturn_t skeleton_irq(int irq, void *dev_id) +{ +#ifdef TODO + struct skeleton *skel = dev_id; + + /* handle interrupt */ + + /* Once a new frame has been captured, mark it as done like this: */ + if (captured_new_frame) { + ... 
+ spin_lock(&skel->qlock); + list_del(&new_buf->list); + spin_unlock(&skel->qlock); + new_buf->vb.vb2_buf.timestamp = ktime_get_ns(); + new_buf->vb.sequence = skel->sequence++; + new_buf->vb.field = skel->field; + if (skel->format.field == V4L2_FIELD_ALTERNATE) { + if (skel->field == V4L2_FIELD_BOTTOM) + skel->field = V4L2_FIELD_TOP; + else if (skel->field == V4L2_FIELD_TOP) + skel->field = V4L2_FIELD_BOTTOM; + } + vb2_buffer_done(&new_buf->vb.vb2_buf, VB2_BUF_STATE_DONE); + } +#endif + return IRQ_HANDLED; +} + +/* + * Setup the constraints of the queue: besides setting the number of planes + * per buffer and the size and allocation context of each plane, it also + * checks if sufficient buffers have been allocated. Usually 3 is a good + * minimum number: many DMA engines need a minimum of 2 buffers in the + * queue and you need to have another available for userspace processing. + */ +static int queue_setup(struct vb2_queue *vq, + unsigned int *nbuffers, unsigned int *nplanes, + unsigned int sizes[], struct device *alloc_devs[]) +{ + struct skeleton *skel = vb2_get_drv_priv(vq); + + skel->field = skel->format.field; + if (skel->field == V4L2_FIELD_ALTERNATE) { + /* + * You cannot use read() with FIELD_ALTERNATE since the field + * information (TOP/BOTTOM) cannot be passed back to the user. + */ + if (vb2_fileio_is_active(vq)) + return -EINVAL; + skel->field = V4L2_FIELD_TOP; + } + + if (vq->num_buffers + *nbuffers < 3) + *nbuffers = 3 - vq->num_buffers; + + if (*nplanes) + return sizes[0] < skel->format.sizeimage ? -EINVAL : 0; + *nplanes = 1; + sizes[0] = skel->format.sizeimage; + return 0; +} + +/* + * Prepare the buffer for queueing to the DMA engine: check and set the + * payload size. 
+ */ +static int buffer_prepare(struct vb2_buffer *vb) +{ + struct skeleton *skel = vb2_get_drv_priv(vb->vb2_queue); + unsigned long size = skel->format.sizeimage; + + if (vb2_plane_size(vb, 0) < size) { + dev_err(&skel->pdev->dev, "buffer too small (%lu < %lu)\n", + vb2_plane_size(vb, 0), size); + return -EINVAL; + } + + vb2_set_plane_payload(vb, 0, size); + return 0; +} + +/* + * Queue this buffer to the DMA engine. + */ +static void buffer_queue(struct vb2_buffer *vb) +{ + struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); + struct skeleton *skel = vb2_get_drv_priv(vb->vb2_queue); + struct skel_buffer *buf = to_skel_buffer(vbuf); + unsigned long flags; + + spin_lock_irqsave(&skel->qlock, flags); + list_add_tail(&buf->list, &skel->buf_list); + + /* TODO: Update any DMA pointers if necessary */ + + spin_unlock_irqrestore(&skel->qlock, flags); +} + +static void return_all_buffers(struct skeleton *skel, + enum vb2_buffer_state state) +{ + struct skel_buffer *buf, *node; + unsigned long flags; + + spin_lock_irqsave(&skel->qlock, flags); + list_for_each_entry_safe(buf, node, &skel->buf_list, list) { + vb2_buffer_done(&buf->vb.vb2_buf, state); + list_del(&buf->list); + } + spin_unlock_irqrestore(&skel->qlock, flags); +} + +/* + * Start streaming. First check if the minimum number of buffers have been + * queued. If not, then return -ENOBUFS and the vb2 framework will call + * this function again the next time a buffer has been queued until enough + * buffers are available to actually start the DMA engine. + */ +static int start_streaming(struct vb2_queue *vq, unsigned int count) +{ + struct skeleton *skel = vb2_get_drv_priv(vq); + int ret = 0; + + skel->sequence = 0; + + /* TODO: start DMA */ + + if (ret) { + /* + * In case of an error, return all active buffers to the + * QUEUED state + */ + return_all_buffers(skel, VB2_BUF_STATE_QUEUED); + } + return ret; +} + +/* + * Stop the DMA engine. 
Any remaining buffers in the DMA queue are dequeued + * and passed on to the vb2 framework marked as STATE_ERROR. + */ +static void stop_streaming(struct vb2_queue *vq) +{ + struct skeleton *skel = vb2_get_drv_priv(vq); + + /* TODO: stop DMA */ + + /* Release all active buffers */ + return_all_buffers(skel, VB2_BUF_STATE_ERROR); +} + +/* + * The vb2 queue ops. Note that since q->lock is set we can use the standard + * vb2_ops_wait_prepare/finish helper functions. If q->lock would be NULL, + * then this driver would have to provide these ops. + */ +static const struct vb2_ops skel_qops = { + .queue_setup = queue_setup, + .buf_prepare = buffer_prepare, + .buf_queue = buffer_queue, + .start_streaming = start_streaming, + .stop_streaming = stop_streaming, + .wait_prepare = vb2_ops_wait_prepare, + .wait_finish = vb2_ops_wait_finish, +}; + +/* + * Required ioctl querycap. Note that the version field is prefilled with + * the version of the kernel. + */ +static int skeleton_querycap(struct file *file, void *priv, + struct v4l2_capability *cap) +{ + struct skeleton *skel = video_drvdata(file); + + strlcpy(cap->driver, KBUILD_MODNAME, sizeof(cap->driver)); + strlcpy(cap->card, "V4L2 PCI Skeleton", sizeof(cap->card)); + snprintf(cap->bus_info, sizeof(cap->bus_info), "PCI:%s", + pci_name(skel->pdev)); + return 0; +} + +/* + * Helper function to check and correct struct v4l2_pix_format. It's used + * not only in VIDIOC_TRY/S_FMT, but also elsewhere if changes to the SDTV + * standard, HDTV timings or the video input would require updating the + * current format. + */ +static void skeleton_fill_pix_format(struct skeleton *skel, + struct v4l2_pix_format *pix) +{ + pix->pixelformat = V4L2_PIX_FMT_YUYV; + if (skel->input == 0) { + /* S-Video input */ + pix->width = 720; + pix->height = (skel->std & V4L2_STD_525_60) ? 
480 : 576; + pix->field = V4L2_FIELD_INTERLACED; + pix->colorspace = V4L2_COLORSPACE_SMPTE170M; + } else { + /* HDMI input */ + pix->width = skel->timings.bt.width; + pix->height = skel->timings.bt.height; + if (skel->timings.bt.interlaced) { + pix->field = V4L2_FIELD_ALTERNATE; + pix->height /= 2; + } else { + pix->field = V4L2_FIELD_NONE; + } + pix->colorspace = V4L2_COLORSPACE_REC709; + } + + /* + * The YUYV format is four bytes for every two pixels, so bytesperline + * is width * 2. + */ + pix->bytesperline = pix->width * 2; + pix->sizeimage = pix->bytesperline * pix->height; + pix->priv = 0; +} + +static int skeleton_try_fmt_vid_cap(struct file *file, void *priv, + struct v4l2_format *f) +{ + struct skeleton *skel = video_drvdata(file); + struct v4l2_pix_format *pix = &f->fmt.pix; + + /* + * Due to historical reasons providing try_fmt with an unsupported + * pixelformat will return -EINVAL for video receivers. Webcam drivers, + * however, will silently correct the pixelformat. Some video capture + * applications rely on this behavior... + */ + if (pix->pixelformat != V4L2_PIX_FMT_YUYV) + return -EINVAL; + skeleton_fill_pix_format(skel, pix); + return 0; +} + +static int skeleton_s_fmt_vid_cap(struct file *file, void *priv, + struct v4l2_format *f) +{ + struct skeleton *skel = video_drvdata(file); + int ret; + + ret = skeleton_try_fmt_vid_cap(file, priv, f); + if (ret) + return ret; + + /* + * It is not allowed to change the format while buffers for use with + * streaming have already been allocated. 
+ */ + if (vb2_is_busy(&skel->queue)) + return -EBUSY; + + /* TODO: change format */ + skel->format = f->fmt.pix; + return 0; +} + +static int skeleton_g_fmt_vid_cap(struct file *file, void *priv, + struct v4l2_format *f) +{ + struct skeleton *skel = video_drvdata(file); + + f->fmt.pix = skel->format; + return 0; +} + +static int skeleton_enum_fmt_vid_cap(struct file *file, void *priv, + struct v4l2_fmtdesc *f) +{ + if (f->index != 0) + return -EINVAL; + + f->pixelformat = V4L2_PIX_FMT_YUYV; + return 0; +} + +static int skeleton_s_std(struct file *file, void *priv, v4l2_std_id std) +{ + struct skeleton *skel = video_drvdata(file); + + /* S_STD is not supported on the HDMI input */ + if (skel->input) + return -ENODATA; + + /* + * No change, so just return. Some applications call S_STD again after + * the buffers for streaming have been set up, so we have to allow for + * this behavior. + */ + if (std == skel->std) + return 0; + + /* + * Changing the standard implies a format change, which is not allowed + * while buffers for use with streaming have already been allocated. + */ + if (vb2_is_busy(&skel->queue)) + return -EBUSY; + + /* TODO: handle changing std */ + + skel->std = std; + + /* Update the internal format */ + skeleton_fill_pix_format(skel, &skel->format); + return 0; +} + +static int skeleton_g_std(struct file *file, void *priv, v4l2_std_id *std) +{ + struct skeleton *skel = video_drvdata(file); + + /* G_STD is not supported on the HDMI input */ + if (skel->input) + return -ENODATA; + + *std = skel->std; + return 0; +} + +/* + * Query the current standard as seen by the hardware. This function shall + * never actually change the standard, it just detects and reports. + * The framework will initially set *std to tvnorms (i.e. the set of + * supported standards by this input), and this function should just AND + * this value. If there is no signal, then *std should be set to 0. 
+ */ +static int skeleton_querystd(struct file *file, void *priv, v4l2_std_id *std) +{ + struct skeleton *skel = video_drvdata(file); + + /* QUERY_STD is not supported on the HDMI input */ + if (skel->input) + return -ENODATA; + +#ifdef TODO + /* + * Query currently seen standard. Initial value of *std is + * V4L2_STD_ALL. This function should look something like this: + */ + get_signal_info(); + if (no_signal) { + *std = 0; + return 0; + } + /* Use signal information to reduce the number of possible standards */ + if (signal_has_525_lines) + *std &= V4L2_STD_525_60; + else + *std &= V4L2_STD_625_50; +#endif + return 0; +} + +static int skeleton_s_dv_timings(struct file *file, void *_fh, + struct v4l2_dv_timings *timings) +{ + struct skeleton *skel = video_drvdata(file); + + /* S_DV_TIMINGS is not supported on the S-Video input */ + if (skel->input == 0) + return -ENODATA; + + /* Quick sanity check */ + if (!v4l2_valid_dv_timings(timings, &skel_timings_cap, NULL, NULL)) + return -EINVAL; + + /* Check if the timings are part of the CEA-861 timings. */ + if (!v4l2_find_dv_timings_cap(timings, &skel_timings_cap, + 0, NULL, NULL)) + return -EINVAL; + + /* Return 0 if the new timings are the same as the current timings. */ + if (v4l2_match_dv_timings(timings, &skel->timings, 0, false)) + return 0; + + /* + * Changing the timings implies a format change, which is not allowed + * while buffers for use with streaming have already been allocated. 
+ */ + if (vb2_is_busy(&skel->queue)) + return -EBUSY; + + /* TODO: Configure new timings */ + + /* Save timings */ + skel->timings = *timings; + + /* Update the internal format */ + skeleton_fill_pix_format(skel, &skel->format); + return 0; +} + +static int skeleton_g_dv_timings(struct file *file, void *_fh, + struct v4l2_dv_timings *timings) +{ + struct skeleton *skel = video_drvdata(file); + + /* G_DV_TIMINGS is not supported on the S-Video input */ + if (skel->input == 0) + return -ENODATA; + + *timings = skel->timings; + return 0; +} + +static int skeleton_enum_dv_timings(struct file *file, void *_fh, + struct v4l2_enum_dv_timings *timings) +{ + struct skeleton *skel = video_drvdata(file); + + /* ENUM_DV_TIMINGS is not supported on the S-Video input */ + if (skel->input == 0) + return -ENODATA; + + return v4l2_enum_dv_timings_cap(timings, &skel_timings_cap, + NULL, NULL); +} + +/* + * Query the current timings as seen by the hardware. This function shall + * never actually change the timings, it just detects and reports. + * If no signal is detected, then return -ENOLINK. If the hardware cannot + * lock to the signal, then return -ENOLCK. If the signal is out of range + * of the capabilities of the system (e.g., it is possible that the receiver + * can lock but that the DMA engine it is connected to cannot handle + * pixelclocks above a certain frequency), then -ERANGE is returned. + */ +static int skeleton_query_dv_timings(struct file *file, void *_fh, + struct v4l2_dv_timings *timings) +{ + struct skeleton *skel = video_drvdata(file); + + /* QUERY_DV_TIMINGS is not supported on the S-Video input */ + if (skel->input == 0) + return -ENODATA; + +#ifdef TODO + /* + * Query currently seen timings. 
This function should look + * something like this: + */ + detect_timings(); + if (no_signal) + return -ENOLINK; + if (cannot_lock_to_signal) + return -ENOLCK; + if (signal_out_of_range_of_capabilities) + return -ERANGE; + + /* Useful for debugging */ + v4l2_print_dv_timings(skel->v4l2_dev.name, "query_dv_timings:", + timings, true); +#endif + return 0; +} + +static int skeleton_dv_timings_cap(struct file *file, void *fh, + struct v4l2_dv_timings_cap *cap) +{ + struct skeleton *skel = video_drvdata(file); + + /* DV_TIMINGS_CAP is not supported on the S-Video input */ + if (skel->input == 0) + return -ENODATA; + *cap = skel_timings_cap; + return 0; +} + +static int skeleton_enum_input(struct file *file, void *priv, + struct v4l2_input *i) +{ + if (i->index > 1) + return -EINVAL; + + i->type = V4L2_INPUT_TYPE_CAMERA; + if (i->index == 0) { + i->std = SKEL_TVNORMS; + strlcpy(i->name, "S-Video", sizeof(i->name)); + i->capabilities = V4L2_IN_CAP_STD; + } else { + i->std = 0; + strlcpy(i->name, "HDMI", sizeof(i->name)); + i->capabilities = V4L2_IN_CAP_DV_TIMINGS; + } + return 0; +} + +static int skeleton_s_input(struct file *file, void *priv, unsigned int i) +{ + struct skeleton *skel = video_drvdata(file); + + if (i > 1) + return -EINVAL; + + /* + * Changing the input implies a format change, which is not allowed + * while buffers for use with streaming have already been allocated. + */ + if (vb2_is_busy(&skel->queue)) + return -EBUSY; + + skel->input = i; + /* + * Update tvnorms. The tvnorms value is used by the core to implement + * VIDIOC_ENUMSTD so it has to be correct. If tvnorms == 0, then + * ENUMSTD will return -ENODATA. + */ + skel->vdev.tvnorms = i ? 0 : SKEL_TVNORMS; + + /* Update the internal format */ + skeleton_fill_pix_format(skel, &skel->format); + return 0; +} + +static int skeleton_g_input(struct file *file, void *priv, unsigned int *i) +{ + struct skeleton *skel = video_drvdata(file); + + *i = skel->input; + return 0; +} + +/* The control handler. 
*/
+/*
+ * Apply a new control value. Brightness, contrast, saturation and hue are
+ * recognised (the hardware programming is still a TODO for each of them);
+ * every other control id is rejected with -EINVAL.
+ */
+static int skeleton_s_ctrl(struct v4l2_ctrl *ctrl)
+{
+	/*
+	 * Once real hardware access is added, the driver state is reachable as
+	 * container_of(ctrl->handler, struct skeleton, ctrl_handler).
+	 */
+	int status = 0;
+
+	switch (ctrl->id) {
+	case V4L2_CID_BRIGHTNESS:
+		/* TODO: set brightness to ctrl->val */
+		break;
+	case V4L2_CID_CONTRAST:
+		/* TODO: set contrast to ctrl->val */
+		break;
+	case V4L2_CID_SATURATION:
+		/* TODO: set saturation to ctrl->val */
+		break;
+	case V4L2_CID_HUE:
+		/* TODO: set hue to ctrl->val */
+		break;
+	default:
+		status = -EINVAL;
+		break;
+	}
+
+	return status;
+}
+
+/* ------------------------------------------------------------------
+	File operations for the device
+   ------------------------------------------------------------------*/
+
+/* Control operations: only the "set" callback is provided. */
+static const struct v4l2_ctrl_ops skel_ctrl_ops = {
+	.s_ctrl = skeleton_s_ctrl,
+};
+
+/*
+ * The set of all supported ioctls. Note that all the streaming ioctls
+ * use the vb2 helper functions that take care of all the locking and
+ * that also do ownership tracking (i.e. only the filehandle that requested
+ * the buffers can call the streaming ioctls, all other filehandles will
+ * receive -EBUSY if they attempt to call the same streaming ioctls).
+ *
+ * The last three ioctls also use standard helper functions: these implement
+ * standard behavior for drivers with controls.
+ */
+static const struct v4l2_ioctl_ops skel_ioctl_ops = {
+	/* Capability and pixel-format negotiation (driver-specific handlers) */
+	.vidioc_querycap = skeleton_querycap,
+	.vidioc_try_fmt_vid_cap = skeleton_try_fmt_vid_cap,
+	.vidioc_s_fmt_vid_cap = skeleton_s_fmt_vid_cap,
+	.vidioc_g_fmt_vid_cap = skeleton_g_fmt_vid_cap,
+	.vidioc_enum_fmt_vid_cap = skeleton_enum_fmt_vid_cap,
+
+	/* SDTV standards: these handlers return -ENODATA on the HDMI input */
+	.vidioc_g_std = skeleton_g_std,
+	.vidioc_s_std = skeleton_s_std,
+	.vidioc_querystd = skeleton_querystd,
+
+	/* DV timings: these handlers return -ENODATA on the S-Video input */
+	.vidioc_s_dv_timings = skeleton_s_dv_timings,
+	.vidioc_g_dv_timings = skeleton_g_dv_timings,
+	.vidioc_enum_dv_timings = skeleton_enum_dv_timings,
+	.vidioc_query_dv_timings = skeleton_query_dv_timings,
+	.vidioc_dv_timings_cap = skeleton_dv_timings_cap,
+
+	/* Input selection */
+	.vidioc_enum_input = skeleton_enum_input,
+	.vidioc_g_input = skeleton_g_input,
+	.vidioc_s_input = skeleton_s_input,
+
+	/* Streaming: vb2 helper implementations */
+	.vidioc_reqbufs = vb2_ioctl_reqbufs,
+	.vidioc_create_bufs = vb2_ioctl_create_bufs,
+	.vidioc_querybuf = vb2_ioctl_querybuf,
+	.vidioc_qbuf = vb2_ioctl_qbuf,
+	.vidioc_dqbuf = vb2_ioctl_dqbuf,
+	.vidioc_expbuf = vb2_ioctl_expbuf,
+	.vidioc_streamon = vb2_ioctl_streamon,
+	.vidioc_streamoff = vb2_ioctl_streamoff,
+
+	/* Control-related helper implementations */
+	.vidioc_log_status = v4l2_ctrl_log_status,
+	.vidioc_subscribe_event = v4l2_ctrl_subscribe_event,
+	.vidioc_unsubscribe_event = v4l2_event_unsubscribe,
+};
+
+/*
+ * The set of file operations. Note that all these ops are standard core
+ * helper functions.
+ */
+static const struct v4l2_file_operations skel_fops = {
+	.owner = THIS_MODULE,
+	.open = v4l2_fh_open,
+	.release = vb2_fop_release,
+	.unlocked_ioctl = video_ioctl2,
+	.read = vb2_fop_read,
+	.mmap = vb2_fop_mmap,
+	.poll = vb2_fop_poll,
+};
+
+/*
+ * The initial setup of this device instance. Note that the initial state of
+ * the driver should be complete. So the initial format, standard, timings
+ * and video input should all be initialized to some reasonable value.
+ */ +static int skeleton_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + /* The initial timings are chosen to be 720p60. */ + static const struct v4l2_dv_timings timings_def = + V4L2_DV_BT_CEA_1280X720P60; + struct skeleton *skel; + struct video_device *vdev; + struct v4l2_ctrl_handler *hdl; + struct vb2_queue *q; + int ret; + + /* Enable PCI */ + ret = pci_enable_device(pdev); + if (ret) + return ret; + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (ret) { + dev_err(&pdev->dev, "no suitable DMA available.\n"); + goto disable_pci; + } + + /* Allocate a new instance */ + skel = devm_kzalloc(&pdev->dev, sizeof(struct skeleton), GFP_KERNEL); + if (!skel) { + ret = -ENOMEM; + goto disable_pci; + } + + /* Allocate the interrupt */ + ret = devm_request_irq(&pdev->dev, pdev->irq, + skeleton_irq, 0, KBUILD_MODNAME, skel); + if (ret) { + dev_err(&pdev->dev, "request_irq failed\n"); + goto disable_pci; + } + skel->pdev = pdev; + + /* Fill in the initial format-related settings */ + skel->timings = timings_def; + skel->std = V4L2_STD_625_50; + skeleton_fill_pix_format(skel, &skel->format); + + /* Initialize the top-level structure */ + ret = v4l2_device_register(&pdev->dev, &skel->v4l2_dev); + if (ret) + goto disable_pci; + + mutex_init(&skel->lock); + + /* Add the controls */ + hdl = &skel->ctrl_handler; + v4l2_ctrl_handler_init(hdl, 4); + v4l2_ctrl_new_std(hdl, &skel_ctrl_ops, + V4L2_CID_BRIGHTNESS, 0, 255, 1, 127); + v4l2_ctrl_new_std(hdl, &skel_ctrl_ops, + V4L2_CID_CONTRAST, 0, 255, 1, 16); + v4l2_ctrl_new_std(hdl, &skel_ctrl_ops, + V4L2_CID_SATURATION, 0, 255, 1, 127); + v4l2_ctrl_new_std(hdl, &skel_ctrl_ops, + V4L2_CID_HUE, -128, 127, 1, 0); + if (hdl->error) { + ret = hdl->error; + goto free_hdl; + } + skel->v4l2_dev.ctrl_handler = hdl; + + /* Initialize the vb2 queue */ + q = &skel->queue; + q->type = V4L2_BUF_TYPE_VIDEO_CAPTURE; + q->io_modes = VB2_MMAP | VB2_DMABUF | VB2_READ; + q->dev = &pdev->dev; + q->drv_priv = skel; + 
q->buf_struct_size = sizeof(struct skel_buffer); + q->ops = &skel_qops; + q->mem_ops = &vb2_dma_contig_memops; + q->timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC; + /* + * Assume that this DMA engine needs to have at least two buffers + * available before it can be started. The start_streaming() op + * won't be called until at least this many buffers are queued up. + */ + q->min_buffers_needed = 2; + /* + * The serialization lock for the streaming ioctls. This is the same + * as the main serialization lock, but if some of the non-streaming + * ioctls could take a long time to execute, then you might want to + * have a different lock here to prevent VIDIOC_DQBUF from being + * blocked while waiting for another action to finish. This is + * generally not needed for PCI devices, but USB devices usually do + * want a separate lock here. + */ + q->lock = &skel->lock; + /* + * Since this driver can only do 32-bit DMA we must make sure that + * the vb2 core will allocate the buffers in 32-bit DMA memory. + */ + q->gfp_flags = GFP_DMA32; + ret = vb2_queue_init(q); + if (ret) + goto free_hdl; + + INIT_LIST_HEAD(&skel->buf_list); + spin_lock_init(&skel->qlock); + + /* Initialize the video_device structure */ + vdev = &skel->vdev; + strlcpy(vdev->name, KBUILD_MODNAME, sizeof(vdev->name)); + /* + * There is nothing to clean up, so release is set to an empty release + * function. The release callback must be non-NULL. + */ + vdev->release = video_device_release_empty; + vdev->fops = &skel_fops, + vdev->ioctl_ops = &skel_ioctl_ops, + vdev->device_caps = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_READWRITE | + V4L2_CAP_STREAMING; + /* + * The main serialization lock. All ioctls are serialized by this + * lock. Exception: if q->lock is set, then the streaming ioctls + * are serialized by that separate lock. 
+ */ + vdev->lock = &skel->lock; + vdev->queue = q; + vdev->v4l2_dev = &skel->v4l2_dev; + /* Supported SDTV standards, if any */ + vdev->tvnorms = SKEL_TVNORMS; + video_set_drvdata(vdev, skel); + + ret = video_register_device(vdev, VFL_TYPE_VIDEO, -1); + if (ret) + goto free_hdl; + + dev_info(&pdev->dev, "V4L2 PCI Skeleton Driver loaded\n"); + return 0; + +free_hdl: + v4l2_ctrl_handler_free(&skel->ctrl_handler); + v4l2_device_unregister(&skel->v4l2_dev); +disable_pci: + pci_disable_device(pdev); + return ret; +} + +static void skeleton_remove(struct pci_dev *pdev) +{ + struct v4l2_device *v4l2_dev = pci_get_drvdata(pdev); + struct skeleton *skel = container_of(v4l2_dev, struct skeleton, v4l2_dev); + + video_unregister_device(&skel->vdev); + v4l2_ctrl_handler_free(&skel->ctrl_handler); + v4l2_device_unregister(&skel->v4l2_dev); + pci_disable_device(skel->pdev); +} + +static struct pci_driver skeleton_driver = { + .name = KBUILD_MODNAME, + .probe = skeleton_probe, + .remove = skeleton_remove, + .id_table = skeleton_pci_tbl, +}; + +module_pci_driver(skeleton_driver); diff --git a/samples/vfio-mdev/Makefile b/samples/vfio-mdev/Makefile new file mode 100644 index 000000000..10d179c4f --- /dev/null +++ b/samples/vfio-mdev/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_SAMPLE_VFIO_MDEV_MTTY) += mtty.o +obj-$(CONFIG_SAMPLE_VFIO_MDEV_MDPY) += mdpy.o +obj-$(CONFIG_SAMPLE_VFIO_MDEV_MDPY_FB) += mdpy-fb.o +obj-$(CONFIG_SAMPLE_VFIO_MDEV_MBOCHS) += mbochs.o diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c new file mode 100644 index 000000000..e03068917 --- /dev/null +++ b/samples/vfio-mdev/mbochs.c @@ -0,0 +1,1485 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Mediated virtual PCI display host device driver + * + * Emulate enough of qemu stdvga to make bochs-drm.ko happy. That is + * basically the vram memory bar and the bochs dispi interface vbe + * registers in the mmio register bar. 
Specifically it does *not* + * include any legacy vga stuff. Device looks a lot like "qemu -device + * secondary-vga". + * + * (c) Gerd Hoffmann <kraxel@redhat.com> + * + * based on mtty driver which is: + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/init.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/cdev.h> +#include <linux/vfio.h> +#include <linux/iommu.h> +#include <linux/sysfs.h> +#include <linux/mdev.h> +#include <linux/pci.h> +#include <linux/dma-buf.h> +#include <linux/highmem.h> +#include <drm/drm_fourcc.h> +#include <drm/drm_rect.h> +#include <drm/drm_modeset_lock.h> +#include <drm/drm_property.h> +#include <drm/drm_plane.h> + + +#define VBE_DISPI_INDEX_ID 0x0 +#define VBE_DISPI_INDEX_XRES 0x1 +#define VBE_DISPI_INDEX_YRES 0x2 +#define VBE_DISPI_INDEX_BPP 0x3 +#define VBE_DISPI_INDEX_ENABLE 0x4 +#define VBE_DISPI_INDEX_BANK 0x5 +#define VBE_DISPI_INDEX_VIRT_WIDTH 0x6 +#define VBE_DISPI_INDEX_VIRT_HEIGHT 0x7 +#define VBE_DISPI_INDEX_X_OFFSET 0x8 +#define VBE_DISPI_INDEX_Y_OFFSET 0x9 +#define VBE_DISPI_INDEX_VIDEO_MEMORY_64K 0xa +#define VBE_DISPI_INDEX_COUNT 0xb + +#define VBE_DISPI_ID0 0xB0C0 +#define VBE_DISPI_ID1 0xB0C1 +#define VBE_DISPI_ID2 0xB0C2 +#define VBE_DISPI_ID3 0xB0C3 +#define VBE_DISPI_ID4 0xB0C4 +#define VBE_DISPI_ID5 0xB0C5 + +#define VBE_DISPI_DISABLED 0x00 +#define VBE_DISPI_ENABLED 0x01 +#define VBE_DISPI_GETCAPS 0x02 +#define VBE_DISPI_8BIT_DAC 0x20 +#define VBE_DISPI_LFB_ENABLED 0x40 +#define VBE_DISPI_NOCLEARMEM 0x80 + + +#define MBOCHS_NAME "mbochs" +#define MBOCHS_CLASS_NAME "mbochs" + +#define MBOCHS_EDID_REGION_INDEX 
VFIO_PCI_NUM_REGIONS +#define MBOCHS_NUM_REGIONS (MBOCHS_EDID_REGION_INDEX+1) + +#define MBOCHS_CONFIG_SPACE_SIZE 0xff +#define MBOCHS_MMIO_BAR_OFFSET PAGE_SIZE +#define MBOCHS_MMIO_BAR_SIZE PAGE_SIZE +#define MBOCHS_EDID_OFFSET (MBOCHS_MMIO_BAR_OFFSET + \ + MBOCHS_MMIO_BAR_SIZE) +#define MBOCHS_EDID_SIZE PAGE_SIZE +#define MBOCHS_MEMORY_BAR_OFFSET (MBOCHS_EDID_OFFSET + \ + MBOCHS_EDID_SIZE) + +#define MBOCHS_EDID_BLOB_OFFSET (MBOCHS_EDID_SIZE/2) + +#define STORE_LE16(addr, val) (*(u16 *)addr = val) +#define STORE_LE32(addr, val) (*(u32 *)addr = val) + + +MODULE_LICENSE("GPL v2"); + +static int max_mbytes = 256; +module_param_named(count, max_mbytes, int, 0444); +MODULE_PARM_DESC(mem, "megabytes available to " MBOCHS_NAME " devices"); + + +#define MBOCHS_TYPE_1 "small" +#define MBOCHS_TYPE_2 "medium" +#define MBOCHS_TYPE_3 "large" + +static const struct mbochs_type { + const char *name; + u32 mbytes; + u32 max_x; + u32 max_y; +} mbochs_types[] = { + { + .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_1, + .mbytes = 4, + .max_x = 800, + .max_y = 600, + }, { + .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_2, + .mbytes = 16, + .max_x = 1920, + .max_y = 1440, + }, { + .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_3, + .mbytes = 64, + .max_x = 0, + .max_y = 0, + }, +}; + + +static dev_t mbochs_devt; +static struct class *mbochs_class; +static struct cdev mbochs_cdev; +static struct device mbochs_dev; +static int mbochs_used_mbytes; + +struct vfio_region_info_ext { + struct vfio_region_info base; + struct vfio_region_info_cap_type type; +}; + +struct mbochs_mode { + u32 drm_format; + u32 bytepp; + u32 width; + u32 height; + u32 stride; + u32 __pad; + u64 offset; + u64 size; +}; + +struct mbochs_dmabuf { + struct mbochs_mode mode; + u32 id; + struct page **pages; + pgoff_t pagecount; + struct dma_buf *buf; + struct mdev_state *mdev_state; + struct list_head next; + bool unlinked; +}; + +/* State of each mdev device */ +struct mdev_state { + u8 *vconfig; + u64 bar_mask[3]; + u32 
memory_bar_mask; + struct mutex ops_lock; + struct mdev_device *mdev; + + const struct mbochs_type *type; + u16 vbe[VBE_DISPI_INDEX_COUNT]; + u64 memsize; + struct page **pages; + pgoff_t pagecount; + struct vfio_region_gfx_edid edid_regs; + u8 edid_blob[0x400]; + + struct list_head dmabufs; + u32 active_id; + u32 next_id; +}; + +static const char *vbe_name_list[VBE_DISPI_INDEX_COUNT] = { + [VBE_DISPI_INDEX_ID] = "id", + [VBE_DISPI_INDEX_XRES] = "xres", + [VBE_DISPI_INDEX_YRES] = "yres", + [VBE_DISPI_INDEX_BPP] = "bpp", + [VBE_DISPI_INDEX_ENABLE] = "enable", + [VBE_DISPI_INDEX_BANK] = "bank", + [VBE_DISPI_INDEX_VIRT_WIDTH] = "virt-width", + [VBE_DISPI_INDEX_VIRT_HEIGHT] = "virt-height", + [VBE_DISPI_INDEX_X_OFFSET] = "x-offset", + [VBE_DISPI_INDEX_Y_OFFSET] = "y-offset", + [VBE_DISPI_INDEX_VIDEO_MEMORY_64K] = "video-mem", +}; + +static const char *vbe_name(u32 index) +{ + if (index < ARRAY_SIZE(vbe_name_list)) + return vbe_name_list[index]; + return "(invalid)"; +} + +static struct page *__mbochs_get_page(struct mdev_state *mdev_state, + pgoff_t pgoff); +static struct page *mbochs_get_page(struct mdev_state *mdev_state, + pgoff_t pgoff); + +static const struct mbochs_type *mbochs_find_type(struct kobject *kobj) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mbochs_types); i++) + if (strcmp(mbochs_types[i].name, kobj->name) == 0) + return mbochs_types + i; + return NULL; +} + +static void mbochs_create_config_space(struct mdev_state *mdev_state) +{ + STORE_LE16((u16 *) &mdev_state->vconfig[PCI_VENDOR_ID], + 0x1234); + STORE_LE16((u16 *) &mdev_state->vconfig[PCI_DEVICE_ID], + 0x1111); + STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_VENDOR_ID], + PCI_SUBVENDOR_ID_REDHAT_QUMRANET); + STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_ID], + PCI_SUBDEVICE_ID_QEMU); + + STORE_LE16((u16 *) &mdev_state->vconfig[PCI_COMMAND], + PCI_COMMAND_IO | PCI_COMMAND_MEMORY); + STORE_LE16((u16 *) &mdev_state->vconfig[PCI_CLASS_DEVICE], + PCI_CLASS_DISPLAY_OTHER); + 
mdev_state->vconfig[PCI_CLASS_REVISION] = 0x01; + + STORE_LE32((u32 *) &mdev_state->vconfig[PCI_BASE_ADDRESS_0], + PCI_BASE_ADDRESS_SPACE_MEMORY | + PCI_BASE_ADDRESS_MEM_TYPE_32 | + PCI_BASE_ADDRESS_MEM_PREFETCH); + mdev_state->bar_mask[0] = ~(mdev_state->memsize) + 1; + + STORE_LE32((u32 *) &mdev_state->vconfig[PCI_BASE_ADDRESS_2], + PCI_BASE_ADDRESS_SPACE_MEMORY | + PCI_BASE_ADDRESS_MEM_TYPE_32); + mdev_state->bar_mask[2] = ~(MBOCHS_MMIO_BAR_SIZE) + 1; +} + +static int mbochs_check_framebuffer(struct mdev_state *mdev_state, + struct mbochs_mode *mode) +{ + struct device *dev = mdev_dev(mdev_state->mdev); + u16 *vbe = mdev_state->vbe; + u32 virt_width; + + WARN_ON(!mutex_is_locked(&mdev_state->ops_lock)); + + if (!(vbe[VBE_DISPI_INDEX_ENABLE] & VBE_DISPI_ENABLED)) + goto nofb; + + memset(mode, 0, sizeof(*mode)); + switch (vbe[VBE_DISPI_INDEX_BPP]) { + case 32: + mode->drm_format = DRM_FORMAT_XRGB8888; + mode->bytepp = 4; + break; + default: + dev_info_ratelimited(dev, "%s: bpp %d not supported\n", + __func__, vbe[VBE_DISPI_INDEX_BPP]); + goto nofb; + } + + mode->width = vbe[VBE_DISPI_INDEX_XRES]; + mode->height = vbe[VBE_DISPI_INDEX_YRES]; + virt_width = vbe[VBE_DISPI_INDEX_VIRT_WIDTH]; + if (virt_width < mode->width) + virt_width = mode->width; + mode->stride = virt_width * mode->bytepp; + mode->size = (u64)mode->stride * mode->height; + mode->offset = ((u64)vbe[VBE_DISPI_INDEX_X_OFFSET] * mode->bytepp + + (u64)vbe[VBE_DISPI_INDEX_Y_OFFSET] * mode->stride); + + if (mode->width < 64 || mode->height < 64) { + dev_info_ratelimited(dev, "%s: invalid resolution %dx%d\n", + __func__, mode->width, mode->height); + goto nofb; + } + if (mode->offset + mode->size > mdev_state->memsize) { + dev_info_ratelimited(dev, "%s: framebuffer memory overflow\n", + __func__); + goto nofb; + } + + return 0; + +nofb: + memset(mode, 0, sizeof(*mode)); + return -EINVAL; +} + +static bool mbochs_modes_equal(struct mbochs_mode *mode1, + struct mbochs_mode *mode2) +{ + return memcmp(mode1, 
mode2, sizeof(struct mbochs_mode)) == 0; +} + +static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset, + char *buf, u32 count) +{ + struct device *dev = mdev_dev(mdev_state->mdev); + int index = (offset - PCI_BASE_ADDRESS_0) / 0x04; + u32 cfg_addr; + + switch (offset) { + case PCI_BASE_ADDRESS_0: + case PCI_BASE_ADDRESS_2: + cfg_addr = *(u32 *)buf; + + if (cfg_addr == 0xffffffff) { + cfg_addr = (cfg_addr & mdev_state->bar_mask[index]); + } else { + cfg_addr &= PCI_BASE_ADDRESS_MEM_MASK; + if (cfg_addr) + dev_info(dev, "BAR #%d @ 0x%x\n", + index, cfg_addr); + } + + cfg_addr |= (mdev_state->vconfig[offset] & + ~PCI_BASE_ADDRESS_MEM_MASK); + STORE_LE32(&mdev_state->vconfig[offset], cfg_addr); + break; + } +} + +static void handle_mmio_write(struct mdev_state *mdev_state, u16 offset, + char *buf, u32 count) +{ + struct device *dev = mdev_dev(mdev_state->mdev); + int index; + u16 reg16; + + switch (offset) { + case 0x400 ... 0x41f: /* vga ioports remapped */ + goto unhandled; + case 0x500 ... 0x515: /* bochs dispi interface */ + if (count != 2) + goto unhandled; + index = (offset - 0x500) / 2; + reg16 = *(u16 *)buf; + if (index < ARRAY_SIZE(mdev_state->vbe)) + mdev_state->vbe[index] = reg16; + dev_dbg(dev, "%s: vbe write %d = %d (%s)\n", + __func__, index, reg16, vbe_name(index)); + break; + case 0x600 ... 0x607: /* qemu extended regs */ + goto unhandled; + default: +unhandled: + dev_dbg(dev, "%s: @0x%03x, count %d (unhandled)\n", + __func__, offset, count); + break; + } +} + +static void handle_mmio_read(struct mdev_state *mdev_state, u16 offset, + char *buf, u32 count) +{ + struct device *dev = mdev_dev(mdev_state->mdev); + struct vfio_region_gfx_edid *edid; + u16 reg16 = 0; + int index; + + switch (offset) { + case 0x000 ... 
0x3ff: /* edid block */ + edid = &mdev_state->edid_regs; + if (edid->link_state != VFIO_DEVICE_GFX_LINK_STATE_UP || + offset >= edid->edid_size) { + memset(buf, 0, count); + break; + } + memcpy(buf, mdev_state->edid_blob + offset, count); + break; + case 0x500 ... 0x515: /* bochs dispi interface */ + if (count != 2) + goto unhandled; + index = (offset - 0x500) / 2; + if (index < ARRAY_SIZE(mdev_state->vbe)) + reg16 = mdev_state->vbe[index]; + dev_dbg(dev, "%s: vbe read %d = %d (%s)\n", + __func__, index, reg16, vbe_name(index)); + *(u16 *)buf = reg16; + break; + default: +unhandled: + dev_dbg(dev, "%s: @0x%03x, count %d (unhandled)\n", + __func__, offset, count); + memset(buf, 0, count); + break; + } +} + +static void handle_edid_regs(struct mdev_state *mdev_state, u16 offset, + char *buf, u32 count, bool is_write) +{ + char *regs = (void *)&mdev_state->edid_regs; + + if (offset + count > sizeof(mdev_state->edid_regs)) + return; + if (count != 4) + return; + if (offset % 4) + return; + + if (is_write) { + switch (offset) { + case offsetof(struct vfio_region_gfx_edid, link_state): + case offsetof(struct vfio_region_gfx_edid, edid_size): + memcpy(regs + offset, buf, count); + break; + default: + /* read-only regs */ + break; + } + } else { + memcpy(buf, regs + offset, count); + } +} + +static void handle_edid_blob(struct mdev_state *mdev_state, u16 offset, + char *buf, u32 count, bool is_write) +{ + if (offset + count > mdev_state->edid_regs.edid_max_size) + return; + if (is_write) + memcpy(mdev_state->edid_blob + offset, buf, count); + else + memcpy(buf, mdev_state->edid_blob + offset, count); +} + +static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count, + loff_t pos, bool is_write) +{ + struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + struct device *dev = mdev_dev(mdev); + struct page *pg; + loff_t poff; + char *map; + int ret = 0; + + mutex_lock(&mdev_state->ops_lock); + + if (pos < MBOCHS_CONFIG_SPACE_SIZE) { + if (is_write) + 
handle_pci_cfg_write(mdev_state, pos, buf, count); + else + memcpy(buf, (mdev_state->vconfig + pos), count); + + } else if (pos >= MBOCHS_MMIO_BAR_OFFSET && + pos + count <= (MBOCHS_MMIO_BAR_OFFSET + + MBOCHS_MMIO_BAR_SIZE)) { + pos -= MBOCHS_MMIO_BAR_OFFSET; + if (is_write) + handle_mmio_write(mdev_state, pos, buf, count); + else + handle_mmio_read(mdev_state, pos, buf, count); + + } else if (pos >= MBOCHS_EDID_OFFSET && + pos + count <= (MBOCHS_EDID_OFFSET + + MBOCHS_EDID_SIZE)) { + pos -= MBOCHS_EDID_OFFSET; + if (pos < MBOCHS_EDID_BLOB_OFFSET) { + handle_edid_regs(mdev_state, pos, buf, count, is_write); + } else { + pos -= MBOCHS_EDID_BLOB_OFFSET; + handle_edid_blob(mdev_state, pos, buf, count, is_write); + } + + } else if (pos >= MBOCHS_MEMORY_BAR_OFFSET && + pos + count <= + MBOCHS_MEMORY_BAR_OFFSET + mdev_state->memsize) { + pos -= MBOCHS_MMIO_BAR_OFFSET; + poff = pos & ~PAGE_MASK; + pg = __mbochs_get_page(mdev_state, pos >> PAGE_SHIFT); + map = kmap(pg); + if (is_write) + memcpy(map + poff, buf, count); + else + memcpy(buf, map + poff, count); + kunmap(pg); + put_page(pg); + + } else { + dev_dbg(dev, "%s: %s @0x%llx (unhandled)\n", + __func__, is_write ? 
"WR" : "RD", pos); + ret = -1; + goto accessfailed; + } + + ret = count; + + +accessfailed: + mutex_unlock(&mdev_state->ops_lock); + + return ret; +} + +static int mbochs_reset(struct mdev_device *mdev) +{ + struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + u32 size64k = mdev_state->memsize / (64 * 1024); + int i; + + for (i = 0; i < ARRAY_SIZE(mdev_state->vbe); i++) + mdev_state->vbe[i] = 0; + mdev_state->vbe[VBE_DISPI_INDEX_ID] = VBE_DISPI_ID5; + mdev_state->vbe[VBE_DISPI_INDEX_VIDEO_MEMORY_64K] = size64k; + return 0; +} + +static int mbochs_create(struct kobject *kobj, struct mdev_device *mdev) +{ + const struct mbochs_type *type = mbochs_find_type(kobj); + struct device *dev = mdev_dev(mdev); + struct mdev_state *mdev_state; + + if (!type) + type = &mbochs_types[0]; + if (type->mbytes + mbochs_used_mbytes > max_mbytes) + return -ENOMEM; + + mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); + if (mdev_state == NULL) + return -ENOMEM; + + mdev_state->vconfig = kzalloc(MBOCHS_CONFIG_SPACE_SIZE, GFP_KERNEL); + if (mdev_state->vconfig == NULL) + goto err_mem; + + mdev_state->memsize = type->mbytes * 1024 * 1024; + mdev_state->pagecount = mdev_state->memsize >> PAGE_SHIFT; + mdev_state->pages = kcalloc(mdev_state->pagecount, + sizeof(struct page *), + GFP_KERNEL); + if (!mdev_state->pages) + goto err_mem; + + dev_info(dev, "%s: %s, %d MB, %ld pages\n", __func__, + kobj->name, type->mbytes, mdev_state->pagecount); + + mutex_init(&mdev_state->ops_lock); + mdev_state->mdev = mdev; + mdev_set_drvdata(mdev, mdev_state); + INIT_LIST_HEAD(&mdev_state->dmabufs); + mdev_state->next_id = 1; + + mdev_state->type = type; + mdev_state->edid_regs.max_xres = type->max_x; + mdev_state->edid_regs.max_yres = type->max_y; + mdev_state->edid_regs.edid_offset = MBOCHS_EDID_BLOB_OFFSET; + mdev_state->edid_regs.edid_max_size = sizeof(mdev_state->edid_blob); + mbochs_create_config_space(mdev_state); + mbochs_reset(mdev); + + mbochs_used_mbytes += type->mbytes; + return 0; 
+ +err_mem: + kfree(mdev_state->vconfig); + kfree(mdev_state); + return -ENOMEM; +} + +static int mbochs_remove(struct mdev_device *mdev) +{ + struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + + mbochs_used_mbytes -= mdev_state->type->mbytes; + mdev_set_drvdata(mdev, NULL); + kfree(mdev_state->pages); + kfree(mdev_state->vconfig); + kfree(mdev_state); + return 0; +} + +static ssize_t mbochs_read(struct mdev_device *mdev, char __user *buf, + size_t count, loff_t *ppos) +{ + unsigned int done = 0; + int ret; + + while (count) { + size_t filled; + + if (count >= 4 && !(*ppos % 4)) { + u32 val; + + ret = mdev_access(mdev, (char *)&val, sizeof(val), + *ppos, false); + if (ret <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 4; + } else if (count >= 2 && !(*ppos % 2)) { + u16 val; + + ret = mdev_access(mdev, (char *)&val, sizeof(val), + *ppos, false); + if (ret <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 2; + } else { + u8 val; + + ret = mdev_access(mdev, (char *)&val, sizeof(val), + *ppos, false); + if (ret <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 1; + } + + count -= filled; + done += filled; + *ppos += filled; + buf += filled; + } + + return done; + +read_err: + return -EFAULT; +} + +static ssize_t mbochs_write(struct mdev_device *mdev, const char __user *buf, + size_t count, loff_t *ppos) +{ + unsigned int done = 0; + int ret; + + while (count) { + size_t filled; + + if (count >= 4 && !(*ppos % 4)) { + u32 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + ret = mdev_access(mdev, (char *)&val, sizeof(val), + *ppos, true); + if (ret <= 0) + goto write_err; + + filled = 4; + } else if (count >= 2 && !(*ppos % 2)) { + u16 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + ret = mdev_access(mdev, (char *)&val, sizeof(val), + *ppos, true); + if (ret <= 0) + 
goto write_err; + + filled = 2; + } else { + u8 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + ret = mdev_access(mdev, (char *)&val, sizeof(val), + *ppos, true); + if (ret <= 0) + goto write_err; + + filled = 1; + } + count -= filled; + done += filled; + *ppos += filled; + buf += filled; + } + + return done; +write_err: + return -EFAULT; +} + +static struct page *__mbochs_get_page(struct mdev_state *mdev_state, + pgoff_t pgoff) +{ + WARN_ON(!mutex_is_locked(&mdev_state->ops_lock)); + + if (!mdev_state->pages[pgoff]) { + mdev_state->pages[pgoff] = + alloc_pages(GFP_HIGHUSER | __GFP_ZERO, 0); + if (!mdev_state->pages[pgoff]) + return NULL; + } + + get_page(mdev_state->pages[pgoff]); + return mdev_state->pages[pgoff]; +} + +static struct page *mbochs_get_page(struct mdev_state *mdev_state, + pgoff_t pgoff) +{ + struct page *page; + + if (WARN_ON(pgoff >= mdev_state->pagecount)) + return NULL; + + mutex_lock(&mdev_state->ops_lock); + page = __mbochs_get_page(mdev_state, pgoff); + mutex_unlock(&mdev_state->ops_lock); + + return page; +} + +static void mbochs_put_pages(struct mdev_state *mdev_state) +{ + struct device *dev = mdev_dev(mdev_state->mdev); + int i, count = 0; + + WARN_ON(!mutex_is_locked(&mdev_state->ops_lock)); + + for (i = 0; i < mdev_state->pagecount; i++) { + if (!mdev_state->pages[i]) + continue; + put_page(mdev_state->pages[i]); + mdev_state->pages[i] = NULL; + count++; + } + dev_dbg(dev, "%s: %d pages released\n", __func__, count); +} + +static vm_fault_t mbochs_region_vm_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct mdev_state *mdev_state = vma->vm_private_data; + pgoff_t page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT; + + if (page_offset >= mdev_state->pagecount) + return VM_FAULT_SIGBUS; + + vmf->page = mbochs_get_page(mdev_state, page_offset); + if (!vmf->page) + return VM_FAULT_SIGBUS; + + return 0; +} + +static const struct vm_operations_struct mbochs_region_vm_ops = { + 
.fault = mbochs_region_vm_fault, +}; +/* + * mmap handler: accepts only shared mappings whose page offset equals the + * memory BAR offset and whose length does not exceed memsize. Pages are + * supplied on demand by mbochs_region_vm_fault(). + */ +static int mbochs_mmap(struct mdev_device *mdev, struct vm_area_struct *vma) +{ + struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + + if (vma->vm_pgoff != MBOCHS_MEMORY_BAR_OFFSET >> PAGE_SHIFT) + return -EINVAL; + if (vma->vm_end < vma->vm_start) + return -EINVAL; + if (vma->vm_end - vma->vm_start > mdev_state->memsize) + return -EINVAL; + if ((vma->vm_flags & VM_SHARED) == 0) + return -EINVAL; + + vma->vm_ops = &mbochs_region_vm_ops; + vma->vm_private_data = mdev_state; + return 0; +} +/* + * Page fault handler for dma-buf mappings: hands out + * dmabuf->pages[vmf->pgoff] with an extra reference, or warns and raises + * SIGBUS when pgoff is out of range. + */ +static vm_fault_t mbochs_dmabuf_vm_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct mbochs_dmabuf *dmabuf = vma->vm_private_data; + + if (WARN_ON(vmf->pgoff >= dmabuf->pagecount)) + return VM_FAULT_SIGBUS; + + vmf->page = dmabuf->pages[vmf->pgoff]; + get_page(vmf->page); + return 0; +} + +static const struct vm_operations_struct mbochs_dmabuf_vm_ops = { + .fault = mbochs_dmabuf_vm_fault, +}; +/* dma-buf mmap callback: only shared mappings are accepted. */ +static int mbochs_mmap_dmabuf(struct dma_buf *buf, struct vm_area_struct *vma) +{ + struct mbochs_dmabuf *dmabuf = buf->priv; + struct device *dev = mdev_dev(dmabuf->mdev_state->mdev); + + dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id); + + if ((vma->vm_flags & VM_SHARED) == 0) + return -EINVAL; + + vma->vm_ops = &mbochs_dmabuf_vm_ops; + vma->vm_private_data = dmabuf; + return 0; +} +/* + * Debug helper: log the dmabuf id, fourcc characters ('-' when the format is + * 0), geometry, offset, size and page count, tagged with prefix. + */ +static void mbochs_print_dmabuf(struct mbochs_dmabuf *dmabuf, + const char *prefix) +{ + struct device *dev = mdev_dev(dmabuf->mdev_state->mdev); + u32 fourcc = dmabuf->mode.drm_format; + + dev_dbg(dev, "%s/%d: %c%c%c%c, %dx%d, stride %d, off 0x%llx, size 0x%llx, pages %ld\n", + prefix, dmabuf->id, + fourcc ? ((fourcc >> 0) & 0xff) : '-', + fourcc ? ((fourcc >> 8) & 0xff) : '-', + fourcc ? ((fourcc >> 16) & 0xff) : '-', + fourcc ? 
((fourcc >> 24) & 0xff) : '-', + dmabuf->mode.width, dmabuf->mode.height, dmabuf->mode.stride, + dmabuf->mode.offset, dmabuf->mode.size, dmabuf->pagecount); +} +/* + * dma-buf map callback: build an sg_table over dmabuf->pages and DMA-map it + * for the attaching device. Every failure is reported as ERR_PTR(-ENOMEM). + */ +static struct sg_table *mbochs_map_dmabuf(struct dma_buf_attachment *at, + enum dma_data_direction direction) +{ + struct mbochs_dmabuf *dmabuf = at->dmabuf->priv; + struct device *dev = mdev_dev(dmabuf->mdev_state->mdev); + struct sg_table *sg; + + dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id); + + sg = kzalloc(sizeof(*sg), GFP_KERNEL); + if (!sg) + goto err1; + if (sg_alloc_table_from_pages(sg, dmabuf->pages, dmabuf->pagecount, + 0, dmabuf->mode.size, GFP_KERNEL) < 0) + goto err2; + if (dma_map_sgtable(at->dev, sg, direction, 0)) + goto err3; + + return sg; + +err3: + sg_free_table(sg); +err2: + kfree(sg); +err1: + return ERR_PTR(-ENOMEM); +} +/* + * dma-buf unmap callback: undo mbochs_map_dmabuf() by unmapping the table, + * freeing its entries and freeing the sg_table itself. + */ +static void mbochs_unmap_dmabuf(struct dma_buf_attachment *at, + struct sg_table *sg, + enum dma_data_direction direction) +{ + struct mbochs_dmabuf *dmabuf = at->dmabuf->priv; + struct device *dev = mdev_dev(dmabuf->mdev_state->mdev); + + dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id); + + dma_unmap_sgtable(at->dev, sg, direction, 0); + sg_free_table(sg); + kfree(sg); +} +/* + * dma-buf release callback: drop one reference on each page of the dmabuf + * and clear dmabuf->buf under ops_lock. The descriptor itself is freed here + * only when it has already been marked unlinked. + */ +static void mbochs_release_dmabuf(struct dma_buf *buf) +{ + struct mbochs_dmabuf *dmabuf = buf->priv; + struct mdev_state *mdev_state = dmabuf->mdev_state; + struct device *dev = mdev_dev(mdev_state->mdev); + pgoff_t pg; + + dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id); + + for (pg = 0; pg < dmabuf->pagecount; pg++) + put_page(dmabuf->pages[pg]); + + mutex_lock(&mdev_state->ops_lock); + dmabuf->buf = NULL; + if (dmabuf->unlinked) + kfree(dmabuf); + mutex_unlock(&mdev_state->ops_lock); +} + +static struct dma_buf_ops mbochs_dmabuf_ops = { + .map_dma_buf = mbochs_map_dmabuf, + .unmap_dma_buf = mbochs_unmap_dmabuf, + .release = mbochs_release_dmabuf, + .mmap = mbochs_mmap_dmabuf, +}; +/* + * Allocate a dmabuf descriptor for *mode, take a reference on every page the + * mode covers and add the descriptor to mdev_state->dmabufs. Returns NULL on + * failure. Caller must hold ops_lock. + */ +static struct mbochs_dmabuf *mbochs_dmabuf_alloc(struct mdev_state *mdev_state, + struct mbochs_mode 
*mode) +{ + struct mbochs_dmabuf *dmabuf; + pgoff_t page_offset, pg; + + WARN_ON(!mutex_is_locked(&mdev_state->ops_lock)); + + dmabuf = kzalloc(sizeof(struct mbochs_dmabuf), GFP_KERNEL); + if (!dmabuf) + return NULL; + + dmabuf->mode = *mode; + dmabuf->id = mdev_state->next_id++; + dmabuf->pagecount = DIV_ROUND_UP(mode->size, PAGE_SIZE); + dmabuf->pages = kcalloc(dmabuf->pagecount, sizeof(struct page *), + GFP_KERNEL); + if (!dmabuf->pages) + goto err_free_dmabuf; + + page_offset = dmabuf->mode.offset >> PAGE_SHIFT; + for (pg = 0; pg < dmabuf->pagecount; pg++) { + dmabuf->pages[pg] = __mbochs_get_page(mdev_state, + page_offset + pg); + if (!dmabuf->pages[pg]) + goto err_free_pages; + } + + dmabuf->mdev_state = mdev_state; + list_add(&dmabuf->next, &mdev_state->dmabufs); + + mbochs_print_dmabuf(dmabuf, __func__); + return dmabuf; + +err_free_pages: + while (pg > 0) + put_page(dmabuf->pages[--pg]); + kfree(dmabuf->pages); +err_free_dmabuf: + kfree(dmabuf); + return NULL; +} +/* + * Return the first entry on mdev_state->dmabufs whose mode equals *mode, or + * NULL if there is none. Caller must hold ops_lock. + */ +static struct mbochs_dmabuf * +mbochs_dmabuf_find_by_mode(struct mdev_state *mdev_state, + struct mbochs_mode *mode) +{ + struct mbochs_dmabuf *dmabuf; + + WARN_ON(!mutex_is_locked(&mdev_state->ops_lock)); + + list_for_each_entry(dmabuf, &mdev_state->dmabufs, next) + if (mbochs_modes_equal(&dmabuf->mode, mode)) + return dmabuf; + + return NULL; +} +/* + * Return the entry on mdev_state->dmabufs with the given id, or NULL if + * there is none. Caller must hold ops_lock. + */ +static struct mbochs_dmabuf * +mbochs_dmabuf_find_by_id(struct mdev_state *mdev_state, u32 id) +{ + struct mbochs_dmabuf *dmabuf; + + WARN_ON(!mutex_is_locked(&mdev_state->ops_lock)); + + list_for_each_entry(dmabuf, &mdev_state->dmabufs, next) + if (dmabuf->id == id) + return dmabuf; + + return NULL; +} +/* + * Export the descriptor as a struct dma_buf and remember it in dmabuf->buf. + * Returns -EINVAL when the mode offset is not page-aligned, or the error from + * dma_buf_export(). Caller must hold ops_lock. + */ +static int mbochs_dmabuf_export(struct mbochs_dmabuf *dmabuf) +{ + struct mdev_state *mdev_state = dmabuf->mdev_state; + struct device *dev = mdev_dev(mdev_state->mdev); + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + struct dma_buf *buf; + + WARN_ON(!mutex_is_locked(&mdev_state->ops_lock)); + + if (!IS_ALIGNED(dmabuf->mode.offset, PAGE_SIZE)) { + 
dev_info_ratelimited(dev, "%s: framebuffer not page-aligned\n", + __func__); + return -EINVAL; + } + + exp_info.ops = &mbochs_dmabuf_ops; + exp_info.size = dmabuf->mode.size; + exp_info.priv = dmabuf; + + buf = dma_buf_export(&exp_info); + if (IS_ERR(buf)) { + dev_info_ratelimited(dev, "%s: dma_buf_export failed: %ld\n", + __func__, PTR_ERR(buf)); + return PTR_ERR(buf); + } + + dmabuf->buf = buf; + dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id); + return 0; +} +/* + * Fill in offset, size and flags for the region selected by + * region_info->index. Only BAR0 is flagged as mmap-capable. The EDID region + * additionally fills the extended part of *ext with a + * VFIO_REGION_INFO_CAP_TYPE capability (GFX / EDID subtype). + * Returns -EINVAL if there is no driver state or the index is out of range. + */ +static int mbochs_get_region_info(struct mdev_device *mdev, + struct vfio_region_info_ext *ext) +{ + struct vfio_region_info *region_info = &ext->base; + struct mdev_state *mdev_state; + + mdev_state = mdev_get_drvdata(mdev); + if (!mdev_state) + return -EINVAL; + + if (region_info->index >= MBOCHS_NUM_REGIONS) + return -EINVAL; + + switch (region_info->index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + region_info->offset = 0; + region_info->size = MBOCHS_CONFIG_SPACE_SIZE; + region_info->flags = (VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE); + break; + case VFIO_PCI_BAR0_REGION_INDEX: + region_info->offset = MBOCHS_MEMORY_BAR_OFFSET; + region_info->size = mdev_state->memsize; + region_info->flags = (VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_MMAP); + break; + case VFIO_PCI_BAR2_REGION_INDEX: + region_info->offset = MBOCHS_MMIO_BAR_OFFSET; + region_info->size = MBOCHS_MMIO_BAR_SIZE; + region_info->flags = (VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE); + break; + case MBOCHS_EDID_REGION_INDEX: + ext->base.argsz = sizeof(*ext); + ext->base.offset = MBOCHS_EDID_OFFSET; + ext->base.size = MBOCHS_EDID_SIZE; + ext->base.flags = (VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_CAPS); + ext->base.cap_offset = offsetof(typeof(*ext), type); + ext->type.header.id = VFIO_REGION_INFO_CAP_TYPE; + ext->type.header.version = 1; + ext->type.header.next = 0; + ext->type.type = VFIO_REGION_TYPE_GFX; + ext->type.subtype 
= VFIO_REGION_SUBTYPE_GFX_EDID; + break; + default: + region_info->size = 0; + region_info->offset = 0; + region_info->flags = 0; + } + + return 0; +} + +/* Report IRQ info: the count is always set to 0. */ +static int mbochs_get_irq_info(struct mdev_device *mdev, + struct vfio_irq_info *irq_info) +{ + irq_info->count = 0; + return 0; +} + +/* Fill in the device flags and the region/IRQ counts for the device info. */ +static int mbochs_get_device_info(struct mdev_device *mdev, + struct vfio_device_info *dev_info) +{ + dev_info->flags = VFIO_DEVICE_FLAGS_PCI; + dev_info->num_regions = MBOCHS_NUM_REGIONS; + dev_info->num_irqs = VFIO_PCI_NUM_IRQS; + return 0; +} + +/* + * Describe the current plane as a dma-buf. + * + * A probe request succeeds only for exactly PROBE | DMABUF. Otherwise the + * flags must be exactly DMABUF; the framebuffer mode is checked (primary + * plane only) and a dmabuf descriptor matching that mode is looked up or, + * if none exists yet, allocated. Its parameters are copied into *plane. + * When the framebuffer check fails the plane fields are zeroed and 0 is + * still returned. + * + * Returns 0 on success, -EINVAL for unsupported flags and -ENOMEM when a + * descriptor cannot be allocated. + */ +static int mbochs_query_gfx_plane(struct mdev_device *mdev, + struct vfio_device_gfx_plane_info *plane) +{ + struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + struct device *dev = mdev_dev(mdev); + struct mbochs_dmabuf *dmabuf; + struct mbochs_mode mode; + int ret; + + if (plane->flags & VFIO_GFX_PLANE_TYPE_PROBE) { + if (plane->flags == (VFIO_GFX_PLANE_TYPE_PROBE | + VFIO_GFX_PLANE_TYPE_DMABUF)) + return 0; + return -EINVAL; + } + + if (plane->flags != VFIO_GFX_PLANE_TYPE_DMABUF) + return -EINVAL; + + plane->drm_format_mod = 0; + plane->x_pos = 0; + plane->y_pos = 0; + plane->x_hot = 0; + plane->y_hot = 0; + + mutex_lock(&mdev_state->ops_lock); + + ret = -EINVAL; + if (plane->drm_plane_type == DRM_PLANE_TYPE_PRIMARY) + ret = mbochs_check_framebuffer(mdev_state, &mode); + if (ret < 0) { + plane->drm_format = 0; + plane->width = 0; + plane->height = 0; + plane->stride = 0; + plane->size = 0; + plane->dmabuf_id = 0; + goto done; + } + + /* reuse the descriptor for this mode, or create it on first use */ + dmabuf = mbochs_dmabuf_find_by_mode(mdev_state, &mode); + if (!dmabuf) + dmabuf = mbochs_dmabuf_alloc(mdev_state, &mode); + if (!dmabuf) { + mutex_unlock(&mdev_state->ops_lock); + return -ENOMEM; + } + + plane->drm_format = dmabuf->mode.drm_format; + plane->width = dmabuf->mode.width; + plane->height = dmabuf->mode.height; + plane->stride = dmabuf->mode.stride; + plane->size = dmabuf->mode.size; + plane->dmabuf_id = dmabuf->id; + +done: + if (plane->drm_plane_type == DRM_PLANE_TYPE_PRIMARY && + 
mdev_state->active_id != plane->dmabuf_id) { + dev_dbg(dev, "%s: primary: %d => %d\n", __func__, + mdev_state->active_id, plane->dmabuf_id); + mdev_state->active_id = plane->dmabuf_id; + } + mutex_unlock(&mdev_state->ops_lock); + return 0; +} +/* + * Look up the dmabuf descriptor with the given id, export it if it has no + * dma-buf yet, and return the result of dma_buf_fd() for it. + * Returns -ENOENT for an unknown id and -EINVAL when no dma-buf exists after + * the export attempt. + * NOTE(review): dmabuf->buf is read after ops_lock has been dropped; confirm + * the descriptor cannot be released concurrently. + */ +static int mbochs_get_gfx_dmabuf(struct mdev_device *mdev, + u32 id) +{ + struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + struct mbochs_dmabuf *dmabuf; + + mutex_lock(&mdev_state->ops_lock); + + dmabuf = mbochs_dmabuf_find_by_id(mdev_state, id); + if (!dmabuf) { + mutex_unlock(&mdev_state->ops_lock); + return -ENOENT; + } + + if (!dmabuf->buf) + mbochs_dmabuf_export(dmabuf); + + mutex_unlock(&mdev_state->ops_lock); + + if (!dmabuf->buf) + return -EINVAL; + + return dma_buf_fd(dmabuf->buf, 0); +} +/* + * ioctl dispatcher. The info-style requests copy the leading part of their + * argument structure from userspace, validate argsz, call the matching + * helper and copy the result back. VFIO_DEVICE_SET_IRQS is rejected with + * -EINVAL and unknown commands return -ENOTTY. + */ +static long mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd, + unsigned long arg) +{ + int ret = 0; + unsigned long minsz, outsz; + + switch (cmd) { + case VFIO_DEVICE_GET_INFO: + { + struct vfio_device_info info; + + minsz = offsetofend(struct vfio_device_info, num_irqs); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + ret = mbochs_get_device_info(mdev, &info); + if (ret) + return ret; + + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + + return 0; + } + case VFIO_DEVICE_GET_REGION_INFO: + { + struct vfio_region_info_ext info; + + minsz = offsetofend(typeof(info), base.offset); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + outsz = info.base.argsz; + if (outsz < minsz) + return -EINVAL; + if (outsz > sizeof(info)) + return -EINVAL; + + ret = mbochs_get_region_info(mdev, &info); + if (ret) + return ret; + + if (copy_to_user((void __user *)arg, &info, outsz)) + return -EFAULT; + + return 0; + } + + case VFIO_DEVICE_GET_IRQ_INFO: + { + struct vfio_irq_info info; + + minsz = offsetofend(struct vfio_irq_info, count); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return 
-EFAULT; + + if ((info.argsz < minsz) || + (info.index >= VFIO_PCI_NUM_IRQS)) + return -EINVAL; + + ret = mbochs_get_irq_info(mdev, &info); + if (ret) + return ret; + + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + + return 0; + } + + case VFIO_DEVICE_QUERY_GFX_PLANE: + { + struct vfio_device_gfx_plane_info plane; + + minsz = offsetofend(struct vfio_device_gfx_plane_info, + region_index); + + if (copy_from_user(&plane, (void __user *)arg, minsz)) + return -EFAULT; + + if (plane.argsz < minsz) + return -EINVAL; + + ret = mbochs_query_gfx_plane(mdev, &plane); + if (ret) + return ret; + + if (copy_to_user((void __user *)arg, &plane, minsz)) + return -EFAULT; + + return 0; + } + + case VFIO_DEVICE_GET_GFX_DMABUF: + { + u32 dmabuf_id; + + if (get_user(dmabuf_id, (__u32 __user *)arg)) + return -EFAULT; + + return mbochs_get_gfx_dmabuf(mdev, dmabuf_id); + } + + case VFIO_DEVICE_SET_IRQS: + return -EINVAL; + + case VFIO_DEVICE_RESET: + return mbochs_reset(mdev); + } + return -ENOTTY; +} +/* Open handler: take a reference on this module, -ENODEV if that fails. */ +static int mbochs_open(struct mdev_device *mdev) +{ + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + return 0; +} +/* + * Release handler: unlink every dmabuf descriptor. Descriptors that still + * have a dma-buf attached are only marked unlinked (they are freed in + * mbochs_release_dmabuf()); the others are freed right away. Then drop the + * page references and the module reference taken in mbochs_open(). + */ +static void mbochs_close(struct mdev_device *mdev) +{ + struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + struct mbochs_dmabuf *dmabuf, *tmp; + + mutex_lock(&mdev_state->ops_lock); + + list_for_each_entry_safe(dmabuf, tmp, &mdev_state->dmabufs, next) { + list_del(&dmabuf->next); + if (dmabuf->buf) { + /* free in mbochs_release_dmabuf() */ + dmabuf->unlinked = true; + } else { + kfree(dmabuf); + } + } + mbochs_put_pages(mdev_state); + + mutex_unlock(&mdev_state->ops_lock); + module_put(THIS_MODULE); +} +/* sysfs "memory" attribute: prints the device type's mbytes value as "<n> MB". */ +static ssize_t +memory_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct mdev_device *mdev = mdev_from_dev(dev); + struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + + return sprintf(buf, "%d MB\n", mdev_state->type->mbytes); +} +static DEVICE_ATTR_RO(memory); + +static struct attribute 
*mdev_dev_attrs[] = { + &dev_attr_memory.attr, + NULL, +}; + +static const struct attribute_group mdev_dev_group = { + .name = "vendor", + .attrs = mdev_dev_attrs, +}; + +const struct attribute_group *mdev_dev_groups[] = { + &mdev_dev_group, + NULL, +}; + +/* Type attribute "name": prints the name of the type's kobject. */ +static ssize_t +name_show(struct kobject *kobj, struct device *dev, char *buf) +{ + return sprintf(buf, "%s\n", kobj->name); +} +MDEV_TYPE_ATTR_RO(name); + +/* + * Type attribute "description": prints the type's memory size, or 0 MB when + * the kobject does not match a known type. + */ +static ssize_t +description_show(struct kobject *kobj, struct device *dev, char *buf) +{ + const struct mbochs_type *type = mbochs_find_type(kobj); + + return sprintf(buf, "virtual display, %d MB video memory\n", + type ? type->mbytes : 0); +} +MDEV_TYPE_ATTR_RO(description); + +/* + * Type attribute "available_instances": how many more devices of this type + * fit into the remaining memory budget (max_mbytes - mbochs_used_mbytes). + * Prints 0 when the kobject does not match a known type. + */ +static ssize_t +available_instances_show(struct kobject *kobj, struct device *dev, char *buf) +{ + const struct mbochs_type *type = mbochs_find_type(kobj); + int count = 0; + + /* unknown type: report no instances instead of dereferencing NULL */ + if (type) + count = (max_mbytes - mbochs_used_mbytes) / type->mbytes; + + return sprintf(buf, "%d\n", count); +} +MDEV_TYPE_ATTR_RO(available_instances); + +/* Type attribute "device_api": prints VFIO_DEVICE_API_PCI_STRING. */ +static ssize_t device_api_show(struct kobject *kobj, struct device *dev, + char *buf) +{ + return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); +} +MDEV_TYPE_ATTR_RO(device_api); + +static struct attribute *mdev_types_attrs[] = { + &mdev_type_attr_name.attr, + &mdev_type_attr_description.attr, + &mdev_type_attr_device_api.attr, + &mdev_type_attr_available_instances.attr, + NULL, +}; + +static struct attribute_group mdev_type_group1 = { + .name = MBOCHS_TYPE_1, + .attrs = mdev_types_attrs, +}; + +static struct attribute_group mdev_type_group2 = { + .name = MBOCHS_TYPE_2, + .attrs = mdev_types_attrs, +}; + +static struct attribute_group mdev_type_group3 = { + .name = MBOCHS_TYPE_3, + .attrs = mdev_types_attrs, +}; + +static struct attribute_group *mdev_type_groups[] = { + &mdev_type_group1, + &mdev_type_group2, + &mdev_type_group3, + NULL, +}; + +static const struct mdev_parent_ops mdev_fops = { + .owner = THIS_MODULE, + .mdev_attr_groups = mdev_dev_groups, + .supported_type_groups 
= mdev_type_groups, + .create = mbochs_create, + .remove = mbochs_remove, + .open = mbochs_open, + .release = mbochs_close, + .read = mbochs_read, + .write = mbochs_write, + .ioctl = mbochs_ioctl, + .mmap = mbochs_mmap, +}; + +static const struct file_operations vd_fops = { + .owner = THIS_MODULE, +}; + +static void mbochs_device_release(struct device *dev) +{ + /* nothing */ +} + +static int __init mbochs_dev_init(void) +{ + int ret = 0; + + ret = alloc_chrdev_region(&mbochs_devt, 0, MINORMASK + 1, MBOCHS_NAME); + if (ret < 0) { + pr_err("Error: failed to register mbochs_dev, err: %d\n", ret); + return ret; + } + cdev_init(&mbochs_cdev, &vd_fops); + cdev_add(&mbochs_cdev, mbochs_devt, MINORMASK + 1); + pr_info("%s: major %d\n", __func__, MAJOR(mbochs_devt)); + + mbochs_class = class_create(THIS_MODULE, MBOCHS_CLASS_NAME); + if (IS_ERR(mbochs_class)) { + pr_err("Error: failed to register mbochs_dev class\n"); + ret = PTR_ERR(mbochs_class); + goto failed1; + } + mbochs_dev.class = mbochs_class; + mbochs_dev.release = mbochs_device_release; + dev_set_name(&mbochs_dev, "%s", MBOCHS_NAME); + + ret = device_register(&mbochs_dev); + if (ret) + goto failed2; + + ret = mdev_register_device(&mbochs_dev, &mdev_fops); + if (ret) + goto failed3; + + return 0; + +failed3: + device_unregister(&mbochs_dev); +failed2: + class_destroy(mbochs_class); +failed1: + cdev_del(&mbochs_cdev); + unregister_chrdev_region(mbochs_devt, MINORMASK + 1); + return ret; +} + +static void __exit mbochs_dev_exit(void) +{ + mbochs_dev.bus = NULL; + mdev_unregister_device(&mbochs_dev); + + device_unregister(&mbochs_dev); + cdev_del(&mbochs_cdev); + unregister_chrdev_region(mbochs_devt, MINORMASK + 1); + class_destroy(mbochs_class); + mbochs_class = NULL; +} + +module_init(mbochs_dev_init) +module_exit(mbochs_dev_exit) diff --git a/samples/vfio-mdev/mdpy-defs.h b/samples/vfio-mdev/mdpy-defs.h new file mode 100644 index 000000000..961c55ec3 --- /dev/null +++ b/samples/vfio-mdev/mdpy-defs.h @@ -0,0 +1,22 
@@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Simple pci display device. + * + * Framebuffer memory is pci bar 0. + * Configuration (read-only) is in pci config space. + * Format field uses drm fourcc codes. + * ATM only DRM_FORMAT_XRGB8888 is supported. + */ + +/* pci ids */ +#define MDPY_PCI_VENDOR_ID PCI_VENDOR_ID_REDHAT +#define MDPY_PCI_DEVICE_ID 0x000f +#define MDPY_PCI_SUBVENDOR_ID PCI_SUBVENDOR_ID_REDHAT_QUMRANET +#define MDPY_PCI_SUBDEVICE_ID PCI_SUBDEVICE_ID_QEMU + +/* + * pci cfg space offsets for fb config (dword). + * The format, width and height fields sit at fixed offsets from + * MDPY_VENDORCAP_OFFSET; MDPY_VENDORCAP_SIZE is the size of that block. + */ +#define MDPY_VENDORCAP_OFFSET 0x40 +#define MDPY_VENDORCAP_SIZE 0x10 +#define MDPY_FORMAT_OFFSET (MDPY_VENDORCAP_OFFSET + 0x04) +#define MDPY_WIDTH_OFFSET (MDPY_VENDORCAP_OFFSET + 0x08) +#define MDPY_HEIGHT_OFFSET (MDPY_VENDORCAP_OFFSET + 0x0c) diff --git a/samples/vfio-mdev/mdpy-fb.c b/samples/vfio-mdev/mdpy-fb.c new file mode 100644 index 000000000..4eb7aa11c --- /dev/null +++ b/samples/vfio-mdev/mdpy-fb.c @@ -0,0 +1,243 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Framebuffer driver for mdpy (mediated virtual pci display device). + * + * See mdpy-defs.h for device specs + * + * (c) Gerd Hoffmann <kraxel@redhat.com> + * + * Using some code snippets from simplefb and cirrusfb. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ +#include <linux/errno.h> +#include <linux/fb.h> +#include <linux/io.h> +#include <linux/pci.h> +#include <linux/module.h> +#include <drm/drm_fourcc.h> +#include "mdpy-defs.h" + +static const struct fb_fix_screeninfo mdpy_fb_fix = { + .id = "mdpy-fb", + .type = FB_TYPE_PACKED_PIXELS, + .visual = FB_VISUAL_TRUECOLOR, + .accel = FB_ACCEL_NONE, +}; + +static const struct fb_var_screeninfo mdpy_fb_var = { + .height = -1, + .width = -1, + .activate = FB_ACTIVATE_NOW, + .vmode = FB_VMODE_NONINTERLACED, + + .bits_per_pixel = 32, + .transp.offset = 24, + .red.offset = 16, + .green.offset = 8, + .blue.offset = 0, + .transp.length = 8, + .red.length = 8, + .green.length = 8, + .blue.length = 8, +}; + +#define PSEUDO_PALETTE_SIZE 16 + +struct mdpy_fb_par { + u32 palette[PSEUDO_PALETTE_SIZE]; +}; +/* + * fb_setcolreg hook: scale the colour components down to the bitfield + * lengths in info->var, pack them at the configured offsets and store the + * result in pseudo-palette entry regno. If a transparency field exists all of + * its bits are set. Returns -EINVAL for regno >= PSEUDO_PALETTE_SIZE. + */ +static int mdpy_fb_setcolreg(u_int regno, u_int red, u_int green, u_int blue, + u_int transp, struct fb_info *info) +{ + u32 *pal = info->pseudo_palette; + u32 cr = red >> (16 - info->var.red.length); + u32 cg = green >> (16 - info->var.green.length); + u32 cb = blue >> (16 - info->var.blue.length); + u32 value, mask; + + if (regno >= PSEUDO_PALETTE_SIZE) + return -EINVAL; + + value = (cr << info->var.red.offset) | + (cg << info->var.green.offset) | + (cb << info->var.blue.offset); + if (info->var.transp.length > 0) { + mask = (1 << info->var.transp.length) - 1; + mask <<= info->var.transp.offset; + value |= mask; + } + pal[regno] = value; + + return 0; +} +/* fb_destroy hook: unmap the framebuffer mapping if one is set. */ +static void mdpy_fb_destroy(struct fb_info *info) +{ + if (info->screen_base) + iounmap(info->screen_base); +} + +static const struct fb_ops mdpy_fb_ops = { + .owner = THIS_MODULE, + .fb_destroy = mdpy_fb_destroy, + .fb_setcolreg = mdpy_fb_setcolreg, + .fb_fillrect = cfb_fillrect, + .fb_copyarea = cfb_copyarea, + .fb_imageblit = cfb_imageblit, +}; +/* + * PCI probe: enable the device, claim its regions, read format/width/height + * from config space and reject anything but XRGB8888 with both dimensions in + * 100..10000, then map BAR 0 and register a framebuffer over it. + * Returns 0 on success or a negative errno after undoing the earlier steps. + */ +static int mdpy_fb_probe(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + struct fb_info *info; + struct mdpy_fb_par *par; + u32 format, width, height; + int 
ret; + + ret = pci_enable_device(pdev); + if (ret < 0) + return ret; + + ret = pci_request_regions(pdev, "mdpy-fb"); + if (ret < 0) + goto err_disable_dev; + /* the display parameters are published in PCI config space */ + pci_read_config_dword(pdev, MDPY_FORMAT_OFFSET, &format); + pci_read_config_dword(pdev, MDPY_WIDTH_OFFSET, &width); + pci_read_config_dword(pdev, MDPY_HEIGHT_OFFSET, &height); + if (format != DRM_FORMAT_XRGB8888) { + pci_err(pdev, "format mismatch (0x%x != 0x%x)\n", + format, DRM_FORMAT_XRGB8888); + ret = -EINVAL; + goto err_release_regions; + } + if (width < 100 || width > 10000) { + pci_err(pdev, "width (%d) out of range\n", width); + ret = -EINVAL; + goto err_release_regions; + } + if (height < 100 || height > 10000) { + pci_err(pdev, "height (%d) out of range\n", height); + ret = -EINVAL; + goto err_release_regions; + } + pci_info(pdev, "mdpy found: %dx%d framebuffer\n", + width, height); + /* fb_info with a struct mdpy_fb_par (the pseudo palette) as private data */ + info = framebuffer_alloc(sizeof(struct mdpy_fb_par), &pdev->dev); + if (!info) { + ret = -ENOMEM; + goto err_release_regions; + } + pci_set_drvdata(pdev, info); + par = info->par; + /* BAR 0 is the framebuffer; 4 bytes per pixel */ + info->fix = mdpy_fb_fix; + info->fix.smem_start = pci_resource_start(pdev, 0); + info->fix.smem_len = pci_resource_len(pdev, 0); + info->fix.line_length = width * 4; + + info->var = mdpy_fb_var; + info->var.xres = width; + info->var.yres = height; + info->var.xres_virtual = width; + info->var.yres_virtual = height; + /* map the whole BAR */ + info->screen_size = info->fix.smem_len; + info->screen_base = ioremap(info->fix.smem_start, + info->screen_size); + if (!info->screen_base) { + pci_err(pdev, "ioremap(pcibar) failed\n"); + ret = -EIO; + goto err_release_fb; + } + + info->apertures = alloc_apertures(1); + if (!info->apertures) { + ret = -ENOMEM; + goto err_unmap; + } + info->apertures->ranges[0].base = info->fix.smem_start; + info->apertures->ranges[0].size = info->fix.smem_len; + + info->fbops = &mdpy_fb_ops; + info->flags = FBINFO_DEFAULT; + info->pseudo_palette = par->palette; + + ret = register_framebuffer(info); + if (ret < 0) { + pci_err(pdev, 
"mdpy-fb device register failed: %d\n", ret); + goto err_unmap; + } + + pci_info(pdev, "fb%d registered\n", info->node); + return 0; + +err_unmap: + iounmap(info->screen_base); + +err_release_fb: + framebuffer_release(info); + +err_release_regions: + pci_release_regions(pdev); + +err_disable_dev: + pci_disable_device(pdev); + + return ret; +} +/* + * PCI remove: unregister the framebuffer, then unmap, release the fb_info, + * release the PCI regions and disable the device. + * NOTE(review): mdpy_fb_destroy() also calls iounmap() on screen_base; confirm + * the mapping cannot be unmapped twice on this path. + */ +static void mdpy_fb_remove(struct pci_dev *pdev) +{ + struct fb_info *info = pci_get_drvdata(pdev); + + unregister_framebuffer(info); + iounmap(info->screen_base); + framebuffer_release(info); + pci_release_regions(pdev); + pci_disable_device(pdev); +} + +static struct pci_device_id mdpy_fb_pci_table[] = { + { + .vendor = MDPY_PCI_VENDOR_ID, + .device = MDPY_PCI_DEVICE_ID, + .subvendor = MDPY_PCI_SUBVENDOR_ID, + .subdevice = MDPY_PCI_SUBDEVICE_ID, + }, { + /* end of list */ + } +}; + +static struct pci_driver mdpy_fb_pci_driver = { + .name = "mdpy-fb", + .id_table = mdpy_fb_pci_table, + .probe = mdpy_fb_probe, + .remove = mdpy_fb_remove, +}; +/* Module init: register the PCI driver and return its result. */ +static int __init mdpy_fb_init(void) +{ + int ret; + + ret = pci_register_driver(&mdpy_fb_pci_driver); + if (ret) + return ret; + + return 0; +} + +module_init(mdpy_fb_init); + +MODULE_DEVICE_TABLE(pci, mdpy_fb_pci_table); +MODULE_LICENSE("GPL v2"); diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c new file mode 100644 index 000000000..9894693f3 --- /dev/null +++ b/samples/vfio-mdev/mdpy.c @@ -0,0 +1,807 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Mediated virtual PCI display host device driver + * + * See mdpy-defs.h for device specs + * + * (c) Gerd Hoffmann <kraxel@redhat.com> + * + * based on mtty driver which is: + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#include <linux/init.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/cdev.h> +#include <linux/vfio.h> +#include <linux/iommu.h> +#include <linux/sysfs.h> +#include <linux/mdev.h> +#include <linux/pci.h> +#include <drm/drm_fourcc.h> +#include "mdpy-defs.h" + +#define MDPY_NAME "mdpy" +#define MDPY_CLASS_NAME "mdpy" + +#define MDPY_CONFIG_SPACE_SIZE 0xff +#define MDPY_MEMORY_BAR_OFFSET PAGE_SIZE +#define MDPY_DISPLAY_REGION 16 + +/* + * Store a 16/32-bit value at addr. Arguments are parenthesized so that any + * expression can be passed safely. + * NOTE(review): despite the names no cpu_to_le*() conversion is done, so the + * value is stored in CPU byte order - confirm whether big-endian hosts matter. + */ +#define STORE_LE16(addr, val) (*(u16 *)(addr) = (val)) +#define STORE_LE32(addr, val) (*(u32 *)(addr) = (val)) + + +MODULE_LICENSE("GPL v2"); + +static int max_devices = 4; +module_param_named(count, max_devices, int, 0444); +MODULE_PARM_DESC(count, "number of " MDPY_NAME " devices"); + + +#define MDPY_TYPE_1 "vga" +#define MDPY_TYPE_2 "xga" +#define MDPY_TYPE_3 "hd" + +/* Supported device types: name, pixel format, bytes per pixel and resolution. */ +static const struct mdpy_type { + const char *name; + u32 format; + u32 bytepp; + u32 width; + u32 height; +} mdpy_types[] = { + { + .name = MDPY_CLASS_NAME "-" MDPY_TYPE_1, + .format = DRM_FORMAT_XRGB8888, + .bytepp = 4, + .width = 640, + .height = 480, + }, { + .name = MDPY_CLASS_NAME "-" MDPY_TYPE_2, + .format = DRM_FORMAT_XRGB8888, + .bytepp = 4, + .width = 1024, + .height = 768, + }, { + .name = MDPY_CLASS_NAME "-" MDPY_TYPE_3, + .format = DRM_FORMAT_XRGB8888, + .bytepp = 4, + .width = 1920, + .height = 1080, + }, +}; + +static dev_t mdpy_devt; +static struct class *mdpy_class; +static struct cdev mdpy_cdev; +static struct device mdpy_dev; +static u32 mdpy_count; + +/* State of each mdev device */ +struct mdev_state { + u8 *vconfig; + u32 bar_mask; + struct mutex ops_lock; + struct mdev_device *mdev; + struct vfio_device_info dev_info; + + const struct mdpy_type *type; + u32 memsize; + void *memblk; +}; + +/* Return the mdpy_types[] entry whose name matches kobj->name, or NULL. */ +static const struct mdpy_type *mdpy_find_type(struct kobject *kobj) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mdpy_types); i++) + if (strcmp(mdpy_types[i].name, 
kobj->name) == 0) + return mdpy_types + i; + return NULL; +} +/* + * Populate the virtual PCI config space in mdev_state->vconfig: ids, command + * and status, class, BAR 0 (32-bit prefetchable memory) and a vendor + * capability carrying the type's format, width and height. Also derives + * bar_mask from memsize. + */ +static void mdpy_create_config_space(struct mdev_state *mdev_state) +{ + STORE_LE16((u16 *) &mdev_state->vconfig[PCI_VENDOR_ID], + MDPY_PCI_VENDOR_ID); + STORE_LE16((u16 *) &mdev_state->vconfig[PCI_DEVICE_ID], + MDPY_PCI_DEVICE_ID); + STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_VENDOR_ID], + MDPY_PCI_SUBVENDOR_ID); + STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_ID], + MDPY_PCI_SUBDEVICE_ID); + + STORE_LE16((u16 *) &mdev_state->vconfig[PCI_COMMAND], + PCI_COMMAND_IO | PCI_COMMAND_MEMORY); + STORE_LE16((u16 *) &mdev_state->vconfig[PCI_STATUS], + PCI_STATUS_CAP_LIST); + STORE_LE16((u16 *) &mdev_state->vconfig[PCI_CLASS_DEVICE], + PCI_CLASS_DISPLAY_OTHER); + mdev_state->vconfig[PCI_CLASS_REVISION] = 0x01; + + STORE_LE32((u32 *) &mdev_state->vconfig[PCI_BASE_ADDRESS_0], + PCI_BASE_ADDRESS_SPACE_MEMORY | + PCI_BASE_ADDRESS_MEM_TYPE_32 | + PCI_BASE_ADDRESS_MEM_PREFETCH); + mdev_state->bar_mask = ~(mdev_state->memsize) + 1; + + /* vendor specific capability for the config registers */ + mdev_state->vconfig[PCI_CAPABILITY_LIST] = MDPY_VENDORCAP_OFFSET; + mdev_state->vconfig[MDPY_VENDORCAP_OFFSET + 0] = 0x09; /* vendor cap */ + mdev_state->vconfig[MDPY_VENDORCAP_OFFSET + 1] = 0x00; /* next ptr */ + mdev_state->vconfig[MDPY_VENDORCAP_OFFSET + 2] = MDPY_VENDORCAP_SIZE; + STORE_LE32((u32 *) &mdev_state->vconfig[MDPY_FORMAT_OFFSET], + mdev_state->type->format); + STORE_LE32((u32 *) &mdev_state->vconfig[MDPY_WIDTH_OFFSET], + mdev_state->type->width); + STORE_LE32((u32 *) &mdev_state->vconfig[MDPY_HEIGHT_OFFSET], + mdev_state->type->height); +} +/* + * Handle a config space write. Only PCI_BASE_ADDRESS_0 is writable: writing + * 0xffffffff stores bar_mask, any other value is stored with the address + * bits masked; the existing non-address bits of the register are kept. + * Writes to every other offset are ignored. + */ +static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset, + char *buf, u32 count) +{ + struct device *dev = mdev_dev(mdev_state->mdev); + u32 cfg_addr; + + switch (offset) { + case PCI_BASE_ADDRESS_0: + cfg_addr = *(u32 *)buf; + + if (cfg_addr == 0xffffffff) { + cfg_addr = (cfg_addr & mdev_state->bar_mask); + } else { + cfg_addr &= 
PCI_BASE_ADDRESS_MEM_MASK; + if (cfg_addr) + dev_info(dev, "BAR0 @ 0x%x\n", cfg_addr); + } + + cfg_addr |= (mdev_state->vconfig[offset] & + ~PCI_BASE_ADDRESS_MEM_MASK); + STORE_LE32(&mdev_state->vconfig[offset], cfg_addr); + break; + } +} + +/* + * Perform one read or write of count bytes at device offset pos, under + * ops_lock. + * + * Offsets inside the config space go to vconfig (reads) or + * handle_pci_cfg_write() (writes); offsets inside the memory BAR window are + * copied to/from memblk at the matching offset. The whole access must fit in + * the region it starts in; anything else is logged as unhandled. + * + * Returns count on success and -1 for an unhandled access. + */ +static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count, + loff_t pos, bool is_write) +{ + struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + struct device *dev = mdev_dev(mdev); + int ret = 0; + + mutex_lock(&mdev_state->ops_lock); + + /* bound the end of the access too: vconfig is only MDPY_CONFIG_SPACE_SIZE bytes */ + if ((pos >= 0) && (pos + count <= MDPY_CONFIG_SPACE_SIZE)) { + if (is_write) + handle_pci_cfg_write(mdev_state, pos, buf, count); + else + memcpy(buf, (mdev_state->vconfig + pos), count); + + } else if ((pos >= MDPY_MEMORY_BAR_OFFSET) && + (pos + count <= + MDPY_MEMORY_BAR_OFFSET + mdev_state->memsize)) { + /* translate the device offset into an offset within memblk */ + pos -= MDPY_MEMORY_BAR_OFFSET; + if (is_write) + memcpy(mdev_state->memblk + pos, buf, count); + else + memcpy(buf, mdev_state->memblk + pos, count); + + } else { + dev_info(dev, "%s: %s @0x%llx (unhandled)\n", + __func__, is_write ? 
"WR" : "RD", pos); + ret = -1; + goto accessfailed; + } + + ret = count; + + +accessfailed: + mutex_unlock(&mdev_state->ops_lock); + + return ret; +} +/* + * Reset the device memory: row i of the framebuffer is filled with the byte + * value i * 255 / height. Always returns 0. + */ +static int mdpy_reset(struct mdev_device *mdev) +{ + struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + u32 stride, i; + + /* initialize with gray gradient */ + stride = mdev_state->type->width * mdev_state->type->bytepp; + for (i = 0; i < mdev_state->type->height; i++) + memset(mdev_state->memblk + i * stride, + i * 255 / mdev_state->type->height, + stride); + return 0; +} +/* + * Create a device instance: allocate the state, the virtual config space and + * a framebuffer whose size is width * height * bytepp rounded up to a power + * of two, then build the config space and reset the framebuffer. Falls back + * to mdpy_types[0] when the kobject matches no type. + * Returns -ENOMEM when max_devices is reached or an allocation fails. + */ +static int mdpy_create(struct kobject *kobj, struct mdev_device *mdev) +{ + const struct mdpy_type *type = mdpy_find_type(kobj); + struct device *dev = mdev_dev(mdev); + struct mdev_state *mdev_state; + u32 fbsize; + + if (mdpy_count >= max_devices) + return -ENOMEM; + + mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); + if (mdev_state == NULL) + return -ENOMEM; + + mdev_state->vconfig = kzalloc(MDPY_CONFIG_SPACE_SIZE, GFP_KERNEL); + if (mdev_state->vconfig == NULL) { + kfree(mdev_state); + return -ENOMEM; + } + + if (!type) + type = &mdpy_types[0]; + fbsize = roundup_pow_of_two(type->width * type->height * type->bytepp); + + mdev_state->memblk = vmalloc_user(fbsize); + if (!mdev_state->memblk) { + kfree(mdev_state->vconfig); + kfree(mdev_state); + return -ENOMEM; + } + dev_info(dev, "%s: %s (%dx%d)\n", + __func__, kobj->name, type->width, type->height); + + mutex_init(&mdev_state->ops_lock); + mdev_state->mdev = mdev; + mdev_set_drvdata(mdev, mdev_state); + + mdev_state->type = type; + mdev_state->memsize = fbsize; + mdpy_create_config_space(mdev_state); + mdpy_reset(mdev); + + mdpy_count++; + return 0; +} +/* + * Remove a device instance: clear the driver data, free the framebuffer, the + * config space and the state, and decrement the instance counter. + */ +static int mdpy_remove(struct mdev_device *mdev) +{ + struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + struct device *dev = mdev_dev(mdev); + + dev_info(dev, "%s\n", __func__); + + mdev_set_drvdata(mdev, NULL); + vfree(mdev_state->memblk); + kfree(mdev_state->vconfig); + kfree(mdev_state); + + mdpy_count--; + 
return 0; +} + +static ssize_t mdpy_read(struct mdev_device *mdev, char __user *buf, + size_t count, loff_t *ppos) +{ + unsigned int done = 0; + int ret; + + while (count) { + size_t filled; + + if (count >= 4 && !(*ppos % 4)) { + u32 val; + + ret = mdev_access(mdev, (char *)&val, sizeof(val), + *ppos, false); + if (ret <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 4; + } else if (count >= 2 && !(*ppos % 2)) { + u16 val; + + ret = mdev_access(mdev, (char *)&val, sizeof(val), + *ppos, false); + if (ret <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 2; + } else { + u8 val; + + ret = mdev_access(mdev, (char *)&val, sizeof(val), + *ppos, false); + if (ret <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 1; + } + + count -= filled; + done += filled; + *ppos += filled; + buf += filled; + } + + return done; + +read_err: + return -EFAULT; +} + +static ssize_t mdpy_write(struct mdev_device *mdev, const char __user *buf, + size_t count, loff_t *ppos) +{ + unsigned int done = 0; + int ret; + + while (count) { + size_t filled; + + if (count >= 4 && !(*ppos % 4)) { + u32 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + ret = mdev_access(mdev, (char *)&val, sizeof(val), + *ppos, true); + if (ret <= 0) + goto write_err; + + filled = 4; + } else if (count >= 2 && !(*ppos % 2)) { + u16 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + ret = mdev_access(mdev, (char *)&val, sizeof(val), + *ppos, true); + if (ret <= 0) + goto write_err; + + filled = 2; + } else { + u8 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + ret = mdev_access(mdev, (char *)&val, sizeof(val), + *ppos, true); + if (ret <= 0) + goto write_err; + + filled = 1; + } + count -= filled; + done += filled; + *ppos += filled; + buf += filled; + } + + return done; +write_err: + return -EFAULT; 
+} + +static int mdpy_mmap(struct mdev_device *mdev, struct vm_area_struct *vma) +{ + struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + + if (vma->vm_pgoff != MDPY_MEMORY_BAR_OFFSET >> PAGE_SHIFT) + return -EINVAL; + if (vma->vm_end < vma->vm_start) + return -EINVAL; + if (vma->vm_end - vma->vm_start > mdev_state->memsize) + return -EINVAL; + if ((vma->vm_flags & VM_SHARED) == 0) + return -EINVAL; + + return remap_vmalloc_range_partial(vma, vma->vm_start, + mdev_state->memblk, 0, + vma->vm_end - vma->vm_start); +} + +static int mdpy_get_region_info(struct mdev_device *mdev, + struct vfio_region_info *region_info, + u16 *cap_type_id, void **cap_type) +{ + struct mdev_state *mdev_state; + + mdev_state = mdev_get_drvdata(mdev); + if (!mdev_state) + return -EINVAL; + + if (region_info->index >= VFIO_PCI_NUM_REGIONS && + region_info->index != MDPY_DISPLAY_REGION) + return -EINVAL; + + switch (region_info->index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + region_info->offset = 0; + region_info->size = MDPY_CONFIG_SPACE_SIZE; + region_info->flags = (VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE); + break; + case VFIO_PCI_BAR0_REGION_INDEX: + case MDPY_DISPLAY_REGION: + region_info->offset = MDPY_MEMORY_BAR_OFFSET; + region_info->size = mdev_state->memsize; + region_info->flags = (VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_MMAP); + break; + default: + region_info->size = 0; + region_info->offset = 0; + region_info->flags = 0; + } + + return 0; +} + +static int mdpy_get_irq_info(struct mdev_device *mdev, + struct vfio_irq_info *irq_info) +{ + irq_info->count = 0; + return 0; +} + +static int mdpy_get_device_info(struct mdev_device *mdev, + struct vfio_device_info *dev_info) +{ + dev_info->flags = VFIO_DEVICE_FLAGS_PCI; + dev_info->num_regions = VFIO_PCI_NUM_REGIONS; + dev_info->num_irqs = VFIO_PCI_NUM_IRQS; + return 0; +} + +static int mdpy_query_gfx_plane(struct mdev_device *mdev, + struct 
vfio_device_gfx_plane_info *plane) +{ + struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + + if (plane->flags & VFIO_GFX_PLANE_TYPE_PROBE) { + if (plane->flags == (VFIO_GFX_PLANE_TYPE_PROBE | + VFIO_GFX_PLANE_TYPE_REGION)) + return 0; + return -EINVAL; + } + + if (plane->flags != VFIO_GFX_PLANE_TYPE_REGION) + return -EINVAL; + + plane->drm_format = mdev_state->type->format; + plane->width = mdev_state->type->width; + plane->height = mdev_state->type->height; + plane->stride = (mdev_state->type->width * + mdev_state->type->bytepp); + plane->size = mdev_state->memsize; + plane->region_index = MDPY_DISPLAY_REGION; + + /* unused */ + plane->drm_format_mod = 0; + plane->x_pos = 0; + plane->y_pos = 0; + plane->x_hot = 0; + plane->y_hot = 0; + + return 0; +} + +static long mdpy_ioctl(struct mdev_device *mdev, unsigned int cmd, + unsigned long arg) +{ + int ret = 0; + unsigned long minsz; + struct mdev_state *mdev_state; + + mdev_state = mdev_get_drvdata(mdev); + + switch (cmd) { + case VFIO_DEVICE_GET_INFO: + { + struct vfio_device_info info; + + minsz = offsetofend(struct vfio_device_info, num_irqs); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + ret = mdpy_get_device_info(mdev, &info); + if (ret) + return ret; + + memcpy(&mdev_state->dev_info, &info, sizeof(info)); + + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + + return 0; + } + case VFIO_DEVICE_GET_REGION_INFO: + { + struct vfio_region_info info; + u16 cap_type_id = 0; + void *cap_type = NULL; + + minsz = offsetofend(struct vfio_region_info, offset); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + ret = mdpy_get_region_info(mdev, &info, &cap_type_id, + &cap_type); + if (ret) + return ret; + + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + + return 0; + } + + case VFIO_DEVICE_GET_IRQ_INFO: + { + struct 
vfio_irq_info info; + + minsz = offsetofend(struct vfio_irq_info, count); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if ((info.argsz < minsz) || + (info.index >= mdev_state->dev_info.num_irqs)) + return -EINVAL; + + ret = mdpy_get_irq_info(mdev, &info); + if (ret) + return ret; + + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + + return 0; + } + + case VFIO_DEVICE_QUERY_GFX_PLANE: + { + struct vfio_device_gfx_plane_info plane; + + minsz = offsetofend(struct vfio_device_gfx_plane_info, + region_index); + + if (copy_from_user(&plane, (void __user *)arg, minsz)) + return -EFAULT; + + if (plane.argsz < minsz) + return -EINVAL; + + ret = mdpy_query_gfx_plane(mdev, &plane); + if (ret) + return ret; + + if (copy_to_user((void __user *)arg, &plane, minsz)) + return -EFAULT; + + return 0; + } + + case VFIO_DEVICE_SET_IRQS: + return -EINVAL; + + case VFIO_DEVICE_RESET: + return mdpy_reset(mdev); + } + return -ENOTTY; +} + +static int mdpy_open(struct mdev_device *mdev) +{ + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + return 0; +} + +static void mdpy_close(struct mdev_device *mdev) +{ + module_put(THIS_MODULE); +} + +static ssize_t +resolution_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct mdev_device *mdev = mdev_from_dev(dev); + struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + + return sprintf(buf, "%dx%d\n", + mdev_state->type->width, + mdev_state->type->height); +} +static DEVICE_ATTR_RO(resolution); + +static struct attribute *mdev_dev_attrs[] = { + &dev_attr_resolution.attr, + NULL, +}; + +static const struct attribute_group mdev_dev_group = { + .name = "vendor", + .attrs = mdev_dev_attrs, +}; + +const struct attribute_group *mdev_dev_groups[] = { + &mdev_dev_group, + NULL, +}; + +static ssize_t +name_show(struct kobject *kobj, struct device *dev, char *buf) +{ + return sprintf(buf, "%s\n", kobj->name); +} +MDEV_TYPE_ATTR_RO(name); + +static ssize_t 
+description_show(struct kobject *kobj, struct device *dev, char *buf) +{ + const struct mdpy_type *type = mdpy_find_type(kobj); + + return sprintf(buf, "virtual display, %dx%d framebuffer\n", + type ? type->width : 0, + type ? type->height : 0); +} +MDEV_TYPE_ATTR_RO(description); + +static ssize_t +available_instances_show(struct kobject *kobj, struct device *dev, char *buf) +{ + return sprintf(buf, "%d\n", max_devices - mdpy_count); +} +MDEV_TYPE_ATTR_RO(available_instances); + +static ssize_t device_api_show(struct kobject *kobj, struct device *dev, + char *buf) +{ + return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); +} +MDEV_TYPE_ATTR_RO(device_api); + +static struct attribute *mdev_types_attrs[] = { + &mdev_type_attr_name.attr, + &mdev_type_attr_description.attr, + &mdev_type_attr_device_api.attr, + &mdev_type_attr_available_instances.attr, + NULL, +}; + +static struct attribute_group mdev_type_group1 = { + .name = MDPY_TYPE_1, + .attrs = mdev_types_attrs, +}; + +static struct attribute_group mdev_type_group2 = { + .name = MDPY_TYPE_2, + .attrs = mdev_types_attrs, +}; + +static struct attribute_group mdev_type_group3 = { + .name = MDPY_TYPE_3, + .attrs = mdev_types_attrs, +}; + +static struct attribute_group *mdev_type_groups[] = { + &mdev_type_group1, + &mdev_type_group2, + &mdev_type_group3, + NULL, +}; + +static const struct mdev_parent_ops mdev_fops = { + .owner = THIS_MODULE, + .mdev_attr_groups = mdev_dev_groups, + .supported_type_groups = mdev_type_groups, + .create = mdpy_create, + .remove = mdpy_remove, + .open = mdpy_open, + .release = mdpy_close, + .read = mdpy_read, + .write = mdpy_write, + .ioctl = mdpy_ioctl, + .mmap = mdpy_mmap, +}; + +static const struct file_operations vd_fops = { + .owner = THIS_MODULE, +}; + +static void mdpy_device_release(struct device *dev) +{ + /* nothing */ +} + +static int __init mdpy_dev_init(void) +{ + int ret = 0; + + ret = alloc_chrdev_region(&mdpy_devt, 0, MINORMASK + 1, MDPY_NAME); + if (ret < 0) { + 
pr_err("Error: failed to register mdpy_dev, err: %d\n", ret); + return ret; + } + cdev_init(&mdpy_cdev, &vd_fops); + cdev_add(&mdpy_cdev, mdpy_devt, MINORMASK + 1); + pr_info("%s: major %d\n", __func__, MAJOR(mdpy_devt)); + + mdpy_class = class_create(THIS_MODULE, MDPY_CLASS_NAME); + if (IS_ERR(mdpy_class)) { + pr_err("Error: failed to register mdpy_dev class\n"); + ret = PTR_ERR(mdpy_class); + goto failed1; + } + mdpy_dev.class = mdpy_class; + mdpy_dev.release = mdpy_device_release; + dev_set_name(&mdpy_dev, "%s", MDPY_NAME); + + ret = device_register(&mdpy_dev); + if (ret) + goto failed2; + + ret = mdev_register_device(&mdpy_dev, &mdev_fops); + if (ret) + goto failed3; + + return 0; + +failed3: + device_unregister(&mdpy_dev); +failed2: + class_destroy(mdpy_class); +failed1: + cdev_del(&mdpy_cdev); + unregister_chrdev_region(mdpy_devt, MINORMASK + 1); + return ret; +} + +static void __exit mdpy_dev_exit(void) +{ + mdpy_dev.bus = NULL; + mdev_unregister_device(&mdpy_dev); + + device_unregister(&mdpy_dev); + cdev_del(&mdpy_cdev); + unregister_chrdev_region(mdpy_devt, MINORMASK + 1); + class_destroy(mdpy_class); + mdpy_class = NULL; +} + +module_init(mdpy_dev_init) +module_exit(mdpy_dev_exit) diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c new file mode 100644 index 000000000..ce84a300a --- /dev/null +++ b/samples/vfio-mdev/mtty.c @@ -0,0 +1,1491 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Mediated virtual PCI serial host device driver + * + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * Sample driver that creates mdev device that simulates serial port over PCI + * card. 
+ */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/poll.h> +#include <linux/slab.h> +#include <linux/cdev.h> +#include <linux/sched.h> +#include <linux/wait.h> +#include <linux/uuid.h> +#include <linux/vfio.h> +#include <linux/iommu.h> +#include <linux/sysfs.h> +#include <linux/ctype.h> +#include <linux/file.h> +#include <linux/mdev.h> +#include <linux/pci.h> +#include <linux/serial.h> +#include <uapi/linux/serial_reg.h> +#include <linux/eventfd.h> +/* + * #defines + */ + +#define VERSION_STRING "0.1" +#define DRIVER_AUTHOR "NVIDIA Corporation" + +#define MTTY_CLASS_NAME "mtty" + +#define MTTY_NAME "mtty" + +#define MTTY_STRING_LEN 16 + +#define MTTY_CONFIG_SPACE_SIZE 0xff +#define MTTY_IO_BAR_SIZE 0x8 +#define MTTY_MMIO_BAR_SIZE 0x100000 + +#define STORE_LE16(addr, val) (*(u16 *)addr = val) +#define STORE_LE32(addr, val) (*(u32 *)addr = val) + +#define MAX_FIFO_SIZE 16 + +#define CIRCULAR_BUF_INC_IDX(idx) (idx = (idx + 1) & (MAX_FIFO_SIZE - 1)) + +#define MTTY_VFIO_PCI_OFFSET_SHIFT 40 + +#define MTTY_VFIO_PCI_OFFSET_TO_INDEX(off) (off >> MTTY_VFIO_PCI_OFFSET_SHIFT) +#define MTTY_VFIO_PCI_INDEX_TO_OFFSET(index) \ + ((u64)(index) << MTTY_VFIO_PCI_OFFSET_SHIFT) +#define MTTY_VFIO_PCI_OFFSET_MASK \ + (((u64)(1) << MTTY_VFIO_PCI_OFFSET_SHIFT) - 1) +#define MAX_MTTYS 24 + +/* + * Global Structures + */ + +static struct mtty_dev { + dev_t vd_devt; + struct class *vd_class; + struct cdev vd_cdev; + struct idr vd_idr; + struct device dev; +} mtty_dev; + +struct mdev_region_info { + u64 start; + u64 phys_start; + u32 size; + u64 vfio_offset; +}; + +#if defined(DEBUG_REGS) +static const char *wr_reg[] = { + "TX", + "IER", + "FCR", + "LCR", + "MCR", + "LSR", + "MSR", + "SCR" +}; + +static const char *rd_reg[] = { + "RX", + "IER", + "IIR", + "LCR", + "MCR", + "LSR", + "MSR", + "SCR" +}; +#endif + +/* loop back buffer */ +struct rxtx { + u8 fifo[MAX_FIFO_SIZE]; + u8 head, tail; 
+ u8 count; +}; + +struct serial_port { + u8 uart_reg[8]; /* 8 registers */ + struct rxtx rxtx; /* loop back buffer */ + bool dlab; + bool overrun; + u16 divisor; + u8 fcr; /* FIFO control register */ + u8 max_fifo_size; + u8 intr_trigger_level; /* interrupt trigger level */ +}; + +/* State of each mdev device */ +struct mdev_state { + int irq_fd; + struct eventfd_ctx *intx_evtfd; + struct eventfd_ctx *msi_evtfd; + int irq_index; + u8 *vconfig; + struct mutex ops_lock; + struct mdev_device *mdev; + struct mdev_region_info region_info[VFIO_PCI_NUM_REGIONS]; + u32 bar_mask[VFIO_PCI_NUM_REGIONS]; + struct list_head next; + struct serial_port s[2]; + struct mutex rxtx_lock; + struct vfio_device_info dev_info; + int nr_ports; +}; + +static struct mutex mdev_list_lock; +static struct list_head mdev_devices_list; + +static const struct file_operations vd_fops = { + .owner = THIS_MODULE, +}; + +/* function prototypes */ + +static int mtty_trigger_interrupt(struct mdev_state *mdev_state); + +/* Helper functions */ + +static void dump_buffer(u8 *buf, uint32_t count) +{ +#if defined(DEBUG) + int i; + + pr_info("Buffer:\n"); + for (i = 0; i < count; i++) { + pr_info("%2x ", *(buf + i)); + if ((i + 1) % 16 == 0) + pr_info("\n"); + } +#endif +} + +static void mtty_create_config_space(struct mdev_state *mdev_state) +{ + /* PCI dev ID */ + STORE_LE32((u32 *) &mdev_state->vconfig[0x0], 0x32534348); + + /* Control: I/O+, Mem-, BusMaster- */ + STORE_LE16((u16 *) &mdev_state->vconfig[0x4], 0x0001); + + /* Status: capabilities list absent */ + STORE_LE16((u16 *) &mdev_state->vconfig[0x6], 0x0200); + + /* Rev ID */ + mdev_state->vconfig[0x8] = 0x10; + + /* programming interface class : 16550-compatible serial controller */ + mdev_state->vconfig[0x9] = 0x02; + + /* Sub class : 00 */ + mdev_state->vconfig[0xa] = 0x00; + + /* Base class : Simple Communication controllers */ + mdev_state->vconfig[0xb] = 0x07; + + /* base address registers */ + /* BAR0: IO space */ + STORE_LE32((u32 *) 
&mdev_state->vconfig[0x10], 0x000001); + mdev_state->bar_mask[0] = ~(MTTY_IO_BAR_SIZE) + 1; + + if (mdev_state->nr_ports == 2) { + /* BAR1: IO space */ + STORE_LE32((u32 *) &mdev_state->vconfig[0x14], 0x000001); + mdev_state->bar_mask[1] = ~(MTTY_IO_BAR_SIZE) + 1; + } + + /* Subsystem ID */ + STORE_LE32((u32 *) &mdev_state->vconfig[0x2c], 0x32534348); + + mdev_state->vconfig[0x34] = 0x00; /* Cap Ptr */ + mdev_state->vconfig[0x3d] = 0x01; /* interrupt pin (INTA#) */ + + /* Vendor specific data */ + mdev_state->vconfig[0x40] = 0x23; + mdev_state->vconfig[0x43] = 0x80; + mdev_state->vconfig[0x44] = 0x23; + mdev_state->vconfig[0x48] = 0x23; + mdev_state->vconfig[0x4c] = 0x23; + + mdev_state->vconfig[0x60] = 0x50; + mdev_state->vconfig[0x61] = 0x43; + mdev_state->vconfig[0x62] = 0x49; + mdev_state->vconfig[0x63] = 0x20; + mdev_state->vconfig[0x64] = 0x53; + mdev_state->vconfig[0x65] = 0x65; + mdev_state->vconfig[0x66] = 0x72; + mdev_state->vconfig[0x67] = 0x69; + mdev_state->vconfig[0x68] = 0x61; + mdev_state->vconfig[0x69] = 0x6c; + mdev_state->vconfig[0x6a] = 0x2f; + mdev_state->vconfig[0x6b] = 0x55; + mdev_state->vconfig[0x6c] = 0x41; + mdev_state->vconfig[0x6d] = 0x52; + mdev_state->vconfig[0x6e] = 0x54; +} + +static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset, + u8 *buf, u32 count) +{ + u32 cfg_addr, bar_mask, bar_index = 0; + + switch (offset) { + case 0x04: /* device control */ + case 0x06: /* device status */ + /* do nothing */ + break; + case 0x3c: /* interrupt line */ + mdev_state->vconfig[0x3c] = buf[0]; + break; + case 0x3d: + /* + * Interrupt Pin is hardwired to INTA. 
+ * This field is write protected by hardware + */ + break; + case 0x10: /* BAR0 */ + case 0x14: /* BAR1 */ + if (offset == 0x10) + bar_index = 0; + else if (offset == 0x14) + bar_index = 1; + + if ((mdev_state->nr_ports == 1) && (bar_index == 1)) { + STORE_LE32(&mdev_state->vconfig[offset], 0); + break; + } + + cfg_addr = *(u32 *)buf; + pr_info("BAR%d addr 0x%x\n", bar_index, cfg_addr); + + if (cfg_addr == 0xffffffff) { + bar_mask = mdev_state->bar_mask[bar_index]; + cfg_addr = (cfg_addr & bar_mask); + } + + cfg_addr |= (mdev_state->vconfig[offset] & 0x3ul); + STORE_LE32(&mdev_state->vconfig[offset], cfg_addr); + break; + case 0x18: /* BAR2 */ + case 0x1c: /* BAR3 */ + case 0x20: /* BAR4 */ + STORE_LE32(&mdev_state->vconfig[offset], 0); + break; + default: + pr_info("PCI config write @0x%x of %d bytes not handled\n", + offset, count); + break; + } +} + +static void handle_bar_write(unsigned int index, struct mdev_state *mdev_state, + u16 offset, u8 *buf, u32 count) +{ + u8 data = *buf; + + /* Handle data written by guest */ + switch (offset) { + case UART_TX: + /* if DLAB set, data is LSB of divisor */ + if (mdev_state->s[index].dlab) { + mdev_state->s[index].divisor |= data; + break; + } + + mutex_lock(&mdev_state->rxtx_lock); + + /* save in TX buffer */ + if (mdev_state->s[index].rxtx.count < + mdev_state->s[index].max_fifo_size) { + mdev_state->s[index].rxtx.fifo[ + mdev_state->s[index].rxtx.head] = data; + mdev_state->s[index].rxtx.count++; + CIRCULAR_BUF_INC_IDX(mdev_state->s[index].rxtx.head); + mdev_state->s[index].overrun = false; + + /* + * Trigger interrupt if receive data interrupt is + * enabled and fifo reached trigger level + */ + if ((mdev_state->s[index].uart_reg[UART_IER] & + UART_IER_RDI) && + (mdev_state->s[index].rxtx.count == + mdev_state->s[index].intr_trigger_level)) { + /* trigger interrupt */ +#if defined(DEBUG_INTR) + pr_err("Serial port %d: Fifo level trigger\n", + index); +#endif + mtty_trigger_interrupt(mdev_state); + } + } else { +#if 
defined(DEBUG_INTR) + pr_err("Serial port %d: Buffer Overflow\n", index); +#endif + mdev_state->s[index].overrun = true; + + /* + * Trigger interrupt if receiver line status interrupt + * is enabled + */ + if (mdev_state->s[index].uart_reg[UART_IER] & + UART_IER_RLSI) + mtty_trigger_interrupt(mdev_state); + } + mutex_unlock(&mdev_state->rxtx_lock); + break; + + case UART_IER: + /* if DLAB set, data is MSB of divisor */ + if (mdev_state->s[index].dlab) + mdev_state->s[index].divisor |= (u16)data << 8; + else { + mdev_state->s[index].uart_reg[offset] = data; + mutex_lock(&mdev_state->rxtx_lock); + if ((data & UART_IER_THRI) && + (mdev_state->s[index].rxtx.head == + mdev_state->s[index].rxtx.tail)) { +#if defined(DEBUG_INTR) + pr_err("Serial port %d: IER_THRI write\n", + index); +#endif + mtty_trigger_interrupt(mdev_state); + } + + mutex_unlock(&mdev_state->rxtx_lock); + } + + break; + + case UART_FCR: + mdev_state->s[index].fcr = data; + + mutex_lock(&mdev_state->rxtx_lock); + if (data & (UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT)) { + /* clear loop back FIFO */ + mdev_state->s[index].rxtx.count = 0; + mdev_state->s[index].rxtx.head = 0; + mdev_state->s[index].rxtx.tail = 0; + } + mutex_unlock(&mdev_state->rxtx_lock); + + switch (data & UART_FCR_TRIGGER_MASK) { + case UART_FCR_TRIGGER_1: + mdev_state->s[index].intr_trigger_level = 1; + break; + + case UART_FCR_TRIGGER_4: + mdev_state->s[index].intr_trigger_level = 4; + break; + + case UART_FCR_TRIGGER_8: + mdev_state->s[index].intr_trigger_level = 8; + break; + + case UART_FCR_TRIGGER_14: + mdev_state->s[index].intr_trigger_level = 14; + break; + } + + /* + * Set trigger level to 1 otherwise or implement timer with + * timeout of 4 characters and on expiring that timer set + * Recevice data timeout in IIR register + */ + mdev_state->s[index].intr_trigger_level = 1; + if (data & UART_FCR_ENABLE_FIFO) + mdev_state->s[index].max_fifo_size = MAX_FIFO_SIZE; + else { + mdev_state->s[index].max_fifo_size = 1; + 
mdev_state->s[index].intr_trigger_level = 1; + } + + break; + + case UART_LCR: + if (data & UART_LCR_DLAB) { + mdev_state->s[index].dlab = true; + mdev_state->s[index].divisor = 0; + } else + mdev_state->s[index].dlab = false; + + mdev_state->s[index].uart_reg[offset] = data; + break; + + case UART_MCR: + mdev_state->s[index].uart_reg[offset] = data; + + if ((mdev_state->s[index].uart_reg[UART_IER] & UART_IER_MSI) && + (data & UART_MCR_OUT2)) { +#if defined(DEBUG_INTR) + pr_err("Serial port %d: MCR_OUT2 write\n", index); +#endif + mtty_trigger_interrupt(mdev_state); + } + + if ((mdev_state->s[index].uart_reg[UART_IER] & UART_IER_MSI) && + (data & (UART_MCR_RTS | UART_MCR_DTR))) { +#if defined(DEBUG_INTR) + pr_err("Serial port %d: MCR RTS/DTR write\n", index); +#endif + mtty_trigger_interrupt(mdev_state); + } + break; + + case UART_LSR: + case UART_MSR: + /* do nothing */ + break; + + case UART_SCR: + mdev_state->s[index].uart_reg[offset] = data; + break; + + default: + break; + } +} + +static void handle_bar_read(unsigned int index, struct mdev_state *mdev_state, + u16 offset, u8 *buf, u32 count) +{ + /* Handle read requests by guest */ + switch (offset) { + case UART_RX: + /* if DLAB set, data is LSB of divisor */ + if (mdev_state->s[index].dlab) { + *buf = (u8)mdev_state->s[index].divisor; + break; + } + + mutex_lock(&mdev_state->rxtx_lock); + /* return data in tx buffer */ + if (mdev_state->s[index].rxtx.head != + mdev_state->s[index].rxtx.tail) { + *buf = mdev_state->s[index].rxtx.fifo[ + mdev_state->s[index].rxtx.tail]; + mdev_state->s[index].rxtx.count--; + CIRCULAR_BUF_INC_IDX(mdev_state->s[index].rxtx.tail); + } + + if (mdev_state->s[index].rxtx.head == + mdev_state->s[index].rxtx.tail) { + /* + * Trigger interrupt if tx buffer empty interrupt is + * enabled and fifo is empty + */ +#if defined(DEBUG_INTR) + pr_err("Serial port %d: Buffer Empty\n", index); +#endif + if (mdev_state->s[index].uart_reg[UART_IER] & + UART_IER_THRI) + 
mtty_trigger_interrupt(mdev_state); + } + mutex_unlock(&mdev_state->rxtx_lock); + + break; + + case UART_IER: + if (mdev_state->s[index].dlab) { + *buf = (u8)(mdev_state->s[index].divisor >> 8); + break; + } + *buf = mdev_state->s[index].uart_reg[offset] & 0x0f; + break; + + case UART_IIR: + { + u8 ier = mdev_state->s[index].uart_reg[UART_IER]; + *buf = 0; + + mutex_lock(&mdev_state->rxtx_lock); + /* Interrupt priority 1: Parity, overrun, framing or break */ + if ((ier & UART_IER_RLSI) && mdev_state->s[index].overrun) + *buf |= UART_IIR_RLSI; + + /* Interrupt priority 2: Fifo trigger level reached */ + if ((ier & UART_IER_RDI) && + (mdev_state->s[index].rxtx.count >= + mdev_state->s[index].intr_trigger_level)) + *buf |= UART_IIR_RDI; + + /* Interrupt priotiry 3: transmitter holding register empty */ + if ((ier & UART_IER_THRI) && + (mdev_state->s[index].rxtx.head == + mdev_state->s[index].rxtx.tail)) + *buf |= UART_IIR_THRI; + + /* Interrupt priotiry 4: Modem status: CTS, DSR, RI or DCD */ + if ((ier & UART_IER_MSI) && + (mdev_state->s[index].uart_reg[UART_MCR] & + (UART_MCR_RTS | UART_MCR_DTR))) + *buf |= UART_IIR_MSI; + + /* bit0: 0=> interrupt pending, 1=> no interrupt is pending */ + if (*buf == 0) + *buf = UART_IIR_NO_INT; + + /* set bit 6 & 7 to be 16550 compatible */ + *buf |= 0xC0; + mutex_unlock(&mdev_state->rxtx_lock); + } + break; + + case UART_LCR: + case UART_MCR: + *buf = mdev_state->s[index].uart_reg[offset]; + break; + + case UART_LSR: + { + u8 lsr = 0; + + mutex_lock(&mdev_state->rxtx_lock); + /* atleast one char in FIFO */ + if (mdev_state->s[index].rxtx.head != + mdev_state->s[index].rxtx.tail) + lsr |= UART_LSR_DR; + + /* if FIFO overrun */ + if (mdev_state->s[index].overrun) + lsr |= UART_LSR_OE; + + /* transmit FIFO empty and tramsitter empty */ + if (mdev_state->s[index].rxtx.head == + mdev_state->s[index].rxtx.tail) + lsr |= UART_LSR_TEMT | UART_LSR_THRE; + + mutex_unlock(&mdev_state->rxtx_lock); + *buf = lsr; + break; + } + case UART_MSR: + 
*buf = UART_MSR_DSR | UART_MSR_DDSR | UART_MSR_DCD; + + mutex_lock(&mdev_state->rxtx_lock); + /* if AFE is 1 and FIFO have space, set CTS bit */ + if (mdev_state->s[index].uart_reg[UART_MCR] & + UART_MCR_AFE) { + if (mdev_state->s[index].rxtx.count < + mdev_state->s[index].max_fifo_size) + *buf |= UART_MSR_CTS | UART_MSR_DCTS; + } else + *buf |= UART_MSR_CTS | UART_MSR_DCTS; + mutex_unlock(&mdev_state->rxtx_lock); + + break; + + case UART_SCR: + *buf = mdev_state->s[index].uart_reg[offset]; + break; + + default: + break; + } +} + +static void mdev_read_base(struct mdev_state *mdev_state) +{ + int index, pos; + u32 start_lo, start_hi; + u32 mem_type; + + pos = PCI_BASE_ADDRESS_0; + + for (index = 0; index <= VFIO_PCI_BAR5_REGION_INDEX; index++) { + + if (!mdev_state->region_info[index].size) + continue; + + start_lo = (*(u32 *)(mdev_state->vconfig + pos)) & + PCI_BASE_ADDRESS_MEM_MASK; + mem_type = (*(u32 *)(mdev_state->vconfig + pos)) & + PCI_BASE_ADDRESS_MEM_TYPE_MASK; + + switch (mem_type) { + case PCI_BASE_ADDRESS_MEM_TYPE_64: + start_hi = (*(u32 *)(mdev_state->vconfig + pos + 4)); + pos += 4; + break; + case PCI_BASE_ADDRESS_MEM_TYPE_32: + case PCI_BASE_ADDRESS_MEM_TYPE_1M: + /* 1M mem BAR treated as 32-bit BAR */ + default: + /* mem unknown type treated as 32-bit BAR */ + start_hi = 0; + break; + } + pos += 4; + mdev_state->region_info[index].start = ((u64)start_hi << 32) | + start_lo; + } +} + +static ssize_t mdev_access(struct mdev_device *mdev, u8 *buf, size_t count, + loff_t pos, bool is_write) +{ + struct mdev_state *mdev_state; + unsigned int index; + loff_t offset; + int ret = 0; + + if (!mdev || !buf) + return -EINVAL; + + mdev_state = mdev_get_drvdata(mdev); + if (!mdev_state) { + pr_err("%s mdev_state not found\n", __func__); + return -EINVAL; + } + + mutex_lock(&mdev_state->ops_lock); + + index = MTTY_VFIO_PCI_OFFSET_TO_INDEX(pos); + offset = pos & MTTY_VFIO_PCI_OFFSET_MASK; + switch (index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + +#if 
defined(DEBUG) + pr_info("%s: PCI config space %s at offset 0x%llx\n", + __func__, is_write ? "write" : "read", offset); +#endif + if (is_write) { + dump_buffer(buf, count); + handle_pci_cfg_write(mdev_state, offset, buf, count); + } else { + memcpy(buf, (mdev_state->vconfig + offset), count); + dump_buffer(buf, count); + } + + break; + + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: + if (!mdev_state->region_info[index].start) + mdev_read_base(mdev_state); + + if (is_write) { + dump_buffer(buf, count); + +#if defined(DEBUG_REGS) + pr_info("%s: BAR%d WR @0x%llx %s val:0x%02x dlab:%d\n", + __func__, index, offset, wr_reg[offset], + *buf, mdev_state->s[index].dlab); +#endif + handle_bar_write(index, mdev_state, offset, buf, count); + } else { + handle_bar_read(index, mdev_state, offset, buf, count); + dump_buffer(buf, count); + +#if defined(DEBUG_REGS) + pr_info("%s: BAR%d RD @0x%llx %s val:0x%02x dlab:%d\n", + __func__, index, offset, rd_reg[offset], + *buf, mdev_state->s[index].dlab); +#endif + } + break; + + default: + ret = -1; + goto accessfailed; + } + + ret = count; + + +accessfailed: + mutex_unlock(&mdev_state->ops_lock); + + return ret; +} + +static int mtty_create(struct kobject *kobj, struct mdev_device *mdev) +{ + struct mdev_state *mdev_state; + char name[MTTY_STRING_LEN]; + int nr_ports = 0, i; + + if (!mdev) + return -EINVAL; + + for (i = 0; i < 2; i++) { + snprintf(name, MTTY_STRING_LEN, "%s-%d", + dev_driver_string(mdev_parent_dev(mdev)), i + 1); + if (!strcmp(kobj->name, name)) { + nr_ports = i + 1; + break; + } + } + + if (!nr_ports) + return -EINVAL; + + mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); + if (mdev_state == NULL) + return -ENOMEM; + + mdev_state->nr_ports = nr_ports; + mdev_state->irq_index = -1; + mdev_state->s[0].max_fifo_size = MAX_FIFO_SIZE; + mdev_state->s[1].max_fifo_size = MAX_FIFO_SIZE; + mutex_init(&mdev_state->rxtx_lock); + mdev_state->vconfig = kzalloc(MTTY_CONFIG_SPACE_SIZE, GFP_KERNEL); + + 
if (mdev_state->vconfig == NULL) { + kfree(mdev_state); + return -ENOMEM; + } + + mutex_init(&mdev_state->ops_lock); + mdev_state->mdev = mdev; + mdev_set_drvdata(mdev, mdev_state); + + mtty_create_config_space(mdev_state); + + mutex_lock(&mdev_list_lock); + list_add(&mdev_state->next, &mdev_devices_list); + mutex_unlock(&mdev_list_lock); + + return 0; +} + +static int mtty_remove(struct mdev_device *mdev) +{ + struct mdev_state *mds, *tmp_mds; + struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + int ret = -EINVAL; + + mutex_lock(&mdev_list_lock); + list_for_each_entry_safe(mds, tmp_mds, &mdev_devices_list, next) { + if (mdev_state == mds) { + list_del(&mdev_state->next); + mdev_set_drvdata(mdev, NULL); + kfree(mdev_state->vconfig); + kfree(mdev_state); + ret = 0; + break; + } + } + mutex_unlock(&mdev_list_lock); + + return ret; +} + +static int mtty_reset(struct mdev_device *mdev) +{ + struct mdev_state *mdev_state; + + if (!mdev) + return -EINVAL; + + mdev_state = mdev_get_drvdata(mdev); + if (!mdev_state) + return -EINVAL; + + pr_info("%s: called\n", __func__); + + return 0; +} + +static ssize_t mtty_read(struct mdev_device *mdev, char __user *buf, + size_t count, loff_t *ppos) +{ + unsigned int done = 0; + int ret; + + while (count) { + size_t filled; + + if (count >= 4 && !(*ppos % 4)) { + u32 val; + + ret = mdev_access(mdev, (u8 *)&val, sizeof(val), + *ppos, false); + if (ret <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 4; + } else if (count >= 2 && !(*ppos % 2)) { + u16 val; + + ret = mdev_access(mdev, (u8 *)&val, sizeof(val), + *ppos, false); + if (ret <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 2; + } else { + u8 val; + + ret = mdev_access(mdev, (u8 *)&val, sizeof(val), + *ppos, false); + if (ret <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 1; + } + + count -= filled; + done += filled; + 
*ppos += filled; + buf += filled; + } + + return done; + +read_err: + return -EFAULT; +} + +static ssize_t mtty_write(struct mdev_device *mdev, const char __user *buf, + size_t count, loff_t *ppos) +{ + unsigned int done = 0; + int ret; + + while (count) { + size_t filled; + + if (count >= 4 && !(*ppos % 4)) { + u32 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + ret = mdev_access(mdev, (u8 *)&val, sizeof(val), + *ppos, true); + if (ret <= 0) + goto write_err; + + filled = 4; + } else if (count >= 2 && !(*ppos % 2)) { + u16 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + ret = mdev_access(mdev, (u8 *)&val, sizeof(val), + *ppos, true); + if (ret <= 0) + goto write_err; + + filled = 2; + } else { + u8 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + ret = mdev_access(mdev, (u8 *)&val, sizeof(val), + *ppos, true); + if (ret <= 0) + goto write_err; + + filled = 1; + } + count -= filled; + done += filled; + *ppos += filled; + buf += filled; + } + + return done; +write_err: + return -EFAULT; +} + +static int mtty_set_irqs(struct mdev_device *mdev, uint32_t flags, + unsigned int index, unsigned int start, + unsigned int count, void *data) +{ + int ret = 0; + struct mdev_state *mdev_state; + + if (!mdev) + return -EINVAL; + + mdev_state = mdev_get_drvdata(mdev); + if (!mdev_state) + return -EINVAL; + + mutex_lock(&mdev_state->ops_lock); + switch (index) { + case VFIO_PCI_INTX_IRQ_INDEX: + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_MASK: + case VFIO_IRQ_SET_ACTION_UNMASK: + break; + case VFIO_IRQ_SET_ACTION_TRIGGER: + { + if (flags & VFIO_IRQ_SET_DATA_NONE) { + pr_info("%s: disable INTx\n", __func__); + if (mdev_state->intx_evtfd) + eventfd_ctx_put(mdev_state->intx_evtfd); + break; + } + + if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { + int fd = *(int *)data; + + if (fd > 0) { + struct eventfd_ctx *evt; + + evt = eventfd_ctx_fdget(fd); + if (IS_ERR(evt)) { + ret = 
PTR_ERR(evt); + break; + } + mdev_state->intx_evtfd = evt; + mdev_state->irq_fd = fd; + mdev_state->irq_index = index; + break; + } + } + break; + } + } + break; + case VFIO_PCI_MSI_IRQ_INDEX: + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_MASK: + case VFIO_IRQ_SET_ACTION_UNMASK: + break; + case VFIO_IRQ_SET_ACTION_TRIGGER: + if (flags & VFIO_IRQ_SET_DATA_NONE) { + if (mdev_state->msi_evtfd) + eventfd_ctx_put(mdev_state->msi_evtfd); + pr_info("%s: disable MSI\n", __func__); + mdev_state->irq_index = VFIO_PCI_INTX_IRQ_INDEX; + break; + } + if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { + int fd = *(int *)data; + struct eventfd_ctx *evt; + + if (fd <= 0) + break; + + if (mdev_state->msi_evtfd) + break; + + evt = eventfd_ctx_fdget(fd); + if (IS_ERR(evt)) { + ret = PTR_ERR(evt); + break; + } + mdev_state->msi_evtfd = evt; + mdev_state->irq_fd = fd; + mdev_state->irq_index = index; + } + break; + } + break; + case VFIO_PCI_MSIX_IRQ_INDEX: + pr_info("%s: MSIX_IRQ\n", __func__); + break; + case VFIO_PCI_ERR_IRQ_INDEX: + pr_info("%s: ERR_IRQ\n", __func__); + break; + case VFIO_PCI_REQ_IRQ_INDEX: + pr_info("%s: REQ_IRQ\n", __func__); + break; + } + + mutex_unlock(&mdev_state->ops_lock); + return ret; +} + +static int mtty_trigger_interrupt(struct mdev_state *mdev_state) +{ + int ret = -1; + + if ((mdev_state->irq_index == VFIO_PCI_MSI_IRQ_INDEX) && + (!mdev_state->msi_evtfd)) + return -EINVAL; + else if ((mdev_state->irq_index == VFIO_PCI_INTX_IRQ_INDEX) && + (!mdev_state->intx_evtfd)) { + pr_info("%s: Intr eventfd not found\n", __func__); + return -EINVAL; + } + + if (mdev_state->irq_index == VFIO_PCI_MSI_IRQ_INDEX) + ret = eventfd_signal(mdev_state->msi_evtfd, 1); + else + ret = eventfd_signal(mdev_state->intx_evtfd, 1); + +#if defined(DEBUG_INTR) + pr_info("Intx triggered\n"); +#endif + if (ret != 1) + pr_err("%s: eventfd signal failed (%d)\n", __func__, ret); + + return ret; +} + +static int mtty_get_region_info(struct mdev_device *mdev, + 
struct vfio_region_info *region_info, + u16 *cap_type_id, void **cap_type) +{ + unsigned int size = 0; + struct mdev_state *mdev_state; + u32 bar_index; + + if (!mdev) + return -EINVAL; + + mdev_state = mdev_get_drvdata(mdev); + if (!mdev_state) + return -EINVAL; + + bar_index = region_info->index; + if (bar_index >= VFIO_PCI_NUM_REGIONS) + return -EINVAL; + + mutex_lock(&mdev_state->ops_lock); + + switch (bar_index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + size = MTTY_CONFIG_SPACE_SIZE; + break; + case VFIO_PCI_BAR0_REGION_INDEX: + size = MTTY_IO_BAR_SIZE; + break; + case VFIO_PCI_BAR1_REGION_INDEX: + if (mdev_state->nr_ports == 2) + size = MTTY_IO_BAR_SIZE; + break; + default: + size = 0; + break; + } + + mdev_state->region_info[bar_index].size = size; + mdev_state->region_info[bar_index].vfio_offset = + MTTY_VFIO_PCI_INDEX_TO_OFFSET(bar_index); + + region_info->size = size; + region_info->offset = MTTY_VFIO_PCI_INDEX_TO_OFFSET(bar_index); + region_info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + mutex_unlock(&mdev_state->ops_lock); + return 0; +} + +static int mtty_get_irq_info(struct mdev_device *mdev, + struct vfio_irq_info *irq_info) +{ + switch (irq_info->index) { + case VFIO_PCI_INTX_IRQ_INDEX: + case VFIO_PCI_MSI_IRQ_INDEX: + case VFIO_PCI_REQ_IRQ_INDEX: + break; + + default: + return -EINVAL; + } + + irq_info->flags = VFIO_IRQ_INFO_EVENTFD; + irq_info->count = 1; + + if (irq_info->index == VFIO_PCI_INTX_IRQ_INDEX) + irq_info->flags |= (VFIO_IRQ_INFO_MASKABLE | + VFIO_IRQ_INFO_AUTOMASKED); + else + irq_info->flags |= VFIO_IRQ_INFO_NORESIZE; + + return 0; +} + +static int mtty_get_device_info(struct mdev_device *mdev, + struct vfio_device_info *dev_info) +{ + dev_info->flags = VFIO_DEVICE_FLAGS_PCI; + dev_info->num_regions = VFIO_PCI_NUM_REGIONS; + dev_info->num_irqs = VFIO_PCI_NUM_IRQS; + + return 0; +} + +static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd, + unsigned long arg) +{ + int ret = 0; + unsigned long 
minsz; + struct mdev_state *mdev_state; + + if (!mdev) + return -EINVAL; + + mdev_state = mdev_get_drvdata(mdev); + if (!mdev_state) + return -ENODEV; + + switch (cmd) { + case VFIO_DEVICE_GET_INFO: + { + struct vfio_device_info info; + + minsz = offsetofend(struct vfio_device_info, num_irqs); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + ret = mtty_get_device_info(mdev, &info); + if (ret) + return ret; + + memcpy(&mdev_state->dev_info, &info, sizeof(info)); + + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + + return 0; + } + case VFIO_DEVICE_GET_REGION_INFO: + { + struct vfio_region_info info; + u16 cap_type_id = 0; + void *cap_type = NULL; + + minsz = offsetofend(struct vfio_region_info, offset); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + ret = mtty_get_region_info(mdev, &info, &cap_type_id, + &cap_type); + if (ret) + return ret; + + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + + return 0; + } + + case VFIO_DEVICE_GET_IRQ_INFO: + { + struct vfio_irq_info info; + + minsz = offsetofend(struct vfio_irq_info, count); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if ((info.argsz < minsz) || + (info.index >= mdev_state->dev_info.num_irqs)) + return -EINVAL; + + ret = mtty_get_irq_info(mdev, &info); + if (ret) + return ret; + + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + + return 0; + } + case VFIO_DEVICE_SET_IRQS: + { + struct vfio_irq_set hdr; + u8 *data = NULL, *ptr = NULL; + size_t data_size = 0; + + minsz = offsetofend(struct vfio_irq_set, count); + + if (copy_from_user(&hdr, (void __user *)arg, minsz)) + return -EFAULT; + + ret = vfio_set_irqs_validate_and_prepare(&hdr, + mdev_state->dev_info.num_irqs, + VFIO_PCI_NUM_IRQS, + &data_size); + if (ret) + return ret; + + if (data_size) { + ptr = 
data = memdup_user((void __user *)(arg + minsz), + data_size); + if (IS_ERR(data)) + return PTR_ERR(data); + } + + ret = mtty_set_irqs(mdev, hdr.flags, hdr.index, hdr.start, + hdr.count, data); + + kfree(ptr); + return ret; + } + case VFIO_DEVICE_RESET: + return mtty_reset(mdev); + } + return -ENOTTY; +} + +static int mtty_open(struct mdev_device *mdev) +{ + pr_info("%s\n", __func__); + return 0; +} + +static void mtty_close(struct mdev_device *mdev) +{ + pr_info("%s\n", __func__); +} + +static ssize_t +sample_mtty_dev_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "This is phy device\n"); +} + +static DEVICE_ATTR_RO(sample_mtty_dev); + +static struct attribute *mtty_dev_attrs[] = { + &dev_attr_sample_mtty_dev.attr, + NULL, +}; + +static const struct attribute_group mtty_dev_group = { + .name = "mtty_dev", + .attrs = mtty_dev_attrs, +}; + +static const struct attribute_group *mtty_dev_groups[] = { + &mtty_dev_group, + NULL, +}; + +static ssize_t +sample_mdev_dev_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + if (mdev_from_dev(dev)) + return sprintf(buf, "This is MDEV %s\n", dev_name(dev)); + + return sprintf(buf, "\n"); +} + +static DEVICE_ATTR_RO(sample_mdev_dev); + +static struct attribute *mdev_dev_attrs[] = { + &dev_attr_sample_mdev_dev.attr, + NULL, +}; + +static const struct attribute_group mdev_dev_group = { + .name = "vendor", + .attrs = mdev_dev_attrs, +}; + +static const struct attribute_group *mdev_dev_groups[] = { + &mdev_dev_group, + NULL, +}; + +static ssize_t +name_show(struct kobject *kobj, struct device *dev, char *buf) +{ + char name[MTTY_STRING_LEN]; + int i; + const char *name_str[2] = {"Single port serial", "Dual port serial"}; + + for (i = 0; i < 2; i++) { + snprintf(name, MTTY_STRING_LEN, "%s-%d", + dev_driver_string(dev), i + 1); + if (!strcmp(kobj->name, name)) + return sprintf(buf, "%s\n", name_str[i]); + } + + return -EINVAL; +} + +static MDEV_TYPE_ATTR_RO(name); + 
+static ssize_t +available_instances_show(struct kobject *kobj, struct device *dev, char *buf) +{ + char name[MTTY_STRING_LEN]; + int i; + struct mdev_state *mds; + int ports = 0, used = 0; + + for (i = 0; i < 2; i++) { + snprintf(name, MTTY_STRING_LEN, "%s-%d", + dev_driver_string(dev), i + 1); + if (!strcmp(kobj->name, name)) { + ports = i + 1; + break; + } + } + + if (!ports) + return -EINVAL; + + list_for_each_entry(mds, &mdev_devices_list, next) + used += mds->nr_ports; + + return sprintf(buf, "%d\n", (MAX_MTTYS - used)/ports); +} + +static MDEV_TYPE_ATTR_RO(available_instances); + + +static ssize_t device_api_show(struct kobject *kobj, struct device *dev, + char *buf) +{ + return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); +} + +static MDEV_TYPE_ATTR_RO(device_api); + +static struct attribute *mdev_types_attrs[] = { + &mdev_type_attr_name.attr, + &mdev_type_attr_device_api.attr, + &mdev_type_attr_available_instances.attr, + NULL, +}; + +static struct attribute_group mdev_type_group1 = { + .name = "1", + .attrs = mdev_types_attrs, +}; + +static struct attribute_group mdev_type_group2 = { + .name = "2", + .attrs = mdev_types_attrs, +}; + +static struct attribute_group *mdev_type_groups[] = { + &mdev_type_group1, + &mdev_type_group2, + NULL, +}; + +static const struct mdev_parent_ops mdev_fops = { + .owner = THIS_MODULE, + .dev_attr_groups = mtty_dev_groups, + .mdev_attr_groups = mdev_dev_groups, + .supported_type_groups = mdev_type_groups, + .create = mtty_create, + .remove = mtty_remove, + .open = mtty_open, + .release = mtty_close, + .read = mtty_read, + .write = mtty_write, + .ioctl = mtty_ioctl, +}; + +static void mtty_device_release(struct device *dev) +{ + dev_dbg(dev, "mtty: released\n"); +} + +static int __init mtty_dev_init(void) +{ + int ret = 0; + + pr_info("mtty_dev: %s\n", __func__); + + memset(&mtty_dev, 0, sizeof(mtty_dev)); + + idr_init(&mtty_dev.vd_idr); + + ret = alloc_chrdev_region(&mtty_dev.vd_devt, 0, MINORMASK + 1, + MTTY_NAME); + + 
if (ret < 0) { + pr_err("Error: failed to register mtty_dev, err:%d\n", ret); + return ret; + } + + cdev_init(&mtty_dev.vd_cdev, &vd_fops); + cdev_add(&mtty_dev.vd_cdev, mtty_dev.vd_devt, MINORMASK + 1); + + pr_info("major_number:%d\n", MAJOR(mtty_dev.vd_devt)); + + mtty_dev.vd_class = class_create(THIS_MODULE, MTTY_CLASS_NAME); + + if (IS_ERR(mtty_dev.vd_class)) { + pr_err("Error: failed to register mtty_dev class\n"); + ret = PTR_ERR(mtty_dev.vd_class); + goto failed1; + } + + mtty_dev.dev.class = mtty_dev.vd_class; + mtty_dev.dev.release = mtty_device_release; + dev_set_name(&mtty_dev.dev, "%s", MTTY_NAME); + + ret = device_register(&mtty_dev.dev); + if (ret) + goto failed2; + + ret = mdev_register_device(&mtty_dev.dev, &mdev_fops); + if (ret) + goto failed3; + + mutex_init(&mdev_list_lock); + INIT_LIST_HEAD(&mdev_devices_list); + + goto all_done; + +failed3: + + device_unregister(&mtty_dev.dev); +failed2: + class_destroy(mtty_dev.vd_class); + +failed1: + cdev_del(&mtty_dev.vd_cdev); + unregister_chrdev_region(mtty_dev.vd_devt, MINORMASK + 1); + +all_done: + return ret; +} + +static void __exit mtty_dev_exit(void) +{ + mtty_dev.dev.bus = NULL; + mdev_unregister_device(&mtty_dev.dev); + + device_unregister(&mtty_dev.dev); + idr_destroy(&mtty_dev.vd_idr); + cdev_del(&mtty_dev.vd_cdev); + unregister_chrdev_region(mtty_dev.vd_devt, MINORMASK + 1); + class_destroy(mtty_dev.vd_class); + mtty_dev.vd_class = NULL; + pr_info("mtty_dev: Unloaded!\n"); +} + +module_init(mtty_dev_init) +module_exit(mtty_dev_exit) + +MODULE_LICENSE("GPL v2"); +MODULE_INFO(supported, "Test driver that simulate serial port over PCI"); +MODULE_VERSION(VERSION_STRING); +MODULE_AUTHOR(DRIVER_AUTHOR); diff --git a/samples/vfs/.gitignore b/samples/vfs/.gitignore new file mode 100644 index 000000000..8fdabf7e5 --- /dev/null +++ b/samples/vfs/.gitignore @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +test-fsmount +test-statx diff --git a/samples/vfs/Makefile b/samples/vfs/Makefile new file 
mode 100644 index 000000000..6377a6781 --- /dev/null +++ b/samples/vfs/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +userprogs-always-y += test-fsmount test-statx + +userccflags += -I usr/include diff --git a/samples/vfs/test-fsmount.c b/samples/vfs/test-fsmount.c new file mode 100644 index 000000000..50f47b72e --- /dev/null +++ b/samples/vfs/test-fsmount.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* fd-based mount test. + * + * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include <sys/prctl.h> +#include <sys/wait.h> +#include <linux/mount.h> +#include <linux/unistd.h> + +#define E(x) do { if ((x) == -1) { perror(#x); exit(1); } } while(0) + +static void check_messages(int fd) +{ + char buf[4096]; + int err, n; + + err = errno; + + for (;;) { + n = read(fd, buf, sizeof(buf)); + if (n < 0) + break; + n -= 2; + + switch (buf[0]) { + case 'e': + fprintf(stderr, "Error: %*.*s\n", n, n, buf + 2); + break; + case 'w': + fprintf(stderr, "Warning: %*.*s\n", n, n, buf + 2); + break; + case 'i': + fprintf(stderr, "Info: %*.*s\n", n, n, buf + 2); + break; + } + } + + errno = err; +} + +static __attribute__((noreturn)) +void mount_error(int fd, const char *s) +{ + check_messages(fd); + fprintf(stderr, "%s: %m\n", s); + exit(1); +} + +/* Hope -1 isn't a syscall */ +#ifndef __NR_fsopen +#define __NR_fsopen -1 +#endif +#ifndef __NR_fsmount +#define __NR_fsmount -1 +#endif +#ifndef __NR_fsconfig +#define __NR_fsconfig -1 +#endif +#ifndef __NR_move_mount +#define __NR_move_mount -1 +#endif + + +static inline int fsopen(const char *fs_name, unsigned int flags) +{ + return syscall(__NR_fsopen, fs_name, flags); +} + +static inline int fsmount(int fsfd, unsigned int flags, unsigned int ms_flags) +{ + return syscall(__NR_fsmount, fsfd, flags, ms_flags); +} + +static inline 
int fsconfig(int fsfd, unsigned int cmd, + const char *key, const void *val, int aux) +{ + return syscall(__NR_fsconfig, fsfd, cmd, key, val, aux); +} + +static inline int move_mount(int from_dfd, const char *from_pathname, + int to_dfd, const char *to_pathname, + unsigned int flags) +{ + return syscall(__NR_move_mount, + from_dfd, from_pathname, + to_dfd, to_pathname, flags); +} + +#define E_fsconfig(fd, cmd, key, val, aux) \ + do { \ + if (fsconfig(fd, cmd, key, val, aux) == -1) \ + mount_error(fd, key ?: "create"); \ + } while (0) + +int main(int argc, char *argv[]) +{ + int fsfd, mfd; + + /* Mount a publically available AFS filesystem */ + fsfd = fsopen("afs", 0); + if (fsfd == -1) { + perror("fsopen"); + exit(1); + } + + E_fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "#grand.central.org:root.cell.", 0); + E_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0); + + mfd = fsmount(fsfd, 0, MOUNT_ATTR_RDONLY); + if (mfd < 0) + mount_error(fsfd, "fsmount"); + E(close(fsfd)); + + if (move_mount(mfd, "", AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH) < 0) { + perror("move_mount"); + exit(1); + } + + E(close(mfd)); + exit(0); +} diff --git a/samples/vfs/test-statx.c b/samples/vfs/test-statx.c new file mode 100644 index 000000000..49c7a46ce --- /dev/null +++ b/samples/vfs/test-statx.c @@ -0,0 +1,265 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Test the statx() system call. + * + * Note that the output of this program is intended to look like the output of + * /bin/stat where possible. + * + * Copyright (C) 2015 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + */ + +#define _GNU_SOURCE +#define _ATFILE_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <ctype.h> +#include <errno.h> +#include <time.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <linux/stat.h> +#include <linux/fcntl.h> +#define statx foo +#define statx_timestamp foo_timestamp +struct statx; +struct statx_timestamp; +#include <sys/stat.h> +#undef statx +#undef statx_timestamp + +#define AT_STATX_SYNC_TYPE 0x6000 +#define AT_STATX_SYNC_AS_STAT 0x0000 +#define AT_STATX_FORCE_SYNC 0x2000 +#define AT_STATX_DONT_SYNC 0x4000 + +#ifndef __NR_statx +#define __NR_statx -1 +#endif + +static __attribute__((unused)) +ssize_t statx(int dfd, const char *filename, unsigned flags, + unsigned int mask, struct statx *buffer) +{ + return syscall(__NR_statx, dfd, filename, flags, mask, buffer); +} + +static void print_time(const char *field, struct statx_timestamp *ts) +{ + struct tm tm; + time_t tim; + char buffer[100]; + int len; + + tim = ts->tv_sec; + if (!localtime_r(&tim, &tm)) { + perror("localtime_r"); + exit(1); + } + len = strftime(buffer, 100, "%F %T", &tm); + if (len == 0) { + perror("strftime"); + exit(1); + } + printf("%s", field); + fwrite(buffer, 1, len, stdout); + printf(".%09u", ts->tv_nsec); + len = strftime(buffer, 100, "%z", &tm); + if (len == 0) { + perror("strftime2"); + exit(1); + } + fwrite(buffer, 1, len, stdout); + printf("\n"); +} + +static void dump_statx(struct statx *stx) +{ + char buffer[256], ft = '?'; + + printf("results=%x\n", stx->stx_mask); + + printf(" "); + if (stx->stx_mask & STATX_SIZE) + printf(" Size: %-15llu", (unsigned long long)stx->stx_size); + if (stx->stx_mask & STATX_BLOCKS) + printf(" Blocks: %-10llu", (unsigned long long)stx->stx_blocks); + printf(" IO Block: %-6llu", (unsigned long long)stx->stx_blksize); + if (stx->stx_mask & STATX_TYPE) { + switch (stx->stx_mode & S_IFMT) { + case S_IFIFO: printf(" 
FIFO\n"); ft = 'p'; break; + case S_IFCHR: printf(" character special file\n"); ft = 'c'; break; + case S_IFDIR: printf(" directory\n"); ft = 'd'; break; + case S_IFBLK: printf(" block special file\n"); ft = 'b'; break; + case S_IFREG: printf(" regular file\n"); ft = '-'; break; + case S_IFLNK: printf(" symbolic link\n"); ft = 'l'; break; + case S_IFSOCK: printf(" socket\n"); ft = 's'; break; + default: + printf(" unknown type (%o)\n", stx->stx_mode & S_IFMT); + break; + } + } else { + printf(" no type\n"); + } + + sprintf(buffer, "%02x:%02x", stx->stx_dev_major, stx->stx_dev_minor); + printf("Device: %-15s", buffer); + if (stx->stx_mask & STATX_INO) + printf(" Inode: %-11llu", (unsigned long long) stx->stx_ino); + if (stx->stx_mask & STATX_NLINK) + printf(" Links: %-5u", stx->stx_nlink); + if (stx->stx_mask & STATX_TYPE) { + switch (stx->stx_mode & S_IFMT) { + case S_IFBLK: + case S_IFCHR: + printf(" Device type: %u,%u", + stx->stx_rdev_major, stx->stx_rdev_minor); + break; + } + } + printf("\n"); + + if (stx->stx_mask & STATX_MODE) + printf("Access: (%04o/%c%c%c%c%c%c%c%c%c%c) ", + stx->stx_mode & 07777, + ft, + stx->stx_mode & S_IRUSR ? 'r' : '-', + stx->stx_mode & S_IWUSR ? 'w' : '-', + stx->stx_mode & S_IXUSR ? 'x' : '-', + stx->stx_mode & S_IRGRP ? 'r' : '-', + stx->stx_mode & S_IWGRP ? 'w' : '-', + stx->stx_mode & S_IXGRP ? 'x' : '-', + stx->stx_mode & S_IROTH ? 'r' : '-', + stx->stx_mode & S_IWOTH ? 'w' : '-', + stx->stx_mode & S_IXOTH ? 
'x' : '-'); + if (stx->stx_mask & STATX_UID) + printf("Uid: %5d ", stx->stx_uid); + if (stx->stx_mask & STATX_GID) + printf("Gid: %5d\n", stx->stx_gid); + + if (stx->stx_mask & STATX_ATIME) + print_time("Access: ", &stx->stx_atime); + if (stx->stx_mask & STATX_MTIME) + print_time("Modify: ", &stx->stx_mtime); + if (stx->stx_mask & STATX_CTIME) + print_time("Change: ", &stx->stx_ctime); + if (stx->stx_mask & STATX_BTIME) + print_time(" Birth: ", &stx->stx_btime); + + if (stx->stx_attributes_mask) { + unsigned char bits, mbits; + int loop, byte; + + static char attr_representation[64 + 1] = + /* STATX_ATTR_ flags: */ + "????????" /* 63-56 */ + "????????" /* 55-48 */ + "????????" /* 47-40 */ + "????????" /* 39-32 */ + "????????" /* 31-24 0x00000000-ff000000 */ + "????????" /* 23-16 0x00000000-00ff0000 */ + "???me???" /* 15- 8 0x00000000-0000ff00 */ + "?dai?c??" /* 7- 0 0x00000000-000000ff */ + ; + + printf("Attributes: %016llx (", + (unsigned long long)stx->stx_attributes); + for (byte = 64 - 8; byte >= 0; byte -= 8) { + bits = stx->stx_attributes >> byte; + mbits = stx->stx_attributes_mask >> byte; + for (loop = 7; loop >= 0; loop--) { + int bit = byte + loop; + + if (!(mbits & 0x80)) + putchar('.'); /* Not supported */ + else if (bits & 0x80) + putchar(attr_representation[63 - bit]); + else + putchar('-'); /* Not set */ + bits <<= 1; + mbits <<= 1; + } + if (byte) + putchar(' '); + } + printf(")\n"); + } +} + +static void dump_hex(unsigned long long *data, int from, int to) +{ + unsigned offset, print_offset = 1, col = 0; + + from /= 8; + to = (to + 7) / 8; + + for (offset = from; offset < to; offset++) { + if (print_offset) { + printf("%04x: ", offset * 8); + print_offset = 0; + } + printf("%016llx", data[offset]); + col++; + if ((col & 3) == 0) { + printf("\n"); + print_offset = 1; + } else { + printf(" "); + } + } + + if (!print_offset) + printf("\n"); +} + +int main(int argc, char **argv) +{ + struct statx stx; + int ret, raw = 0, atflag = AT_SYMLINK_NOFOLLOW; + 
+ unsigned int mask = STATX_BASIC_STATS | STATX_BTIME; + + for (argv++; *argv; argv++) { + if (strcmp(*argv, "-F") == 0) { + atflag &= ~AT_STATX_SYNC_TYPE; + atflag |= AT_STATX_FORCE_SYNC; + continue; + } + if (strcmp(*argv, "-D") == 0) { + atflag &= ~AT_STATX_SYNC_TYPE; + atflag |= AT_STATX_DONT_SYNC; + continue; + } + if (strcmp(*argv, "-L") == 0) { + atflag &= ~AT_SYMLINK_NOFOLLOW; + continue; + } + if (strcmp(*argv, "-O") == 0) { + mask &= ~STATX_BASIC_STATS; + continue; + } + if (strcmp(*argv, "-A") == 0) { + atflag |= AT_NO_AUTOMOUNT; + continue; + } + if (strcmp(*argv, "-R") == 0) { + raw = 1; + continue; + } + + memset(&stx, 0xbf, sizeof(stx)); + ret = statx(AT_FDCWD, *argv, atflag, mask, &stx); + printf("statx(%s) = %d\n", *argv, ret); + if (ret < 0) { + perror(*argv); + exit(1); + } + + if (raw) + dump_hex((unsigned long long *)&stx, 0, sizeof(stx)); + + dump_statx(&stx); + } + return 0; +} diff --git a/samples/watch_queue/.gitignore b/samples/watch_queue/.gitignore new file mode 100644 index 000000000..2aa3c7e56 --- /dev/null +++ b/samples/watch_queue/.gitignore @@ -0,0 +1 @@ +watch_test diff --git a/samples/watch_queue/Makefile b/samples/watch_queue/Makefile new file mode 100644 index 000000000..c0db3a6bc --- /dev/null +++ b/samples/watch_queue/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +userprogs-always-y += watch_test + +userccflags += -I usr/include diff --git a/samples/watch_queue/watch_test.c b/samples/watch_queue/watch_test.c new file mode 100644 index 000000000..8c6cb57d5 --- /dev/null +++ b/samples/watch_queue/watch_test.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Use watch_queue API to watch for notifications. + * + * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + */ + +#define _GNU_SOURCE +#include <stdbool.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <signal.h> +#include <unistd.h> +#include <errno.h> +#include <sys/ioctl.h> +#include <limits.h> +#include <linux/watch_queue.h> +#include <linux/unistd.h> +#include <linux/keyctl.h> + +#ifndef KEYCTL_WATCH_KEY +#define KEYCTL_WATCH_KEY -1 +#endif +#ifndef __NR_keyctl +#define __NR_keyctl -1 +#endif + +#define BUF_SIZE 256 + +static long keyctl_watch_key(int key, int watch_fd, int watch_id) +{ + return syscall(__NR_keyctl, KEYCTL_WATCH_KEY, key, watch_fd, watch_id); +} + +static const char *key_subtypes[256] = { + [NOTIFY_KEY_INSTANTIATED] = "instantiated", + [NOTIFY_KEY_UPDATED] = "updated", + [NOTIFY_KEY_LINKED] = "linked", + [NOTIFY_KEY_UNLINKED] = "unlinked", + [NOTIFY_KEY_CLEARED] = "cleared", + [NOTIFY_KEY_REVOKED] = "revoked", + [NOTIFY_KEY_INVALIDATED] = "invalidated", + [NOTIFY_KEY_SETATTR] = "setattr", +}; + +static void saw_key_change(struct watch_notification *n, size_t len) +{ + struct key_notification *k = (struct key_notification *)n; + + if (len != sizeof(struct key_notification)) { + fprintf(stderr, "Incorrect key message length\n"); + return; + } + + printf("KEY %08x change=%u[%s] aux=%u\n", + k->key_id, n->subtype, key_subtypes[n->subtype], k->aux); +} + +/* + * Consume and display events. 
+ */ +static void consumer(int fd) +{ + unsigned char buffer[433], *p, *end; + union { + struct watch_notification n; + unsigned char buf1[128]; + } n; + ssize_t buf_len; + + for (;;) { + buf_len = read(fd, buffer, sizeof(buffer)); + if (buf_len == -1) { + perror("read"); + exit(1); + } + + if (buf_len == 0) { + printf("-- END --\n"); + return; + } + + if (buf_len > sizeof(buffer)) { + fprintf(stderr, "Read buffer overrun: %zd\n", buf_len); + return; + } + + printf("read() = %zd\n", buf_len); + + p = buffer; + end = buffer + buf_len; + while (p < end) { + size_t largest, len; + + largest = end - p; + if (largest > 128) + largest = 128; + if (largest < sizeof(struct watch_notification)) { + fprintf(stderr, "Short message header: %zu\n", largest); + return; + } + memcpy(&n, p, largest); + + printf("NOTIFY[%03zx]: ty=%06x sy=%02x i=%08x\n", + p - buffer, n.n.type, n.n.subtype, n.n.info); + + len = n.n.info & WATCH_INFO_LENGTH; + if (len < sizeof(n.n) || len > largest) { + fprintf(stderr, "Bad message length: %zu/%zu\n", len, largest); + exit(1); + } + + switch (n.n.type) { + case WATCH_TYPE_META: + switch (n.n.subtype) { + case WATCH_META_REMOVAL_NOTIFICATION: + printf("REMOVAL of watchpoint %08x\n", + (n.n.info & WATCH_INFO_ID) >> + WATCH_INFO_ID__SHIFT); + break; + case WATCH_META_LOSS_NOTIFICATION: + printf("-- LOSS --\n"); + break; + default: + printf("other meta record\n"); + break; + } + break; + case WATCH_TYPE_KEY_NOTIFY: + saw_key_change(&n.n, len); + break; + default: + printf("other type\n"); + break; + } + + p += len; + } + } +} + +static struct watch_notification_filter filter = { + .nr_filters = 1, + .filters = { + [0] = { + .type = WATCH_TYPE_KEY_NOTIFY, + .subtype_filter[0] = UINT_MAX, + }, + }, +}; + +int main(int argc, char **argv) +{ + int pipefd[2], fd; + + if (pipe2(pipefd, O_NOTIFICATION_PIPE) == -1) { + perror("pipe2"); + exit(1); + } + fd = pipefd[0]; + + if (ioctl(fd, IOC_WATCH_QUEUE_SET_SIZE, BUF_SIZE) == -1) { + perror("watch_queue(size)"); 
+ exit(1); + } + + if (ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter) == -1) { + perror("watch_queue(filter)"); + exit(1); + } + + if (keyctl_watch_key(KEY_SPEC_SESSION_KEYRING, fd, 0x01) == -1) { + perror("keyctl"); + exit(1); + } + + if (keyctl_watch_key(KEY_SPEC_USER_KEYRING, fd, 0x02) == -1) { + perror("keyctl"); + exit(1); + } + + consumer(fd); + exit(0); +} diff --git a/samples/watchdog/.gitignore b/samples/watchdog/.gitignore new file mode 100644 index 000000000..74153b831 --- /dev/null +++ b/samples/watchdog/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +watchdog-simple diff --git a/samples/watchdog/Makefile b/samples/watchdog/Makefile new file mode 100644 index 000000000..ab39d23dc --- /dev/null +++ b/samples/watchdog/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +userprogs-always-y += watchdog-simple diff --git a/samples/watchdog/watchdog-simple.c b/samples/watchdog/watchdog-simple.c new file mode 100644 index 000000000..9ce66d2ca --- /dev/null +++ b/samples/watchdog/watchdog-simple.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> + +int main(void) +{ + int fd = open("/dev/watchdog", O_WRONLY); + int ret = 0; + if (fd == -1) { + perror("watchdog"); + exit(EXIT_FAILURE); + } + while (1) { + ret = write(fd, "\0", 1); + if (ret != 1) { + ret = -1; + break; + } + sleep(10); + } + close(fd); + return ret; +} |