From 3afb00d3f86d3d924f88b56fa8285d4e9db85852 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 7 Aug 2024 15:17:52 +0200 Subject: Merging upstream version 6.10.3. Signed-off-by: Daniel Baumann --- tools/arch/arm64/include/asm/cputype.h | 6 + .../x86/dell-uart-backlight-emulator/.gitignore | 1 + .../arch/x86/dell-uart-backlight-emulator/Makefile | 19 + tools/arch/x86/dell-uart-backlight-emulator/README | 46 + .../dell-uart-backlight-emulator.c | 163 ++ tools/arch/x86/include/asm/cpufeatures.h | 7 +- tools/arch/x86/include/asm/inat.h | 17 +- tools/arch/x86/include/asm/insn.h | 32 +- tools/arch/x86/include/asm/irq_vectors.h | 140 -- tools/arch/x86/include/asm/msr-index.h | 18 +- tools/arch/x86/include/uapi/asm/kvm.h | 22 +- tools/arch/x86/include/uapi/asm/prctl.h | 43 - tools/arch/x86/intel_sdsi/intel_sdsi.c | 66 +- tools/arch/x86/lib/insn.c | 29 + tools/arch/x86/lib/x86-opcode-map.txt | 309 ++- tools/arch/x86/tools/gen-insn-attr-x86.awk | 15 +- tools/bpf/bpftool/Documentation/Makefile | 6 +- tools/bpf/bpftool/Documentation/bpftool-btf.rst | 104 +- tools/bpf/bpftool/Documentation/bpftool-cgroup.rst | 219 +- .../bpf/bpftool/Documentation/bpftool-feature.rst | 115 +- tools/bpf/bpftool/Documentation/bpftool-gen.rst | 338 ++- tools/bpf/bpftool/Documentation/bpftool-iter.rst | 60 +- tools/bpf/bpftool/Documentation/bpftool-link.rst | 73 +- tools/bpf/bpftool/Documentation/bpftool-map.rst | 232 +- tools/bpf/bpftool/Documentation/bpftool-net.rst | 112 +- tools/bpf/bpftool/Documentation/bpftool-perf.rst | 34 +- tools/bpf/bpftool/Documentation/bpftool-prog.rst | 436 ++-- .../bpftool/Documentation/bpftool-struct_ops.rst | 81 +- tools/bpf/bpftool/Documentation/bpftool.rst | 60 +- tools/bpf/bpftool/Documentation/common_options.rst | 26 +- tools/bpf/bpftool/Makefile | 16 +- tools/bpf/bpftool/bash-completion/bpftool | 61 +- tools/bpf/bpftool/common.c | 2 +- tools/bpf/bpftool/feature.c | 3 +- tools/bpf/bpftool/gen.c | 5 +- tools/bpf/bpftool/link.c | 9 + 
tools/bpf/bpftool/pids.c | 19 +- tools/bpf/bpftool/prog.c | 6 +- tools/bpf/resolve_btfids/main.c | 2 +- tools/cgroup/memcg_slabinfo.py | 5 +- tools/hv/Build | 3 +- tools/hv/Makefile | 15 +- tools/hv/hv_fcopy_daemon.c | 266 --- tools/hv/hv_fcopy_uio_daemon.c | 490 ++++ tools/hv/vmbus_bufring.c | 318 +++ tools/hv/vmbus_bufring.h | 158 ++ tools/include/asm-generic/bitops/__ffs.h | 4 +- tools/include/asm-generic/bitops/__fls.h | 4 +- tools/include/linux/align.h | 12 + tools/include/linux/bitmap.h | 9 +- tools/include/linux/bitops.h | 4 +- tools/include/linux/bits.h | 8 +- tools/include/linux/compiler.h | 4 + tools/include/linux/filter.h | 18 + tools/include/linux/mm.h | 5 +- tools/include/linux/rbtree_augmented.h | 4 +- tools/include/nolibc/string.h | 46 +- tools/include/nolibc/sys.h | 27 + tools/include/uapi/asm-generic/bitsperlong.h | 4 + tools/include/uapi/asm-generic/fcntl.h | 221 -- tools/include/uapi/asm-generic/unistd.h | 5 +- tools/include/uapi/drm/i915_drm.h | 31 +- tools/include/uapi/linux/bits.h | 15 + tools/include/uapi/linux/bpf.h | 42 +- tools/include/uapi/linux/ethtool.h | 104 - tools/include/uapi/linux/fcntl.h | 123 - tools/include/uapi/linux/fs.h | 396 ---- tools/include/uapi/linux/kvm.h | 4 +- tools/include/uapi/linux/memfd.h | 39 + tools/include/uapi/linux/mount.h | 211 -- tools/include/uapi/linux/netdev.h | 22 + tools/include/uapi/linux/openat2.h | 43 - tools/include/uapi/linux/prctl.h | 309 --- tools/include/uapi/linux/sched.h | 148 -- tools/include/uapi/linux/stat.h | 4 +- tools/include/uapi/linux/usbdevice_fs.h | 231 -- tools/include/uapi/linux/userfaultfd.h | 386 ++++ tools/include/uapi/linux/vhost.h | 230 -- tools/include/uapi/sound/asound.h | 1252 ----------- tools/lib/bpf/bpf.c | 17 +- tools/lib/bpf/bpf.h | 9 + tools/lib/bpf/bpf_core_read.h | 2 +- tools/lib/bpf/bpf_helpers.h | 21 +- tools/lib/bpf/bpf_tracing.h | 70 +- tools/lib/bpf/btf.c | 2 +- tools/lib/bpf/btf_dump.c | 13 +- tools/lib/bpf/libbpf.c | 252 ++- tools/lib/bpf/libbpf.h | 29 +- 
tools/lib/bpf/libbpf.map | 9 + tools/lib/bpf/libbpf_internal.h | 15 +- tools/lib/bpf/libbpf_probes.c | 6 +- tools/lib/bpf/libbpf_version.h | 2 +- tools/lib/bpf/linker.c | 11 +- tools/lib/bpf/ringbuf.c | 53 +- tools/lib/bpf/str_error.c | 16 +- tools/lib/bpf/usdt.bpf.h | 24 +- tools/lib/perf/cpumap.c | 33 +- tools/lib/perf/include/perf/cpumap.h | 16 + tools/lib/perf/libperf.map | 4 + tools/lib/perf/mmap.c | 2 +- tools/lib/rbtree.c | 2 +- tools/lib/subcmd/parse-options.c | 36 +- tools/lib/subcmd/run-command.c | 70 +- tools/lib/subcmd/run-command.h | 3 + tools/memory-model/lock.cat | 20 +- tools/net/ynl/cli.py | 34 +- tools/net/ynl/ethtool.py | 19 +- tools/net/ynl/lib/nlspec.py | 2 + tools/net/ynl/lib/ynl.h | 12 + tools/net/ynl/lib/ynl.py | 162 +- tools/net/ynl/samples/netdev.c | 2 + tools/net/ynl/ynl-gen-c.py | 22 +- tools/net/ynl/ynl-gen-rst.py | 62 +- tools/objtool/noreturns.h | 4 + tools/perf/Build | 14 + tools/perf/Documentation/perf-arm-spe.txt | 12 +- tools/perf/Documentation/perf-report.txt | 9 +- tools/perf/Documentation/perf-sched.txt | 36 + tools/perf/Documentation/perf-script.txt | 7 +- tools/perf/Documentation/perf-test.txt | 13 +- tools/perf/Makefile.config | 25 +- tools/perf/Makefile.perf | 94 +- tools/perf/arch/arm/util/cs-etm.c | 381 ++-- tools/perf/arch/arm64/util/arm-spe.c | 4 +- tools/perf/arch/arm64/util/header.c | 13 +- .../perf/arch/mips/entry/syscalls/syscall_n64.tbl | 1 + tools/perf/arch/powerpc/entry/syscalls/syscall.tbl | 1 + tools/perf/arch/powerpc/util/skip-callchain-idx.c | 8 +- tools/perf/arch/s390/entry/syscalls/syscall.tbl | 1 + tools/perf/arch/x86/Build | 14 + tools/perf/arch/x86/entry/syscalls/syscall_64.tbl | 3 +- tools/perf/arch/x86/tests/Build | 14 + tools/perf/arch/x86/tests/gen-insn-x86-dat.sh | 2 +- tools/perf/arch/x86/util/intel-bts.c | 4 +- tools/perf/arch/x86/util/intel-pt.c | 25 +- tools/perf/bench/bench.h | 2 + tools/perf/bench/uprobe.c | 20 +- tools/perf/builtin-annotate.c | 126 +- tools/perf/builtin-bench.c | 2 + 
tools/perf/builtin-buildid-cache.c | 2 +- tools/perf/builtin-buildid-list.c | 18 +- tools/perf/builtin-c2c.c | 21 +- tools/perf/builtin-inject.c | 96 +- tools/perf/builtin-kallsyms.c | 2 +- tools/perf/builtin-kmem.c | 2 +- tools/perf/builtin-kwork.c | 2 +- tools/perf/builtin-list.c | 24 +- tools/perf/builtin-lock.c | 18 +- tools/perf/builtin-mem.c | 4 +- tools/perf/builtin-probe.c | 2 +- tools/perf/builtin-record.c | 6 +- tools/perf/builtin-report.c | 16 +- tools/perf/builtin-sched.c | 6 +- tools/perf/builtin-script.c | 84 +- tools/perf/builtin-stat.c | 52 +- tools/perf/builtin-top.c | 4 +- tools/perf/builtin-trace.c | 37 +- tools/perf/builtin.h | 4 +- tools/perf/check-headers.sh | 23 +- tools/perf/perf-archive.sh | 2 +- tools/perf/perf-completion.sh | 23 +- tools/perf/perf.c | 23 +- .../arch/arm64/ampere/ampereone/cache.json | 4 +- .../arch/arm64/ampere/ampereonex/cache.json | 4 +- .../arch/x86/amdzen5/branch-prediction.json | 93 + tools/perf/pmu-events/arch/x86/amdzen5/decode.json | 115 + .../pmu-events/arch/x86/amdzen5/execution.json | 174 ++ .../arch/x86/amdzen5/floating-point.json | 812 +++++++ .../pmu-events/arch/x86/amdzen5/inst-cache.json | 72 + .../perf/pmu-events/arch/x86/amdzen5/l2-cache.json | 266 +++ .../perf/pmu-events/arch/x86/amdzen5/l3-cache.json | 177 ++ .../pmu-events/arch/x86/amdzen5/load-store.json | 451 ++++ .../arch/x86/amdzen5/memory-controller.json | 101 + .../perf/pmu-events/arch/x86/amdzen5/pipeline.json | 99 + .../pmu-events/arch/x86/amdzen5/recommended.json | 345 +++ .../arch/x86/broadwellx/bdx-metrics.json | 35 +- .../arch/x86/cascadelakex/clx-metrics.json | 85 +- .../pmu-events/arch/x86/cascadelakex/frontend.json | 10 +- .../pmu-events/arch/x86/cascadelakex/memory.json | 2 +- .../pmu-events/arch/x86/cascadelakex/other.json | 2 +- .../pmu-events/arch/x86/cascadelakex/pipeline.json | 2 +- .../arch/x86/cascadelakex/uncore-interconnect.json | 14 +- .../arch/x86/cascadelakex/virtual-memory.json | 2 +- 
.../arch/x86/emeraldrapids/frontend.json | 2 +- .../pmu-events/arch/x86/emeraldrapids/memory.json | 1 + .../arch/x86/emeraldrapids/pipeline.json | 3 + .../arch/x86/emeraldrapids/uncore-cache.json | 112 +- .../x86/emeraldrapids/uncore-interconnect.json | 26 +- .../pmu-events/arch/x86/grandridge/pipeline.json | 43 +- .../arch/x86/grandridge/uncore-cache.json | 28 +- .../pmu-events/arch/x86/haswellx/hsx-metrics.json | 35 +- .../pmu-events/arch/x86/icelakex/frontend.json | 2 +- .../pmu-events/arch/x86/icelakex/icx-metrics.json | 95 +- .../perf/pmu-events/arch/x86/icelakex/memory.json | 1 + .../pmu-events/arch/x86/icelakex/uncore-cache.json | 22 +- .../arch/x86/icelakex/uncore-interconnect.json | 64 +- .../pmu-events/arch/x86/icelakex/uncore-io.json | 11 - .../perf/pmu-events/arch/x86/lunarlake/cache.json | 24 +- .../pmu-events/arch/x86/lunarlake/frontend.json | 2 +- .../perf/pmu-events/arch/x86/lunarlake/memory.json | 4 +- .../perf/pmu-events/arch/x86/lunarlake/other.json | 4 +- .../pmu-events/arch/x86/lunarlake/pipeline.json | 109 +- tools/perf/pmu-events/arch/x86/mapfile.csv | 21 +- .../perf/pmu-events/arch/x86/meteorlake/cache.json | 30 + .../pmu-events/arch/x86/meteorlake/frontend.json | 4 +- .../pmu-events/arch/x86/meteorlake/memory.json | 20 + .../perf/pmu-events/arch/x86/meteorlake/other.json | 42 +- .../pmu-events/arch/x86/meteorlake/pipeline.json | 44 +- .../arch/x86/meteorlake/uncore-interconnect.json | 22 +- .../pmu-events/arch/x86/sapphirerapids/cache.json | 1 + .../arch/x86/sapphirerapids/frontend.json | 2 +- .../pmu-events/arch/x86/sapphirerapids/memory.json | 1 + .../arch/x86/sapphirerapids/pipeline.json | 19 +- .../arch/x86/sapphirerapids/spr-metrics.json | 119 +- .../arch/x86/sapphirerapids/uncore-cache.json | 112 +- .../x86/sapphirerapids/uncore-interconnect.json | 26 +- .../pmu-events/arch/x86/sierraforest/pipeline.json | 36 +- .../perf/pmu-events/arch/x86/skylake/frontend.json | 10 +- tools/perf/pmu-events/arch/x86/skylakex/cache.json | 9 + 
.../pmu-events/arch/x86/skylakex/frontend.json | 10 +- .../perf/pmu-events/arch/x86/skylakex/memory.json | 2 +- tools/perf/pmu-events/arch/x86/skylakex/other.json | 2 +- .../pmu-events/arch/x86/skylakex/pipeline.json | 2 +- .../pmu-events/arch/x86/skylakex/skx-metrics.json | 85 +- .../arch/x86/skylakex/uncore-interconnect.json | 14 +- .../pmu-events/arch/x86/skylakex/uncore-io.json | 2 +- .../arch/x86/skylakex/virtual-memory.json | 2 +- .../arch/x86/snowridgex/uncore-cache.json | 4 +- .../arch/x86/snowridgex/uncore-interconnect.json | 6 +- .../pmu-events/arch/x86/snowridgex/uncore-io.json | 11 - tools/perf/scripts/python/parallel-perf.py | 988 ++++++++ tools/perf/tests/bitmap.c | 13 +- tools/perf/tests/builtin-test.c | 25 +- tools/perf/tests/code-reading.c | 8 +- tools/perf/tests/config-fragments/config | 3 + tools/perf/tests/dso-data.c | 67 +- tools/perf/tests/evsel-roundtrip-name.c | 4 +- tools/perf/tests/hists_common.c | 6 +- tools/perf/tests/hists_cumulate.c | 4 +- tools/perf/tests/hists_output.c | 2 +- tools/perf/tests/maps.c | 4 +- tools/perf/tests/mem.c | 11 +- tools/perf/tests/parse-events.c | 58 +- tools/perf/tests/pmu-events.c | 4 +- tools/perf/tests/pmu.c | 467 ++-- tools/perf/tests/shell/annotate.sh | 83 + .../tests/shell/base_probe/test_adding_kernel.sh | 1 + tools/perf/tests/shell/lib/stat_output.sh | 2 +- tools/perf/tests/shell/script.sh | 26 +- tools/perf/tests/shell/stat+json_output.sh | 2 +- tools/perf/tests/shell/stat_bpf_counters.sh | 75 +- tools/perf/tests/shell/test_arm_callgraph_fp.sh | 31 +- tools/perf/tests/symbols.c | 8 +- tools/perf/tests/topology.c | 46 +- tools/perf/tests/vmlinux-kallsyms.c | 6 +- tools/perf/tests/workloads/leafloop.c | 20 +- tools/perf/trace/beauty/Build | 15 + .../beauty/arch/x86/include/asm/irq_vectors.h | 146 ++ .../trace/beauty/arch/x86/include/uapi/asm/prctl.h | 43 + tools/perf/trace/beauty/arch_errno_names.sh | 8 +- tools/perf/trace/beauty/beauty.h | 7 +- tools/perf/trace/beauty/clone.c | 46 +- 
tools/perf/trace/beauty/clone.sh | 17 + tools/perf/trace/beauty/fcntl.c | 2 +- tools/perf/trace/beauty/flock.c | 2 +- tools/perf/trace/beauty/fs_at_flags.c | 58 + tools/perf/trace/beauty/fs_at_flags.sh | 21 + tools/perf/trace/beauty/fsconfig.sh | 6 +- tools/perf/trace/beauty/fsmount.c | 9 +- tools/perf/trace/beauty/fsmount.sh | 6 +- tools/perf/trace/beauty/fspick.sh | 6 +- tools/perf/trace/beauty/include/linux/socket.h | 3 +- tools/perf/trace/beauty/include/uapi/linux/fcntl.h | 125 ++ tools/perf/trace/beauty/include/uapi/linux/fs.h | 396 ++++ tools/perf/trace/beauty/include/uapi/linux/mount.h | 211 ++ tools/perf/trace/beauty/include/uapi/linux/prctl.h | 331 +++ tools/perf/trace/beauty/include/uapi/linux/sched.h | 148 ++ tools/perf/trace/beauty/include/uapi/linux/stat.h | 197 ++ .../trace/beauty/include/uapi/linux/usbdevice_fs.h | 231 ++ tools/perf/trace/beauty/include/uapi/linux/vhost.h | 238 ++ .../perf/trace/beauty/include/uapi/sound/asound.h | 1252 +++++++++++ tools/perf/trace/beauty/mount_flags.sh | 6 +- tools/perf/trace/beauty/move_mount_flags.sh | 6 +- tools/perf/trace/beauty/prctl.c | 2 +- tools/perf/trace/beauty/prctl_option.sh | 6 +- tools/perf/trace/beauty/rename_flags.sh | 2 +- tools/perf/trace/beauty/sndrv_ctl_ioctl.sh | 4 +- tools/perf/trace/beauty/sndrv_pcm_ioctl.sh | 4 +- tools/perf/trace/beauty/statx.c | 67 +- tools/perf/trace/beauty/statx_mask.sh | 23 + tools/perf/trace/beauty/sync_file_range.c | 11 +- tools/perf/trace/beauty/sync_file_range.sh | 2 +- .../trace/beauty/tracepoints/x86_irq_vectors.sh | 6 +- tools/perf/trace/beauty/usbdevfs_ioctl.sh | 6 +- tools/perf/trace/beauty/vhost_virtio_ioctl.sh | 6 +- tools/perf/trace/beauty/x86_arch_prctl.sh | 4 +- tools/perf/ui/browsers/Build | 1 + tools/perf/ui/browsers/annotate-data.c | 313 +++ tools/perf/ui/browsers/annotate.c | 21 +- tools/perf/ui/browsers/hists.c | 39 +- tools/perf/ui/browsers/map.c | 4 +- tools/perf/ui/gtk/annotate.c | 5 +- tools/perf/ui/hist.c | 92 +- tools/perf/util/Build | 16 + 
tools/perf/util/annotate-data.c | 1648 +++++++++++++- tools/perf/util/annotate-data.h | 74 +- tools/perf/util/annotate.c | 2359 +++++-------------- tools/perf/util/annotate.h | 129 +- tools/perf/util/auxtrace.c | 15 +- tools/perf/util/auxtrace.h | 1 + tools/perf/util/block-info.c | 24 +- tools/perf/util/block-info.h | 15 +- tools/perf/util/bpf-event.c | 8 +- tools/perf/util/bpf_counter_cgroup.c | 5 +- tools/perf/util/bpf_kwork.c | 16 +- tools/perf/util/bpf_kwork_top.c | 12 +- .../util/bpf_skel/augmented_raw_syscalls.bpf.c | 21 + tools/perf/util/bpf_skel/bench_uprobe.bpf.c | 16 + tools/perf/util/build-id.c | 136 +- tools/perf/util/build-id.h | 2 - tools/perf/util/callchain.c | 4 +- tools/perf/util/cgroup.c | 4 +- tools/perf/util/comm.c | 218 +- tools/perf/util/cpumap.c | 14 +- tools/perf/util/cpumap.h | 2 - tools/perf/util/cs-etm.c | 15 +- tools/perf/util/data-convert-json.c | 2 +- tools/perf/util/db-export.c | 6 +- tools/perf/util/debug.c | 3 + tools/perf/util/debug.h | 1 + tools/perf/util/disasm.c | 1837 +++++++++++++++ tools/perf/util/disasm.h | 112 + tools/perf/util/dlfilter.c | 12 +- tools/perf/util/dso.c | 484 ++-- tools/perf/util/dso.h | 579 ++++- tools/perf/util/dsos.c | 545 +++-- tools/perf/util/dsos.h | 40 +- tools/perf/util/dump-insn.h | 1 + tools/perf/util/dwarf-aux.c | 240 +- tools/perf/util/dwarf-aux.h | 24 +- tools/perf/util/event.c | 8 +- tools/perf/util/evlist.c | 3 +- tools/perf/util/evsel.c | 20 +- tools/perf/util/evsel.h | 4 +- tools/perf/util/genelf.h | 3 + tools/perf/util/header.c | 8 +- tools/perf/util/help-unknown-cmd.c | 51 +- tools/perf/util/hist.c | 78 +- tools/perf/util/hist.h | 217 +- tools/perf/util/intel-pt.c | 22 +- tools/perf/util/machine.c | 225 +- tools/perf/util/machine.h | 4 +- tools/perf/util/map.c | 91 +- tools/perf/util/map.h | 3 + tools/perf/util/maps.c | 53 +- tools/perf/util/mem-events.c | 36 +- tools/perf/util/mem-events.h | 29 +- tools/perf/util/mem-info.c | 35 + tools/perf/util/mem-info.h | 54 + 
tools/perf/util/metricgroup.c | 10 +- tools/perf/util/metricgroup.h | 1 + tools/perf/util/parse-events.c | 493 ++-- tools/perf/util/parse-events.h | 58 +- tools/perf/util/parse-events.l | 124 +- tools/perf/util/parse-events.y | 201 +- tools/perf/util/pmu.c | 172 +- tools/perf/util/pmu.h | 9 +- tools/perf/util/pmus.c | 115 +- tools/perf/util/pmus.h | 3 + tools/perf/util/print-events.c | 55 +- tools/perf/util/print_insn.c | 75 +- tools/perf/util/print_insn.h | 8 +- tools/perf/util/probe-event.c | 31 +- tools/perf/util/probe-finder.c | 4 +- tools/perf/util/record.c | 2 +- .../perf/util/scripting-engines/trace-event-perl.c | 6 +- .../util/scripting-engines/trace-event-python.c | 45 +- tools/perf/util/session.c | 26 +- tools/perf/util/session.h | 2 + tools/perf/util/sort.c | 118 +- tools/perf/util/sort.h | 190 +- tools/perf/util/srcline.c | 77 +- tools/perf/util/stat-display.c | 3 + tools/perf/util/stat-shadow.c | 7 + tools/perf/util/stat.c | 2 +- tools/perf/util/stat.h | 1 + tools/perf/util/svghelper.c | 20 +- tools/perf/util/symbol-elf.c | 145 +- tools/perf/util/symbol-minimal.c | 4 +- tools/perf/util/symbol.c | 228 +- tools/perf/util/symbol.h | 12 - tools/perf/util/symbol_fprintf.c | 4 +- tools/perf/util/synthetic-events.c | 24 +- tools/perf/util/thread.c | 4 +- tools/perf/util/tracepoint.c | 56 +- tools/perf/util/tracepoint.h | 3 +- tools/perf/util/unwind-libdw.c | 12 +- tools/perf/util/unwind-libunwind-local.c | 36 +- tools/perf/util/unwind-libunwind.c | 2 +- tools/perf/util/values.h | 1 + tools/perf/util/vdso.c | 56 +- tools/power/acpi/tools/pfrut/pfrut.c | 2 + tools/power/x86/intel-speed-select/isst-config.c | 25 +- .../power/x86/intel-speed-select/isst-core-mbox.c | 3 +- .../power/x86/intel-speed-select/isst-core-tpmi.c | 10 +- tools/power/x86/intel-speed-select/isst-core.c | 1 + tools/power/x86/intel-speed-select/isst-display.c | 30 +- tools/power/x86/intel-speed-select/isst.h | 2 +- tools/power/x86/turbostat/Makefile | 31 +- 
tools/power/x86/turbostat/turbostat.8 | 4 +- tools/power/x86/turbostat/turbostat.c | 1138 +++++++--- tools/sound/dapm-graph | 303 +++ tools/testing/cxl/test/mem.c | 19 +- tools/testing/kunit/qemu_configs/riscv.py | 2 +- tools/testing/nvdimm/test/ndtest.c | 7 +- tools/testing/nvdimm/test/ndtest.h | 31 - tools/testing/selftests/Makefile | 13 +- tools/testing/selftests/alsa/conf.c | 2 +- tools/testing/selftests/arm64/abi/tpidr2.c | 2 +- tools/testing/selftests/bpf/.gitignore | 1 - tools/testing/selftests/bpf/DENYLIST.aarch64 | 2 - tools/testing/selftests/bpf/DENYLIST.s390x | 1 + tools/testing/selftests/bpf/Makefile | 65 +- tools/testing/selftests/bpf/bench.c | 39 +- .../selftests/bpf/benchs/bench_bpf_crypto.c | 185 ++ .../bpf/benchs/bench_local_storage_create.c | 2 +- tools/testing/selftests/bpf/benchs/bench_trigger.c | 433 ++-- .../selftests/bpf/benchs/run_bench_trigger.sh | 22 +- .../selftests/bpf/benchs/run_bench_uprobes.sh | 2 +- tools/testing/selftests/bpf/bpf_arena_list.h | 4 +- tools/testing/selftests/bpf/bpf_experimental.h | 71 +- tools/testing/selftests/bpf/bpf_kfuncs.h | 3 + tools/testing/selftests/bpf/bpf_tcp_helpers.h | 241 -- .../selftests/bpf/bpf_testmod/bpf_testmod.c | 260 +++ .../selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h | 28 + tools/testing/selftests/bpf/cgroup_helpers.c | 2 +- tools/testing/selftests/bpf/config | 7 + tools/testing/selftests/bpf/network_helpers.c | 241 +- tools/testing/selftests/bpf/network_helpers.h | 17 +- .../selftests/bpf/prog_tests/arena_atomics.c | 186 ++ .../testing/selftests/bpf/prog_tests/bpf_cookie.c | 114 +- .../testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 149 +- tools/testing/selftests/bpf/prog_tests/btf_dump.c | 2 +- .../selftests/bpf/prog_tests/cls_redirect.c | 38 +- .../selftests/bpf/prog_tests/crypto_sanity.c | 197 ++ tools/testing/selftests/bpf/prog_tests/empty_skb.c | 2 + .../testing/selftests/bpf/prog_tests/fexit_sleep.c | 8 +- .../testing/selftests/bpf/prog_tests/fib_lookup.c | 132 +- 
.../selftests/bpf/prog_tests/flow_dissector.c | 1 - tools/testing/selftests/bpf/prog_tests/for_each.c | 62 + .../selftests/bpf/prog_tests/ip_check_defrag.c | 2 + .../selftests/bpf/prog_tests/kprobe_multi_test.c | 322 ++- tools/testing/selftests/bpf/prog_tests/ksyms.c | 30 +- .../selftests/bpf/prog_tests/module_attach.c | 6 + tools/testing/selftests/bpf/prog_tests/mptcp.c | 18 + .../selftests/bpf/prog_tests/ns_current_pid_tgid.c | 214 +- tools/testing/selftests/bpf/prog_tests/perf_skip.c | 137 ++ .../selftests/bpf/prog_tests/preempt_lock.c | 9 + tools/testing/selftests/bpf/prog_tests/ringbuf.c | 121 + .../testing/selftests/bpf/prog_tests/send_signal.c | 2 +- tools/testing/selftests/bpf/prog_tests/sk_assign.c | 55 +- tools/testing/selftests/bpf/prog_tests/sk_lookup.c | 2 +- tools/testing/selftests/bpf/prog_tests/sock_addr.c | 2361 ++++++++++++++++++-- .../selftests/bpf/prog_tests/sockmap_basic.c | 171 +- .../selftests/bpf/prog_tests/sockmap_listen.c | 38 + tools/testing/selftests/bpf/prog_tests/sockopt.c | 65 +- .../selftests/bpf/prog_tests/sockopt_inherit.c | 64 +- .../bpf/prog_tests/stacktrace_build_id_nmi.c | 2 +- tools/testing/selftests/bpf/prog_tests/tc_netkit.c | 94 + .../testing/selftests/bpf/prog_tests/tc_redirect.c | 2 +- tools/testing/selftests/bpf/prog_tests/tcp_rtt.c | 14 + .../bpf/prog_tests/test_struct_ops_module.c | 159 +- .../testing/selftests/bpf/prog_tests/test_tunnel.c | 4 + .../selftests/bpf/prog_tests/timer_lockup.c | 91 + .../selftests/bpf/prog_tests/trace_printk.c | 36 +- .../selftests/bpf/prog_tests/trace_vprintk.c | 36 +- .../selftests/bpf/prog_tests/uprobe_multi_test.c | 132 +- tools/testing/selftests/bpf/prog_tests/verifier.c | 6 + .../bpf/prog_tests/verifier_kfunc_prog_types.c | 11 + tools/testing/selftests/bpf/prog_tests/wq.c | 40 + .../selftests/bpf/prog_tests/xdp_adjust_tail.c | 2 +- .../selftests/bpf/prog_tests/xdp_metadata.c | 16 + tools/testing/selftests/bpf/progs/arena_atomics.c | 178 ++ 
tools/testing/selftests/bpf/progs/arena_list.c | 2 +- tools/testing/selftests/bpf/progs/bind4_prog.c | 24 +- tools/testing/selftests/bpf/progs/bind6_prog.c | 24 +- tools/testing/selftests/bpf/progs/bind_prog.h | 19 + tools/testing/selftests/bpf/progs/bpf_cc_cubic.c | 189 ++ tools/testing/selftests/bpf/progs/bpf_cubic.c | 74 +- tools/testing/selftests/bpf/progs/bpf_dctcp.c | 62 +- .../selftests/bpf/progs/bpf_dctcp_release.c | 10 +- tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c | 8 +- .../testing/selftests/bpf/progs/bpf_tracing_net.h | 52 + .../bpf/progs/btf_dump_test_case_multidim.c | 4 +- .../bpf/progs/btf_dump_test_case_syntax.c | 4 +- .../selftests/bpf/progs/cgrp_kfunc_common.h | 2 +- tools/testing/selftests/bpf/progs/connect4_prog.c | 12 +- tools/testing/selftests/bpf/progs/connect6_prog.c | 6 + .../selftests/bpf/progs/connect_unix_prog.c | 6 + tools/testing/selftests/bpf/progs/cpumask_common.h | 2 +- .../testing/selftests/bpf/progs/cpumask_failure.c | 3 - tools/testing/selftests/bpf/progs/crypto_basic.c | 68 + tools/testing/selftests/bpf/progs/crypto_bench.c | 109 + tools/testing/selftests/bpf/progs/crypto_common.h | 66 + tools/testing/selftests/bpf/progs/crypto_sanity.c | 169 ++ tools/testing/selftests/bpf/progs/dynptr_fail.c | 12 +- tools/testing/selftests/bpf/progs/fib_lookup.c | 2 +- .../selftests/bpf/progs/for_each_multi_maps.c | 49 + .../selftests/bpf/progs/getpeername4_prog.c | 24 + .../selftests/bpf/progs/getpeername6_prog.c | 31 + .../selftests/bpf/progs/getsockname4_prog.c | 24 + .../selftests/bpf/progs/getsockname6_prog.c | 31 + tools/testing/selftests/bpf/progs/iters.c | 2 +- .../selftests/bpf/progs/jeq_infer_not_null_fail.c | 4 + .../selftests/bpf/progs/kprobe_multi_session.c | 79 + .../bpf/progs/kprobe_multi_session_cookie.c | 58 + tools/testing/selftests/bpf/progs/map_kptr.c | 10 + tools/testing/selftests/bpf/progs/mptcp_sock.c | 4 +- tools/testing/selftests/bpf/progs/mptcpify.c | 4 + tools/testing/selftests/bpf/progs/preempt_lock.c | 132 ++ 
.../testing/selftests/bpf/progs/sample_map_ret0.c | 34 - tools/testing/selftests/bpf/progs/sample_ret0.c | 7 - tools/testing/selftests/bpf/progs/sendmsg4_prog.c | 6 + tools/testing/selftests/bpf/progs/sendmsg6_prog.c | 57 + .../selftests/bpf/progs/sendmsg_unix_prog.c | 6 + tools/testing/selftests/bpf/progs/skb_pkt_end.c | 2 + tools/testing/selftests/bpf/progs/sock_addr_kern.c | 65 + .../selftests/bpf/progs/sockopt_qos_to_cc.c | 16 +- .../selftests/bpf/progs/struct_ops_forgotten_cb.c | 19 + .../selftests/bpf/progs/struct_ops_module.c | 36 +- .../selftests/bpf/progs/struct_ops_nulled_out_cb.c | 22 + .../selftests/bpf/progs/task_kfunc_common.h | 2 +- .../selftests/bpf/progs/tcp_ca_incompl_cong_ops.c | 12 +- tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c | 121 + .../selftests/bpf/progs/tcp_ca_unsupp_cong_op.c | 2 +- tools/testing/selftests/bpf/progs/tcp_ca_update.c | 18 +- .../selftests/bpf/progs/tcp_ca_write_sk_pacing.c | 20 +- tools/testing/selftests/bpf/progs/tcp_rtt.c | 6 + .../bpf/progs/test_access_variable_array.c | 2 +- .../testing/selftests/bpf/progs/test_bpf_cookie.c | 16 + .../selftests/bpf/progs/test_btf_skc_cls_ingress.c | 16 +- .../selftests/bpf/progs/test_global_func10.c | 4 + .../selftests/bpf/progs/test_lwt_redirect.c | 2 +- .../selftests/bpf/progs/test_module_attach.c | 23 + .../selftests/bpf/progs/test_ns_current_pid_tgid.c | 31 +- tools/testing/selftests/bpf/progs/test_perf_skip.c | 15 + tools/testing/selftests/bpf/progs/test_ringbuf_n.c | 47 + .../selftests/bpf/progs/test_ringbuf_write.c | 46 + .../selftests/bpf/progs/test_sk_storage_tracing.c | 2 +- .../selftests/bpf/progs/test_skmsg_load_helpers.c | 27 +- .../testing/selftests/bpf/progs/test_sock_fields.c | 5 +- .../selftests/bpf/progs/test_sockmap_pass_prog.c | 17 +- .../bpf/progs/test_sockmap_skb_verdict_attach.c | 2 +- tools/testing/selftests/bpf/progs/test_tc_link.c | 35 +- .../testing/selftests/bpf/progs/test_tcpbpf_kern.c | 13 +- .../testing/selftests/bpf/progs/test_tunnel_kern.c | 47 +- 
.../selftests/bpf/progs/test_xdp_noinline.c | 27 +- tools/testing/selftests/bpf/progs/test_xdp_vlan.c | 2 +- tools/testing/selftests/bpf/progs/timer.c | 3 +- tools/testing/selftests/bpf/progs/timer_failure.c | 2 +- tools/testing/selftests/bpf/progs/timer_lockup.c | 87 + tools/testing/selftests/bpf/progs/timer_mim.c | 2 +- .../testing/selftests/bpf/progs/timer_mim_reject.c | 2 +- tools/testing/selftests/bpf/progs/trigger_bench.c | 107 +- tools/testing/selftests/bpf/progs/uprobe_multi.c | 50 +- .../testing/selftests/bpf/progs/verifier_bounds.c | 63 + .../bpf/progs/verifier_helper_restricted.c | 8 +- .../bpf/progs/verifier_iterating_callbacks.c | 155 +- .../bpf/progs/verifier_kfunc_prog_types.c | 122 + tools/testing/selftests/bpf/progs/verifier_movsx.c | 63 + .../selftests/bpf/progs/verifier_or_jmp32_k.c | 41 + .../selftests/bpf/progs/verifier_sock_addr.c | 331 +++ .../selftests/bpf/progs/verifier_sockmap_mutate.c | 187 ++ .../bpf/progs/verifier_subprog_precision.c | 89 + tools/testing/selftests/bpf/progs/wq.c | 180 ++ tools/testing/selftests/bpf/progs/wq_failures.c | 144 ++ tools/testing/selftests/bpf/test_cpp.cpp | 5 + tools/testing/selftests/bpf/test_offload.py | 1405 ------------ tools/testing/selftests/bpf/test_sock_addr.c | 1434 ------------ tools/testing/selftests/bpf/test_sock_addr.sh | 58 - tools/testing/selftests/bpf/test_sockmap.c | 19 +- .../selftests/bpf/test_tcp_check_syncookie_user.c | 117 +- tools/testing/selftests/bpf/testing_helpers.c | 16 +- tools/testing/selftests/bpf/trace_helpers.c | 109 +- tools/testing/selftests/bpf/trace_helpers.h | 9 + tools/testing/selftests/bpf/uprobe_multi.c | 2 +- tools/testing/selftests/bpf/veristat.c | 5 +- tools/testing/selftests/bpf/xdp_hw_metadata.c | 16 +- tools/testing/selftests/bpf/xskxceiver.c | 123 +- tools/testing/selftests/bpf/xskxceiver.h | 12 +- tools/testing/selftests/capabilities/test_execve.c | 12 +- .../testing/selftests/capabilities/validate_cap.c | 7 +- tools/testing/selftests/cgroup/Makefile | 2 +- 
tools/testing/selftests/cgroup/cgroup_util.h | 2 +- tools/testing/selftests/cgroup/test_cpu.c | 4 +- .../testing/selftests/cgroup/test_cpuset_v1_hp.sh | 46 + tools/testing/selftests/cgroup/test_kmem.c | 4 +- tools/testing/selftests/cgroup/test_memcontrol.c | 4 +- tools/testing/selftests/cgroup/test_zswap.c | 136 +- tools/testing/selftests/clone3/clone3.c | 7 +- .../selftests/clone3/clone3_clear_sighand.c | 2 +- tools/testing/selftests/clone3/clone3_set_tid.c | 121 +- tools/testing/selftests/core/close_range_test.c | 55 +- tools/testing/selftests/cpufreq/cpufreq.sh | 3 +- tools/testing/selftests/cpufreq/main.sh | 47 +- tools/testing/selftests/cpufreq/module.sh | 6 +- tools/testing/selftests/damon/Makefile | 13 +- tools/testing/selftests/damon/_damon_sysfs.py | 177 +- tools/testing/selftests/damon/access_memory.c | 2 +- tools/testing/selftests/damon/damos_quota_goal.py | 77 + tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c | 247 +- tools/testing/selftests/drivers/net/Makefile | 11 + tools/testing/selftests/drivers/net/README.rst | 136 ++ tools/testing/selftests/drivers/net/config | 2 + tools/testing/selftests/drivers/net/hw/Makefile | 28 + tools/testing/selftests/drivers/net/hw/csum.py | 122 + .../selftests/drivers/net/hw/devlink_port_split.py | 309 +++ tools/testing/selftests/drivers/net/hw/ethtool.sh | 297 +++ .../drivers/net/hw/ethtool_extended_state.sh | 116 + .../selftests/drivers/net/hw/ethtool_lib.sh | 120 + .../testing/selftests/drivers/net/hw/ethtool_mm.sh | 341 +++ .../selftests/drivers/net/hw/ethtool_rmon.sh | 145 ++ .../selftests/drivers/net/hw/hw_stats_l3.sh | 334 +++ .../selftests/drivers/net/hw/hw_stats_l3_gre.sh | 111 + .../selftests/drivers/net/hw/lib/py/__init__.py | 16 + tools/testing/selftests/drivers/net/hw/loopback.sh | 103 + .../selftests/drivers/net/hw/pp_alloc_fail.py | 129 ++ tools/testing/selftests/drivers/net/hw/settings | 1 + .../selftests/drivers/net/lib/py/__init__.py | 19 + tools/testing/selftests/drivers/net/lib/py/env.py | 224 ++ 
tools/testing/selftests/drivers/net/lib/py/load.py | 41 + .../testing/selftests/drivers/net/lib/py/remote.py | 15 + .../selftests/drivers/net/lib/py/remote_netns.py | 21 + .../selftests/drivers/net/lib/py/remote_ssh.py | 39 + .../selftests/drivers/net/microchip/ksz9477_qos.sh | 668 ++++++ .../selftests/drivers/net/mlxsw/ethtool_lanes.sh | 14 +- .../drivers/net/mlxsw/spectrum-2/tc_flower.sh | 55 +- tools/testing/selftests/drivers/net/ping.py | 51 + tools/testing/selftests/drivers/net/queues.py | 66 + tools/testing/selftests/drivers/net/stats.py | 144 ++ .../selftests/drivers/net/virtio_net/Makefile | 15 + .../drivers/net/virtio_net/basic_features.sh | 131 ++ .../selftests/drivers/net/virtio_net/config | 8 + .../drivers/net/virtio_net/virtio_net_common.sh | 99 + tools/testing/selftests/exec/recursion-depth.c | 10 +- tools/testing/selftests/fchmodat2/Makefile | 11 +- .../filesystems/statmount/statmount_test.c | 13 +- tools/testing/selftests/ftrace/config | 26 +- tools/testing/selftests/ftrace/ftracetest | 8 +- tools/testing/selftests/ftrace/ftracetest-ktap | 2 +- .../ftrace/test.d/dynevent/fprobe_args_vfs.tc | 41 + .../ftrace/test.d/ftrace/func_set_ftrace_file.tc | 2 +- .../ftrace/test.d/kprobe/kprobe_args_vfs.tc | 40 + tools/testing/selftests/futex/Makefile | 2 - tools/testing/selftests/hid/config.common | 1 + tools/testing/selftests/hid/hid_bpf.c | 112 +- tools/testing/selftests/hid/progs/hid.c | 46 + .../testing/selftests/hid/progs/hid_bpf_helpers.h | 6 + tools/testing/selftests/hid/tests/base.py | 92 +- tools/testing/selftests/hid/tests/base_device.py | 421 ++++ tools/testing/selftests/hid/tests/base_gamepad.py | 238 ++ tools/testing/selftests/hid/tests/test_gamepad.py | 457 +++- tools/testing/selftests/hid/tests/test_tablet.py | 723 ++++-- tools/testing/selftests/iommu/iommufd.c | 64 +- tools/testing/selftests/iommu/iommufd_utils.h | 6 +- tools/testing/selftests/ipc/msgque.c | 11 +- tools/testing/selftests/kselftest.h | 49 +- 
tools/testing/selftests/kselftest_deps.sh | 1 + tools/testing/selftests/kvm/Makefile | 10 +- tools/testing/selftests/kvm/aarch64/arch_timer.c | 11 +- .../selftests/kvm/aarch64/page_fault_test.c | 5 +- tools/testing/selftests/kvm/aarch64/psci_test.c | 4 +- tools/testing/selftests/kvm/aarch64/set_id_regs.c | 123 +- tools/testing/selftests/kvm/aarch64/vgic_init.c | 1 - tools/testing/selftests/kvm/aarch64/vgic_irq.c | 15 +- .../selftests/kvm/aarch64/vgic_lpi_stress.c | 410 ++++ .../selftests/kvm/aarch64/vpmu_counter_access.c | 6 +- tools/testing/selftests/kvm/arch_timer.c | 4 +- tools/testing/selftests/kvm/demand_paging_test.c | 94 +- tools/testing/selftests/kvm/dirty_log_perf_test.c | 15 +- tools/testing/selftests/kvm/dirty_log_test.c | 26 +- tools/testing/selftests/kvm/guest_memfd_test.c | 4 +- tools/testing/selftests/kvm/guest_print_test.c | 1 + .../testing/selftests/kvm/hardware_disable_test.c | 3 - tools/testing/selftests/kvm/include/aarch64/gic.h | 21 +- .../testing/selftests/kvm/include/aarch64/gic_v3.h | 586 ++++- .../selftests/kvm/include/aarch64/gic_v3_its.h | 19 + .../selftests/kvm/include/aarch64/processor.h | 21 +- .../testing/selftests/kvm/include/aarch64/ucall.h | 2 +- tools/testing/selftests/kvm/include/aarch64/vgic.h | 5 +- tools/testing/selftests/kvm/include/kvm_util.h | 1111 ++++++++- .../testing/selftests/kvm/include/kvm_util_base.h | 1135 ---------- .../testing/selftests/kvm/include/kvm_util_types.h | 20 + tools/testing/selftests/kvm/include/memstress.h | 1 - .../selftests/kvm/include/riscv/processor.h | 49 +- tools/testing/selftests/kvm/include/riscv/sbi.h | 141 ++ tools/testing/selftests/kvm/include/riscv/ucall.h | 1 + tools/testing/selftests/kvm/include/s390x/ucall.h | 2 +- tools/testing/selftests/kvm/include/test_util.h | 19 + .../selftests/kvm/include/userfaultfd_util.h | 19 +- .../selftests/kvm/include/x86_64/kvm_util_arch.h | 28 + .../selftests/kvm/include/x86_64/processor.h | 12 +- tools/testing/selftests/kvm/include/x86_64/sev.h | 19 +- 
tools/testing/selftests/kvm/include/x86_64/ucall.h | 2 +- .../testing/selftests/kvm/kvm_binary_stats_test.c | 2 - tools/testing/selftests/kvm/kvm_create_max_vcpus.c | 2 - tools/testing/selftests/kvm/kvm_page_table_test.c | 4 +- tools/testing/selftests/kvm/lib/aarch64/gic.c | 18 +- .../selftests/kvm/lib/aarch64/gic_private.h | 4 +- tools/testing/selftests/kvm/lib/aarch64/gic_v3.c | 99 +- .../testing/selftests/kvm/lib/aarch64/gic_v3_its.c | 248 ++ .../testing/selftests/kvm/lib/aarch64/processor.c | 2 + tools/testing/selftests/kvm/lib/aarch64/vgic.c | 38 +- tools/testing/selftests/kvm/lib/assert.c | 3 - tools/testing/selftests/kvm/lib/kvm_util.c | 21 +- tools/testing/selftests/kvm/lib/memstress.c | 13 +- tools/testing/selftests/kvm/lib/riscv/processor.c | 13 + tools/testing/selftests/kvm/lib/riscv/ucall.c | 1 + tools/testing/selftests/kvm/lib/test_util.c | 2 - tools/testing/selftests/kvm/lib/ucall_common.c | 5 +- tools/testing/selftests/kvm/lib/userfaultfd_util.c | 156 +- tools/testing/selftests/kvm/lib/x86_64/processor.c | 331 ++- tools/testing/selftests/kvm/lib/x86_64/sev.c | 45 +- .../testing/selftests/kvm/max_guest_memory_test.c | 2 - .../kvm/memslot_modification_stress_test.c | 3 - tools/testing/selftests/kvm/riscv/arch_timer.c | 6 +- tools/testing/selftests/kvm/riscv/ebreak_test.c | 83 + tools/testing/selftests/kvm/riscv/get-reg-list.c | 4 + tools/testing/selftests/kvm/riscv/sbi_pmu_test.c | 682 ++++++ tools/testing/selftests/kvm/rseq_test.c | 48 +- tools/testing/selftests/kvm/s390x/cmma_test.c | 3 +- tools/testing/selftests/kvm/s390x/memop.c | 1 + .../selftests/kvm/s390x/shared_zeropage_test.c | 111 + tools/testing/selftests/kvm/s390x/sync_regs_test.c | 2 - tools/testing/selftests/kvm/s390x/tprot.c | 1 + .../testing/selftests/kvm/set_memory_region_test.c | 21 +- tools/testing/selftests/kvm/steal_time.c | 53 +- tools/testing/selftests/kvm/x86_64/amx_test.c | 4 - .../kvm/x86_64/dirty_log_page_splitting_test.c | 1 + .../kvm/x86_64/exit_on_emulation_failure_test.c 
| 5 +- .../selftests/kvm/x86_64/fix_hypercall_test.c | 2 - tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c | 2 - tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c | 2 - tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c | 3 - .../testing/selftests/kvm/x86_64/hyperv_features.c | 6 - tools/testing/selftests/kvm/x86_64/hyperv_ipi.c | 5 - .../testing/selftests/kvm/x86_64/hyperv_svm_test.c | 1 - .../selftests/kvm/x86_64/hyperv_tlb_flush.c | 2 - tools/testing/selftests/kvm/x86_64/kvm_pv_test.c | 3 - .../selftests/kvm/x86_64/monitor_mwait_test.c | 4 +- .../selftests/kvm/x86_64/nested_exceptions_test.c | 2 - .../selftests/kvm/x86_64/nx_huge_pages_test.c | 3 - .../selftests/kvm/x86_64/nx_huge_pages_test.sh | 13 +- .../selftests/kvm/x86_64/platform_info_test.c | 61 +- .../selftests/kvm/x86_64/pmu_counters_test.c | 8 - .../selftests/kvm/x86_64/pmu_event_filter_test.c | 9 - .../kvm/x86_64/private_mem_conversions_test.c | 1 - .../testing/selftests/kvm/x86_64/set_boot_cpu_id.c | 1 - .../testing/selftests/kvm/x86_64/set_sregs_test.c | 1 - .../testing/selftests/kvm/x86_64/sev_init2_tests.c | 152 ++ .../testing/selftests/kvm/x86_64/sev_smoke_test.c | 96 +- .../kvm/x86_64/smaller_maxphyaddr_emulation_test.c | 6 - tools/testing/selftests/kvm/x86_64/smm_test.c | 1 - tools/testing/selftests/kvm/x86_64/state_test.c | 1 - .../selftests/kvm/x86_64/svm_int_ctl_test.c | 3 - .../kvm/x86_64/svm_nested_shutdown_test.c | 5 +- .../kvm/x86_64/svm_nested_soft_inject_test.c | 5 +- .../testing/selftests/kvm/x86_64/sync_regs_test.c | 2 - .../selftests/kvm/x86_64/ucna_injection_test.c | 7 - .../selftests/kvm/x86_64/userspace_msr_exit_test.c | 15 +- .../selftests/kvm/x86_64/vmx_dirty_log_test.c | 3 - .../vmx_exception_with_invalid_guest_state.c | 3 - .../selftests/kvm/x86_64/vmx_pmu_caps_test.c | 4 - .../kvm/x86_64/vmx_preemption_timer_test.c | 1 - .../testing/selftests/kvm/x86_64/xapic_ipi_test.c | 4 - .../selftests/kvm/x86_64/xapic_state_test.c | 1 - 
.../testing/selftests/kvm/x86_64/xcr0_cpuid_test.c | 3 - .../testing/selftests/kvm/x86_64/xen_shinfo_test.c | 59 +- tools/testing/selftests/kvm/x86_64/xss_msr_test.c | 2 - tools/testing/selftests/landlock/base_test.c | 76 +- tools/testing/selftests/landlock/config | 1 + tools/testing/selftests/landlock/fs_test.c | 532 ++++- tools/testing/selftests/lib.mk | 26 +- .../membarrier/membarrier_test_multi_thread.c | 2 +- .../membarrier/membarrier_test_single_thread.c | 2 +- tools/testing/selftests/memfd/fuse_test.c | 2 +- tools/testing/selftests/memfd/memfd_test.c | 2 +- tools/testing/selftests/mm/.gitignore | 2 + tools/testing/selftests/mm/Makefile | 4 +- tools/testing/selftests/mm/cow.c | 106 +- tools/testing/selftests/mm/gup_longterm.c | 16 +- tools/testing/selftests/mm/hugetlb_madv_vs_map.c | 16 +- tools/testing/selftests/mm/ksm_functional_tests.c | 173 +- tools/testing/selftests/mm/map_fixed_noreplace.c | 24 +- tools/testing/selftests/mm/memfd_secret.c | 51 +- tools/testing/selftests/mm/mlock2-tests.c | 15 +- tools/testing/selftests/mm/mremap_test.c | 204 +- tools/testing/selftests/mm/mseal_test.c | 1894 ++++++++++++++++ tools/testing/selftests/mm/run_vmtests.sh | 13 +- tools/testing/selftests/mm/seal_elf.c | 179 ++ tools/testing/selftests/mm/soft-dirty.c | 2 +- tools/testing/selftests/mm/virtual_address_range.c | 78 +- tools/testing/selftests/net/.gitignore | 4 +- tools/testing/selftests/net/Makefile | 60 +- tools/testing/selftests/net/af_unix/Makefile | 2 +- tools/testing/selftests/net/af_unix/config | 3 + tools/testing/selftests/net/af_unix/msg_oob.c | 734 ++++++ tools/testing/selftests/net/af_unix/scm_rights.c | 307 +++ .../testing/selftests/net/af_unix/test_unix_oob.c | 436 ---- tools/testing/selftests/net/bpf.mk | 53 + tools/testing/selftests/net/bpf_offload.py | 1341 +++++++++++ tools/testing/selftests/net/cmsg_sender.c | 32 +- tools/testing/selftests/net/cmsg_time.sh | 7 +- tools/testing/selftests/net/config | 2 + tools/testing/selftests/net/csum.c | 988 
-------- tools/testing/selftests/net/devlink_port_split.py | 309 --- tools/testing/selftests/net/epoll_busy_poll.c | 320 +++ tools/testing/selftests/net/fib_rule_tests.sh | 46 +- tools/testing/selftests/net/fib_tests.sh | 24 +- tools/testing/selftests/net/forwarding/Makefile | 9 +- tools/testing/selftests/net/forwarding/README | 33 + .../net/forwarding/bridge_fdb_learning_limit.sh | 18 + .../selftests/net/forwarding/devlink_lib.sh | 2 + tools/testing/selftests/net/forwarding/ethtool.sh | 301 --- .../net/forwarding/ethtool_extended_state.sh | 117 - .../selftests/net/forwarding/ethtool_lib.sh | 120 - .../testing/selftests/net/forwarding/ethtool_mm.sh | 340 --- .../selftests/net/forwarding/ethtool_rmon.sh | 143 -- .../net/forwarding/forwarding.config.sample | 53 +- .../selftests/net/forwarding/hw_stats_l3.sh | 340 --- .../selftests/net/forwarding/hw_stats_l3_gre.sh | 111 - tools/testing/selftests/net/forwarding/ipip_lib.sh | 1 - tools/testing/selftests/net/forwarding/lib.sh | 253 ++- .../selftests/net/forwarding/lib_sh_test.sh | 208 ++ .../selftests/net/forwarding/local_termination.sh | 30 +- tools/testing/selftests/net/forwarding/loopback.sh | 102 - .../selftests/net/forwarding/router_mpath_nh.sh | 35 + .../net/forwarding/router_mpath_nh_lib.sh | 12 +- .../net/forwarding/router_mpath_nh_res.sh | 35 + .../testing/selftests/net/forwarding/router_nh.sh | 14 + .../selftests/net/forwarding/sch_ets_tests.sh | 19 +- tools/testing/selftests/net/forwarding/sch_red.sh | 10 +- .../selftests/net/forwarding/sch_tbf_core.sh | 2 +- .../testing/selftests/net/forwarding/tc_common.sh | 2 +- .../selftests/net/forwarding/tc_tunnel_key.sh | 2 - tools/testing/selftests/net/gro.c | 138 ++ tools/testing/selftests/net/hsr/Makefile | 3 +- tools/testing/selftests/net/hsr/config | 1 + tools/testing/selftests/net/hsr/hsr_common.sh | 84 + tools/testing/selftests/net/hsr/hsr_ping.sh | 106 +- tools/testing/selftests/net/hsr/hsr_redbox.sh | 121 + tools/testing/selftests/net/lib.sh | 9 +- 
tools/testing/selftests/net/lib/.gitignore | 2 + tools/testing/selftests/net/lib/Makefile | 15 + tools/testing/selftests/net/lib/csum.c | 1000 +++++++++ tools/testing/selftests/net/lib/py/__init__.py | 8 + tools/testing/selftests/net/lib/py/consts.py | 9 + tools/testing/selftests/net/lib/py/ksft.py | 159 ++ tools/testing/selftests/net/lib/py/netns.py | 31 + tools/testing/selftests/net/lib/py/nsim.py | 134 ++ tools/testing/selftests/net/lib/py/utils.py | 102 + tools/testing/selftests/net/lib/py/ynl.py | 49 + tools/testing/selftests/net/mptcp/diag.sh | 53 + tools/testing/selftests/net/mptcp/mptcp_connect.sh | 2 +- tools/testing/selftests/net/mptcp/mptcp_join.sh | 149 +- tools/testing/selftests/net/mptcp/mptcp_lib.sh | 165 +- tools/testing/selftests/net/mptcp/mptcp_sockopt.sh | 34 +- tools/testing/selftests/net/mptcp/pm_netlink.sh | 295 ++- tools/testing/selftests/net/mptcp/simult_flows.sh | 16 +- tools/testing/selftests/net/nat6to4.bpf.c | 285 +++ tools/testing/selftests/net/nat6to4.c | 285 --- tools/testing/selftests/net/netfilter/.gitignore | 6 + tools/testing/selftests/net/netfilter/Makefile | 52 + .../selftests/net/netfilter/audit_logread.c | 165 ++ .../selftests/net/netfilter/br_netfilter.sh | 171 ++ .../selftests/net/netfilter/bridge_brouter.sh | 122 + tools/testing/selftests/net/netfilter/config | 89 + .../selftests/net/netfilter/connect_close.c | 136 ++ .../selftests/net/netfilter/conntrack_dump_flush.c | 469 ++++ .../net/netfilter/conntrack_icmp_related.sh | 278 +++ .../selftests/net/netfilter/conntrack_ipip_mtu.sh | 191 ++ .../net/netfilter/conntrack_sctp_collision.sh | 87 + .../net/netfilter/conntrack_tcp_unreplied.sh | 164 ++ .../selftests/net/netfilter/conntrack_vrf.sh | 220 ++ tools/testing/selftests/net/netfilter/ipvs.sh | 211 ++ tools/testing/selftests/net/netfilter/lib.sh | 10 + .../net/netfilter/nf_conntrack_packetdrill.sh | 71 + .../selftests/net/netfilter/nf_nat_edemux.sh | 97 + tools/testing/selftests/net/netfilter/nf_queue.c | 395 ++++ 
tools/testing/selftests/net/netfilter/nft_audit.sh | 268 +++ .../selftests/net/netfilter/nft_concat_range.sh | 1622 ++++++++++++++ .../net/netfilter/nft_concat_range_perf.sh | 9 + .../net/netfilter/nft_conntrack_helper.sh | 171 ++ tools/testing/selftests/net/netfilter/nft_fib.sh | 234 ++ .../selftests/net/netfilter/nft_flowtable.sh | 671 ++++++ tools/testing/selftests/net/netfilter/nft_meta.sh | 142 ++ tools/testing/selftests/net/netfilter/nft_nat.sh | 1156 ++++++++++ .../selftests/net/netfilter/nft_nat_zones.sh | 267 +++ tools/testing/selftests/net/netfilter/nft_queue.sh | 417 ++++ .../selftests/net/netfilter/nft_synproxy.sh | 96 + .../selftests/net/netfilter/nft_zones_many.sh | 164 ++ .../selftests/net/netfilter/packetdrill/common.sh | 33 + .../packetdrill/conntrack_ack_loss_stall.pkt | 118 + .../packetdrill/conntrack_inexact_rst.pkt | 62 + .../packetdrill/conntrack_rst_invalid.pkt | 59 + .../packetdrill/conntrack_syn_challenge_ack.pkt | 44 + .../netfilter/packetdrill/conntrack_synack_old.pkt | 51 + .../packetdrill/conntrack_synack_reuse.pkt | 34 + tools/testing/selftests/net/netfilter/rpath.sh | 175 ++ .../selftests/net/netfilter/sctp_collision.c | 99 + tools/testing/selftests/net/netfilter/settings | 1 + tools/testing/selftests/net/netfilter/xt_string.sh | 133 ++ tools/testing/selftests/net/nl_netdev.py | 98 + .../testing/selftests/net/openvswitch/ovs-dpctl.py | 16 +- tools/testing/selftests/net/sample_map_ret0.bpf.c | 34 + tools/testing/selftests/net/sample_ret0.bpf.c | 10 + .../selftests/net/srv6_end_dx4_netfilter_test.sh | 335 +++ .../selftests/net/srv6_end_dx6_netfilter_test.sh | 340 +++ tools/testing/selftests/net/udpgro.sh | 2 +- tools/testing/selftests/net/udpgro_bench.sh | 2 +- tools/testing/selftests/net/udpgro_frglist.sh | 8 +- tools/testing/selftests/net/udpgro_fwd.sh | 2 +- tools/testing/selftests/net/veth.sh | 2 +- tools/testing/selftests/net/xdp_dummy.bpf.c | 13 + tools/testing/selftests/net/xdp_dummy.c | 13 - 
tools/testing/selftests/net/xfrm_policy.sh | 4 +- tools/testing/selftests/netfilter/.gitignore | 6 - tools/testing/selftests/netfilter/Makefile | 21 - tools/testing/selftests/netfilter/audit_logread.c | 165 -- .../testing/selftests/netfilter/bridge_brouter.sh | 146 -- .../selftests/netfilter/bridge_netfilter.sh | 188 -- tools/testing/selftests/netfilter/config | 9 - tools/testing/selftests/netfilter/connect_close.c | 136 -- .../selftests/netfilter/conntrack_dump_flush.c | 471 ---- .../selftests/netfilter/conntrack_icmp_related.sh | 315 --- .../netfilter/conntrack_sctp_collision.sh | 89 - .../selftests/netfilter/conntrack_tcp_unreplied.sh | 167 -- tools/testing/selftests/netfilter/conntrack_vrf.sh | 241 -- .../selftests/netfilter/ipip-conntrack-mtu.sh | 207 -- tools/testing/selftests/netfilter/ipvs.sh | 228 -- tools/testing/selftests/netfilter/nf-queue.c | 395 ---- tools/testing/selftests/netfilter/nf_nat_edemux.sh | 127 -- tools/testing/selftests/netfilter/nft_audit.sh | 245 -- .../selftests/netfilter/nft_concat_range.sh | 1645 -------------- .../selftests/netfilter/nft_conntrack_helper.sh | 197 -- tools/testing/selftests/netfilter/nft_fib.sh | 273 --- tools/testing/selftests/netfilter/nft_flowtable.sh | 672 ------ tools/testing/selftests/netfilter/nft_meta.sh | 142 -- tools/testing/selftests/netfilter/nft_nat.sh | 1224 ---------- tools/testing/selftests/netfilter/nft_nat_zones.sh | 309 --- tools/testing/selftests/netfilter/nft_queue.sh | 449 ---- tools/testing/selftests/netfilter/nft_synproxy.sh | 117 - .../selftests/netfilter/nft_trans_stress.sh | 151 -- .../testing/selftests/netfilter/nft_zones_many.sh | 163 -- tools/testing/selftests/netfilter/rpath.sh | 169 -- tools/testing/selftests/netfilter/sctp_collision.c | 99 - tools/testing/selftests/netfilter/settings | 1 - tools/testing/selftests/netfilter/xt_string.sh | 128 -- tools/testing/selftests/nolibc/nolibc-test.c | 82 + tools/testing/selftests/openat2/Makefile | 14 +- 
tools/testing/selftests/perf_events/.gitignore | 1 + tools/testing/selftests/perf_events/Makefile | 2 +- .../selftests/perf_events/watermark_signal.c | 146 ++ tools/testing/selftests/pidfd/pidfd_fdinfo_test.c | 2 +- tools/testing/selftests/pidfd/pidfd_open_test.c | 4 +- tools/testing/selftests/pidfd/pidfd_poll_test.c | 2 +- tools/testing/selftests/pidfd/pidfd_test.c | 2 +- tools/testing/selftests/powerpc/Makefile | 11 +- tools/testing/selftests/powerpc/alignment/Makefile | 1 + .../testing/selftests/powerpc/benchmarks/Makefile | 5 +- .../testing/selftests/powerpc/cache_shape/Makefile | 1 + tools/testing/selftests/powerpc/copyloops/Makefile | 21 +- tools/testing/selftests/powerpc/dexcr/.gitignore | 2 + tools/testing/selftests/powerpc/dexcr/Makefile | 7 +- tools/testing/selftests/powerpc/dexcr/chdexcr.c | 112 + tools/testing/selftests/powerpc/dexcr/dexcr.c | 40 + tools/testing/selftests/powerpc/dexcr/dexcr.h | 57 + tools/testing/selftests/powerpc/dexcr/dexcr_test.c | 215 ++ .../testing/selftests/powerpc/dexcr/hashchk_test.c | 8 +- tools/testing/selftests/powerpc/dexcr/lsdexcr.c | 103 +- tools/testing/selftests/powerpc/dscr/Makefile | 1 + tools/testing/selftests/powerpc/eeh/Makefile | 1 + tools/testing/selftests/powerpc/flags.mk | 9 + tools/testing/selftests/powerpc/math/Makefile | 1 + tools/testing/selftests/powerpc/mce/Makefile | 1 + tools/testing/selftests/powerpc/mm/Makefile | 1 + tools/testing/selftests/powerpc/nx-gzip/Makefile | 5 +- .../selftests/powerpc/papr_attributes/Makefile | 3 +- .../selftests/powerpc/papr_sysparm/Makefile | 1 + tools/testing/selftests/powerpc/papr_vpd/Makefile | 1 + tools/testing/selftests/powerpc/pmu/Makefile | 44 +- tools/testing/selftests/powerpc/pmu/ebb/Makefile | 21 +- .../powerpc/pmu/event_code_tests/Makefile | 5 +- .../selftests/powerpc/pmu/sampling_tests/Makefile | 5 +- .../testing/selftests/powerpc/primitives/Makefile | 5 +- tools/testing/selftests/powerpc/ptrace/Makefile | 1 + tools/testing/selftests/powerpc/security/Makefile | 
5 +- tools/testing/selftests/powerpc/signal/Makefile | 4 +- .../testing/selftests/powerpc/stringloops/Makefile | 11 +- .../selftests/powerpc/switch_endian/Makefile | 5 +- tools/testing/selftests/powerpc/syscalls/Makefile | 5 +- tools/testing/selftests/powerpc/tm/Makefile | 1 + tools/testing/selftests/powerpc/vphn/Makefile | 5 +- tools/testing/selftests/rcutorture/bin/torture.sh | 6 +- .../selftests/rcutorture/configs/rcu/TREE09 | 5 +- tools/testing/selftests/resctrl/cat_test.c | 8 +- tools/testing/selftests/resctrl/cmt_test.c | 8 +- tools/testing/selftests/resctrl/mba_test.c | 10 +- tools/testing/selftests/resctrl/mbm_test.c | 10 +- tools/testing/selftests/resctrl/resctrl.h | 9 +- tools/testing/selftests/resctrl/resctrl_tests.c | 26 +- tools/testing/selftests/resctrl/resctrl_val.c | 62 +- tools/testing/selftests/ring-buffer/.gitignore | 1 + tools/testing/selftests/ring-buffer/Makefile | 8 + tools/testing/selftests/ring-buffer/config | 2 + tools/testing/selftests/ring-buffer/map_test.c | 294 +++ tools/testing/selftests/riscv/Makefile | 2 +- tools/testing/selftests/riscv/sigreturn/.gitignore | 1 + tools/testing/selftests/riscv/sigreturn/Makefile | 12 + .../testing/selftests/riscv/sigreturn/sigreturn.c | 82 + .../testing/selftests/seccomp/seccomp_benchmark.c | 6 +- .../selftests/sigaltstack/current_stack_pointer.h | 2 +- tools/testing/selftests/sync/sync_test.c | 3 +- tools/testing/selftests/timers/adjtick.c | 4 +- .../testing/selftests/timers/alarmtimer-suspend.c | 4 +- tools/testing/selftests/timers/change_skew.c | 4 +- tools/testing/selftests/timers/freq-step.c | 4 +- tools/testing/selftests/timers/leap-a-day.c | 10 +- tools/testing/selftests/timers/leapcrash.c | 4 +- tools/testing/selftests/timers/mqueue-lat.c | 4 +- tools/testing/selftests/timers/posix_timers.c | 12 +- tools/testing/selftests/timers/raw_skew.c | 6 +- tools/testing/selftests/timers/set-2038.c | 4 +- tools/testing/selftests/timers/set-tai.c | 4 +- tools/testing/selftests/timers/set-timer-lat.c | 4 
+- tools/testing/selftests/timers/set-tz.c | 4 +- tools/testing/selftests/timers/skew_consistency.c | 4 +- tools/testing/selftests/timers/threadtest.c | 2 +- tools/testing/selftests/timers/valid-adjtimex.c | 6 +- tools/testing/selftests/tty/tty_tstamp_update.c | 48 +- tools/testing/selftests/user_events/ftrace_test.c | 8 + tools/testing/selftests/vDSO/Makefile | 29 +- .../selftests/wireguard/qemu/arch/riscv32.config | 2 +- .../selftests/wireguard/qemu/arch/riscv64.config | 2 +- .../testing/selftests/wireguard/qemu/kernel.config | 1 - tools/testing/selftests/x86/amx.c | 27 +- tools/testing/selftests/x86/lam.c | 4 +- tools/testing/selftests/x86/test_mremap_vdso.c | 43 +- tools/testing/selftests/x86/test_shadow_stack.c | 67 +- tools/testing/selftests/x86/test_vsyscall.c | 506 ++--- tools/tracing/rtla/Makefile.config | 2 +- tools/tracing/rtla/src/osnoise_hist.c | 55 +- tools/tracing/rtla/src/osnoise_top.c | 55 +- tools/tracing/rtla/src/timerlat_hist.c | 234 +- tools/tracing/rtla/src/timerlat_top.c | 233 +- tools/tracing/rtla/src/trace.c | 15 + tools/tracing/rtla/src/trace.h | 1 + tools/workqueue/wq_monitor.py | 9 +- tools/writeback/wb_monitor.py | 172 ++ 1059 files changed, 65577 insertions(+), 32259 deletions(-) create mode 100644 tools/arch/x86/dell-uart-backlight-emulator/.gitignore create mode 100644 tools/arch/x86/dell-uart-backlight-emulator/Makefile create mode 100644 tools/arch/x86/dell-uart-backlight-emulator/README create mode 100644 tools/arch/x86/dell-uart-backlight-emulator/dell-uart-backlight-emulator.c delete mode 100644 tools/arch/x86/include/asm/irq_vectors.h delete mode 100644 tools/arch/x86/include/uapi/asm/prctl.h delete mode 100644 tools/hv/hv_fcopy_daemon.c create mode 100644 tools/hv/hv_fcopy_uio_daemon.c create mode 100644 tools/hv/vmbus_bufring.c create mode 100644 tools/hv/vmbus_bufring.h create mode 100644 tools/include/linux/align.h delete mode 100644 tools/include/uapi/asm-generic/fcntl.h create mode 100644 tools/include/uapi/linux/bits.h 
delete mode 100644 tools/include/uapi/linux/ethtool.h delete mode 100644 tools/include/uapi/linux/fcntl.h delete mode 100644 tools/include/uapi/linux/fs.h create mode 100644 tools/include/uapi/linux/memfd.h delete mode 100644 tools/include/uapi/linux/mount.h delete mode 100644 tools/include/uapi/linux/openat2.h delete mode 100644 tools/include/uapi/linux/prctl.h delete mode 100644 tools/include/uapi/linux/sched.h delete mode 100644 tools/include/uapi/linux/usbdevice_fs.h create mode 100644 tools/include/uapi/linux/userfaultfd.h delete mode 100644 tools/include/uapi/linux/vhost.h delete mode 100644 tools/include/uapi/sound/asound.h create mode 100644 tools/perf/pmu-events/arch/x86/amdzen5/branch-prediction.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen5/decode.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen5/execution.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen5/floating-point.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen5/inst-cache.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen5/l2-cache.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen5/l3-cache.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen5/load-store.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen5/memory-controller.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen5/pipeline.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen5/recommended.json create mode 100755 tools/perf/scripts/python/parallel-perf.py create mode 100755 tools/perf/tests/shell/annotate.sh create mode 100644 tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h create mode 100644 tools/perf/trace/beauty/arch/x86/include/uapi/asm/prctl.h create mode 100755 tools/perf/trace/beauty/clone.sh create mode 100644 tools/perf/trace/beauty/fs_at_flags.c create mode 100755 tools/perf/trace/beauty/fs_at_flags.sh create mode 100644 tools/perf/trace/beauty/include/uapi/linux/fcntl.h create mode 100644 
tools/perf/trace/beauty/include/uapi/linux/fs.h create mode 100644 tools/perf/trace/beauty/include/uapi/linux/mount.h create mode 100644 tools/perf/trace/beauty/include/uapi/linux/prctl.h create mode 100644 tools/perf/trace/beauty/include/uapi/linux/sched.h create mode 100644 tools/perf/trace/beauty/include/uapi/linux/stat.h create mode 100644 tools/perf/trace/beauty/include/uapi/linux/usbdevice_fs.h create mode 100644 tools/perf/trace/beauty/include/uapi/linux/vhost.h create mode 100644 tools/perf/trace/beauty/include/uapi/sound/asound.h create mode 100755 tools/perf/trace/beauty/statx_mask.sh create mode 100644 tools/perf/ui/browsers/annotate-data.c create mode 100644 tools/perf/util/disasm.c create mode 100644 tools/perf/util/disasm.h create mode 100644 tools/perf/util/mem-info.c create mode 100644 tools/perf/util/mem-info.h create mode 100755 tools/sound/dapm-graph create mode 100644 tools/testing/selftests/bpf/benchs/bench_bpf_crypto.c delete mode 100644 tools/testing/selftests/bpf/bpf_tcp_helpers.h create mode 100644 tools/testing/selftests/bpf/prog_tests/arena_atomics.c create mode 100644 tools/testing/selftests/bpf/prog_tests/crypto_sanity.c create mode 100644 tools/testing/selftests/bpf/prog_tests/perf_skip.c create mode 100644 tools/testing/selftests/bpf/prog_tests/preempt_lock.c create mode 100644 tools/testing/selftests/bpf/prog_tests/timer_lockup.c create mode 100644 tools/testing/selftests/bpf/prog_tests/verifier_kfunc_prog_types.c create mode 100644 tools/testing/selftests/bpf/prog_tests/wq.c create mode 100644 tools/testing/selftests/bpf/progs/arena_atomics.c create mode 100644 tools/testing/selftests/bpf/progs/bind_prog.h create mode 100644 tools/testing/selftests/bpf/progs/bpf_cc_cubic.c create mode 100644 tools/testing/selftests/bpf/progs/crypto_basic.c create mode 100644 tools/testing/selftests/bpf/progs/crypto_bench.c create mode 100644 tools/testing/selftests/bpf/progs/crypto_common.h create mode 100644 
tools/testing/selftests/bpf/progs/crypto_sanity.c create mode 100644 tools/testing/selftests/bpf/progs/for_each_multi_maps.c create mode 100644 tools/testing/selftests/bpf/progs/getpeername4_prog.c create mode 100644 tools/testing/selftests/bpf/progs/getpeername6_prog.c create mode 100644 tools/testing/selftests/bpf/progs/getsockname4_prog.c create mode 100644 tools/testing/selftests/bpf/progs/getsockname6_prog.c create mode 100644 tools/testing/selftests/bpf/progs/kprobe_multi_session.c create mode 100644 tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c create mode 100644 tools/testing/selftests/bpf/progs/preempt_lock.c delete mode 100644 tools/testing/selftests/bpf/progs/sample_map_ret0.c delete mode 100644 tools/testing/selftests/bpf/progs/sample_ret0.c create mode 100644 tools/testing/selftests/bpf/progs/sock_addr_kern.c create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_forgotten_cb.c create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_nulled_out_cb.c create mode 100644 tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c create mode 100644 tools/testing/selftests/bpf/progs/test_perf_skip.c create mode 100644 tools/testing/selftests/bpf/progs/test_ringbuf_n.c create mode 100644 tools/testing/selftests/bpf/progs/test_ringbuf_write.c create mode 100644 tools/testing/selftests/bpf/progs/timer_lockup.c create mode 100644 tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c create mode 100644 tools/testing/selftests/bpf/progs/verifier_or_jmp32_k.c create mode 100644 tools/testing/selftests/bpf/progs/verifier_sock_addr.c create mode 100644 tools/testing/selftests/bpf/progs/verifier_sockmap_mutate.c create mode 100644 tools/testing/selftests/bpf/progs/wq.c create mode 100644 tools/testing/selftests/bpf/progs/wq_failures.c delete mode 100755 tools/testing/selftests/bpf/test_offload.py delete mode 100644 tools/testing/selftests/bpf/test_sock_addr.c delete mode 100755 tools/testing/selftests/bpf/test_sock_addr.sh create mode 
100755 tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh create mode 100644 tools/testing/selftests/damon/damos_quota_goal.py create mode 100644 tools/testing/selftests/drivers/net/Makefile create mode 100644 tools/testing/selftests/drivers/net/README.rst create mode 100644 tools/testing/selftests/drivers/net/config create mode 100644 tools/testing/selftests/drivers/net/hw/Makefile create mode 100755 tools/testing/selftests/drivers/net/hw/csum.py create mode 100755 tools/testing/selftests/drivers/net/hw/devlink_port_split.py create mode 100755 tools/testing/selftests/drivers/net/hw/ethtool.sh create mode 100755 tools/testing/selftests/drivers/net/hw/ethtool_extended_state.sh create mode 100644 tools/testing/selftests/drivers/net/hw/ethtool_lib.sh create mode 100755 tools/testing/selftests/drivers/net/hw/ethtool_mm.sh create mode 100755 tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh create mode 100755 tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh create mode 100755 tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh create mode 100644 tools/testing/selftests/drivers/net/hw/lib/py/__init__.py create mode 100755 tools/testing/selftests/drivers/net/hw/loopback.sh create mode 100755 tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py create mode 100644 tools/testing/selftests/drivers/net/hw/settings create mode 100644 tools/testing/selftests/drivers/net/lib/py/__init__.py create mode 100644 tools/testing/selftests/drivers/net/lib/py/env.py create mode 100644 tools/testing/selftests/drivers/net/lib/py/load.py create mode 100644 tools/testing/selftests/drivers/net/lib/py/remote.py create mode 100644 tools/testing/selftests/drivers/net/lib/py/remote_netns.py create mode 100644 tools/testing/selftests/drivers/net/lib/py/remote_ssh.py create mode 100755 tools/testing/selftests/drivers/net/microchip/ksz9477_qos.sh create mode 100755 tools/testing/selftests/drivers/net/ping.py create mode 100755 tools/testing/selftests/drivers/net/queues.py create 
mode 100755 tools/testing/selftests/drivers/net/stats.py create mode 100644 tools/testing/selftests/drivers/net/virtio_net/Makefile create mode 100755 tools/testing/selftests/drivers/net/virtio_net/basic_features.sh create mode 100644 tools/testing/selftests/drivers/net/virtio_net/config create mode 100644 tools/testing/selftests/drivers/net/virtio_net/virtio_net_common.sh create mode 100644 tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc create mode 100644 tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc create mode 100644 tools/testing/selftests/hid/tests/base_device.py create mode 100644 tools/testing/selftests/hid/tests/base_gamepad.py create mode 100644 tools/testing/selftests/kvm/aarch64/vgic_lpi_stress.c create mode 100644 tools/testing/selftests/kvm/include/aarch64/gic_v3_its.h delete mode 100644 tools/testing/selftests/kvm/include/kvm_util_base.h create mode 100644 tools/testing/selftests/kvm/include/kvm_util_types.h create mode 100644 tools/testing/selftests/kvm/include/riscv/sbi.h create mode 100644 tools/testing/selftests/kvm/lib/aarch64/gic_v3_its.c create mode 100644 tools/testing/selftests/kvm/riscv/ebreak_test.c create mode 100644 tools/testing/selftests/kvm/riscv/sbi_pmu_test.c create mode 100644 tools/testing/selftests/kvm/s390x/shared_zeropage_test.c create mode 100644 tools/testing/selftests/kvm/x86_64/sev_init2_tests.c create mode 100644 tools/testing/selftests/mm/mseal_test.c create mode 100644 tools/testing/selftests/mm/seal_elf.c create mode 100644 tools/testing/selftests/net/af_unix/config create mode 100644 tools/testing/selftests/net/af_unix/msg_oob.c create mode 100644 tools/testing/selftests/net/af_unix/scm_rights.c delete mode 100644 tools/testing/selftests/net/af_unix/test_unix_oob.c create mode 100644 tools/testing/selftests/net/bpf.mk create mode 100755 tools/testing/selftests/net/bpf_offload.py delete mode 100644 tools/testing/selftests/net/csum.c delete mode 100755 
tools/testing/selftests/net/devlink_port_split.py create mode 100644 tools/testing/selftests/net/epoll_busy_poll.c delete mode 100755 tools/testing/selftests/net/forwarding/ethtool.sh delete mode 100755 tools/testing/selftests/net/forwarding/ethtool_extended_state.sh delete mode 100644 tools/testing/selftests/net/forwarding/ethtool_lib.sh delete mode 100755 tools/testing/selftests/net/forwarding/ethtool_mm.sh delete mode 100755 tools/testing/selftests/net/forwarding/ethtool_rmon.sh delete mode 100755 tools/testing/selftests/net/forwarding/hw_stats_l3.sh delete mode 100755 tools/testing/selftests/net/forwarding/hw_stats_l3_gre.sh create mode 100755 tools/testing/selftests/net/forwarding/lib_sh_test.sh delete mode 100755 tools/testing/selftests/net/forwarding/loopback.sh create mode 100644 tools/testing/selftests/net/hsr/hsr_common.sh create mode 100755 tools/testing/selftests/net/hsr/hsr_redbox.sh create mode 100644 tools/testing/selftests/net/lib/.gitignore create mode 100644 tools/testing/selftests/net/lib/Makefile create mode 100644 tools/testing/selftests/net/lib/csum.c create mode 100644 tools/testing/selftests/net/lib/py/__init__.py create mode 100644 tools/testing/selftests/net/lib/py/consts.py create mode 100644 tools/testing/selftests/net/lib/py/ksft.py create mode 100644 tools/testing/selftests/net/lib/py/netns.py create mode 100644 tools/testing/selftests/net/lib/py/nsim.py create mode 100644 tools/testing/selftests/net/lib/py/utils.py create mode 100644 tools/testing/selftests/net/lib/py/ynl.py create mode 100644 tools/testing/selftests/net/nat6to4.bpf.c delete mode 100644 tools/testing/selftests/net/nat6to4.c create mode 100644 tools/testing/selftests/net/netfilter/.gitignore create mode 100644 tools/testing/selftests/net/netfilter/Makefile create mode 100644 tools/testing/selftests/net/netfilter/audit_logread.c create mode 100755 tools/testing/selftests/net/netfilter/br_netfilter.sh create mode 100755 
tools/testing/selftests/net/netfilter/bridge_brouter.sh create mode 100644 tools/testing/selftests/net/netfilter/config create mode 100644 tools/testing/selftests/net/netfilter/connect_close.c create mode 100644 tools/testing/selftests/net/netfilter/conntrack_dump_flush.c create mode 100755 tools/testing/selftests/net/netfilter/conntrack_icmp_related.sh create mode 100755 tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh create mode 100755 tools/testing/selftests/net/netfilter/conntrack_sctp_collision.sh create mode 100755 tools/testing/selftests/net/netfilter/conntrack_tcp_unreplied.sh create mode 100755 tools/testing/selftests/net/netfilter/conntrack_vrf.sh create mode 100755 tools/testing/selftests/net/netfilter/ipvs.sh create mode 100644 tools/testing/selftests/net/netfilter/lib.sh create mode 100755 tools/testing/selftests/net/netfilter/nf_conntrack_packetdrill.sh create mode 100755 tools/testing/selftests/net/netfilter/nf_nat_edemux.sh create mode 100644 tools/testing/selftests/net/netfilter/nf_queue.c create mode 100755 tools/testing/selftests/net/netfilter/nft_audit.sh create mode 100755 tools/testing/selftests/net/netfilter/nft_concat_range.sh create mode 100755 tools/testing/selftests/net/netfilter/nft_concat_range_perf.sh create mode 100755 tools/testing/selftests/net/netfilter/nft_conntrack_helper.sh create mode 100755 tools/testing/selftests/net/netfilter/nft_fib.sh create mode 100755 tools/testing/selftests/net/netfilter/nft_flowtable.sh create mode 100755 tools/testing/selftests/net/netfilter/nft_meta.sh create mode 100755 tools/testing/selftests/net/netfilter/nft_nat.sh create mode 100755 tools/testing/selftests/net/netfilter/nft_nat_zones.sh create mode 100755 tools/testing/selftests/net/netfilter/nft_queue.sh create mode 100755 tools/testing/selftests/net/netfilter/nft_synproxy.sh create mode 100755 tools/testing/selftests/net/netfilter/nft_zones_many.sh create mode 100755 tools/testing/selftests/net/netfilter/packetdrill/common.sh 
create mode 100644 tools/testing/selftests/net/netfilter/packetdrill/conntrack_ack_loss_stall.pkt create mode 100644 tools/testing/selftests/net/netfilter/packetdrill/conntrack_inexact_rst.pkt create mode 100644 tools/testing/selftests/net/netfilter/packetdrill/conntrack_rst_invalid.pkt create mode 100644 tools/testing/selftests/net/netfilter/packetdrill/conntrack_syn_challenge_ack.pkt create mode 100644 tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_old.pkt create mode 100644 tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt create mode 100755 tools/testing/selftests/net/netfilter/rpath.sh create mode 100644 tools/testing/selftests/net/netfilter/sctp_collision.c create mode 100644 tools/testing/selftests/net/netfilter/settings create mode 100755 tools/testing/selftests/net/netfilter/xt_string.sh create mode 100755 tools/testing/selftests/net/nl_netdev.py create mode 100644 tools/testing/selftests/net/sample_map_ret0.bpf.c create mode 100644 tools/testing/selftests/net/sample_ret0.bpf.c create mode 100755 tools/testing/selftests/net/srv6_end_dx4_netfilter_test.sh create mode 100755 tools/testing/selftests/net/srv6_end_dx6_netfilter_test.sh create mode 100644 tools/testing/selftests/net/xdp_dummy.bpf.c delete mode 100644 tools/testing/selftests/net/xdp_dummy.c delete mode 100644 tools/testing/selftests/netfilter/.gitignore delete mode 100644 tools/testing/selftests/netfilter/Makefile delete mode 100644 tools/testing/selftests/netfilter/audit_logread.c delete mode 100755 tools/testing/selftests/netfilter/bridge_brouter.sh delete mode 100644 tools/testing/selftests/netfilter/bridge_netfilter.sh delete mode 100644 tools/testing/selftests/netfilter/config delete mode 100644 tools/testing/selftests/netfilter/connect_close.c delete mode 100644 tools/testing/selftests/netfilter/conntrack_dump_flush.c delete mode 100755 tools/testing/selftests/netfilter/conntrack_icmp_related.sh delete mode 100755 
tools/testing/selftests/netfilter/conntrack_sctp_collision.sh delete mode 100755 tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh delete mode 100755 tools/testing/selftests/netfilter/conntrack_vrf.sh delete mode 100755 tools/testing/selftests/netfilter/ipip-conntrack-mtu.sh delete mode 100755 tools/testing/selftests/netfilter/ipvs.sh delete mode 100644 tools/testing/selftests/netfilter/nf-queue.c delete mode 100755 tools/testing/selftests/netfilter/nf_nat_edemux.sh delete mode 100755 tools/testing/selftests/netfilter/nft_audit.sh delete mode 100755 tools/testing/selftests/netfilter/nft_concat_range.sh delete mode 100755 tools/testing/selftests/netfilter/nft_conntrack_helper.sh delete mode 100755 tools/testing/selftests/netfilter/nft_fib.sh delete mode 100755 tools/testing/selftests/netfilter/nft_flowtable.sh delete mode 100755 tools/testing/selftests/netfilter/nft_meta.sh delete mode 100755 tools/testing/selftests/netfilter/nft_nat.sh delete mode 100755 tools/testing/selftests/netfilter/nft_nat_zones.sh delete mode 100755 tools/testing/selftests/netfilter/nft_queue.sh delete mode 100755 tools/testing/selftests/netfilter/nft_synproxy.sh delete mode 100755 tools/testing/selftests/netfilter/nft_trans_stress.sh delete mode 100755 tools/testing/selftests/netfilter/nft_zones_many.sh delete mode 100755 tools/testing/selftests/netfilter/rpath.sh delete mode 100644 tools/testing/selftests/netfilter/sctp_collision.c delete mode 100644 tools/testing/selftests/netfilter/settings delete mode 100755 tools/testing/selftests/netfilter/xt_string.sh create mode 100644 tools/testing/selftests/perf_events/watermark_signal.c create mode 100644 tools/testing/selftests/powerpc/dexcr/chdexcr.c create mode 100644 tools/testing/selftests/powerpc/dexcr/dexcr_test.c create mode 100644 tools/testing/selftests/powerpc/flags.mk create mode 100644 tools/testing/selftests/ring-buffer/.gitignore create mode 100644 tools/testing/selftests/ring-buffer/Makefile create mode 100644 
tools/testing/selftests/ring-buffer/config create mode 100644 tools/testing/selftests/ring-buffer/map_test.c create mode 100644 tools/testing/selftests/riscv/sigreturn/.gitignore create mode 100644 tools/testing/selftests/riscv/sigreturn/Makefile create mode 100644 tools/testing/selftests/riscv/sigreturn/sigreturn.c create mode 100644 tools/writeback/wb_monitor.py (limited to 'tools') diff --git a/tools/arch/arm64/include/asm/cputype.h b/tools/arch/arm64/include/asm/cputype.h index 52f076afeb..7b32b99023 100644 --- a/tools/arch/arm64/include/asm/cputype.h +++ b/tools/arch/arm64/include/asm/cputype.h @@ -86,6 +86,9 @@ #define ARM_CPU_PART_CORTEX_X2 0xD48 #define ARM_CPU_PART_NEOVERSE_N2 0xD49 #define ARM_CPU_PART_CORTEX_A78C 0xD4B +#define ARM_CPU_PART_NEOVERSE_V2 0xD4F +#define ARM_CPU_PART_CORTEX_X4 0xD82 +#define ARM_CPU_PART_NEOVERSE_V3 0xD84 #define APM_CPU_PART_XGENE 0x000 #define APM_CPU_VAR_POTENZA 0x00 @@ -159,6 +162,9 @@ #define MIDR_CORTEX_X2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X2) #define MIDR_NEOVERSE_N2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N2) #define MIDR_CORTEX_A78C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78C) +#define MIDR_NEOVERSE_V2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V2) +#define MIDR_CORTEX_X4 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X4) +#define MIDR_NEOVERSE_V3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V3) #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) #define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX) diff --git a/tools/arch/x86/dell-uart-backlight-emulator/.gitignore b/tools/arch/x86/dell-uart-backlight-emulator/.gitignore new file mode 100644 index 0000000000..5c8cad8d72 --- /dev/null +++ b/tools/arch/x86/dell-uart-backlight-emulator/.gitignore @@ -0,0 +1 @@ +dell-uart-backlight-emulator 
diff --git a/tools/arch/x86/dell-uart-backlight-emulator/Makefile b/tools/arch/x86/dell-uart-backlight-emulator/Makefile new file mode 100644 index 0000000000..6ea1d9fd53 --- /dev/null +++ b/tools/arch/x86/dell-uart-backlight-emulator/Makefile @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0 +# Makefile for Intel Software Defined Silicon provisioning tool + +dell-uart-backlight-emulator: dell-uart-backlight-emulator.c + +BINDIR ?= /usr/bin + +override CFLAGS += -O2 -Wall + +%: %.c + $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) + +.PHONY : clean +clean : + @rm -f dell-uart-backlight-emulator + +install : dell-uart-backlight-emulator + install -d $(DESTDIR)$(BINDIR) + install -m 755 -p dell-uart-backlight-emulator $(DESTDIR)$(BINDIR)/dell-uart-backlight-emulator diff --git a/tools/arch/x86/dell-uart-backlight-emulator/README b/tools/arch/x86/dell-uart-backlight-emulator/README new file mode 100644 index 0000000000..c0d8e52046 --- /dev/null +++ b/tools/arch/x86/dell-uart-backlight-emulator/README @@ -0,0 +1,46 @@ +Emulator for DELL0501 UART attached backlight controller +-------------------------------------------------------- + +Dell All In One (AIO) models released after 2017 use a backlight controller +board connected to an UART. + +In DSDT this uart port will be defined as: + + Name (_HID, "DELL0501") + Name (_CID, EisaId ("PNP0501") + +With the DELL0501 indicating that we are dealing with an UART with +the backlight controller board attached. + +This small emulator allows testing +the drivers/platform/x86/dell/dell-uart-backlight.c driver without access +to an actual Dell All In One. + +This requires: +1. A (desktop) PC with a 16550 UART on the motherboard and a standard DB9 + connector connected to this UART. +2. A DB9 NULL modem cable. +3. A second DB9 serial port, this can e.g. be a USB to serial converter + with a DB9 connector plugged into the same desktop PC. +4. 
A DSDT overlay for the desktop PC replacing the _HID of the 16550 UART + ACPI Device() with "DELL0501" and adding a _CID of "PNP0501", see + DSDT.patch for an example of the necessary DSDT changes. + +With everything setup and the NULL modem cable connected between +the 2 serial ports run: + +./dell-uart-backlight-emulator + +For example when using an USB to serial converter for the second port: + +./dell-uart-backlight-emulator /dev/ttyUSB0 + +And then (re)load the dell-uart-backlight driver: + +sudo rmmod dell-uart-backlight; sudo modprobe dell-uart-backlight dyndbg + +After this check "dmesg" to see if the driver correctly received +the firmware version string from the emulator. If this works there +should be a /sys/class/backlight/dell_uart_backlight/ directory now +and writes to the brightness or bl_power files should be reflected +by matching output from the emulator. diff --git a/tools/arch/x86/dell-uart-backlight-emulator/dell-uart-backlight-emulator.c b/tools/arch/x86/dell-uart-backlight-emulator/dell-uart-backlight-emulator.c new file mode 100644 index 0000000000..655b6c96d8 --- /dev/null +++ b/tools/arch/x86/dell-uart-backlight-emulator/dell-uart-backlight-emulator.c @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Dell AIO Serial Backlight board emulator for testing + * the Linux dell-uart-backlight driver. 
+ * + * Copyright (C) 2024 Hans de Goede + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int serial_fd; +int brightness = 50; + +static unsigned char dell_uart_checksum(unsigned char *buf, int len) +{ + unsigned char val = 0; + + while (len-- > 0) + val += buf[len]; + + return val ^ 0xff; +} + +/* read() will return -1 on SIGINT / SIGTERM causing the mainloop to cleanly exit */ +void signalhdlr(int signum) +{ +} + +int main(int argc, char *argv[]) +{ + struct sigaction sigact = { .sa_handler = signalhdlr }; + unsigned char buf[4], csum, response[32]; + const char *version_str = "PHI23-V321"; + struct termios tty, saved_tty; + int ret, idx, len = 0; + + if (argc != 2) { + fprintf(stderr, "Invalid or missing arguments\n"); + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + serial_fd = open(argv[1], O_RDWR | O_NOCTTY); + if (serial_fd == -1) { + fprintf(stderr, "Error opening %s: %s\n", argv[1], strerror(errno)); + return 1; + } + + ret = tcgetattr(serial_fd, &tty); + if (ret == -1) { + fprintf(stderr, "Error getting tcattr: %s\n", strerror(errno)); + goto out_close; + } + saved_tty = tty; + + cfsetspeed(&tty, 9600); + cfmakeraw(&tty); + tty.c_cflag &= ~CSTOPB; + tty.c_cflag &= ~CRTSCTS; + tty.c_cflag |= CLOCAL | CREAD; + + ret = tcsetattr(serial_fd, TCSANOW, &tty); + if (ret == -1) { + fprintf(stderr, "Error setting tcattr: %s\n", strerror(errno)); + goto out_restore; + } + + sigaction(SIGINT, &sigact, 0); + sigaction(SIGTERM, &sigact, 0); + + idx = 0; + while (read(serial_fd, &buf[idx], 1) == 1) { + if (idx == 0) { + switch (buf[0]) { + /* 3 MSB bits: cmd-len + 01010 SOF marker */ + case 0x6a: len = 3; break; + case 0x8a: len = 4; break; + default: + fprintf(stderr, "Error unexpected first byte: 0x%02x\n", buf[0]); + continue; /* Try to sync up with sender */ + } + } + + /* Process msg when len bytes have been received */ + if (idx != (len - 1)) { + idx++; + continue; + } + 
+ /* Reset idx for next command */ + idx = 0; + + csum = dell_uart_checksum(buf, len - 1); + if (buf[len - 1] != csum) { + fprintf(stderr, "Error checksum mismatch got 0x%02x expected 0x%02x\n", + buf[len - 1], csum); + continue; + } + + switch ((buf[0] << 8) | buf[1]) { + case 0x6a06: /* cmd = 0x06, get version */ + len = strlen(version_str); + strcpy((char *)&response[2], version_str); + printf("Get version, reply: %s\n", version_str); + break; + case 0x8a0b: /* cmd = 0x0b, set brightness */ + if (buf[2] > 100) { + fprintf(stderr, "Error invalid brightness param: %d\n", buf[2]); + continue; + } + + len = 0; + brightness = buf[2]; + printf("Set brightness %d\n", brightness); + break; + case 0x6a0c: /* cmd = 0x0c, get brightness */ + len = 1; + response[2] = brightness; + printf("Get brightness, reply: %d\n", brightness); + break; + case 0x8a0e: /* cmd = 0x0e, set backlight power */ + if (buf[2] != 0 && buf[2] != 1) { + fprintf(stderr, "Error invalid set power param: %d\n", buf[2]); + continue; + } + + len = 0; + printf("Set power %d\n", buf[2]); + break; + default: + fprintf(stderr, "Error unknown cmd 0x%04x\n", + (buf[0] << 8) | buf[1]); + continue; + } + + /* Respond with */ + response[0] = len + 3; /* response length in bytes */ + response[1] = buf[1]; /* ack cmd */ + csum = dell_uart_checksum(response, len + 2); + response[len + 2] = csum; + ret = write(serial_fd, response, response[0]); + if (ret != (response[0])) + fprintf(stderr, "Error writing %d bytes: %d\n", + response[0], ret); + } + + ret = 0; +out_restore: + tcsetattr(serial_fd, TCSANOW, &saved_tty); +out_close: + close(serial_fd); + return ret; +} diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index a38f8f9ba6..3c74343296 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -461,11 +461,15 @@ /* * Extended auxiliary flags: Linux defined - for features scattered in various - * CPUID levels like 
0x80000022, etc. + * CPUID levels like 0x80000022, etc and Linux defined features. * * Reuse free bits when adding new feature flags! */ #define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* AMD LBR and PMC Freeze */ +#define X86_FEATURE_CLEAR_BHB_LOOP (21*32+ 1) /* "" Clear branch history at syscall entry using SW loop */ +#define X86_FEATURE_BHI_CTRL (21*32+ 2) /* "" BHI_DIS_S HW control available */ +#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* "" BHI_DIS_S HW control enabled */ +#define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* "" Clear branch history at vmexit using SW loop */ /* * BUG word(s) @@ -515,4 +519,5 @@ #define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */ #define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */ #define X86_BUG_RFDS X86_BUG(1*32 + 2) /* CPU is vulnerable to Register File Data Sampling */ +#define X86_BUG_BHI X86_BUG(1*32 + 3) /* CPU is affected by Branch History Injection */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/tools/arch/x86/include/asm/inat.h b/tools/arch/x86/include/asm/inat.h index a610514003..253690eb3c 100644 --- a/tools/arch/x86/include/asm/inat.h +++ b/tools/arch/x86/include/asm/inat.h @@ -35,6 +35,8 @@ #define INAT_PFX_VEX2 13 /* 2-bytes VEX prefix */ #define INAT_PFX_VEX3 14 /* 3-bytes VEX prefix */ #define INAT_PFX_EVEX 15 /* EVEX prefix */ +/* x86-64 REX2 prefix */ +#define INAT_PFX_REX2 16 /* 0xD5 */ #define INAT_LSTPFX_MAX 3 #define INAT_LGCPFX_MAX 11 @@ -50,7 +52,7 @@ /* Legacy prefix */ #define INAT_PFX_OFFS 0 -#define INAT_PFX_BITS 4 +#define INAT_PFX_BITS 5 #define INAT_PFX_MAX ((1 << INAT_PFX_BITS) - 1) #define INAT_PFX_MASK (INAT_PFX_MAX << INAT_PFX_OFFS) /* Escape opcodes */ @@ -77,6 +79,9 @@ #define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5)) #define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6)) #define INAT_EVEXONLY (1 << (INAT_FLAG_OFFS + 7)) +#define INAT_NO_REX2 (1 << (INAT_FLAG_OFFS + 8)) +#define INAT_REX2_VARIANT (1 << (INAT_FLAG_OFFS + 9)) +#define INAT_EVEX_SCALABLE (1 << 
(INAT_FLAG_OFFS + 10)) /* Attribute making macros for attribute tables */ #define INAT_MAKE_PREFIX(pfx) (pfx << INAT_PFX_OFFS) #define INAT_MAKE_ESCAPE(esc) (esc << INAT_ESC_OFFS) @@ -128,6 +133,11 @@ static inline int inat_is_rex_prefix(insn_attr_t attr) return (attr & INAT_PFX_MASK) == INAT_PFX_REX; } +static inline int inat_is_rex2_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_REX2; +} + static inline int inat_last_prefix_id(insn_attr_t attr) { if ((attr & INAT_PFX_MASK) > INAT_LSTPFX_MAX) @@ -227,4 +237,9 @@ static inline int inat_must_evex(insn_attr_t attr) { return attr & INAT_EVEXONLY; } + +static inline int inat_evex_scalable(insn_attr_t attr) +{ + return attr & INAT_EVEX_SCALABLE; +} #endif diff --git a/tools/arch/x86/include/asm/insn.h b/tools/arch/x86/include/asm/insn.h index 65c0d9ce1e..0e5abd896a 100644 --- a/tools/arch/x86/include/asm/insn.h +++ b/tools/arch/x86/include/asm/insn.h @@ -112,10 +112,15 @@ struct insn { #define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3) #define X86_SIB_BASE(sib) ((sib) & 0x07) -#define X86_REX_W(rex) ((rex) & 8) -#define X86_REX_R(rex) ((rex) & 4) -#define X86_REX_X(rex) ((rex) & 2) -#define X86_REX_B(rex) ((rex) & 1) +#define X86_REX2_M(rex) ((rex) & 0x80) /* REX2 M0 */ +#define X86_REX2_R(rex) ((rex) & 0x40) /* REX2 R4 */ +#define X86_REX2_X(rex) ((rex) & 0x20) /* REX2 X4 */ +#define X86_REX2_B(rex) ((rex) & 0x10) /* REX2 B4 */ + +#define X86_REX_W(rex) ((rex) & 8) /* REX or REX2 W */ +#define X86_REX_R(rex) ((rex) & 4) /* REX or REX2 R3 */ +#define X86_REX_X(rex) ((rex) & 2) /* REX or REX2 X3 */ +#define X86_REX_B(rex) ((rex) & 1) /* REX or REX2 B3 */ /* VEX bit flags */ #define X86_VEX_W(vex) ((vex) & 0x80) /* VEX3 Byte2 */ @@ -161,6 +166,18 @@ static inline void insn_get_attribute(struct insn *insn) /* Instruction uses RIP-relative addressing */ extern int insn_rip_relative(struct insn *insn); +static inline int insn_is_rex2(struct insn *insn) +{ + if (!insn->prefixes.got) + 
insn_get_prefixes(insn); + return insn->rex_prefix.nbytes == 2; +} + +static inline insn_byte_t insn_rex2_m_bit(struct insn *insn) +{ + return X86_REX2_M(insn->rex_prefix.bytes[1]); +} + static inline int insn_is_avx(struct insn *insn) { if (!insn->prefixes.got) @@ -198,6 +215,13 @@ static inline insn_byte_t insn_vex_p_bits(struct insn *insn) return X86_VEX_P(insn->vex_prefix.bytes[2]); } +static inline insn_byte_t insn_vex_w_bit(struct insn *insn) +{ + if (insn->vex_prefix.nbytes < 3) + return 0; + return X86_VEX_W(insn->vex_prefix.bytes[2]); +} + /* Get the last prefix id from last prefix or VEX prefix */ static inline int insn_last_prefix_id(struct insn *insn) { diff --git a/tools/arch/x86/include/asm/irq_vectors.h b/tools/arch/x86/include/asm/irq_vectors.h deleted file mode 100644 index d18bfb238f..0000000000 --- a/tools/arch/x86/include/asm/irq_vectors.h +++ /dev/null @@ -1,140 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_IRQ_VECTORS_H -#define _ASM_X86_IRQ_VECTORS_H - -#include -/* - * Linux IRQ vector layout. - * - * There are 256 IDT entries (per CPU - each entry is 8 bytes) which can - * be defined by Linux. They are used as a jump table by the CPU when a - * given vector is triggered - by a CPU-external, CPU-internal or - * software-triggered event. - * - * Linux sets the kernel code address each entry jumps to early during - * bootup, and never changes them. This is the general layout of the - * IDT entries: - * - * Vectors 0 ... 31 : system traps and exceptions - hardcoded events - * Vectors 32 ... 127 : device interrupts - * Vector 128 : legacy int80 syscall interface - * Vectors 129 ... LOCAL_TIMER_VECTOR-1 - * Vectors LOCAL_TIMER_VECTOR ... 255 : special interrupts - * - * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. - * - * This file enumerates the exact layout of them: - */ - -/* This is used as an interrupt vector when programming the APIC. 
*/ -#define NMI_VECTOR 0x02 - -/* - * IDT vectors usable for external interrupt sources start at 0x20. - * (0x80 is the syscall vector, 0x30-0x3f are for ISA) - */ -#define FIRST_EXTERNAL_VECTOR 0x20 - -#define IA32_SYSCALL_VECTOR 0x80 - -/* - * Vectors 0x30-0x3f are used for ISA interrupts. - * round up to the next 16-vector boundary - */ -#define ISA_IRQ_VECTOR(irq) (((FIRST_EXTERNAL_VECTOR + 16) & ~15) + irq) - -/* - * Special IRQ vectors used by the SMP architecture, 0xf0-0xff - * - * some of the following vectors are 'rare', they are merged - * into a single vector (CALL_FUNCTION_VECTOR) to save vector space. - * TLB, reschedule and local APIC vectors are performance-critical. - */ - -#define SPURIOUS_APIC_VECTOR 0xff -/* - * Sanity check - */ -#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F) -# error SPURIOUS_APIC_VECTOR definition error -#endif - -#define ERROR_APIC_VECTOR 0xfe -#define RESCHEDULE_VECTOR 0xfd -#define CALL_FUNCTION_VECTOR 0xfc -#define CALL_FUNCTION_SINGLE_VECTOR 0xfb -#define THERMAL_APIC_VECTOR 0xfa -#define THRESHOLD_APIC_VECTOR 0xf9 -#define REBOOT_VECTOR 0xf8 - -/* - * Generic system vector for platform specific use - */ -#define X86_PLATFORM_IPI_VECTOR 0xf7 - -/* - * IRQ work vector: - */ -#define IRQ_WORK_VECTOR 0xf6 - -/* 0xf5 - unused, was UV_BAU_MESSAGE */ -#define DEFERRED_ERROR_VECTOR 0xf4 - -/* Vector on which hypervisor callbacks will be delivered */ -#define HYPERVISOR_CALLBACK_VECTOR 0xf3 - -/* Vector for KVM to deliver posted interrupt IPI */ -#define POSTED_INTR_VECTOR 0xf2 -#define POSTED_INTR_WAKEUP_VECTOR 0xf1 -#define POSTED_INTR_NESTED_VECTOR 0xf0 - -#define MANAGED_IRQ_SHUTDOWN_VECTOR 0xef - -#if IS_ENABLED(CONFIG_HYPERV) -#define HYPERV_REENLIGHTENMENT_VECTOR 0xee -#define HYPERV_STIMER0_VECTOR 0xed -#endif - -#define LOCAL_TIMER_VECTOR 0xec - -#define NR_VECTORS 256 - -#ifdef CONFIG_X86_LOCAL_APIC -#define FIRST_SYSTEM_VECTOR LOCAL_TIMER_VECTOR -#else -#define FIRST_SYSTEM_VECTOR NR_VECTORS -#endif - -#define 
NR_EXTERNAL_VECTORS (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) -#define NR_SYSTEM_VECTORS (NR_VECTORS - FIRST_SYSTEM_VECTOR) - -/* - * Size the maximum number of interrupts. - * - * If the irq_desc[] array has a sparse layout, we can size things - * generously - it scales up linearly with the maximum number of CPUs, - * and the maximum number of IO-APICs, whichever is higher. - * - * In other cases we size more conservatively, to not create too large - * static arrays. - */ - -#define NR_IRQS_LEGACY 16 - -#define CPU_VECTOR_LIMIT (64 * NR_CPUS) -#define IO_APIC_VECTOR_LIMIT (32 * MAX_IO_APICS) - -#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_PCI_MSI) -#define NR_IRQS \ - (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \ - (NR_VECTORS + CPU_VECTOR_LIMIT) : \ - (NR_VECTORS + IO_APIC_VECTOR_LIMIT)) -#elif defined(CONFIG_X86_IO_APIC) -#define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT) -#elif defined(CONFIG_PCI_MSI) -#define NR_IRQS (NR_VECTORS + CPU_VECTOR_LIMIT) -#else -#define NR_IRQS NR_IRQS_LEGACY -#endif - -#endif /* _ASM_X86_IRQ_VECTORS_H */ diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 05956bd8ba..e022e6eb76 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -61,10 +61,13 @@ #define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ #define SPEC_CTRL_RRSBA_DIS_S_SHIFT 6 /* Disable RRSBA behavior */ #define SPEC_CTRL_RRSBA_DIS_S BIT(SPEC_CTRL_RRSBA_DIS_S_SHIFT) +#define SPEC_CTRL_BHI_DIS_S_SHIFT 10 /* Disable Branch History Injection behavior */ +#define SPEC_CTRL_BHI_DIS_S BIT(SPEC_CTRL_BHI_DIS_S_SHIFT) /* A mask for bits which the kernel toggles when controlling mitigations */ #define SPEC_CTRL_MITIGATIONS_MASK (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD \ - | SPEC_CTRL_RRSBA_DIS_S) + | SPEC_CTRL_RRSBA_DIS_S \ + | SPEC_CTRL_BHI_DIS_S) #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ #define PRED_CMD_IBPB 
BIT(0) /* Indirect Branch Prediction Barrier */ @@ -163,6 +166,14 @@ * are restricted to targets in * kernel. */ +#define ARCH_CAP_BHI_NO BIT(20) /* + * CPU is not affected by Branch + * History Injection. + */ +#define ARCH_CAP_XAPIC_DISABLE BIT(21) /* + * IA32_XAPIC_DISABLE_STATUS MSR + * supported + */ #define ARCH_CAP_PBRSB_NO BIT(24) /* * Not susceptible to Post-Barrier * Return Stack Buffer Predictions. @@ -185,11 +196,6 @@ * File. */ -#define ARCH_CAP_XAPIC_DISABLE BIT(21) /* - * IA32_XAPIC_DISABLE_STATUS MSR - * supported - */ - #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* * Writeback and invalidate the diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index ef11aa4cab..9fae1b73b5 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -457,8 +457,13 @@ struct kvm_sync_regs { #define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001 -/* attributes for system fd (group 0) */ -#define KVM_X86_XCOMP_GUEST_SUPP 0 +/* vendor-independent attributes for system fd (group 0) */ +#define KVM_X86_GRP_SYSTEM 0 +# define KVM_X86_XCOMP_GUEST_SUPP 0 + +/* vendor-specific groups and attributes for system fd */ +#define KVM_X86_GRP_SEV 1 +# define KVM_X86_SEV_VMSA_FEATURES 0 struct kvm_vmx_nested_state_data { __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; @@ -689,6 +694,9 @@ enum sev_cmd_id { /* Guest Migration Extension */ KVM_SEV_SEND_CANCEL, + /* Second time is the charm; improved versions of the above ioctls. 
*/ + KVM_SEV_INIT2, + KVM_SEV_NR_MAX, }; @@ -700,6 +708,14 @@ struct kvm_sev_cmd { __u32 sev_fd; }; +struct kvm_sev_init { + __u64 vmsa_features; + __u32 flags; + __u16 ghcb_version; + __u16 pad1; + __u32 pad2[8]; +}; + struct kvm_sev_launch_start { __u32 handle; __u32 policy; @@ -856,5 +872,7 @@ struct kvm_hyperv_eventfd { #define KVM_X86_DEFAULT_VM 0 #define KVM_X86_SW_PROTECTED_VM 1 +#define KVM_X86_SEV_VM 2 +#define KVM_X86_SEV_ES_VM 3 #endif /* _ASM_X86_KVM_H */ diff --git a/tools/arch/x86/include/uapi/asm/prctl.h b/tools/arch/x86/include/uapi/asm/prctl.h deleted file mode 100644 index 384e2cc6ac..0000000000 --- a/tools/arch/x86/include/uapi/asm/prctl.h +++ /dev/null @@ -1,43 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _ASM_X86_PRCTL_H -#define _ASM_X86_PRCTL_H - -#define ARCH_SET_GS 0x1001 -#define ARCH_SET_FS 0x1002 -#define ARCH_GET_FS 0x1003 -#define ARCH_GET_GS 0x1004 - -#define ARCH_GET_CPUID 0x1011 -#define ARCH_SET_CPUID 0x1012 - -#define ARCH_GET_XCOMP_SUPP 0x1021 -#define ARCH_GET_XCOMP_PERM 0x1022 -#define ARCH_REQ_XCOMP_PERM 0x1023 -#define ARCH_GET_XCOMP_GUEST_PERM 0x1024 -#define ARCH_REQ_XCOMP_GUEST_PERM 0x1025 - -#define ARCH_XCOMP_TILECFG 17 -#define ARCH_XCOMP_TILEDATA 18 - -#define ARCH_MAP_VDSO_X32 0x2001 -#define ARCH_MAP_VDSO_32 0x2002 -#define ARCH_MAP_VDSO_64 0x2003 - -/* Don't use 0x3001-0x3004 because of old glibcs */ - -#define ARCH_GET_UNTAG_MASK 0x4001 -#define ARCH_ENABLE_TAGGED_ADDR 0x4002 -#define ARCH_GET_MAX_TAG_BITS 0x4003 -#define ARCH_FORCE_TAGGED_SVA 0x4004 - -#define ARCH_SHSTK_ENABLE 0x5001 -#define ARCH_SHSTK_DISABLE 0x5002 -#define ARCH_SHSTK_LOCK 0x5003 -#define ARCH_SHSTK_UNLOCK 0x5004 -#define ARCH_SHSTK_STATUS 0x5005 - -/* ARCH_SHSTK_ features bits */ -#define ARCH_SHSTK_SHSTK (1ULL << 0) -#define ARCH_SHSTK_WRSS (1ULL << 1) - -#endif /* _ASM_X86_PRCTL_H */ diff --git a/tools/arch/x86/intel_sdsi/intel_sdsi.c b/tools/arch/x86/intel_sdsi/intel_sdsi.c index ba2a6b6645..766a5d26f5 
100644 --- a/tools/arch/x86/intel_sdsi/intel_sdsi.c +++ b/tools/arch/x86/intel_sdsi/intel_sdsi.c @@ -43,6 +43,7 @@ #define METER_CERT_MAX_SIZE 4096 #define STATE_MAX_NUM_LICENSES 16 #define STATE_MAX_NUM_IN_BUNDLE (uint32_t)8 +#define FEAT_LEN 5 /* 4 plus NUL terminator */ #define __round_mask(x, y) ((__typeof__(x))((y) - 1)) #define round_up(x, y) ((((x) - 1) | __round_mask(x, y)) + 1) @@ -184,6 +185,7 @@ struct sdsi_dev { enum command { CMD_SOCKET_INFO, CMD_METER_CERT, + CMD_METER_CURRENT_CERT, CMD_STATE_CERT, CMD_PROV_AKC, CMD_PROV_CAP, @@ -321,25 +323,27 @@ static char *content_type(uint32_t type) } } -static void get_feature(uint32_t encoding, char *feature) +static void get_feature(uint32_t encoding, char feature[5]) { char *name = (char *)&encoding; + feature[4] = '\0'; feature[3] = name[0]; feature[2] = name[1]; feature[1] = name[2]; feature[0] = name[3]; } -static int sdsi_meter_cert_show(struct sdsi_dev *s) +static int sdsi_meter_cert_show(struct sdsi_dev *s, bool show_current) { char buf[METER_CERT_MAX_SIZE] = {0}; struct bundle_encoding_counter *bec; struct meter_certificate *mc; uint32_t count = 0; FILE *cert_ptr; + char *cert_fname; int ret, size; - char name[4]; + char name[FEAT_LEN]; ret = sdsi_update_registers(s); if (ret) @@ -347,7 +351,6 @@ static int sdsi_meter_cert_show(struct sdsi_dev *s) if (!s->regs.en_features.sdsi) { fprintf(stderr, "SDSi feature is present but not enabled.\n"); - fprintf(stderr, " Unable to read meter certificate\n"); return -1; } @@ -362,15 +365,17 @@ static int sdsi_meter_cert_show(struct sdsi_dev *s) return ret; } - cert_ptr = fopen("meter_certificate", "r"); + cert_fname = show_current ? 
"meter_current" : "meter_certificate"; + cert_ptr = fopen(cert_fname, "r"); + if (!cert_ptr) { - perror("Could not open 'meter_certificate' file"); + fprintf(stderr, "Could not open '%s' file: %s", cert_fname, strerror(errno)); return -1; } size = fread(buf, 1, sizeof(buf), cert_ptr); if (!size) { - fprintf(stderr, "Could not read 'meter_certificate' file\n"); + fprintf(stderr, "Could not read '%s' file\n", cert_fname); fclose(cert_ptr); return -1; } @@ -383,7 +388,7 @@ static int sdsi_meter_cert_show(struct sdsi_dev *s) printf("\n"); get_feature(mc->signature, name); - printf("Signature: %.4s\n", name); + printf("Signature: %s\n", name); printf("Version: %d\n", mc->version); printf("Count Unit: %dms\n", mc->counter_unit); @@ -391,7 +396,7 @@ static int sdsi_meter_cert_show(struct sdsi_dev *s) printf("Feature Bundle Length: %d\n", mc->bundle_length); get_feature(mc->mmrc_encoding, name); - printf("MMRC encoding: %.4s\n", name); + printf("MMRC encoding: %s\n", name); printf("MMRC counter: %d\n", mc->mmrc_counter); if (mc->bundle_length % METER_BUNDLE_SIZE) { @@ -409,9 +414,8 @@ static int sdsi_meter_cert_show(struct sdsi_dev *s) printf("Number of Feature Counters: %ld\n", BUNDLE_COUNT(mc->bundle_length)); while (count < BUNDLE_COUNT(mc->bundle_length)) { - char feature[5]; + char feature[FEAT_LEN]; - feature[4] = '\0'; get_feature(bec[count].encoding, feature); printf(" %s: %d\n", feature, bec[count].counter); ++count; @@ -494,7 +498,7 @@ static int sdsi_state_cert_show(struct sdsi_dev *s) sizeof(*lki) + // size of the license key info offset; // offset to this blob content struct bundle_encoding *bundle = (void *)(lbc) + sizeof(*lbc); - char feature[5]; + char feature[FEAT_LEN]; uint32_t i; printf(" Blob %d:\n", count - 1); @@ -507,8 +511,6 @@ static int sdsi_state_cert_show(struct sdsi_dev *s) printf(" Blob revision ID: %u\n", lbc->rev_id); printf(" Number of Features: %u\n", lbc->num_bundles); - feature[4] = '\0'; - for (i = 0; i < min(lbc->num_bundles, 
STATE_MAX_NUM_IN_BUNDLE); i++) { get_feature(bundle[i].encoding, feature); printf(" Feature %d: %s\n", i, feature); @@ -739,7 +741,7 @@ static void sdsi_free_dev(struct sdsi_dev *s) static void usage(char *prog) { - printf("Usage: %s [-l] [-d DEVNO [-i] [-s] [-m] [-a FILE] [-c FILE]]\n", prog); + printf("Usage: %s [-l] [-d DEVNO [-i] [-s] [-m | -C] [-a FILE] [-c FILE]\n", prog); } static void show_help(void) @@ -748,8 +750,9 @@ static void show_help(void) printf(" %-18s\t%s\n", "-l, --list", "list available On Demand devices"); printf(" %-18s\t%s\n", "-d, --devno DEVNO", "On Demand device number"); printf(" %-18s\t%s\n", "-i, --info", "show socket information"); - printf(" %-18s\t%s\n", "-s, --state", "show state certificate"); - printf(" %-18s\t%s\n", "-m, --meter", "show meter certificate"); + printf(" %-18s\t%s\n", "-s, --state", "show state certificate data"); + printf(" %-18s\t%s\n", "-m, --meter", "show meter certificate data"); + printf(" %-18s\t%s\n", "-C, --meter_current", "show live unattested meter data"); printf(" %-18s\t%s\n", "-a, --akc FILE", "provision socket with AKC FILE"); printf(" %-18s\t%s\n", "-c, --cap FILE>", "provision socket with CAP FILE"); } @@ -765,21 +768,22 @@ int main(int argc, char *argv[]) int option_index = 0; static struct option long_options[] = { - {"akc", required_argument, 0, 'a'}, - {"cap", required_argument, 0, 'c'}, - {"devno", required_argument, 0, 'd'}, - {"help", no_argument, 0, 'h'}, - {"info", no_argument, 0, 'i'}, - {"list", no_argument, 0, 'l'}, - {"meter", no_argument, 0, 'm'}, - {"state", no_argument, 0, 's'}, - {0, 0, 0, 0 } + {"akc", required_argument, 0, 'a'}, + {"cap", required_argument, 0, 'c'}, + {"devno", required_argument, 0, 'd'}, + {"help", no_argument, 0, 'h'}, + {"info", no_argument, 0, 'i'}, + {"list", no_argument, 0, 'l'}, + {"meter", no_argument, 0, 'm'}, + {"meter_current", no_argument, 0, 'C'}, + {"state", no_argument, 0, 's'}, + {0, 0, 0, 0 } }; progname = argv[0]; - while ((opt = 
getopt_long_only(argc, argv, "+a:c:d:hilms", long_options, + while ((opt = getopt_long_only(argc, argv, "+a:c:d:hilmCs", long_options, &option_index)) != -1) { switch (opt) { case 'd': @@ -795,6 +799,9 @@ int main(int argc, char *argv[]) case 'm': command = CMD_METER_CERT; break; + case 'C': + command = CMD_METER_CURRENT_CERT; + break; case 's': command = CMD_STATE_CERT; break; @@ -833,7 +840,10 @@ int main(int argc, char *argv[]) ret = sdsi_read_reg(s); break; case CMD_METER_CERT: - ret = sdsi_meter_cert_show(s); + ret = sdsi_meter_cert_show(s, false); + break; + case CMD_METER_CURRENT_CERT: + ret = sdsi_meter_cert_show(s, true); break; case CMD_STATE_CERT: ret = sdsi_state_cert_show(s); diff --git a/tools/arch/x86/lib/insn.c b/tools/arch/x86/lib/insn.c index ada4b4a79d..a43b37346a 100644 --- a/tools/arch/x86/lib/insn.c +++ b/tools/arch/x86/lib/insn.c @@ -185,6 +185,17 @@ found: if (X86_REX_W(b)) /* REX.W overrides opnd_size */ insn->opnd_bytes = 8; + } else if (inat_is_rex2_prefix(attr)) { + insn_set_byte(&insn->rex_prefix, 0, b); + b = peek_nbyte_next(insn_byte_t, insn, 1); + insn_set_byte(&insn->rex_prefix, 1, b); + insn->rex_prefix.nbytes = 2; + insn->next_byte += 2; + if (X86_REX_W(b)) + /* REX.W overrides opnd_size */ + insn->opnd_bytes = 8; + insn->rex_prefix.got = 1; + goto vex_end; } } insn->rex_prefix.got = 1; @@ -283,6 +294,10 @@ int insn_get_opcode(struct insn *insn) m = insn_vex_m_bits(insn); p = insn_vex_p_bits(insn); insn->attr = inat_get_avx_attribute(op, m, p); + /* SCALABLE EVEX uses p bits to encode operand size */ + if (inat_evex_scalable(insn->attr) && !insn_vex_w_bit(insn) && + p == INAT_PFX_OPNDSZ) + insn->opnd_bytes = 2; if ((inat_must_evex(insn->attr) && !insn_is_evex(insn)) || (!inat_accept_vex(insn->attr) && !inat_is_group(insn->attr))) { @@ -294,6 +309,20 @@ int insn_get_opcode(struct insn *insn) goto end; } + /* Check if there is REX2 prefix or not */ + if (insn_is_rex2(insn)) { + if (insn_rex2_m_bit(insn)) { + /* map 1 is escape 0x0f 
*/ + insn_attr_t esc_attr = inat_get_opcode_attribute(0x0f); + + pfx_id = insn_last_prefix_id(insn); + insn->attr = inat_get_escape_attribute(op, pfx_id, esc_attr); + } else { + insn->attr = inat_get_opcode_attribute(op); + } + goto end; + } + insn->attr = inat_get_opcode_attribute(op); while (inat_is_escape(insn->attr)) { /* Get escaped opcode */ diff --git a/tools/arch/x86/lib/x86-opcode-map.txt b/tools/arch/x86/lib/x86-opcode-map.txt index da9347552b..caedb3ef66 100644 --- a/tools/arch/x86/lib/x86-opcode-map.txt +++ b/tools/arch/x86/lib/x86-opcode-map.txt @@ -23,6 +23,7 @@ # # AVX Superscripts # (ev): this opcode requires EVEX prefix. +# (es): this opcode requires EVEX prefix and is SCALABLE. # (evo): this opcode is changed by EVEX prefix (EVEX opcode) # (v): this opcode requires VEX prefix. # (v1): this opcode only supports 128bit VEX. @@ -33,6 +34,10 @@ # - (F2): the last prefix is 0xF2 # - (!F3) : the last prefix is not 0xF3 (including non-last prefix case) # - (66&F2): Both 0x66 and 0xF2 prefixes are specified. +# +# REX2 Prefix +# - (!REX2): REX2 is not allowed +# - (REX2): REX2 variant e.g. 
JMPABS Table: one byte opcode Referrer: @@ -157,22 +162,22 @@ AVXcode: 6e: OUTS/OUTSB DX,Xb 6f: OUTS/OUTSW/OUTSD DX,Xz # 0x70 - 0x7f -70: JO Jb -71: JNO Jb -72: JB/JNAE/JC Jb -73: JNB/JAE/JNC Jb -74: JZ/JE Jb -75: JNZ/JNE Jb -76: JBE/JNA Jb -77: JNBE/JA Jb -78: JS Jb -79: JNS Jb -7a: JP/JPE Jb -7b: JNP/JPO Jb -7c: JL/JNGE Jb -7d: JNL/JGE Jb -7e: JLE/JNG Jb -7f: JNLE/JG Jb +70: JO Jb (!REX2) +71: JNO Jb (!REX2) +72: JB/JNAE/JC Jb (!REX2) +73: JNB/JAE/JNC Jb (!REX2) +74: JZ/JE Jb (!REX2) +75: JNZ/JNE Jb (!REX2) +76: JBE/JNA Jb (!REX2) +77: JNBE/JA Jb (!REX2) +78: JS Jb (!REX2) +79: JNS Jb (!REX2) +7a: JP/JPE Jb (!REX2) +7b: JNP/JPO Jb (!REX2) +7c: JL/JNGE Jb (!REX2) +7d: JNL/JGE Jb (!REX2) +7e: JLE/JNG Jb (!REX2) +7f: JNLE/JG Jb (!REX2) # 0x80 - 0x8f 80: Grp1 Eb,Ib (1A) 81: Grp1 Ev,Iz (1A) @@ -208,24 +213,24 @@ AVXcode: 9e: SAHF 9f: LAHF # 0xa0 - 0xaf -a0: MOV AL,Ob -a1: MOV rAX,Ov -a2: MOV Ob,AL -a3: MOV Ov,rAX -a4: MOVS/B Yb,Xb -a5: MOVS/W/D/Q Yv,Xv -a6: CMPS/B Xb,Yb -a7: CMPS/W/D Xv,Yv -a8: TEST AL,Ib -a9: TEST rAX,Iz -aa: STOS/B Yb,AL -ab: STOS/W/D/Q Yv,rAX -ac: LODS/B AL,Xb -ad: LODS/W/D/Q rAX,Xv -ae: SCAS/B AL,Yb +a0: MOV AL,Ob (!REX2) +a1: MOV rAX,Ov (!REX2) | JMPABS O (REX2),(o64) +a2: MOV Ob,AL (!REX2) +a3: MOV Ov,rAX (!REX2) +a4: MOVS/B Yb,Xb (!REX2) +a5: MOVS/W/D/Q Yv,Xv (!REX2) +a6: CMPS/B Xb,Yb (!REX2) +a7: CMPS/W/D Xv,Yv (!REX2) +a8: TEST AL,Ib (!REX2) +a9: TEST rAX,Iz (!REX2) +aa: STOS/B Yb,AL (!REX2) +ab: STOS/W/D/Q Yv,rAX (!REX2) +ac: LODS/B AL,Xb (!REX2) +ad: LODS/W/D/Q rAX,Xv (!REX2) +ae: SCAS/B AL,Yb (!REX2) # Note: The May 2011 Intel manual shows Xv for the second parameter of the # next instruction but Yv is correct -af: SCAS/W/D/Q rAX,Yv +af: SCAS/W/D/Q rAX,Yv (!REX2) # 0xb0 - 0xbf b0: MOV AL/R8L,Ib b1: MOV CL/R9L,Ib @@ -266,7 +271,7 @@ d1: Grp2 Ev,1 (1A) d2: Grp2 Eb,CL (1A) d3: Grp2 Ev,CL (1A) d4: AAM Ib (i64) -d5: AAD Ib (i64) +d5: AAD Ib (i64) | REX2 (Prefix),(o64) d6: d7: XLAT/XLATB d8: ESC @@ -281,26 +286,26 @@ df: ESC # Note: "forced64" 
is Intel CPU behavior: they ignore 0x66 prefix # in 64-bit mode. AMD CPUs accept 0x66 prefix, it causes RIP truncation # to 16 bits. In 32-bit mode, 0x66 is accepted by both Intel and AMD. -e0: LOOPNE/LOOPNZ Jb (f64) -e1: LOOPE/LOOPZ Jb (f64) -e2: LOOP Jb (f64) -e3: JrCXZ Jb (f64) -e4: IN AL,Ib -e5: IN eAX,Ib -e6: OUT Ib,AL -e7: OUT Ib,eAX +e0: LOOPNE/LOOPNZ Jb (f64) (!REX2) +e1: LOOPE/LOOPZ Jb (f64) (!REX2) +e2: LOOP Jb (f64) (!REX2) +e3: JrCXZ Jb (f64) (!REX2) +e4: IN AL,Ib (!REX2) +e5: IN eAX,Ib (!REX2) +e6: OUT Ib,AL (!REX2) +e7: OUT Ib,eAX (!REX2) # With 0x66 prefix in 64-bit mode, for AMD CPUs immediate offset # in "near" jumps and calls is 16-bit. For CALL, # push of return address is 16-bit wide, RSP is decremented by 2 # but is not truncated to 16 bits, unlike RIP. -e8: CALL Jz (f64) -e9: JMP-near Jz (f64) -ea: JMP-far Ap (i64) -eb: JMP-short Jb (f64) -ec: IN AL,DX -ed: IN eAX,DX -ee: OUT DX,AL -ef: OUT DX,eAX +e8: CALL Jz (f64) (!REX2) +e9: JMP-near Jz (f64) (!REX2) +ea: JMP-far Ap (i64) (!REX2) +eb: JMP-short Jb (f64) (!REX2) +ec: IN AL,DX (!REX2) +ed: IN eAX,DX (!REX2) +ee: OUT DX,AL (!REX2) +ef: OUT DX,eAX (!REX2) # 0xf0 - 0xff f0: LOCK (Prefix) f1: @@ -386,14 +391,14 @@ AVXcode: 1 2e: vucomiss Vss,Wss (v1) | vucomisd Vsd,Wsd (66),(v1) 2f: vcomiss Vss,Wss (v1) | vcomisd Vsd,Wsd (66),(v1) # 0x0f 0x30-0x3f -30: WRMSR -31: RDTSC -32: RDMSR -33: RDPMC -34: SYSENTER -35: SYSEXIT +30: WRMSR (!REX2) +31: RDTSC (!REX2) +32: RDMSR (!REX2) +33: RDPMC (!REX2) +34: SYSENTER (!REX2) +35: SYSEXIT (!REX2) 36: -37: GETSEC +37: GETSEC (!REX2) 38: escape # 3-byte escape 1 39: 3a: escape # 3-byte escape 2 @@ -473,22 +478,22 @@ AVXcode: 1 7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqa32/64 Wx,Vx (66),(evo) | vmovdqu Wx,Vx (F3) | vmovdqu32/64 Wx,Vx (F3),(evo) | vmovdqu8/16 Wx,Vx (F2),(ev) # 0x0f 0x80-0x8f # Note: "forced64" is Intel CPU behavior (see comment about CALL insn). 
-80: JO Jz (f64) -81: JNO Jz (f64) -82: JB/JC/JNAE Jz (f64) -83: JAE/JNB/JNC Jz (f64) -84: JE/JZ Jz (f64) -85: JNE/JNZ Jz (f64) -86: JBE/JNA Jz (f64) -87: JA/JNBE Jz (f64) -88: JS Jz (f64) -89: JNS Jz (f64) -8a: JP/JPE Jz (f64) -8b: JNP/JPO Jz (f64) -8c: JL/JNGE Jz (f64) -8d: JNL/JGE Jz (f64) -8e: JLE/JNG Jz (f64) -8f: JNLE/JG Jz (f64) +80: JO Jz (f64) (!REX2) +81: JNO Jz (f64) (!REX2) +82: JB/JC/JNAE Jz (f64) (!REX2) +83: JAE/JNB/JNC Jz (f64) (!REX2) +84: JE/JZ Jz (f64) (!REX2) +85: JNE/JNZ Jz (f64) (!REX2) +86: JBE/JNA Jz (f64) (!REX2) +87: JA/JNBE Jz (f64) (!REX2) +88: JS Jz (f64) (!REX2) +89: JNS Jz (f64) (!REX2) +8a: JP/JPE Jz (f64) (!REX2) +8b: JNP/JPO Jz (f64) (!REX2) +8c: JL/JNGE Jz (f64) (!REX2) +8d: JNL/JGE Jz (f64) (!REX2) +8e: JLE/JNG Jz (f64) (!REX2) +8f: JNLE/JG Jz (f64) (!REX2) # 0x0f 0x90-0x9f 90: SETO Eb | kmovw/q Vk,Wk | kmovb/d Vk,Wk (66) 91: SETNO Eb | kmovw/q Mv,Vk | kmovb/d Mv,Vk (66) @@ -698,8 +703,8 @@ AVXcode: 2 4d: vrcp14ss/d Vsd,Hpd,Wsd (66),(ev) 4e: vrsqrt14ps/d Vpd,Wpd (66),(ev) 4f: vrsqrt14ss/d Vsd,Hsd,Wsd (66),(ev) -50: vpdpbusd Vx,Hx,Wx (66) -51: vpdpbusds Vx,Hx,Wx (66) +50: vpdpbusd Vx,Hx,Wx (66) | vpdpbssd Vx,Hx,Wx (F2),(v) | vpdpbsud Vx,Hx,Wx (F3),(v) | vpdpbuud Vx,Hx,Wx (v) +51: vpdpbusds Vx,Hx,Wx (66) | vpdpbssds Vx,Hx,Wx (F2),(v) | vpdpbsuds Vx,Hx,Wx (F3),(v) | vpdpbuuds Vx,Hx,Wx (v) 52: vdpbf16ps Vx,Hx,Wx (F3),(ev) | vpdpwssd Vx,Hx,Wx (66) | vp4dpwssd Vdqq,Hdqq,Wdq (F2),(ev) 53: vpdpwssds Vx,Hx,Wx (66) | vp4dpwssds Vdqq,Hdqq,Wdq (F2),(ev) 54: vpopcntb/w Vx,Wx (66),(ev) @@ -708,7 +713,7 @@ AVXcode: 2 59: vpbroadcastq Vx,Wx (66),(v) | vbroadcasti32x2 Vx,Wx (66),(evo) 5a: vbroadcasti128 Vqq,Mdq (66),(v) | vbroadcasti32x4/64x2 Vx,Wx (66),(evo) 5b: vbroadcasti32x8/64x4 Vqq,Mdq (66),(ev) -5c: TDPBF16PS Vt,Wt,Ht (F3),(v1) +5c: TDPBF16PS Vt,Wt,Ht (F3),(v1) | TDPFP16PS Vt,Wt,Ht (F2),(v1),(o64) # Skip 0x5d 5e: TDPBSSD Vt,Wt,Ht (F2),(v1) | TDPBSUD Vt,Wt,Ht (F3),(v1) | TDPBUSD Vt,Wt,Ht (66),(v1) | TDPBUUD Vt,Wt,Ht (v1) # Skip 0x5f-0x61 @@ 
-718,10 +723,12 @@ AVXcode: 2 65: vblendmps/d Vx,Hx,Wx (66),(ev) 66: vpblendmb/w Vx,Hx,Wx (66),(ev) 68: vp2intersectd/q Kx,Hx,Wx (F2),(ev) -# Skip 0x69-0x6f +# Skip 0x69-0x6b +6c: TCMMIMFP16PS Vt,Wt,Ht (66),(v1),(o64) | TCMMRLFP16PS Vt,Wt,Ht (v1),(o64) +# Skip 0x6d-0x6f 70: vpshldvw Vx,Hx,Wx (66),(ev) 71: vpshldvd/q Vx,Hx,Wx (66),(ev) -72: vcvtne2ps2bf16 Vx,Hx,Wx (F2),(ev) | vcvtneps2bf16 Vx,Wx (F3),(ev) | vpshrdvw Vx,Hx,Wx (66),(ev) +72: vcvtne2ps2bf16 Vx,Hx,Wx (F2),(ev) | vcvtneps2bf16 Vx,Wx (F3) | vpshrdvw Vx,Hx,Wx (66),(ev) 73: vpshrdvd/q Vx,Hx,Wx (66),(ev) 75: vpermi2b/w Vx,Hx,Wx (66),(ev) 76: vpermi2d/q Vx,Hx,Wx (66),(ev) @@ -777,8 +784,10 @@ ac: vfnmadd213ps/d Vx,Hx,Wx (66),(v) ad: vfnmadd213ss/d Vx,Hx,Wx (66),(v),(v1) ae: vfnmsub213ps/d Vx,Hx,Wx (66),(v) af: vfnmsub213ss/d Vx,Hx,Wx (66),(v),(v1) -b4: vpmadd52luq Vx,Hx,Wx (66),(ev) -b5: vpmadd52huq Vx,Hx,Wx (66),(ev) +b0: vcvtneebf162ps Vx,Mx (F3),(!11B),(v) | vcvtneeph2ps Vx,Mx (66),(!11B),(v) | vcvtneobf162ps Vx,Mx (F2),(!11B),(v) | vcvtneoph2ps Vx,Mx (!11B),(v) +b1: vbcstnebf162ps Vx,Mw (F3),(!11B),(v) | vbcstnesh2ps Vx,Mw (66),(!11B),(v) +b4: vpmadd52luq Vx,Hx,Wx (66) +b5: vpmadd52huq Vx,Hx,Wx (66) b6: vfmaddsub231ps/d Vx,Hx,Wx (66),(v) b7: vfmsubadd231ps/d Vx,Hx,Wx (66),(v) b8: vfmadd231ps/d Vx,Hx,Wx (66),(v) @@ -796,15 +805,35 @@ c7: Grp19 (1A) c8: sha1nexte Vdq,Wdq | vexp2ps/d Vx,Wx (66),(ev) c9: sha1msg1 Vdq,Wdq ca: sha1msg2 Vdq,Wdq | vrcp28ps/d Vx,Wx (66),(ev) -cb: sha256rnds2 Vdq,Wdq | vrcp28ss/d Vx,Hx,Wx (66),(ev) -cc: sha256msg1 Vdq,Wdq | vrsqrt28ps/d Vx,Wx (66),(ev) -cd: sha256msg2 Vdq,Wdq | vrsqrt28ss/d Vx,Hx,Wx (66),(ev) +cb: sha256rnds2 Vdq,Wdq | vrcp28ss/d Vx,Hx,Wx (66),(ev) | vsha512rnds2 Vqq,Hqq,Udq (F2),(11B),(v) +cc: sha256msg1 Vdq,Wdq | vrsqrt28ps/d Vx,Wx (66),(ev) | vsha512msg1 Vqq,Udq (F2),(11B),(v) +cd: sha256msg2 Vdq,Wdq | vrsqrt28ss/d Vx,Hx,Wx (66),(ev) | vsha512msg2 Vqq,Uqq (F2),(11B),(v) cf: vgf2p8mulb Vx,Wx (66) +d2: vpdpwsud Vx,Hx,Wx (F3),(v) | vpdpwusd Vx,Hx,Wx (66),(v) | 
vpdpwuud Vx,Hx,Wx (v) +d3: vpdpwsuds Vx,Hx,Wx (F3),(v) | vpdpwusds Vx,Hx,Wx (66),(v) | vpdpwuuds Vx,Hx,Wx (v) +d8: AESENCWIDE128KL Qpi (F3),(000),(00B) | AESENCWIDE256KL Qpi (F3),(000),(10B) | AESDECWIDE128KL Qpi (F3),(000),(01B) | AESDECWIDE256KL Qpi (F3),(000),(11B) +da: vsm3msg1 Vdq,Hdq,Udq (v1) | vsm3msg2 Vdq,Hdq,Udq (66),(v1) | vsm4key4 Vx,Hx,Wx (F3),(v) | vsm4rnds4 Vx,Hx,Wx (F2),(v) db: VAESIMC Vdq,Wdq (66),(v1) -dc: vaesenc Vx,Hx,Wx (66) -dd: vaesenclast Vx,Hx,Wx (66) -de: vaesdec Vx,Hx,Wx (66) -df: vaesdeclast Vx,Hx,Wx (66) +dc: vaesenc Vx,Hx,Wx (66) | LOADIWKEY Vx,Hx (F3) | AESENC128KL Vpd,Qpi (F3) +dd: vaesenclast Vx,Hx,Wx (66) | AESDEC128KL Vpd,Qpi (F3) +de: vaesdec Vx,Hx,Wx (66) | AESENC256KL Vpd,Qpi (F3) +df: vaesdeclast Vx,Hx,Wx (66) | AESDEC256KL Vpd,Qpi (F3) +e0: CMPOXADD My,Gy,By (66),(v1),(o64) +e1: CMPNOXADD My,Gy,By (66),(v1),(o64) +e2: CMPBXADD My,Gy,By (66),(v1),(o64) +e3: CMPNBXADD My,Gy,By (66),(v1),(o64) +e4: CMPZXADD My,Gy,By (66),(v1),(o64) +e5: CMPNZXADD My,Gy,By (66),(v1),(o64) +e6: CMPBEXADD My,Gy,By (66),(v1),(o64) +e7: CMPNBEXADD My,Gy,By (66),(v1),(o64) +e8: CMPSXADD My,Gy,By (66),(v1),(o64) +e9: CMPNSXADD My,Gy,By (66),(v1),(o64) +ea: CMPPXADD My,Gy,By (66),(v1),(o64) +eb: CMPNPXADD My,Gy,By (66),(v1),(o64) +ec: CMPLXADD My,Gy,By (66),(v1),(o64) +ed: CMPNLXADD My,Gy,By (66),(v1),(o64) +ee: CMPLEXADD My,Gy,By (66),(v1),(o64) +ef: CMPNLEXADD My,Gy,By (66),(v1),(o64) f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) | CRC32 Gd,Eb (66&F2) f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) | CRC32 Gd,Ew (66&F2) f2: ANDN Gy,By,Ey (v) @@ -812,8 +841,11 @@ f3: Grp17 (1A) f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) | WRUSSD/Q My,Gy (66) f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v) | WRSSD/Q My,Gy f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v) -f8: MOVDIR64B Gv,Mdqq (66) | ENQCMD Gv,Mdqq (F2) | ENQCMDS Gv,Mdqq (F3) +f8: MOVDIR64B 
Gv,Mdqq (66) | ENQCMD Gv,Mdqq (F2) | ENQCMDS Gv,Mdqq (F3) | URDMSR Rq,Gq (F2),(11B) | UWRMSR Gq,Rq (F3),(11B) f9: MOVDIRI My,Gy +fa: ENCODEKEY128 Ew,Ew (F3) +fb: ENCODEKEY256 Ew,Ew (F3) +fc: AADD My,Gy | AAND My,Gy (66) | AOR My,Gy (F2) | AXOR My,Gy (F3) EndTable Table: 3-byte opcode 2 (0x0f 0x3a) @@ -893,10 +925,103 @@ c2: vcmpph Vx,Hx,Wx,Ib (ev) | vcmpsh Vx,Hx,Wx,Ib (F3),(ev) cc: sha1rnds4 Vdq,Wdq,Ib ce: vgf2p8affineqb Vx,Wx,Ib (66) cf: vgf2p8affineinvqb Vx,Wx,Ib (66) +de: vsm3rnds2 Vdq,Hdq,Wdq,Ib (66),(v1) df: VAESKEYGEN Vdq,Wdq,Ib (66),(v1) f0: RORX Gy,Ey,Ib (F2),(v) | HRESET Gv,Ib (F3),(000),(11B) EndTable +Table: EVEX map 4 +Referrer: +AVXcode: 4 +00: ADD Eb,Gb (ev) +01: ADD Ev,Gv (es) | ADD Ev,Gv (66),(es) +02: ADD Gb,Eb (ev) +03: ADD Gv,Ev (es) | ADD Gv,Ev (66),(es) +08: OR Eb,Gb (ev) +09: OR Ev,Gv (es) | OR Ev,Gv (66),(es) +0a: OR Gb,Eb (ev) +0b: OR Gv,Ev (es) | OR Gv,Ev (66),(es) +10: ADC Eb,Gb (ev) +11: ADC Ev,Gv (es) | ADC Ev,Gv (66),(es) +12: ADC Gb,Eb (ev) +13: ADC Gv,Ev (es) | ADC Gv,Ev (66),(es) +18: SBB Eb,Gb (ev) +19: SBB Ev,Gv (es) | SBB Ev,Gv (66),(es) +1a: SBB Gb,Eb (ev) +1b: SBB Gv,Ev (es) | SBB Gv,Ev (66),(es) +20: AND Eb,Gb (ev) +21: AND Ev,Gv (es) | AND Ev,Gv (66),(es) +22: AND Gb,Eb (ev) +23: AND Gv,Ev (es) | AND Gv,Ev (66),(es) +24: SHLD Ev,Gv,Ib (es) | SHLD Ev,Gv,Ib (66),(es) +28: SUB Eb,Gb (ev) +29: SUB Ev,Gv (es) | SUB Ev,Gv (66),(es) +2a: SUB Gb,Eb (ev) +2b: SUB Gv,Ev (es) | SUB Gv,Ev (66),(es) +2c: SHRD Ev,Gv,Ib (es) | SHRD Ev,Gv,Ib (66),(es) +30: XOR Eb,Gb (ev) +31: XOR Ev,Gv (es) | XOR Ev,Gv (66),(es) +32: XOR Gb,Eb (ev) +33: XOR Gv,Ev (es) | XOR Gv,Ev (66),(es) +# CCMPSCC instructions are: CCOMB, CCOMBE, CCOMF, CCOML, CCOMLE, CCOMNB, CCOMNBE, CCOMNL, CCOMNLE, +# CCOMNO, CCOMNS, CCOMNZ, CCOMO, CCOMS, CCOMT, CCOMZ +38: CCMPSCC Eb,Gb (ev) +39: CCMPSCC Ev,Gv (es) | CCMPSCC Ev,Gv (66),(es) +3a: CCMPSCC Gv,Ev (ev) +3b: CCMPSCC Gv,Ev (es) | CCMPSCC Gv,Ev (66),(es) +40: CMOVO Gv,Ev (es) | CMOVO Gv,Ev (66),(es) | CFCMOVO Ev,Ev (es) | 
CFCMOVO Ev,Ev (66),(es) | SETO Eb (F2),(ev) +41: CMOVNO Gv,Ev (es) | CMOVNO Gv,Ev (66),(es) | CFCMOVNO Ev,Ev (es) | CFCMOVNO Ev,Ev (66),(es) | SETNO Eb (F2),(ev) +42: CMOVB Gv,Ev (es) | CMOVB Gv,Ev (66),(es) | CFCMOVB Ev,Ev (es) | CFCMOVB Ev,Ev (66),(es) | SETB Eb (F2),(ev) +43: CMOVNB Gv,Ev (es) | CMOVNB Gv,Ev (66),(es) | CFCMOVNB Ev,Ev (es) | CFCMOVNB Ev,Ev (66),(es) | SETNB Eb (F2),(ev) +44: CMOVZ Gv,Ev (es) | CMOVZ Gv,Ev (66),(es) | CFCMOVZ Ev,Ev (es) | CFCMOVZ Ev,Ev (66),(es) | SETZ Eb (F2),(ev) +45: CMOVNZ Gv,Ev (es) | CMOVNZ Gv,Ev (66),(es) | CFCMOVNZ Ev,Ev (es) | CFCMOVNZ Ev,Ev (66),(es) | SETNZ Eb (F2),(ev) +46: CMOVBE Gv,Ev (es) | CMOVBE Gv,Ev (66),(es) | CFCMOVBE Ev,Ev (es) | CFCMOVBE Ev,Ev (66),(es) | SETBE Eb (F2),(ev) +47: CMOVNBE Gv,Ev (es) | CMOVNBE Gv,Ev (66),(es) | CFCMOVNBE Ev,Ev (es) | CFCMOVNBE Ev,Ev (66),(es) | SETNBE Eb (F2),(ev) +48: CMOVS Gv,Ev (es) | CMOVS Gv,Ev (66),(es) | CFCMOVS Ev,Ev (es) | CFCMOVS Ev,Ev (66),(es) | SETS Eb (F2),(ev) +49: CMOVNS Gv,Ev (es) | CMOVNS Gv,Ev (66),(es) | CFCMOVNS Ev,Ev (es) | CFCMOVNS Ev,Ev (66),(es) | SETNS Eb (F2),(ev) +4a: CMOVP Gv,Ev (es) | CMOVP Gv,Ev (66),(es) | CFCMOVP Ev,Ev (es) | CFCMOVP Ev,Ev (66),(es) | SETP Eb (F2),(ev) +4b: CMOVNP Gv,Ev (es) | CMOVNP Gv,Ev (66),(es) | CFCMOVNP Ev,Ev (es) | CFCMOVNP Ev,Ev (66),(es) | SETNP Eb (F2),(ev) +4c: CMOVL Gv,Ev (es) | CMOVL Gv,Ev (66),(es) | CFCMOVL Ev,Ev (es) | CFCMOVL Ev,Ev (66),(es) | SETL Eb (F2),(ev) +4d: CMOVNL Gv,Ev (es) | CMOVNL Gv,Ev (66),(es) | CFCMOVNL Ev,Ev (es) | CFCMOVNL Ev,Ev (66),(es) | SETNL Eb (F2),(ev) +4e: CMOVLE Gv,Ev (es) | CMOVLE Gv,Ev (66),(es) | CFCMOVLE Ev,Ev (es) | CFCMOVLE Ev,Ev (66),(es) | SETLE Eb (F2),(ev) +4f: CMOVNLE Gv,Ev (es) | CMOVNLE Gv,Ev (66),(es) | CFCMOVNLE Ev,Ev (es) | CFCMOVNLE Ev,Ev (66),(es) | SETNLE Eb (F2),(ev) +60: MOVBE Gv,Ev (es) | MOVBE Gv,Ev (66),(es) +61: MOVBE Ev,Gv (es) | MOVBE Ev,Gv (66),(es) +65: WRUSSD Md,Gd (66),(ev) | WRUSSQ Mq,Gq (66),(ev) +66: ADCX Gy,Ey (66),(ev) | ADOX Gy,Ey (F3),(ev) | 
WRSSD Md,Gd (ev) | WRSSQ Mq,Gq (66),(ev) +69: IMUL Gv,Ev,Iz (es) | IMUL Gv,Ev,Iz (66),(es) +6b: IMUL Gv,Ev,Ib (es) | IMUL Gv,Ev,Ib (66),(es) +80: Grp1 Eb,Ib (1A),(ev) +81: Grp1 Ev,Iz (1A),(es) +83: Grp1 Ev,Ib (1A),(es) +# CTESTSCC instructions are: CTESTB, CTESTBE, CTESTF, CTESTL, CTESTLE, CTESTNB, CTESTNBE, CTESTNL, +# CTESTNLE, CTESTNO, CTESTNS, CTESTNZ, CTESTO, CTESTS, CTESTT, CTESTZ +84: CTESTSCC (ev) +85: CTESTSCC (es) | CTESTSCC (66),(es) +88: POPCNT Gv,Ev (es) | POPCNT Gv,Ev (66),(es) +8f: POP2 Bq,Rq (000),(11B),(ev) +a5: SHLD Ev,Gv,CL (es) | SHLD Ev,Gv,CL (66),(es) +ad: SHRD Ev,Gv,CL (es) | SHRD Ev,Gv,CL (66),(es) +af: IMUL Gv,Ev (es) | IMUL Gv,Ev (66),(es) +c0: Grp2 Eb,Ib (1A),(ev) +c1: Grp2 Ev,Ib (1A),(es) +d0: Grp2 Eb,1 (1A),(ev) +d1: Grp2 Ev,1 (1A),(es) +d2: Grp2 Eb,CL (1A),(ev) +d3: Grp2 Ev,CL (1A),(es) +f0: CRC32 Gy,Eb (es) | INVEPT Gq,Mdq (F3),(ev) +f1: CRC32 Gy,Ey (es) | CRC32 Gy,Ey (66),(es) | INVVPID Gy,Mdq (F3),(ev) +f2: INVPCID Gy,Mdq (F3),(ev) +f4: TZCNT Gv,Ev (es) | TZCNT Gv,Ev (66),(es) +f5: LZCNT Gv,Ev (es) | LZCNT Gv,Ev (66),(es) +f6: Grp3_1 Eb (1A),(ev) +f7: Grp3_2 Ev (1A),(es) +f8: MOVDIR64B Gv,Mdqq (66),(ev) | ENQCMD Gv,Mdqq (F2),(ev) | ENQCMDS Gv,Mdqq (F3),(ev) | URDMSR Rq,Gq (F2),(11B),(ev) | UWRMSR Gq,Rq (F3),(11B),(ev) +f9: MOVDIRI My,Gy (ev) +fe: Grp4 (1A),(ev) +ff: Grp5 (1A),(es) | PUSH2 Bq,Rq (110),(11B),(ev) +EndTable + Table: EVEX map 5 Referrer: AVXcode: 5 @@ -975,6 +1100,12 @@ d6: vfcmulcph Vx,Hx,Wx (F2),(ev) | vfmulcph Vx,Hx,Wx (F3),(ev) d7: vfcmulcsh Vx,Hx,Wx (F2),(ev) | vfmulcsh Vx,Hx,Wx (F3),(ev) EndTable +Table: VEX map 7 +Referrer: +AVXcode: 7 +f8: URDMSR Rq,Id (F2),(v1),(11B) | UWRMSR Id,Rq (F3),(v1),(11B) +EndTable + GrpTable: Grp1 0: ADD 1: OR @@ -1051,7 +1182,7 @@ GrpTable: Grp6 EndTable GrpTable: Grp7 -0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) | WRMSRNS (110),(11B) +0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH 
(010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) | WRMSRNS (110),(11B) | RDMSRLIST (F2),(110),(11B) | WRMSRLIST (F3),(110),(11B) | PBNDKB (111),(11B) 1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) | ERETU (F3),(010),(11B) | ERETS (F2),(010),(11B) 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) | ENCLU (111),(11B) 3: LIDT Ms @@ -1137,6 +1268,8 @@ GrpTable: Grp16 1: prefetch T0 2: prefetch T1 3: prefetch T2 +6: prefetch IT1 +7: prefetch IT0 EndTable GrpTable: Grp17 diff --git a/tools/arch/x86/tools/gen-insn-attr-x86.awk b/tools/arch/x86/tools/gen-insn-attr-x86.awk index af38469afd..5770c8097f 100644 --- a/tools/arch/x86/tools/gen-insn-attr-x86.awk +++ b/tools/arch/x86/tools/gen-insn-attr-x86.awk @@ -64,7 +64,9 @@ BEGIN { modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])" force64_expr = "\\([df]64\\)" - rex_expr = "^REX(\\.[XRWB]+)*" + rex_expr = "^((REX(\\.[XRWB]+)+)|(REX$))" + rex2_expr = "\\(REX2\\)" + no_rex2_expr = "\\(!REX2\\)" fpu_expr = "^ESC" # TODO lprefix1_expr = "\\((66|!F3)\\)" @@ -81,6 +83,8 @@ BEGIN { vexonly_expr = "\\(v\\)" # All opcodes with (ev) superscript supports *only* EVEX prefix evexonly_expr = "\\(ev\\)" + # (es) is the same as (ev) but also "SCALABLE" i.e. 
W and pp determine operand size + evex_scalable_expr = "\\(es\\)" prefix_expr = "\\(Prefix\\)" prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" @@ -99,6 +103,7 @@ BEGIN { prefix_num["VEX+1byte"] = "INAT_PFX_VEX2" prefix_num["VEX+2byte"] = "INAT_PFX_VEX3" prefix_num["EVEX"] = "INAT_PFX_EVEX" + prefix_num["REX2"] = "INAT_PFX_REX2" clear_vars() } @@ -314,6 +319,10 @@ function convert_operands(count,opnd, i,j,imm,mod) if (match(ext, force64_expr)) flags = add_flags(flags, "INAT_FORCE64") + # check REX2 not allowed + if (match(ext, no_rex2_expr)) + flags = add_flags(flags, "INAT_NO_REX2") + # check REX prefix if (match(opcode, rex_expr)) flags = add_flags(flags, "INAT_MAKE_PREFIX(INAT_PFX_REX)") @@ -325,6 +334,8 @@ function convert_operands(count,opnd, i,j,imm,mod) # check VEX codes if (match(ext, evexonly_expr)) flags = add_flags(flags, "INAT_VEXOK | INAT_EVEXONLY") + else if (match(ext, evex_scalable_expr)) + flags = add_flags(flags, "INAT_VEXOK | INAT_EVEXONLY | INAT_EVEX_SCALABLE") else if (match(ext, vexonly_expr)) flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY") else if (match(ext, vexok_expr) || match(opcode, vexok_opcode_expr)) @@ -351,6 +362,8 @@ function convert_operands(count,opnd, i,j,imm,mod) lptable3[idx] = add_flags(lptable3[idx],flags) variant = "INAT_VARIANT" } + if (match(ext, rex2_expr)) + table[idx] = add_flags(table[idx], "INAT_REX2_VARIANT") if (!match(ext, lprefix_expr)){ table[idx] = add_flags(table[idx],flags) } diff --git a/tools/bpf/bpftool/Documentation/Makefile b/tools/bpf/bpftool/Documentation/Makefile index ac8487dcff..4315652678 100644 --- a/tools/bpf/bpftool/Documentation/Makefile +++ b/tools/bpf/bpftool/Documentation/Makefile @@ -31,9 +31,9 @@ see_also = $(subst " ",, \ "\n" \ "SEE ALSO\n" \ "========\n" \ - "\t**bpf**\ (2),\n" \ - "\t**bpf-helpers**\\ (7)" \ - $(foreach page,$(call list_pages,$(1)),",\n\t**$(page)**\\ (8)") \ + "**bpf**\ (2),\n" \ + "**bpf-helpers**\\ (7)" \ + $(foreach page,$(call list_pages,$(1)),",\n**$(page)**\\ 
(8)") \ "\n") $(OUTPUT)%.8: %.rst diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst index 342716f74e..eaba24320f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst @@ -14,82 +14,76 @@ tool for inspection of BTF data SYNOPSIS ======== - **bpftool** [*OPTIONS*] **btf** *COMMAND* +**bpftool** [*OPTIONS*] **btf** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | { **-B** | **--base-btf** } } +*OPTIONS* := { |COMMON_OPTIONS| | { **-B** | **--base-btf** } } - *COMMANDS* := { **dump** | **help** } +*COMMANDS* := { **dump** | **help** } BTF COMMANDS ============= -| **bpftool** **btf** { **show** | **list** } [**id** *BTF_ID*] -| **bpftool** **btf dump** *BTF_SRC* [**format** *FORMAT*] -| **bpftool** **btf help** +| **bpftool** **btf** { **show** | **list** } [**id** *BTF_ID*] +| **bpftool** **btf dump** *BTF_SRC* [**format** *FORMAT*] +| **bpftool** **btf help** | -| *BTF_SRC* := { **id** *BTF_ID* | **prog** *PROG* | **map** *MAP* [{**key** | **value** | **kv** | **all**}] | **file** *FILE* } -| *FORMAT* := { **raw** | **c** } -| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } +| *BTF_SRC* := { **id** *BTF_ID* | **prog** *PROG* | **map** *MAP* [{**key** | **value** | **kv** | **all**}] | **file** *FILE* } +| *FORMAT* := { **raw** | **c** } +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } DESCRIPTION =========== - **bpftool btf { show | list }** [**id** *BTF_ID*] - Show information about loaded BTF objects. If a BTF ID is - specified, show information only about given BTF object, - otherwise list all BTF objects currently loaded on the - system. +bpftool btf { show | list } [id *BTF_ID*] + Show information about loaded BTF objects. 
If a BTF ID is specified, show + information only about given BTF object, otherwise list all BTF objects + currently loaded on the system. - Since Linux 5.8 bpftool is able to discover information about - processes that hold open file descriptors (FDs) against BTF - objects. On such kernels bpftool will automatically emit this - information as well. + Since Linux 5.8 bpftool is able to discover information about processes + that hold open file descriptors (FDs) against BTF objects. On such kernels + bpftool will automatically emit this information as well. - **bpftool btf dump** *BTF_SRC* - Dump BTF entries from a given *BTF_SRC*. +bpftool btf dump *BTF_SRC* + Dump BTF entries from a given *BTF_SRC*. - When **id** is specified, BTF object with that ID will be - loaded and all its BTF types emitted. + When **id** is specified, BTF object with that ID will be loaded and all + its BTF types emitted. - When **map** is provided, it's expected that map has - associated BTF object with BTF types describing key and - value. It's possible to select whether to dump only BTF - type(s) associated with key (**key**), value (**value**), - both key and value (**kv**), or all BTF types present in - associated BTF object (**all**). If not specified, **kv** - is assumed. + When **map** is provided, it's expected that map has associated BTF object + with BTF types describing key and value. It's possible to select whether to + dump only BTF type(s) associated with key (**key**), value (**value**), + both key and value (**kv**), or all BTF types present in associated BTF + object (**all**). If not specified, **kv** is assumed. - When **prog** is provided, it's expected that program has - associated BTF object with BTF types. + When **prog** is provided, it's expected that program has associated BTF + object with BTF types. - When specifying *FILE*, an ELF file is expected, containing - .BTF section with well-defined BTF binary format data, - typically produced by clang or pahole. 
+ When specifying *FILE*, an ELF file is expected, containing .BTF section + with well-defined BTF binary format data, typically produced by clang or + pahole. - **format** option can be used to override default (raw) - output format. Raw (**raw**) or C-syntax (**c**) output - formats are supported. + **format** option can be used to override default (raw) output format. Raw + (**raw**) or C-syntax (**c**) output formats are supported. - **bpftool btf help** - Print short help message. +bpftool btf help + Print short help message. OPTIONS ======= - .. include:: common_options.rst - - -B, --base-btf *FILE* - Pass a base BTF object. Base BTF objects are typically used - with BTF objects for kernel modules. To avoid duplicating - all kernel symbols required by modules, BTF objects for - modules are "split", they are built incrementally on top of - the kernel (vmlinux) BTF object. So the base BTF reference - should usually point to the kernel BTF. - - When the main BTF object to process (for example, the - module BTF to dump) is passed as a *FILE*, bpftool attempts - to autodetect the path for the base object, and passing - this option is optional. When the main BTF object is passed - through other handles, this option becomes necessary. +.. include:: common_options.rst + +-B, --base-btf *FILE* + Pass a base BTF object. Base BTF objects are typically used with BTF + objects for kernel modules. To avoid duplicating all kernel symbols + required by modules, BTF objects for modules are "split", they are + built incrementally on top of the kernel (vmlinux) BTF object. So the + base BTF reference should usually point to the kernel BTF. + + When the main BTF object to process (for example, the module BTF to + dump) is passed as a *FILE*, bpftool attempts to autodetect the path + for the base object, and passing this option is optional. When the main + BTF object is passed through other handles, this option becomes + necessary. 
EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst index 2ce900f66d..e8185596a7 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -14,134 +14,125 @@ tool for inspection and simple manipulation of eBPF progs SYNOPSIS ======== - **bpftool** [*OPTIONS*] **cgroup** *COMMAND* +**bpftool** [*OPTIONS*] **cgroup** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } } +*OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } } - *COMMANDS* := - { **show** | **list** | **tree** | **attach** | **detach** | **help** } +*COMMANDS* := +{ **show** | **list** | **tree** | **attach** | **detach** | **help** } CGROUP COMMANDS =============== -| **bpftool** **cgroup** { **show** | **list** } *CGROUP* [**effective**] -| **bpftool** **cgroup tree** [*CGROUP_ROOT*] [**effective**] -| **bpftool** **cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] -| **bpftool** **cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* -| **bpftool** **cgroup help** +| **bpftool** **cgroup** { **show** | **list** } *CGROUP* [**effective**] +| **bpftool** **cgroup tree** [*CGROUP_ROOT*] [**effective**] +| **bpftool** **cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] +| **bpftool** **cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* +| **bpftool** **cgroup help** | -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } -| *ATTACH_TYPE* := { **cgroup_inet_ingress** | **cgroup_inet_egress** | -| **cgroup_inet_sock_create** | **cgroup_sock_ops** | -| **cgroup_device** | **cgroup_inet4_bind** | **cgroup_inet6_bind** | -| **cgroup_inet4_post_bind** | **cgroup_inet6_post_bind** | -| **cgroup_inet4_connect** | **cgroup_inet6_connect** | -| **cgroup_unix_connect** | **cgroup_inet4_getpeername** | -| **cgroup_inet6_getpeername** | **cgroup_unix_getpeername** | -| **cgroup_inet4_getsockname** | 
**cgroup_inet6_getsockname** | -| **cgroup_unix_getsockname** | **cgroup_udp4_sendmsg** | -| **cgroup_udp6_sendmsg** | **cgroup_unix_sendmsg** | -| **cgroup_udp4_recvmsg** | **cgroup_udp6_recvmsg** | -| **cgroup_unix_recvmsg** | **cgroup_sysctl** | -| **cgroup_getsockopt** | **cgroup_setsockopt** | -| **cgroup_inet_sock_release** } -| *ATTACH_FLAGS* := { **multi** | **override** } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } +| *ATTACH_TYPE* := { **cgroup_inet_ingress** | **cgroup_inet_egress** | +| **cgroup_inet_sock_create** | **cgroup_sock_ops** | +| **cgroup_device** | **cgroup_inet4_bind** | **cgroup_inet6_bind** | +| **cgroup_inet4_post_bind** | **cgroup_inet6_post_bind** | +| **cgroup_inet4_connect** | **cgroup_inet6_connect** | +| **cgroup_unix_connect** | **cgroup_inet4_getpeername** | +| **cgroup_inet6_getpeername** | **cgroup_unix_getpeername** | +| **cgroup_inet4_getsockname** | **cgroup_inet6_getsockname** | +| **cgroup_unix_getsockname** | **cgroup_udp4_sendmsg** | +| **cgroup_udp6_sendmsg** | **cgroup_unix_sendmsg** | +| **cgroup_udp4_recvmsg** | **cgroup_udp6_recvmsg** | +| **cgroup_unix_recvmsg** | **cgroup_sysctl** | +| **cgroup_getsockopt** | **cgroup_setsockopt** | +| **cgroup_inet_sock_release** } +| *ATTACH_FLAGS* := { **multi** | **override** } DESCRIPTION =========== - **bpftool cgroup { show | list }** *CGROUP* [**effective**] - List all programs attached to the cgroup *CGROUP*. - - Output will start with program ID followed by attach type, - attach flags and program name. - - If **effective** is specified retrieve effective programs that - will execute for events within a cgroup. This includes - inherited along with attached ones. - - **bpftool cgroup tree** [*CGROUP_ROOT*] [**effective**] - Iterate over all cgroups in *CGROUP_ROOT* and list all - attached programs. If *CGROUP_ROOT* is not specified, - bpftool uses cgroup v2 mountpoint. 
- - The output is similar to the output of cgroup show/list - commands: it starts with absolute cgroup path, followed by - program ID, attach type, attach flags and program name. - - If **effective** is specified retrieve effective programs that - will execute for events within a cgroup. This includes - inherited along with attached ones. - - **bpftool cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] - Attach program *PROG* to the cgroup *CGROUP* with attach type - *ATTACH_TYPE* and optional *ATTACH_FLAGS*. - - *ATTACH_FLAGS* can be one of: **override** if a sub-cgroup installs - some bpf program, the program in this cgroup yields to sub-cgroup - program; **multi** if a sub-cgroup installs some bpf program, - that cgroup program gets run in addition to the program in this - cgroup. - - Only one program is allowed to be attached to a cgroup with - no attach flags or the **override** flag. Attaching another - program will release old program and attach the new one. - - Multiple programs are allowed to be attached to a cgroup with - **multi**. They are executed in FIFO order (those that were - attached first, run first). - - Non-default *ATTACH_FLAGS* are supported by kernel version 4.14 - and later. 
- - *ATTACH_TYPE* can be on of: - **ingress** ingress path of the inet socket (since 4.10); - **egress** egress path of the inet socket (since 4.10); - **sock_create** opening of an inet socket (since 4.10); - **sock_ops** various socket operations (since 4.12); - **device** device access (since 4.15); - **bind4** call to bind(2) for an inet4 socket (since 4.17); - **bind6** call to bind(2) for an inet6 socket (since 4.17); - **post_bind4** return from bind(2) for an inet4 socket (since 4.17); - **post_bind6** return from bind(2) for an inet6 socket (since 4.17); - **connect4** call to connect(2) for an inet4 socket (since 4.17); - **connect6** call to connect(2) for an inet6 socket (since 4.17); - **connect_unix** call to connect(2) for a unix socket (since 6.7); - **sendmsg4** call to sendto(2), sendmsg(2), sendmmsg(2) for an - unconnected udp4 socket (since 4.18); - **sendmsg6** call to sendto(2), sendmsg(2), sendmmsg(2) for an - unconnected udp6 socket (since 4.18); - **sendmsg_unix** call to sendto(2), sendmsg(2), sendmmsg(2) for - an unconnected unix socket (since 6.7); - **recvmsg4** call to recvfrom(2), recvmsg(2), recvmmsg(2) for - an unconnected udp4 socket (since 5.2); - **recvmsg6** call to recvfrom(2), recvmsg(2), recvmmsg(2) for - an unconnected udp6 socket (since 5.2); - **recvmsg_unix** call to recvfrom(2), recvmsg(2), recvmmsg(2) for - an unconnected unix socket (since 6.7); - **sysctl** sysctl access (since 5.2); - **getsockopt** call to getsockopt (since 5.3); - **setsockopt** call to setsockopt (since 5.3); - **getpeername4** call to getpeername(2) for an inet4 socket (since 5.8); - **getpeername6** call to getpeername(2) for an inet6 socket (since 5.8); - **getpeername_unix** call to getpeername(2) for a unix socket (since 6.7); - **getsockname4** call to getsockname(2) for an inet4 socket (since 5.8); - **getsockname6** call to getsockname(2) for an inet6 socket (since 5.8). 
- **getsockname_unix** call to getsockname(2) for a unix socket (since 6.7); - **sock_release** closing an userspace inet socket (since 5.9). - - **bpftool cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* - Detach *PROG* from the cgroup *CGROUP* and attach type - *ATTACH_TYPE*. - - **bpftool prog help** - Print short help message. +bpftool cgroup { show | list } *CGROUP* [effective] + List all programs attached to the cgroup *CGROUP*. + + Output will start with program ID followed by attach type, attach flags and + program name. + + If **effective** is specified retrieve effective programs that will execute + for events within a cgroup. This includes inherited along with attached + ones. + +bpftool cgroup tree [*CGROUP_ROOT*] [effective] + Iterate over all cgroups in *CGROUP_ROOT* and list all attached programs. + If *CGROUP_ROOT* is not specified, bpftool uses cgroup v2 mountpoint. + + The output is similar to the output of cgroup show/list commands: it starts + with absolute cgroup path, followed by program ID, attach type, attach + flags and program name. + + If **effective** is specified retrieve effective programs that will execute + for events within a cgroup. This includes inherited along with attached + ones. + +bpftool cgroup attach *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] + Attach program *PROG* to the cgroup *CGROUP* with attach type *ATTACH_TYPE* + and optional *ATTACH_FLAGS*. + + *ATTACH_FLAGS* can be one of: **override** if a sub-cgroup installs some + bpf program, the program in this cgroup yields to sub-cgroup program; + **multi** if a sub-cgroup installs some bpf program, that cgroup program + gets run in addition to the program in this cgroup. + + Only one program is allowed to be attached to a cgroup with no attach flags + or the **override** flag. Attaching another program will release old + program and attach the new one. + + Multiple programs are allowed to be attached to a cgroup with **multi**. 
+ They are executed in FIFO order (those that were attached first, run + first). + + Non-default *ATTACH_FLAGS* are supported by kernel version 4.14 and later. + + *ATTACH_TYPE* can be one of: + + - **ingress** ingress path of the inet socket (since 4.10) + - **egress** egress path of the inet socket (since 4.10) + - **sock_create** opening of an inet socket (since 4.10) + - **sock_ops** various socket operations (since 4.12) + - **device** device access (since 4.15) + - **bind4** call to bind(2) for an inet4 socket (since 4.17) + - **bind6** call to bind(2) for an inet6 socket (since 4.17) + - **post_bind4** return from bind(2) for an inet4 socket (since 4.17) + - **post_bind6** return from bind(2) for an inet6 socket (since 4.17) + - **connect4** call to connect(2) for an inet4 socket (since 4.17) + - **connect6** call to connect(2) for an inet6 socket (since 4.17) + - **connect_unix** call to connect(2) for a unix socket (since 6.7) + - **sendmsg4** call to sendto(2), sendmsg(2), sendmmsg(2) for an unconnected udp4 socket (since 4.18) + - **sendmsg6** call to sendto(2), sendmsg(2), sendmmsg(2) for an unconnected udp6 socket (since 4.18) + - **sendmsg_unix** call to sendto(2), sendmsg(2), sendmmsg(2) for an unconnected unix socket (since 6.7) + - **recvmsg4** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an unconnected udp4 socket (since 5.2) + - **recvmsg6** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an unconnected udp6 socket (since 5.2) + - **recvmsg_unix** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an unconnected unix socket (since 6.7) + - **sysctl** sysctl access (since 5.2) + - **getsockopt** call to getsockopt (since 5.3) + - **setsockopt** call to setsockopt (since 5.3) + - **getpeername4** call to getpeername(2) for an inet4 socket (since 5.8) + - **getpeername6** call to getpeername(2) for an inet6 socket (since 5.8) + - **getpeername_unix** call to getpeername(2) for a unix socket (since 6.7) + - **getsockname4** call to getsockname(2) 
for an inet4 socket (since 5.8) + - **getsockname6** call to getsockname(2) for an inet6 socket (since 5.8) + - **getsockname_unix** call to getsockname(2) for a unix socket (since 6.7) + - **sock_release** closing a userspace inet socket (since 5.9) + +bpftool cgroup detach *CGROUP* *ATTACH_TYPE* *PROG* + Detach *PROG* from the cgroup *CGROUP* and attach type *ATTACH_TYPE*. + +bpftool cgroup help + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst - -f, --bpffs - Show file names of pinned programs. +-f, --bpffs + Show file names of pinned programs. EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst index e44039f89b..c7f837898b 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst @@ -14,77 +14,70 @@ tool for inspection of eBPF-related parameters for Linux kernel or net device SYNOPSIS ======== - **bpftool** [*OPTIONS*] **feature** *COMMAND* +**bpftool** [*OPTIONS*] **feature** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| } +*OPTIONS* := { |COMMON_OPTIONS| } - *COMMANDS* := { **probe** | **help** } +*COMMANDS* := { **probe** | **help** } FEATURE COMMANDS ================ -| **bpftool** **feature probe** [*COMPONENT*] [**full**] [**unprivileged**] [**macros** [**prefix** *PREFIX*]] -| **bpftool** **feature list_builtins** *GROUP* -| **bpftool** **feature help** +| **bpftool** **feature probe** [*COMPONENT*] [**full**] [**unprivileged**] [**macros** [**prefix** *PREFIX*]] +| **bpftool** **feature list_builtins** *GROUP* +| **bpftool** **feature help** | -| *COMPONENT* := { **kernel** | **dev** *NAME* } -| *GROUP* := { **prog_types** | **map_types** | **attach_types** | **link_types** | **helpers** } +| *COMPONENT* := { **kernel** | **dev** *NAME* } +| *GROUP* := { **prog_types** | **map_types** | **attach_types** | **link_types** | **helpers** } DESCRIPTION
=========== - **bpftool feature probe** [**kernel**] [**full**] [**macros** [**prefix** *PREFIX*]] - Probe the running kernel and dump a number of eBPF-related - parameters, such as availability of the **bpf**\ () system call, - JIT status, eBPF program types availability, eBPF helper - functions availability, and more. - - By default, bpftool **does not run probes** for - **bpf_probe_write_user**\ () and **bpf_trace_printk**\() - helpers which print warnings to kernel logs. To enable them - and run all probes, the **full** keyword should be used. - - If the **macros** keyword (but not the **-j** option) is - passed, a subset of the output is dumped as a list of - **#define** macros that are ready to be included in a C - header file, for example. If, additionally, **prefix** is - used to define a *PREFIX*, the provided string will be used - as a prefix to the names of the macros: this can be used to - avoid conflicts on macro names when including the output of - this command as a header file. - - Keyword **kernel** can be omitted. If no probe target is - specified, probing the kernel is the default behaviour. - - When the **unprivileged** keyword is used, bpftool will dump - only the features available to a user who does not have the - **CAP_SYS_ADMIN** capability set. The features available in - that case usually represent a small subset of the parameters - supported by the system. Unprivileged users MUST use the - **unprivileged** keyword: This is to avoid misdetection if - bpftool is inadvertently run as non-root, for example. This - keyword is unavailable if bpftool was compiled without - libcap. - - **bpftool feature probe dev** *NAME* [**full**] [**macros** [**prefix** *PREFIX*]] - Probe network device for supported eBPF features and dump - results to the console. - - The keywords **full**, **macros** and **prefix** have the - same role as when probing the kernel. - - **bpftool feature list_builtins** *GROUP* - List items known to bpftool. 
These can be BPF program types - (**prog_types**), BPF map types (**map_types**), attach types - (**attach_types**), link types (**link_types**), or BPF helper - functions (**helpers**). The command does not probe the system, but - simply lists the elements that bpftool knows from compilation time, - as provided from libbpf (for all object types) or from the BPF UAPI - header (list of helpers). This can be used in scripts to iterate over - BPF types or helpers. - - **bpftool feature help** - Print short help message. +bpftool feature probe [kernel] [full] [macros [prefix *PREFIX*]] + Probe the running kernel and dump a number of eBPF-related parameters, such + as availability of the **bpf**\ () system call, JIT status, eBPF program + types availability, eBPF helper functions availability, and more. + + By default, bpftool **does not run probes** for **bpf_probe_write_user**\ + () and **bpf_trace_printk**\() helpers which print warnings to kernel logs. + To enable them and run all probes, the **full** keyword should be used. + + If the **macros** keyword (but not the **-j** option) is passed, a subset + of the output is dumped as a list of **#define** macros that are ready to + be included in a C header file, for example. If, additionally, **prefix** + is used to define a *PREFIX*, the provided string will be used as a prefix + to the names of the macros: this can be used to avoid conflicts on macro + names when including the output of this command as a header file. + + Keyword **kernel** can be omitted. If no probe target is specified, probing + the kernel is the default behaviour. + + When the **unprivileged** keyword is used, bpftool will dump only the + features available to a user who does not have the **CAP_SYS_ADMIN** + capability set. The features available in that case usually represent a + small subset of the parameters supported by the system. 
Unprivileged users + MUST use the **unprivileged** keyword: This is to avoid misdetection if + bpftool is inadvertently run as non-root, for example. This keyword is + unavailable if bpftool was compiled without libcap. + +bpftool feature probe dev *NAME* [full] [macros [prefix *PREFIX*]] + Probe network device for supported eBPF features and dump results to the + console. + + The keywords **full**, **macros** and **prefix** have the same role as when + probing the kernel. + +bpftool feature list_builtins *GROUP* + List items known to bpftool. These can be BPF program types + (**prog_types**), BPF map types (**map_types**), attach types + (**attach_types**), link types (**link_types**), or BPF helper functions + (**helpers**). The command does not probe the system, but simply lists the + elements that bpftool knows from compilation time, as provided from libbpf + (for all object types) or from the BPF UAPI header (list of helpers). This + can be used in scripts to iterate over BPF types or helpers. + +bpftool feature help + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index 5e60825818..c768e6d4ae 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -14,199 +14,177 @@ tool for BPF code-generation SYNOPSIS ======== - **bpftool** [*OPTIONS*] **gen** *COMMAND* +**bpftool** [*OPTIONS*] **gen** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | { **-L** | **--use-loader** } } +*OPTIONS* := { |COMMON_OPTIONS| | { **-L** | **--use-loader** } } - *COMMAND* := { **object** | **skeleton** | **help** } +*COMMAND* := { **object** | **skeleton** | **help** } GEN COMMANDS ============= -| **bpftool** **gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] 
-| **bpftool** **gen skeleton** *FILE* [**name** *OBJECT_NAME*] -| **bpftool** **gen subskeleton** *FILE* [**name** *OBJECT_NAME*] -| **bpftool** **gen min_core_btf** *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...] -| **bpftool** **gen help** +| **bpftool** **gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] +| **bpftool** **gen skeleton** *FILE* [**name** *OBJECT_NAME*] +| **bpftool** **gen subskeleton** *FILE* [**name** *OBJECT_NAME*] +| **bpftool** **gen min_core_btf** *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...] +| **bpftool** **gen help** DESCRIPTION =========== - **bpftool gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] - Statically link (combine) together one or more *INPUT_FILE*'s - into a single resulting *OUTPUT_FILE*. All the files involved - are BPF ELF object files. - - The rules of BPF static linking are mostly the same as for - user-space object files, but in addition to combining data - and instruction sections, .BTF and .BTF.ext (if present in - any of the input files) data are combined together. .BTF - data is deduplicated, so all the common types across - *INPUT_FILE*'s will only be represented once in the resulting - BTF information. - - BPF static linking allows to partition BPF source code into - individually compiled files that are then linked into - a single resulting BPF object file, which can be used to - generated BPF skeleton (with **gen skeleton** command) or - passed directly into **libbpf** (using **bpf_object__open()** - family of APIs). - - **bpftool gen skeleton** *FILE* - Generate BPF skeleton C header file for a given *FILE*. - - BPF skeleton is an alternative interface to existing libbpf - APIs for working with BPF objects. Skeleton code is intended - to significantly shorten and simplify code to load and work - with BPF programs from userspace side. Generated code is - tailored to specific input BPF object *FILE*, reflecting its - structure by listing out available maps, program, variables, - etc. 
Skeleton eliminates the need to lookup mentioned - components by name. Instead, if skeleton instantiation - succeeds, they are populated in skeleton structure as valid - libbpf types (e.g., **struct bpf_map** pointer) and can be - passed to existing generic libbpf APIs. - - In addition to simple and reliable access to maps and - programs, skeleton provides a storage for BPF links (**struct - bpf_link**) for each BPF program within BPF object. When - requested, supported BPF programs will be automatically - attached and resulting BPF links stored for further use by - user in pre-allocated fields in skeleton struct. For BPF - programs that can't be automatically attached by libbpf, - user can attach them manually, but store resulting BPF link - in per-program link field. All such set up links will be - automatically destroyed on BPF skeleton destruction. This - eliminates the need for users to manage links manually and - rely on libbpf support to detach programs and free up - resources. - - Another facility provided by BPF skeleton is an interface to - global variables of all supported kinds: mutable, read-only, - as well as extern ones. This interface allows to pre-setup - initial values of variables before BPF object is loaded and - verified by kernel. For non-read-only variables, the same - interface can be used to fetch values of global variables on - userspace side, even if they are modified by BPF code. - - During skeleton generation, contents of source BPF object - *FILE* is embedded within generated code and is thus not - necessary to keep around. This ensures skeleton and BPF - object file are matching 1-to-1 and always stay in sync. - Generated code is dual-licensed under LGPL-2.1 and - BSD-2-Clause licenses. - - It is a design goal and guarantee that skeleton interfaces - are interoperable with generic libbpf APIs. 
User should - always be able to use skeleton API to create and load BPF - object, and later use libbpf APIs to keep working with - specific maps, programs, etc. - - As part of skeleton, few custom functions are generated. - Each of them is prefixed with object name. Object name can - either be derived from object file name, i.e., if BPF object - file name is **example.o**, BPF object name will be - **example**. Object name can be also specified explicitly - through **name** *OBJECT_NAME* parameter. The following - custom functions are provided (assuming **example** as - the object name): - - - **example__open** and **example__open_opts**. - These functions are used to instantiate skeleton. It - corresponds to libbpf's **bpf_object__open**\ () API. - **_opts** variants accepts extra **bpf_object_open_opts** - options. - - - **example__load**. - This function creates maps, loads and verifies BPF - programs, initializes global data maps. It corresponds to - libppf's **bpf_object__load**\ () API. - - - **example__open_and_load** combines **example__open** and - **example__load** invocations in one commonly used - operation. - - - **example__attach** and **example__detach** - This pair of functions allow to attach and detach, - correspondingly, already loaded BPF object. Only BPF - programs of types supported by libbpf for auto-attachment - will be auto-attached and their corresponding BPF links - instantiated. For other BPF programs, user can manually - create a BPF link and assign it to corresponding fields in - skeleton struct. **example__detach** will detach both - links created automatically, as well as those populated by - user manually. - - - **example__destroy** - Detach and unload BPF programs, free up all the resources - used by skeleton and BPF object. - - If BPF object has global variables, corresponding structs - with memory layout corresponding to global data data section - layout will be created. 
Currently supported ones are: *.data*, - *.bss*, *.rodata*, and *.kconfig* structs/data sections. - These data sections/structs can be used to set up initial - values of variables, if set before **example__load**. - Afterwards, if target kernel supports memory-mapped BPF - arrays, same structs can be used to fetch and update - (non-read-only) data from userspace, with same simplicity - as for BPF side. - - **bpftool gen subskeleton** *FILE* - Generate BPF subskeleton C header file for a given *FILE*. - - Subskeletons are similar to skeletons, except they do not own - the corresponding maps, programs, or global variables. They - require that the object file used to generate them is already - loaded into a *bpf_object* by some other means. - - This functionality is useful when a library is included into a - larger BPF program. A subskeleton for the library would have - access to all objects and globals defined in it, without - having to know about the larger program. - - Consequently, there are only two functions defined - for subskeletons: - - - **example__open(bpf_object\*)** - Instantiates a subskeleton from an already opened (but not - necessarily loaded) **bpf_object**. - - - **example__destroy()** - Frees the storage for the subskeleton but *does not* unload - any BPF programs or maps. - - **bpftool** **gen min_core_btf** *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...] - Generate a minimum BTF file as *OUTPUT*, derived from a given - *INPUT* BTF file, containing all needed BTF types so one, or - more, given eBPF objects CO-RE relocations may be satisfied. - - When kernels aren't compiled with CONFIG_DEBUG_INFO_BTF, - libbpf, when loading an eBPF object, has to rely on external - BTF files to be able to calculate CO-RE relocations. - - Usually, an external BTF file is built from existing kernel - DWARF data using pahole. It contains all the types used by - its respective kernel image and, because of that, is big. 
- - The min_core_btf feature builds smaller BTF files, customized - to one or multiple eBPF objects, so they can be distributed - together with an eBPF CO-RE based application, turning the - application portable to different kernel versions. - - Check examples bellow for more information how to use it. - - **bpftool gen help** - Print short help message. +bpftool gen object *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] + Statically link (combine) together one or more *INPUT_FILE*'s into a single + resulting *OUTPUT_FILE*. All the files involved are BPF ELF object files. + + The rules of BPF static linking are mostly the same as for user-space + object files, but in addition to combining data and instruction sections, + .BTF and .BTF.ext (if present in any of the input files) data are combined + together. .BTF data is deduplicated, so all the common types across + *INPUT_FILE*'s will only be represented once in the resulting BTF + information. + + BPF static linking allows to partition BPF source code into individually + compiled files that are then linked into a single resulting BPF object + file, which can be used to generated BPF skeleton (with **gen skeleton** + command) or passed directly into **libbpf** (using **bpf_object__open()** + family of APIs). + +bpftool gen skeleton *FILE* + Generate BPF skeleton C header file for a given *FILE*. + + BPF skeleton is an alternative interface to existing libbpf APIs for + working with BPF objects. Skeleton code is intended to significantly + shorten and simplify code to load and work with BPF programs from userspace + side. Generated code is tailored to specific input BPF object *FILE*, + reflecting its structure by listing out available maps, program, variables, + etc. Skeleton eliminates the need to lookup mentioned components by name. 
+ Instead, if skeleton instantiation succeeds, they are populated in skeleton + structure as valid libbpf types (e.g., **struct bpf_map** pointer) and can + be passed to existing generic libbpf APIs. + + In addition to simple and reliable access to maps and programs, skeleton + provides a storage for BPF links (**struct bpf_link**) for each BPF program + within BPF object. When requested, supported BPF programs will be + automatically attached and resulting BPF links stored for further use by + user in pre-allocated fields in skeleton struct. For BPF programs that + can't be automatically attached by libbpf, user can attach them manually, + but store resulting BPF link in per-program link field. All such set up + links will be automatically destroyed on BPF skeleton destruction. This + eliminates the need for users to manage links manually and rely on libbpf + support to detach programs and free up resources. + + Another facility provided by BPF skeleton is an interface to global + variables of all supported kinds: mutable, read-only, as well as extern + ones. This interface allows to pre-setup initial values of variables before + BPF object is loaded and verified by kernel. For non-read-only variables, + the same interface can be used to fetch values of global variables on + userspace side, even if they are modified by BPF code. + + During skeleton generation, contents of source BPF object *FILE* is + embedded within generated code and is thus not necessary to keep around. + This ensures skeleton and BPF object file are matching 1-to-1 and always + stay in sync. Generated code is dual-licensed under LGPL-2.1 and + BSD-2-Clause licenses. + + It is a design goal and guarantee that skeleton interfaces are + interoperable with generic libbpf APIs. User should always be able to use + skeleton API to create and load BPF object, and later use libbpf APIs to + keep working with specific maps, programs, etc. + + As part of skeleton, few custom functions are generated. 
Each of them is + prefixed with object name. Object name can either be derived from object + file name, i.e., if BPF object file name is **example.o**, BPF object name + will be **example**. Object name can be also specified explicitly through + **name** *OBJECT_NAME* parameter. The following custom functions are + provided (assuming **example** as the object name): + + - **example__open** and **example__open_opts**. + These functions are used to instantiate skeleton. It corresponds to + libbpf's **bpf_object__open**\ () API. **_opts** variants accepts extra + **bpf_object_open_opts** options. + + - **example__load**. + This function creates maps, loads and verifies BPF programs, initializes + global data maps. It corresponds to libbpf's **bpf_object__load**\ () + API. + + - **example__open_and_load** combines **example__open** and + **example__load** invocations in one commonly used operation. + + - **example__attach** and **example__detach**. + This pair of functions allow to attach and detach, correspondingly, + already loaded BPF object. Only BPF programs of types supported by libbpf + for auto-attachment will be auto-attached and their corresponding BPF + links instantiated. For other BPF programs, user can manually create a + BPF link and assign it to corresponding fields in skeleton struct. + **example__detach** will detach both links created automatically, as well + as those populated by user manually. + + - **example__destroy**. + Detach and unload BPF programs, free up all the resources used by + skeleton and BPF object. + + If BPF object has global variables, corresponding structs with memory + layout corresponding to global data section layout will be created. + Currently supported ones are: *.data*, *.bss*, *.rodata*, and *.kconfig* + structs/data sections. These data sections/structs can be used to set up + initial values of variables, if set before **example__load**.
Afterwards, + if target kernel supports memory-mapped BPF arrays, same structs can be + used to fetch and update (non-read-only) data from userspace, with same + simplicity as for BPF side. + +bpftool gen subskeleton *FILE* + Generate BPF subskeleton C header file for a given *FILE*. + + Subskeletons are similar to skeletons, except they do not own the + corresponding maps, programs, or global variables. They require that the + object file used to generate them is already loaded into a *bpf_object* by + some other means. + + This functionality is useful when a library is included into a larger BPF + program. A subskeleton for the library would have access to all objects and + globals defined in it, without having to know about the larger program. + + Consequently, there are only two functions defined for subskeletons: + + - **example__open(bpf_object\*)**. + Instantiates a subskeleton from an already opened (but not necessarily + loaded) **bpf_object**. + + - **example__destroy()**. + Frees the storage for the subskeleton but *does not* unload any BPF + programs or maps. + +bpftool gen min_core_btf *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...] + Generate a minimum BTF file as *OUTPUT*, derived from a given *INPUT* BTF + file, containing all needed BTF types so one, or more, given eBPF objects + CO-RE relocations may be satisfied. + + When kernels aren't compiled with CONFIG_DEBUG_INFO_BTF, libbpf, when + loading an eBPF object, has to rely on external BTF files to be able to + calculate CO-RE relocations. + + Usually, an external BTF file is built from existing kernel DWARF data + using pahole. It contains all the types used by its respective kernel image + and, because of that, is big. + + The min_core_btf feature builds smaller BTF files, customized to one or + multiple eBPF objects, so they can be distributed together with an eBPF + CO-RE based application, turning the application portable to different + kernel versions. 
+ + Check examples below for more information on how to use it. + +bpftool gen help + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst - -L, --use-loader - For skeletons, generate a "light" skeleton (also known as "loader" - skeleton). A light skeleton contains a loader eBPF program. It does - not use the majority of the libbpf infrastructure, and does not need - libelf. +-L, --use-loader + For skeletons, generate a "light" skeleton (also known as "loader" + skeleton). A light skeleton contains a loader eBPF program. It does not use + the majority of the libbpf infrastructure, and does not need libelf. EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-iter.rst b/tools/bpf/bpftool/Documentation/bpftool-iter.rst index 84839d4886..2e5d81c906 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-iter.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-iter.rst @@ -14,50 +14,46 @@ tool to create BPF iterators SYNOPSIS ======== - **bpftool** [*OPTIONS*] **iter** *COMMAND* +**bpftool** [*OPTIONS*] **iter** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| } +*OPTIONS* := { |COMMON_OPTIONS| } - *COMMANDS* := { **pin** | **help** } +*COMMANDS* := { **pin** | **help** } ITER COMMANDS -=================== +============= -| **bpftool** **iter pin** *OBJ* *PATH* [**map** *MAP*] -| **bpftool** **iter help** +| **bpftool** **iter pin** *OBJ* *PATH* [**map** *MAP*] +| **bpftool** **iter help** | -| *OBJ* := /a/file/of/bpf_iter_target.o -| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } +| *OBJ* := /a/file/of/bpf_iter_target.o +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } DESCRIPTION =========== - **bpftool iter pin** *OBJ* *PATH* [**map** *MAP*] - A bpf iterator combines a kernel iterating of - particular kernel data (e.g., tasks, bpf_maps, etc.) - and a bpf program called for each kernel data object - (e.g., one task, one bpf_map, etc.).
User space can - *read* kernel iterator output through *read()* syscall. - - The *pin* command creates a bpf iterator from *OBJ*, - and pin it to *PATH*. The *PATH* should be located - in *bpffs* mount. It must not contain a dot - character ('.'), which is reserved for future extensions - of *bpffs*. - - Map element bpf iterator requires an additional parameter - *MAP* so bpf program can iterate over map elements for - that map. User can have a bpf program in kernel to run - with each map element, do checking, filtering, aggregation, - etc. without copying data to user space. - - User can then *cat PATH* to see the bpf iterator output. - - **bpftool iter help** - Print short help message. +bpftool iter pin *OBJ* *PATH* [map *MAP*] + A bpf iterator combines a kernel iterating of particular kernel data (e.g., + tasks, bpf_maps, etc.) and a bpf program called for each kernel data object + (e.g., one task, one bpf_map, etc.). User space can *read* kernel iterator + output through *read()* syscall. + + The *pin* command creates a bpf iterator from *OBJ*, and pin it to *PATH*. + The *PATH* should be located in *bpffs* mount. It must not contain a dot + character ('.'), which is reserved for future extensions of *bpffs*. + + Map element bpf iterator requires an additional parameter *MAP* so bpf + program can iterate over map elements for that map. User can have a bpf + program in kernel to run with each map element, do checking, filtering, + aggregation, etc. without copying data to user space. + + User can then *cat PATH* to see the bpf iterator output. + +bpftool iter help + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. 
include:: common_options.rst EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-link.rst b/tools/bpf/bpftool/Documentation/bpftool-link.rst index 52a4eee4af..6f09d4405e 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-link.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-link.rst @@ -14,67 +14,62 @@ tool for inspection and simple manipulation of eBPF links SYNOPSIS ======== - **bpftool** [*OPTIONS*] **link** *COMMAND* +**bpftool** [*OPTIONS*] **link** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } } +*OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } } - *COMMANDS* := { **show** | **list** | **pin** | **help** } +*COMMANDS* := { **show** | **list** | **pin** | **help** } LINK COMMANDS ============= -| **bpftool** **link { show | list }** [*LINK*] -| **bpftool** **link pin** *LINK* *FILE* -| **bpftool** **link detach** *LINK* -| **bpftool** **link help** +| **bpftool** **link { show | list }** [*LINK*] +| **bpftool** **link pin** *LINK* *FILE* +| **bpftool** **link detach** *LINK* +| **bpftool** **link help** | -| *LINK* := { **id** *LINK_ID* | **pinned** *FILE* } +| *LINK* := { **id** *LINK_ID* | **pinned** *FILE* } DESCRIPTION =========== - **bpftool link { show | list }** [*LINK*] - Show information about active links. If *LINK* is - specified show information only about given link, - otherwise list all links currently active on the system. +bpftool link { show | list } [*LINK*] + Show information about active links. If *LINK* is specified show + information only about given link, otherwise list all links currently + active on the system. - Output will start with link ID followed by link type and - zero or more named attributes, some of which depend on type - of link. + Output will start with link ID followed by link type and zero or more named + attributes, some of which depend on type of link. 
- Since Linux 5.8 bpftool is able to discover information about - processes that hold open file descriptors (FDs) against BPF - links. On such kernels bpftool will automatically emit this - information as well. + Since Linux 5.8 bpftool is able to discover information about processes + that hold open file descriptors (FDs) against BPF links. On such kernels + bpftool will automatically emit this information as well. - **bpftool link pin** *LINK* *FILE* - Pin link *LINK* as *FILE*. +bpftool link pin *LINK* *FILE* + Pin link *LINK* as *FILE*. - Note: *FILE* must be located in *bpffs* mount. It must not - contain a dot character ('.'), which is reserved for future - extensions of *bpffs*. + Note: *FILE* must be located in *bpffs* mount. It must not contain a dot + character ('.'), which is reserved for future extensions of *bpffs*. - **bpftool link detach** *LINK* - Force-detach link *LINK*. BPF link and its underlying BPF - program will stay valid, but they will be detached from the - respective BPF hook and BPF link will transition into - a defunct state until last open file descriptor for that - link is closed. +bpftool link detach *LINK* + Force-detach link *LINK*. BPF link and its underlying BPF program will stay + valid, but they will be detached from the respective BPF hook and BPF link + will transition into a defunct state until last open file descriptor for + that link is closed. - **bpftool link help** - Print short help message. +bpftool link help + Print short help message. OPTIONS ======= - .. include:: common_options.rst + .. include:: common_options.rst - -f, --bpffs - When showing BPF links, show file names of pinned - links. + -f, --bpffs + When showing BPF links, show file names of pinned links. - -n, --nomount - Do not automatically attempt to mount any virtual file system - (such as tracefs or BPF virtual file system) when necessary. 
+ -n, --nomount + Do not automatically attempt to mount any virtual file system (such as + tracefs or BPF virtual file system) when necessary. EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 9d6a314dfd..252e4c538e 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -14,166 +14,160 @@ tool for inspection and simple manipulation of eBPF maps SYNOPSIS ======== - **bpftool** [*OPTIONS*] **map** *COMMAND* +**bpftool** [*OPTIONS*] **map** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } } +*OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } } - *COMMANDS* := - { **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** | - **delete** | **pin** | **help** } +*COMMANDS* := +{ **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** | +**delete** | **pin** | **help** } MAP COMMANDS ============= -| **bpftool** **map** { **show** | **list** } [*MAP*] -| **bpftool** **map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* \ -| **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**inner_map** *MAP*] \ -| [**offload_dev** *NAME*] -| **bpftool** **map dump** *MAP* -| **bpftool** **map update** *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*] -| **bpftool** **map lookup** *MAP* [**key** *DATA*] -| **bpftool** **map getnext** *MAP* [**key** *DATA*] -| **bpftool** **map delete** *MAP* **key** *DATA* -| **bpftool** **map pin** *MAP* *FILE* -| **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] -| **bpftool** **map peek** *MAP* -| **bpftool** **map push** *MAP* **value** *VALUE* -| **bpftool** **map pop** *MAP* -| **bpftool** **map enqueue** *MAP* **value** *VALUE* -| **bpftool** **map dequeue** *MAP* -| **bpftool** **map freeze** *MAP* -| 
**bpftool** **map help** +| **bpftool** **map** { **show** | **list** } [*MAP*] +| **bpftool** **map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* \ +| **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**inner_map** *MAP*] \ +| [**offload_dev** *NAME*] +| **bpftool** **map dump** *MAP* +| **bpftool** **map update** *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*] +| **bpftool** **map lookup** *MAP* [**key** *DATA*] +| **bpftool** **map getnext** *MAP* [**key** *DATA*] +| **bpftool** **map delete** *MAP* **key** *DATA* +| **bpftool** **map pin** *MAP* *FILE* +| **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] +| **bpftool** **map peek** *MAP* +| **bpftool** **map push** *MAP* **value** *VALUE* +| **bpftool** **map pop** *MAP* +| **bpftool** **map enqueue** *MAP* **value** *VALUE* +| **bpftool** **map dequeue** *MAP* +| **bpftool** **map freeze** *MAP* +| **bpftool** **map help** | -| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* | **name** *MAP_NAME* } -| *DATA* := { [**hex**] *BYTES* } -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } -| *VALUE* := { *DATA* | *MAP* | *PROG* } -| *UPDATE_FLAGS* := { **any** | **exist** | **noexist** } -| *TYPE* := { **hash** | **array** | **prog_array** | **perf_event_array** | **percpu_hash** -| | **percpu_array** | **stack_trace** | **cgroup_array** | **lru_hash** -| | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps** -| | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** -| | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** -| | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** -| | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** | **arena** } +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* | **name** *MAP_NAME* } +| *DATA* := { [**hex**] *BYTES* } +| *PROG* := { 
**id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } +| *VALUE* := { *DATA* | *MAP* | *PROG* } +| *UPDATE_FLAGS* := { **any** | **exist** | **noexist** } +| *TYPE* := { **hash** | **array** | **prog_array** | **perf_event_array** | **percpu_hash** +| | **percpu_array** | **stack_trace** | **cgroup_array** | **lru_hash** +| | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps** +| | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** +| | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** +| | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** +| | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** | **arena** } DESCRIPTION =========== - **bpftool map { show | list }** [*MAP*] - Show information about loaded maps. If *MAP* is specified - show information only about given maps, otherwise list all - maps currently loaded on the system. In case of **name**, - *MAP* may match several maps which will all be shown. +bpftool map { show | list } [*MAP*] + Show information about loaded maps. If *MAP* is specified show information + only about given maps, otherwise list all maps currently loaded on the + system. In case of **name**, *MAP* may match several maps which will all + be shown. - Output will start with map ID followed by map type and - zero or more named attributes (depending on kernel version). + Output will start with map ID followed by map type and zero or more named + attributes (depending on kernel version). - Since Linux 5.8 bpftool is able to discover information about - processes that hold open file descriptors (FDs) against BPF - maps. On such kernels bpftool will automatically emit this - information as well. + Since Linux 5.8 bpftool is able to discover information about processes + that hold open file descriptors (FDs) against BPF maps. 
On such kernels + bpftool will automatically emit this information as well. - **bpftool map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**inner_map** *MAP*] [**offload_dev** *NAME*] - Create a new map with given parameters and pin it to *bpffs* - as *FILE*. +bpftool map create *FILE* type *TYPE* key *KEY_SIZE* value *VALUE_SIZE* entries *MAX_ENTRIES* name *NAME* [flags *FLAGS*] [inner_map *MAP*] [offload_dev *NAME*] + Create a new map with given parameters and pin it to *bpffs* as *FILE*. - *FLAGS* should be an integer which is the combination of - desired flags, e.g. 1024 for **BPF_F_MMAPABLE** (see bpf.h - UAPI header for existing flags). + *FLAGS* should be an integer which is the combination of desired flags, + e.g. 1024 for **BPF_F_MMAPABLE** (see bpf.h UAPI header for existing + flags). - To create maps of type array-of-maps or hash-of-maps, the - **inner_map** keyword must be used to pass an inner map. The - kernel needs it to collect metadata related to the inner maps - that the new map will work with. + To create maps of type array-of-maps or hash-of-maps, the **inner_map** + keyword must be used to pass an inner map. The kernel needs it to collect + metadata related to the inner maps that the new map will work with. - Keyword **offload_dev** expects a network interface name, - and is used to request hardware offload for the map. + Keyword **offload_dev** expects a network interface name, and is used to + request hardware offload for the map. - **bpftool map dump** *MAP* - Dump all entries in a given *MAP*. In case of **name**, - *MAP* may match several maps which will all be dumped. +bpftool map dump *MAP* + Dump all entries in a given *MAP*. In case of **name**, *MAP* may match + several maps which will all be dumped. - **bpftool map update** *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*] - Update map entry for a given *KEY*. 
+bpftool map update *MAP* [key *DATA*] [value *VALUE*] [*UPDATE_FLAGS*] + Update map entry for a given *KEY*. - *UPDATE_FLAGS* can be one of: **any** update existing entry - or add if doesn't exit; **exist** update only if entry already - exists; **noexist** update only if entry doesn't exist. + *UPDATE_FLAGS* can be one of: **any** update existing entry or add if it + doesn't exist; **exist** update only if entry already exists; **noexist** + update only if entry doesn't exist. - If the **hex** keyword is provided in front of the bytes - sequence, the bytes are parsed as hexadecimal values, even if - no "0x" prefix is added. If the keyword is not provided, then - the bytes are parsed as decimal values, unless a "0x" prefix - (for hexadecimal) or a "0" prefix (for octal) is provided. + If the **hex** keyword is provided in front of the bytes sequence, the + bytes are parsed as hexadecimal values, even if no "0x" prefix is added. If + the keyword is not provided, then the bytes are parsed as decimal values, + unless a "0x" prefix (for hexadecimal) or a "0" prefix (for octal) is + provided. - **bpftool map lookup** *MAP* [**key** *DATA*] - Lookup **key** in the map. +bpftool map lookup *MAP* [key *DATA*] + Lookup **key** in the map. - **bpftool map getnext** *MAP* [**key** *DATA*] - Get next key. If *key* is not specified, get first key. +bpftool map getnext *MAP* [key *DATA*] + Get next key. If *key* is not specified, get first key. - **bpftool map delete** *MAP* **key** *DATA* - Remove entry from the map. +bpftool map delete *MAP* key *DATA* + Remove entry from the map. - **bpftool map pin** *MAP* *FILE* - Pin map *MAP* as *FILE*. +bpftool map pin *MAP* *FILE* + Pin map *MAP* as *FILE*. - Note: *FILE* must be located in *bpffs* mount. It must not - contain a dot character ('.'), which is reserved for future - extensions of *bpffs*. + Note: *FILE* must be located in *bpffs* mount. 
It must not contain a dot + character ('.'), which is reserved for future extensions of *bpffs*. - **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] - Read events from a **BPF_MAP_TYPE_PERF_EVENT_ARRAY** map. +bpftool map event_pipe *MAP* [cpu *N* index *M*] + Read events from a **BPF_MAP_TYPE_PERF_EVENT_ARRAY** map. - Install perf rings into a perf event array map and dump - output of any **bpf_perf_event_output**\ () call in the kernel. - By default read the number of CPUs on the system and - install perf ring for each CPU in the corresponding index - in the array. + Install perf rings into a perf event array map and dump output of any + **bpf_perf_event_output**\ () call in the kernel. By default read the + number of CPUs on the system and install perf ring for each CPU in the + corresponding index in the array. - If **cpu** and **index** are specified, install perf ring - for given **cpu** at **index** in the array (single ring). + If **cpu** and **index** are specified, install perf ring for given **cpu** + at **index** in the array (single ring). - Note that installing a perf ring into an array will silently - replace any existing ring. Any other application will stop - receiving events if it installed its rings earlier. + Note that installing a perf ring into an array will silently replace any + existing ring. Any other application will stop receiving events if it + installed its rings earlier. - **bpftool map peek** *MAP* - Peek next value in the queue or stack. +bpftool map peek *MAP* + Peek next value in the queue or stack. - **bpftool map push** *MAP* **value** *VALUE* - Push *VALUE* onto the stack. +bpftool map push *MAP* value *VALUE* + Push *VALUE* onto the stack. - **bpftool map pop** *MAP* - Pop and print value from the stack. +bpftool map pop *MAP* + Pop and print value from the stack. - **bpftool map enqueue** *MAP* **value** *VALUE* - Enqueue *VALUE* into the queue. 
+bpftool map enqueue *MAP* value *VALUE* + Enqueue *VALUE* into the queue. - **bpftool map dequeue** *MAP* - Dequeue and print value from the queue. +bpftool map dequeue *MAP* + Dequeue and print value from the queue. - **bpftool map freeze** *MAP* - Freeze the map as read-only from user space. Entries from a - frozen map can not longer be updated or deleted with the - **bpf**\ () system call. This operation is not reversible, - and the map remains immutable from user space until its - destruction. However, read and write permissions for BPF - programs to the map remain unchanged. +bpftool map freeze *MAP* + Freeze the map as read-only from user space. Entries from a frozen map can + no longer be updated or deleted with the **bpf**\ () system call. This + operation is not reversible, and the map remains immutable from user space + until its destruction. However, read and write permissions for BPF programs + to the map remain unchanged. - **bpftool map help** - Print short help message. +bpftool map help + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst - -f, --bpffs - Show file names of pinned maps. +-f, --bpffs + Show file names of pinned maps. - -n, --nomount - Do not automatically attempt to mount any virtual file system - (such as tracefs or BPF virtual file system) when necessary. +-n, --nomount + Do not automatically attempt to mount any virtual file system (such as + tracefs or BPF virtual file system) when necessary. 
EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index dd3f946976..3488128812 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -14,76 +14,74 @@ tool for inspection of networking related bpf prog attachments SYNOPSIS ======== - **bpftool** [*OPTIONS*] **net** *COMMAND* +**bpftool** [*OPTIONS*] **net** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| } +*OPTIONS* := { |COMMON_OPTIONS| } - *COMMANDS* := - { **show** | **list** | **attach** | **detach** | **help** } +*COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** } NET COMMANDS ============ -| **bpftool** **net** { **show** | **list** } [ **dev** *NAME* ] -| **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ] -| **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* -| **bpftool** **net help** +| **bpftool** **net** { **show** | **list** } [ **dev** *NAME* ] +| **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ] +| **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* +| **bpftool** **net help** | -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } -| *ATTACH_TYPE* := { **xdp** | **xdpgeneric** | **xdpdrv** | **xdpoffload** } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } +| *ATTACH_TYPE* := { **xdp** | **xdpgeneric** | **xdpdrv** | **xdpoffload** } DESCRIPTION =========== - **bpftool net { show | list }** [ **dev** *NAME* ] - List bpf program attachments in the kernel networking subsystem. 
- - Currently, device driver xdp attachments, tcx, netkit and old-style tc - classifier/action attachments, flow_dissector as well as netfilter - attachments are implemented, i.e., for - program types **BPF_PROG_TYPE_XDP**, **BPF_PROG_TYPE_SCHED_CLS**, - **BPF_PROG_TYPE_SCHED_ACT**, **BPF_PROG_TYPE_FLOW_DISSECTOR**, - **BPF_PROG_TYPE_NETFILTER**. - - For programs attached to a particular cgroup, e.g., - **BPF_PROG_TYPE_CGROUP_SKB**, **BPF_PROG_TYPE_CGROUP_SOCK**, - **BPF_PROG_TYPE_SOCK_OPS** and **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, - users can use **bpftool cgroup** to dump cgroup attachments. - For sk_{filter, skb, msg, reuseport} and lwt/seg6 - bpf programs, users should consult other tools, e.g., iproute2. - - The current output will start with all xdp program attachments, followed by - all tcx, netkit, then tc class/qdisc bpf program attachments, then flow_dissector - and finally netfilter programs. Both xdp programs and tcx/netkit/tc programs are - ordered based on ifindex number. If multiple bpf programs attached - to the same networking device through **tc**, the order will be first - all bpf programs attached to tcx, netkit, then tc classes, then all bpf programs - attached to non clsact qdiscs, and finally all bpf programs attached - to root and clsact qdisc. - - **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ] - Attach bpf program *PROG* to network interface *NAME* with - type specified by *ATTACH_TYPE*. Previously attached bpf program - can be replaced by the command used with **overwrite** option. - Currently, only XDP-related modes are supported for *ATTACH_TYPE*. - - *ATTACH_TYPE* can be of: - **xdp** - try native XDP and fallback to generic XDP if NIC driver does not support it; - **xdpgeneric** - Generic XDP. runs at generic XDP hook when packet already enters receive path as skb; - **xdpdrv** - Native XDP. runs earliest point in driver's receive path; - **xdpoffload** - Offload XDP. 
runs directly on NIC on each packet reception; - - **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* - Detach bpf program attached to network interface *NAME* with - type specified by *ATTACH_TYPE*. To detach bpf program, same - *ATTACH_TYPE* previously used for attach must be specified. - Currently, only XDP-related modes are supported for *ATTACH_TYPE*. - - **bpftool net help** - Print short help message. +bpftool net { show | list } [ dev *NAME* ] + List bpf program attachments in the kernel networking subsystem. + + Currently, device driver xdp attachments, tcx, netkit and old-style tc + classifier/action attachments, flow_dissector as well as netfilter + attachments are implemented, i.e., for program types **BPF_PROG_TYPE_XDP**, + **BPF_PROG_TYPE_SCHED_CLS**, **BPF_PROG_TYPE_SCHED_ACT**, + **BPF_PROG_TYPE_FLOW_DISSECTOR**, **BPF_PROG_TYPE_NETFILTER**. + + For programs attached to a particular cgroup, e.g., + **BPF_PROG_TYPE_CGROUP_SKB**, **BPF_PROG_TYPE_CGROUP_SOCK**, + **BPF_PROG_TYPE_SOCK_OPS** and **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, users + can use **bpftool cgroup** to dump cgroup attachments. For sk_{filter, skb, + msg, reuseport} and lwt/seg6 bpf programs, users should consult other + tools, e.g., iproute2. + + The current output will start with all xdp program attachments, followed by + all tcx, netkit, then tc class/qdisc bpf program attachments, then + flow_dissector and finally netfilter programs. Both xdp programs and + tcx/netkit/tc programs are ordered based on ifindex number. If multiple bpf + programs attached to the same networking device through **tc**, the order + will be first all bpf programs attached to tcx, netkit, then tc classes, + then all bpf programs attached to non clsact qdiscs, and finally all bpf + programs attached to root and clsact qdisc. + +bpftool net attach *ATTACH_TYPE* *PROG* dev *NAME* [ overwrite ] + Attach bpf program *PROG* to network interface *NAME* with type specified + by *ATTACH_TYPE*. 
Previously attached bpf program can be replaced by the + command used with **overwrite** option. Currently, only XDP-related modes + are supported for *ATTACH_TYPE*. + + *ATTACH_TYPE* can be of: + **xdp** - try native XDP and fallback to generic XDP if NIC driver does not support it; + **xdpgeneric** - Generic XDP. runs at generic XDP hook when packet already enters receive path as skb; + **xdpdrv** - Native XDP. runs earliest point in driver's receive path; + **xdpoffload** - Offload XDP. runs directly on NIC on each packet reception; + +bpftool net detach *ATTACH_TYPE* dev *NAME* + Detach bpf program attached to network interface *NAME* with type specified + by *ATTACH_TYPE*. To detach bpf program, same *ATTACH_TYPE* previously used + for attach must be specified. Currently, only XDP-related modes are + supported for *ATTACH_TYPE*. + +bpftool net help + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst index 5fea633a82..8c1ae55be5 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-perf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst @@ -14,37 +14,37 @@ tool for inspection of perf related bpf prog attachments SYNOPSIS ======== - **bpftool** [*OPTIONS*] **perf** *COMMAND* +**bpftool** [*OPTIONS*] **perf** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| } +*OPTIONS* := { |COMMON_OPTIONS| } - *COMMANDS* := - { **show** | **list** | **help** } +*COMMANDS* := +{ **show** | **list** | **help** } PERF COMMANDS ============= -| **bpftool** **perf** { **show** | **list** } -| **bpftool** **perf help** +| **bpftool** **perf** { **show** | **list** } +| **bpftool** **perf help** DESCRIPTION =========== - **bpftool perf { show | list }** - List all raw_tracepoint, tracepoint, kprobe attachment in the system. 
+bpftool perf { show | list } + List all raw_tracepoint, tracepoint, kprobe attachments in the system. - Output will start with process id and file descriptor in that process, - followed by bpf program id, attachment information, and attachment point. - The attachment point for raw_tracepoint/tracepoint is the trace probe name. - The attachment point for k[ret]probe is either symbol name and offset, - or a kernel virtual address. - The attachment point for u[ret]probe is the file name and the file offset. + Output will start with process id and file descriptor in that process, + followed by bpf program id, attachment information, and attachment point. + The attachment point for raw_tracepoint/tracepoint is the trace probe name. + The attachment point for k[ret]probe is either symbol name and offset, or a + kernel virtual address. The attachment point for u[ret]probe is the file + name and the file offset. - **bpftool perf help** - Print short help message. +bpftool perf help + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. 
include:: common_options.rst EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 58e6a5b10e..d6304e01af 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -14,250 +14,226 @@ tool for inspection and simple manipulation of eBPF progs SYNOPSIS ======== - **bpftool** [*OPTIONS*] **prog** *COMMAND* +**bpftool** [*OPTIONS*] **prog** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | - { **-f** | **--bpffs** } | { **-m** | **--mapcompat** } | { **-n** | **--nomount** } | - { **-L** | **--use-loader** } } +*OPTIONS* := { |COMMON_OPTIONS| | +{ **-f** | **--bpffs** } | { **-m** | **--mapcompat** } | { **-n** | **--nomount** } | +{ **-L** | **--use-loader** } } - *COMMANDS* := - { **show** | **list** | **dump xlated** | **dump jited** | **pin** | **load** | - **loadall** | **help** } +*COMMANDS* := +{ **show** | **list** | **dump xlated** | **dump jited** | **pin** | **load** | +**loadall** | **help** } PROG COMMANDS ============= -| **bpftool** **prog** { **show** | **list** } [*PROG*] -| **bpftool** **prog dump xlated** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] [**visual**] }] -| **bpftool** **prog dump jited** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] }] -| **bpftool** **prog pin** *PROG* *FILE* -| **bpftool** **prog** { **load** | **loadall** } *OBJ* *PATH* [**type** *TYPE*] [**map** { **idx** *IDX* | **name** *NAME* } *MAP*] [{ **offload_dev** | **xdpmeta_dev** } *NAME*] [**pinmaps** *MAP_DIR*] [**autoattach**] -| **bpftool** **prog attach** *PROG* *ATTACH_TYPE* [*MAP*] -| **bpftool** **prog detach** *PROG* *ATTACH_TYPE* [*MAP*] -| **bpftool** **prog tracelog** -| **bpftool** **prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* [**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* [**ctx_size_out** *M*]]] [**repeat** *N*] -| **bpftool** **prog profile** *PROG* [**duration** 
*DURATION*] *METRICs* -| **bpftool** **prog help** +| **bpftool** **prog** { **show** | **list** } [*PROG*] +| **bpftool** **prog dump xlated** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] [**visual**] }] +| **bpftool** **prog dump jited** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] }] +| **bpftool** **prog pin** *PROG* *FILE* +| **bpftool** **prog** { **load** | **loadall** } *OBJ* *PATH* [**type** *TYPE*] [**map** { **idx** *IDX* | **name** *NAME* } *MAP*] [{ **offload_dev** | **xdpmeta_dev** } *NAME*] [**pinmaps** *MAP_DIR*] [**autoattach**] +| **bpftool** **prog attach** *PROG* *ATTACH_TYPE* [*MAP*] +| **bpftool** **prog detach** *PROG* *ATTACH_TYPE* [*MAP*] +| **bpftool** **prog tracelog** +| **bpftool** **prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* [**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* [**ctx_size_out** *M*]]] [**repeat** *N*] +| **bpftool** **prog profile** *PROG* [**duration** *DURATION*] *METRICs* +| **bpftool** **prog help** | -| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } -| *TYPE* := { -| **socket** | **kprobe** | **kretprobe** | **classifier** | **action** | -| **tracepoint** | **raw_tracepoint** | **xdp** | **perf_event** | **cgroup/skb** | -| **cgroup/sock** | **cgroup/dev** | **lwt_in** | **lwt_out** | **lwt_xmit** | -| **lwt_seg6local** | **sockops** | **sk_skb** | **sk_msg** | **lirc_mode2** | -| **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** | -| **cgroup/connect4** | **cgroup/connect6** | **cgroup/connect_unix** | -| **cgroup/getpeername4** | **cgroup/getpeername6** | **cgroup/getpeername_unix** | -| **cgroup/getsockname4** | **cgroup/getsockname6** | **cgroup/getsockname_unix** | -| **cgroup/sendmsg4** | **cgroup/sendmsg6** | **cgroup/sendmsg_unix** | -| **cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/recvmsg_unix** | **cgroup/sysctl** | -| 
**cgroup/getsockopt** | **cgroup/setsockopt** | **cgroup/sock_release** | -| **struct_ops** | **fentry** | **fexit** | **freplace** | **sk_lookup** -| } -| *ATTACH_TYPE* := { -| **sk_msg_verdict** | **sk_skb_verdict** | **sk_skb_stream_verdict** | -| **sk_skb_stream_parser** | **flow_dissector** -| } -| *METRICs* := { -| **cycles** | **instructions** | **l1d_loads** | **llc_misses** | -| **itlb_misses** | **dtlb_misses** -| } +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* | **name** *MAP_NAME* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } +| *TYPE* := { +| **socket** | **kprobe** | **kretprobe** | **classifier** | **action** | +| **tracepoint** | **raw_tracepoint** | **xdp** | **perf_event** | **cgroup/skb** | +| **cgroup/sock** | **cgroup/dev** | **lwt_in** | **lwt_out** | **lwt_xmit** | +| **lwt_seg6local** | **sockops** | **sk_skb** | **sk_msg** | **lirc_mode2** | +| **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** | +| **cgroup/connect4** | **cgroup/connect6** | **cgroup/connect_unix** | +| **cgroup/getpeername4** | **cgroup/getpeername6** | **cgroup/getpeername_unix** | +| **cgroup/getsockname4** | **cgroup/getsockname6** | **cgroup/getsockname_unix** | +| **cgroup/sendmsg4** | **cgroup/sendmsg6** | **cgroup/sendmsg_unix** | +| **cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/recvmsg_unix** | **cgroup/sysctl** | +| **cgroup/getsockopt** | **cgroup/setsockopt** | **cgroup/sock_release** | +| **struct_ops** | **fentry** | **fexit** | **freplace** | **sk_lookup** +| } +| *ATTACH_TYPE* := { +| **sk_msg_verdict** | **sk_skb_verdict** | **sk_skb_stream_verdict** | +| **sk_skb_stream_parser** | **flow_dissector** +| } +| *METRICs* := { +| **cycles** | **instructions** | **l1d_loads** | **llc_misses** | +| **itlb_misses** | **dtlb_misses** +| } DESCRIPTION =========== - **bpftool prog { show | list }** [*PROG*] - Show information about loaded programs. 
If *PROG* is - specified show information only about given programs, - otherwise list all programs currently loaded on the system. - In case of **tag** or **name**, *PROG* may match several - programs which will all be shown. - - Output will start with program ID followed by program type and - zero or more named attributes (depending on kernel version). - - Since Linux 5.1 the kernel can collect statistics on BPF - programs (such as the total time spent running the program, - and the number of times it was run). If available, bpftool - shows such statistics. However, the kernel does not collect - them by defaults, as it slightly impacts performance on each - program run. Activation or deactivation of the feature is - performed via the **kernel.bpf_stats_enabled** sysctl knob. - - Since Linux 5.8 bpftool is able to discover information about - processes that hold open file descriptors (FDs) against BPF - programs. On such kernels bpftool will automatically emit this - information as well. - - **bpftool prog dump xlated** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] [**visual**] }] - Dump eBPF instructions of the programs from the kernel. By - default, eBPF will be disassembled and printed to standard - output in human-readable format. In this case, **opcodes** - controls if raw opcodes should be printed as well. - - In case of **tag** or **name**, *PROG* may match several - programs which will all be dumped. However, if **file** or - **visual** is specified, *PROG* must match a single program. - - If **file** is specified, the binary image will instead be - written to *FILE*. - - If **visual** is specified, control flow graph (CFG) will be - built instead, and eBPF instructions will be presented with - CFG in DOT format, on standard output. - - If the programs have line_info available, the source line will - be displayed. If **linum** is specified, the filename, line - number and line column will also be displayed. 
- - **bpftool prog dump jited** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] }] - Dump jited image (host machine code) of the program. - - If *FILE* is specified image will be written to a file, - otherwise it will be disassembled and printed to stdout. - *PROG* must match a single program when **file** is specified. - - **opcodes** controls if raw opcodes will be printed. - - If the prog has line_info available, the source line will - be displayed. If **linum** is specified, the filename, line - number and line column will also be displayed. - - **bpftool prog pin** *PROG* *FILE* - Pin program *PROG* as *FILE*. - - Note: *FILE* must be located in *bpffs* mount. It must not - contain a dot character ('.'), which is reserved for future - extensions of *bpffs*. - - **bpftool prog { load | loadall }** *OBJ* *PATH* [**type** *TYPE*] [**map** { **idx** *IDX* | **name** *NAME* } *MAP*] [{ **offload_dev** | **xdpmeta_dev** } *NAME*] [**pinmaps** *MAP_DIR*] [**autoattach**] - Load bpf program(s) from binary *OBJ* and pin as *PATH*. - **bpftool prog load** pins only the first program from the - *OBJ* as *PATH*. **bpftool prog loadall** pins all programs - from the *OBJ* under *PATH* directory. - **type** is optional, if not specified program type will be - inferred from section names. - By default bpftool will create new maps as declared in the ELF - object being loaded. **map** parameter allows for the reuse - of existing maps. It can be specified multiple times, each - time for a different map. *IDX* refers to index of the map - to be replaced in the ELF file counting from 0, while *NAME* - allows to replace a map by name. *MAP* specifies the map to - use, referring to it by **id** or through a **pinned** file. - If **offload_dev** *NAME* is specified program will be loaded - onto given networking device (offload). - If **xdpmeta_dev** *NAME* is specified program will become - device-bound without offloading, this facilitates access - to XDP metadata. 
- Optional **pinmaps** argument can be provided to pin all - maps under *MAP_DIR* directory. - - If **autoattach** is specified program will be attached - before pin. In that case, only the link (representing the - program attached to its hook) is pinned, not the program as - such, so the path won't show in **bpftool prog show -f**, - only show in **bpftool link show -f**. Also, this only works - when bpftool (libbpf) is able to infer all necessary - information from the object file, in particular, it's not - supported for all program types. If a program does not - support autoattach, bpftool falls back to regular pinning - for that program instead. - - Note: *PATH* must be located in *bpffs* mount. It must not - contain a dot character ('.'), which is reserved for future - extensions of *bpffs*. - - **bpftool prog attach** *PROG* *ATTACH_TYPE* [*MAP*] - Attach bpf program *PROG* (with type specified by - *ATTACH_TYPE*). Most *ATTACH_TYPEs* require a *MAP* - parameter, with the exception of *flow_dissector* which is - attached to current networking name space. - - **bpftool prog detach** *PROG* *ATTACH_TYPE* [*MAP*] - Detach bpf program *PROG* (with type specified by - *ATTACH_TYPE*). Most *ATTACH_TYPEs* require a *MAP* - parameter, with the exception of *flow_dissector* which is - detached from the current networking name space. - - **bpftool prog tracelog** - Dump the trace pipe of the system to the console (stdout). - Hit to stop printing. BPF programs can write to this - trace pipe at runtime with the **bpf_trace_printk**\ () helper. - This should be used only for debugging purposes. For - streaming data from BPF programs to user space, one can use - perf events (see also **bpftool-map**\ (8)). 
- - **bpftool prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* [**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* [**ctx_size_out** *M*]]] [**repeat** *N*] - Run BPF program *PROG* in the kernel testing infrastructure - for BPF, meaning that the program works on the data and - context provided by the user, and not on actual packets or - monitored functions etc. Return value and duration for the - test run are printed out to the console. - - Input data is read from the *FILE* passed with **data_in**. - If this *FILE* is "**-**", input data is read from standard - input. Input context, if any, is read from *FILE* passed with - **ctx_in**. Again, "**-**" can be used to read from standard - input, but only if standard input is not already in use for - input data. If a *FILE* is passed with **data_out**, output - data is written to that file. Similarly, output context is - written to the *FILE* passed with **ctx_out**. For both - output flows, "**-**" can be used to print to the standard - output (as plain text, or JSON if relevant option was - passed). If output keywords are omitted, output data and - context are discarded. Keywords **data_size_out** and - **ctx_size_out** are used to pass the size (in bytes) for the - output buffers to the kernel, although the default of 32 kB - should be more than enough for most cases. - - Keyword **repeat** is used to indicate the number of - consecutive runs to perform. Note that output data and - context printed to files correspond to the last of those - runs. The duration printed out at the end of the runs is an - average over all runs performed by the command. - - Not all program types support test run. Among those which do, - not all of them can take the **ctx_in**/**ctx_out** - arguments. bpftool does not perform checks on program types. - - **bpftool prog profile** *PROG* [**duration** *DURATION*] *METRICs* - Profile *METRICs* for bpf program *PROG* for *DURATION* - seconds or until user hits . 
*DURATION* is optional. - If *DURATION* is not specified, the profiling will run up to - **UINT_MAX** seconds. - - **bpftool prog help** - Print short help message. +bpftool prog { show | list } [*PROG*] + Show information about loaded programs. If *PROG* is specified show + information only about given programs, otherwise list all programs + currently loaded on the system. In case of **tag** or **name**, *PROG* may + match several programs which will all be shown. + + Output will start with program ID followed by program type and zero or more + named attributes (depending on kernel version). + + Since Linux 5.1 the kernel can collect statistics on BPF programs (such as + the total time spent running the program, and the number of times it was + run). If available, bpftool shows such statistics. However, the kernel does + not collect them by defaults, as it slightly impacts performance on each + program run. Activation or deactivation of the feature is performed via the + **kernel.bpf_stats_enabled** sysctl knob. + + Since Linux 5.8 bpftool is able to discover information about processes + that hold open file descriptors (FDs) against BPF programs. On such kernels + bpftool will automatically emit this information as well. + +bpftool prog dump xlated *PROG* [{ file *FILE* | [opcodes] [linum] [visual] }] + Dump eBPF instructions of the programs from the kernel. By default, eBPF + will be disassembled and printed to standard output in human-readable + format. In this case, **opcodes** controls if raw opcodes should be printed + as well. + + In case of **tag** or **name**, *PROG* may match several programs which + will all be dumped. However, if **file** or **visual** is specified, + *PROG* must match a single program. + + If **file** is specified, the binary image will instead be written to + *FILE*. + + If **visual** is specified, control flow graph (CFG) will be built instead, + and eBPF instructions will be presented with CFG in DOT format, on standard + output. 
+ + If the programs have line_info available, the source line will be + displayed. If **linum** is specified, the filename, line number and line + column will also be displayed. + +bpftool prog dump jited *PROG* [{ file *FILE* | [opcodes] [linum] }] + Dump jited image (host machine code) of the program. + + If *FILE* is specified image will be written to a file, otherwise it will + be disassembled and printed to stdout. *PROG* must match a single program + when **file** is specified. + + **opcodes** controls if raw opcodes will be printed. + + If the prog has line_info available, the source line will be displayed. If + **linum** is specified, the filename, line number and line column will also + be displayed. + +bpftool prog pin *PROG* *FILE* + Pin program *PROG* as *FILE*. + + Note: *FILE* must be located in *bpffs* mount. It must not contain a dot + character ('.'), which is reserved for future extensions of *bpffs*. + +bpftool prog { load | loadall } *OBJ* *PATH* [type *TYPE*] [map { idx *IDX* | name *NAME* } *MAP*] [{ offload_dev | xdpmeta_dev } *NAME*] [pinmaps *MAP_DIR*] [autoattach] + Load bpf program(s) from binary *OBJ* and pin as *PATH*. **bpftool prog + load** pins only the first program from the *OBJ* as *PATH*. **bpftool prog + loadall** pins all programs from the *OBJ* under *PATH* directory. **type** + is optional, if not specified program type will be inferred from section + names. By default bpftool will create new maps as declared in the ELF + object being loaded. **map** parameter allows for the reuse of existing + maps. It can be specified multiple times, each time for a different map. + *IDX* refers to index of the map to be replaced in the ELF file counting + from 0, while *NAME* allows to replace a map by name. *MAP* specifies the + map to use, referring to it by **id** or through a **pinned** file. If + **offload_dev** *NAME* is specified program will be loaded onto given + networking device (offload). 
If **xdpmeta_dev** *NAME* is specified program + will become device-bound without offloading, this facilitates access to XDP + metadata. Optional **pinmaps** argument can be provided to pin all maps + under *MAP_DIR* directory. + + If **autoattach** is specified program will be attached before pin. In that + case, only the link (representing the program attached to its hook) is + pinned, not the program as such, so the path won't show in **bpftool prog + show -f**, only show in **bpftool link show -f**. Also, this only works + when bpftool (libbpf) is able to infer all necessary information from the + object file, in particular, it's not supported for all program types. If a + program does not support autoattach, bpftool falls back to regular pinning + for that program instead. + + Note: *PATH* must be located in *bpffs* mount. It must not contain a dot + character ('.'), which is reserved for future extensions of *bpffs*. + +bpftool prog attach *PROG* *ATTACH_TYPE* [*MAP*] + Attach bpf program *PROG* (with type specified by *ATTACH_TYPE*). Most + *ATTACH_TYPEs* require a *MAP* parameter, with the exception of + *flow_dissector* which is attached to current networking name space. + +bpftool prog detach *PROG* *ATTACH_TYPE* [*MAP*] + Detach bpf program *PROG* (with type specified by *ATTACH_TYPE*). Most + *ATTACH_TYPEs* require a *MAP* parameter, with the exception of + *flow_dissector* which is detached from the current networking name space. + +bpftool prog tracelog + Dump the trace pipe of the system to the console (stdout). Hit to + stop printing. BPF programs can write to this trace pipe at runtime with + the **bpf_trace_printk**\ () helper. This should be used only for debugging + purposes. For streaming data from BPF programs to user space, one can use + perf events (see also **bpftool-map**\ (8)). 
+ +bpftool prog run *PROG* data_in *FILE* [data_out *FILE* [data_size_out *L*]] [ctx_in *FILE* [ctx_out *FILE* [ctx_size_out *M*]]] [repeat *N*] + Run BPF program *PROG* in the kernel testing infrastructure for BPF, + meaning that the program works on the data and context provided by the + user, and not on actual packets or monitored functions etc. Return value + and duration for the test run are printed out to the console. + + Input data is read from the *FILE* passed with **data_in**. If this *FILE* + is "**-**", input data is read from standard input. Input context, if any, + is read from *FILE* passed with **ctx_in**. Again, "**-**" can be used to + read from standard input, but only if standard input is not already in use + for input data. If a *FILE* is passed with **data_out**, output data is + written to that file. Similarly, output context is written to the *FILE* + passed with **ctx_out**. For both output flows, "**-**" can be used to + print to the standard output (as plain text, or JSON if relevant option was + passed). If output keywords are omitted, output data and context are + discarded. Keywords **data_size_out** and **ctx_size_out** are used to pass + the size (in bytes) for the output buffers to the kernel, although the + default of 32 kB should be more than enough for most cases. + + Keyword **repeat** is used to indicate the number of consecutive runs to + perform. Note that output data and context printed to files correspond to + the last of those runs. The duration printed out at the end of the runs is + an average over all runs performed by the command. + + Not all program types support test run. Among those which do, not all of + them can take the **ctx_in**/**ctx_out** arguments. bpftool does not + perform checks on program types. + +bpftool prog profile *PROG* [duration *DURATION*] *METRICs* + Profile *METRICs* for bpf program *PROG* for *DURATION* seconds or until + user hits . *DURATION* is optional. 
If *DURATION* is not specified, + the profiling will run up to **UINT_MAX** seconds. + +bpftool prog help + Print short help message. OPTIONS ======= - .. include:: common_options.rst - - -f, --bpffs - When showing BPF programs, show file names of pinned - programs. - - -m, --mapcompat - Allow loading maps with unknown map definitions. - - -n, --nomount - Do not automatically attempt to mount any virtual file system - (such as tracefs or BPF virtual file system) when necessary. - - -L, --use-loader - Load program as a "loader" program. This is useful to debug - the generation of such programs. When this option is in - use, bpftool attempts to load the programs from the object - file into the kernel, but does not pin them (therefore, the - *PATH* must not be provided). - - When combined with the **-d**\ \|\ **--debug** option, - additional debug messages are generated, and the execution - of the loader program will use the **bpf_trace_printk**\ () - helper to log each step of loading BTF, creating the maps, - and loading the programs (see **bpftool prog tracelog** as - a way to dump those messages). +.. include:: common_options.rst + +-f, --bpffs + When showing BPF programs, show file names of pinned programs. + +-m, --mapcompat + Allow loading maps with unknown map definitions. + +-n, --nomount + Do not automatically attempt to mount any virtual file system (such as + tracefs or BPF virtual file system) when necessary. + +-L, --use-loader + Load program as a "loader" program. This is useful to debug the generation + of such programs. When this option is in use, bpftool attempts to load the + programs from the object file into the kernel, but does not pin them + (therefore, the *PATH* must not be provided). 
+ + When combined with the **-d**\ \|\ **--debug** option, additional debug + messages are generated, and the execution of the loader program will use + the **bpf_trace_printk**\ () helper to log each step of loading BTF, + creating the maps, and loading the programs (see **bpftool prog tracelog** + as a way to dump those messages). EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst index 8022b5321d..e871b9539a 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst @@ -14,61 +14,60 @@ tool to register/unregister/introspect BPF struct_ops SYNOPSIS ======== - **bpftool** [*OPTIONS*] **struct_ops** *COMMAND* +**bpftool** [*OPTIONS*] **struct_ops** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| } +*OPTIONS* := { |COMMON_OPTIONS| } - *COMMANDS* := - { **show** | **list** | **dump** | **register** | **unregister** | **help** } +*COMMANDS* := +{ **show** | **list** | **dump** | **register** | **unregister** | **help** } STRUCT_OPS COMMANDS =================== -| **bpftool** **struct_ops { show | list }** [*STRUCT_OPS_MAP*] -| **bpftool** **struct_ops dump** [*STRUCT_OPS_MAP*] -| **bpftool** **struct_ops register** *OBJ* [*LINK_DIR*] -| **bpftool** **struct_ops unregister** *STRUCT_OPS_MAP* -| **bpftool** **struct_ops help** +| **bpftool** **struct_ops { show | list }** [*STRUCT_OPS_MAP*] +| **bpftool** **struct_ops dump** [*STRUCT_OPS_MAP*] +| **bpftool** **struct_ops register** *OBJ* [*LINK_DIR*] +| **bpftool** **struct_ops unregister** *STRUCT_OPS_MAP* +| **bpftool** **struct_ops help** | -| *STRUCT_OPS_MAP* := { **id** *STRUCT_OPS_MAP_ID* | **name** *STRUCT_OPS_MAP_NAME* } -| *OBJ* := /a/file/of/bpf_struct_ops.o +| *STRUCT_OPS_MAP* := { **id** *STRUCT_OPS_MAP_ID* | **name** *STRUCT_OPS_MAP_NAME* } +| *OBJ* := /a/file/of/bpf_struct_ops.o DESCRIPTION =========== - **bpftool struct_ops { show | list }** 
[*STRUCT_OPS_MAP*] - Show brief information about the struct_ops in the system. - If *STRUCT_OPS_MAP* is specified, it shows information only - for the given struct_ops. Otherwise, it lists all struct_ops - currently existing in the system. - - Output will start with struct_ops map ID, followed by its map - name and its struct_ops's kernel type. - - **bpftool struct_ops dump** [*STRUCT_OPS_MAP*] - Dump details information about the struct_ops in the system. - If *STRUCT_OPS_MAP* is specified, it dumps information only - for the given struct_ops. Otherwise, it dumps all struct_ops - currently existing in the system. - - **bpftool struct_ops register** *OBJ* [*LINK_DIR*] - Register bpf struct_ops from *OBJ*. All struct_ops under - the ELF section ".struct_ops" and ".struct_ops.link" will - be registered to its kernel subsystem. For each - struct_ops in the ".struct_ops.link" section, a link - will be created. You can give *LINK_DIR* to provide a - directory path where these links will be pinned with the - same name as their corresponding map name. - - **bpftool struct_ops unregister** *STRUCT_OPS_MAP* - Unregister the *STRUCT_OPS_MAP* from the kernel subsystem. - - **bpftool struct_ops help** - Print short help message. +bpftool struct_ops { show | list } [*STRUCT_OPS_MAP*] + Show brief information about the struct_ops in the system. If + *STRUCT_OPS_MAP* is specified, it shows information only for the given + struct_ops. Otherwise, it lists all struct_ops currently existing in the + system. + + Output will start with struct_ops map ID, followed by its map name and its + struct_ops's kernel type. + +bpftool struct_ops dump [*STRUCT_OPS_MAP*] + Dump details information about the struct_ops in the system. If + *STRUCT_OPS_MAP* is specified, it dumps information only for the given + struct_ops. Otherwise, it dumps all struct_ops currently existing in the + system. + +bpftool struct_ops register *OBJ* [*LINK_DIR*] + Register bpf struct_ops from *OBJ*. 
All struct_ops under the ELF section + ".struct_ops" and ".struct_ops.link" will be registered to its kernel + subsystem. For each struct_ops in the ".struct_ops.link" section, a link + will be created. You can give *LINK_DIR* to provide a directory path where + these links will be pinned with the same name as their corresponding map + name. + +bpftool struct_ops unregister *STRUCT_OPS_MAP* + Unregister the *STRUCT_OPS_MAP* from the kernel subsystem. + +bpftool struct_ops help + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index 09e4f2ff56..f38ae5c404 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -14,57 +14,57 @@ tool for inspection and simple manipulation of eBPF programs and maps SYNOPSIS ======== - **bpftool** [*OPTIONS*] *OBJECT* { *COMMAND* | **help** } +**bpftool** [*OPTIONS*] *OBJECT* { *COMMAND* | **help** } - **bpftool** **batch file** *FILE* +**bpftool** **batch file** *FILE* - **bpftool** **version** +**bpftool** **version** - *OBJECT* := { **map** | **prog** | **link** | **cgroup** | **perf** | **net** | **feature** | - **btf** | **gen** | **struct_ops** | **iter** } +*OBJECT* := { **map** | **prog** | **link** | **cgroup** | **perf** | **net** | **feature** | +**btf** | **gen** | **struct_ops** | **iter** } - *OPTIONS* := { { **-V** | **--version** } | |COMMON_OPTIONS| } +*OPTIONS* := { { **-V** | **--version** } | |COMMON_OPTIONS| } - *MAP-COMMANDS* := - { **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** | - **delete** | **pin** | **event_pipe** | **help** } +*MAP-COMMANDS* := +{ **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** | +**delete** | **pin** | **event_pipe** | **help** } - *PROG-COMMANDS* := { **show** | **list** | **dump 
jited** | **dump xlated** | **pin** | - **load** | **attach** | **detach** | **help** } +*PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin** | +**load** | **attach** | **detach** | **help** } - *LINK-COMMANDS* := { **show** | **list** | **pin** | **detach** | **help** } +*LINK-COMMANDS* := { **show** | **list** | **pin** | **detach** | **help** } - *CGROUP-COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** } +*CGROUP-COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** } - *PERF-COMMANDS* := { **show** | **list** | **help** } +*PERF-COMMANDS* := { **show** | **list** | **help** } - *NET-COMMANDS* := { **show** | **list** | **help** } +*NET-COMMANDS* := { **show** | **list** | **help** } - *FEATURE-COMMANDS* := { **probe** | **help** } +*FEATURE-COMMANDS* := { **probe** | **help** } - *BTF-COMMANDS* := { **show** | **list** | **dump** | **help** } +*BTF-COMMANDS* := { **show** | **list** | **dump** | **help** } - *GEN-COMMANDS* := { **object** | **skeleton** | **min_core_btf** | **help** } +*GEN-COMMANDS* := { **object** | **skeleton** | **min_core_btf** | **help** } - *STRUCT-OPS-COMMANDS* := { **show** | **list** | **dump** | **register** | **unregister** | **help** } +*STRUCT-OPS-COMMANDS* := { **show** | **list** | **dump** | **register** | **unregister** | **help** } - *ITER-COMMANDS* := { **pin** | **help** } +*ITER-COMMANDS* := { **pin** | **help** } DESCRIPTION =========== - *bpftool* allows for inspection and simple modification of BPF objects - on the system. +*bpftool* allows for inspection and simple modification of BPF objects on the +system. - Note that format of the output of all tools is not guaranteed to be - stable and should not be depended upon. +Note that format of the output of all tools is not guaranteed to be stable and +should not be depended upon. OPTIONS ======= - .. include:: common_options.rst +.. 
include:: common_options.rst - -m, --mapcompat - Allow loading maps with unknown map definitions. +-m, --mapcompat + Allow loading maps with unknown map definitions. - -n, --nomount - Do not automatically attempt to mount any virtual file system - (such as tracefs or BPF virtual file system) when necessary. +-n, --nomount + Do not automatically attempt to mount any virtual file system (such as + tracefs or BPF virtual file system) when necessary. diff --git a/tools/bpf/bpftool/Documentation/common_options.rst b/tools/bpf/bpftool/Documentation/common_options.rst index 30df7a707f..9234b9dab7 100644 --- a/tools/bpf/bpftool/Documentation/common_options.rst +++ b/tools/bpf/bpftool/Documentation/common_options.rst @@ -1,25 +1,23 @@ .. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) -h, --help - Print short help message (similar to **bpftool help**). + Print short help message (similar to **bpftool help**). -V, --version - Print bpftool's version number (similar to **bpftool version**), the - number of the libbpf version in use, and optional features that were - included when bpftool was compiled. Optional features include linking - against LLVM or libbfd to provide the disassembler for JIT-ted - programs (**bpftool prog dump jited**) and usage of BPF skeletons - (some features like **bpftool prog profile** or showing pids - associated to BPF objects may rely on it). + Print bpftool's version number (similar to **bpftool version**), the number + of the libbpf version in use, and optional features that were included when + bpftool was compiled. Optional features include linking against LLVM or + libbfd to provide the disassembler for JIT-ted programs (**bpftool prog + dump jited**) and usage of BPF skeletons (some features like **bpftool prog + profile** or showing pids associated to BPF objects may rely on it). -j, --json - Generate JSON output. For commands that cannot produce JSON, this - option has no effect. + Generate JSON output. 
For commands that cannot produce JSON, this option + has no effect. -p, --pretty - Generate human-readable JSON output. Implies **-j**. + Generate human-readable JSON output. Implies **-j**. -d, --debug - Print all logs available, even debug-level information. This includes - logs from libbpf as well as from the verifier, when attempting to - load programs. + Print all logs available, even debug-level information. This includes logs + from libbpf as well as from the verifier, when attempting to load programs. diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index e9154ace80..dfa4f1bebb 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -89,6 +89,10 @@ ifneq ($(EXTRA_LDFLAGS),) LDFLAGS += $(EXTRA_LDFLAGS) endif +HOST_CFLAGS := $(subst -I$(LIBBPF_INCLUDE),-I$(LIBBPF_BOOTSTRAP_INCLUDE),\ + $(subst $(CLANG_CROSS_FLAGS),,$(CFLAGS))) +HOST_LDFLAGS := $(LDFLAGS) + INSTALL ?= install RM ?= rm -f @@ -143,7 +147,7 @@ ifeq ($(feature-llvm),1) # If LLVM is available, use it for JIT disassembly CFLAGS += -DHAVE_LLVM_SUPPORT LLVM_CONFIG_LIB_COMPONENTS := mcdisassembler all-targets - CFLAGS += $(shell $(LLVM_CONFIG) --cflags --libs $(LLVM_CONFIG_LIB_COMPONENTS)) + CFLAGS += $(shell $(LLVM_CONFIG) --cflags) LIBS += $(shell $(LLVM_CONFIG) --libs $(LLVM_CONFIG_LIB_COMPONENTS)) ifeq ($(shell $(LLVM_CONFIG) --shared-mode),static) LIBS += $(shell $(LLVM_CONFIG) --system-libs $(LLVM_CONFIG_LIB_COMPONENTS)) @@ -178,12 +182,9 @@ ifeq ($(filter -DHAVE_LLVM_SUPPORT -DHAVE_LIBBFD_SUPPORT,$(CFLAGS)),) SRCS := $(filter-out jit_disasm.c,$(SRCS)) endif -HOST_CFLAGS = $(subst -I$(LIBBPF_INCLUDE),-I$(LIBBPF_BOOTSTRAP_INCLUDE),\ - $(subst $(CLANG_CROSS_FLAGS),,$(CFLAGS))) - BPFTOOL_BOOTSTRAP := $(BOOTSTRAP_OUTPUT)bpftool -BOOTSTRAP_OBJS = $(addprefix $(BOOTSTRAP_OUTPUT),main.o common.o json_writer.o gen.o btf.o xlated_dumper.o btf_dumper.o disasm.o) +BOOTSTRAP_OBJS = $(addprefix $(BOOTSTRAP_OUTPUT),main.o common.o json_writer.o gen.o btf.o) 
$(BOOTSTRAP_OBJS): $(LIBBPF_BOOTSTRAP) OBJS = $(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o @@ -231,14 +232,11 @@ endif CFLAGS += $(if $(BUILD_BPF_SKELS),,-DBPFTOOL_WITHOUT_SKELETONS) -$(BOOTSTRAP_OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c - $(QUIET_CC)$(HOSTCC) $(HOST_CFLAGS) -c -MMD $< -o $@ - $(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c $(QUIET_CC)$(CC) $(CFLAGS) -c -MMD $< -o $@ $(BPFTOOL_BOOTSTRAP): $(BOOTSTRAP_OBJS) $(LIBBPF_BOOTSTRAP) - $(QUIET_LINK)$(HOSTCC) $(HOST_CFLAGS) $(LDFLAGS) $(BOOTSTRAP_OBJS) $(LIBS_BOOTSTRAP) -o $@ + $(QUIET_LINK)$(HOSTCC) $(HOST_CFLAGS) $(HOST_LDFLAGS) $(BOOTSTRAP_OBJS) $(LIBS_BOOTSTRAP) -o $@ $(OUTPUT)bpftool: $(OBJS) $(LIBBPF) $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $(OBJS) $(LIBS) -o $@ diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 6e4f7ce6bc..04afe2ac22 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -106,19 +106,19 @@ _bpftool_get_link_ids() _bpftool_get_obj_map_names() { - local obj + local obj maps obj=$1 - maps=$(objdump -j maps -t $obj 2>/dev/null | \ - command awk '/g . maps/ {print $NF}') + maps=$(objdump -j .maps -t $obj 2>/dev/null | \ + command awk '/g . .maps/ {print $NF}') COMPREPLY+=( $( compgen -W "$maps" -- "$cur" ) ) } _bpftool_get_obj_map_idxs() { - local obj + local obj nmaps obj=$1 @@ -136,7 +136,7 @@ _sysfs_get_netdevs() # Retrieve type of the map that we are operating on. 
_bpftool_map_guess_map_type() { - local keyword ref + local keyword idx ref="" for (( idx=3; idx < ${#words[@]}-1; idx++ )); do case "${words[$((idx-2))]}" in lookup|update) @@ -255,8 +255,9 @@ _bpftool_map_update_get_name() _bpftool() { - local cur prev words objword json=0 - _init_completion || return + local cur prev words cword comp_args + local json=0 + _init_completion -- "$@" || return # Deal with options if [[ ${words[cword]} == -* ]]; then @@ -293,7 +294,7 @@ _bpftool() esac # Remove all options so completions don't have to deal with them. - local i + local i pprev for (( i=1; i < ${#words[@]}; )); do if [[ ${words[i]::1} == - ]] && [[ ${words[i]} != "-B" ]] && [[ ${words[i]} != "--base-btf" ]]; then @@ -307,7 +308,7 @@ _bpftool() prev=${words[cword - 1]} pprev=${words[cword - 2]} - local object=${words[1]} command=${words[2]} + local object=${words[1]} if [[ -z $object || $cword -eq 1 ]]; then case $cur in @@ -324,8 +325,12 @@ _bpftool() esac fi + local command=${words[2]} [[ $command == help ]] && return 0 + local MAP_TYPE='id pinned name' + local PROG_TYPE='id pinned tag name' + # Completion depends on object and command in use case $object in prog) @@ -346,8 +351,6 @@ _bpftool() ;; esac - local PROG_TYPE='id pinned tag name' - local MAP_TYPE='id pinned name' local METRIC_TYPE='cycles instructions l1d_loads llc_misses \ itlb_misses dtlb_misses' case $command in @@ -457,7 +460,7 @@ _bpftool() obj=${words[3]} if [[ ${words[-4]} == "map" ]]; then - COMPREPLY=( $( compgen -W "id pinned" -- "$cur" ) ) + COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) ) return 0 fi if [[ ${words[-3]} == "map" ]]; then @@ -541,20 +544,9 @@ _bpftool() COMPREPLY=( $( compgen -W "$METRIC_TYPE duration" -- "$cur" ) ) return 0 ;; - 6) - case $prev in - duration) - return 0 - ;; - *) - COMPREPLY=( $( compgen -W "$METRIC_TYPE" -- "$cur" ) ) - return 0 - ;; - esac - return 0 - ;; *) - COMPREPLY=( $( compgen -W "$METRIC_TYPE" -- "$cur" ) ) + [[ $prev == duration ]] && return 0 + 
_bpftool_once_attr "$METRIC_TYPE" return 0 ;; esac @@ -612,7 +604,7 @@ _bpftool() return 0 ;; register) - _filedir + [[ $prev == $command ]] && _filedir return 0 ;; *) @@ -638,9 +630,12 @@ _bpftool() pinned) _filedir ;; - *) + map) _bpftool_one_of_list $MAP_TYPE ;; + *) + _bpftool_once_attr 'map' + ;; esac return 0 ;; @@ -652,7 +647,6 @@ _bpftool() esac ;; map) - local MAP_TYPE='id pinned name' case $command in show|list|dump|peek|pop|dequeue|freeze) case $prev in @@ -793,13 +787,11 @@ _bpftool() # map, depending on the type of the map to update. case "$(_bpftool_map_guess_map_type)" in array_of_maps|hash_of_maps) - local MAP_TYPE='id pinned name' COMPREPLY+=( $( compgen -W "$MAP_TYPE" \ -- "$cur" ) ) return 0 ;; prog_array) - local PROG_TYPE='id pinned tag name' COMPREPLY+=( $( compgen -W "$PROG_TYPE" \ -- "$cur" ) ) return 0 @@ -821,7 +813,7 @@ _bpftool() esac _bpftool_once_attr 'key' - local UPDATE_FLAGS='any exist noexist' + local UPDATE_FLAGS='any exist noexist' idx for (( idx=3; idx < ${#words[@]}-1; idx++ )); do if [[ ${words[idx]} == 'value' ]]; then # 'value' is present, but is not the last @@ -893,7 +885,6 @@ _bpftool() esac ;; btf) - local PROG_TYPE='id pinned tag name' local MAP_TYPE='id pinned name' case $command in dump) @@ -1033,7 +1024,6 @@ _bpftool() local BPFTOOL_CGROUP_ATTACH_TYPES="$(bpftool feature list_builtins attach_types 2>/dev/null | \ grep '^cgroup_')" local ATTACH_FLAGS='multi override' - local PROG_TYPE='id pinned tag name' # Check for $prev = $command first if [ $prev = $command ]; then _filedir @@ -1086,7 +1076,6 @@ _bpftool() esac ;; net) - local PROG_TYPE='id pinned tag name' local ATTACH_TYPES='xdp xdpgeneric xdpdrv xdpoffload' case $command in show|list) @@ -1193,14 +1182,14 @@ _bpftool() pin|detach) if [[ $prev == "$command" ]]; then COMPREPLY=( $( compgen -W "$LINK_TYPE" -- "$cur" ) ) - else + elif [[ $pprev == "$command" ]]; then _filedir fi return 0 ;; *) [[ $prev == $object ]] && \ - COMPREPLY=( $( compgen -W 'help pin show 
list' -- "$cur" ) ) + COMPREPLY=( $( compgen -W 'help pin detach show list' -- "$cur" ) ) ;; esac ;; diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index 958e92acca..9b75639434 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -410,7 +410,7 @@ void get_prog_full_name(const struct bpf_prog_info *prog_info, int prog_fd, { const char *prog_name = prog_info->name; const struct btf_type *func_type; - const struct bpf_func_info finfo = {}; + struct bpf_func_info finfo = {}; struct bpf_prog_info info = {}; __u32 info_len = sizeof(info); struct btf *prog_btf = NULL; diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 708733b0ea..c754a428c8 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -664,7 +664,8 @@ probe_helper_ifindex(enum bpf_func_id id, enum bpf_prog_type prog_type, probe_prog_load_ifindex(prog_type, insns, ARRAY_SIZE(insns), buf, sizeof(buf), ifindex); - res = !grep(buf, "invalid func ") && !grep(buf, "unknown func "); + res = !grep(buf, "invalid func ") && !grep(buf, "unknown func ") && + !grep(buf, "program of this type cannot use helper "); switch (get_vendor_id(ifindex)) { case 0x19ee: /* Netronome specific */ diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 540c0f2c4f..b3979ddc01 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -386,7 +386,7 @@ static int codegen_subskel_datasecs(struct bpf_object *obj, const char *obj_name */ needs_typeof = btf_is_array(var) || btf_is_ptr_to_func_proto(btf, var); if (needs_typeof) - printf("typeof("); + printf("__typeof__("); err = btf_dump__emit_type_decl(d, var_type_id, &opts); if (err) @@ -1131,7 +1131,8 @@ static void gen_st_ops_shadow_init(struct btf *btf, struct bpf_object *obj) continue; codegen("\ \n\ - obj->struct_ops.%1$s = bpf_map__initial_value(obj->maps.%1$s, NULL);\n\ + obj->struct_ops.%1$s = (__typeof__(obj->struct_ops.%1$s))\n\ + bpf_map__initial_value(obj->maps.%1$s, 
NULL);\n\ \n\ ", ident); } diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index afde9d0c2e..5cd503b763 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -526,6 +526,10 @@ static int show_link_close_json(int fd, struct bpf_link_info *info) show_link_ifindex_json(info->netkit.ifindex, json_wtr); show_link_attach_type_json(info->netkit.attach_type, json_wtr); break; + case BPF_LINK_TYPE_SOCKMAP: + jsonw_uint_field(json_wtr, "map_id", info->sockmap.map_id); + show_link_attach_type_json(info->sockmap.attach_type, json_wtr); + break; case BPF_LINK_TYPE_XDP: show_link_ifindex_json(info->xdp.ifindex, json_wtr); break; @@ -915,6 +919,11 @@ static int show_link_close_plain(int fd, struct bpf_link_info *info) show_link_ifindex_plain(info->netkit.ifindex); show_link_attach_type_plain(info->netkit.attach_type); break; + case BPF_LINK_TYPE_SOCKMAP: + printf("\n\t"); + printf("map_id %u ", info->sockmap.map_id); + show_link_attach_type_plain(info->sockmap.attach_type); + break; case BPF_LINK_TYPE_XDP: printf("\n\t"); show_link_ifindex_plain(info->xdp.ifindex); diff --git a/tools/bpf/bpftool/pids.c b/tools/bpf/bpftool/pids.c index 00c77edb63..9b898571b4 100644 --- a/tools/bpf/bpftool/pids.c +++ b/tools/bpf/bpftool/pids.c @@ -101,7 +101,6 @@ int build_obj_refs_table(struct hashmap **map, enum bpf_obj_type type) char buf[4096 / sizeof(*e) * sizeof(*e)]; struct pid_iter_bpf *skel; int err, ret, fd = -1, i; - libbpf_print_fn_t default_print; *map = hashmap__new(hash_fn_for_key_as_id, equal_fn_for_key_as_id, NULL); if (IS_ERR(*map)) { @@ -118,12 +117,18 @@ int build_obj_refs_table(struct hashmap **map, enum bpf_obj_type type) skel->rodata->obj_type = type; - /* we don't want output polluted with libbpf errors if bpf_iter is not - * supported - */ - default_print = libbpf_set_print(libbpf_print_none); - err = pid_iter_bpf__load(skel); - libbpf_set_print(default_print); + if (!verifier_logs) { + libbpf_print_fn_t default_print; + + /* Unless debug 
information is on, we don't want the output to + * be polluted with libbpf errors if bpf_iter is not supported. + */ + default_print = libbpf_set_print(libbpf_print_none); + err = pid_iter_bpf__load(skel); + libbpf_set_print(default_print); + } else { + err = pid_iter_bpf__load(skel); + } if (err) { /* too bad, kernel doesn't support BPF iterators yet */ err = 0; diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 4c4cf16a40..40ea743d13 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -1813,6 +1813,10 @@ offload_dev: } if (pinmaps) { + err = create_and_mount_bpffs_dir(pinmaps); + if (err) + goto err_unpin; + err = bpf_object__pin_maps(obj, pinmaps); if (err) { p_err("failed to pin all maps"); @@ -2081,7 +2085,7 @@ static int profile_parse_metrics(int argc, char **argv) NEXT_ARG(); } if (selected_cnt > MAX_NUM_PROFILE_METRICS) { - p_err("too many (%d) metrics, please specify no more than %d metrics at at time", + p_err("too many (%d) metrics, please specify no more than %d metrics at a time", selected_cnt, MAX_NUM_PROFILE_METRICS); return -1; } diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index af393c7dee..b3edc239fe 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -696,7 +696,7 @@ static int sets_patch(struct object *obj) * Make sure id is at the beginning of the pairs * struct, otherwise the below qsort would not work. 
*/ - BUILD_BUG_ON(set8->pairs != &set8->pairs[0].id); + BUILD_BUG_ON((u32 *)set8->pairs != &set8->pairs[0].id); qsort(set8->pairs, set8->cnt, sizeof(set8->pairs[0]), cmp_id); /* diff --git a/tools/cgroup/memcg_slabinfo.py b/tools/cgroup/memcg_slabinfo.py index 1d3a90d93f..270c28a0d0 100644 --- a/tools/cgroup/memcg_slabinfo.py +++ b/tools/cgroup/memcg_slabinfo.py @@ -146,12 +146,11 @@ def detect_kernel_config(): def for_each_slab(prog): - PGSlab = 1 << prog.constant('PG_slab') - PGHead = 1 << prog.constant('PG_head') + PGSlab = ~prog.constant('PG_slab') for page in for_each_page(prog): try: - if page.flags.value_() & PGSlab: + if page.page_type.value_() == PGSlab: yield cast('struct slab *', page) except FaultError: pass diff --git a/tools/hv/Build b/tools/hv/Build index 6cf51fa4b3..7d1f169806 100644 --- a/tools/hv/Build +++ b/tools/hv/Build @@ -1,3 +1,4 @@ hv_kvp_daemon-y += hv_kvp_daemon.o hv_vss_daemon-y += hv_vss_daemon.o -hv_fcopy_daemon-y += hv_fcopy_daemon.o +hv_fcopy_uio_daemon-y += hv_fcopy_uio_daemon.o +hv_fcopy_uio_daemon-y += vmbus_bufring.o diff --git a/tools/hv/Makefile b/tools/hv/Makefile index fe770e679a..2e60e2c212 100644 --- a/tools/hv/Makefile +++ b/tools/hv/Makefile @@ -2,6 +2,7 @@ # Makefile for Hyper-V tools include ../scripts/Makefile.include +ARCH := $(shell uname -m 2>/dev/null) sbindir ?= /usr/sbin libexecdir ?= /usr/libexec sharedstatedir ?= /var/lib @@ -16,8 +17,12 @@ endif MAKEFLAGS += -r override CFLAGS += -O2 -Wall -g -D_GNU_SOURCE -I$(OUTPUT)include +override CFLAGS += -Wno-address-of-packed-member -ALL_TARGETS := hv_kvp_daemon hv_vss_daemon hv_fcopy_daemon +ALL_TARGETS := hv_kvp_daemon hv_vss_daemon +ifneq ($(ARCH), aarch64) +ALL_TARGETS += hv_fcopy_uio_daemon +endif ALL_PROGRAMS := $(patsubst %,$(OUTPUT)%,$(ALL_TARGETS)) ALL_SCRIPTS := hv_get_dhcp_info.sh hv_get_dns_info.sh hv_set_ifconfig.sh @@ -39,10 +44,10 @@ $(HV_VSS_DAEMON_IN): FORCE $(OUTPUT)hv_vss_daemon: $(HV_VSS_DAEMON_IN) $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $< -o $@ 
-HV_FCOPY_DAEMON_IN := $(OUTPUT)hv_fcopy_daemon-in.o -$(HV_FCOPY_DAEMON_IN): FORCE - $(Q)$(MAKE) $(build)=hv_fcopy_daemon -$(OUTPUT)hv_fcopy_daemon: $(HV_FCOPY_DAEMON_IN) +HV_FCOPY_UIO_DAEMON_IN := $(OUTPUT)hv_fcopy_uio_daemon-in.o +$(HV_FCOPY_UIO_DAEMON_IN): FORCE + $(Q)$(MAKE) $(build)=hv_fcopy_uio_daemon +$(OUTPUT)hv_fcopy_uio_daemon: $(HV_FCOPY_UIO_DAEMON_IN) $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $< -o $@ clean: diff --git a/tools/hv/hv_fcopy_daemon.c b/tools/hv/hv_fcopy_daemon.c deleted file mode 100644 index 16d629b22c..0000000000 --- a/tools/hv/hv_fcopy_daemon.c +++ /dev/null @@ -1,266 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * An implementation of host to guest copy functionality for Linux. - * - * Copyright (C) 2014, Microsoft, Inc. - * - * Author : K. Y. Srinivasan - */ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static int target_fd; -static char target_fname[PATH_MAX]; -static unsigned long long filesize; - -static int hv_start_fcopy(struct hv_start_fcopy *smsg) -{ - int error = HV_E_FAIL; - char *q, *p; - - filesize = 0; - p = (char *)smsg->path_name; - snprintf(target_fname, sizeof(target_fname), "%s/%s", - (char *)smsg->path_name, (char *)smsg->file_name); - - syslog(LOG_INFO, "Target file name: %s", target_fname); - /* - * Check to see if the path is already in place; if not, - * create if required. 
- */ - while ((q = strchr(p, '/')) != NULL) { - if (q == p) { - p++; - continue; - } - *q = '\0'; - if (access((char *)smsg->path_name, F_OK)) { - if (smsg->copy_flags & CREATE_PATH) { - if (mkdir((char *)smsg->path_name, 0755)) { - syslog(LOG_ERR, "Failed to create %s", - (char *)smsg->path_name); - goto done; - } - } else { - syslog(LOG_ERR, "Invalid path: %s", - (char *)smsg->path_name); - goto done; - } - } - p = q + 1; - *q = '/'; - } - - if (!access(target_fname, F_OK)) { - syslog(LOG_INFO, "File: %s exists", target_fname); - if (!(smsg->copy_flags & OVER_WRITE)) { - error = HV_ERROR_ALREADY_EXISTS; - goto done; - } - } - - target_fd = open(target_fname, - O_RDWR | O_CREAT | O_TRUNC | O_CLOEXEC, 0744); - if (target_fd == -1) { - syslog(LOG_INFO, "Open Failed: %s", strerror(errno)); - goto done; - } - - error = 0; -done: - if (error) - target_fname[0] = '\0'; - return error; -} - -static int hv_copy_data(struct hv_do_fcopy *cpmsg) -{ - ssize_t bytes_written; - int ret = 0; - - bytes_written = pwrite(target_fd, cpmsg->data, cpmsg->size, - cpmsg->offset); - - filesize += cpmsg->size; - if (bytes_written != cpmsg->size) { - switch (errno) { - case ENOSPC: - ret = HV_ERROR_DISK_FULL; - break; - default: - ret = HV_E_FAIL; - break; - } - syslog(LOG_ERR, "pwrite failed to write %llu bytes: %ld (%s)", - filesize, (long)bytes_written, strerror(errno)); - } - - return ret; -} - -/* - * Reset target_fname to "" in the two below functions for hibernation: if - * the fcopy operation is aborted by hibernation, the daemon should remove the - * partially-copied file; to achieve this, the hv_utils driver always fakes a - * CANCEL_FCOPY message upon suspend, and later when the VM resumes back, - * the daemon calls hv_copy_cancel() to remove the file; if a file is copied - * successfully before suspend, hv_copy_finished() must reset target_fname to - * avoid that the file can be incorrectly removed upon resume, since the faked - * CANCEL_FCOPY message is spurious in this case. 
- */ -static int hv_copy_finished(void) -{ - close(target_fd); - target_fname[0] = '\0'; - return 0; -} -static int hv_copy_cancel(void) -{ - close(target_fd); - if (strlen(target_fname) > 0) { - unlink(target_fname); - target_fname[0] = '\0'; - } - return 0; - -} - -void print_usage(char *argv[]) -{ - fprintf(stderr, "Usage: %s [options]\n" - "Options are:\n" - " -n, --no-daemon stay in foreground, don't daemonize\n" - " -h, --help print this help\n", argv[0]); -} - -int main(int argc, char *argv[]) -{ - int fcopy_fd = -1; - int error; - int daemonize = 1, long_index = 0, opt; - int version = FCOPY_CURRENT_VERSION; - union { - struct hv_fcopy_hdr hdr; - struct hv_start_fcopy start; - struct hv_do_fcopy copy; - __u32 kernel_modver; - } buffer = { }; - int in_handshake; - - static struct option long_options[] = { - {"help", no_argument, 0, 'h' }, - {"no-daemon", no_argument, 0, 'n' }, - {0, 0, 0, 0 } - }; - - while ((opt = getopt_long(argc, argv, "hn", long_options, - &long_index)) != -1) { - switch (opt) { - case 'n': - daemonize = 0; - break; - case 'h': - default: - print_usage(argv); - exit(EXIT_FAILURE); - } - } - - if (daemonize && daemon(1, 0)) { - syslog(LOG_ERR, "daemon() failed; error: %s", strerror(errno)); - exit(EXIT_FAILURE); - } - - openlog("HV_FCOPY", 0, LOG_USER); - syslog(LOG_INFO, "starting; pid is:%d", getpid()); - -reopen_fcopy_fd: - if (fcopy_fd != -1) - close(fcopy_fd); - /* Remove any possible partially-copied file on error */ - hv_copy_cancel(); - in_handshake = 1; - fcopy_fd = open("/dev/vmbus/hv_fcopy", O_RDWR); - - if (fcopy_fd < 0) { - syslog(LOG_ERR, "open /dev/vmbus/hv_fcopy failed; error: %d %s", - errno, strerror(errno)); - exit(EXIT_FAILURE); - } - - /* - * Register with the kernel. 
- */ - if ((write(fcopy_fd, &version, sizeof(int))) != sizeof(int)) { - syslog(LOG_ERR, "Registration failed: %s", strerror(errno)); - exit(EXIT_FAILURE); - } - - while (1) { - /* - * In this loop we process fcopy messages after the - * handshake is complete. - */ - ssize_t len; - - len = pread(fcopy_fd, &buffer, sizeof(buffer), 0); - if (len < 0) { - syslog(LOG_ERR, "pread failed: %s", strerror(errno)); - goto reopen_fcopy_fd; - } - - if (in_handshake) { - if (len != sizeof(buffer.kernel_modver)) { - syslog(LOG_ERR, "invalid version negotiation"); - exit(EXIT_FAILURE); - } - in_handshake = 0; - syslog(LOG_INFO, "kernel module version: %u", - buffer.kernel_modver); - continue; - } - - switch (buffer.hdr.operation) { - case START_FILE_COPY: - error = hv_start_fcopy(&buffer.start); - break; - case WRITE_TO_FILE: - error = hv_copy_data(&buffer.copy); - break; - case COMPLETE_FCOPY: - error = hv_copy_finished(); - break; - case CANCEL_FCOPY: - error = hv_copy_cancel(); - break; - - default: - error = HV_E_FAIL; - syslog(LOG_ERR, "Unknown operation: %d", - buffer.hdr.operation); - - } - - /* - * pwrite() may return an error due to the faked CANCEL_FCOPY - * message upon hibernation. Ignore the error by resetting the - * dev file, i.e. closing and re-opening it. - */ - if (pwrite(fcopy_fd, &error, sizeof(int), 0) != sizeof(int)) { - syslog(LOG_ERR, "pwrite failed: %s", strerror(errno)); - goto reopen_fcopy_fd; - } - } -} diff --git a/tools/hv/hv_fcopy_uio_daemon.c b/tools/hv/hv_fcopy_uio_daemon.c new file mode 100644 index 0000000000..3ce316cc9f --- /dev/null +++ b/tools/hv/hv_fcopy_uio_daemon.c @@ -0,0 +1,490 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * An implementation of host to guest copy functionality for Linux. + * + * Copyright (C) 2023, Microsoft, Inc. + * + * Author : K. Y. 
Srinivasan + * Author : Saurabh Sengar + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "vmbus_bufring.h" + +#define ICMSGTYPE_NEGOTIATE 0 +#define ICMSGTYPE_FCOPY 7 + +#define WIN8_SRV_MAJOR 1 +#define WIN8_SRV_MINOR 1 +#define WIN8_SRV_VERSION (WIN8_SRV_MAJOR << 16 | WIN8_SRV_MINOR) + +#define MAX_FOLDER_NAME 15 +#define MAX_PATH_LEN 15 +#define FCOPY_UIO "/sys/bus/vmbus/devices/eb765408-105f-49b6-b4aa-c123b64d17d4/uio" + +#define FCOPY_VER_COUNT 1 +static const int fcopy_versions[] = { + WIN8_SRV_VERSION +}; + +#define FW_VER_COUNT 1 +static const int fw_versions[] = { + UTIL_FW_VERSION +}; + +#define HV_RING_SIZE 0x4000 /* 16KB ring buffer size */ + +unsigned char desc[HV_RING_SIZE]; + +static int target_fd; +static char target_fname[PATH_MAX]; +static unsigned long long filesize; + +static int hv_fcopy_create_file(char *file_name, char *path_name, __u32 flags) +{ + int error = HV_E_FAIL; + char *q, *p; + + filesize = 0; + p = path_name; + snprintf(target_fname, sizeof(target_fname), "%s/%s", + path_name, file_name); + + /* + * Check to see if the path is already in place; if not, + * create if required. 
+ */ + while ((q = strchr(p, '/')) != NULL) { + if (q == p) { + p++; + continue; + } + *q = '\0'; + if (access(path_name, F_OK)) { + if (flags & CREATE_PATH) { + if (mkdir(path_name, 0755)) { + syslog(LOG_ERR, "Failed to create %s", + path_name); + goto done; + } + } else { + syslog(LOG_ERR, "Invalid path: %s", path_name); + goto done; + } + } + p = q + 1; + *q = '/'; + } + + if (!access(target_fname, F_OK)) { + syslog(LOG_INFO, "File: %s exists", target_fname); + if (!(flags & OVER_WRITE)) { + error = HV_ERROR_ALREADY_EXISTS; + goto done; + } + } + + target_fd = open(target_fname, + O_RDWR | O_CREAT | O_TRUNC | O_CLOEXEC, 0744); + if (target_fd == -1) { + syslog(LOG_INFO, "Open Failed: %s", strerror(errno)); + goto done; + } + + error = 0; +done: + if (error) + target_fname[0] = '\0'; + return error; +} + +/* copy the data into the file */ +static int hv_copy_data(struct hv_do_fcopy *cpmsg) +{ + ssize_t len; + int ret = 0; + + len = pwrite(target_fd, cpmsg->data, cpmsg->size, cpmsg->offset); + + filesize += cpmsg->size; + if (len != cpmsg->size) { + switch (errno) { + case ENOSPC: + ret = HV_ERROR_DISK_FULL; + break; + default: + ret = HV_E_FAIL; + break; + } + syslog(LOG_ERR, "pwrite failed to write %llu bytes: %ld (%s)", + filesize, (long)len, strerror(errno)); + } + + return ret; +} + +static int hv_copy_finished(void) +{ + close(target_fd); + target_fname[0] = '\0'; + + return 0; +} + +static void print_usage(char *argv[]) +{ + fprintf(stderr, "Usage: %s [options]\n" + "Options are:\n" + " -n, --no-daemon stay in foreground, don't daemonize\n" + " -h, --help print this help\n", argv[0]); +} + +static bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, unsigned char *buf, + unsigned int buflen, const int *fw_version, int fw_vercnt, + const int *srv_version, int srv_vercnt, + int *nego_fw_version, int *nego_srv_version) +{ + int icframe_major, icframe_minor; + int icmsg_major, icmsg_minor; + int fw_major, fw_minor; + int srv_major, srv_minor; + int i, j; 
+ bool found_match = false; + struct icmsg_negotiate *negop; + + /* Check that there's enough space for icframe_vercnt, icmsg_vercnt */ + if (buflen < ICMSG_HDR + offsetof(struct icmsg_negotiate, reserved)) { + syslog(LOG_ERR, "Invalid icmsg negotiate"); + return false; + } + + icmsghdrp->icmsgsize = 0x10; + negop = (struct icmsg_negotiate *)&buf[ICMSG_HDR]; + + icframe_major = negop->icframe_vercnt; + icframe_minor = 0; + + icmsg_major = negop->icmsg_vercnt; + icmsg_minor = 0; + + /* Validate negop packet */ + if (icframe_major > IC_VERSION_NEGOTIATION_MAX_VER_COUNT || + icmsg_major > IC_VERSION_NEGOTIATION_MAX_VER_COUNT || + ICMSG_NEGOTIATE_PKT_SIZE(icframe_major, icmsg_major) > buflen) { + syslog(LOG_ERR, "Invalid icmsg negotiate - icframe_major: %u, icmsg_major: %u\n", + icframe_major, icmsg_major); + goto fw_error; + } + + /* + * Select the framework version number we will + * support. + */ + + for (i = 0; i < fw_vercnt; i++) { + fw_major = (fw_version[i] >> 16); + fw_minor = (fw_version[i] & 0xFFFF); + + for (j = 0; j < negop->icframe_vercnt; j++) { + if (negop->icversion_data[j].major == fw_major && + negop->icversion_data[j].minor == fw_minor) { + icframe_major = negop->icversion_data[j].major; + icframe_minor = negop->icversion_data[j].minor; + found_match = true; + break; + } + } + + if (found_match) + break; + } + + if (!found_match) + goto fw_error; + + found_match = false; + + for (i = 0; i < srv_vercnt; i++) { + srv_major = (srv_version[i] >> 16); + srv_minor = (srv_version[i] & 0xFFFF); + + for (j = negop->icframe_vercnt; + (j < negop->icframe_vercnt + negop->icmsg_vercnt); + j++) { + if (negop->icversion_data[j].major == srv_major && + negop->icversion_data[j].minor == srv_minor) { + icmsg_major = negop->icversion_data[j].major; + icmsg_minor = negop->icversion_data[j].minor; + found_match = true; + break; + } + } + + if (found_match) + break; + } + + /* + * Respond with the framework and service + * version numbers we can support. 
+ */ +fw_error: + if (!found_match) { + negop->icframe_vercnt = 0; + negop->icmsg_vercnt = 0; + } else { + negop->icframe_vercnt = 1; + negop->icmsg_vercnt = 1; + } + + if (nego_fw_version) + *nego_fw_version = (icframe_major << 16) | icframe_minor; + + if (nego_srv_version) + *nego_srv_version = (icmsg_major << 16) | icmsg_minor; + + negop->icversion_data[0].major = icframe_major; + negop->icversion_data[0].minor = icframe_minor; + negop->icversion_data[1].major = icmsg_major; + negop->icversion_data[1].minor = icmsg_minor; + + return found_match; +} + +static void wcstoutf8(char *dest, const __u16 *src, size_t dest_size) +{ + size_t len = 0; + + while (len < dest_size) { + if (src[len] < 0x80) + dest[len++] = (char)(*src++); + else + dest[len++] = 'X'; + } + + dest[len] = '\0'; +} + +static int hv_fcopy_start(struct hv_start_fcopy *smsg_in) +{ + setlocale(LC_ALL, "en_US.utf8"); + size_t file_size, path_size; + char *file_name, *path_name; + char *in_file_name = (char *)smsg_in->file_name; + char *in_path_name = (char *)smsg_in->path_name; + + file_size = wcstombs(NULL, (const wchar_t *restrict)in_file_name, 0) + 1; + path_size = wcstombs(NULL, (const wchar_t *restrict)in_path_name, 0) + 1; + + file_name = (char *)malloc(file_size * sizeof(char)); + path_name = (char *)malloc(path_size * sizeof(char)); + + wcstoutf8(file_name, (__u16 *)in_file_name, file_size); + wcstoutf8(path_name, (__u16 *)in_path_name, path_size); + + return hv_fcopy_create_file(file_name, path_name, smsg_in->copy_flags); +} + +static int hv_fcopy_send_data(struct hv_fcopy_hdr *fcopy_msg, int recvlen) +{ + int operation = fcopy_msg->operation; + + /* + * The strings sent from the host are encoded in + * utf16; convert it to utf8 strings. + * The host assures us that the utf16 strings will not exceed + * the max lengths specified. 
We will however, reserve room + * for the string terminating character - in the utf16s_utf8s() + * function we limit the size of the buffer where the converted + * string is placed to W_MAX_PATH -1 to guarantee + * that the strings can be properly terminated! + */ + + switch (operation) { + case START_FILE_COPY: + return hv_fcopy_start((struct hv_start_fcopy *)fcopy_msg); + case WRITE_TO_FILE: + return hv_copy_data((struct hv_do_fcopy *)fcopy_msg); + case COMPLETE_FCOPY: + return hv_copy_finished(); + } + + return HV_E_FAIL; +} + +/* process the packet recv from host */ +static int fcopy_pkt_process(struct vmbus_br *txbr) +{ + int ret, offset, pktlen; + int fcopy_srv_version; + const struct vmbus_chanpkt_hdr *pkt; + struct hv_fcopy_hdr *fcopy_msg; + struct icmsg_hdr *icmsghdr; + + pkt = (const struct vmbus_chanpkt_hdr *)desc; + offset = pkt->hlen << 3; + pktlen = (pkt->tlen << 3) - offset; + icmsghdr = (struct icmsg_hdr *)&desc[offset + sizeof(struct vmbuspipe_hdr)]; + icmsghdr->status = HV_E_FAIL; + + if (icmsghdr->icmsgtype == ICMSGTYPE_NEGOTIATE) { + if (vmbus_prep_negotiate_resp(icmsghdr, desc + offset, pktlen, fw_versions, + FW_VER_COUNT, fcopy_versions, FCOPY_VER_COUNT, + NULL, &fcopy_srv_version)) { + syslog(LOG_INFO, "FCopy IC version %d.%d", + fcopy_srv_version >> 16, fcopy_srv_version & 0xFFFF); + icmsghdr->status = 0; + } + } else if (icmsghdr->icmsgtype == ICMSGTYPE_FCOPY) { + /* Ensure recvlen is big enough to contain hv_fcopy_hdr */ + if (pktlen < ICMSG_HDR + sizeof(struct hv_fcopy_hdr)) { + syslog(LOG_ERR, "Invalid Fcopy hdr. 
Packet length too small: %u", + pktlen); + return -ENOBUFS; + } + + fcopy_msg = (struct hv_fcopy_hdr *)&desc[offset + ICMSG_HDR]; + icmsghdr->status = hv_fcopy_send_data(fcopy_msg, pktlen); + } + + icmsghdr->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE; + ret = rte_vmbus_chan_send(txbr, 0x6, desc + offset, pktlen, 0); + if (ret) { + syslog(LOG_ERR, "Write to ringbuffer failed err: %d", ret); + return ret; + } + + return 0; +} + +static void fcopy_get_first_folder(char *path, char *chan_no) +{ + DIR *dir = opendir(path); + struct dirent *entry; + + if (!dir) { + syslog(LOG_ERR, "Failed to open directory (errno=%s).\n", strerror(errno)); + return; + } + + while ((entry = readdir(dir)) != NULL) { + if (entry->d_type == DT_DIR && strcmp(entry->d_name, ".") != 0 && + strcmp(entry->d_name, "..") != 0) { + strcpy(chan_no, entry->d_name); + break; + } + } + + closedir(dir); +} + +int main(int argc, char *argv[]) +{ + int fcopy_fd = -1, tmp = 1; + int daemonize = 1, long_index = 0, opt, ret = -EINVAL; + struct vmbus_br txbr, rxbr; + void *ring; + uint32_t len = HV_RING_SIZE; + char uio_name[MAX_FOLDER_NAME] = {0}; + char uio_dev_path[MAX_PATH_LEN] = {0}; + + static struct option long_options[] = { + {"help", no_argument, 0, 'h' }, + {"no-daemon", no_argument, 0, 'n' }, + {0, 0, 0, 0 } + }; + + while ((opt = getopt_long(argc, argv, "hn", long_options, + &long_index)) != -1) { + switch (opt) { + case 'n': + daemonize = 0; + break; + case 'h': + default: + print_usage(argv); + goto exit; + } + } + + if (daemonize && daemon(1, 0)) { + syslog(LOG_ERR, "daemon() failed; error: %s", strerror(errno)); + goto exit; + } + + openlog("HV_UIO_FCOPY", 0, LOG_USER); + syslog(LOG_INFO, "starting; pid is:%d", getpid()); + + fcopy_get_first_folder(FCOPY_UIO, uio_name); + snprintf(uio_dev_path, sizeof(uio_dev_path), "/dev/%s", uio_name); + fcopy_fd = open(uio_dev_path, O_RDWR); + + if (fcopy_fd < 0) { + syslog(LOG_ERR, "open %s failed; error: %d %s", + uio_dev_path, errno, 
strerror(errno)); + ret = fcopy_fd; + goto exit; + } + + ring = vmbus_uio_map(&fcopy_fd, HV_RING_SIZE); + if (!ring) { + ret = errno; + syslog(LOG_ERR, "mmap ringbuffer failed; error: %d %s", ret, strerror(ret)); + goto close; + } + vmbus_br_setup(&txbr, ring, HV_RING_SIZE); + vmbus_br_setup(&rxbr, (char *)ring + HV_RING_SIZE, HV_RING_SIZE); + + rxbr.vbr->imask = 0; + + while (1) { + /* + * In this loop we process fcopy messages after the + * handshake is complete. + */ + ret = pread(fcopy_fd, &tmp, sizeof(int), 0); + if (ret < 0) { + syslog(LOG_ERR, "pread failed: %s", strerror(errno)); + continue; + } + + len = HV_RING_SIZE; + ret = rte_vmbus_chan_recv_raw(&rxbr, desc, &len); + if (unlikely(ret <= 0)) { + /* This indicates a failure to communicate (or worse) */ + syslog(LOG_ERR, "VMBus channel recv error: %d", ret); + } else { + ret = fcopy_pkt_process(&txbr); + if (ret < 0) + goto close; + + /* Signal host */ + if ((write(fcopy_fd, &tmp, sizeof(int))) != sizeof(int)) { + ret = errno; + syslog(LOG_ERR, "Signal to host failed: %s\n", strerror(ret)); + goto close; + } + } + } +close: + close(fcopy_fd); +exit: + return ret; +} diff --git a/tools/hv/vmbus_bufring.c b/tools/hv/vmbus_bufring.c new file mode 100644 index 0000000000..bac32c1109 --- /dev/null +++ b/tools/hv/vmbus_bufring.c @@ -0,0 +1,318 @@ +// SPDX-License-Identifier: BSD-3-Clause +/* + * Copyright (c) 2009-2012,2016,2023 Microsoft Corp. + * Copyright (c) 2012 NetApp Inc. + * Copyright (c) 2012 Citrix Inc. + * All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "vmbus_bufring.h" + +/** + * Compiler barrier. + * + * Guarantees that operation reordering does not occur at compile time + * for operations directly before and after the barrier. 
+ */ +#define rte_compiler_barrier() ({ asm volatile ("" : : : "memory"); }) + +#define VMBUS_RQST_ERROR 0xFFFFFFFFFFFFFFFF +#define ALIGN(val, align) ((typeof(val))((val) & (~((typeof(val))((align) - 1))))) + +void *vmbus_uio_map(int *fd, int size) +{ + void *map; + + map = mmap(NULL, 2 * size, PROT_READ | PROT_WRITE, MAP_SHARED, *fd, 0); + if (map == MAP_FAILED) + return NULL; + + return map; +} + +/* Increase bufring index by inc with wraparound */ +static inline uint32_t vmbus_br_idxinc(uint32_t idx, uint32_t inc, uint32_t sz) +{ + idx += inc; + if (idx >= sz) + idx -= sz; + + return idx; +} + +void vmbus_br_setup(struct vmbus_br *br, void *buf, unsigned int blen) +{ + br->vbr = buf; + br->windex = br->vbr->windex; + br->dsize = blen - sizeof(struct vmbus_bufring); +} + +static inline __always_inline void +rte_smp_mb(void) +{ + asm volatile("lock addl $0, -128(%%rsp); " ::: "memory"); +} + +static inline int +rte_atomic32_cmpset(volatile uint32_t *dst, uint32_t exp, uint32_t src) +{ + uint8_t res; + + asm volatile("lock ; " + "cmpxchgl %[src], %[dst];" + "sete %[res];" + : [res] "=a" (res), /* output */ + [dst] "=m" (*dst) + : [src] "r" (src), /* input */ + "a" (exp), + "m" (*dst) + : "memory"); /* no-clobber list */ + return res; +} + +static inline uint32_t +vmbus_txbr_copyto(const struct vmbus_br *tbr, uint32_t windex, + const void *src0, uint32_t cplen) +{ + uint8_t *br_data = tbr->vbr->data; + uint32_t br_dsize = tbr->dsize; + const uint8_t *src = src0; + + /* XXX use double mapping like Linux kernel? */ + if (cplen > br_dsize - windex) { + uint32_t fraglen = br_dsize - windex; + + /* Wrap-around detected */ + memcpy(br_data + windex, src, fraglen); + memcpy(br_data, src + fraglen, cplen - fraglen); + } else { + memcpy(br_data + windex, src, cplen); + } + + return vmbus_br_idxinc(windex, cplen, br_dsize); +} + +/* + * Write scattered channel packet to TX bufring. 
+ * + * The offset of this channel packet is written as a 64bits value + * immediately after this channel packet. + * + * The write goes through three stages: + * 1. Reserve space in ring buffer for the new data. + * Writer atomically moves priv_write_index. + * 2. Copy the new data into the ring. + * 3. Update the tail of the ring (visible to host) that indicates + * next read location. Writer updates write_index + */ +static int +vmbus_txbr_write(struct vmbus_br *tbr, const struct iovec iov[], int iovlen) +{ + struct vmbus_bufring *vbr = tbr->vbr; + uint32_t ring_size = tbr->dsize; + uint32_t old_windex, next_windex, windex, total; + uint64_t save_windex; + int i; + + total = 0; + for (i = 0; i < iovlen; i++) + total += iov[i].iov_len; + total += sizeof(save_windex); + + /* Reserve space in ring */ + do { + uint32_t avail; + + /* Get current free location */ + old_windex = tbr->windex; + + /* Prevent compiler reordering this with calculation */ + rte_compiler_barrier(); + + avail = vmbus_br_availwrite(tbr, old_windex); + + /* If not enough space in ring, then tell caller. */ + if (avail <= total) + return -EAGAIN; + + next_windex = vmbus_br_idxinc(old_windex, total, ring_size); + + /* Atomic update of next write_index for other threads */ + } while (!rte_atomic32_cmpset(&tbr->windex, old_windex, next_windex)); + + /* Space from old..new is now reserved */ + windex = old_windex; + for (i = 0; i < iovlen; i++) + windex = vmbus_txbr_copyto(tbr, windex, iov[i].iov_base, iov[i].iov_len); + + /* Set the offset of the current channel packet. */ + save_windex = ((uint64_t)old_windex) << 32; + windex = vmbus_txbr_copyto(tbr, windex, &save_windex, + sizeof(save_windex)); + + /* The region reserved should match region used */ + if (windex != next_windex) + return -EINVAL; + + /* Ensure that data is available before updating host index */ + rte_compiler_barrier(); + + /* Checkin for our reservation. 
wait for our turn to update host */ + while (!rte_atomic32_cmpset(&vbr->windex, old_windex, next_windex)) + _mm_pause(); + + return 0; +} + +int rte_vmbus_chan_send(struct vmbus_br *txbr, uint16_t type, void *data, + uint32_t dlen, uint32_t flags) +{ + struct vmbus_chanpkt pkt; + unsigned int pktlen, pad_pktlen; + const uint32_t hlen = sizeof(pkt); + uint64_t pad = 0; + struct iovec iov[3]; + int error; + + pktlen = hlen + dlen; + pad_pktlen = ALIGN(pktlen, sizeof(uint64_t)); + + pkt.hdr.type = type; + pkt.hdr.flags = flags; + pkt.hdr.hlen = hlen >> VMBUS_CHANPKT_SIZE_SHIFT; + pkt.hdr.tlen = pad_pktlen >> VMBUS_CHANPKT_SIZE_SHIFT; + pkt.hdr.xactid = VMBUS_RQST_ERROR; + + iov[0].iov_base = &pkt; + iov[0].iov_len = hlen; + iov[1].iov_base = data; + iov[1].iov_len = dlen; + iov[2].iov_base = &pad; + iov[2].iov_len = pad_pktlen - pktlen; + + error = vmbus_txbr_write(txbr, iov, 3); + + return error; +} + +static inline uint32_t +vmbus_rxbr_copyfrom(const struct vmbus_br *rbr, uint32_t rindex, + void *dst0, size_t cplen) +{ + const uint8_t *br_data = rbr->vbr->data; + uint32_t br_dsize = rbr->dsize; + uint8_t *dst = dst0; + + if (cplen > br_dsize - rindex) { + uint32_t fraglen = br_dsize - rindex; + + /* Wrap-around detected. */ + memcpy(dst, br_data + rindex, fraglen); + memcpy(dst + fraglen, br_data, cplen - fraglen); + } else { + memcpy(dst, br_data + rindex, cplen); + } + + return vmbus_br_idxinc(rindex, cplen, br_dsize); +} + +/* Copy data from receive ring but don't change index */ +static int +vmbus_rxbr_peek(const struct vmbus_br *rbr, void *data, size_t dlen) +{ + uint32_t avail; + + /* + * The requested data and the 64bits channel packet + * offset should be there at least. + */ + avail = vmbus_br_availread(rbr); + if (avail < dlen + sizeof(uint64_t)) + return -EAGAIN; + + vmbus_rxbr_copyfrom(rbr, rbr->vbr->rindex, data, dlen); + return 0; +} + +/* + * Copy data from receive ring and change index + * NOTE: + * We assume (dlen + skip) == sizeof(channel packet). 
+ */ +static int +vmbus_rxbr_read(struct vmbus_br *rbr, void *data, size_t dlen, size_t skip) +{ + struct vmbus_bufring *vbr = rbr->vbr; + uint32_t br_dsize = rbr->dsize; + uint32_t rindex; + + if (vmbus_br_availread(rbr) < dlen + skip + sizeof(uint64_t)) + return -EAGAIN; + + /* Record where host was when we started read (for debug) */ + rbr->windex = rbr->vbr->windex; + + /* + * Copy channel packet from RX bufring. + */ + rindex = vmbus_br_idxinc(rbr->vbr->rindex, skip, br_dsize); + rindex = vmbus_rxbr_copyfrom(rbr, rindex, data, dlen); + + /* + * Discard this channel packet's 64bits offset, which is useless to us. + */ + rindex = vmbus_br_idxinc(rindex, sizeof(uint64_t), br_dsize); + + /* Update the read index _after_ the channel packet is fetched. */ + rte_compiler_barrier(); + + vbr->rindex = rindex; + + return 0; +} + +int rte_vmbus_chan_recv_raw(struct vmbus_br *rxbr, + void *data, uint32_t *len) +{ + struct vmbus_chanpkt_hdr pkt; + uint32_t dlen, bufferlen = *len; + int error; + + error = vmbus_rxbr_peek(rxbr, &pkt, sizeof(pkt)); + if (error) + return error; + + if (unlikely(pkt.hlen < VMBUS_CHANPKT_HLEN_MIN)) + /* XXX this channel is dead actually. 
*/ + return -EIO; + + if (unlikely(pkt.hlen > pkt.tlen)) + return -EIO; + + /* Length are in quad words */ + dlen = pkt.tlen << VMBUS_CHANPKT_SIZE_SHIFT; + *len = dlen; + + /* If caller buffer is not large enough */ + if (unlikely(dlen > bufferlen)) + return -ENOBUFS; + + /* Read data and skip packet header */ + error = vmbus_rxbr_read(rxbr, data, dlen, 0); + if (error) + return error; + + /* Return the number of bytes read */ + return dlen + sizeof(uint64_t); +} diff --git a/tools/hv/vmbus_bufring.h b/tools/hv/vmbus_bufring.h new file mode 100644 index 0000000000..6e7caacfff --- /dev/null +++ b/tools/hv/vmbus_bufring.h @@ -0,0 +1,158 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ + +#ifndef _VMBUS_BUF_H_ +#define _VMBUS_BUF_H_ + +#include +#include + +#define __packed __attribute__((__packed__)) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#define ICMSGHDRFLAG_TRANSACTION 1 +#define ICMSGHDRFLAG_REQUEST 2 +#define ICMSGHDRFLAG_RESPONSE 4 + +#define IC_VERSION_NEGOTIATION_MAX_VER_COUNT 100 +#define ICMSG_HDR (sizeof(struct vmbuspipe_hdr) + sizeof(struct icmsg_hdr)) +#define ICMSG_NEGOTIATE_PKT_SIZE(icframe_vercnt, icmsg_vercnt) \ + (ICMSG_HDR + sizeof(struct icmsg_negotiate) + \ + (((icframe_vercnt) + (icmsg_vercnt)) * sizeof(struct ic_version))) + +/* + * Channel packets + */ + +/* Channel packet flags */ +#define VMBUS_CHANPKT_TYPE_INBAND 0x0006 +#define VMBUS_CHANPKT_TYPE_RXBUF 0x0007 +#define VMBUS_CHANPKT_TYPE_GPA 0x0009 +#define VMBUS_CHANPKT_TYPE_COMP 0x000b + +#define VMBUS_CHANPKT_FLAG_NONE 0 +#define VMBUS_CHANPKT_FLAG_RC 0x0001 /* report completion */ + +#define VMBUS_CHANPKT_SIZE_SHIFT 3 +#define VMBUS_CHANPKT_SIZE_ALIGN BIT(VMBUS_CHANPKT_SIZE_SHIFT) +#define VMBUS_CHANPKT_HLEN_MIN \ + (sizeof(struct vmbus_chanpkt_hdr) >> VMBUS_CHANPKT_SIZE_SHIFT) + +/* + * Buffer ring + */ +struct vmbus_bufring { + volatile uint32_t windex; + volatile uint32_t rindex; + + /* + * Interrupt mask {0,1} + * + * For TX bufring, host set this to 1, when it is 
processing + * the TX bufring, so that we can safely skip the TX event + * notification to host. + * + * For RX bufring, once this is set to 1 by us, host will not + * further dispatch interrupts to us, even if there are data + * pending on the RX bufring. This effectively disables the + * interrupt of the channel to which this RX bufring is attached. + */ + volatile uint32_t imask; + + /* + * Win8 uses some of the reserved bits to implement + * interrupt driven flow management. On the send side + * we can request that the receiver interrupt the sender + * when the ring transitions from being full to being able + * to handle a message of size "pending_send_sz". + * + * Add necessary state for this enhancement. + */ + volatile uint32_t pending_send; + uint32_t reserved1[12]; + + union { + struct { + uint32_t feat_pending_send_sz:1; + }; + uint32_t value; + } feature_bits; + + /* Pad it to rte_mem_page_size() so that data starts on page boundary */ + uint8_t reserved2[4028]; + + /* + * Ring data starts here + RingDataStartOffset + * !!! DO NOT place any fields below this !!! 
+ */ + uint8_t data[]; +} __packed; + +struct vmbus_br { + struct vmbus_bufring *vbr; + uint32_t dsize; + uint32_t windex; /* next available location */ +}; + +struct vmbus_chanpkt_hdr { + uint16_t type; /* VMBUS_CHANPKT_TYPE_ */ + uint16_t hlen; /* header len, in 8 bytes */ + uint16_t tlen; /* total len, in 8 bytes */ + uint16_t flags; /* VMBUS_CHANPKT_FLAG_ */ + uint64_t xactid; +} __packed; + +struct vmbus_chanpkt { + struct vmbus_chanpkt_hdr hdr; +} __packed; + +struct vmbuspipe_hdr { + unsigned int flags; + unsigned int msgsize; +} __packed; + +struct ic_version { + unsigned short major; + unsigned short minor; +} __packed; + +struct icmsg_negotiate { + unsigned short icframe_vercnt; + unsigned short icmsg_vercnt; + unsigned int reserved; + struct ic_version icversion_data[]; /* any size array */ +} __packed; + +struct icmsg_hdr { + struct ic_version icverframe; + unsigned short icmsgtype; + struct ic_version icvermsg; + unsigned short icmsgsize; + unsigned int status; + unsigned char ictransaction_id; + unsigned char icflags; + unsigned char reserved[2]; +} __packed; + +int rte_vmbus_chan_recv_raw(struct vmbus_br *rxbr, void *data, uint32_t *len); +int rte_vmbus_chan_send(struct vmbus_br *txbr, uint16_t type, void *data, + uint32_t dlen, uint32_t flags); +void vmbus_br_setup(struct vmbus_br *br, void *buf, unsigned int blen); +void *vmbus_uio_map(int *fd, int size); + +/* Amount of space available for write */ +static inline uint32_t vmbus_br_availwrite(const struct vmbus_br *br, uint32_t windex) +{ + uint32_t rindex = br->vbr->rindex; + + if (windex >= rindex) + return br->dsize - (windex - rindex); + else + return rindex - windex; +} + +static inline uint32_t vmbus_br_availread(const struct vmbus_br *br) +{ + return br->dsize - vmbus_br_availwrite(br, br->vbr->windex); +} + +#endif /* !_VMBUS_BUF_H_ */ diff --git a/tools/include/asm-generic/bitops/__ffs.h b/tools/include/asm-generic/bitops/__ffs.h index 9d13105194..2d94c1e9b2 100644 --- 
a/tools/include/asm-generic/bitops/__ffs.h +++ b/tools/include/asm-generic/bitops/__ffs.h @@ -11,9 +11,9 @@ * * Undefined if no bit exists, so code should check against 0 first. */ -static __always_inline unsigned long __ffs(unsigned long word) +static __always_inline unsigned int __ffs(unsigned long word) { - int num = 0; + unsigned int num = 0; #if __BITS_PER_LONG == 64 if ((word & 0xffffffff) == 0) { diff --git a/tools/include/asm-generic/bitops/__fls.h b/tools/include/asm-generic/bitops/__fls.h index 54ccccf96e..e974ec932e 100644 --- a/tools/include/asm-generic/bitops/__fls.h +++ b/tools/include/asm-generic/bitops/__fls.h @@ -10,9 +10,9 @@ * * Undefined if no set bit exists, so code should check against 0 first. */ -static __always_inline unsigned long generic___fls(unsigned long word) +static __always_inline unsigned int generic___fls(unsigned long word) { - int num = BITS_PER_LONG - 1; + unsigned int num = BITS_PER_LONG - 1; #if BITS_PER_LONG == 64 if (!(word & (~0ul << 32))) { diff --git a/tools/include/linux/align.h b/tools/include/linux/align.h new file mode 100644 index 0000000000..14e34ace80 --- /dev/null +++ b/tools/include/linux/align.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef _TOOLS_LINUX_ALIGN_H +#define _TOOLS_LINUX_ALIGN_H + +#include + +#define ALIGN(x, a) __ALIGN_KERNEL((x), (a)) +#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a)) +#define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0) + +#endif /* _TOOLS_LINUX_ALIGN_H */ diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index f3566ea0f9..210c13b1b8 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -3,6 +3,7 @@ #define _TOOLS_LINUX_BITMAP_H #include +#include #include #include #include @@ -25,13 +26,14 @@ bool __bitmap_intersects(const unsigned long *bitmap1, #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & 
(BITS_PER_LONG - 1))) +#define bitmap_size(nbits) (ALIGN(nbits, BITS_PER_LONG) / BITS_PER_BYTE) + static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = 0UL; else { - int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); - memset(dst, 0, len); + memset(dst, 0, bitmap_size(nbits)); } } @@ -83,7 +85,7 @@ static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, */ static inline unsigned long *bitmap_zalloc(int nbits) { - return calloc(1, BITS_TO_LONGS(nbits) * sizeof(unsigned long)); + return calloc(1, bitmap_size(nbits)); } /* @@ -126,7 +128,6 @@ static inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, #define BITMAP_MEM_ALIGNMENT (8 * sizeof(unsigned long)) #endif #define BITMAP_MEM_MASK (BITMAP_MEM_ALIGNMENT - 1) -#define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0) static inline bool bitmap_equal(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) diff --git a/tools/include/linux/bitops.h b/tools/include/linux/bitops.h index 7319f6ced1..b4e4cd071f 100644 --- a/tools/include/linux/bitops.h +++ b/tools/include/linux/bitops.h @@ -20,6 +20,8 @@ #define BITS_TO_U32(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(u32)) #define BITS_TO_BYTES(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(char)) +#define BYTES_TO_BITS(nb) ((nb) * BITS_PER_BYTE) + extern unsigned int __sw_hweight8(unsigned int w); extern unsigned int __sw_hweight16(unsigned int w); extern unsigned int __sw_hweight32(unsigned int w); @@ -70,7 +72,7 @@ static inline unsigned long hweight_long(unsigned long w) return sizeof(w) == 4 ? 
hweight32(w) : hweight64(w); } -static inline unsigned fls_long(unsigned long l) +static inline unsigned int fls_long(unsigned long l) { if (sizeof(l) == 4) return fls(l); diff --git a/tools/include/linux/bits.h b/tools/include/linux/bits.h index 7c0cf5031a..0eb24d21aa 100644 --- a/tools/include/linux/bits.h +++ b/tools/include/linux/bits.h @@ -4,6 +4,7 @@ #include #include +#include #include #define BIT_MASK(nr) (UL(1) << ((nr) % BITS_PER_LONG)) @@ -30,15 +31,8 @@ #define GENMASK_INPUT_CHECK(h, l) 0 #endif -#define __GENMASK(h, l) \ - (((~UL(0)) - (UL(1) << (l)) + 1) & \ - (~UL(0) >> (BITS_PER_LONG - 1 - (h)))) #define GENMASK(h, l) \ (GENMASK_INPUT_CHECK(h, l) + __GENMASK(h, l)) - -#define __GENMASK_ULL(h, l) \ - (((~ULL(0)) - (ULL(1) << (l)) + 1) & \ - (~ULL(0) >> (BITS_PER_LONG_LONG - 1 - (h)))) #define GENMASK_ULL(h, l) \ (GENMASK_INPUT_CHECK(h, l) + __GENMASK_ULL(h, l)) diff --git a/tools/include/linux/compiler.h b/tools/include/linux/compiler.h index 7b65566f3e..8a63a99134 100644 --- a/tools/include/linux/compiler.h +++ b/tools/include/linux/compiler.h @@ -58,6 +58,10 @@ #define noinline #endif +#ifndef __nocf_check +#define __nocf_check __attribute__((nocf_check)) +#endif + /* Are two types/vars the same type (ignoring qualifiers)? 
*/ #ifndef __same_type # define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h index 736bdeccdf..65aa8ce142 100644 --- a/tools/include/linux/filter.h +++ b/tools/include/linux/filter.h @@ -111,6 +111,24 @@ .off = 0, \ .imm = IMM }) +/* Short form of movsx, dst_reg = (s8,s16,s32)src_reg */ + +#define BPF_MOVSX64_REG(DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +#define BPF_MOVSX32_REG(DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + /* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32 */ #define BPF_MOV64_RAW(TYPE, DST, SRC, IMM) \ diff --git a/tools/include/linux/mm.h b/tools/include/linux/mm.h index 7d73da0980..dc0fc7125b 100644 --- a/tools/include/linux/mm.h +++ b/tools/include/linux/mm.h @@ -2,8 +2,8 @@ #ifndef _TOOLS_LINUX_MM_H #define _TOOLS_LINUX_MM_H +#include #include -#include #define PAGE_SHIFT 12 #define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) @@ -11,9 +11,6 @@ #define PHYS_ADDR_MAX (~(phys_addr_t)0) -#define ALIGN(x, a) __ALIGN_KERNEL((x), (a)) -#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a)) - #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE) #define __va(x) ((void *)((unsigned long)(x))) diff --git a/tools/include/linux/rbtree_augmented.h b/tools/include/linux/rbtree_augmented.h index 570bb97944..95483c7d81 100644 --- a/tools/include/linux/rbtree_augmented.h +++ b/tools/include/linux/rbtree_augmented.h @@ -158,13 +158,13 @@ RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) { - rb->__rb_parent_color = rb_color(rb) | (unsigned long)p; + rb->__rb_parent_color = rb_color(rb) + (unsigned long)p; } static inline void rb_set_parent_color(struct rb_node 
*rb, struct rb_node *p, int color) { - rb->__rb_parent_color = (unsigned long)p | color; + rb->__rb_parent_color = (unsigned long)p + color; } static inline void diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h index a01c69dd49..f9ab28421e 100644 --- a/tools/include/nolibc/string.h +++ b/tools/include/nolibc/string.h @@ -123,7 +123,7 @@ char *strcpy(char *dst, const char *src) * thus itself, hence the asm() statement below that's meant to disable this * confusing practice. */ -static __attribute__((unused)) +__attribute__((weak,unused,section(".text.nolibc_strlen"))) size_t strlen(const char *str) { size_t len; @@ -187,22 +187,26 @@ char *strndup(const char *str, size_t maxlen) static __attribute__((unused)) size_t strlcat(char *dst, const char *src, size_t size) { - size_t len; - char c; - - for (len = 0; dst[len]; len++) - ; - - for (;;) { - c = *src; - if (len < size) - dst[len] = c; - if (!c) + size_t len = strnlen(dst, size); + + /* + * We want len < size-1. But as size is unsigned and can wrap + * around, we use len + 1 instead. 
+ */ + while (len + 1 < size) { + dst[len] = *src; + if (*src == '\0') break; len++; src++; } + if (len < size) + dst[len] = '\0'; + + while (*src++) + len++; + return len; } @@ -210,16 +214,18 @@ static __attribute__((unused)) size_t strlcpy(char *dst, const char *src, size_t size) { size_t len; - char c; - for (len = 0;;) { - c = src[len]; - if (len < size) - dst[len] = c; - if (!c) - break; - len++; + for (len = 0; len < size; len++) { + dst[len] = src[len]; + if (!dst[len]) + return len; } + if (size) + dst[size-1] = '\0'; + + while (src[len]) + len++; + return len; } diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h index dda9dffd1d..7b82bc3cf1 100644 --- a/tools/include/nolibc/sys.h +++ b/tools/include/nolibc/sys.h @@ -22,6 +22,7 @@ #include /* for statx() */ #include #include +#include #include "arch.h" #include "errno.h" @@ -1139,6 +1140,32 @@ int umount2(const char *path, int flags) } +/* + * int uname(struct utsname *buf); + */ + +struct utsname { + char sysname[65]; + char nodename[65]; + char release[65]; + char version[65]; + char machine[65]; + char domainname[65]; +}; + +static __attribute__((unused)) +int sys_uname(struct utsname *buf) +{ + return my_syscall1(__NR_uname, buf); +} + +static __attribute__((unused)) +int uname(struct utsname *buf) +{ + return __sysret(sys_uname(buf)); +} + + /* * int unlink(const char *path); */ diff --git a/tools/include/uapi/asm-generic/bitsperlong.h b/tools/include/uapi/asm-generic/bitsperlong.h index 352cb81947..fadb3f857f 100644 --- a/tools/include/uapi/asm-generic/bitsperlong.h +++ b/tools/include/uapi/asm-generic/bitsperlong.h @@ -24,4 +24,8 @@ #endif #endif +#ifndef __BITS_PER_LONG_LONG +#define __BITS_PER_LONG_LONG 64 +#endif + #endif /* _UAPI__ASM_GENERIC_BITS_PER_LONG */ diff --git a/tools/include/uapi/asm-generic/fcntl.h b/tools/include/uapi/asm-generic/fcntl.h deleted file mode 100644 index 1c7a0f6632..0000000000 --- a/tools/include/uapi/asm-generic/fcntl.h +++ /dev/null @@ -1,221 +0,0 
@@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _ASM_GENERIC_FCNTL_H -#define _ASM_GENERIC_FCNTL_H - -#include - -/* - * FMODE_EXEC is 0x20 - * FMODE_NONOTIFY is 0x4000000 - * These cannot be used by userspace O_* until internal and external open - * flags are split. - * -Eric Paris - */ - -/* - * When introducing new O_* bits, please check its uniqueness in fcntl_init(). - */ - -#define O_ACCMODE 00000003 -#define O_RDONLY 00000000 -#define O_WRONLY 00000001 -#define O_RDWR 00000002 -#ifndef O_CREAT -#define O_CREAT 00000100 /* not fcntl */ -#endif -#ifndef O_EXCL -#define O_EXCL 00000200 /* not fcntl */ -#endif -#ifndef O_NOCTTY -#define O_NOCTTY 00000400 /* not fcntl */ -#endif -#ifndef O_TRUNC -#define O_TRUNC 00001000 /* not fcntl */ -#endif -#ifndef O_APPEND -#define O_APPEND 00002000 -#endif -#ifndef O_NONBLOCK -#define O_NONBLOCK 00004000 -#endif -#ifndef O_DSYNC -#define O_DSYNC 00010000 /* used to be O_SYNC, see below */ -#endif -#ifndef FASYNC -#define FASYNC 00020000 /* fcntl, for BSD compatibility */ -#endif -#ifndef O_DIRECT -#define O_DIRECT 00040000 /* direct disk access hint */ -#endif -#ifndef O_LARGEFILE -#define O_LARGEFILE 00100000 -#endif -#ifndef O_DIRECTORY -#define O_DIRECTORY 00200000 /* must be a directory */ -#endif -#ifndef O_NOFOLLOW -#define O_NOFOLLOW 00400000 /* don't follow links */ -#endif -#ifndef O_NOATIME -#define O_NOATIME 01000000 -#endif -#ifndef O_CLOEXEC -#define O_CLOEXEC 02000000 /* set close_on_exec */ -#endif - -/* - * Before Linux 2.6.33 only O_DSYNC semantics were implemented, but using - * the O_SYNC flag. We continue to use the existing numerical value - * for O_DSYNC semantics now, but using the correct symbolic name for it. - * This new value is used to request true Posix O_SYNC semantics. It is - * defined in this strange way to make sure applications compiled against - * new headers get at least O_DSYNC semantics on older kernels. 
- * - * This has the nice side-effect that we can simply test for O_DSYNC - * wherever we do not care if O_DSYNC or O_SYNC is used. - * - * Note: __O_SYNC must never be used directly. - */ -#ifndef O_SYNC -#define __O_SYNC 04000000 -#define O_SYNC (__O_SYNC|O_DSYNC) -#endif - -#ifndef O_PATH -#define O_PATH 010000000 -#endif - -#ifndef __O_TMPFILE -#define __O_TMPFILE 020000000 -#endif - -/* a horrid kludge trying to make sure that this will fail on old kernels */ -#define O_TMPFILE (__O_TMPFILE | O_DIRECTORY) - -#ifndef O_NDELAY -#define O_NDELAY O_NONBLOCK -#endif - -#define F_DUPFD 0 /* dup */ -#define F_GETFD 1 /* get close_on_exec */ -#define F_SETFD 2 /* set/clear close_on_exec */ -#define F_GETFL 3 /* get file->f_flags */ -#define F_SETFL 4 /* set file->f_flags */ -#ifndef F_GETLK -#define F_GETLK 5 -#define F_SETLK 6 -#define F_SETLKW 7 -#endif -#ifndef F_SETOWN -#define F_SETOWN 8 /* for sockets. */ -#define F_GETOWN 9 /* for sockets. */ -#endif -#ifndef F_SETSIG -#define F_SETSIG 10 /* for sockets. */ -#define F_GETSIG 11 /* for sockets. */ -#endif - -#if __BITS_PER_LONG == 32 || defined(__KERNEL__) -#ifndef F_GETLK64 -#define F_GETLK64 12 /* using 'struct flock64' */ -#define F_SETLK64 13 -#define F_SETLKW64 14 -#endif -#endif /* __BITS_PER_LONG == 32 || defined(__KERNEL__) */ - -#ifndef F_SETOWN_EX -#define F_SETOWN_EX 15 -#define F_GETOWN_EX 16 -#endif - -#ifndef F_GETOWNER_UIDS -#define F_GETOWNER_UIDS 17 -#endif - -/* - * Open File Description Locks - * - * Usually record locks held by a process are released on *any* close and are - * not inherited across a fork(). - * - * These cmd values will set locks that conflict with process-associated - * record locks, but are "owned" by the open file description, not the - * process. This means that they are inherited across fork() like BSD (flock) - * locks, and they are only released automatically when the last reference to - * the open file against which they were acquired is put. 
- */ -#define F_OFD_GETLK 36 -#define F_OFD_SETLK 37 -#define F_OFD_SETLKW 38 - -#define F_OWNER_TID 0 -#define F_OWNER_PID 1 -#define F_OWNER_PGRP 2 - -struct f_owner_ex { - int type; - __kernel_pid_t pid; -}; - -/* for F_[GET|SET]FL */ -#define FD_CLOEXEC 1 /* actually anything with low bit set goes */ - -/* for posix fcntl() and lockf() */ -#ifndef F_RDLCK -#define F_RDLCK 0 -#define F_WRLCK 1 -#define F_UNLCK 2 -#endif - -/* for old implementation of bsd flock () */ -#ifndef F_EXLCK -#define F_EXLCK 4 /* or 3 */ -#define F_SHLCK 8 /* or 4 */ -#endif - -/* operations for bsd flock(), also used by the kernel implementation */ -#define LOCK_SH 1 /* shared lock */ -#define LOCK_EX 2 /* exclusive lock */ -#define LOCK_NB 4 /* or'd with one of the above to prevent - blocking */ -#define LOCK_UN 8 /* remove lock */ - -/* - * LOCK_MAND support has been removed from the kernel. We leave the symbols - * here to not break legacy builds, but these should not be used in new code. - */ -#define LOCK_MAND 32 /* This is a mandatory flock ... 
*/ -#define LOCK_READ 64 /* which allows concurrent read operations */ -#define LOCK_WRITE 128 /* which allows concurrent write operations */ -#define LOCK_RW 192 /* which allows concurrent read & write ops */ - -#define F_LINUX_SPECIFIC_BASE 1024 - -#ifndef HAVE_ARCH_STRUCT_FLOCK -struct flock { - short l_type; - short l_whence; - __kernel_off_t l_start; - __kernel_off_t l_len; - __kernel_pid_t l_pid; -#ifdef __ARCH_FLOCK_EXTRA_SYSID - __ARCH_FLOCK_EXTRA_SYSID -#endif -#ifdef __ARCH_FLOCK_PAD - __ARCH_FLOCK_PAD -#endif -}; - -struct flock64 { - short l_type; - short l_whence; - __kernel_loff_t l_start; - __kernel_loff_t l_len; - __kernel_pid_t l_pid; -#ifdef __ARCH_FLOCK64_PAD - __ARCH_FLOCK64_PAD -#endif -}; -#endif /* HAVE_ARCH_STRUCT_FLOCK */ - -#endif /* _ASM_GENERIC_FCNTL_H */ diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index 75f00965ab..d983c48a3b 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -842,8 +842,11 @@ __SYSCALL(__NR_lsm_set_self_attr, sys_lsm_set_self_attr) #define __NR_lsm_list_modules 461 __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules) +#define __NR_mseal 462 +__SYSCALL(__NR_mseal, sys_mseal) + #undef __NR_syscalls -#define __NR_syscalls 462 +#define __NR_syscalls 463 /* * 32 bit systems traditionally used different diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h index 2ee338860b..d4d86e566e 100644 --- a/tools/include/uapi/drm/i915_drm.h +++ b/tools/include/uapi/drm/i915_drm.h @@ -806,6 +806,12 @@ typedef struct drm_i915_irq_wait { */ #define I915_PARAM_PXP_STATUS 58 +/* + * Query if kernel allows marking a context to send a Freq hint to SLPC. This + * will enable use of the strategies allowed by the SLPC algorithm. 
+ */ +#define I915_PARAM_HAS_CONTEXT_FREQ_HINT 59 + /* Must be kept compact -- no holes and well documented */ /** @@ -2148,6 +2154,15 @@ struct drm_i915_gem_context_param { * -EIO: The firmware did not succeed in creating the protected context. */ #define I915_CONTEXT_PARAM_PROTECTED_CONTENT 0xd + +/* + * I915_CONTEXT_PARAM_LOW_LATENCY: + * + * Mark this context as a low latency workload which requires aggressive GT + * frequency scaling. Use I915_PARAM_HAS_CONTEXT_FREQ_HINT to check if the kernel + * supports this per context flag. + */ +#define I915_CONTEXT_PARAM_LOW_LATENCY 0xe /* Must be kept compact -- no holes and well documented */ /** @value: Context parameter value to be set or queried */ @@ -2623,19 +2638,29 @@ struct drm_i915_reg_read { * */ +/* + * struct drm_i915_reset_stats - Return global reset and other context stats + * + * Driver keeps few stats for each contexts and also global reset count. + * This struct can be used to query those stats. + */ struct drm_i915_reset_stats { + /** @ctx_id: ID of the requested context */ __u32 ctx_id; + + /** @flags: MBZ */ __u32 flags; - /* All resets since boot/module reload, for all contexts */ + /** @reset_count: All resets since boot/module reload, for all contexts */ __u32 reset_count; - /* Number of batches lost when active in GPU, for this context */ + /** @batch_active: Number of batches lost when active in GPU, for this context */ __u32 batch_active; - /* Number of batches lost pending for execution, for this context */ + /** @batch_pending: Number of batches lost pending for execution, for this context */ __u32 batch_pending; + /** @pad: MBZ */ __u32 pad; }; diff --git a/tools/include/uapi/linux/bits.h b/tools/include/uapi/linux/bits.h new file mode 100644 index 0000000000..3c2a101986 --- /dev/null +++ b/tools/include/uapi/linux/bits.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* bits.h: Macros for dealing with bitmasks. 
*/ + +#ifndef _UAPI_LINUX_BITS_H +#define _UAPI_LINUX_BITS_H + +#define __GENMASK(h, l) \ + (((~_UL(0)) - (_UL(1) << (l)) + 1) & \ + (~_UL(0) >> (__BITS_PER_LONG - 1 - (h)))) + +#define __GENMASK_ULL(h, l) \ + (((~_ULL(0)) - (_ULL(1) << (l)) + 1) & \ + (~_ULL(0) >> (__BITS_PER_LONG_LONG - 1 - (h)))) + +#endif /* _UAPI_LINUX_BITS_H */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bcd84985fa..90706a47f6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1115,6 +1115,7 @@ enum bpf_attach_type { BPF_CGROUP_UNIX_GETSOCKNAME, BPF_NETKIT_PRIMARY, BPF_NETKIT_PEER, + BPF_TRACE_KPROBE_SESSION, __MAX_BPF_ATTACH_TYPE }; @@ -1135,6 +1136,7 @@ enum bpf_link_type { BPF_LINK_TYPE_TCX = 11, BPF_LINK_TYPE_UPROBE_MULTI = 12, BPF_LINK_TYPE_NETKIT = 13, + BPF_LINK_TYPE_SOCKMAP = 14, __MAX_BPF_LINK_TYPE, }; @@ -1662,8 +1664,10 @@ union bpf_attr { } query; struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ - __u64 name; - __u32 prog_fd; + __u64 name; + __u32 prog_fd; + __u32 :32; + __aligned_u64 cookie; } raw_tracepoint; struct { /* anonymous struct for BPF_BTF_LOAD */ @@ -3392,6 +3396,10 @@ union bpf_attr { * for the nexthop. If the src addr cannot be derived, * **BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. In this * case, *params*->dmac and *params*->smac are not set either. + * **BPF_FIB_LOOKUP_MARK** + * Use the mark present in *params*->mark for the fib lookup. + * This option should not be used with BPF_FIB_LOOKUP_DIRECT, + * as it only has meaning for full lookups. * * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. @@ -5020,7 +5028,7 @@ union bpf_attr { * bytes will be copied to *dst* * Return * The **hash_algo** is returned on success, - * **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if + * **-EOPNOTSUPP** if IMA is disabled or **-EINVAL** if * invalid arguments are passed. 
* * struct socket *bpf_sock_from_file(struct file *file) @@ -5506,7 +5514,7 @@ union bpf_attr { * bytes will be copied to *dst* * Return * The **hash_algo** is returned on success, - * **-EOPNOTSUP** if the hash calculation failed or **-EINVAL** if + * **-EOPNOTSUPP** if the hash calculation failed or **-EINVAL** if * invalid arguments are passed. * * void *bpf_kptr_xchg(void *map_value, void *ptr) @@ -6718,6 +6726,10 @@ struct bpf_link_info { __u32 ifindex; __u32 attach_type; } netkit; + struct { + __u32 map_id; + __u32 attach_type; + } sockmap; }; } __attribute__((aligned(8))); @@ -6936,6 +6948,8 @@ enum { * socket transition to LISTEN state. */ BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. + * Arg1: measured RTT input (mrtt) + * Arg2: updated srtt */ BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option. * It will be called to handle @@ -7118,6 +7132,7 @@ enum { BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2), BPF_FIB_LOOKUP_TBID = (1U << 3), BPF_FIB_LOOKUP_SRC = (1U << 4), + BPF_FIB_LOOKUP_MARK = (1U << 5), }; enum { @@ -7195,8 +7210,19 @@ struct bpf_fib_lookup { __u32 tbid; }; - __u8 smac[6]; /* ETH_ALEN */ - __u8 dmac[6]; /* ETH_ALEN */ + union { + /* input */ + struct { + __u32 mark; /* policy routing */ + /* 2 4-byte holes for input */ + }; + + /* output: source and dest mac */ + struct { + __u8 smac[6]; /* ETH_ALEN */ + __u8 dmac[6]; /* ETH_ALEN */ + }; + }; }; struct bpf_redir_neigh { @@ -7283,6 +7309,10 @@ struct bpf_timer { __u64 __opaque[2]; } __attribute__((aligned(8))); +struct bpf_wq { + __u64 __opaque[2]; +} __attribute__((aligned(8))); + struct bpf_dynptr { __u64 __opaque[2]; } __attribute__((aligned(8))); diff --git a/tools/include/uapi/linux/ethtool.h b/tools/include/uapi/linux/ethtool.h deleted file mode 100644 index 47afae3895..0000000000 --- a/tools/include/uapi/linux/ethtool.h +++ /dev/null @@ -1,104 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * ethtool.h: Defines for Linux ethtool. 
- * - * Copyright (C) 1998 David S. Miller (davem@redhat.com) - * Copyright 2001 Jeff Garzik - * Portions Copyright 2001 Sun Microsystems (thockin@sun.com) - * Portions Copyright 2002 Intel (eli.kupermann@intel.com, - * christopher.leech@intel.com, - * scott.feldman@intel.com) - * Portions Copyright (C) Sun Microsystems 2008 - */ - -#ifndef _UAPI_LINUX_ETHTOOL_H -#define _UAPI_LINUX_ETHTOOL_H - -#include -#include -#include - -#define ETHTOOL_GCHANNELS 0x0000003c /* Get no of channels */ - -/** - * struct ethtool_channels - configuring number of network channel - * @cmd: ETHTOOL_{G,S}CHANNELS - * @max_rx: Read only. Maximum number of receive channel the driver support. - * @max_tx: Read only. Maximum number of transmit channel the driver support. - * @max_other: Read only. Maximum number of other channel the driver support. - * @max_combined: Read only. Maximum number of combined channel the driver - * support. Set of queues RX, TX or other. - * @rx_count: Valid values are in the range 1 to the max_rx. - * @tx_count: Valid values are in the range 1 to the max_tx. - * @other_count: Valid values are in the range 1 to the max_other. - * @combined_count: Valid values are in the range 1 to the max_combined. - * - * This can be used to configure RX, TX and other channels. - */ - -struct ethtool_channels { - __u32 cmd; - __u32 max_rx; - __u32 max_tx; - __u32 max_other; - __u32 max_combined; - __u32 rx_count; - __u32 tx_count; - __u32 other_count; - __u32 combined_count; -}; - -#define ETHTOOL_FWVERS_LEN 32 -#define ETHTOOL_BUSINFO_LEN 32 -#define ETHTOOL_EROMVERS_LEN 32 - -/** - * struct ethtool_drvinfo - general driver and device information - * @cmd: Command number = %ETHTOOL_GDRVINFO - * @driver: Driver short name. This should normally match the name - * in its bus driver structure (e.g. pci_driver::name). Must - * not be an empty string. 
- * @version: Driver version string; may be an empty string - * @fw_version: Firmware version string; may be an empty string - * @erom_version: Expansion ROM version string; may be an empty string - * @bus_info: Device bus address. This should match the dev_name() - * string for the underlying bus device, if there is one. May be - * an empty string. - * @reserved2: Reserved for future use; see the note on reserved space. - * @n_priv_flags: Number of flags valid for %ETHTOOL_GPFLAGS and - * %ETHTOOL_SPFLAGS commands; also the number of strings in the - * %ETH_SS_PRIV_FLAGS set - * @n_stats: Number of u64 statistics returned by the %ETHTOOL_GSTATS - * command; also the number of strings in the %ETH_SS_STATS set - * @testinfo_len: Number of results returned by the %ETHTOOL_TEST - * command; also the number of strings in the %ETH_SS_TEST set - * @eedump_len: Size of EEPROM accessible through the %ETHTOOL_GEEPROM - * and %ETHTOOL_SEEPROM commands, in bytes - * @regdump_len: Size of register dump returned by the %ETHTOOL_GREGS - * command, in bytes - * - * Users can use the %ETHTOOL_GSSET_INFO command to get the number of - * strings in any string set (from Linux 2.6.34). - * - * Drivers should set at most @driver, @version, @fw_version and - * @bus_info in their get_drvinfo() implementation. The ethtool - * core fills in the other fields using other driver operations. 
- */ -struct ethtool_drvinfo { - __u32 cmd; - char driver[32]; - char version[32]; - char fw_version[ETHTOOL_FWVERS_LEN]; - char bus_info[ETHTOOL_BUSINFO_LEN]; - char erom_version[ETHTOOL_EROMVERS_LEN]; - char reserved2[12]; - __u32 n_priv_flags; - __u32 n_stats; - __u32 testinfo_len; - __u32 eedump_len; - __u32 regdump_len; -}; - -#define ETHTOOL_GDRVINFO 0x00000003 - -#endif /* _UAPI_LINUX_ETHTOOL_H */ diff --git a/tools/include/uapi/linux/fcntl.h b/tools/include/uapi/linux/fcntl.h deleted file mode 100644 index 282e90aeb1..0000000000 --- a/tools/include/uapi/linux/fcntl.h +++ /dev/null @@ -1,123 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI_LINUX_FCNTL_H -#define _UAPI_LINUX_FCNTL_H - -#include -#include - -#define F_SETLEASE (F_LINUX_SPECIFIC_BASE + 0) -#define F_GETLEASE (F_LINUX_SPECIFIC_BASE + 1) - -/* - * Cancel a blocking posix lock; internal use only until we expose an - * asynchronous lock api to userspace: - */ -#define F_CANCELLK (F_LINUX_SPECIFIC_BASE + 5) - -/* Create a file descriptor with FD_CLOEXEC set. */ -#define F_DUPFD_CLOEXEC (F_LINUX_SPECIFIC_BASE + 6) - -/* - * Request nofications on a directory. - * See below for events that may be notified. 
- */ -#define F_NOTIFY (F_LINUX_SPECIFIC_BASE+2) - -/* - * Set and get of pipe page size array - */ -#define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7) -#define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8) - -/* - * Set/Get seals - */ -#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) -#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) - -/* - * Types of seals - */ -#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ -#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ -#define F_SEAL_GROW 0x0004 /* prevent file from growing */ -#define F_SEAL_WRITE 0x0008 /* prevent writes */ -#define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */ -#define F_SEAL_EXEC 0x0020 /* prevent chmod modifying exec bits */ -/* (1U << 31) is reserved for signed error codes */ - -/* - * Set/Get write life time hints. {GET,SET}_RW_HINT operate on the - * underlying inode, while {GET,SET}_FILE_RW_HINT operate only on - * the specific file. - */ -#define F_GET_RW_HINT (F_LINUX_SPECIFIC_BASE + 11) -#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12) -#define F_GET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 13) -#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14) - -/* - * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be - * used to clear any hints previously set. - */ -#define RWH_WRITE_LIFE_NOT_SET 0 -#define RWH_WRITE_LIFE_NONE 1 -#define RWH_WRITE_LIFE_SHORT 2 -#define RWH_WRITE_LIFE_MEDIUM 3 -#define RWH_WRITE_LIFE_LONG 4 -#define RWH_WRITE_LIFE_EXTREME 5 - -/* - * The originally introduced spelling is remained from the first - * versions of the patch set that introduced the feature, see commit - * v4.13-rc1~212^2~51. - */ -#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET - -/* - * Types of directory notifications that may be requested. 
- */ -#define DN_ACCESS 0x00000001 /* File accessed */ -#define DN_MODIFY 0x00000002 /* File modified */ -#define DN_CREATE 0x00000004 /* File created */ -#define DN_DELETE 0x00000008 /* File removed */ -#define DN_RENAME 0x00000010 /* File renamed */ -#define DN_ATTRIB 0x00000020 /* File changed attibutes */ -#define DN_MULTISHOT 0x80000000 /* Don't remove notifier */ - -/* - * The constants AT_REMOVEDIR and AT_EACCESS have the same value. AT_EACCESS is - * meaningful only to faccessat, while AT_REMOVEDIR is meaningful only to - * unlinkat. The two functions do completely different things and therefore, - * the flags can be allowed to overlap. For example, passing AT_REMOVEDIR to - * faccessat would be undefined behavior and thus treating it equivalent to - * AT_EACCESS is valid undefined behavior. - */ -#define AT_FDCWD -100 /* Special value used to indicate - openat should use the current - working directory. */ -#define AT_SYMLINK_NOFOLLOW 0x100 /* Do not follow symbolic links. */ -#define AT_EACCESS 0x200 /* Test access permitted for - effective IDs, not real IDs. */ -#define AT_REMOVEDIR 0x200 /* Remove directory instead of - unlinking file. */ -#define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. */ -#define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount traversal */ -#define AT_EMPTY_PATH 0x1000 /* Allow empty relative pathname */ - -#define AT_STATX_SYNC_TYPE 0x6000 /* Type of synchronisation required from statx() */ -#define AT_STATX_SYNC_AS_STAT 0x0000 /* - Do whatever stat() does */ -#define AT_STATX_FORCE_SYNC 0x2000 /* - Force the attributes to be sync'd with the server */ -#define AT_STATX_DONT_SYNC 0x4000 /* - Don't sync attributes with the server */ - -#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */ - -/* Flags for name_to_handle_at(2). We reuse AT_ flag space to save bits... 
*/ -#define AT_HANDLE_FID AT_REMOVEDIR /* file handle is needed to - compare object identity and may not - be usable to open_by_handle_at(2) */ -#if defined(__KERNEL__) -#define AT_GETATTR_NOSEC 0x80000000 -#endif - -#endif /* _UAPI_LINUX_FCNTL_H */ diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h deleted file mode 100644 index 45e4e64fd6..0000000000 --- a/tools/include/uapi/linux/fs.h +++ /dev/null @@ -1,396 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI_LINUX_FS_H -#define _UAPI_LINUX_FS_H - -/* - * This file has definitions for some important file table structures - * and constants and structures used by various generic file system - * ioctl's. Please do not make any changes in this file before - * sending patches for review to linux-fsdevel@vger.kernel.org and - * linux-api@vger.kernel.org. - */ - -#include -#include -#include -#ifndef __KERNEL__ -#include -#endif - -/* Use of MS_* flags within the kernel is restricted to core mount(2) code. */ -#if !defined(__KERNEL__) -#include -#endif - -/* - * It's silly to have NR_OPEN bigger than NR_FILE, but you can change - * the file limit at runtime and only root can increase the per-process - * nr_file rlimit, so it's safe to set up a ridiculously high absolute - * upper limit on files-per-process. - * - * Some programs (notably those using select()) may have to be - * recompiled to take full advantage of the new limits.. - */ - -/* Fixed constants first: */ -#undef NR_OPEN -#define INR_OPEN_CUR 1024 /* Initial setting for nfile rlimits */ -#define INR_OPEN_MAX 4096 /* Hard limit for nfile rlimits */ - -#define BLOCK_SIZE_BITS 10 -#define BLOCK_SIZE (1< + +/* flags for memfd_create(2) (unsigned int) */ +#define MFD_CLOEXEC 0x0001U +#define MFD_ALLOW_SEALING 0x0002U +#define MFD_HUGETLB 0x0004U +/* not executable and sealed to prevent changing to executable. 
*/ +#define MFD_NOEXEC_SEAL 0x0008U +/* executable */ +#define MFD_EXEC 0x0010U + +/* + * Huge page size encoding when MFD_HUGETLB is specified, and a huge page + * size other than the default is desired. See hugetlb_encode.h. + * All known huge page size encodings are provided here. It is the + * responsibility of the application to know which sizes are supported on + * the running system. See mmap(2) man page for details. + */ +#define MFD_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT +#define MFD_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK + +#define MFD_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB +#define MFD_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB +#define MFD_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB +#define MFD_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB +#define MFD_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB +#define MFD_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB +#define MFD_HUGE_32MB HUGETLB_FLAG_ENCODE_32MB +#define MFD_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB +#define MFD_HUGE_512MB HUGETLB_FLAG_ENCODE_512MB +#define MFD_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB +#define MFD_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB +#define MFD_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB + +#endif /* _LINUX_MEMFD_H */ diff --git a/tools/include/uapi/linux/mount.h b/tools/include/uapi/linux/mount.h deleted file mode 100644 index ad5478dbad..0000000000 --- a/tools/include/uapi/linux/mount.h +++ /dev/null @@ -1,211 +0,0 @@ -#ifndef _UAPI_LINUX_MOUNT_H -#define _UAPI_LINUX_MOUNT_H - -#include - -/* - * These are the fs-independent mount-flags: up to 32 flags are supported - * - * Usage of these is restricted within the kernel to core mount(2) code and - * callers of sys_mount() only. Filesystems should be using the SB_* - * equivalent instead. 
- */ -#define MS_RDONLY 1 /* Mount read-only */ -#define MS_NOSUID 2 /* Ignore suid and sgid bits */ -#define MS_NODEV 4 /* Disallow access to device special files */ -#define MS_NOEXEC 8 /* Disallow program execution */ -#define MS_SYNCHRONOUS 16 /* Writes are synced at once */ -#define MS_REMOUNT 32 /* Alter flags of a mounted FS */ -#define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */ -#define MS_DIRSYNC 128 /* Directory modifications are synchronous */ -#define MS_NOSYMFOLLOW 256 /* Do not follow symlinks */ -#define MS_NOATIME 1024 /* Do not update access times. */ -#define MS_NODIRATIME 2048 /* Do not update directory access times */ -#define MS_BIND 4096 -#define MS_MOVE 8192 -#define MS_REC 16384 -#define MS_VERBOSE 32768 /* War is peace. Verbosity is silence. - MS_VERBOSE is deprecated. */ -#define MS_SILENT 32768 -#define MS_POSIXACL (1<<16) /* VFS does not apply the umask */ -#define MS_UNBINDABLE (1<<17) /* change to unbindable */ -#define MS_PRIVATE (1<<18) /* change to private */ -#define MS_SLAVE (1<<19) /* change to slave */ -#define MS_SHARED (1<<20) /* change to shared */ -#define MS_RELATIME (1<<21) /* Update atime relative to mtime/ctime. */ -#define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ -#define MS_I_VERSION (1<<23) /* Update inode I_version field */ -#define MS_STRICTATIME (1<<24) /* Always perform atime updates */ -#define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */ - -/* These sb flags are internal to the kernel */ -#define MS_SUBMOUNT (1<<26) -#define MS_NOREMOTELOCK (1<<27) -#define MS_NOSEC (1<<28) -#define MS_BORN (1<<29) -#define MS_ACTIVE (1<<30) -#define MS_NOUSER (1<<31) - -/* - * Superblock flags that can be altered by MS_REMOUNT - */ -#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION|\ - MS_LAZYTIME) - -/* - * Old magic mount flag and mask - */ -#define MS_MGC_VAL 0xC0ED0000 -#define MS_MGC_MSK 0xffff0000 - -/* - * open_tree() flags. 
- */ -#define OPEN_TREE_CLONE 1 /* Clone the target tree and attach the clone */ -#define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */ - -/* - * move_mount() flags. - */ -#define MOVE_MOUNT_F_SYMLINKS 0x00000001 /* Follow symlinks on from path */ -#define MOVE_MOUNT_F_AUTOMOUNTS 0x00000002 /* Follow automounts on from path */ -#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */ -#define MOVE_MOUNT_T_SYMLINKS 0x00000010 /* Follow symlinks on to path */ -#define MOVE_MOUNT_T_AUTOMOUNTS 0x00000020 /* Follow automounts on to path */ -#define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */ -#define MOVE_MOUNT_SET_GROUP 0x00000100 /* Set sharing group instead */ -#define MOVE_MOUNT_BENEATH 0x00000200 /* Mount beneath top mount */ -#define MOVE_MOUNT__MASK 0x00000377 - -/* - * fsopen() flags. - */ -#define FSOPEN_CLOEXEC 0x00000001 - -/* - * fspick() flags. - */ -#define FSPICK_CLOEXEC 0x00000001 -#define FSPICK_SYMLINK_NOFOLLOW 0x00000002 -#define FSPICK_NO_AUTOMOUNT 0x00000004 -#define FSPICK_EMPTY_PATH 0x00000008 - -/* - * The type of fsconfig() call made. - */ -enum fsconfig_command { - FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */ - FSCONFIG_SET_STRING = 1, /* Set parameter, supplying a string value */ - FSCONFIG_SET_BINARY = 2, /* Set parameter, supplying a binary blob value */ - FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by path */ - FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */ - FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */ - FSCONFIG_CMD_CREATE = 6, /* Create new or reuse existing superblock */ - FSCONFIG_CMD_RECONFIGURE = 7, /* Invoke superblock reconfiguration */ - FSCONFIG_CMD_CREATE_EXCL = 8, /* Create new superblock, fail if reusing existing superblock */ -}; - -/* - * fsmount() flags. - */ -#define FSMOUNT_CLOEXEC 0x00000001 - -/* - * Mount attributes. 
- */ -#define MOUNT_ATTR_RDONLY 0x00000001 /* Mount read-only */ -#define MOUNT_ATTR_NOSUID 0x00000002 /* Ignore suid and sgid bits */ -#define MOUNT_ATTR_NODEV 0x00000004 /* Disallow access to device special files */ -#define MOUNT_ATTR_NOEXEC 0x00000008 /* Disallow program execution */ -#define MOUNT_ATTR__ATIME 0x00000070 /* Setting on how atime should be updated */ -#define MOUNT_ATTR_RELATIME 0x00000000 /* - Update atime relative to mtime/ctime. */ -#define MOUNT_ATTR_NOATIME 0x00000010 /* - Do not update access times. */ -#define MOUNT_ATTR_STRICTATIME 0x00000020 /* - Always perform atime updates */ -#define MOUNT_ATTR_NODIRATIME 0x00000080 /* Do not update directory access times */ -#define MOUNT_ATTR_IDMAP 0x00100000 /* Idmap mount to @userns_fd in struct mount_attr. */ -#define MOUNT_ATTR_NOSYMFOLLOW 0x00200000 /* Do not follow symlinks */ - -/* - * mount_setattr() - */ -struct mount_attr { - __u64 attr_set; - __u64 attr_clr; - __u64 propagation; - __u64 userns_fd; -}; - -/* List of all mount_attr versions. */ -#define MOUNT_ATTR_SIZE_VER0 32 /* sizeof first published struct */ - - -/* - * Structure for getting mount/superblock/filesystem info with statmount(2). - * - * The interface is similar to statx(2): individual fields or groups can be - * selected with the @mask argument of statmount(). Kernel will set the @mask - * field according to the supported fields. - * - * If string fields are selected, then the caller needs to pass a buffer that - * has space after the fixed part of the structure. Nul terminated strings are - * copied there and offsets relative to @str are stored in the relevant fields. - * If the buffer is too small, then EOVERFLOW is returned. The actually used - * size is returned in @size. 
- */ -struct statmount { - __u32 size; /* Total size, including strings */ - __u32 __spare1; - __u64 mask; /* What results were written */ - __u32 sb_dev_major; /* Device ID */ - __u32 sb_dev_minor; - __u64 sb_magic; /* ..._SUPER_MAGIC */ - __u32 sb_flags; /* SB_{RDONLY,SYNCHRONOUS,DIRSYNC,LAZYTIME} */ - __u32 fs_type; /* [str] Filesystem type */ - __u64 mnt_id; /* Unique ID of mount */ - __u64 mnt_parent_id; /* Unique ID of parent (for root == mnt_id) */ - __u32 mnt_id_old; /* Reused IDs used in proc/.../mountinfo */ - __u32 mnt_parent_id_old; - __u64 mnt_attr; /* MOUNT_ATTR_... */ - __u64 mnt_propagation; /* MS_{SHARED,SLAVE,PRIVATE,UNBINDABLE} */ - __u64 mnt_peer_group; /* ID of shared peer group */ - __u64 mnt_master; /* Mount receives propagation from this ID */ - __u64 propagate_from; /* Propagation from in current namespace */ - __u32 mnt_root; /* [str] Root of mount relative to root of fs */ - __u32 mnt_point; /* [str] Mountpoint relative to current root */ - __u64 __spare2[50]; - char str[]; /* Variable size part containing strings */ -}; - -/* - * Structure for passing mount ID and miscellaneous parameters to statmount(2) - * and listmount(2). - * - * For statmount(2) @param represents the request mask. - * For listmount(2) @param represents the last listed mount id (or zero). - */ -struct mnt_id_req { - __u32 size; - __u32 spare; - __u64 mnt_id; - __u64 param; -}; - -/* List of all mnt_id_req versions. */ -#define MNT_ID_REQ_SIZE_VER0 24 /* sizeof first published struct */ - -/* - * @mask bits for statmount(2) - */ -#define STATMOUNT_SB_BASIC 0x00000001U /* Want/got sb_... */ -#define STATMOUNT_MNT_BASIC 0x00000002U /* Want/got mnt_... 
*/ -#define STATMOUNT_PROPAGATE_FROM 0x00000004U /* Want/got propagate_from */ -#define STATMOUNT_MNT_ROOT 0x00000008U /* Want/got mnt_root */ -#define STATMOUNT_MNT_POINT 0x00000010U /* Want/got mnt_point */ -#define STATMOUNT_FS_TYPE 0x00000020U /* Want/got fs_type */ - -/* - * Special @mnt_id values that can be passed to listmount - */ -#define LSMT_ROOT 0xffffffffffffffff /* root mount */ - -#endif /* _UAPI_LINUX_MOUNT_H */ diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index bb65ee840c..43742ac5b0 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -146,6 +146,28 @@ enum { NETDEV_A_QSTATS_TX_PACKETS, NETDEV_A_QSTATS_TX_BYTES, NETDEV_A_QSTATS_RX_ALLOC_FAIL, + NETDEV_A_QSTATS_RX_HW_DROPS, + NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, + NETDEV_A_QSTATS_RX_CSUM_COMPLETE, + NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, + NETDEV_A_QSTATS_RX_CSUM_NONE, + NETDEV_A_QSTATS_RX_CSUM_BAD, + NETDEV_A_QSTATS_RX_HW_GRO_PACKETS, + NETDEV_A_QSTATS_RX_HW_GRO_BYTES, + NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS, + NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES, + NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS, + NETDEV_A_QSTATS_TX_HW_DROPS, + NETDEV_A_QSTATS_TX_HW_DROP_ERRORS, + NETDEV_A_QSTATS_TX_CSUM_NONE, + NETDEV_A_QSTATS_TX_NEEDS_CSUM, + NETDEV_A_QSTATS_TX_HW_GSO_PACKETS, + NETDEV_A_QSTATS_TX_HW_GSO_BYTES, + NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, + NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, + NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, + NETDEV_A_QSTATS_TX_STOP, + NETDEV_A_QSTATS_TX_WAKE, __NETDEV_A_QSTATS_MAX, NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1) diff --git a/tools/include/uapi/linux/openat2.h b/tools/include/uapi/linux/openat2.h deleted file mode 100644 index a5feb76049..0000000000 --- a/tools/include/uapi/linux/openat2.h +++ /dev/null @@ -1,43 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI_LINUX_OPENAT2_H -#define _UAPI_LINUX_OPENAT2_H - -#include - -/* - * Arguments for how openat2(2) should 
open the target path. If only @flags and - * @mode are non-zero, then openat2(2) operates very similarly to openat(2). - * - * However, unlike openat(2), unknown or invalid bits in @flags result in - * -EINVAL rather than being silently ignored. @mode must be zero unless one of - * {O_CREAT, O_TMPFILE} are set. - * - * @flags: O_* flags. - * @mode: O_CREAT/O_TMPFILE file mode. - * @resolve: RESOLVE_* flags. - */ -struct open_how { - __u64 flags; - __u64 mode; - __u64 resolve; -}; - -/* how->resolve flags for openat2(2). */ -#define RESOLVE_NO_XDEV 0x01 /* Block mount-point crossings - (includes bind-mounts). */ -#define RESOLVE_NO_MAGICLINKS 0x02 /* Block traversal through procfs-style - "magic-links". */ -#define RESOLVE_NO_SYMLINKS 0x04 /* Block traversal through all symlinks - (implies OEXT_NO_MAGICLINKS) */ -#define RESOLVE_BENEATH 0x08 /* Block "lexical" trickery like - "..", symlinks, and absolute - paths which escape the dirfd. */ -#define RESOLVE_IN_ROOT 0x10 /* Make all jumps to "/" and ".." - be scoped inside the dirfd - (similar to chroot(2)). */ -#define RESOLVE_CACHED 0x20 /* Only complete if resolution can be - completed through cached lookup. May - return -EAGAIN if that's not - possible. 
*/ - -#endif /* _UAPI_LINUX_OPENAT2_H */ diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h deleted file mode 100644 index 370ed14b1a..0000000000 --- a/tools/include/uapi/linux/prctl.h +++ /dev/null @@ -1,309 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _LINUX_PRCTL_H -#define _LINUX_PRCTL_H - -#include - -/* Values to pass as first argument to prctl() */ - -#define PR_SET_PDEATHSIG 1 /* Second arg is a signal */ -#define PR_GET_PDEATHSIG 2 /* Second arg is a ptr to return the signal */ - -/* Get/set current->mm->dumpable */ -#define PR_GET_DUMPABLE 3 -#define PR_SET_DUMPABLE 4 - -/* Get/set unaligned access control bits (if meaningful) */ -#define PR_GET_UNALIGN 5 -#define PR_SET_UNALIGN 6 -# define PR_UNALIGN_NOPRINT 1 /* silently fix up unaligned user accesses */ -# define PR_UNALIGN_SIGBUS 2 /* generate SIGBUS on unaligned user access */ - -/* Get/set whether or not to drop capabilities on setuid() away from - * uid 0 (as per security/commoncap.c) */ -#define PR_GET_KEEPCAPS 7 -#define PR_SET_KEEPCAPS 8 - -/* Get/set floating-point emulation control bits (if meaningful) */ -#define PR_GET_FPEMU 9 -#define PR_SET_FPEMU 10 -# define PR_FPEMU_NOPRINT 1 /* silently emulate fp operations accesses */ -# define PR_FPEMU_SIGFPE 2 /* don't emulate fp operations, send SIGFPE instead */ - -/* Get/set floating-point exception mode (if meaningful) */ -#define PR_GET_FPEXC 11 -#define PR_SET_FPEXC 12 -# define PR_FP_EXC_SW_ENABLE 0x80 /* Use FPEXC for FP exception enables */ -# define PR_FP_EXC_DIV 0x010000 /* floating point divide by zero */ -# define PR_FP_EXC_OVF 0x020000 /* floating point overflow */ -# define PR_FP_EXC_UND 0x040000 /* floating point underflow */ -# define PR_FP_EXC_RES 0x080000 /* floating point inexact result */ -# define PR_FP_EXC_INV 0x100000 /* floating point invalid operation */ -# define PR_FP_EXC_DISABLED 0 /* FP exceptions disabled */ -# define PR_FP_EXC_NONRECOV 1 /* async 
non-recoverable exc. mode */ -# define PR_FP_EXC_ASYNC 2 /* async recoverable exception mode */ -# define PR_FP_EXC_PRECISE 3 /* precise exception mode */ - -/* Get/set whether we use statistical process timing or accurate timestamp - * based process timing */ -#define PR_GET_TIMING 13 -#define PR_SET_TIMING 14 -# define PR_TIMING_STATISTICAL 0 /* Normal, traditional, - statistical process timing */ -# define PR_TIMING_TIMESTAMP 1 /* Accurate timestamp based - process timing */ - -#define PR_SET_NAME 15 /* Set process name */ -#define PR_GET_NAME 16 /* Get process name */ - -/* Get/set process endian */ -#define PR_GET_ENDIAN 19 -#define PR_SET_ENDIAN 20 -# define PR_ENDIAN_BIG 0 -# define PR_ENDIAN_LITTLE 1 /* True little endian mode */ -# define PR_ENDIAN_PPC_LITTLE 2 /* "PowerPC" pseudo little endian */ - -/* Get/set process seccomp mode */ -#define PR_GET_SECCOMP 21 -#define PR_SET_SECCOMP 22 - -/* Get/set the capability bounding set (as per security/commoncap.c) */ -#define PR_CAPBSET_READ 23 -#define PR_CAPBSET_DROP 24 - -/* Get/set the process' ability to use the timestamp counter instruction */ -#define PR_GET_TSC 25 -#define PR_SET_TSC 26 -# define PR_TSC_ENABLE 1 /* allow the use of the timestamp counter */ -# define PR_TSC_SIGSEGV 2 /* throw a SIGSEGV instead of reading the TSC */ - -/* Get/set securebits (as per security/commoncap.c) */ -#define PR_GET_SECUREBITS 27 -#define PR_SET_SECUREBITS 28 - -/* - * Get/set the timerslack as used by poll/select/nanosleep - * A value of 0 means "use default" - */ -#define PR_SET_TIMERSLACK 29 -#define PR_GET_TIMERSLACK 30 - -#define PR_TASK_PERF_EVENTS_DISABLE 31 -#define PR_TASK_PERF_EVENTS_ENABLE 32 - -/* - * Set early/late kill mode for hwpoison memory corruption. - * This influences when the process gets killed on a memory corruption. 
- */ -#define PR_MCE_KILL 33 -# define PR_MCE_KILL_CLEAR 0 -# define PR_MCE_KILL_SET 1 - -# define PR_MCE_KILL_LATE 0 -# define PR_MCE_KILL_EARLY 1 -# define PR_MCE_KILL_DEFAULT 2 - -#define PR_MCE_KILL_GET 34 - -/* - * Tune up process memory map specifics. - */ -#define PR_SET_MM 35 -# define PR_SET_MM_START_CODE 1 -# define PR_SET_MM_END_CODE 2 -# define PR_SET_MM_START_DATA 3 -# define PR_SET_MM_END_DATA 4 -# define PR_SET_MM_START_STACK 5 -# define PR_SET_MM_START_BRK 6 -# define PR_SET_MM_BRK 7 -# define PR_SET_MM_ARG_START 8 -# define PR_SET_MM_ARG_END 9 -# define PR_SET_MM_ENV_START 10 -# define PR_SET_MM_ENV_END 11 -# define PR_SET_MM_AUXV 12 -# define PR_SET_MM_EXE_FILE 13 -# define PR_SET_MM_MAP 14 -# define PR_SET_MM_MAP_SIZE 15 - -/* - * This structure provides new memory descriptor - * map which mostly modifies /proc/pid/stat[m] - * output for a task. This mostly done in a - * sake of checkpoint/restore functionality. - */ -struct prctl_mm_map { - __u64 start_code; /* code section bounds */ - __u64 end_code; - __u64 start_data; /* data section bounds */ - __u64 end_data; - __u64 start_brk; /* heap for brk() syscall */ - __u64 brk; - __u64 start_stack; /* stack starts at */ - __u64 arg_start; /* command line arguments bounds */ - __u64 arg_end; - __u64 env_start; /* environment variables bounds */ - __u64 env_end; - __u64 *auxv; /* auxiliary vector */ - __u32 auxv_size; /* vector size */ - __u32 exe_fd; /* /proc/$pid/exe link file */ -}; - -/* - * Set specific pid that is allowed to ptrace the current task. - * A value of 0 mean "no process". - */ -#define PR_SET_PTRACER 0x59616d61 -# define PR_SET_PTRACER_ANY ((unsigned long)-1) - -#define PR_SET_CHILD_SUBREAPER 36 -#define PR_GET_CHILD_SUBREAPER 37 - -/* - * If no_new_privs is set, then operations that grant new privileges (i.e. - * execve) will either fail or not grant them. This affects suid/sgid, - * file capabilities, and LSMs. 
- * - * Operations that merely manipulate or drop existing privileges (setresuid, - * capset, etc.) will still work. Drop those privileges if you want them gone. - * - * Changing LSM security domain is considered a new privilege. So, for example, - * asking selinux for a specific new context (e.g. with runcon) will result - * in execve returning -EPERM. - * - * See Documentation/userspace-api/no_new_privs.rst for more details. - */ -#define PR_SET_NO_NEW_PRIVS 38 -#define PR_GET_NO_NEW_PRIVS 39 - -#define PR_GET_TID_ADDRESS 40 - -#define PR_SET_THP_DISABLE 41 -#define PR_GET_THP_DISABLE 42 - -/* - * No longer implemented, but left here to ensure the numbers stay reserved: - */ -#define PR_MPX_ENABLE_MANAGEMENT 43 -#define PR_MPX_DISABLE_MANAGEMENT 44 - -#define PR_SET_FP_MODE 45 -#define PR_GET_FP_MODE 46 -# define PR_FP_MODE_FR (1 << 0) /* 64b FP registers */ -# define PR_FP_MODE_FRE (1 << 1) /* 32b compatibility */ - -/* Control the ambient capability set */ -#define PR_CAP_AMBIENT 47 -# define PR_CAP_AMBIENT_IS_SET 1 -# define PR_CAP_AMBIENT_RAISE 2 -# define PR_CAP_AMBIENT_LOWER 3 -# define PR_CAP_AMBIENT_CLEAR_ALL 4 - -/* arm64 Scalable Vector Extension controls */ -/* Flag values must be kept in sync with ptrace NT_ARM_SVE interface */ -#define PR_SVE_SET_VL 50 /* set task vector length */ -# define PR_SVE_SET_VL_ONEXEC (1 << 18) /* defer effect until exec */ -#define PR_SVE_GET_VL 51 /* get task vector length */ -/* Bits common to PR_SVE_SET_VL and PR_SVE_GET_VL */ -# define PR_SVE_VL_LEN_MASK 0xffff -# define PR_SVE_VL_INHERIT (1 << 17) /* inherit across exec */ - -/* Per task speculation control */ -#define PR_GET_SPECULATION_CTRL 52 -#define PR_SET_SPECULATION_CTRL 53 -/* Speculation control variants */ -# define PR_SPEC_STORE_BYPASS 0 -# define PR_SPEC_INDIRECT_BRANCH 1 -# define PR_SPEC_L1D_FLUSH 2 -/* Return and control values for PR_SET/GET_SPECULATION_CTRL */ -# define PR_SPEC_NOT_AFFECTED 0 -# define PR_SPEC_PRCTL (1UL << 0) -# define PR_SPEC_ENABLE 
(1UL << 1) -# define PR_SPEC_DISABLE (1UL << 2) -# define PR_SPEC_FORCE_DISABLE (1UL << 3) -# define PR_SPEC_DISABLE_NOEXEC (1UL << 4) - -/* Reset arm64 pointer authentication keys */ -#define PR_PAC_RESET_KEYS 54 -# define PR_PAC_APIAKEY (1UL << 0) -# define PR_PAC_APIBKEY (1UL << 1) -# define PR_PAC_APDAKEY (1UL << 2) -# define PR_PAC_APDBKEY (1UL << 3) -# define PR_PAC_APGAKEY (1UL << 4) - -/* Tagged user address controls for arm64 */ -#define PR_SET_TAGGED_ADDR_CTRL 55 -#define PR_GET_TAGGED_ADDR_CTRL 56 -# define PR_TAGGED_ADDR_ENABLE (1UL << 0) -/* MTE tag check fault modes */ -# define PR_MTE_TCF_NONE 0UL -# define PR_MTE_TCF_SYNC (1UL << 1) -# define PR_MTE_TCF_ASYNC (1UL << 2) -# define PR_MTE_TCF_MASK (PR_MTE_TCF_SYNC | PR_MTE_TCF_ASYNC) -/* MTE tag inclusion mask */ -# define PR_MTE_TAG_SHIFT 3 -# define PR_MTE_TAG_MASK (0xffffUL << PR_MTE_TAG_SHIFT) -/* Unused; kept only for source compatibility */ -# define PR_MTE_TCF_SHIFT 1 - -/* Control reclaim behavior when allocating memory */ -#define PR_SET_IO_FLUSHER 57 -#define PR_GET_IO_FLUSHER 58 - -/* Dispatch syscalls to a userspace handler */ -#define PR_SET_SYSCALL_USER_DISPATCH 59 -# define PR_SYS_DISPATCH_OFF 0 -# define PR_SYS_DISPATCH_ON 1 -/* The control values for the user space selector when dispatch is enabled */ -# define SYSCALL_DISPATCH_FILTER_ALLOW 0 -# define SYSCALL_DISPATCH_FILTER_BLOCK 1 - -/* Set/get enabled arm64 pointer authentication keys */ -#define PR_PAC_SET_ENABLED_KEYS 60 -#define PR_PAC_GET_ENABLED_KEYS 61 - -/* Request the scheduler to share a core */ -#define PR_SCHED_CORE 62 -# define PR_SCHED_CORE_GET 0 -# define PR_SCHED_CORE_CREATE 1 /* create unique core_sched cookie */ -# define PR_SCHED_CORE_SHARE_TO 2 /* push core_sched cookie to pid */ -# define PR_SCHED_CORE_SHARE_FROM 3 /* pull core_sched cookie to pid */ -# define PR_SCHED_CORE_MAX 4 -# define PR_SCHED_CORE_SCOPE_THREAD 0 -# define PR_SCHED_CORE_SCOPE_THREAD_GROUP 1 -# define PR_SCHED_CORE_SCOPE_PROCESS_GROUP 2 - 
-/* arm64 Scalable Matrix Extension controls */ -/* Flag values must be in sync with SVE versions */ -#define PR_SME_SET_VL 63 /* set task vector length */ -# define PR_SME_SET_VL_ONEXEC (1 << 18) /* defer effect until exec */ -#define PR_SME_GET_VL 64 /* get task vector length */ -/* Bits common to PR_SME_SET_VL and PR_SME_GET_VL */ -# define PR_SME_VL_LEN_MASK 0xffff -# define PR_SME_VL_INHERIT (1 << 17) /* inherit across exec */ - -/* Memory deny write / execute */ -#define PR_SET_MDWE 65 -# define PR_MDWE_REFUSE_EXEC_GAIN (1UL << 0) -# define PR_MDWE_NO_INHERIT (1UL << 1) - -#define PR_GET_MDWE 66 - -#define PR_SET_VMA 0x53564d41 -# define PR_SET_VMA_ANON_NAME 0 - -#define PR_GET_AUXV 0x41555856 - -#define PR_SET_MEMORY_MERGE 67 -#define PR_GET_MEMORY_MERGE 68 - -#define PR_RISCV_V_SET_CONTROL 69 -#define PR_RISCV_V_GET_CONTROL 70 -# define PR_RISCV_V_VSTATE_CTRL_DEFAULT 0 -# define PR_RISCV_V_VSTATE_CTRL_OFF 1 -# define PR_RISCV_V_VSTATE_CTRL_ON 2 -# define PR_RISCV_V_VSTATE_CTRL_INHERIT (1 << 4) -# define PR_RISCV_V_VSTATE_CTRL_CUR_MASK 0x3 -# define PR_RISCV_V_VSTATE_CTRL_NEXT_MASK 0xc -# define PR_RISCV_V_VSTATE_CTRL_MASK 0x1f - -#endif /* _LINUX_PRCTL_H */ diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h deleted file mode 100644 index 3bac0a8cea..0000000000 --- a/tools/include/uapi/linux/sched.h +++ /dev/null @@ -1,148 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI_LINUX_SCHED_H -#define _UAPI_LINUX_SCHED_H - -#include - -/* - * cloning flags: - */ -#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ -#define CLONE_VM 0x00000100 /* set if VM shared between processes */ -#define CLONE_FS 0x00000200 /* set if fs info shared between processes */ -#define CLONE_FILES 0x00000400 /* set if open files shared between processes */ -#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ -#define CLONE_PIDFD 0x00001000 /* set if a pidfd should be 
placed in parent */ -#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ -#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ -#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ -#define CLONE_THREAD 0x00010000 /* Same thread group? */ -#define CLONE_NEWNS 0x00020000 /* New mount namespace group */ -#define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */ -#define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */ -#define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */ -#define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */ -#define CLONE_DETACHED 0x00400000 /* Unused, ignored */ -#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ -#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ -#define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */ -#define CLONE_NEWUTS 0x04000000 /* New utsname namespace */ -#define CLONE_NEWIPC 0x08000000 /* New ipc namespace */ -#define CLONE_NEWUSER 0x10000000 /* New user namespace */ -#define CLONE_NEWPID 0x20000000 /* New pid namespace */ -#define CLONE_NEWNET 0x40000000 /* New network namespace */ -#define CLONE_IO 0x80000000 /* Clone io context */ - -/* Flags for the clone3() syscall. */ -#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */ -#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */ - -/* - * cloning flags intersect with CSIGNAL so can be used with unshare and clone3 - * syscalls only: - */ -#define CLONE_NEWTIME 0x00000080 /* New time namespace */ - -#ifndef __ASSEMBLY__ -/** - * struct clone_args - arguments for the clone3 syscall - * @flags: Flags for the new process as listed above. - * All flags are valid except for CSIGNAL and - * CLONE_DETACHED. 
- * @pidfd: If CLONE_PIDFD is set, a pidfd will be - * returned in this argument. - * @child_tid: If CLONE_CHILD_SETTID is set, the TID of the - * child process will be returned in the child's - * memory. - * @parent_tid: If CLONE_PARENT_SETTID is set, the TID of - * the child process will be returned in the - * parent's memory. - * @exit_signal: The exit_signal the parent process will be - * sent when the child exits. - * @stack: Specify the location of the stack for the - * child process. - * Note, @stack is expected to point to the - * lowest address. The stack direction will be - * determined by the kernel and set up - * appropriately based on @stack_size. - * @stack_size: The size of the stack for the child process. - * @tls: If CLONE_SETTLS is set, the tls descriptor - * is set to tls. - * @set_tid: Pointer to an array of type *pid_t. The size - * of the array is defined using @set_tid_size. - * This array is used to select PIDs/TIDs for - * newly created processes. The first element in - * this defines the PID in the most nested PID - * namespace. Each additional element in the array - * defines the PID in the parent PID namespace of - * the original PID namespace. If the array has - * less entries than the number of currently - * nested PID namespaces only the PIDs in the - * corresponding namespaces are set. - * @set_tid_size: This defines the size of the array referenced - * in @set_tid. This cannot be larger than the - * kernel's limit of nested PID namespaces. - * @cgroup: If CLONE_INTO_CGROUP is specified set this to - * a file descriptor for the cgroup. - * - * The structure is versioned by size and thus extensible. - * New struct members must go at the end of the struct and - * must be properly 64bit aligned. 
- */ -struct clone_args { - __aligned_u64 flags; - __aligned_u64 pidfd; - __aligned_u64 child_tid; - __aligned_u64 parent_tid; - __aligned_u64 exit_signal; - __aligned_u64 stack; - __aligned_u64 stack_size; - __aligned_u64 tls; - __aligned_u64 set_tid; - __aligned_u64 set_tid_size; - __aligned_u64 cgroup; -}; -#endif - -#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ -#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */ -#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */ - -/* - * Scheduling policies - */ -#define SCHED_NORMAL 0 -#define SCHED_FIFO 1 -#define SCHED_RR 2 -#define SCHED_BATCH 3 -/* SCHED_ISO: reserved but not implemented yet */ -#define SCHED_IDLE 5 -#define SCHED_DEADLINE 6 - -/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ -#define SCHED_RESET_ON_FORK 0x40000000 - -/* - * For the sched_{set,get}attr() calls - */ -#define SCHED_FLAG_RESET_ON_FORK 0x01 -#define SCHED_FLAG_RECLAIM 0x02 -#define SCHED_FLAG_DL_OVERRUN 0x04 -#define SCHED_FLAG_KEEP_POLICY 0x08 -#define SCHED_FLAG_KEEP_PARAMS 0x10 -#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 -#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 - -#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ - SCHED_FLAG_KEEP_PARAMS) - -#define SCHED_FLAG_UTIL_CLAMP (SCHED_FLAG_UTIL_CLAMP_MIN | \ - SCHED_FLAG_UTIL_CLAMP_MAX) - -#define SCHED_FLAG_ALL (SCHED_FLAG_RESET_ON_FORK | \ - SCHED_FLAG_RECLAIM | \ - SCHED_FLAG_DL_OVERRUN | \ - SCHED_FLAG_KEEP_ALL | \ - SCHED_FLAG_UTIL_CLAMP) - -#endif /* _UAPI_LINUX_SCHED_H */ diff --git a/tools/include/uapi/linux/stat.h b/tools/include/uapi/linux/stat.h index 2f2ee82d55..67626d5353 100644 --- a/tools/include/uapi/linux/stat.h +++ b/tools/include/uapi/linux/stat.h @@ -126,8 +126,9 @@ struct statx { __u64 stx_mnt_id; __u32 stx_dio_mem_align; /* Memory buffer alignment for direct I/O */ __u32 stx_dio_offset_align; /* File offset alignment for direct I/O */ + __u64 stx_subvol; /* Subvolume identifier 
*/ /* 0xa0 */ - __u64 __spare3[12]; /* Spare space for future expansion */ + __u64 __spare3[11]; /* Spare space for future expansion */ /* 0x100 */ }; @@ -155,6 +156,7 @@ struct statx { #define STATX_MNT_ID 0x00001000U /* Got stx_mnt_id */ #define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */ #define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */ +#define STATX_SUBVOL 0x00008000U /* Want/got stx_subvol */ #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ diff --git a/tools/include/uapi/linux/usbdevice_fs.h b/tools/include/uapi/linux/usbdevice_fs.h deleted file mode 100644 index 74a84e0242..0000000000 --- a/tools/include/uapi/linux/usbdevice_fs.h +++ /dev/null @@ -1,231 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ -/*****************************************************************************/ - -/* - * usbdevice_fs.h -- USB device file system. - * - * Copyright (C) 2000 - * Thomas Sailer (sailer@ife.ee.ethz.ch) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - * History: - * 0.1 04.01.2000 Created - */ - -/*****************************************************************************/ - -#ifndef _UAPI_LINUX_USBDEVICE_FS_H -#define _UAPI_LINUX_USBDEVICE_FS_H - -#include -#include - -/* --------------------------------------------------------------------- */ - -/* usbdevfs ioctl codes */ - -struct usbdevfs_ctrltransfer { - __u8 bRequestType; - __u8 bRequest; - __u16 wValue; - __u16 wIndex; - __u16 wLength; - __u32 timeout; /* in milliseconds */ - void __user *data; -}; - -struct usbdevfs_bulktransfer { - unsigned int ep; - unsigned int len; - unsigned int timeout; /* in milliseconds */ - void __user *data; -}; - -struct usbdevfs_setinterface { - unsigned int interface; - unsigned int altsetting; -}; - -struct usbdevfs_disconnectsignal { - unsigned int signr; - void __user *context; -}; - -#define USBDEVFS_MAXDRIVERNAME 255 - -struct usbdevfs_getdriver { - unsigned int interface; - char driver[USBDEVFS_MAXDRIVERNAME + 1]; -}; - -struct usbdevfs_connectinfo { - unsigned int devnum; - unsigned char slow; -}; - -struct usbdevfs_conninfo_ex { - __u32 size; /* Size of the structure from the kernel's */ - /* point of view. Can be used by userspace */ - /* to determine how much data can be */ - /* used/trusted. */ - __u32 busnum; /* USB bus number, as enumerated by the */ - /* kernel, the device is connected to. */ - __u32 devnum; /* Device address on the bus. */ - __u32 speed; /* USB_SPEED_* constants from ch9.h */ - __u8 num_ports; /* Number of ports the device is connected */ - /* to on the way to the root hub. It may */ - /* be bigger than size of 'ports' array so */ - /* userspace can detect overflows. */ - __u8 ports[7]; /* List of ports on the way from the root */ - /* hub to the device. Current limit in */ - /* USB specification is 7 tiers (root hub, */ - /* 5 intermediate hubs, device), which */ - /* gives at most 6 port entries. 
*/ -}; - -#define USBDEVFS_URB_SHORT_NOT_OK 0x01 -#define USBDEVFS_URB_ISO_ASAP 0x02 -#define USBDEVFS_URB_BULK_CONTINUATION 0x04 -#define USBDEVFS_URB_NO_FSBR 0x20 /* Not used */ -#define USBDEVFS_URB_ZERO_PACKET 0x40 -#define USBDEVFS_URB_NO_INTERRUPT 0x80 - -#define USBDEVFS_URB_TYPE_ISO 0 -#define USBDEVFS_URB_TYPE_INTERRUPT 1 -#define USBDEVFS_URB_TYPE_CONTROL 2 -#define USBDEVFS_URB_TYPE_BULK 3 - -struct usbdevfs_iso_packet_desc { - unsigned int length; - unsigned int actual_length; - unsigned int status; -}; - -struct usbdevfs_urb { - unsigned char type; - unsigned char endpoint; - int status; - unsigned int flags; - void __user *buffer; - int buffer_length; - int actual_length; - int start_frame; - union { - int number_of_packets; /* Only used for isoc urbs */ - unsigned int stream_id; /* Only used with bulk streams */ - }; - int error_count; - unsigned int signr; /* signal to be sent on completion, - or 0 if none should be sent. */ - void __user *usercontext; - struct usbdevfs_iso_packet_desc iso_frame_desc[]; -}; - -/* ioctls for talking directly to drivers */ -struct usbdevfs_ioctl { - int ifno; /* interface 0..N ; negative numbers reserved */ - int ioctl_code; /* MUST encode size + direction of data so the - * macros in give correct values */ - void __user *data; /* param buffer (in, or out) */ -}; - -/* You can do most things with hubs just through control messages, - * except find out what device connects to what port. */ -struct usbdevfs_hub_portinfo { - char nports; /* number of downstream ports in this hub */ - char port [127]; /* e.g. 
port 3 connects to device 27 */ -}; - -/* System and bus capability flags */ -#define USBDEVFS_CAP_ZERO_PACKET 0x01 -#define USBDEVFS_CAP_BULK_CONTINUATION 0x02 -#define USBDEVFS_CAP_NO_PACKET_SIZE_LIM 0x04 -#define USBDEVFS_CAP_BULK_SCATTER_GATHER 0x08 -#define USBDEVFS_CAP_REAP_AFTER_DISCONNECT 0x10 -#define USBDEVFS_CAP_MMAP 0x20 -#define USBDEVFS_CAP_DROP_PRIVILEGES 0x40 -#define USBDEVFS_CAP_CONNINFO_EX 0x80 -#define USBDEVFS_CAP_SUSPEND 0x100 - -/* USBDEVFS_DISCONNECT_CLAIM flags & struct */ - -/* disconnect-and-claim if the driver matches the driver field */ -#define USBDEVFS_DISCONNECT_CLAIM_IF_DRIVER 0x01 -/* disconnect-and-claim except when the driver matches the driver field */ -#define USBDEVFS_DISCONNECT_CLAIM_EXCEPT_DRIVER 0x02 - -struct usbdevfs_disconnect_claim { - unsigned int interface; - unsigned int flags; - char driver[USBDEVFS_MAXDRIVERNAME + 1]; -}; - -struct usbdevfs_streams { - unsigned int num_streams; /* Not used by USBDEVFS_FREE_STREAMS */ - unsigned int num_eps; - unsigned char eps[]; -}; - -/* - * USB_SPEED_* values returned by USBDEVFS_GET_SPEED are defined in - * linux/usb/ch9.h - */ - -#define USBDEVFS_CONTROL _IOWR('U', 0, struct usbdevfs_ctrltransfer) -#define USBDEVFS_CONTROL32 _IOWR('U', 0, struct usbdevfs_ctrltransfer32) -#define USBDEVFS_BULK _IOWR('U', 2, struct usbdevfs_bulktransfer) -#define USBDEVFS_BULK32 _IOWR('U', 2, struct usbdevfs_bulktransfer32) -#define USBDEVFS_RESETEP _IOR('U', 3, unsigned int) -#define USBDEVFS_SETINTERFACE _IOR('U', 4, struct usbdevfs_setinterface) -#define USBDEVFS_SETCONFIGURATION _IOR('U', 5, unsigned int) -#define USBDEVFS_GETDRIVER _IOW('U', 8, struct usbdevfs_getdriver) -#define USBDEVFS_SUBMITURB _IOR('U', 10, struct usbdevfs_urb) -#define USBDEVFS_SUBMITURB32 _IOR('U', 10, struct usbdevfs_urb32) -#define USBDEVFS_DISCARDURB _IO('U', 11) -#define USBDEVFS_REAPURB _IOW('U', 12, void *) -#define USBDEVFS_REAPURB32 _IOW('U', 12, __u32) -#define USBDEVFS_REAPURBNDELAY _IOW('U', 13, void *) 
-#define USBDEVFS_REAPURBNDELAY32 _IOW('U', 13, __u32) -#define USBDEVFS_DISCSIGNAL _IOR('U', 14, struct usbdevfs_disconnectsignal) -#define USBDEVFS_DISCSIGNAL32 _IOR('U', 14, struct usbdevfs_disconnectsignal32) -#define USBDEVFS_CLAIMINTERFACE _IOR('U', 15, unsigned int) -#define USBDEVFS_RELEASEINTERFACE _IOR('U', 16, unsigned int) -#define USBDEVFS_CONNECTINFO _IOW('U', 17, struct usbdevfs_connectinfo) -#define USBDEVFS_IOCTL _IOWR('U', 18, struct usbdevfs_ioctl) -#define USBDEVFS_IOCTL32 _IOWR('U', 18, struct usbdevfs_ioctl32) -#define USBDEVFS_HUB_PORTINFO _IOR('U', 19, struct usbdevfs_hub_portinfo) -#define USBDEVFS_RESET _IO('U', 20) -#define USBDEVFS_CLEAR_HALT _IOR('U', 21, unsigned int) -#define USBDEVFS_DISCONNECT _IO('U', 22) -#define USBDEVFS_CONNECT _IO('U', 23) -#define USBDEVFS_CLAIM_PORT _IOR('U', 24, unsigned int) -#define USBDEVFS_RELEASE_PORT _IOR('U', 25, unsigned int) -#define USBDEVFS_GET_CAPABILITIES _IOR('U', 26, __u32) -#define USBDEVFS_DISCONNECT_CLAIM _IOR('U', 27, struct usbdevfs_disconnect_claim) -#define USBDEVFS_ALLOC_STREAMS _IOR('U', 28, struct usbdevfs_streams) -#define USBDEVFS_FREE_STREAMS _IOR('U', 29, struct usbdevfs_streams) -#define USBDEVFS_DROP_PRIVILEGES _IOW('U', 30, __u32) -#define USBDEVFS_GET_SPEED _IO('U', 31) -/* - * Returns struct usbdevfs_conninfo_ex; length is variable to allow - * extending size of the data returned. 
- */ -#define USBDEVFS_CONNINFO_EX(len) _IOC(_IOC_READ, 'U', 32, len) -#define USBDEVFS_FORBID_SUSPEND _IO('U', 33) -#define USBDEVFS_ALLOW_SUSPEND _IO('U', 34) -#define USBDEVFS_WAIT_FOR_RESUME _IO('U', 35) - -#endif /* _UAPI_LINUX_USBDEVICE_FS_H */ diff --git a/tools/include/uapi/linux/userfaultfd.h b/tools/include/uapi/linux/userfaultfd.h new file mode 100644 index 0000000000..4283de22d5 --- /dev/null +++ b/tools/include/uapi/linux/userfaultfd.h @@ -0,0 +1,386 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * include/linux/userfaultfd.h + * + * Copyright (C) 2007 Davide Libenzi + * Copyright (C) 2015 Red Hat, Inc. + * + */ + +#ifndef _LINUX_USERFAULTFD_H +#define _LINUX_USERFAULTFD_H + +#include + +/* ioctls for /dev/userfaultfd */ +#define USERFAULTFD_IOC 0xAA +#define USERFAULTFD_IOC_NEW _IO(USERFAULTFD_IOC, 0x00) + +/* + * If the UFFDIO_API is upgraded someday, the UFFDIO_UNREGISTER and + * UFFDIO_WAKE ioctls should be defined as _IOW and not as _IOR. In + * userfaultfd.h we assumed the kernel was reading (instead _IOC_READ + * means the userland is reading). 
+ */ +#define UFFD_API ((__u64)0xAA) +#define UFFD_API_REGISTER_MODES (UFFDIO_REGISTER_MODE_MISSING | \ + UFFDIO_REGISTER_MODE_WP | \ + UFFDIO_REGISTER_MODE_MINOR) +#define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \ + UFFD_FEATURE_EVENT_FORK | \ + UFFD_FEATURE_EVENT_REMAP | \ + UFFD_FEATURE_EVENT_REMOVE | \ + UFFD_FEATURE_EVENT_UNMAP | \ + UFFD_FEATURE_MISSING_HUGETLBFS | \ + UFFD_FEATURE_MISSING_SHMEM | \ + UFFD_FEATURE_SIGBUS | \ + UFFD_FEATURE_THREAD_ID | \ + UFFD_FEATURE_MINOR_HUGETLBFS | \ + UFFD_FEATURE_MINOR_SHMEM | \ + UFFD_FEATURE_EXACT_ADDRESS | \ + UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \ + UFFD_FEATURE_WP_UNPOPULATED | \ + UFFD_FEATURE_POISON | \ + UFFD_FEATURE_WP_ASYNC | \ + UFFD_FEATURE_MOVE) +#define UFFD_API_IOCTLS \ + ((__u64)1 << _UFFDIO_REGISTER | \ + (__u64)1 << _UFFDIO_UNREGISTER | \ + (__u64)1 << _UFFDIO_API) +#define UFFD_API_RANGE_IOCTLS \ + ((__u64)1 << _UFFDIO_WAKE | \ + (__u64)1 << _UFFDIO_COPY | \ + (__u64)1 << _UFFDIO_ZEROPAGE | \ + (__u64)1 << _UFFDIO_MOVE | \ + (__u64)1 << _UFFDIO_WRITEPROTECT | \ + (__u64)1 << _UFFDIO_CONTINUE | \ + (__u64)1 << _UFFDIO_POISON) +#define UFFD_API_RANGE_IOCTLS_BASIC \ + ((__u64)1 << _UFFDIO_WAKE | \ + (__u64)1 << _UFFDIO_COPY | \ + (__u64)1 << _UFFDIO_WRITEPROTECT | \ + (__u64)1 << _UFFDIO_CONTINUE | \ + (__u64)1 << _UFFDIO_POISON) + +/* + * Valid ioctl command number range with this API is from 0x00 to + * 0x3F. UFFDIO_API is the fixed number, everything else can be + * changed by implementing a different UFFD_API. If sticking to the + * same UFFD_API more ioctl can be added and userland will be aware of + * which ioctl the running kernel implements through the ioctl command + * bitmask written by the UFFDIO_API. 
+ */ +#define _UFFDIO_REGISTER (0x00) +#define _UFFDIO_UNREGISTER (0x01) +#define _UFFDIO_WAKE (0x02) +#define _UFFDIO_COPY (0x03) +#define _UFFDIO_ZEROPAGE (0x04) +#define _UFFDIO_MOVE (0x05) +#define _UFFDIO_WRITEPROTECT (0x06) +#define _UFFDIO_CONTINUE (0x07) +#define _UFFDIO_POISON (0x08) +#define _UFFDIO_API (0x3F) + +/* userfaultfd ioctl ids */ +#define UFFDIO 0xAA +#define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, \ + struct uffdio_api) +#define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, \ + struct uffdio_register) +#define UFFDIO_UNREGISTER _IOR(UFFDIO, _UFFDIO_UNREGISTER, \ + struct uffdio_range) +#define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, \ + struct uffdio_range) +#define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, \ + struct uffdio_copy) +#define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \ + struct uffdio_zeropage) +#define UFFDIO_MOVE _IOWR(UFFDIO, _UFFDIO_MOVE, \ + struct uffdio_move) +#define UFFDIO_WRITEPROTECT _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \ + struct uffdio_writeprotect) +#define UFFDIO_CONTINUE _IOWR(UFFDIO, _UFFDIO_CONTINUE, \ + struct uffdio_continue) +#define UFFDIO_POISON _IOWR(UFFDIO, _UFFDIO_POISON, \ + struct uffdio_poison) + +/* read() structure */ +struct uffd_msg { + __u8 event; + + __u8 reserved1; + __u16 reserved2; + __u32 reserved3; + + union { + struct { + __u64 flags; + __u64 address; + union { + __u32 ptid; + } feat; + } pagefault; + + struct { + __u32 ufd; + } fork; + + struct { + __u64 from; + __u64 to; + __u64 len; + } remap; + + struct { + __u64 start; + __u64 end; + } remove; + + struct { + /* unused reserved fields */ + __u64 reserved1; + __u64 reserved2; + __u64 reserved3; + } reserved; + } arg; +} __attribute__((packed)); + +/* + * Start at 0x12 and not at 0 to be more strict against bugs. 
+ */ +#define UFFD_EVENT_PAGEFAULT 0x12 +#define UFFD_EVENT_FORK 0x13 +#define UFFD_EVENT_REMAP 0x14 +#define UFFD_EVENT_REMOVE 0x15 +#define UFFD_EVENT_UNMAP 0x16 + +/* flags for UFFD_EVENT_PAGEFAULT */ +#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ +#define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */ +#define UFFD_PAGEFAULT_FLAG_MINOR (1<<2) /* If reason is VM_UFFD_MINOR */ + +struct uffdio_api { + /* userland asks for an API number and the features to enable */ + __u64 api; + /* + * Kernel answers below with the all available features for + * the API, this notifies userland of which events and/or + * which flags for each event are enabled in the current + * kernel. + * + * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE + * are to be considered implicitly always enabled in all kernels as + * long as the uffdio_api.api requested matches UFFD_API. + * + * UFFD_FEATURE_MISSING_HUGETLBFS means an UFFDIO_REGISTER + * with UFFDIO_REGISTER_MODE_MISSING mode will succeed on + * hugetlbfs virtual memory ranges. Adding or not adding + * UFFD_FEATURE_MISSING_HUGETLBFS to uffdio_api.features has + * no real functional effect after UFFDIO_API returns, but + * it's only useful for an initial feature set probe at + * UFFDIO_API time. There are two ways to use it: + * + * 1) by adding UFFD_FEATURE_MISSING_HUGETLBFS to the + * uffdio_api.features before calling UFFDIO_API, an error + * will be returned by UFFDIO_API on a kernel without + * hugetlbfs missing support + * + * 2) the UFFD_FEATURE_MISSING_HUGETLBFS can not be added in + * uffdio_api.features and instead it will be set by the + * kernel in the uffdio_api.features if the kernel supports + * it, so userland can later check if the feature flag is + * present in uffdio_api.features after UFFDIO_API + * succeeded. + * + * UFFD_FEATURE_MISSING_SHMEM works the same as + * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem + * (i.e. tmpfs and other shmem based APIs). 
+ * + * UFFD_FEATURE_SIGBUS feature means no page-fault + * (UFFD_EVENT_PAGEFAULT) event will be delivered, instead + * a SIGBUS signal will be sent to the faulting process. + * + * UFFD_FEATURE_THREAD_ID pid of the page faulted task_struct will + * be returned, if feature is not requested 0 will be returned. + * + * UFFD_FEATURE_MINOR_HUGETLBFS indicates that minor faults + * can be intercepted (via REGISTER_MODE_MINOR) for + * hugetlbfs-backed pages. + * + * UFFD_FEATURE_MINOR_SHMEM indicates the same support as + * UFFD_FEATURE_MINOR_HUGETLBFS, but for shmem-backed pages instead. + * + * UFFD_FEATURE_EXACT_ADDRESS indicates that the exact address of page + * faults would be provided and the offset within the page would not be + * masked. + * + * UFFD_FEATURE_WP_HUGETLBFS_SHMEM indicates that userfaultfd + * write-protection mode is supported on both shmem and hugetlbfs. + * + * UFFD_FEATURE_WP_UNPOPULATED indicates that userfaultfd + * write-protection mode will always apply to unpopulated pages + * (i.e. empty ptes). This will be the default behavior for shmem + * & hugetlbfs, so this flag only affects anonymous memory behavior + * when userfault write-protection mode is registered. + * + * UFFD_FEATURE_WP_ASYNC indicates that userfaultfd write-protection + * asynchronous mode is supported in which the write fault is + * automatically resolved and write-protection is un-set. + * It implies UFFD_FEATURE_WP_UNPOPULATED. + * + * UFFD_FEATURE_MOVE indicates that the kernel supports moving an + * existing page contents from userspace. 
+ */ +#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) +#define UFFD_FEATURE_EVENT_FORK (1<<1) +#define UFFD_FEATURE_EVENT_REMAP (1<<2) +#define UFFD_FEATURE_EVENT_REMOVE (1<<3) +#define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) +#define UFFD_FEATURE_MISSING_SHMEM (1<<5) +#define UFFD_FEATURE_EVENT_UNMAP (1<<6) +#define UFFD_FEATURE_SIGBUS (1<<7) +#define UFFD_FEATURE_THREAD_ID (1<<8) +#define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) +#define UFFD_FEATURE_MINOR_SHMEM (1<<10) +#define UFFD_FEATURE_EXACT_ADDRESS (1<<11) +#define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12) +#define UFFD_FEATURE_WP_UNPOPULATED (1<<13) +#define UFFD_FEATURE_POISON (1<<14) +#define UFFD_FEATURE_WP_ASYNC (1<<15) +#define UFFD_FEATURE_MOVE (1<<16) + __u64 features; + + __u64 ioctls; +}; + +struct uffdio_range { + __u64 start; + __u64 len; +}; + +struct uffdio_register { + struct uffdio_range range; +#define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0) +#define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1) +#define UFFDIO_REGISTER_MODE_MINOR ((__u64)1<<2) + __u64 mode; + + /* + * kernel answers which ioctl commands are available for the + * range, keep at the end as the last 8 bytes aren't read. + */ + __u64 ioctls; +}; + +struct uffdio_copy { + __u64 dst; + __u64 src; + __u64 len; +#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0) + /* + * UFFDIO_COPY_MODE_WP will map the page write protected on + * the fly. UFFDIO_COPY_MODE_WP is available only if the + * write protected ioctl is implemented for the range + * according to the uffdio_register.ioctls. + */ +#define UFFDIO_COPY_MODE_WP ((__u64)1<<1) + __u64 mode; + + /* + * "copy" is written by the ioctl and must be at the end: the + * copy_from_user will not read the last 8 bytes. + */ + __s64 copy; +}; + +struct uffdio_zeropage { + struct uffdio_range range; +#define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0) + __u64 mode; + + /* + * "zeropage" is written by the ioctl and must be at the end: + * the copy_from_user will not read the last 8 bytes. 
+ */ + __s64 zeropage; +}; + +struct uffdio_writeprotect { + struct uffdio_range range; +/* + * UFFDIO_WRITEPROTECT_MODE_WP: set the flag to write protect a range, + * unset the flag to undo protection of a range which was previously + * write protected. + * + * UFFDIO_WRITEPROTECT_MODE_DONTWAKE: set the flag to avoid waking up + * any wait thread after the operation succeeds. + * + * NOTE: Write protecting a region (WP=1) is unrelated to page faults, + * therefore DONTWAKE flag is meaningless with WP=1. Removing write + * protection (WP=0) in response to a page fault wakes the faulting + * task unless DONTWAKE is set. + */ +#define UFFDIO_WRITEPROTECT_MODE_WP ((__u64)1<<0) +#define UFFDIO_WRITEPROTECT_MODE_DONTWAKE ((__u64)1<<1) + __u64 mode; +}; + +struct uffdio_continue { + struct uffdio_range range; +#define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0) + /* + * UFFDIO_CONTINUE_MODE_WP will map the page write protected on + * the fly. UFFDIO_CONTINUE_MODE_WP is available only if the + * write protected ioctl is implemented for the range + * according to the uffdio_register.ioctls. + */ +#define UFFDIO_CONTINUE_MODE_WP ((__u64)1<<1) + __u64 mode; + + /* + * Fields below here are written by the ioctl and must be at the end: + * the copy_from_user will not read past here. + */ + __s64 mapped; +}; + +struct uffdio_poison { + struct uffdio_range range; +#define UFFDIO_POISON_MODE_DONTWAKE ((__u64)1<<0) + __u64 mode; + + /* + * Fields below here are written by the ioctl and must be at the end: + * the copy_from_user will not read past here. + */ + __s64 updated; +}; + +struct uffdio_move { + __u64 dst; + __u64 src; + __u64 len; + /* + * Especially if used to atomically remove memory from the + * address space the wake on the dst range is not needed. 
+ */ +#define UFFDIO_MOVE_MODE_DONTWAKE ((__u64)1<<0) +#define UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES ((__u64)1<<1) + __u64 mode; + /* + * "move" is written by the ioctl and must be at the end: the + * copy_from_user will not read the last 8 bytes. + */ + __s64 move; +}; + +/* + * Flags for the userfaultfd(2) system call itself. + */ + +/* + * Create a userfaultfd that can handle page faults only in user mode. + */ +#define UFFD_USER_MODE_ONLY 1 + +#endif /* _LINUX_USERFAULTFD_H */ diff --git a/tools/include/uapi/linux/vhost.h b/tools/include/uapi/linux/vhost.h deleted file mode 100644 index 649560c685..0000000000 --- a/tools/include/uapi/linux/vhost.h +++ /dev/null @@ -1,230 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _LINUX_VHOST_H -#define _LINUX_VHOST_H -/* Userspace interface for in-kernel virtio accelerators. */ - -/* vhost is used to reduce the number of system calls involved in virtio. - * - * Existing virtio net code is used in the guest without modification. - * - * This header includes interface used by userspace hypervisor for - * device configuration. - */ - -#include -#include -#include - -#define VHOST_FILE_UNBIND -1 - -/* ioctls */ - -#define VHOST_VIRTIO 0xAF - -/* Features bitmask for forward compatibility. Transport bits are used for - * vhost specific features. */ -#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64) -#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64) - -/* Set current process as the (exclusive) owner of this file descriptor. This - * must be called before any other vhost command. Further calls to - * VHOST_OWNER_SET fail until VHOST_OWNER_RESET is called. */ -#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01) -/* Give up ownership, and reset the device to default values. - * Allows subsequent call to VHOST_OWNER_SET to succeed. 
*/ -#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02) - -/* Set up/modify memory layout */ -#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory) - -/* Write logging setup. */ -/* Memory writes can optionally be logged by setting bit at an offset - * (calculated from the physical address) from specified log base. - * The bit is set using an atomic 32 bit operation. */ -/* Set base address for logging. */ -#define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64) -/* Specify an eventfd file descriptor to signal on log write. */ -#define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int) -/* By default, a device gets one vhost_worker that its virtqueues share. This - * command allows the owner of the device to create an additional vhost_worker - * for the device. It can later be bound to 1 or more of its virtqueues using - * the VHOST_ATTACH_VRING_WORKER command. - * - * This must be called after VHOST_SET_OWNER and the caller must be the owner - * of the device. The new thread will inherit caller's cgroups and namespaces, - * and will share the caller's memory space. The new thread will also be - * counted against the caller's RLIMIT_NPROC value. - * - * The worker's ID used in other commands will be returned in - * vhost_worker_state. - */ -#define VHOST_NEW_WORKER _IOR(VHOST_VIRTIO, 0x8, struct vhost_worker_state) -/* Free a worker created with VHOST_NEW_WORKER if it's not attached to any - * virtqueue. If userspace is not able to call this for workers its created, - * the kernel will free all the device's workers when the device is closed. - */ -#define VHOST_FREE_WORKER _IOW(VHOST_VIRTIO, 0x9, struct vhost_worker_state) - -/* Ring setup. */ -/* Set number of descriptors in ring. This parameter can not - * be modified while ring is running (bound to a device). */ -#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state) -/* Set addresses for the ring. 
*/ -#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr) -/* Base value where queue looks for available descriptors */ -#define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_state) -/* Get accessor: reads index, writes value in num */ -#define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x12, struct vhost_vring_state) - -/* Set the vring byte order in num. Valid values are VHOST_VRING_LITTLE_ENDIAN - * or VHOST_VRING_BIG_ENDIAN (other values return -EINVAL). - * The byte order cannot be changed while the device is active: trying to do so - * returns -EBUSY. - * This is a legacy only API that is simply ignored when VIRTIO_F_VERSION_1 is - * set. - * Not all kernel configurations support this ioctl, but all configurations that - * support SET also support GET. - */ -#define VHOST_VRING_LITTLE_ENDIAN 0 -#define VHOST_VRING_BIG_ENDIAN 1 -#define VHOST_SET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_state) -#define VHOST_GET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state) -/* Attach a vhost_worker created with VHOST_NEW_WORKER to one of the device's - * virtqueues. - * - * This will replace the virtqueue's existing worker. If the replaced worker - * is no longer attached to any virtqueues, it can be freed with - * VHOST_FREE_WORKER. - */ -#define VHOST_ATTACH_VRING_WORKER _IOW(VHOST_VIRTIO, 0x15, \ - struct vhost_vring_worker) -/* Return the vring worker's ID */ -#define VHOST_GET_VRING_WORKER _IOWR(VHOST_VIRTIO, 0x16, \ - struct vhost_vring_worker) - -/* The following ioctls use eventfd file descriptors to signal and poll - * for events. 
*/ - -/* Set eventfd to poll for added buffers */ -#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file) -/* Set eventfd to signal when buffers have beed used */ -#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file) -/* Set eventfd to signal an error */ -#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file) -/* Set busy loop timeout (in us) */ -#define VHOST_SET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x23, \ - struct vhost_vring_state) -/* Get busy loop timeout (in us) */ -#define VHOST_GET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x24, \ - struct vhost_vring_state) - -/* Set or get vhost backend capability */ - -#define VHOST_SET_BACKEND_FEATURES _IOW(VHOST_VIRTIO, 0x25, __u64) -#define VHOST_GET_BACKEND_FEATURES _IOR(VHOST_VIRTIO, 0x26, __u64) - -/* VHOST_NET specific defines */ - -/* Attach virtio net ring to a raw socket, or tap device. - * The socket must be already bound to an ethernet device, this device will be - * used for transmit. Pass fd -1 to unbind from the socket and the transmit - * device. This can be used to stop the ring (e.g. for migration). */ -#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file) - -/* VHOST_SCSI specific defines */ - -#define VHOST_SCSI_SET_ENDPOINT _IOW(VHOST_VIRTIO, 0x40, struct vhost_scsi_target) -#define VHOST_SCSI_CLEAR_ENDPOINT _IOW(VHOST_VIRTIO, 0x41, struct vhost_scsi_target) -/* Changing this breaks userspace. */ -#define VHOST_SCSI_GET_ABI_VERSION _IOW(VHOST_VIRTIO, 0x42, int) -/* Set and get the events missed flag */ -#define VHOST_SCSI_SET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x43, __u32) -#define VHOST_SCSI_GET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x44, __u32) - -/* VHOST_VSOCK specific defines */ - -#define VHOST_VSOCK_SET_GUEST_CID _IOW(VHOST_VIRTIO, 0x60, __u64) -#define VHOST_VSOCK_SET_RUNNING _IOW(VHOST_VIRTIO, 0x61, int) - -/* VHOST_VDPA specific defines */ - -/* Get the device id. 
The device ids follow the same definition of - * the device id defined in virtio-spec. - */ -#define VHOST_VDPA_GET_DEVICE_ID _IOR(VHOST_VIRTIO, 0x70, __u32) -/* Get and set the status. The status bits follow the same definition - * of the device status defined in virtio-spec. - */ -#define VHOST_VDPA_GET_STATUS _IOR(VHOST_VIRTIO, 0x71, __u8) -#define VHOST_VDPA_SET_STATUS _IOW(VHOST_VIRTIO, 0x72, __u8) -/* Get and set the device config. The device config follows the same - * definition of the device config defined in virtio-spec. - */ -#define VHOST_VDPA_GET_CONFIG _IOR(VHOST_VIRTIO, 0x73, \ - struct vhost_vdpa_config) -#define VHOST_VDPA_SET_CONFIG _IOW(VHOST_VIRTIO, 0x74, \ - struct vhost_vdpa_config) -/* Enable/disable the ring. */ -#define VHOST_VDPA_SET_VRING_ENABLE _IOW(VHOST_VIRTIO, 0x75, \ - struct vhost_vring_state) -/* Get the max ring size. */ -#define VHOST_VDPA_GET_VRING_NUM _IOR(VHOST_VIRTIO, 0x76, __u16) - -/* Set event fd for config interrupt*/ -#define VHOST_VDPA_SET_CONFIG_CALL _IOW(VHOST_VIRTIO, 0x77, int) - -/* Get the valid iova range */ -#define VHOST_VDPA_GET_IOVA_RANGE _IOR(VHOST_VIRTIO, 0x78, \ - struct vhost_vdpa_iova_range) -/* Get the config size */ -#define VHOST_VDPA_GET_CONFIG_SIZE _IOR(VHOST_VIRTIO, 0x79, __u32) - -/* Get the count of all virtqueues */ -#define VHOST_VDPA_GET_VQS_COUNT _IOR(VHOST_VIRTIO, 0x80, __u32) - -/* Get the number of virtqueue groups. */ -#define VHOST_VDPA_GET_GROUP_NUM _IOR(VHOST_VIRTIO, 0x81, __u32) - -/* Get the number of address spaces. */ -#define VHOST_VDPA_GET_AS_NUM _IOR(VHOST_VIRTIO, 0x7A, unsigned int) - -/* Get the group for a virtqueue: read index, write group in num, - * The virtqueue index is stored in the index field of - * vhost_vring_state. The group for this specific virtqueue is - * returned via num field of vhost_vring_state. - */ -#define VHOST_VDPA_GET_VRING_GROUP _IOWR(VHOST_VIRTIO, 0x7B, \ - struct vhost_vring_state) -/* Set the ASID for a virtqueue group. 
The group index is stored in - * the index field of vhost_vring_state, the ASID associated with this - * group is stored at num field of vhost_vring_state. - */ -#define VHOST_VDPA_SET_GROUP_ASID _IOW(VHOST_VIRTIO, 0x7C, \ - struct vhost_vring_state) - -/* Suspend a device so it does not process virtqueue requests anymore - * - * After the return of ioctl the device must preserve all the necessary state - * (the virtqueue vring base plus the possible device specific states) that is - * required for restoring in the future. The device must not change its - * configuration after that point. - */ -#define VHOST_VDPA_SUSPEND _IO(VHOST_VIRTIO, 0x7D) - -/* Resume a device so it can resume processing virtqueue requests - * - * After the return of this ioctl the device will have restored all the - * necessary states and it is fully operational to continue processing the - * virtqueue descriptors. - */ -#define VHOST_VDPA_RESUME _IO(VHOST_VIRTIO, 0x7E) - -/* Get the group for the descriptor table including driver & device areas - * of a virtqueue: read index, write group in num. - * The virtqueue index is stored in the index field of vhost_vring_state. - * The group ID of the descriptor table for this specific virtqueue - * is returned via num field of vhost_vring_state. 
- */ -#define VHOST_VDPA_GET_VRING_DESC_GROUP _IOWR(VHOST_VIRTIO, 0x7F, \ - struct vhost_vring_state) -#endif diff --git a/tools/include/uapi/sound/asound.h b/tools/include/uapi/sound/asound.h deleted file mode 100644 index 628d46a0da..0000000000 --- a/tools/include/uapi/sound/asound.h +++ /dev/null @@ -1,1252 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ -/* - * Advanced Linux Sound Architecture - ALSA - Driver - * Copyright (c) 1994-2003 by Jaroslav Kysela , - * Abramo Bagnara - */ - -#ifndef _UAPI__SOUND_ASOUND_H -#define _UAPI__SOUND_ASOUND_H - -#if defined(__KERNEL__) || defined(__linux__) -#include -#include -#else -#include -#include -#endif - -#ifndef __KERNEL__ -#include -#include -#endif - -/* - * protocol version - */ - -#define SNDRV_PROTOCOL_VERSION(major, minor, subminor) (((major)<<16)|((minor)<<8)|(subminor)) -#define SNDRV_PROTOCOL_MAJOR(version) (((version)>>16)&0xffff) -#define SNDRV_PROTOCOL_MINOR(version) (((version)>>8)&0xff) -#define SNDRV_PROTOCOL_MICRO(version) ((version)&0xff) -#define SNDRV_PROTOCOL_INCOMPATIBLE(kversion, uversion) \ - (SNDRV_PROTOCOL_MAJOR(kversion) != SNDRV_PROTOCOL_MAJOR(uversion) || \ - (SNDRV_PROTOCOL_MAJOR(kversion) == SNDRV_PROTOCOL_MAJOR(uversion) && \ - SNDRV_PROTOCOL_MINOR(kversion) != SNDRV_PROTOCOL_MINOR(uversion))) - -/**************************************************************************** - * * - * Digital audio interface * - * * - ****************************************************************************/ - -#define AES_IEC958_STATUS_SIZE 24 - -struct snd_aes_iec958 { - unsigned char status[AES_IEC958_STATUS_SIZE]; /* AES/IEC958 channel status bits */ - unsigned char subcode[147]; /* AES/IEC958 subcode bits */ - unsigned char pad; /* nothing */ - unsigned char dig_subframe[4]; /* AES/IEC958 subframe bits */ -}; - -/**************************************************************************** - * * - * CEA-861 Audio InfoFrame. 
Used in HDMI and DisplayPort * - * * - ****************************************************************************/ - -struct snd_cea_861_aud_if { - unsigned char db1_ct_cc; /* coding type and channel count */ - unsigned char db2_sf_ss; /* sample frequency and size */ - unsigned char db3; /* not used, all zeros */ - unsigned char db4_ca; /* channel allocation code */ - unsigned char db5_dminh_lsv; /* downmix inhibit & level-shit values */ -}; - -/**************************************************************************** - * * - * Section for driver hardware dependent interface - /dev/snd/hw? * - * * - ****************************************************************************/ - -#define SNDRV_HWDEP_VERSION SNDRV_PROTOCOL_VERSION(1, 0, 1) - -enum { - SNDRV_HWDEP_IFACE_OPL2 = 0, - SNDRV_HWDEP_IFACE_OPL3, - SNDRV_HWDEP_IFACE_OPL4, - SNDRV_HWDEP_IFACE_SB16CSP, /* Creative Signal Processor */ - SNDRV_HWDEP_IFACE_EMU10K1, /* FX8010 processor in EMU10K1 chip */ - SNDRV_HWDEP_IFACE_YSS225, /* Yamaha FX processor */ - SNDRV_HWDEP_IFACE_ICS2115, /* Wavetable synth */ - SNDRV_HWDEP_IFACE_SSCAPE, /* Ensoniq SoundScape ISA card (MC68EC000) */ - SNDRV_HWDEP_IFACE_VX, /* Digigram VX cards */ - SNDRV_HWDEP_IFACE_MIXART, /* Digigram miXart cards */ - SNDRV_HWDEP_IFACE_USX2Y, /* Tascam US122, US224 & US428 usb */ - SNDRV_HWDEP_IFACE_EMUX_WAVETABLE, /* EmuX wavetable */ - SNDRV_HWDEP_IFACE_BLUETOOTH, /* Bluetooth audio */ - SNDRV_HWDEP_IFACE_USX2Y_PCM, /* Tascam US122, US224 & US428 rawusb pcm */ - SNDRV_HWDEP_IFACE_PCXHR, /* Digigram PCXHR */ - SNDRV_HWDEP_IFACE_SB_RC, /* SB Extigy/Audigy2NX remote control */ - SNDRV_HWDEP_IFACE_HDA, /* HD-audio */ - SNDRV_HWDEP_IFACE_USB_STREAM, /* direct access to usb stream */ - SNDRV_HWDEP_IFACE_FW_DICE, /* TC DICE FireWire device */ - SNDRV_HWDEP_IFACE_FW_FIREWORKS, /* Echo Audio Fireworks based device */ - SNDRV_HWDEP_IFACE_FW_BEBOB, /* BridgeCo BeBoB based device */ - SNDRV_HWDEP_IFACE_FW_OXFW, /* Oxford OXFW970/971 based device */ - 
SNDRV_HWDEP_IFACE_FW_DIGI00X, /* Digidesign Digi 002/003 family */ - SNDRV_HWDEP_IFACE_FW_TASCAM, /* TASCAM FireWire series */ - SNDRV_HWDEP_IFACE_LINE6, /* Line6 USB processors */ - SNDRV_HWDEP_IFACE_FW_MOTU, /* MOTU FireWire series */ - SNDRV_HWDEP_IFACE_FW_FIREFACE, /* RME Fireface series */ - - /* Don't forget to change the following: */ - SNDRV_HWDEP_IFACE_LAST = SNDRV_HWDEP_IFACE_FW_FIREFACE -}; - -struct snd_hwdep_info { - unsigned int device; /* WR: device number */ - int card; /* R: card number */ - unsigned char id[64]; /* ID (user selectable) */ - unsigned char name[80]; /* hwdep name */ - int iface; /* hwdep interface */ - unsigned char reserved[64]; /* reserved for future */ -}; - -/* generic DSP loader */ -struct snd_hwdep_dsp_status { - unsigned int version; /* R: driver-specific version */ - unsigned char id[32]; /* R: driver-specific ID string */ - unsigned int num_dsps; /* R: number of DSP images to transfer */ - unsigned int dsp_loaded; /* R: bit flags indicating the loaded DSPs */ - unsigned int chip_ready; /* R: 1 = initialization finished */ - unsigned char reserved[16]; /* reserved for future use */ -}; - -struct snd_hwdep_dsp_image { - unsigned int index; /* W: DSP index */ - unsigned char name[64]; /* W: ID (e.g. file name) */ - unsigned char __user *image; /* W: binary image */ - size_t length; /* W: size of image in bytes */ - unsigned long driver_data; /* W: driver-specific data */ -}; - -#define SNDRV_HWDEP_IOCTL_PVERSION _IOR ('H', 0x00, int) -#define SNDRV_HWDEP_IOCTL_INFO _IOR ('H', 0x01, struct snd_hwdep_info) -#define SNDRV_HWDEP_IOCTL_DSP_STATUS _IOR('H', 0x02, struct snd_hwdep_dsp_status) -#define SNDRV_HWDEP_IOCTL_DSP_LOAD _IOW('H', 0x03, struct snd_hwdep_dsp_image) - -/***************************************************************************** - * * - * Digital Audio (PCM) interface - /dev/snd/pcm?? 
* - * * - *****************************************************************************/ - -#define SNDRV_PCM_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 17) - -typedef unsigned long snd_pcm_uframes_t; -typedef signed long snd_pcm_sframes_t; - -enum { - SNDRV_PCM_CLASS_GENERIC = 0, /* standard mono or stereo device */ - SNDRV_PCM_CLASS_MULTI, /* multichannel device */ - SNDRV_PCM_CLASS_MODEM, /* software modem class */ - SNDRV_PCM_CLASS_DIGITIZER, /* digitizer class */ - /* Don't forget to change the following: */ - SNDRV_PCM_CLASS_LAST = SNDRV_PCM_CLASS_DIGITIZER, -}; - -enum { - SNDRV_PCM_SUBCLASS_GENERIC_MIX = 0, /* mono or stereo subdevices are mixed together */ - SNDRV_PCM_SUBCLASS_MULTI_MIX, /* multichannel subdevices are mixed together */ - /* Don't forget to change the following: */ - SNDRV_PCM_SUBCLASS_LAST = SNDRV_PCM_SUBCLASS_MULTI_MIX, -}; - -enum { - SNDRV_PCM_STREAM_PLAYBACK = 0, - SNDRV_PCM_STREAM_CAPTURE, - SNDRV_PCM_STREAM_LAST = SNDRV_PCM_STREAM_CAPTURE, -}; - -typedef int __bitwise snd_pcm_access_t; -#define SNDRV_PCM_ACCESS_MMAP_INTERLEAVED ((__force snd_pcm_access_t) 0) /* interleaved mmap */ -#define SNDRV_PCM_ACCESS_MMAP_NONINTERLEAVED ((__force snd_pcm_access_t) 1) /* noninterleaved mmap */ -#define SNDRV_PCM_ACCESS_MMAP_COMPLEX ((__force snd_pcm_access_t) 2) /* complex mmap */ -#define SNDRV_PCM_ACCESS_RW_INTERLEAVED ((__force snd_pcm_access_t) 3) /* readi/writei */ -#define SNDRV_PCM_ACCESS_RW_NONINTERLEAVED ((__force snd_pcm_access_t) 4) /* readn/writen */ -#define SNDRV_PCM_ACCESS_LAST SNDRV_PCM_ACCESS_RW_NONINTERLEAVED - -typedef int __bitwise snd_pcm_format_t; -#define SNDRV_PCM_FORMAT_S8 ((__force snd_pcm_format_t) 0) -#define SNDRV_PCM_FORMAT_U8 ((__force snd_pcm_format_t) 1) -#define SNDRV_PCM_FORMAT_S16_LE ((__force snd_pcm_format_t) 2) -#define SNDRV_PCM_FORMAT_S16_BE ((__force snd_pcm_format_t) 3) -#define SNDRV_PCM_FORMAT_U16_LE ((__force snd_pcm_format_t) 4) -#define SNDRV_PCM_FORMAT_U16_BE ((__force snd_pcm_format_t) 5) -#define 
SNDRV_PCM_FORMAT_S24_LE ((__force snd_pcm_format_t) 6) /* low three bytes */ -#define SNDRV_PCM_FORMAT_S24_BE ((__force snd_pcm_format_t) 7) /* low three bytes */ -#define SNDRV_PCM_FORMAT_U24_LE ((__force snd_pcm_format_t) 8) /* low three bytes */ -#define SNDRV_PCM_FORMAT_U24_BE ((__force snd_pcm_format_t) 9) /* low three bytes */ -/* - * For S32/U32 formats, 'msbits' hardware parameter is often used to deliver information about the - * available bit count in most significant bit. It's for the case of so-called 'left-justified' or - * `right-padding` sample which has less width than 32 bit. - */ -#define SNDRV_PCM_FORMAT_S32_LE ((__force snd_pcm_format_t) 10) -#define SNDRV_PCM_FORMAT_S32_BE ((__force snd_pcm_format_t) 11) -#define SNDRV_PCM_FORMAT_U32_LE ((__force snd_pcm_format_t) 12) -#define SNDRV_PCM_FORMAT_U32_BE ((__force snd_pcm_format_t) 13) -#define SNDRV_PCM_FORMAT_FLOAT_LE ((__force snd_pcm_format_t) 14) /* 4-byte float, IEEE-754 32-bit, range -1.0 to 1.0 */ -#define SNDRV_PCM_FORMAT_FLOAT_BE ((__force snd_pcm_format_t) 15) /* 4-byte float, IEEE-754 32-bit, range -1.0 to 1.0 */ -#define SNDRV_PCM_FORMAT_FLOAT64_LE ((__force snd_pcm_format_t) 16) /* 8-byte float, IEEE-754 64-bit, range -1.0 to 1.0 */ -#define SNDRV_PCM_FORMAT_FLOAT64_BE ((__force snd_pcm_format_t) 17) /* 8-byte float, IEEE-754 64-bit, range -1.0 to 1.0 */ -#define SNDRV_PCM_FORMAT_IEC958_SUBFRAME_LE ((__force snd_pcm_format_t) 18) /* IEC-958 subframe, Little Endian */ -#define SNDRV_PCM_FORMAT_IEC958_SUBFRAME_BE ((__force snd_pcm_format_t) 19) /* IEC-958 subframe, Big Endian */ -#define SNDRV_PCM_FORMAT_MU_LAW ((__force snd_pcm_format_t) 20) -#define SNDRV_PCM_FORMAT_A_LAW ((__force snd_pcm_format_t) 21) -#define SNDRV_PCM_FORMAT_IMA_ADPCM ((__force snd_pcm_format_t) 22) -#define SNDRV_PCM_FORMAT_MPEG ((__force snd_pcm_format_t) 23) -#define SNDRV_PCM_FORMAT_GSM ((__force snd_pcm_format_t) 24) -#define SNDRV_PCM_FORMAT_S20_LE ((__force snd_pcm_format_t) 25) /* in four bytes, LSB 
justified */ -#define SNDRV_PCM_FORMAT_S20_BE ((__force snd_pcm_format_t) 26) /* in four bytes, LSB justified */ -#define SNDRV_PCM_FORMAT_U20_LE ((__force snd_pcm_format_t) 27) /* in four bytes, LSB justified */ -#define SNDRV_PCM_FORMAT_U20_BE ((__force snd_pcm_format_t) 28) /* in four bytes, LSB justified */ -/* gap in the numbering for a future standard linear format */ -#define SNDRV_PCM_FORMAT_SPECIAL ((__force snd_pcm_format_t) 31) -#define SNDRV_PCM_FORMAT_S24_3LE ((__force snd_pcm_format_t) 32) /* in three bytes */ -#define SNDRV_PCM_FORMAT_S24_3BE ((__force snd_pcm_format_t) 33) /* in three bytes */ -#define SNDRV_PCM_FORMAT_U24_3LE ((__force snd_pcm_format_t) 34) /* in three bytes */ -#define SNDRV_PCM_FORMAT_U24_3BE ((__force snd_pcm_format_t) 35) /* in three bytes */ -#define SNDRV_PCM_FORMAT_S20_3LE ((__force snd_pcm_format_t) 36) /* in three bytes */ -#define SNDRV_PCM_FORMAT_S20_3BE ((__force snd_pcm_format_t) 37) /* in three bytes */ -#define SNDRV_PCM_FORMAT_U20_3LE ((__force snd_pcm_format_t) 38) /* in three bytes */ -#define SNDRV_PCM_FORMAT_U20_3BE ((__force snd_pcm_format_t) 39) /* in three bytes */ -#define SNDRV_PCM_FORMAT_S18_3LE ((__force snd_pcm_format_t) 40) /* in three bytes */ -#define SNDRV_PCM_FORMAT_S18_3BE ((__force snd_pcm_format_t) 41) /* in three bytes */ -#define SNDRV_PCM_FORMAT_U18_3LE ((__force snd_pcm_format_t) 42) /* in three bytes */ -#define SNDRV_PCM_FORMAT_U18_3BE ((__force snd_pcm_format_t) 43) /* in three bytes */ -#define SNDRV_PCM_FORMAT_G723_24 ((__force snd_pcm_format_t) 44) /* 8 samples in 3 bytes */ -#define SNDRV_PCM_FORMAT_G723_24_1B ((__force snd_pcm_format_t) 45) /* 1 sample in 1 byte */ -#define SNDRV_PCM_FORMAT_G723_40 ((__force snd_pcm_format_t) 46) /* 8 Samples in 5 bytes */ -#define SNDRV_PCM_FORMAT_G723_40_1B ((__force snd_pcm_format_t) 47) /* 1 sample in 1 byte */ -#define SNDRV_PCM_FORMAT_DSD_U8 ((__force snd_pcm_format_t) 48) /* DSD, 1-byte samples DSD (x8) */ -#define SNDRV_PCM_FORMAT_DSD_U16_LE 
((__force snd_pcm_format_t) 49) /* DSD, 2-byte samples DSD (x16), little endian */ -#define SNDRV_PCM_FORMAT_DSD_U32_LE ((__force snd_pcm_format_t) 50) /* DSD, 4-byte samples DSD (x32), little endian */ -#define SNDRV_PCM_FORMAT_DSD_U16_BE ((__force snd_pcm_format_t) 51) /* DSD, 2-byte samples DSD (x16), big endian */ -#define SNDRV_PCM_FORMAT_DSD_U32_BE ((__force snd_pcm_format_t) 52) /* DSD, 4-byte samples DSD (x32), big endian */ -#define SNDRV_PCM_FORMAT_LAST SNDRV_PCM_FORMAT_DSD_U32_BE -#define SNDRV_PCM_FORMAT_FIRST SNDRV_PCM_FORMAT_S8 - -#ifdef SNDRV_LITTLE_ENDIAN -#define SNDRV_PCM_FORMAT_S16 SNDRV_PCM_FORMAT_S16_LE -#define SNDRV_PCM_FORMAT_U16 SNDRV_PCM_FORMAT_U16_LE -#define SNDRV_PCM_FORMAT_S24 SNDRV_PCM_FORMAT_S24_LE -#define SNDRV_PCM_FORMAT_U24 SNDRV_PCM_FORMAT_U24_LE -#define SNDRV_PCM_FORMAT_S32 SNDRV_PCM_FORMAT_S32_LE -#define SNDRV_PCM_FORMAT_U32 SNDRV_PCM_FORMAT_U32_LE -#define SNDRV_PCM_FORMAT_FLOAT SNDRV_PCM_FORMAT_FLOAT_LE -#define SNDRV_PCM_FORMAT_FLOAT64 SNDRV_PCM_FORMAT_FLOAT64_LE -#define SNDRV_PCM_FORMAT_IEC958_SUBFRAME SNDRV_PCM_FORMAT_IEC958_SUBFRAME_LE -#define SNDRV_PCM_FORMAT_S20 SNDRV_PCM_FORMAT_S20_LE -#define SNDRV_PCM_FORMAT_U20 SNDRV_PCM_FORMAT_U20_LE -#endif -#ifdef SNDRV_BIG_ENDIAN -#define SNDRV_PCM_FORMAT_S16 SNDRV_PCM_FORMAT_S16_BE -#define SNDRV_PCM_FORMAT_U16 SNDRV_PCM_FORMAT_U16_BE -#define SNDRV_PCM_FORMAT_S24 SNDRV_PCM_FORMAT_S24_BE -#define SNDRV_PCM_FORMAT_U24 SNDRV_PCM_FORMAT_U24_BE -#define SNDRV_PCM_FORMAT_S32 SNDRV_PCM_FORMAT_S32_BE -#define SNDRV_PCM_FORMAT_U32 SNDRV_PCM_FORMAT_U32_BE -#define SNDRV_PCM_FORMAT_FLOAT SNDRV_PCM_FORMAT_FLOAT_BE -#define SNDRV_PCM_FORMAT_FLOAT64 SNDRV_PCM_FORMAT_FLOAT64_BE -#define SNDRV_PCM_FORMAT_IEC958_SUBFRAME SNDRV_PCM_FORMAT_IEC958_SUBFRAME_BE -#define SNDRV_PCM_FORMAT_S20 SNDRV_PCM_FORMAT_S20_BE -#define SNDRV_PCM_FORMAT_U20 SNDRV_PCM_FORMAT_U20_BE -#endif - -typedef int __bitwise snd_pcm_subformat_t; -#define SNDRV_PCM_SUBFORMAT_STD ((__force snd_pcm_subformat_t) 0) 
-#define SNDRV_PCM_SUBFORMAT_MSBITS_MAX ((__force snd_pcm_subformat_t) 1) -#define SNDRV_PCM_SUBFORMAT_MSBITS_20 ((__force snd_pcm_subformat_t) 2) -#define SNDRV_PCM_SUBFORMAT_MSBITS_24 ((__force snd_pcm_subformat_t) 3) -#define SNDRV_PCM_SUBFORMAT_LAST SNDRV_PCM_SUBFORMAT_MSBITS_24 - -#define SNDRV_PCM_INFO_MMAP 0x00000001 /* hardware supports mmap */ -#define SNDRV_PCM_INFO_MMAP_VALID 0x00000002 /* period data are valid during transfer */ -#define SNDRV_PCM_INFO_DOUBLE 0x00000004 /* Double buffering needed for PCM start/stop */ -#define SNDRV_PCM_INFO_BATCH 0x00000010 /* double buffering */ -#define SNDRV_PCM_INFO_SYNC_APPLPTR 0x00000020 /* need the explicit sync of appl_ptr update */ -#define SNDRV_PCM_INFO_PERFECT_DRAIN 0x00000040 /* silencing at the end of stream is not required */ -#define SNDRV_PCM_INFO_INTERLEAVED 0x00000100 /* channels are interleaved */ -#define SNDRV_PCM_INFO_NONINTERLEAVED 0x00000200 /* channels are not interleaved */ -#define SNDRV_PCM_INFO_COMPLEX 0x00000400 /* complex frame organization (mmap only) */ -#define SNDRV_PCM_INFO_BLOCK_TRANSFER 0x00010000 /* hardware transfer block of samples */ -#define SNDRV_PCM_INFO_OVERRANGE 0x00020000 /* hardware supports ADC (capture) overrange detection */ -#define SNDRV_PCM_INFO_RESUME 0x00040000 /* hardware supports stream resume after suspend */ -#define SNDRV_PCM_INFO_PAUSE 0x00080000 /* pause ioctl is supported */ -#define SNDRV_PCM_INFO_HALF_DUPLEX 0x00100000 /* only half duplex */ -#define SNDRV_PCM_INFO_JOINT_DUPLEX 0x00200000 /* playback and capture stream are somewhat correlated */ -#define SNDRV_PCM_INFO_SYNC_START 0x00400000 /* pcm support some kind of sync go */ -#define SNDRV_PCM_INFO_NO_PERIOD_WAKEUP 0x00800000 /* period wakeup can be disabled */ -#define SNDRV_PCM_INFO_HAS_WALL_CLOCK 0x01000000 /* (Deprecated)has audio wall clock for audio/system time sync */ -#define SNDRV_PCM_INFO_HAS_LINK_ATIME 0x01000000 /* report hardware link audio time, reset on startup */ -#define 
SNDRV_PCM_INFO_HAS_LINK_ABSOLUTE_ATIME 0x02000000 /* report absolute hardware link audio time, not reset on startup */ -#define SNDRV_PCM_INFO_HAS_LINK_ESTIMATED_ATIME 0x04000000 /* report estimated link audio time */ -#define SNDRV_PCM_INFO_HAS_LINK_SYNCHRONIZED_ATIME 0x08000000 /* report synchronized audio/system time */ -#define SNDRV_PCM_INFO_EXPLICIT_SYNC 0x10000000 /* needs explicit sync of pointers and data */ -#define SNDRV_PCM_INFO_NO_REWINDS 0x20000000 /* hardware can only support monotonic changes of appl_ptr */ -#define SNDRV_PCM_INFO_DRAIN_TRIGGER 0x40000000 /* internal kernel flag - trigger in drain */ -#define SNDRV_PCM_INFO_FIFO_IN_FRAMES 0x80000000 /* internal kernel flag - FIFO size is in frames */ - -#if (__BITS_PER_LONG == 32 && defined(__USE_TIME_BITS64)) || defined __KERNEL__ -#define __SND_STRUCT_TIME64 -#endif - -typedef int __bitwise snd_pcm_state_t; -#define SNDRV_PCM_STATE_OPEN ((__force snd_pcm_state_t) 0) /* stream is open */ -#define SNDRV_PCM_STATE_SETUP ((__force snd_pcm_state_t) 1) /* stream has a setup */ -#define SNDRV_PCM_STATE_PREPARED ((__force snd_pcm_state_t) 2) /* stream is ready to start */ -#define SNDRV_PCM_STATE_RUNNING ((__force snd_pcm_state_t) 3) /* stream is running */ -#define SNDRV_PCM_STATE_XRUN ((__force snd_pcm_state_t) 4) /* stream reached an xrun */ -#define SNDRV_PCM_STATE_DRAINING ((__force snd_pcm_state_t) 5) /* stream is draining */ -#define SNDRV_PCM_STATE_PAUSED ((__force snd_pcm_state_t) 6) /* stream is paused */ -#define SNDRV_PCM_STATE_SUSPENDED ((__force snd_pcm_state_t) 7) /* hardware is suspended */ -#define SNDRV_PCM_STATE_DISCONNECTED ((__force snd_pcm_state_t) 8) /* hardware is disconnected */ -#define SNDRV_PCM_STATE_LAST SNDRV_PCM_STATE_DISCONNECTED - -enum { - SNDRV_PCM_MMAP_OFFSET_DATA = 0x00000000, - SNDRV_PCM_MMAP_OFFSET_STATUS_OLD = 0x80000000, - SNDRV_PCM_MMAP_OFFSET_CONTROL_OLD = 0x81000000, - SNDRV_PCM_MMAP_OFFSET_STATUS_NEW = 0x82000000, - SNDRV_PCM_MMAP_OFFSET_CONTROL_NEW = 
0x83000000, -#ifdef __SND_STRUCT_TIME64 - SNDRV_PCM_MMAP_OFFSET_STATUS = SNDRV_PCM_MMAP_OFFSET_STATUS_NEW, - SNDRV_PCM_MMAP_OFFSET_CONTROL = SNDRV_PCM_MMAP_OFFSET_CONTROL_NEW, -#else - SNDRV_PCM_MMAP_OFFSET_STATUS = SNDRV_PCM_MMAP_OFFSET_STATUS_OLD, - SNDRV_PCM_MMAP_OFFSET_CONTROL = SNDRV_PCM_MMAP_OFFSET_CONTROL_OLD, -#endif -}; - -union snd_pcm_sync_id { - unsigned char id[16]; - unsigned short id16[8]; - unsigned int id32[4]; -}; - -struct snd_pcm_info { - unsigned int device; /* RO/WR (control): device number */ - unsigned int subdevice; /* RO/WR (control): subdevice number */ - int stream; /* RO/WR (control): stream direction */ - int card; /* R: card number */ - unsigned char id[64]; /* ID (user selectable) */ - unsigned char name[80]; /* name of this device */ - unsigned char subname[32]; /* subdevice name */ - int dev_class; /* SNDRV_PCM_CLASS_* */ - int dev_subclass; /* SNDRV_PCM_SUBCLASS_* */ - unsigned int subdevices_count; - unsigned int subdevices_avail; - union snd_pcm_sync_id sync; /* hardware synchronization ID */ - unsigned char reserved[64]; /* reserved for future... 
*/ -}; - -typedef int snd_pcm_hw_param_t; -#define SNDRV_PCM_HW_PARAM_ACCESS 0 /* Access type */ -#define SNDRV_PCM_HW_PARAM_FORMAT 1 /* Format */ -#define SNDRV_PCM_HW_PARAM_SUBFORMAT 2 /* Subformat */ -#define SNDRV_PCM_HW_PARAM_FIRST_MASK SNDRV_PCM_HW_PARAM_ACCESS -#define SNDRV_PCM_HW_PARAM_LAST_MASK SNDRV_PCM_HW_PARAM_SUBFORMAT - -#define SNDRV_PCM_HW_PARAM_SAMPLE_BITS 8 /* Bits per sample */ -#define SNDRV_PCM_HW_PARAM_FRAME_BITS 9 /* Bits per frame */ -#define SNDRV_PCM_HW_PARAM_CHANNELS 10 /* Channels */ -#define SNDRV_PCM_HW_PARAM_RATE 11 /* Approx rate */ -#define SNDRV_PCM_HW_PARAM_PERIOD_TIME 12 /* Approx distance between - * interrupts in us - */ -#define SNDRV_PCM_HW_PARAM_PERIOD_SIZE 13 /* Approx frames between - * interrupts - */ -#define SNDRV_PCM_HW_PARAM_PERIOD_BYTES 14 /* Approx bytes between - * interrupts - */ -#define SNDRV_PCM_HW_PARAM_PERIODS 15 /* Approx interrupts per - * buffer - */ -#define SNDRV_PCM_HW_PARAM_BUFFER_TIME 16 /* Approx duration of buffer - * in us - */ -#define SNDRV_PCM_HW_PARAM_BUFFER_SIZE 17 /* Size of buffer in frames */ -#define SNDRV_PCM_HW_PARAM_BUFFER_BYTES 18 /* Size of buffer in bytes */ -#define SNDRV_PCM_HW_PARAM_TICK_TIME 19 /* Approx tick duration in us */ -#define SNDRV_PCM_HW_PARAM_FIRST_INTERVAL SNDRV_PCM_HW_PARAM_SAMPLE_BITS -#define SNDRV_PCM_HW_PARAM_LAST_INTERVAL SNDRV_PCM_HW_PARAM_TICK_TIME - -#define SNDRV_PCM_HW_PARAMS_NORESAMPLE (1<<0) /* avoid rate resampling */ -#define SNDRV_PCM_HW_PARAMS_EXPORT_BUFFER (1<<1) /* export buffer */ -#define SNDRV_PCM_HW_PARAMS_NO_PERIOD_WAKEUP (1<<2) /* disable period wakeups */ -#define SNDRV_PCM_HW_PARAMS_NO_DRAIN_SILENCE (1<<3) /* suppress drain with the filling - * of the silence samples - */ - -struct snd_interval { - unsigned int min, max; - unsigned int openmin:1, - openmax:1, - integer:1, - empty:1; -}; - -#define SNDRV_MASK_MAX 256 - -struct snd_mask { - __u32 bits[(SNDRV_MASK_MAX+31)/32]; -}; - -struct snd_pcm_hw_params { - unsigned int flags; - struct 
snd_mask masks[SNDRV_PCM_HW_PARAM_LAST_MASK - - SNDRV_PCM_HW_PARAM_FIRST_MASK + 1]; - struct snd_mask mres[5]; /* reserved masks */ - struct snd_interval intervals[SNDRV_PCM_HW_PARAM_LAST_INTERVAL - - SNDRV_PCM_HW_PARAM_FIRST_INTERVAL + 1]; - struct snd_interval ires[9]; /* reserved intervals */ - unsigned int rmask; /* W: requested masks */ - unsigned int cmask; /* R: changed masks */ - unsigned int info; /* R: Info flags for returned setup */ - unsigned int msbits; /* R: used most significant bits (in sample bit-width) */ - unsigned int rate_num; /* R: rate numerator */ - unsigned int rate_den; /* R: rate denominator */ - snd_pcm_uframes_t fifo_size; /* R: chip FIFO size in frames */ - unsigned char reserved[64]; /* reserved for future */ -}; - -enum { - SNDRV_PCM_TSTAMP_NONE = 0, - SNDRV_PCM_TSTAMP_ENABLE, - SNDRV_PCM_TSTAMP_LAST = SNDRV_PCM_TSTAMP_ENABLE, -}; - -struct snd_pcm_sw_params { - int tstamp_mode; /* timestamp mode */ - unsigned int period_step; - unsigned int sleep_min; /* min ticks to sleep */ - snd_pcm_uframes_t avail_min; /* min avail frames for wakeup */ - snd_pcm_uframes_t xfer_align; /* obsolete: xfer size need to be a multiple */ - snd_pcm_uframes_t start_threshold; /* min hw_avail frames for automatic start */ - /* - * The following two thresholds alleviate playback buffer underruns; when - * hw_avail drops below the threshold, the respective action is triggered: - */ - snd_pcm_uframes_t stop_threshold; /* - stop playback */ - snd_pcm_uframes_t silence_threshold; /* - pre-fill buffer with silence */ - snd_pcm_uframes_t silence_size; /* max size of silence pre-fill; when >= boundary, - * fill played area with silence immediately */ - snd_pcm_uframes_t boundary; /* pointers wrap point */ - unsigned int proto; /* protocol version */ - unsigned int tstamp_type; /* timestamp type (req. 
proto >= 2.0.12) */ - unsigned char reserved[56]; /* reserved for future */ -}; - -struct snd_pcm_channel_info { - unsigned int channel; - __kernel_off_t offset; /* mmap offset */ - unsigned int first; /* offset to first sample in bits */ - unsigned int step; /* samples distance in bits */ -}; - -enum { - /* - * first definition for backwards compatibility only, - * maps to wallclock/link time for HDAudio playback and DEFAULT/DMA time for everything else - */ - SNDRV_PCM_AUDIO_TSTAMP_TYPE_COMPAT = 0, - - /* timestamp definitions */ - SNDRV_PCM_AUDIO_TSTAMP_TYPE_DEFAULT = 1, /* DMA time, reported as per hw_ptr */ - SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK = 2, /* link time reported by sample or wallclock counter, reset on startup */ - SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK_ABSOLUTE = 3, /* link time reported by sample or wallclock counter, not reset on startup */ - SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK_ESTIMATED = 4, /* link time estimated indirectly */ - SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK_SYNCHRONIZED = 5, /* link time synchronized with system time */ - SNDRV_PCM_AUDIO_TSTAMP_TYPE_LAST = SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK_SYNCHRONIZED -}; - -#ifndef __KERNEL__ -/* explicit padding avoids incompatibility between i386 and x86-64 */ -typedef struct { unsigned char pad[sizeof(time_t) - sizeof(int)]; } __time_pad; - -struct snd_pcm_status { - snd_pcm_state_t state; /* stream state */ - __time_pad pad1; /* align to timespec */ - struct timespec trigger_tstamp; /* time when stream was started/stopped/paused */ - struct timespec tstamp; /* reference timestamp */ - snd_pcm_uframes_t appl_ptr; /* appl ptr */ - snd_pcm_uframes_t hw_ptr; /* hw ptr */ - snd_pcm_sframes_t delay; /* current delay in frames */ - snd_pcm_uframes_t avail; /* number of frames available */ - snd_pcm_uframes_t avail_max; /* max frames available on hw since last status */ - snd_pcm_uframes_t overrange; /* count of ADC (capture) overrange detections from last status */ - snd_pcm_state_t suspended_state; /* suspended stream 
state */ - __u32 audio_tstamp_data; /* needed for 64-bit alignment, used for configs/report to/from userspace */ - struct timespec audio_tstamp; /* sample counter, wall clock, PHC or on-demand sync'ed */ - struct timespec driver_tstamp; /* useful in case reference system tstamp is reported with delay */ - __u32 audio_tstamp_accuracy; /* in ns units, only valid if indicated in audio_tstamp_data */ - unsigned char reserved[52-2*sizeof(struct timespec)]; /* must be filled with zero */ -}; -#endif - -/* - * For mmap operations, we need the 64-bit layout, both for compat mode, - * and for y2038 compatibility. For 64-bit applications, the two definitions - * are identical, so we keep the traditional version. - */ -#ifdef __SND_STRUCT_TIME64 -#define __snd_pcm_mmap_status64 snd_pcm_mmap_status -#define __snd_pcm_mmap_control64 snd_pcm_mmap_control -#define __snd_pcm_sync_ptr64 snd_pcm_sync_ptr -#ifdef __KERNEL__ -#define __snd_timespec64 __kernel_timespec -#else -#define __snd_timespec64 timespec -#endif -struct __snd_timespec { - __s32 tv_sec; - __s32 tv_nsec; -}; -#else -#define __snd_pcm_mmap_status snd_pcm_mmap_status -#define __snd_pcm_mmap_control snd_pcm_mmap_control -#define __snd_pcm_sync_ptr snd_pcm_sync_ptr -#define __snd_timespec timespec -struct __snd_timespec64 { - __s64 tv_sec; - __s64 tv_nsec; -}; - -#endif - -struct __snd_pcm_mmap_status { - snd_pcm_state_t state; /* RO: state - SNDRV_PCM_STATE_XXXX */ - int pad1; /* Needed for 64 bit alignment */ - snd_pcm_uframes_t hw_ptr; /* RO: hw ptr (0...boundary-1) */ - struct __snd_timespec tstamp; /* Timestamp */ - snd_pcm_state_t suspended_state; /* RO: suspended stream state */ - struct __snd_timespec audio_tstamp; /* from sample counter or wall clock */ -}; - -struct __snd_pcm_mmap_control { - snd_pcm_uframes_t appl_ptr; /* RW: appl ptr (0...boundary-1) */ - snd_pcm_uframes_t avail_min; /* RW: min available frames for wakeup */ -}; - -#define SNDRV_PCM_SYNC_PTR_HWSYNC (1<<0) /* execute hwsync */ -#define 
SNDRV_PCM_SYNC_PTR_APPL (1<<1) /* get appl_ptr from driver (r/w op) */ -#define SNDRV_PCM_SYNC_PTR_AVAIL_MIN (1<<2) /* get avail_min from driver */ - -struct __snd_pcm_sync_ptr { - unsigned int flags; - union { - struct __snd_pcm_mmap_status status; - unsigned char reserved[64]; - } s; - union { - struct __snd_pcm_mmap_control control; - unsigned char reserved[64]; - } c; -}; - -#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __BIG_ENDIAN : defined(__BIG_ENDIAN) -typedef char __pad_before_uframe[sizeof(__u64) - sizeof(snd_pcm_uframes_t)]; -typedef char __pad_after_uframe[0]; -#endif - -#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN) -typedef char __pad_before_uframe[0]; -typedef char __pad_after_uframe[sizeof(__u64) - sizeof(snd_pcm_uframes_t)]; -#endif - -struct __snd_pcm_mmap_status64 { - snd_pcm_state_t state; /* RO: state - SNDRV_PCM_STATE_XXXX */ - __u32 pad1; /* Needed for 64 bit alignment */ - __pad_before_uframe __pad1; - snd_pcm_uframes_t hw_ptr; /* RO: hw ptr (0...boundary-1) */ - __pad_after_uframe __pad2; - struct __snd_timespec64 tstamp; /* Timestamp */ - snd_pcm_state_t suspended_state;/* RO: suspended stream state */ - __u32 pad3; /* Needed for 64 bit alignment */ - struct __snd_timespec64 audio_tstamp; /* sample counter or wall clock */ -}; - -struct __snd_pcm_mmap_control64 { - __pad_before_uframe __pad1; - snd_pcm_uframes_t appl_ptr; /* RW: appl ptr (0...boundary-1) */ - __pad_before_uframe __pad2; // This should be __pad_after_uframe, but binary - // backwards compatibility constraints prevent a fix. 
- - __pad_before_uframe __pad3; - snd_pcm_uframes_t avail_min; /* RW: min available frames for wakeup */ - __pad_after_uframe __pad4; -}; - -struct __snd_pcm_sync_ptr64 { - __u32 flags; - __u32 pad1; - union { - struct __snd_pcm_mmap_status64 status; - unsigned char reserved[64]; - } s; - union { - struct __snd_pcm_mmap_control64 control; - unsigned char reserved[64]; - } c; -}; - -struct snd_xferi { - snd_pcm_sframes_t result; - void __user *buf; - snd_pcm_uframes_t frames; -}; - -struct snd_xfern { - snd_pcm_sframes_t result; - void __user * __user *bufs; - snd_pcm_uframes_t frames; -}; - -enum { - SNDRV_PCM_TSTAMP_TYPE_GETTIMEOFDAY = 0, /* gettimeofday equivalent */ - SNDRV_PCM_TSTAMP_TYPE_MONOTONIC, /* posix_clock_monotonic equivalent */ - SNDRV_PCM_TSTAMP_TYPE_MONOTONIC_RAW, /* monotonic_raw (no NTP) */ - SNDRV_PCM_TSTAMP_TYPE_LAST = SNDRV_PCM_TSTAMP_TYPE_MONOTONIC_RAW, -}; - -/* channel positions */ -enum { - SNDRV_CHMAP_UNKNOWN = 0, - SNDRV_CHMAP_NA, /* N/A, silent */ - SNDRV_CHMAP_MONO, /* mono stream */ - /* this follows the alsa-lib mixer channel value + 3 */ - SNDRV_CHMAP_FL, /* front left */ - SNDRV_CHMAP_FR, /* front right */ - SNDRV_CHMAP_RL, /* rear left */ - SNDRV_CHMAP_RR, /* rear right */ - SNDRV_CHMAP_FC, /* front center */ - SNDRV_CHMAP_LFE, /* LFE */ - SNDRV_CHMAP_SL, /* side left */ - SNDRV_CHMAP_SR, /* side right */ - SNDRV_CHMAP_RC, /* rear center */ - /* new definitions */ - SNDRV_CHMAP_FLC, /* front left center */ - SNDRV_CHMAP_FRC, /* front right center */ - SNDRV_CHMAP_RLC, /* rear left center */ - SNDRV_CHMAP_RRC, /* rear right center */ - SNDRV_CHMAP_FLW, /* front left wide */ - SNDRV_CHMAP_FRW, /* front right wide */ - SNDRV_CHMAP_FLH, /* front left high */ - SNDRV_CHMAP_FCH, /* front center high */ - SNDRV_CHMAP_FRH, /* front right high */ - SNDRV_CHMAP_TC, /* top center */ - SNDRV_CHMAP_TFL, /* top front left */ - SNDRV_CHMAP_TFR, /* top front right */ - SNDRV_CHMAP_TFC, /* top front center */ - SNDRV_CHMAP_TRL, /* top rear left */ 
- SNDRV_CHMAP_TRR, /* top rear right */ - SNDRV_CHMAP_TRC, /* top rear center */ - /* new definitions for UAC2 */ - SNDRV_CHMAP_TFLC, /* top front left center */ - SNDRV_CHMAP_TFRC, /* top front right center */ - SNDRV_CHMAP_TSL, /* top side left */ - SNDRV_CHMAP_TSR, /* top side right */ - SNDRV_CHMAP_LLFE, /* left LFE */ - SNDRV_CHMAP_RLFE, /* right LFE */ - SNDRV_CHMAP_BC, /* bottom center */ - SNDRV_CHMAP_BLC, /* bottom left center */ - SNDRV_CHMAP_BRC, /* bottom right center */ - SNDRV_CHMAP_LAST = SNDRV_CHMAP_BRC, -}; - -#define SNDRV_CHMAP_POSITION_MASK 0xffff -#define SNDRV_CHMAP_PHASE_INVERSE (0x01 << 16) -#define SNDRV_CHMAP_DRIVER_SPEC (0x02 << 16) - -#define SNDRV_PCM_IOCTL_PVERSION _IOR('A', 0x00, int) -#define SNDRV_PCM_IOCTL_INFO _IOR('A', 0x01, struct snd_pcm_info) -#define SNDRV_PCM_IOCTL_TSTAMP _IOW('A', 0x02, int) -#define SNDRV_PCM_IOCTL_TTSTAMP _IOW('A', 0x03, int) -#define SNDRV_PCM_IOCTL_USER_PVERSION _IOW('A', 0x04, int) -#define SNDRV_PCM_IOCTL_HW_REFINE _IOWR('A', 0x10, struct snd_pcm_hw_params) -#define SNDRV_PCM_IOCTL_HW_PARAMS _IOWR('A', 0x11, struct snd_pcm_hw_params) -#define SNDRV_PCM_IOCTL_HW_FREE _IO('A', 0x12) -#define SNDRV_PCM_IOCTL_SW_PARAMS _IOWR('A', 0x13, struct snd_pcm_sw_params) -#define SNDRV_PCM_IOCTL_STATUS _IOR('A', 0x20, struct snd_pcm_status) -#define SNDRV_PCM_IOCTL_DELAY _IOR('A', 0x21, snd_pcm_sframes_t) -#define SNDRV_PCM_IOCTL_HWSYNC _IO('A', 0x22) -#define __SNDRV_PCM_IOCTL_SYNC_PTR _IOWR('A', 0x23, struct __snd_pcm_sync_ptr) -#define __SNDRV_PCM_IOCTL_SYNC_PTR64 _IOWR('A', 0x23, struct __snd_pcm_sync_ptr64) -#define SNDRV_PCM_IOCTL_SYNC_PTR _IOWR('A', 0x23, struct snd_pcm_sync_ptr) -#define SNDRV_PCM_IOCTL_STATUS_EXT _IOWR('A', 0x24, struct snd_pcm_status) -#define SNDRV_PCM_IOCTL_CHANNEL_INFO _IOR('A', 0x32, struct snd_pcm_channel_info) -#define SNDRV_PCM_IOCTL_PREPARE _IO('A', 0x40) -#define SNDRV_PCM_IOCTL_RESET _IO('A', 0x41) -#define SNDRV_PCM_IOCTL_START _IO('A', 0x42) -#define SNDRV_PCM_IOCTL_DROP 
_IO('A', 0x43) -#define SNDRV_PCM_IOCTL_DRAIN _IO('A', 0x44) -#define SNDRV_PCM_IOCTL_PAUSE _IOW('A', 0x45, int) -#define SNDRV_PCM_IOCTL_REWIND _IOW('A', 0x46, snd_pcm_uframes_t) -#define SNDRV_PCM_IOCTL_RESUME _IO('A', 0x47) -#define SNDRV_PCM_IOCTL_XRUN _IO('A', 0x48) -#define SNDRV_PCM_IOCTL_FORWARD _IOW('A', 0x49, snd_pcm_uframes_t) -#define SNDRV_PCM_IOCTL_WRITEI_FRAMES _IOW('A', 0x50, struct snd_xferi) -#define SNDRV_PCM_IOCTL_READI_FRAMES _IOR('A', 0x51, struct snd_xferi) -#define SNDRV_PCM_IOCTL_WRITEN_FRAMES _IOW('A', 0x52, struct snd_xfern) -#define SNDRV_PCM_IOCTL_READN_FRAMES _IOR('A', 0x53, struct snd_xfern) -#define SNDRV_PCM_IOCTL_LINK _IOW('A', 0x60, int) -#define SNDRV_PCM_IOCTL_UNLINK _IO('A', 0x61) - -/***************************************************************************** - * * - * MIDI v1.0 interface * - * * - *****************************************************************************/ - -/* - * Raw MIDI section - /dev/snd/midi?? - */ - -#define SNDRV_RAWMIDI_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 4) - -enum { - SNDRV_RAWMIDI_STREAM_OUTPUT = 0, - SNDRV_RAWMIDI_STREAM_INPUT, - SNDRV_RAWMIDI_STREAM_LAST = SNDRV_RAWMIDI_STREAM_INPUT, -}; - -#define SNDRV_RAWMIDI_INFO_OUTPUT 0x00000001 -#define SNDRV_RAWMIDI_INFO_INPUT 0x00000002 -#define SNDRV_RAWMIDI_INFO_DUPLEX 0x00000004 -#define SNDRV_RAWMIDI_INFO_UMP 0x00000008 - -struct snd_rawmidi_info { - unsigned int device; /* RO/WR (control): device number */ - unsigned int subdevice; /* RO/WR (control): subdevice number */ - int stream; /* WR: stream */ - int card; /* R: card number */ - unsigned int flags; /* SNDRV_RAWMIDI_INFO_XXXX */ - unsigned char id[64]; /* ID (user selectable) */ - unsigned char name[80]; /* name of device */ - unsigned char subname[32]; /* name of active or selected subdevice */ - unsigned int subdevices_count; - unsigned int subdevices_avail; - unsigned char reserved[64]; /* reserved for future use */ -}; - -#define SNDRV_RAWMIDI_MODE_FRAMING_MASK (7<<0) -#define 
SNDRV_RAWMIDI_MODE_FRAMING_SHIFT 0 -#define SNDRV_RAWMIDI_MODE_FRAMING_NONE (0<<0) -#define SNDRV_RAWMIDI_MODE_FRAMING_TSTAMP (1<<0) -#define SNDRV_RAWMIDI_MODE_CLOCK_MASK (7<<3) -#define SNDRV_RAWMIDI_MODE_CLOCK_SHIFT 3 -#define SNDRV_RAWMIDI_MODE_CLOCK_NONE (0<<3) -#define SNDRV_RAWMIDI_MODE_CLOCK_REALTIME (1<<3) -#define SNDRV_RAWMIDI_MODE_CLOCK_MONOTONIC (2<<3) -#define SNDRV_RAWMIDI_MODE_CLOCK_MONOTONIC_RAW (3<<3) - -#define SNDRV_RAWMIDI_FRAMING_DATA_LENGTH 16 - -struct snd_rawmidi_framing_tstamp { - /* For now, frame_type is always 0. Midi 2.0 is expected to add new - * types here. Applications are expected to skip unknown frame types. - */ - __u8 frame_type; - __u8 length; /* number of valid bytes in data field */ - __u8 reserved[2]; - __u32 tv_nsec; /* nanoseconds */ - __u64 tv_sec; /* seconds */ - __u8 data[SNDRV_RAWMIDI_FRAMING_DATA_LENGTH]; -} __packed; - -struct snd_rawmidi_params { - int stream; - size_t buffer_size; /* queue size in bytes */ - size_t avail_min; /* minimum avail bytes for wakeup */ - unsigned int no_active_sensing: 1; /* do not send active sensing byte in close() */ - unsigned int mode; /* For input data only, frame incoming data */ - unsigned char reserved[12]; /* reserved for future use */ -}; - -#ifndef __KERNEL__ -struct snd_rawmidi_status { - int stream; - __time_pad pad1; - struct timespec tstamp; /* Timestamp */ - size_t avail; /* available bytes */ - size_t xruns; /* count of overruns since last status (in bytes) */ - unsigned char reserved[16]; /* reserved for future use */ -}; -#endif - -/* UMP EP info flags */ -#define SNDRV_UMP_EP_INFO_STATIC_BLOCKS 0x01 - -/* UMP EP Protocol / JRTS capability bits */ -#define SNDRV_UMP_EP_INFO_PROTO_MIDI_MASK 0x0300 -#define SNDRV_UMP_EP_INFO_PROTO_MIDI1 0x0100 /* MIDI 1.0 */ -#define SNDRV_UMP_EP_INFO_PROTO_MIDI2 0x0200 /* MIDI 2.0 */ -#define SNDRV_UMP_EP_INFO_PROTO_JRTS_MASK 0x0003 -#define SNDRV_UMP_EP_INFO_PROTO_JRTS_TX 0x0001 /* JRTS Transmit */ -#define 
SNDRV_UMP_EP_INFO_PROTO_JRTS_RX 0x0002 /* JRTS Receive */ - -/* UMP Endpoint information */ -struct snd_ump_endpoint_info { - int card; /* card number */ - int device; /* device number */ - unsigned int flags; /* additional info */ - unsigned int protocol_caps; /* protocol capabilities */ - unsigned int protocol; /* current protocol */ - unsigned int num_blocks; /* # of function blocks */ - unsigned short version; /* UMP major/minor version */ - unsigned short family_id; /* MIDI device family ID */ - unsigned short model_id; /* MIDI family model ID */ - unsigned int manufacturer_id; /* MIDI manufacturer ID */ - unsigned char sw_revision[4]; /* software revision */ - unsigned short padding; - unsigned char name[128]; /* endpoint name string */ - unsigned char product_id[128]; /* unique product id string */ - unsigned char reserved[32]; -} __packed; - -/* UMP direction */ -#define SNDRV_UMP_DIR_INPUT 0x01 -#define SNDRV_UMP_DIR_OUTPUT 0x02 -#define SNDRV_UMP_DIR_BIDIRECTION 0x03 - -/* UMP block info flags */ -#define SNDRV_UMP_BLOCK_IS_MIDI1 (1U << 0) /* MIDI 1.0 port w/o restrict */ -#define SNDRV_UMP_BLOCK_IS_LOWSPEED (1U << 1) /* 31.25Kbps B/W MIDI1 port */ - -/* UMP block user-interface hint */ -#define SNDRV_UMP_BLOCK_UI_HINT_UNKNOWN 0x00 -#define SNDRV_UMP_BLOCK_UI_HINT_RECEIVER 0x01 -#define SNDRV_UMP_BLOCK_UI_HINT_SENDER 0x02 -#define SNDRV_UMP_BLOCK_UI_HINT_BOTH 0x03 - -/* UMP groups and blocks */ -#define SNDRV_UMP_MAX_GROUPS 16 -#define SNDRV_UMP_MAX_BLOCKS 32 - -/* UMP Block information */ -struct snd_ump_block_info { - int card; /* card number */ - int device; /* device number */ - unsigned char block_id; /* block ID (R/W) */ - unsigned char direction; /* UMP direction */ - unsigned char active; /* Activeness */ - unsigned char first_group; /* first group ID */ - unsigned char num_groups; /* number of groups */ - unsigned char midi_ci_version; /* MIDI-CI support version */ - unsigned char sysex8_streams; /* max number of sysex8 streams */ - unsigned char 
ui_hint; /* user interface hint */ - unsigned int flags; /* various info flags */ - unsigned char name[128]; /* block name string */ - unsigned char reserved[32]; -} __packed; - -#define SNDRV_RAWMIDI_IOCTL_PVERSION _IOR('W', 0x00, int) -#define SNDRV_RAWMIDI_IOCTL_INFO _IOR('W', 0x01, struct snd_rawmidi_info) -#define SNDRV_RAWMIDI_IOCTL_USER_PVERSION _IOW('W', 0x02, int) -#define SNDRV_RAWMIDI_IOCTL_PARAMS _IOWR('W', 0x10, struct snd_rawmidi_params) -#define SNDRV_RAWMIDI_IOCTL_STATUS _IOWR('W', 0x20, struct snd_rawmidi_status) -#define SNDRV_RAWMIDI_IOCTL_DROP _IOW('W', 0x30, int) -#define SNDRV_RAWMIDI_IOCTL_DRAIN _IOW('W', 0x31, int) -/* Additional ioctls for UMP rawmidi devices */ -#define SNDRV_UMP_IOCTL_ENDPOINT_INFO _IOR('W', 0x40, struct snd_ump_endpoint_info) -#define SNDRV_UMP_IOCTL_BLOCK_INFO _IOR('W', 0x41, struct snd_ump_block_info) - -/* - * Timer section - /dev/snd/timer - */ - -#define SNDRV_TIMER_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 7) - -enum { - SNDRV_TIMER_CLASS_NONE = -1, - SNDRV_TIMER_CLASS_SLAVE = 0, - SNDRV_TIMER_CLASS_GLOBAL, - SNDRV_TIMER_CLASS_CARD, - SNDRV_TIMER_CLASS_PCM, - SNDRV_TIMER_CLASS_LAST = SNDRV_TIMER_CLASS_PCM, -}; - -/* slave timer classes */ -enum { - SNDRV_TIMER_SCLASS_NONE = 0, - SNDRV_TIMER_SCLASS_APPLICATION, - SNDRV_TIMER_SCLASS_SEQUENCER, /* alias */ - SNDRV_TIMER_SCLASS_OSS_SEQUENCER, /* alias */ - SNDRV_TIMER_SCLASS_LAST = SNDRV_TIMER_SCLASS_OSS_SEQUENCER, -}; - -/* global timers (device member) */ -#define SNDRV_TIMER_GLOBAL_SYSTEM 0 -#define SNDRV_TIMER_GLOBAL_RTC 1 /* unused */ -#define SNDRV_TIMER_GLOBAL_HPET 2 -#define SNDRV_TIMER_GLOBAL_HRTIMER 3 - -/* info flags */ -#define SNDRV_TIMER_FLG_SLAVE (1<<0) /* cannot be controlled */ - -struct snd_timer_id { - int dev_class; - int dev_sclass; - int card; - int device; - int subdevice; -}; - -struct snd_timer_ginfo { - struct snd_timer_id tid; /* requested timer ID */ - unsigned int flags; /* timer flags - SNDRV_TIMER_FLG_* */ - int card; /* card number */ - 
unsigned char id[64]; /* timer identification */ - unsigned char name[80]; /* timer name */ - unsigned long reserved0; /* reserved for future use */ - unsigned long resolution; /* average period resolution in ns */ - unsigned long resolution_min; /* minimal period resolution in ns */ - unsigned long resolution_max; /* maximal period resolution in ns */ - unsigned int clients; /* active timer clients */ - unsigned char reserved[32]; -}; - -struct snd_timer_gparams { - struct snd_timer_id tid; /* requested timer ID */ - unsigned long period_num; /* requested precise period duration (in seconds) - numerator */ - unsigned long period_den; /* requested precise period duration (in seconds) - denominator */ - unsigned char reserved[32]; -}; - -struct snd_timer_gstatus { - struct snd_timer_id tid; /* requested timer ID */ - unsigned long resolution; /* current period resolution in ns */ - unsigned long resolution_num; /* precise current period resolution (in seconds) - numerator */ - unsigned long resolution_den; /* precise current period resolution (in seconds) - denominator */ - unsigned char reserved[32]; -}; - -struct snd_timer_select { - struct snd_timer_id id; /* bind to timer ID */ - unsigned char reserved[32]; /* reserved */ -}; - -struct snd_timer_info { - unsigned int flags; /* timer flags - SNDRV_TIMER_FLG_* */ - int card; /* card number */ - unsigned char id[64]; /* timer identificator */ - unsigned char name[80]; /* timer name */ - unsigned long reserved0; /* reserved for future use */ - unsigned long resolution; /* average period resolution in ns */ - unsigned char reserved[64]; /* reserved */ -}; - -#define SNDRV_TIMER_PSFLG_AUTO (1<<0) /* auto start, otherwise one-shot */ -#define SNDRV_TIMER_PSFLG_EXCLUSIVE (1<<1) /* exclusive use, precise start/stop/pause/continue */ -#define SNDRV_TIMER_PSFLG_EARLY_EVENT (1<<2) /* write early event to the poll queue */ - -struct snd_timer_params { - unsigned int flags; /* flags - SNDRV_TIMER_PSFLG_* */ - unsigned int 
ticks; /* requested resolution in ticks */ - unsigned int queue_size; /* total size of queue (32-1024) */ - unsigned int reserved0; /* reserved, was: failure locations */ - unsigned int filter; /* event filter (bitmask of SNDRV_TIMER_EVENT_*) */ - unsigned char reserved[60]; /* reserved */ -}; - -#ifndef __KERNEL__ -struct snd_timer_status { - struct timespec tstamp; /* Timestamp - last update */ - unsigned int resolution; /* current period resolution in ns */ - unsigned int lost; /* counter of master tick lost */ - unsigned int overrun; /* count of read queue overruns */ - unsigned int queue; /* used queue size */ - unsigned char reserved[64]; /* reserved */ -}; -#endif - -#define SNDRV_TIMER_IOCTL_PVERSION _IOR('T', 0x00, int) -#define SNDRV_TIMER_IOCTL_NEXT_DEVICE _IOWR('T', 0x01, struct snd_timer_id) -#define SNDRV_TIMER_IOCTL_TREAD_OLD _IOW('T', 0x02, int) -#define SNDRV_TIMER_IOCTL_GINFO _IOWR('T', 0x03, struct snd_timer_ginfo) -#define SNDRV_TIMER_IOCTL_GPARAMS _IOW('T', 0x04, struct snd_timer_gparams) -#define SNDRV_TIMER_IOCTL_GSTATUS _IOWR('T', 0x05, struct snd_timer_gstatus) -#define SNDRV_TIMER_IOCTL_SELECT _IOW('T', 0x10, struct snd_timer_select) -#define SNDRV_TIMER_IOCTL_INFO _IOR('T', 0x11, struct snd_timer_info) -#define SNDRV_TIMER_IOCTL_PARAMS _IOW('T', 0x12, struct snd_timer_params) -#define SNDRV_TIMER_IOCTL_STATUS _IOR('T', 0x14, struct snd_timer_status) -/* The following four ioctls are changed since 1.0.9 due to confliction */ -#define SNDRV_TIMER_IOCTL_START _IO('T', 0xa0) -#define SNDRV_TIMER_IOCTL_STOP _IO('T', 0xa1) -#define SNDRV_TIMER_IOCTL_CONTINUE _IO('T', 0xa2) -#define SNDRV_TIMER_IOCTL_PAUSE _IO('T', 0xa3) -#define SNDRV_TIMER_IOCTL_TREAD64 _IOW('T', 0xa4, int) - -#if __BITS_PER_LONG == 64 -#define SNDRV_TIMER_IOCTL_TREAD SNDRV_TIMER_IOCTL_TREAD_OLD -#else -#define SNDRV_TIMER_IOCTL_TREAD ((sizeof(__kernel_long_t) >= sizeof(time_t)) ? 
\ - SNDRV_TIMER_IOCTL_TREAD_OLD : \ - SNDRV_TIMER_IOCTL_TREAD64) -#endif - -struct snd_timer_read { - unsigned int resolution; - unsigned int ticks; -}; - -enum { - SNDRV_TIMER_EVENT_RESOLUTION = 0, /* val = resolution in ns */ - SNDRV_TIMER_EVENT_TICK, /* val = ticks */ - SNDRV_TIMER_EVENT_START, /* val = resolution in ns */ - SNDRV_TIMER_EVENT_STOP, /* val = 0 */ - SNDRV_TIMER_EVENT_CONTINUE, /* val = resolution in ns */ - SNDRV_TIMER_EVENT_PAUSE, /* val = 0 */ - SNDRV_TIMER_EVENT_EARLY, /* val = 0, early event */ - SNDRV_TIMER_EVENT_SUSPEND, /* val = 0 */ - SNDRV_TIMER_EVENT_RESUME, /* val = resolution in ns */ - /* master timer events for slave timer instances */ - SNDRV_TIMER_EVENT_MSTART = SNDRV_TIMER_EVENT_START + 10, - SNDRV_TIMER_EVENT_MSTOP = SNDRV_TIMER_EVENT_STOP + 10, - SNDRV_TIMER_EVENT_MCONTINUE = SNDRV_TIMER_EVENT_CONTINUE + 10, - SNDRV_TIMER_EVENT_MPAUSE = SNDRV_TIMER_EVENT_PAUSE + 10, - SNDRV_TIMER_EVENT_MSUSPEND = SNDRV_TIMER_EVENT_SUSPEND + 10, - SNDRV_TIMER_EVENT_MRESUME = SNDRV_TIMER_EVENT_RESUME + 10, -}; - -#ifndef __KERNEL__ -struct snd_timer_tread { - int event; - __time_pad pad1; - struct timespec tstamp; - unsigned int val; - __time_pad pad2; -}; -#endif - -/**************************************************************************** - * * - * Section for driver control interface - /dev/snd/control? 
* - * * - ****************************************************************************/ - -#define SNDRV_CTL_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 9) - -struct snd_ctl_card_info { - int card; /* card number */ - int pad; /* reserved for future (was type) */ - unsigned char id[16]; /* ID of card (user selectable) */ - unsigned char driver[16]; /* Driver name */ - unsigned char name[32]; /* Short name of soundcard */ - unsigned char longname[80]; /* name + info text about soundcard */ - unsigned char reserved_[16]; /* reserved for future (was ID of mixer) */ - unsigned char mixername[80]; /* visual mixer identification */ - unsigned char components[128]; /* card components / fine identification, delimited with one space (AC97 etc..) */ -}; - -typedef int __bitwise snd_ctl_elem_type_t; -#define SNDRV_CTL_ELEM_TYPE_NONE ((__force snd_ctl_elem_type_t) 0) /* invalid */ -#define SNDRV_CTL_ELEM_TYPE_BOOLEAN ((__force snd_ctl_elem_type_t) 1) /* boolean type */ -#define SNDRV_CTL_ELEM_TYPE_INTEGER ((__force snd_ctl_elem_type_t) 2) /* integer type */ -#define SNDRV_CTL_ELEM_TYPE_ENUMERATED ((__force snd_ctl_elem_type_t) 3) /* enumerated type */ -#define SNDRV_CTL_ELEM_TYPE_BYTES ((__force snd_ctl_elem_type_t) 4) /* byte array */ -#define SNDRV_CTL_ELEM_TYPE_IEC958 ((__force snd_ctl_elem_type_t) 5) /* IEC958 (S/PDIF) setup */ -#define SNDRV_CTL_ELEM_TYPE_INTEGER64 ((__force snd_ctl_elem_type_t) 6) /* 64-bit integer type */ -#define SNDRV_CTL_ELEM_TYPE_LAST SNDRV_CTL_ELEM_TYPE_INTEGER64 - -typedef int __bitwise snd_ctl_elem_iface_t; -#define SNDRV_CTL_ELEM_IFACE_CARD ((__force snd_ctl_elem_iface_t) 0) /* global control */ -#define SNDRV_CTL_ELEM_IFACE_HWDEP ((__force snd_ctl_elem_iface_t) 1) /* hardware dependent device */ -#define SNDRV_CTL_ELEM_IFACE_MIXER ((__force snd_ctl_elem_iface_t) 2) /* virtual mixer device */ -#define SNDRV_CTL_ELEM_IFACE_PCM ((__force snd_ctl_elem_iface_t) 3) /* PCM device */ -#define SNDRV_CTL_ELEM_IFACE_RAWMIDI ((__force snd_ctl_elem_iface_t) 4) /* 
RawMidi device */ -#define SNDRV_CTL_ELEM_IFACE_TIMER ((__force snd_ctl_elem_iface_t) 5) /* timer device */ -#define SNDRV_CTL_ELEM_IFACE_SEQUENCER ((__force snd_ctl_elem_iface_t) 6) /* sequencer client */ -#define SNDRV_CTL_ELEM_IFACE_LAST SNDRV_CTL_ELEM_IFACE_SEQUENCER - -#define SNDRV_CTL_ELEM_ACCESS_READ (1<<0) -#define SNDRV_CTL_ELEM_ACCESS_WRITE (1<<1) -#define SNDRV_CTL_ELEM_ACCESS_READWRITE (SNDRV_CTL_ELEM_ACCESS_READ|SNDRV_CTL_ELEM_ACCESS_WRITE) -#define SNDRV_CTL_ELEM_ACCESS_VOLATILE (1<<2) /* control value may be changed without a notification */ -/* (1 << 3) is unused. */ -#define SNDRV_CTL_ELEM_ACCESS_TLV_READ (1<<4) /* TLV read is possible */ -#define SNDRV_CTL_ELEM_ACCESS_TLV_WRITE (1<<5) /* TLV write is possible */ -#define SNDRV_CTL_ELEM_ACCESS_TLV_READWRITE (SNDRV_CTL_ELEM_ACCESS_TLV_READ|SNDRV_CTL_ELEM_ACCESS_TLV_WRITE) -#define SNDRV_CTL_ELEM_ACCESS_TLV_COMMAND (1<<6) /* TLV command is possible */ -#define SNDRV_CTL_ELEM_ACCESS_INACTIVE (1<<8) /* control does actually nothing, but may be updated */ -#define SNDRV_CTL_ELEM_ACCESS_LOCK (1<<9) /* write lock */ -#define SNDRV_CTL_ELEM_ACCESS_OWNER (1<<10) /* write lock owner */ -#define SNDRV_CTL_ELEM_ACCESS_TLV_CALLBACK (1<<28) /* kernel use a TLV callback */ -#define SNDRV_CTL_ELEM_ACCESS_USER (1<<29) /* user space element */ -/* bits 30 and 31 are obsoleted (for indirect access) */ - -/* for further details see the ACPI and PCI power management specification */ -#define SNDRV_CTL_POWER_D0 0x0000 /* full On */ -#define SNDRV_CTL_POWER_D1 0x0100 /* partial On */ -#define SNDRV_CTL_POWER_D2 0x0200 /* partial On */ -#define SNDRV_CTL_POWER_D3 0x0300 /* Off */ -#define SNDRV_CTL_POWER_D3hot (SNDRV_CTL_POWER_D3|0x0000) /* Off, with power */ -#define SNDRV_CTL_POWER_D3cold (SNDRV_CTL_POWER_D3|0x0001) /* Off, without power */ - -#define SNDRV_CTL_ELEM_ID_NAME_MAXLEN 44 - -struct snd_ctl_elem_id { - unsigned int numid; /* numeric identifier, zero = invalid */ - snd_ctl_elem_iface_t iface; /* interface 
identifier */ - unsigned int device; /* device/client number */ - unsigned int subdevice; /* subdevice (substream) number */ - unsigned char name[SNDRV_CTL_ELEM_ID_NAME_MAXLEN]; /* ASCII name of item */ - unsigned int index; /* index of item */ -}; - -struct snd_ctl_elem_list { - unsigned int offset; /* W: first element ID to get */ - unsigned int space; /* W: count of element IDs to get */ - unsigned int used; /* R: count of element IDs set */ - unsigned int count; /* R: count of all elements */ - struct snd_ctl_elem_id __user *pids; /* R: IDs */ - unsigned char reserved[50]; -}; - -struct snd_ctl_elem_info { - struct snd_ctl_elem_id id; /* W: element ID */ - snd_ctl_elem_type_t type; /* R: value type - SNDRV_CTL_ELEM_TYPE_* */ - unsigned int access; /* R: value access (bitmask) - SNDRV_CTL_ELEM_ACCESS_* */ - unsigned int count; /* count of values */ - __kernel_pid_t owner; /* owner's PID of this control */ - union { - struct { - long min; /* R: minimum value */ - long max; /* R: maximum value */ - long step; /* R: step (0 variable) */ - } integer; - struct { - long long min; /* R: minimum value */ - long long max; /* R: maximum value */ - long long step; /* R: step (0 variable) */ - } integer64; - struct { - unsigned int items; /* R: number of items */ - unsigned int item; /* W: item number */ - char name[64]; /* R: value name */ - __u64 names_ptr; /* W: names list (ELEM_ADD only) */ - unsigned int names_length; - } enumerated; - unsigned char reserved[128]; - } value; - unsigned char reserved[64]; -}; - -struct snd_ctl_elem_value { - struct snd_ctl_elem_id id; /* W: element ID */ - unsigned int indirect: 1; /* W: indirect access - obsoleted */ - union { - union { - long value[128]; - long *value_ptr; /* obsoleted */ - } integer; - union { - long long value[64]; - long long *value_ptr; /* obsoleted */ - } integer64; - union { - unsigned int item[128]; - unsigned int *item_ptr; /* obsoleted */ - } enumerated; - union { - unsigned char data[512]; - unsigned char 
*data_ptr; /* obsoleted */ - } bytes; - struct snd_aes_iec958 iec958; - } value; /* RO */ - unsigned char reserved[128]; -}; - -struct snd_ctl_tlv { - unsigned int numid; /* control element numeric identification */ - unsigned int length; /* in bytes aligned to 4 */ - unsigned int tlv[]; /* first TLV */ -}; - -#define SNDRV_CTL_IOCTL_PVERSION _IOR('U', 0x00, int) -#define SNDRV_CTL_IOCTL_CARD_INFO _IOR('U', 0x01, struct snd_ctl_card_info) -#define SNDRV_CTL_IOCTL_ELEM_LIST _IOWR('U', 0x10, struct snd_ctl_elem_list) -#define SNDRV_CTL_IOCTL_ELEM_INFO _IOWR('U', 0x11, struct snd_ctl_elem_info) -#define SNDRV_CTL_IOCTL_ELEM_READ _IOWR('U', 0x12, struct snd_ctl_elem_value) -#define SNDRV_CTL_IOCTL_ELEM_WRITE _IOWR('U', 0x13, struct snd_ctl_elem_value) -#define SNDRV_CTL_IOCTL_ELEM_LOCK _IOW('U', 0x14, struct snd_ctl_elem_id) -#define SNDRV_CTL_IOCTL_ELEM_UNLOCK _IOW('U', 0x15, struct snd_ctl_elem_id) -#define SNDRV_CTL_IOCTL_SUBSCRIBE_EVENTS _IOWR('U', 0x16, int) -#define SNDRV_CTL_IOCTL_ELEM_ADD _IOWR('U', 0x17, struct snd_ctl_elem_info) -#define SNDRV_CTL_IOCTL_ELEM_REPLACE _IOWR('U', 0x18, struct snd_ctl_elem_info) -#define SNDRV_CTL_IOCTL_ELEM_REMOVE _IOWR('U', 0x19, struct snd_ctl_elem_id) -#define SNDRV_CTL_IOCTL_TLV_READ _IOWR('U', 0x1a, struct snd_ctl_tlv) -#define SNDRV_CTL_IOCTL_TLV_WRITE _IOWR('U', 0x1b, struct snd_ctl_tlv) -#define SNDRV_CTL_IOCTL_TLV_COMMAND _IOWR('U', 0x1c, struct snd_ctl_tlv) -#define SNDRV_CTL_IOCTL_HWDEP_NEXT_DEVICE _IOWR('U', 0x20, int) -#define SNDRV_CTL_IOCTL_HWDEP_INFO _IOR('U', 0x21, struct snd_hwdep_info) -#define SNDRV_CTL_IOCTL_PCM_NEXT_DEVICE _IOR('U', 0x30, int) -#define SNDRV_CTL_IOCTL_PCM_INFO _IOWR('U', 0x31, struct snd_pcm_info) -#define SNDRV_CTL_IOCTL_PCM_PREFER_SUBDEVICE _IOW('U', 0x32, int) -#define SNDRV_CTL_IOCTL_RAWMIDI_NEXT_DEVICE _IOWR('U', 0x40, int) -#define SNDRV_CTL_IOCTL_RAWMIDI_INFO _IOWR('U', 0x41, struct snd_rawmidi_info) -#define SNDRV_CTL_IOCTL_RAWMIDI_PREFER_SUBDEVICE _IOW('U', 0x42, int) -#define 
SNDRV_CTL_IOCTL_UMP_NEXT_DEVICE _IOWR('U', 0x43, int) -#define SNDRV_CTL_IOCTL_UMP_ENDPOINT_INFO _IOWR('U', 0x44, struct snd_ump_endpoint_info) -#define SNDRV_CTL_IOCTL_UMP_BLOCK_INFO _IOWR('U', 0x45, struct snd_ump_block_info) -#define SNDRV_CTL_IOCTL_POWER _IOWR('U', 0xd0, int) -#define SNDRV_CTL_IOCTL_POWER_STATE _IOR('U', 0xd1, int) - -/* - * Read interface. - */ - -enum sndrv_ctl_event_type { - SNDRV_CTL_EVENT_ELEM = 0, - SNDRV_CTL_EVENT_LAST = SNDRV_CTL_EVENT_ELEM, -}; - -#define SNDRV_CTL_EVENT_MASK_VALUE (1<<0) /* element value was changed */ -#define SNDRV_CTL_EVENT_MASK_INFO (1<<1) /* element info was changed */ -#define SNDRV_CTL_EVENT_MASK_ADD (1<<2) /* element was added */ -#define SNDRV_CTL_EVENT_MASK_TLV (1<<3) /* element TLV tree was changed */ -#define SNDRV_CTL_EVENT_MASK_REMOVE (~0U) /* element was removed */ - -struct snd_ctl_event { - int type; /* event type - SNDRV_CTL_EVENT_* */ - union { - struct { - unsigned int mask; - struct snd_ctl_elem_id id; - } elem; - unsigned char data8[60]; - } data; -}; - -/* - * Control names - */ - -#define SNDRV_CTL_NAME_NONE "" -#define SNDRV_CTL_NAME_PLAYBACK "Playback " -#define SNDRV_CTL_NAME_CAPTURE "Capture " - -#define SNDRV_CTL_NAME_IEC958_NONE "" -#define SNDRV_CTL_NAME_IEC958_SWITCH "Switch" -#define SNDRV_CTL_NAME_IEC958_VOLUME "Volume" -#define SNDRV_CTL_NAME_IEC958_DEFAULT "Default" -#define SNDRV_CTL_NAME_IEC958_MASK "Mask" -#define SNDRV_CTL_NAME_IEC958_CON_MASK "Con Mask" -#define SNDRV_CTL_NAME_IEC958_PRO_MASK "Pro Mask" -#define SNDRV_CTL_NAME_IEC958_PCM_STREAM "PCM Stream" -#define SNDRV_CTL_NAME_IEC958(expl,direction,what) "IEC958 " expl SNDRV_CTL_NAME_##direction SNDRV_CTL_NAME_IEC958_##what - -#endif /* _UAPI__SOUND_ASOUND_H */ diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 145b8bd1d5..2a4c71501a 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -766,6 +766,7 @@ int bpf_link_create(int prog_fd, int target_fd, return libbpf_err(-EINVAL); break; case 
BPF_TRACE_KPROBE_MULTI: + case BPF_TRACE_KPROBE_SESSION: attr.link_create.kprobe_multi.flags = OPTS_GET(opts, kprobe_multi.flags, 0); attr.link_create.kprobe_multi.cnt = OPTS_GET(opts, kprobe_multi.cnt, 0); attr.link_create.kprobe_multi.syms = ptr_to_u64(OPTS_GET(opts, kprobe_multi.syms, 0)); @@ -785,6 +786,7 @@ int bpf_link_create(int prog_fd, int target_fd, if (!OPTS_ZEROED(opts, uprobe_multi)) return libbpf_err(-EINVAL); break; + case BPF_TRACE_RAW_TP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: @@ -1173,20 +1175,31 @@ int bpf_link_get_info_by_fd(int link_fd, struct bpf_link_info *info, __u32 *info return bpf_obj_get_info_by_fd(link_fd, info, info_len); } -int bpf_raw_tracepoint_open(const char *name, int prog_fd) +int bpf_raw_tracepoint_open_opts(int prog_fd, struct bpf_raw_tp_opts *opts) { const size_t attr_sz = offsetofend(union bpf_attr, raw_tracepoint); union bpf_attr attr; int fd; + if (!OPTS_VALID(opts, bpf_raw_tp_opts)) + return libbpf_err(-EINVAL); + memset(&attr, 0, attr_sz); - attr.raw_tracepoint.name = ptr_to_u64(name); attr.raw_tracepoint.prog_fd = prog_fd; + attr.raw_tracepoint.name = ptr_to_u64(OPTS_GET(opts, tp_name, NULL)); + attr.raw_tracepoint.cookie = OPTS_GET(opts, cookie, 0); fd = sys_bpf_fd(BPF_RAW_TRACEPOINT_OPEN, &attr, attr_sz); return libbpf_err_errno(fd); } +int bpf_raw_tracepoint_open(const char *name, int prog_fd) +{ + LIBBPF_OPTS(bpf_raw_tp_opts, opts, .tp_name = name); + + return bpf_raw_tracepoint_open_opts(prog_fd, &opts); +} + int bpf_btf_load(const void *btf_data, size_t btf_size, struct bpf_btf_load_opts *opts) { const size_t attr_sz = offsetofend(union bpf_attr, btf_token_fd); diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index df0db2f0cd..972e17ec0c 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -617,6 +617,15 @@ LIBBPF_API int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt); +struct 
bpf_raw_tp_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + const char *tp_name; + __u64 cookie; + size_t :0; +}; +#define bpf_raw_tp_opts__last_field cookie + +LIBBPF_API int bpf_raw_tracepoint_open_opts(int prog_fd, struct bpf_raw_tp_opts *opts); LIBBPF_API int bpf_raw_tracepoint_open(const char *name, int prog_fd); LIBBPF_API int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type, diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h index 670726353a..c0e13cdf96 100644 --- a/tools/lib/bpf/bpf_core_read.h +++ b/tools/lib/bpf/bpf_core_read.h @@ -2,7 +2,7 @@ #ifndef __BPF_CORE_READ_H__ #define __BPF_CORE_READ_H__ -#include +#include "bpf_helpers.h" /* * enum bpf_field_info_kind is passed as a second argument into diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index cd17f6d079..305c62817d 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -137,7 +137,8 @@ /* * Helper function to perform a tail call with a constant/immediate map slot. 
*/ -#if __clang_major__ >= 8 && defined(__bpf__) +#if (defined(__clang__) && __clang_major__ >= 8) || (!defined(__clang__) && __GNUC__ > 12) +#if defined(__bpf__) static __always_inline void bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) { @@ -165,6 +166,7 @@ bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) : "r0", "r1", "r2", "r3", "r4", "r5"); } #endif +#endif enum libbpf_pin_type { LIBBPF_PIN_NONE, @@ -184,10 +186,21 @@ enum libbpf_tristate { #define __kptr __attribute__((btf_type_tag("kptr"))) #define __percpu_kptr __attribute__((btf_type_tag("percpu_kptr"))) -#define bpf_ksym_exists(sym) ({ \ - _Static_assert(!__builtin_constant_p(!!sym), #sym " should be marked as __weak"); \ - !!sym; \ +#if defined (__clang__) +#define bpf_ksym_exists(sym) ({ \ + _Static_assert(!__builtin_constant_p(!!sym), \ + #sym " should be marked as __weak"); \ + !!sym; \ +}) +#elif __GNUC__ > 8 +#define bpf_ksym_exists(sym) ({ \ + _Static_assert(__builtin_has_attribute (*sym, __weak__), \ + #sym " should be marked as __weak"); \ + !!sym; \ }) +#else +#define bpf_ksym_exists(sym) !!sym +#endif #define __arg_ctx __attribute__((btf_decl_tag("arg:ctx"))) #define __arg_nonnull __attribute((btf_decl_tag("arg:nonnull"))) diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index 1c13f8e888..9314fa95f0 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -633,18 +633,18 @@ struct pt_regs; #endif #define ___bpf_ctx_cast0() ctx -#define ___bpf_ctx_cast1(x) ___bpf_ctx_cast0(), (void *)ctx[0] -#define ___bpf_ctx_cast2(x, args...) ___bpf_ctx_cast1(args), (void *)ctx[1] -#define ___bpf_ctx_cast3(x, args...) ___bpf_ctx_cast2(args), (void *)ctx[2] -#define ___bpf_ctx_cast4(x, args...) ___bpf_ctx_cast3(args), (void *)ctx[3] -#define ___bpf_ctx_cast5(x, args...) ___bpf_ctx_cast4(args), (void *)ctx[4] -#define ___bpf_ctx_cast6(x, args...) ___bpf_ctx_cast5(args), (void *)ctx[5] -#define ___bpf_ctx_cast7(x, args...) 
___bpf_ctx_cast6(args), (void *)ctx[6] -#define ___bpf_ctx_cast8(x, args...) ___bpf_ctx_cast7(args), (void *)ctx[7] -#define ___bpf_ctx_cast9(x, args...) ___bpf_ctx_cast8(args), (void *)ctx[8] -#define ___bpf_ctx_cast10(x, args...) ___bpf_ctx_cast9(args), (void *)ctx[9] -#define ___bpf_ctx_cast11(x, args...) ___bpf_ctx_cast10(args), (void *)ctx[10] -#define ___bpf_ctx_cast12(x, args...) ___bpf_ctx_cast11(args), (void *)ctx[11] +#define ___bpf_ctx_cast1(x) ___bpf_ctx_cast0(), ctx[0] +#define ___bpf_ctx_cast2(x, args...) ___bpf_ctx_cast1(args), ctx[1] +#define ___bpf_ctx_cast3(x, args...) ___bpf_ctx_cast2(args), ctx[2] +#define ___bpf_ctx_cast4(x, args...) ___bpf_ctx_cast3(args), ctx[3] +#define ___bpf_ctx_cast5(x, args...) ___bpf_ctx_cast4(args), ctx[4] +#define ___bpf_ctx_cast6(x, args...) ___bpf_ctx_cast5(args), ctx[5] +#define ___bpf_ctx_cast7(x, args...) ___bpf_ctx_cast6(args), ctx[6] +#define ___bpf_ctx_cast8(x, args...) ___bpf_ctx_cast7(args), ctx[7] +#define ___bpf_ctx_cast9(x, args...) ___bpf_ctx_cast8(args), ctx[8] +#define ___bpf_ctx_cast10(x, args...) ___bpf_ctx_cast9(args), ctx[9] +#define ___bpf_ctx_cast11(x, args...) ___bpf_ctx_cast10(args), ctx[10] +#define ___bpf_ctx_cast12(x, args...) ___bpf_ctx_cast11(args), ctx[11] #define ___bpf_ctx_cast(args...) ___bpf_apply(___bpf_ctx_cast, ___bpf_narg(args))(args) /* @@ -786,14 +786,14 @@ ____##name(unsigned long long *ctx ___bpf_ctx_decl(args)) struct pt_regs; #define ___bpf_kprobe_args0() ctx -#define ___bpf_kprobe_args1(x) ___bpf_kprobe_args0(), (void *)PT_REGS_PARM1(ctx) -#define ___bpf_kprobe_args2(x, args...) ___bpf_kprobe_args1(args), (void *)PT_REGS_PARM2(ctx) -#define ___bpf_kprobe_args3(x, args...) ___bpf_kprobe_args2(args), (void *)PT_REGS_PARM3(ctx) -#define ___bpf_kprobe_args4(x, args...) ___bpf_kprobe_args3(args), (void *)PT_REGS_PARM4(ctx) -#define ___bpf_kprobe_args5(x, args...) ___bpf_kprobe_args4(args), (void *)PT_REGS_PARM5(ctx) -#define ___bpf_kprobe_args6(x, args...) 
___bpf_kprobe_args5(args), (void *)PT_REGS_PARM6(ctx) -#define ___bpf_kprobe_args7(x, args...) ___bpf_kprobe_args6(args), (void *)PT_REGS_PARM7(ctx) -#define ___bpf_kprobe_args8(x, args...) ___bpf_kprobe_args7(args), (void *)PT_REGS_PARM8(ctx) +#define ___bpf_kprobe_args1(x) ___bpf_kprobe_args0(), (unsigned long long)PT_REGS_PARM1(ctx) +#define ___bpf_kprobe_args2(x, args...) ___bpf_kprobe_args1(args), (unsigned long long)PT_REGS_PARM2(ctx) +#define ___bpf_kprobe_args3(x, args...) ___bpf_kprobe_args2(args), (unsigned long long)PT_REGS_PARM3(ctx) +#define ___bpf_kprobe_args4(x, args...) ___bpf_kprobe_args3(args), (unsigned long long)PT_REGS_PARM4(ctx) +#define ___bpf_kprobe_args5(x, args...) ___bpf_kprobe_args4(args), (unsigned long long)PT_REGS_PARM5(ctx) +#define ___bpf_kprobe_args6(x, args...) ___bpf_kprobe_args5(args), (unsigned long long)PT_REGS_PARM6(ctx) +#define ___bpf_kprobe_args7(x, args...) ___bpf_kprobe_args6(args), (unsigned long long)PT_REGS_PARM7(ctx) +#define ___bpf_kprobe_args8(x, args...) ___bpf_kprobe_args7(args), (unsigned long long)PT_REGS_PARM8(ctx) #define ___bpf_kprobe_args(args...) ___bpf_apply(___bpf_kprobe_args, ___bpf_narg(args))(args) /* @@ -821,7 +821,7 @@ static __always_inline typeof(name(0)) \ ____##name(struct pt_regs *ctx, ##args) #define ___bpf_kretprobe_args0() ctx -#define ___bpf_kretprobe_args1(x) ___bpf_kretprobe_args0(), (void *)PT_REGS_RC(ctx) +#define ___bpf_kretprobe_args1(x) ___bpf_kretprobe_args0(), (unsigned long long)PT_REGS_RC(ctx) #define ___bpf_kretprobe_args(args...) ___bpf_apply(___bpf_kretprobe_args, ___bpf_narg(args))(args) /* @@ -845,24 +845,24 @@ static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) /* If kernel has CONFIG_ARCH_HAS_SYSCALL_WRAPPER, read pt_regs directly */ #define ___bpf_syscall_args0() ctx -#define ___bpf_syscall_args1(x) ___bpf_syscall_args0(), (void *)PT_REGS_PARM1_SYSCALL(regs) -#define ___bpf_syscall_args2(x, args...) 
___bpf_syscall_args1(args), (void *)PT_REGS_PARM2_SYSCALL(regs) -#define ___bpf_syscall_args3(x, args...) ___bpf_syscall_args2(args), (void *)PT_REGS_PARM3_SYSCALL(regs) -#define ___bpf_syscall_args4(x, args...) ___bpf_syscall_args3(args), (void *)PT_REGS_PARM4_SYSCALL(regs) -#define ___bpf_syscall_args5(x, args...) ___bpf_syscall_args4(args), (void *)PT_REGS_PARM5_SYSCALL(regs) -#define ___bpf_syscall_args6(x, args...) ___bpf_syscall_args5(args), (void *)PT_REGS_PARM6_SYSCALL(regs) -#define ___bpf_syscall_args7(x, args...) ___bpf_syscall_args6(args), (void *)PT_REGS_PARM7_SYSCALL(regs) +#define ___bpf_syscall_args1(x) ___bpf_syscall_args0(), (unsigned long long)PT_REGS_PARM1_SYSCALL(regs) +#define ___bpf_syscall_args2(x, args...) ___bpf_syscall_args1(args), (unsigned long long)PT_REGS_PARM2_SYSCALL(regs) +#define ___bpf_syscall_args3(x, args...) ___bpf_syscall_args2(args), (unsigned long long)PT_REGS_PARM3_SYSCALL(regs) +#define ___bpf_syscall_args4(x, args...) ___bpf_syscall_args3(args), (unsigned long long)PT_REGS_PARM4_SYSCALL(regs) +#define ___bpf_syscall_args5(x, args...) ___bpf_syscall_args4(args), (unsigned long long)PT_REGS_PARM5_SYSCALL(regs) +#define ___bpf_syscall_args6(x, args...) ___bpf_syscall_args5(args), (unsigned long long)PT_REGS_PARM6_SYSCALL(regs) +#define ___bpf_syscall_args7(x, args...) ___bpf_syscall_args6(args), (unsigned long long)PT_REGS_PARM7_SYSCALL(regs) #define ___bpf_syscall_args(args...) ___bpf_apply(___bpf_syscall_args, ___bpf_narg(args))(args) /* If kernel doesn't have CONFIG_ARCH_HAS_SYSCALL_WRAPPER, we have to BPF_CORE_READ from pt_regs */ #define ___bpf_syswrap_args0() ctx -#define ___bpf_syswrap_args1(x) ___bpf_syswrap_args0(), (void *)PT_REGS_PARM1_CORE_SYSCALL(regs) -#define ___bpf_syswrap_args2(x, args...) ___bpf_syswrap_args1(args), (void *)PT_REGS_PARM2_CORE_SYSCALL(regs) -#define ___bpf_syswrap_args3(x, args...) ___bpf_syswrap_args2(args), (void *)PT_REGS_PARM3_CORE_SYSCALL(regs) -#define ___bpf_syswrap_args4(x, args...) 
___bpf_syswrap_args3(args), (void *)PT_REGS_PARM4_CORE_SYSCALL(regs) -#define ___bpf_syswrap_args5(x, args...) ___bpf_syswrap_args4(args), (void *)PT_REGS_PARM5_CORE_SYSCALL(regs) -#define ___bpf_syswrap_args6(x, args...) ___bpf_syswrap_args5(args), (void *)PT_REGS_PARM6_CORE_SYSCALL(regs) -#define ___bpf_syswrap_args7(x, args...) ___bpf_syswrap_args6(args), (void *)PT_REGS_PARM7_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args1(x) ___bpf_syswrap_args0(), (unsigned long long)PT_REGS_PARM1_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args2(x, args...) ___bpf_syswrap_args1(args), (unsigned long long)PT_REGS_PARM2_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args3(x, args...) ___bpf_syswrap_args2(args), (unsigned long long)PT_REGS_PARM3_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args4(x, args...) ___bpf_syswrap_args3(args), (unsigned long long)PT_REGS_PARM4_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args5(x, args...) ___bpf_syswrap_args4(args), (unsigned long long)PT_REGS_PARM5_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args6(x, args...) ___bpf_syswrap_args5(args), (unsigned long long)PT_REGS_PARM6_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args7(x, args...) ___bpf_syswrap_args6(args), (unsigned long long)PT_REGS_PARM7_CORE_SYSCALL(regs) #define ___bpf_syswrap_args(args...) 
___bpf_apply(___bpf_syswrap_args, ___bpf_narg(args))(args) /* diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 2d0840ef59..142060bbce 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -598,7 +598,7 @@ static int btf_sanity_check(const struct btf *btf) __u32 i, n = btf__type_cnt(btf); int err; - for (i = 1; i < n; i++) { + for (i = btf->start_id; i < n; i++) { t = btf_type_by_id(btf, i); err = btf_validate_type(btf, t, i); if (err) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 4d9f30bf7f..894860111d 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -1559,10 +1559,12 @@ static void btf_dump_emit_type_chain(struct btf_dump *d, * Clang for BPF target generates func_proto with no * args as a func_proto with a single void arg (e.g., * `int (*f)(void)` vs just `int (*f)()`). We are - * going to pretend there are no args for such case. + * going to emit valid empty args (void) syntax for + * such case. Similarly and conveniently, valid + * no args case can be special-cased here as well. 
*/ - if (vlen == 1 && p->type == 0) { - btf_dump_printf(d, ")"); + if (vlen == 0 || (vlen == 1 && p->type == 0)) { + btf_dump_printf(d, "void)"); return; } @@ -1929,6 +1931,7 @@ static int btf_dump_int_data(struct btf_dump *d, if (d->typed_dump->is_array_terminated) break; if (*(char *)data == '\0') { + btf_dump_type_values(d, "'\\0'"); d->typed_dump->is_array_terminated = true; break; } @@ -2031,6 +2034,7 @@ static int btf_dump_array_data(struct btf_dump *d, __u32 i, elem_type_id; __s64 elem_size; bool is_array_member; + bool is_array_terminated; elem_type_id = array->type; elem_type = skip_mods_and_typedefs(d->btf, elem_type_id, NULL); @@ -2066,12 +2070,15 @@ static int btf_dump_array_data(struct btf_dump *d, */ is_array_member = d->typed_dump->is_array_member; d->typed_dump->is_array_member = true; + is_array_terminated = d->typed_dump->is_array_terminated; + d->typed_dump->is_array_terminated = false; for (i = 0; i < array->nelems; i++, data += elem_size) { if (d->typed_dump->is_array_terminated) break; btf_dump_dump_type_data(d, NULL, elem_type, elem_type_id, data, 0, 0); } d->typed_dump->is_array_member = is_array_member; + d->typed_dump->is_array_terminated = is_array_terminated; d->typed_dump->depth--; btf_dump_data_pfx(d); btf_dump_type_values(d, "]"); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index f515cf264a..5401f2df46 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -132,6 +132,7 @@ static const char * const attach_type_name[] = { [BPF_TRACE_UPROBE_MULTI] = "trace_uprobe_multi", [BPF_NETKIT_PRIMARY] = "netkit_primary", [BPF_NETKIT_PEER] = "netkit_peer", + [BPF_TRACE_KPROBE_SESSION] = "trace_kprobe_session", }; static const char * const link_type_name[] = { @@ -149,6 +150,7 @@ static const char * const link_type_name[] = { [BPF_LINK_TYPE_TCX] = "tcx", [BPF_LINK_TYPE_UPROBE_MULTI] = "uprobe_multi", [BPF_LINK_TYPE_NETKIT] = "netkit", + [BPF_LINK_TYPE_SOCKMAP] = "sockmap", }; static const char * const map_type_name[] = 
{ @@ -1126,17 +1128,46 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) const struct btf_type *mtype, *kern_mtype; __u32 mtype_id, kern_mtype_id; void *mdata, *kern_mdata; + struct bpf_program *prog; __s64 msize, kern_msize; __u32 moff, kern_moff; __u32 kern_member_idx; const char *mname; mname = btf__name_by_offset(btf, member->name_off); + moff = member->offset / 8; + mdata = data + moff; + msize = btf__resolve_size(btf, member->type); + if (msize < 0) { + pr_warn("struct_ops init_kern %s: failed to resolve the size of member %s\n", + map->name, mname); + return msize; + } + kern_member = find_member_by_name(kern_btf, kern_type, mname); if (!kern_member) { - pr_warn("struct_ops init_kern %s: Cannot find member %s in kernel BTF\n", + if (!libbpf_is_mem_zeroed(mdata, msize)) { + pr_warn("struct_ops init_kern %s: Cannot find member %s in kernel BTF\n", + map->name, mname); + return -ENOTSUP; + } + + if (st_ops->progs[i]) { + /* If we had declaratively set struct_ops callback, we need to + * force its autoload to false, because it doesn't have + * a chance of succeeding from POV of the current struct_ops map. + * If this program is still referenced somewhere else, though, + * then bpf_object_adjust_struct_ops_autoload() will update its + * autoload accordingly. 
+ */ + st_ops->progs[i]->autoload = false; + st_ops->progs[i] = NULL; + } + + /* Skip all-zero/NULL fields if they are not present in the kernel BTF */ + pr_info("struct_ops %s: member %s not found in kernel, skipping it as it's set to zero\n", map->name, mname); - return -ENOTSUP; + continue; } kern_member_idx = kern_member - btf_members(kern_type); @@ -1147,10 +1178,7 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) return -ENOTSUP; } - moff = member->offset / 8; kern_moff = kern_member->offset / 8; - - mdata = data + moff; kern_mdata = kern_data + kern_moff; mtype = skip_mods_and_typedefs(btf, member->type, &mtype_id); @@ -1165,13 +1193,19 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) } if (btf_is_ptr(mtype)) { - struct bpf_program *prog; + prog = *(void **)mdata; + /* just like for !kern_member case above, reset declaratively + * set (at compile time) program's autload to false, + * if user replaced it with another program or NULL + */ + if (st_ops->progs[i] && st_ops->progs[i] != prog) + st_ops->progs[i]->autoload = false; /* Update the value from the shadow type */ - prog = *(void **)mdata; st_ops->progs[i] = prog; if (!prog) continue; + if (!is_valid_st_ops_program(obj, prog)) { pr_warn("struct_ops init_kern %s: member %s is not a struct_ops program\n", map->name, mname); @@ -1230,9 +1264,8 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) continue; } - msize = btf__resolve_size(btf, mtype_id); kern_msize = btf__resolve_size(kern_btf, kern_mtype_id); - if (msize < 0 || kern_msize < 0 || msize != kern_msize) { + if (kern_msize < 0 || msize != kern_msize) { pr_warn("struct_ops init_kern %s: Error in size of member %s: %zd != %zd(kernel)\n", map->name, mname, (ssize_t)msize, (ssize_t)kern_msize); @@ -1956,6 +1989,20 @@ static struct extern_desc *find_extern_by_name(const struct bpf_object *obj, return NULL; } +static struct extern_desc *find_extern_by_name_with_len(const struct bpf_object *obj, + const void 
*name, int len) +{ + const char *ext_name; + int i; + + for (i = 0; i < obj->nr_extern; i++) { + ext_name = obj->externs[i].name; + if (strlen(ext_name) == len && strncmp(ext_name, name, len) == 0) + return &obj->externs[i]; + } + return NULL; +} + static int set_kcfg_value_tri(struct extern_desc *ext, void *ext_val, char value) { @@ -7325,7 +7372,11 @@ static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog __u32 log_level = prog->log_level; int ret, err; - if (prog->type == BPF_PROG_TYPE_UNSPEC) { + /* Be more helpful by rejecting programs that can't be validated early + * with more meaningful and actionable error message. + */ + switch (prog->type) { + case BPF_PROG_TYPE_UNSPEC: /* * The program type must be set. Most likely we couldn't find a proper * section definition at load time, and thus we didn't infer the type. @@ -7333,6 +7384,15 @@ static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog pr_warn("prog '%s': missing BPF prog type, check ELF section name '%s'\n", prog->name, prog->sec_name); return -EINVAL; + case BPF_PROG_TYPE_STRUCT_OPS: + if (prog->attach_btf_id == 0) { + pr_warn("prog '%s': SEC(\"struct_ops\") program isn't referenced anywhere, did you forget to use it?\n", + prog->name); + return -EINVAL; + } + break; + default: + break; } if (!insns || !insns_cnt) @@ -7972,7 +8032,10 @@ static int bpf_object__sanitize_maps(struct bpf_object *obj) return 0; } -int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *ctx) +typedef int (*kallsyms_cb_t)(unsigned long long sym_addr, char sym_type, + const char *sym_name, void *ctx); + +static int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *ctx) { char sym_type, sym_name[500]; unsigned long long sym_addr; @@ -8012,8 +8075,13 @@ static int kallsyms_cb(unsigned long long sym_addr, char sym_type, struct bpf_object *obj = ctx; const struct btf_type *t; struct extern_desc *ext; + char *res; - ext = find_extern_by_name(obj, sym_name); + res = strstr(sym_name, 
".llvm."); + if (sym_type == 'd' && res) + ext = find_extern_by_name_with_len(obj, sym_name, res - sym_name); + else + ext = find_extern_by_name(obj, sym_name); if (!ext || ext->type != EXT_KSYM) return 0; @@ -8562,6 +8630,11 @@ int bpf_map__pin(struct bpf_map *map, const char *path) return libbpf_err(-EINVAL); } + if (map->fd < 0) { + pr_warn("map '%s': can't pin BPF map without FD (was it created?)\n", map->name); + return libbpf_err(-EINVAL); + } + if (map->pin_path) { if (path && strcmp(path, map->pin_path)) { pr_warn("map '%s' already has pin path '%s' different from '%s'\n", @@ -9230,6 +9303,7 @@ static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_lin static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_trace(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_kprobe_session(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_uprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_link **link); @@ -9246,6 +9320,7 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("uretprobe.s+", KPROBE, 0, SEC_SLEEPABLE, attach_uprobe), SEC_DEF("kprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), SEC_DEF("kretprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), + SEC_DEF("kprobe.session+", KPROBE, BPF_TRACE_KPROBE_SESSION, SEC_NONE, attach_kprobe_session), SEC_DEF("uprobe.multi+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_NONE, attach_uprobe_multi), SEC_DEF("uretprobe.multi+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_NONE, attach_uprobe_multi), SEC_DEF("uprobe.multi.s+", 
KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_SLEEPABLE, attach_uprobe_multi), @@ -9297,6 +9372,7 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("sockops", SOCK_OPS, BPF_CGROUP_SOCK_OPS, SEC_ATTACHABLE_OPT), SEC_DEF("sk_skb/stream_parser", SK_SKB, BPF_SK_SKB_STREAM_PARSER, SEC_ATTACHABLE_OPT), SEC_DEF("sk_skb/stream_verdict",SK_SKB, BPF_SK_SKB_STREAM_VERDICT, SEC_ATTACHABLE_OPT), + SEC_DEF("sk_skb/verdict", SK_SKB, BPF_SK_SKB_VERDICT, SEC_ATTACHABLE_OPT), SEC_DEF("sk_skb", SK_SKB, 0, SEC_NONE), SEC_DEF("sk_msg", SK_MSG, BPF_SK_MSG_VERDICT, SEC_ATTACHABLE_OPT), SEC_DEF("lirc_mode2", LIRC_MODE2, BPF_LIRC_MODE2, SEC_ATTACHABLE_OPT), @@ -9815,16 +9891,28 @@ static int find_kernel_btf_id(struct bpf_object *obj, const char *attach_name, enum bpf_attach_type attach_type, int *btf_obj_fd, int *btf_type_id) { - int ret, i; + int ret, i, mod_len; + const char *fn_name, *mod_name = NULL; - ret = find_attach_btf_id(obj->btf_vmlinux, attach_name, attach_type); - if (ret > 0) { - *btf_obj_fd = 0; /* vmlinux BTF */ - *btf_type_id = ret; - return 0; + fn_name = strchr(attach_name, ':'); + if (fn_name) { + mod_name = attach_name; + mod_len = fn_name - mod_name; + fn_name++; + } + + if (!mod_name || strncmp(mod_name, "vmlinux", mod_len) == 0) { + ret = find_attach_btf_id(obj->btf_vmlinux, + mod_name ? fn_name : attach_name, + attach_type); + if (ret > 0) { + *btf_obj_fd = 0; /* vmlinux BTF */ + *btf_type_id = ret; + return 0; + } + if (ret != -ENOENT) + return ret; } - if (ret != -ENOENT) - return ret; ret = load_module_btfs(obj); if (ret) @@ -9833,7 +9921,12 @@ static int find_kernel_btf_id(struct bpf_object *obj, const char *attach_name, for (i = 0; i < obj->btf_module_cnt; i++) { const struct module_btf *mod = &obj->btf_modules[i]; - ret = find_attach_btf_id(mod->btf, attach_name, attach_type); + if (mod_name && strncmp(mod->name, mod_name, mod_len) != 0) + continue; + + ret = find_attach_btf_id(mod->btf, + mod_name ? 
fn_name : attach_name, + attach_type); if (ret > 0) { *btf_obj_fd = mod->fd; *btf_type_id = ret; @@ -10306,6 +10399,11 @@ static int validate_map_op(const struct bpf_map *map, size_t key_sz, return -EINVAL; } + if (map->fd < 0) { + pr_warn("map '%s': can't use BPF map without FD (was it created?)\n", map->name); + return -EINVAL; + } + if (!check_value_sz) return 0; @@ -10418,8 +10516,15 @@ long libbpf_get_error(const void *ptr) int bpf_link__update_program(struct bpf_link *link, struct bpf_program *prog) { int ret; + int prog_fd = bpf_program__fd(prog); + + if (prog_fd < 0) { + pr_warn("prog '%s': can't use BPF program without FD (was it loaded?)\n", + prog->name); + return libbpf_err(-EINVAL); + } - ret = bpf_link_update(bpf_link__fd(link), bpf_program__fd(prog), NULL); + ret = bpf_link_update(bpf_link__fd(link), prog_fd, NULL); return libbpf_err_errno(ret); } @@ -10613,7 +10718,7 @@ struct bpf_link *bpf_program__attach_perf_event_opts(const struct bpf_program *p } prog_fd = bpf_program__fd(prog); if (prog_fd < 0) { - pr_warn("prog '%s': can't attach BPF program w/o FD (did you load it?)\n", + pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n", prog->name); return libbpf_err_ptr(-EINVAL); } @@ -11325,18 +11430,26 @@ bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, struct kprobe_multi_resolve res = { .pattern = pattern, }; + enum bpf_attach_type attach_type; struct bpf_link *link = NULL; char errmsg[STRERR_BUFSIZE]; const unsigned long *addrs; int err, link_fd, prog_fd; + bool retprobe, session; const __u64 *cookies; const char **syms; - bool retprobe; size_t cnt; if (!OPTS_VALID(opts, bpf_kprobe_multi_opts)) return libbpf_err_ptr(-EINVAL); + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + syms = OPTS_GET(opts, syms, false); addrs = OPTS_GET(opts, addrs, false); cnt = 
OPTS_GET(opts, cnt, false); @@ -11363,6 +11476,12 @@ bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, } retprobe = OPTS_GET(opts, retprobe, false); + session = OPTS_GET(opts, session, false); + + if (retprobe && session) + return libbpf_err_ptr(-EINVAL); + + attach_type = session ? BPF_TRACE_KPROBE_SESSION : BPF_TRACE_KPROBE_MULTI; lopts.kprobe_multi.syms = syms; lopts.kprobe_multi.addrs = addrs; @@ -11377,8 +11496,7 @@ bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, } link->detach = &bpf_link__detach_fd; - prog_fd = bpf_program__fd(prog); - link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_KPROBE_MULTI, &lopts); + link_fd = bpf_link_create(prog_fd, 0, attach_type, &lopts); if (link_fd < 0) { err = -errno; pr_warn("prog '%s': failed to attach: %s\n", @@ -11484,6 +11602,32 @@ static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, stru return libbpf_get_error(*link); } +static int attach_kprobe_session(const struct bpf_program *prog, long cookie, + struct bpf_link **link) +{ + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts, .session = true); + const char *spec; + char *pattern; + int n; + + *link = NULL; + + /* no auto-attach for SEC("kprobe.session") */ + if (strcmp(prog->sec_name, "kprobe.session") == 0) + return 0; + + spec = prog->sec_name + sizeof("kprobe.session/") - 1; + n = sscanf(spec, "%m[a-zA-Z0-9_.*?]", &pattern); + if (n < 1) { + pr_warn("kprobe session pattern is invalid: %s\n", spec); + return -EINVAL; + } + + *link = bpf_program__attach_kprobe_multi_opts(prog, pattern, &opts); + free(pattern); + return *link ? 
0 : -errno; +} + static int attach_uprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link) { char *probe_type = NULL, *binary_path = NULL, *func_name = NULL; @@ -11760,6 +11904,13 @@ bpf_program__attach_uprobe_multi(const struct bpf_program *prog, if (!OPTS_VALID(opts, bpf_uprobe_multi_opts)) return libbpf_err_ptr(-EINVAL); + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + syms = OPTS_GET(opts, syms, NULL); offsets = OPTS_GET(opts, offsets, NULL); ref_ctr_offsets = OPTS_GET(opts, ref_ctr_offsets, NULL); @@ -11835,7 +11986,6 @@ bpf_program__attach_uprobe_multi(const struct bpf_program *prog, } link->detach = &bpf_link__detach_fd; - prog_fd = bpf_program__fd(prog); link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_UPROBE_MULTI, &lopts); if (link_fd < 0) { err = -errno; @@ -12079,7 +12229,7 @@ struct bpf_link *bpf_program__attach_usdt(const struct bpf_program *prog, return libbpf_err_ptr(-EINVAL); if (bpf_program__fd(prog) < 0) { - pr_warn("prog '%s': can't attach BPF program w/o FD (did you load it?)\n", + pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n", prog->name); return libbpf_err_ptr(-EINVAL); } @@ -12270,13 +12420,19 @@ static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_lin return libbpf_get_error(*link); } -struct bpf_link *bpf_program__attach_raw_tracepoint(const struct bpf_program *prog, - const char *tp_name) +struct bpf_link * +bpf_program__attach_raw_tracepoint_opts(const struct bpf_program *prog, + const char *tp_name, + struct bpf_raw_tracepoint_opts *opts) { + LIBBPF_OPTS(bpf_raw_tp_opts, raw_opts); char errmsg[STRERR_BUFSIZE]; struct bpf_link *link; int prog_fd, pfd; + if (!OPTS_VALID(opts, bpf_raw_tracepoint_opts)) + return libbpf_err_ptr(-EINVAL); + prog_fd = bpf_program__fd(prog); if (prog_fd < 0) { pr_warn("prog '%s': can't attach 
before loaded\n", prog->name); @@ -12288,7 +12444,9 @@ struct bpf_link *bpf_program__attach_raw_tracepoint(const struct bpf_program *pr return libbpf_err_ptr(-ENOMEM); link->detach = &bpf_link__detach_fd; - pfd = bpf_raw_tracepoint_open(tp_name, prog_fd); + raw_opts.tp_name = tp_name; + raw_opts.cookie = OPTS_GET(opts, cookie, 0); + pfd = bpf_raw_tracepoint_open_opts(prog_fd, &raw_opts); if (pfd < 0) { pfd = -errno; free(link); @@ -12300,6 +12458,12 @@ struct bpf_link *bpf_program__attach_raw_tracepoint(const struct bpf_program *pr return link; } +struct bpf_link *bpf_program__attach_raw_tracepoint(const struct bpf_program *prog, + const char *tp_name) +{ + return bpf_program__attach_raw_tracepoint_opts(prog, tp_name, NULL); +} + static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link) { static const char *const prefixes[] = { @@ -12453,6 +12617,12 @@ bpf_program__attach_netns(const struct bpf_program *prog, int netns_fd) return bpf_program_attach_fd(prog, netns_fd, "netns", NULL); } +struct bpf_link * +bpf_program__attach_sockmap(const struct bpf_program *prog, int map_fd) +{ + return bpf_program_attach_fd(prog, map_fd, "sockmap", NULL); +} + struct bpf_link *bpf_program__attach_xdp(const struct bpf_program *prog, int ifindex) { /* target_fd/target_ifindex use the same field in LINK_CREATE */ @@ -12661,6 +12831,12 @@ struct bpf_link *bpf_program__attach(const struct bpf_program *prog) if (!prog->sec_def || !prog->sec_def->prog_attach_fn) return libbpf_err_ptr(-EOPNOTSUPP); + if (bpf_program__fd(prog) < 0) { + pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + err = prog->sec_def->prog_attach_fn(prog, prog->sec_def->cookie, &link); if (err) return libbpf_err_ptr(err); @@ -12701,8 +12877,13 @@ struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map) __u32 zero = 0; int err, fd; - if (!bpf_map__is_struct_ops(map) || map->fd == -1) + if 
(!bpf_map__is_struct_ops(map)) + return libbpf_err_ptr(-EINVAL); + + if (map->fd < 0) { + pr_warn("map '%s': can't attach BPF map without FD (was it created?)\n", map->name); return libbpf_err_ptr(-EINVAL); + } link = calloc(1, sizeof(*link)); if (!link) @@ -12750,9 +12931,14 @@ int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map) __u32 zero = 0; int err; - if (!bpf_map__is_struct_ops(map) || !map_is_created(map)) + if (!bpf_map__is_struct_ops(map)) return -EINVAL; + if (map->fd < 0) { + pr_warn("map '%s': can't use BPF map without FD (was it created?)\n", map->name); + return -EINVAL; + } + st_ops_link = container_of(link, struct bpf_link_struct_ops, link); /* Ensure the type of a link is correct */ if (st_ops_link->map_fd < 0) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 7b510761f5..c3f77d9260 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -539,10 +539,12 @@ struct bpf_kprobe_multi_opts { size_t cnt; /* create return kprobes */ bool retprobe; + /* create session kprobes */ + bool session; size_t :0; }; -#define bpf_kprobe_multi_opts__last_field retprobe +#define bpf_kprobe_multi_opts__last_field session LIBBPF_API struct bpf_link * bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, @@ -760,9 +762,20 @@ bpf_program__attach_tracepoint_opts(const struct bpf_program *prog, const char *tp_name, const struct bpf_tracepoint_opts *opts); +struct bpf_raw_tracepoint_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u64 cookie; + size_t :0; +}; +#define bpf_raw_tracepoint_opts__last_field cookie + LIBBPF_API struct bpf_link * bpf_program__attach_raw_tracepoint(const struct bpf_program *prog, const char *tp_name); +LIBBPF_API struct bpf_link * +bpf_program__attach_raw_tracepoint_opts(const struct bpf_program *prog, + const char *tp_name, + struct bpf_raw_tracepoint_opts *opts); struct bpf_trace_opts { /* size of this struct, for forward/backward 
compatibility */ @@ -784,6 +797,8 @@ bpf_program__attach_cgroup(const struct bpf_program *prog, int cgroup_fd); LIBBPF_API struct bpf_link * bpf_program__attach_netns(const struct bpf_program *prog, int netns_fd); LIBBPF_API struct bpf_link * +bpf_program__attach_sockmap(const struct bpf_program *prog, int map_fd); +LIBBPF_API struct bpf_link * bpf_program__attach_xdp(const struct bpf_program *prog, int ifindex); LIBBPF_API struct bpf_link * bpf_program__attach_freplace(const struct bpf_program *prog, @@ -1282,6 +1297,7 @@ LIBBPF_API int ring_buffer__add(struct ring_buffer *rb, int map_fd, ring_buffer_sample_fn sample_cb, void *ctx); LIBBPF_API int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms); LIBBPF_API int ring_buffer__consume(struct ring_buffer *rb); +LIBBPF_API int ring_buffer__consume_n(struct ring_buffer *rb, size_t n); LIBBPF_API int ring_buffer__epoll_fd(const struct ring_buffer *rb); /** @@ -1356,6 +1372,17 @@ LIBBPF_API int ring__map_fd(const struct ring *r); */ LIBBPF_API int ring__consume(struct ring *r); +/** + * @brief **ring__consume_n()** consumes up to a requested amount of items from + * a ringbuffer without event polling. + * + * @param r A ringbuffer object. + * @param n Maximum amount of items to consume. + * @return The number of items consumed, or a negative number if any of the + * callbacks return an error. 
+ */ +LIBBPF_API int ring__consume_n(struct ring *r, size_t n); + struct user_ring_buffer_opts { size_t sz; /* size of this struct, for forward/backward compatibility */ }; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 86804fd90d..c1ce8aa352 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -410,7 +410,16 @@ LIBBPF_1.3.0 { LIBBPF_1.4.0 { global: + bpf_program__attach_raw_tracepoint_opts; + bpf_raw_tracepoint_open_opts; bpf_token_create; btf__new_split; btf_ext__raw_data; } LIBBPF_1.3.0; + +LIBBPF_1.5.0 { + global: + bpf_program__attach_sockmap; + ring__consume_n; + ring_buffer__consume_n; +} LIBBPF_1.4.0; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 864b361774..7e7e686008 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -518,11 +518,6 @@ int btf_ext_visit_str_offs(struct btf_ext *btf_ext, str_off_visit_fn visit, void __s32 btf__find_by_name_kind_own(const struct btf *btf, const char *type_name, __u32 kind); -typedef int (*kallsyms_cb_t)(unsigned long long sym_addr, char sym_type, - const char *sym_name, void *ctx); - -int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *arg); - /* handle direct returned errors */ static inline int libbpf_err(int ret) { @@ -602,13 +597,9 @@ static inline int ensure_good_fd(int fd) return fd; } -static inline int sys_dup2(int oldfd, int newfd) +static inline int sys_dup3(int oldfd, int newfd, int flags) { -#ifdef __NR_dup2 - return syscall(__NR_dup2, oldfd, newfd); -#else - return syscall(__NR_dup3, oldfd, newfd, 0); -#endif + return syscall(__NR_dup3, oldfd, newfd, flags); } /* Point *fixed_fd* to the same file that *tmp_fd* points to. @@ -619,7 +610,7 @@ static inline int reuse_fd(int fixed_fd, int tmp_fd) { int err; - err = sys_dup2(tmp_fd, fixed_fd); + err = sys_dup3(tmp_fd, fixed_fd, O_CLOEXEC); err = err < 0 ? 
-errno : 0; close(tmp_fd); /* clean up temporary FD */ return err; diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index 3021881224..9dfbe7750f 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -448,7 +448,8 @@ int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helpe /* If BPF verifier doesn't recognize BPF helper ID (enum bpf_func_id) * at all, it will emit something like "invalid func unknown#181". * If BPF verifier recognizes BPF helper but it's not supported for - * given BPF program type, it will emit "unknown func bpf_sys_bpf#166". + * given BPF program type, it will emit "unknown func bpf_sys_bpf#166" + * or "program of this type cannot use helper bpf_sys_bpf#166". * In both cases, provided combination of BPF program type and BPF * helper is not supported by the kernel. * In all other cases, probe_prog_load() above will either succeed (e.g., @@ -457,7 +458,8 @@ int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helpe * that), or we'll get some more specific BPF verifier error about * some unsatisfied conditions. 
*/ - if (ret == 0 && (strstr(buf, "invalid func ") || strstr(buf, "unknown func "))) + if (ret == 0 && (strstr(buf, "invalid func ") || strstr(buf, "unknown func ") || + strstr(buf, "program of this type cannot use helper "))) return 0; return 1; /* assume supported */ } diff --git a/tools/lib/bpf/libbpf_version.h b/tools/lib/bpf/libbpf_version.h index e783a47da8..d6e5eff967 100644 --- a/tools/lib/bpf/libbpf_version.h +++ b/tools/lib/bpf/libbpf_version.h @@ -4,6 +4,6 @@ #define __LIBBPF_VERSION_H #define LIBBPF_MAJOR_VERSION 1 -#define LIBBPF_MINOR_VERSION 4 +#define LIBBPF_MINOR_VERSION 5 #endif /* __LIBBPF_VERSION_H */ diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index 0d4be82955..5a583053e3 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -2213,10 +2213,17 @@ static int linker_fixup_btf(struct src_obj *obj) vi = btf_var_secinfos(t); for (j = 0, m = btf_vlen(t); j < m; j++, vi++) { const struct btf_type *vt = btf__type_by_id(obj->btf, vi->type); - const char *var_name = btf__str_by_offset(obj->btf, vt->name_off); - int var_linkage = btf_var(vt)->linkage; + const char *var_name; + int var_linkage; Elf64_Sym *sym; + /* could be a variable or function */ + if (!btf_is_var(vt)) + continue; + + var_name = btf__str_by_offset(obj->btf, vt->name_off); + var_linkage = btf_var(vt)->linkage; + /* no need to patch up static or extern vars */ if (var_linkage != BTF_VAR_GLOBAL_ALLOCATED) continue; diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c index aacb64278a..bfd8dac4c0 100644 --- a/tools/lib/bpf/ringbuf.c +++ b/tools/lib/bpf/ringbuf.c @@ -231,7 +231,7 @@ static inline int roundup_len(__u32 len) return (len + 7) / 8 * 8; } -static int64_t ringbuf_process_ring(struct ring *r) +static int64_t ringbuf_process_ring(struct ring *r, size_t n) { int *len_ptr, len, err; /* 64-bit to avoid overflow in case of extreme application behavior */ @@ -268,12 +268,42 @@ static int64_t ringbuf_process_ring(struct ring *r) } 
smp_store_release(r->consumer_pos, cons_pos); + + if (cnt >= n) + goto done; } } while (got_new_data); done: return cnt; } +/* Consume available ring buffer(s) data without event polling, up to n + * records. + * + * Returns number of records consumed across all registered ring buffers (or + * n, whichever is less), or negative number if any of the callbacks return + * error. + */ +int ring_buffer__consume_n(struct ring_buffer *rb, size_t n) +{ + int64_t err, res = 0; + int i; + + for (i = 0; i < rb->ring_cnt; i++) { + struct ring *ring = rb->rings[i]; + + err = ringbuf_process_ring(ring, n); + if (err < 0) + return libbpf_err(err); + res += err; + n -= err; + + if (n == 0) + break; + } + return res > INT_MAX ? INT_MAX : res; +} + /* Consume available ring buffer(s) data without event polling. * Returns number of records consumed across all registered ring buffers (or * INT_MAX, whichever is less), or negative number if any of the callbacks @@ -287,13 +317,15 @@ int ring_buffer__consume(struct ring_buffer *rb) for (i = 0; i < rb->ring_cnt; i++) { struct ring *ring = rb->rings[i]; - err = ringbuf_process_ring(ring); + err = ringbuf_process_ring(ring, INT_MAX); if (err < 0) return libbpf_err(err); res += err; + if (res > INT_MAX) { + res = INT_MAX; + break; + } } - if (res > INT_MAX) - return INT_MAX; return res; } @@ -314,13 +346,13 @@ int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms) __u32 ring_id = rb->events[i].data.fd; struct ring *ring = rb->rings[ring_id]; - err = ringbuf_process_ring(ring); + err = ringbuf_process_ring(ring, INT_MAX); if (err < 0) return libbpf_err(err); res += err; } if (res > INT_MAX) - return INT_MAX; + res = INT_MAX; return res; } @@ -371,17 +403,22 @@ int ring__map_fd(const struct ring *r) return r->map_fd; } -int ring__consume(struct ring *r) +int ring__consume_n(struct ring *r, size_t n) { int64_t res; - res = ringbuf_process_ring(r); + res = ringbuf_process_ring(r, n); if (res < 0) return libbpf_err(res); return res > 
INT_MAX ? INT_MAX : res; } +int ring__consume(struct ring *r) +{ + return ring__consume_n(r, INT_MAX); +} + static void user_ringbuf_unmap_ring(struct user_ring_buffer *rb) { if (rb->consumer_pos) { diff --git a/tools/lib/bpf/str_error.c b/tools/lib/bpf/str_error.c index 146da01979..5e6a1e27dd 100644 --- a/tools/lib/bpf/str_error.c +++ b/tools/lib/bpf/str_error.c @@ -2,6 +2,7 @@ #undef _GNU_SOURCE #include #include +#include #include "str_error.h" /* make sure libbpf doesn't use kernel-only integer typedefs */ @@ -15,7 +16,18 @@ char *libbpf_strerror_r(int err, char *dst, int len) { int ret = strerror_r(err < 0 ? -err : err, dst, len); - if (ret) - snprintf(dst, len, "ERROR: strerror_r(%d)=%d", err, ret); + /* on glibc <2.13, ret == -1 and errno is set, if strerror_r() can't + * handle the error, on glibc >=2.13 *positive* (errno-like) error + * code is returned directly + */ + if (ret == -1) + ret = errno; + if (ret) { + if (ret == EINVAL) + /* strerror_r() doesn't recognize this specific error */ + snprintf(dst, len, "unknown error (%d)", err < 0 ? err : -err); + else + snprintf(dst, len, "ERROR: strerror_r(%d)=%d", err, ret); + } return dst; } diff --git a/tools/lib/bpf/usdt.bpf.h b/tools/lib/bpf/usdt.bpf.h index f6763300b2..76359bcdc9 100644 --- a/tools/lib/bpf/usdt.bpf.h +++ b/tools/lib/bpf/usdt.bpf.h @@ -214,18 +214,18 @@ long bpf_usdt_cookie(struct pt_regs *ctx) /* we rely on ___bpf_apply() and ___bpf_narg() macros already defined in bpf_tracing.h */ #define ___bpf_usdt_args0() ctx -#define ___bpf_usdt_args1(x) ___bpf_usdt_args0(), ({ long _x; bpf_usdt_arg(ctx, 0, &_x); (void *)_x; }) -#define ___bpf_usdt_args2(x, args...) ___bpf_usdt_args1(args), ({ long _x; bpf_usdt_arg(ctx, 1, &_x); (void *)_x; }) -#define ___bpf_usdt_args3(x, args...) ___bpf_usdt_args2(args), ({ long _x; bpf_usdt_arg(ctx, 2, &_x); (void *)_x; }) -#define ___bpf_usdt_args4(x, args...) 
___bpf_usdt_args3(args), ({ long _x; bpf_usdt_arg(ctx, 3, &_x); (void *)_x; }) -#define ___bpf_usdt_args5(x, args...) ___bpf_usdt_args4(args), ({ long _x; bpf_usdt_arg(ctx, 4, &_x); (void *)_x; }) -#define ___bpf_usdt_args6(x, args...) ___bpf_usdt_args5(args), ({ long _x; bpf_usdt_arg(ctx, 5, &_x); (void *)_x; }) -#define ___bpf_usdt_args7(x, args...) ___bpf_usdt_args6(args), ({ long _x; bpf_usdt_arg(ctx, 6, &_x); (void *)_x; }) -#define ___bpf_usdt_args8(x, args...) ___bpf_usdt_args7(args), ({ long _x; bpf_usdt_arg(ctx, 7, &_x); (void *)_x; }) -#define ___bpf_usdt_args9(x, args...) ___bpf_usdt_args8(args), ({ long _x; bpf_usdt_arg(ctx, 8, &_x); (void *)_x; }) -#define ___bpf_usdt_args10(x, args...) ___bpf_usdt_args9(args), ({ long _x; bpf_usdt_arg(ctx, 9, &_x); (void *)_x; }) -#define ___bpf_usdt_args11(x, args...) ___bpf_usdt_args10(args), ({ long _x; bpf_usdt_arg(ctx, 10, &_x); (void *)_x; }) -#define ___bpf_usdt_args12(x, args...) ___bpf_usdt_args11(args), ({ long _x; bpf_usdt_arg(ctx, 11, &_x); (void *)_x; }) +#define ___bpf_usdt_args1(x) ___bpf_usdt_args0(), ({ long _x; bpf_usdt_arg(ctx, 0, &_x); _x; }) +#define ___bpf_usdt_args2(x, args...) ___bpf_usdt_args1(args), ({ long _x; bpf_usdt_arg(ctx, 1, &_x); _x; }) +#define ___bpf_usdt_args3(x, args...) ___bpf_usdt_args2(args), ({ long _x; bpf_usdt_arg(ctx, 2, &_x); _x; }) +#define ___bpf_usdt_args4(x, args...) ___bpf_usdt_args3(args), ({ long _x; bpf_usdt_arg(ctx, 3, &_x); _x; }) +#define ___bpf_usdt_args5(x, args...) ___bpf_usdt_args4(args), ({ long _x; bpf_usdt_arg(ctx, 4, &_x); _x; }) +#define ___bpf_usdt_args6(x, args...) ___bpf_usdt_args5(args), ({ long _x; bpf_usdt_arg(ctx, 5, &_x); _x; }) +#define ___bpf_usdt_args7(x, args...) ___bpf_usdt_args6(args), ({ long _x; bpf_usdt_arg(ctx, 6, &_x); _x; }) +#define ___bpf_usdt_args8(x, args...) ___bpf_usdt_args7(args), ({ long _x; bpf_usdt_arg(ctx, 7, &_x); _x; }) +#define ___bpf_usdt_args9(x, args...) 
___bpf_usdt_args8(args), ({ long _x; bpf_usdt_arg(ctx, 8, &_x); _x; }) +#define ___bpf_usdt_args10(x, args...) ___bpf_usdt_args9(args), ({ long _x; bpf_usdt_arg(ctx, 9, &_x); _x; }) +#define ___bpf_usdt_args11(x, args...) ___bpf_usdt_args10(args), ({ long _x; bpf_usdt_arg(ctx, 10, &_x); _x; }) +#define ___bpf_usdt_args12(x, args...) ___bpf_usdt_args11(args), ({ long _x; bpf_usdt_arg(ctx, 11, &_x); _x; }) #define ___bpf_usdt_args(args...) ___bpf_apply(___bpf_usdt_args, ___bpf_narg(args))(args) /* diff --git a/tools/lib/perf/cpumap.c b/tools/lib/perf/cpumap.c index 4adcd7920d..cae799ad44 100644 --- a/tools/lib/perf/cpumap.c +++ b/tools/lib/perf/cpumap.c @@ -18,9 +18,13 @@ void perf_cpu_map__set_nr(struct perf_cpu_map *map, int nr_cpus) struct perf_cpu_map *perf_cpu_map__alloc(int nr_cpus) { - RC_STRUCT(perf_cpu_map) *cpus = malloc(sizeof(*cpus) + sizeof(struct perf_cpu) * nr_cpus); + RC_STRUCT(perf_cpu_map) *cpus; struct perf_cpu_map *result; + if (nr_cpus == 0) + return NULL; + + cpus = malloc(sizeof(*cpus) + sizeof(struct perf_cpu) * nr_cpus); if (ADD_RC_CHK(result, cpus)) { cpus->nr = nr_cpus; refcount_set(&cpus->refcnt, 1); @@ -316,6 +320,19 @@ bool perf_cpu_map__has_any_cpu_or_is_empty(const struct perf_cpu_map *map) return map ? 
__perf_cpu_map__cpu(map, 0).cpu == -1 : true; } +bool perf_cpu_map__is_any_cpu_or_is_empty(const struct perf_cpu_map *map) +{ + if (!map) + return true; + + return __perf_cpu_map__nr(map) == 1 && __perf_cpu_map__cpu(map, 0).cpu == -1; +} + +bool perf_cpu_map__is_empty(const struct perf_cpu_map *map) +{ + return map == NULL; +} + int perf_cpu_map__idx(const struct perf_cpu_map *cpus, struct perf_cpu cpu) { int low, high; @@ -372,6 +389,20 @@ bool perf_cpu_map__has_any_cpu(const struct perf_cpu_map *map) return map && __perf_cpu_map__cpu(map, 0).cpu == -1; } +struct perf_cpu perf_cpu_map__min(const struct perf_cpu_map *map) +{ + struct perf_cpu cpu, result = { + .cpu = -1 + }; + int idx; + + perf_cpu_map__for_each_cpu_skip_any(cpu, idx, map) { + result = cpu; + break; + } + return result; +} + struct perf_cpu perf_cpu_map__max(const struct perf_cpu_map *map) { struct perf_cpu result = { diff --git a/tools/lib/perf/include/perf/cpumap.h b/tools/lib/perf/include/perf/cpumap.h index 228c6c629b..90457d17fb 100644 --- a/tools/lib/perf/include/perf/cpumap.h +++ b/tools/lib/perf/include/perf/cpumap.h @@ -61,6 +61,22 @@ LIBPERF_API int perf_cpu_map__nr(const struct perf_cpu_map *cpus); * perf_cpu_map__has_any_cpu_or_is_empty - is map either empty or has the "any CPU"/dummy value. */ LIBPERF_API bool perf_cpu_map__has_any_cpu_or_is_empty(const struct perf_cpu_map *map); +/** + * perf_cpu_map__is_any_cpu_or_is_empty - is map either empty or the "any CPU"/dummy value. + */ +LIBPERF_API bool perf_cpu_map__is_any_cpu_or_is_empty(const struct perf_cpu_map *map); +/** + * perf_cpu_map__is_empty - does the map contain no values and it doesn't + * contain the special "any CPU"/dummy value. + */ +LIBPERF_API bool perf_cpu_map__is_empty(const struct perf_cpu_map *map); +/** + * perf_cpu_map__min - the minimum CPU value or -1 if empty or just the "any CPU"/dummy value. 
+ */ +LIBPERF_API struct perf_cpu perf_cpu_map__min(const struct perf_cpu_map *map); +/** + * perf_cpu_map__max - the maximum CPU value or -1 if empty or just the "any CPU"/dummy value. + */ LIBPERF_API struct perf_cpu perf_cpu_map__max(const struct perf_cpu_map *map); LIBPERF_API bool perf_cpu_map__has(const struct perf_cpu_map *map, struct perf_cpu cpu); LIBPERF_API bool perf_cpu_map__equal(const struct perf_cpu_map *lhs, diff --git a/tools/lib/perf/libperf.map b/tools/lib/perf/libperf.map index 10b3f37226..2aa79b6960 100644 --- a/tools/lib/perf/libperf.map +++ b/tools/lib/perf/libperf.map @@ -10,6 +10,10 @@ LIBPERF_0.0.1 { perf_cpu_map__nr; perf_cpu_map__cpu; perf_cpu_map__has_any_cpu_or_is_empty; + perf_cpu_map__is_any_cpu_or_is_empty; + perf_cpu_map__is_empty; + perf_cpu_map__has_any_cpu; + perf_cpu_map__min; perf_cpu_map__max; perf_cpu_map__has; perf_thread_map__new_array; diff --git a/tools/lib/perf/mmap.c b/tools/lib/perf/mmap.c index 0c903c2372..c1a51d925e 100644 --- a/tools/lib/perf/mmap.c +++ b/tools/lib/perf/mmap.c @@ -279,7 +279,7 @@ union perf_event *perf_mmap__read_event(struct perf_mmap *map) if (!refcount_read(&map->refcnt)) return NULL; - /* non-overwirte doesn't pause the ringbuffer */ + /* non-overwrite doesn't pause the ringbuffer */ if (!map->overwrite) map->end = perf_mmap__read_head(map); diff --git a/tools/lib/rbtree.c b/tools/lib/rbtree.c index 727396de6b..9e7307186b 100644 --- a/tools/lib/rbtree.c +++ b/tools/lib/rbtree.c @@ -58,7 +58,7 @@ static inline void rb_set_black(struct rb_node *rb) { - rb->__rb_parent_color |= RB_BLACK; + rb->__rb_parent_color += RB_BLACK; } static inline struct rb_node *rb_red_parent(struct rb_node *red) diff --git a/tools/lib/subcmd/parse-options.c b/tools/lib/subcmd/parse-options.c index d943d78b78..4b60ec03b0 100644 --- a/tools/lib/subcmd/parse-options.c +++ b/tools/lib/subcmd/parse-options.c @@ -808,18 +808,30 @@ static int option__cmp(const void *va, const void *vb) static struct option 
*options__order(const struct option *opts) { - int nr_opts = 0, nr_group = 0, len; - const struct option *o = opts; - struct option *opt, *ordered, *group; - - for (o = opts; o->type != OPTION_END; o++) - ++nr_opts; - - len = sizeof(*o) * (nr_opts + 1); - ordered = malloc(len); - if (!ordered) - goto out; - memcpy(ordered, opts, len); + int nr_opts = 0, nr_group = 0, nr_parent = 0, len; + const struct option *o, *p = opts; + struct option *opt, *ordered = NULL, *group; + + /* flatten the options that have parents */ + for (p = opts; p != NULL; p = o->parent) { + for (o = p; o->type != OPTION_END; o++) + ++nr_opts; + + /* + * the length is given by the number of options plus a null + * terminator for the last loop iteration. + */ + len = sizeof(*o) * (nr_opts + !o->parent); + group = realloc(ordered, len); + if (!group) + goto out; + ordered = group; + memcpy(&ordered[nr_parent], p, sizeof(*o) * (nr_opts - nr_parent)); + + nr_parent = nr_opts; + } + /* copy the last OPTION_END */ + memcpy(&ordered[nr_opts], o, sizeof(*o)); /* sort each option group individually */ for (opt = group = ordered; opt->type != OPTION_END; opt++) { diff --git a/tools/lib/subcmd/run-command.c b/tools/lib/subcmd/run-command.c index d435eb4235..4e3a557a2f 100644 --- a/tools/lib/subcmd/run-command.c +++ b/tools/lib/subcmd/run-command.c @@ -165,43 +165,65 @@ int start_command(struct child_process *cmd) return 0; } -static int wait_or_whine(pid_t pid) +static int wait_or_whine(struct child_process *cmd, bool block) { - char sbuf[STRERR_BUFSIZE]; + bool finished = cmd->finished; + int result = cmd->finish_result; - for (;;) { + while (!finished) { int status, code; - pid_t waiting = waitpid(pid, &status, 0); + pid_t waiting = waitpid(cmd->pid, &status, block ? 
0 : WNOHANG); + + if (!block && waiting == 0) + break; + + if (waiting < 0 && errno == EINTR) + continue; + finished = true; if (waiting < 0) { - if (errno == EINTR) - continue; + char sbuf[STRERR_BUFSIZE]; + fprintf(stderr, " Error: waitpid failed (%s)", str_error_r(errno, sbuf, sizeof(sbuf))); - return -ERR_RUN_COMMAND_WAITPID; - } - if (waiting != pid) - return -ERR_RUN_COMMAND_WAITPID_WRONG_PID; - if (WIFSIGNALED(status)) - return -ERR_RUN_COMMAND_WAITPID_SIGNAL; - - if (!WIFEXITED(status)) - return -ERR_RUN_COMMAND_WAITPID_NOEXIT; - code = WEXITSTATUS(status); - switch (code) { - case 127: - return -ERR_RUN_COMMAND_EXEC; - case 0: - return 0; - default: - return -code; + result = -ERR_RUN_COMMAND_WAITPID; + } else if (waiting != cmd->pid) { + result = -ERR_RUN_COMMAND_WAITPID_WRONG_PID; + } else if (WIFSIGNALED(status)) { + result = -ERR_RUN_COMMAND_WAITPID_SIGNAL; + } else if (!WIFEXITED(status)) { + result = -ERR_RUN_COMMAND_WAITPID_NOEXIT; + } else { + code = WEXITSTATUS(status); + switch (code) { + case 127: + result = -ERR_RUN_COMMAND_EXEC; + break; + case 0: + result = 0; + break; + default: + result = -code; + break; + } } } + if (finished) { + cmd->finished = 1; + cmd->finish_result = result; + } + return result; +} + +int check_if_command_finished(struct child_process *cmd) +{ + wait_or_whine(cmd, /*block=*/false); + return cmd->finished; } int finish_command(struct child_process *cmd) { - return wait_or_whine(cmd->pid); + return wait_or_whine(cmd, /*block=*/true); } int run_command(struct child_process *cmd) diff --git a/tools/lib/subcmd/run-command.h b/tools/lib/subcmd/run-command.h index d794138a79..b2d39de6e6 100644 --- a/tools/lib/subcmd/run-command.h +++ b/tools/lib/subcmd/run-command.h @@ -41,17 +41,20 @@ struct child_process { int err; const char *dir; const char *const *env; + int finish_result; unsigned no_stdin:1; unsigned no_stdout:1; unsigned no_stderr:1; unsigned exec_cmd:1; /* if this is to be external sub-command */ unsigned 
stdout_to_stderr:1; + unsigned finished:1; void (*preexec_cb)(void); /* If set, call function in child rather than doing an exec. */ int (*no_exec_cmd)(struct child_process *process); }; int start_command(struct child_process *); +int check_if_command_finished(struct child_process *); int finish_command(struct child_process *); int run_command(struct child_process *); diff --git a/tools/memory-model/lock.cat b/tools/memory-model/lock.cat index 53b5a49273..21ba650869 100644 --- a/tools/memory-model/lock.cat +++ b/tools/memory-model/lock.cat @@ -102,19 +102,19 @@ let rf-lf = rfe-lf | rfi-lf * within one of the lock's critical sections returns False. *) -(* rfi for RU events: an RU may read from the last po-previous UL *) -let rfi-ru = ([UL] ; po-loc ; [RU]) \ ([UL] ; po-loc ; [LKW] ; po-loc) - -(* rfe for RU events: an RU may read from an external UL or the initial write *) -let all-possible-rfe-ru = - let possible-rfe-ru r = +(* + * rf for RU events: an RU may read from an external UL or the initial write, + * or from the last po-previous UL + *) +let all-possible-rf-ru = + let possible-rf-ru r = let pair-to-relation p = p ++ 0 - in map pair-to-relation (((UL | IW) * {r}) & loc & ext) - in map possible-rfe-ru RU + in map pair-to-relation ((((UL | IW) * {r}) & loc & ext) | + (((UL * {r}) & po-loc) \ ([UL] ; po-loc ; [LKW] ; po-loc))) + in map possible-rf-ru RU (* Generate all rf relations for RU events *) -with rfe-ru from cross(all-possible-rfe-ru) -let rf-ru = rfe-ru | rfi-ru +with rf-ru from cross(all-possible-rf-ru) (* Final rf relation *) let rf = rf | rf-lf | rf-ru diff --git a/tools/net/ynl/cli.py b/tools/net/ynl/cli.py index f131e33ac3..b8481f4013 100755 --- a/tools/net/ynl/cli.py +++ b/tools/net/ynl/cli.py @@ -19,13 +19,30 @@ class YnlEncoder(json.JSONEncoder): def main(): - parser = argparse.ArgumentParser(description='YNL CLI sample') + description = """ + YNL CLI utility - a general purpose netlink utility that uses YAML + specs to drive protocol encoding 
and decoding. + """ + epilog = """ + The --multi option can be repeated to include several do operations + in the same netlink payload. + """ + + parser = argparse.ArgumentParser(description=description, + epilog=epilog) parser.add_argument('--spec', dest='spec', type=str, required=True) parser.add_argument('--schema', dest='schema', type=str) parser.add_argument('--no-schema', action='store_true') parser.add_argument('--json', dest='json_text', type=str) - parser.add_argument('--do', dest='do', type=str) - parser.add_argument('--dump', dest='dump', type=str) + + group = parser.add_mutually_exclusive_group() + group.add_argument('--do', dest='do', metavar='DO-OPERATION', type=str) + group.add_argument('--multi', dest='multi', nargs=2, action='append', + metavar=('DO-OPERATION', 'JSON_TEXT'), type=str) + group.add_argument('--dump', dest='dump', metavar='DUMP-OPERATION', type=str) + group.add_argument('--list-ops', action='store_true') + group.add_argument('--list-msgs', action='store_true') + parser.add_argument('--sleep', dest='sleep', type=int) parser.add_argument('--subscribe', dest='ntf', type=str) parser.add_argument('--replace', dest='flags', action='append_const', @@ -66,6 +83,13 @@ def main(): if args.sleep: time.sleep(args.sleep) + if args.list_ops: + for op_name, op in ynl.ops.items(): + print(op_name, " [", ", ".join(op.modes), "]") + if args.list_msgs: + for op_name, op in ynl.msgs.items(): + print(op_name, " [", ", ".join(op.modes), "]") + try: if args.do: reply = ynl.do(args.do, attrs, args.flags) @@ -73,6 +97,10 @@ def main(): if args.dump: reply = ynl.dump(args.dump, attrs) output(reply) + if args.multi: + ops = [ (item[0], json.loads(item[1]), args.flags or []) for item in args.multi ] + reply = ynl.do_multi(ops) + output(reply) except NlError as e: print(e) exit(1) diff --git a/tools/net/ynl/ethtool.py b/tools/net/ynl/ethtool.py index 6c9f7e3125..63c471f075 100755 --- a/tools/net/ynl/ethtool.py +++ b/tools/net/ynl/ethtool.py @@ -6,6 +6,7 @@ import 
json import pprint import sys import re +import os from lib import YnlFamily @@ -152,8 +153,11 @@ def main(): global args args = parser.parse_args() - spec = '../../../Documentation/netlink/specs/ethtool.yaml' - schema = '../../../Documentation/netlink/genetlink-legacy.yaml' + script_abs_dir = os.path.dirname(os.path.abspath(sys.argv[0])) + spec = os.path.join(script_abs_dir, + '../../../Documentation/netlink/specs/ethtool.yaml') + schema = os.path.join(script_abs_dir, + '../../../Documentation/netlink/genetlink-legacy.yaml') ynl = YnlFamily(spec, schema) @@ -320,7 +324,13 @@ def main(): return if args.show_time_stamping: - tsinfo = dumpit(ynl, args, 'tsinfo-get') + req = { + 'header': { + 'flags': 'stats', + }, + } + + tsinfo = dumpit(ynl, args, 'tsinfo-get', req) print(f'Time stamping parameters for {args.device}:') @@ -334,6 +344,9 @@ def main(): print('Hardware Receive Filter Modes:') [print(f'\t{v}') for v in bits_to_dict(tsinfo['rx-filters'])] + + print('Statistics:') + [print(f'\t{k}: {v}') for k, v in tsinfo['stats'].items()] return print(f'Settings for {args.device}:') diff --git a/tools/net/ynl/lib/nlspec.py b/tools/net/ynl/lib/nlspec.py index 6d08ab9e21..b6d6f8aef4 100644 --- a/tools/net/ynl/lib/nlspec.py +++ b/tools/net/ynl/lib/nlspec.py @@ -335,6 +335,7 @@ class SpecOperation(SpecElement): req_value numerical ID when serialized, user -> kernel rsp_value numerical ID when serialized, user <- kernel + modes supported operation modes (do, dump, event etc.) 
is_call bool, whether the operation is a call is_async bool, whether the operation is a notification is_resv bool, whether the operation does not exist (it's just a reserved ID) @@ -350,6 +351,7 @@ class SpecOperation(SpecElement): self.req_value = req_value self.rsp_value = rsp_value + self.modes = yaml.keys() & {'do', 'dump', 'event', 'notify'} self.is_call = 'do' in yaml or 'dump' in yaml self.is_async = 'notify' in yaml or 'event' in yaml self.is_resv = not self.is_async and not self.is_call diff --git a/tools/net/ynl/lib/ynl.h b/tools/net/ynl/lib/ynl.h index 9842e85a8c..eef7c6324e 100644 --- a/tools/net/ynl/lib/ynl.h +++ b/tools/net/ynl/lib/ynl.h @@ -91,6 +91,18 @@ void ynl_sock_destroy(struct ynl_sock *ys); !ynl_dump_obj_is_last(iter); \ iter = ynl_dump_obj_next(iter)) +/** + * ynl_dump_empty() - does the dump have no entries + * @dump: pointer to the dump list, as returned by a dump call + * + * Check if the dump is empty, i.e. contains no objects. + * Dump calls return NULL on error, and terminator element if empty. 
+ */ +static inline bool ynl_dump_empty(void *dump) +{ + return dump == (void *)YNL_LIST_END; +} + int ynl_subscribe(struct ynl_sock *ys, const char *grp_name); int ynl_socket_get_fd(struct ynl_sock *ys); int ynl_ntf_check(struct ynl_sock *ys); diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py index 25810e18b0..35e6669281 100644 --- a/tools/net/ynl/lib/ynl.py +++ b/tools/net/ynl/lib/ynl.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause from collections import namedtuple +from enum import Enum import functools import os import random @@ -76,13 +77,33 @@ class Netlink: NLMSGERR_ATTR_MISS_TYPE = 5 NLMSGERR_ATTR_MISS_NEST = 6 + # Policy types + NL_POLICY_TYPE_ATTR_TYPE = 1 + NL_POLICY_TYPE_ATTR_MIN_VALUE_S = 2 + NL_POLICY_TYPE_ATTR_MAX_VALUE_S = 3 + NL_POLICY_TYPE_ATTR_MIN_VALUE_U = 4 + NL_POLICY_TYPE_ATTR_MAX_VALUE_U = 5 + NL_POLICY_TYPE_ATTR_MIN_LENGTH = 6 + NL_POLICY_TYPE_ATTR_MAX_LENGTH = 7 + NL_POLICY_TYPE_ATTR_POLICY_IDX = 8 + NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE = 9 + NL_POLICY_TYPE_ATTR_BITFIELD32_MASK = 10 + NL_POLICY_TYPE_ATTR_PAD = 11 + NL_POLICY_TYPE_ATTR_MASK = 12 + + AttrType = Enum('AttrType', ['flag', 'u8', 'u16', 'u32', 'u64', + 's8', 's16', 's32', 's64', + 'binary', 'string', 'nul-string', + 'nested', 'nested-array', + 'bitfield32', 'sint', 'uint']) class NlError(Exception): def __init__(self, nl_msg): self.nl_msg = nl_msg + self.error = -nl_msg.error def __str__(self): - return f"Netlink error: {os.strerror(-self.nl_msg.error)}\n{self.nl_msg}" + return f"Netlink error: {os.strerror(self.error)}\n{self.nl_msg}" class ConfigError(Exception): @@ -199,6 +220,8 @@ class NlMsg: self.extack['miss-nest'] = extack.as_scalar('u32') elif extack.type == Netlink.NLMSGERR_ATTR_OFFS: self.extack['bad-attr-offs'] = extack.as_scalar('u32') + elif extack.type == Netlink.NLMSGERR_ATTR_POLICY: + self.extack['policy'] = self._decode_policy(extack.raw) else: if 'unknown' not in self.extack: self.extack['unknown'] = [] @@ -210,10 +233,33 @@ 
class NlMsg: miss_type = self.extack['miss-type'] if miss_type in attr_space.attrs_by_val: spec = attr_space.attrs_by_val[miss_type] - desc = spec['name'] + self.extack['miss-type'] = spec['name'] if 'doc' in spec: - desc += f" ({spec['doc']})" - self.extack['miss-type'] = desc + self.extack['miss-type-doc'] = spec['doc'] + + def _decode_policy(self, raw): + policy = {} + for attr in NlAttrs(raw): + if attr.type == Netlink.NL_POLICY_TYPE_ATTR_TYPE: + type = attr.as_scalar('u32') + policy['type'] = Netlink.AttrType(type).name + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MIN_VALUE_S: + policy['min-value'] = attr.as_scalar('s64') + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MAX_VALUE_S: + policy['max-value'] = attr.as_scalar('s64') + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MIN_VALUE_U: + policy['min-value'] = attr.as_scalar('u64') + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MAX_VALUE_U: + policy['max-value'] = attr.as_scalar('u64') + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MIN_LENGTH: + policy['min-length'] = attr.as_scalar('u32') + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MAX_LENGTH: + policy['max-length'] = attr.as_scalar('u32') + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_BITFIELD32_MASK: + policy['bitfield32-mask'] = attr.as_scalar('u32') + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MASK: + policy['mask'] = attr.as_scalar('u64') + return policy def cmd(self): return self.nl_type @@ -340,12 +386,9 @@ class NetlinkProtocol: def _decode(self, nl_msg): return nl_msg - def decode(self, ynl, nl_msg): + def decode(self, ynl, nl_msg, op): msg = self._decode(nl_msg) - fixed_header_size = 0 - if ynl: - op = ynl.rsp_by_value[msg.cmd()] - fixed_header_size = ynl._struct_size(op.fixed_header) + fixed_header_size = ynl._struct_size(op.fixed_header) msg.raw_attrs = NlAttrs(msg.raw, fixed_header_size) return msg @@ -585,15 +628,28 @@ class YnlFamily(SpecFamily): decoded = self._formatted_string(decoded, attr_spec.display_hint) return decoded - def 
_decode_array_nest(self, attr, attr_spec): + def _decode_array_attr(self, attr, attr_spec): decoded = [] offset = 0 while offset < len(attr.raw): item = NlAttr(attr.raw, offset) offset += item.full_len - subattrs = self._decode(NlAttrs(item.raw), attr_spec['nested-attributes']) - decoded.append({ item.type: subattrs }) + if attr_spec["sub-type"] == 'nest': + subattrs = self._decode(NlAttrs(item.raw), attr_spec['nested-attributes']) + decoded.append({ item.type: subattrs }) + elif attr_spec["sub-type"] == 'binary': + subattrs = item.as_bin() + if attr_spec.display_hint: + subattrs = self._formatted_string(subattrs, attr_spec.display_hint) + decoded.append(subattrs) + elif attr_spec["sub-type"] in NlAttr.type_formats: + subattrs = item.as_scalar(attr_spec['sub-type'], attr_spec.byte_order) + if attr_spec.display_hint: + subattrs = self._formatted_string(subattrs, attr_spec.display_hint) + decoded.append(subattrs) + else: + raise Exception(f'Unknown {attr_spec["sub-type"]} with name {attr_spec["name"]}') return decoded def _decode_nest_type_value(self, attr, attr_spec): @@ -687,8 +743,8 @@ class YnlFamily(SpecFamily): decoded = attr.as_scalar(attr_spec['type'], attr_spec.byte_order) if 'enum' in attr_spec: decoded = self._decode_enum(decoded, attr_spec) - elif attr_spec["type"] == 'array-nest': - decoded = self._decode_array_nest(attr, attr_spec) + elif attr_spec["type"] == 'indexed-array': + decoded = self._decode_array_attr(attr, attr_spec) elif attr_spec["type"] == 'bitfield32': value, selector = struct.unpack("II", attr.raw) if 'enum' in attr_spec: @@ -738,7 +794,7 @@ class YnlFamily(SpecFamily): if 'bad-attr-offs' not in extack: return - msg = self.nlproto.decode(self, NlMsg(request, 0, op.attr_set)) + msg = self.nlproto.decode(self, NlMsg(request, 0, op.attr_set), op) offset = self.nlproto.msghdr_size() + self._struct_size(op.fixed_header) path = self._decode_extack_path(msg.raw_attrs, op.attr_set, offset, extack['bad-attr-offs']) @@ -820,7 +876,10 @@ class 
YnlFamily(SpecFamily): if display_hint == 'mac': formatted = ':'.join('%02x' % b for b in raw) elif display_hint == 'hex': - formatted = bytes.hex(raw, ' ') + if isinstance(raw, int): + formatted = hex(raw) + else: + formatted = bytes.hex(raw, ' ') elif display_hint in [ 'ipv4', 'ipv6' ]: formatted = format(ipaddress.ip_address(raw)) elif display_hint == 'uuid': @@ -860,7 +919,8 @@ class YnlFamily(SpecFamily): print("Netlink done while checking for ntf!?") continue - decoded = self.nlproto.decode(self, nl_msg) + op = self.rsp_by_value[nl_msg.cmd()] + decoded = self.nlproto.decode(self, nl_msg, op) if decoded.cmd() not in self.async_msg_ids: print("Unexpected msg id done while checking for ntf", decoded) continue @@ -878,16 +938,11 @@ class YnlFamily(SpecFamily): return op['do']['request']['attributes'].copy() - def _op(self, method, vals, flags=None, dump=False): - op = self.ops[method] - + def _encode_message(self, op, vals, flags, req_seq): nl_flags = Netlink.NLM_F_REQUEST | Netlink.NLM_F_ACK for flag in flags or []: nl_flags |= flag - if dump: - nl_flags |= Netlink.NLM_F_DUMP - req_seq = random.randint(1024, 65535) msg = self.nlproto.message(nl_flags, op.req_value, 1, req_seq) if op.fixed_header: msg += self._encode_struct(op.fixed_header, vals) @@ -895,18 +950,36 @@ class YnlFamily(SpecFamily): for name, value in vals.items(): msg += self._add_attr(op.attr_set.name, name, value, search_attrs) msg = _genl_msg_finalize(msg) + return msg - self.sock.send(msg, 0) + def _ops(self, ops): + reqs_by_seq = {} + req_seq = random.randint(1024, 65535) + payload = b'' + for (method, vals, flags) in ops: + op = self.ops[method] + msg = self._encode_message(op, vals, flags, req_seq) + reqs_by_seq[req_seq] = (op, msg, flags) + payload += msg + req_seq += 1 + + self.sock.send(payload, 0) done = False rsp = [] + op_rsp = [] while not done: reply = self.sock.recv(self._recv_size) nms = NlMsgs(reply, attr_space=op.attr_set) self._recv_dbg_print(reply, nms) for nl_msg in nms: - if 
nl_msg.extack: - self._decode_extack(msg, op, nl_msg.extack) + if nl_msg.nl_seq in reqs_by_seq: + (op, req_msg, req_flags) = reqs_by_seq[nl_msg.nl_seq] + if nl_msg.extack: + self._decode_extack(req_msg, op, nl_msg.extack) + else: + op = self.rsp_by_value[nl_msg.cmd()] + req_flags = [] if nl_msg.error: raise NlError(nl_msg) @@ -914,13 +987,25 @@ class YnlFamily(SpecFamily): if nl_msg.extack: print("Netlink warning:") print(nl_msg) - done = True + + if Netlink.NLM_F_DUMP in req_flags: + rsp.append(op_rsp) + elif not op_rsp: + rsp.append(None) + elif len(op_rsp) == 1: + rsp.append(op_rsp[0]) + else: + rsp.append(op_rsp) + op_rsp = [] + + del reqs_by_seq[nl_msg.nl_seq] + done = len(reqs_by_seq) == 0 break - decoded = self.nlproto.decode(self, nl_msg) + decoded = self.nlproto.decode(self, nl_msg, op) # Check if this is a reply to our request - if nl_msg.nl_seq != req_seq or decoded.cmd() != op.rsp_value: + if nl_msg.nl_seq not in reqs_by_seq or decoded.cmd() != op.rsp_value: if decoded.cmd() in self.async_msg_ids: self.handle_ntf(decoded) continue @@ -931,16 +1016,23 @@ class YnlFamily(SpecFamily): rsp_msg = self._decode(decoded.raw_attrs, op.attr_set.name) if op.fixed_header: rsp_msg.update(self._decode_struct(decoded.raw, op.fixed_header)) - rsp.append(rsp_msg) + op_rsp.append(rsp_msg) - if not rsp: - return None - if not dump and len(rsp) == 1: - return rsp[0] return rsp + def _op(self, method, vals, flags=None, dump=False): + req_flags = flags or [] + if dump: + req_flags.append(Netlink.NLM_F_DUMP) + + ops = [(method, vals, req_flags)] + return self._ops(ops)[0] + def do(self, method, vals, flags=None): return self._op(method, vals, flags) def dump(self, method, vals): - return self._op(method, vals, [], dump=True) + return self._op(method, vals, dump=True) + + def do_multi(self, ops): + return self._ops(ops) diff --git a/tools/net/ynl/samples/netdev.c b/tools/net/ynl/samples/netdev.c index 591b90e218..3e7b29bd55 100644 --- a/tools/net/ynl/samples/netdev.c +++ 
b/tools/net/ynl/samples/netdev.c @@ -100,6 +100,8 @@ int main(int argc, char **argv) if (!devs) goto err_close; + if (ynl_dump_empty(devs)) + fprintf(stderr, "Error: no devices reported\n"); ynl_dump_foreach(devs, d) netdev_print_device(d, 0); netdev_dev_get_list_free(devs); diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index a451cbfbd7..a42d62b23e 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -413,7 +413,7 @@ class TypeString(Type): def _attr_policy(self, policy): if 'exact-len' in self.checks: - mem = 'NLA_POLICY_EXACT_LEN(' + str(self.checks['exact-len']) + ')' + mem = 'NLA_POLICY_EXACT_LEN(' + str(self.get_limit('exact-len')) + ')' else: mem = '{ .type = ' + policy if 'max-len' in self.checks: @@ -465,7 +465,7 @@ class TypeBinary(Type): def _attr_policy(self, policy): if 'exact-len' in self.checks: - mem = 'NLA_POLICY_EXACT_LEN(' + str(self.checks['exact-len']) + ')' + mem = 'NLA_POLICY_EXACT_LEN(' + str(self.get_limit('exact-len')) + ')' else: mem = '{ ' if len(self.checks) == 1 and 'min-len' in self.checks: @@ -841,8 +841,11 @@ class AttrSet(SpecAttrSet): t = TypeBitfield32(self.family, self, elem, value) elif elem['type'] == 'nest': t = TypeNest(self.family, self, elem, value) - elif elem['type'] == 'array-nest': - t = TypeArrayNest(self.family, self, elem, value) + elif elem['type'] == 'indexed-array' and 'sub-type' in elem: + if elem["sub-type"] == 'nest': + t = TypeArrayNest(self.family, self, elem, value) + else: + raise Exception(f'new_attr: unsupported sub-type {elem["sub-type"]}') elif elem['type'] == 'nest-type-value': t = TypeNestTypeValue(self.family, self, elem, value) else: @@ -1055,7 +1058,7 @@ class Family(SpecFamily): if nested in self.root_sets: raise Exception("Inheriting members to a space used as root not supported") inherit.update(set(spec['type-value'])) - elif spec['type'] == 'array-nest': + elif spec['type'] == 'indexed-array': inherit.add('idx') 
self.pure_nested_structs[nested].set_inherited(inherit) @@ -1619,9 +1622,12 @@ def _multi_parse(ri, struct, init_lines, local_vars): multi_attrs = set() needs_parg = False for arg, aspec in struct.member_list(): - if aspec['type'] == 'array-nest': - local_vars.append(f'const struct nlattr *attr_{aspec.c_name};') - array_nests.add(arg) + if aspec['type'] == 'indexed-array' and 'sub-type' in aspec: + if aspec["sub-type"] == 'nest': + local_vars.append(f'const struct nlattr *attr_{aspec.c_name};') + array_nests.add(arg) + else: + raise Exception(f'Not supported sub-type {aspec["sub-type"]}') if 'multi-attr' in aspec: multi_attrs.add(arg) needs_parg |= 'nested-attributes' in aspec diff --git a/tools/net/ynl/ynl-gen-rst.py b/tools/net/ynl/ynl-gen-rst.py index 927407b3ef..657e881d2e 100755 --- a/tools/net/ynl/ynl-gen-rst.py +++ b/tools/net/ynl/ynl-gen-rst.py @@ -82,9 +82,9 @@ def rst_subsubsection(title: str) -> str: return f"{title}\n" + "~" * len(title) -def rst_section(title: str) -> str: +def rst_section(namespace: str, prefix: str, title: str) -> str: """Add a section to the document""" - return f"\n{title}\n" + "=" * len(title) + return f".. 
_{namespace}-{prefix}-{title}:\n\n{title}\n" + "=" * len(title) def rst_subtitle(title: str) -> str: @@ -102,6 +102,17 @@ def rst_list_inline(list_: List[str], level: int = 0) -> str: return headroom(level) + "[" + ", ".join(inline(i) for i in list_) + "]" +def rst_ref(namespace: str, prefix: str, name: str) -> str: + """Add a hyperlink to the document""" + mappings = {'enum': 'definition', + 'fixed-header': 'definition', + 'nested-attributes': 'attribute-set', + 'struct': 'definition'} + if prefix in mappings: + prefix = mappings[prefix] + return f":ref:`{namespace}-{prefix}-{name}`" + + def rst_header() -> str: """The headers for all the auto generated RST files""" lines = [] @@ -159,20 +170,24 @@ def parse_do_attributes(attrs: Dict[str, Any], level: int = 0) -> str: return "\n".join(lines) -def parse_operations(operations: List[Dict[str, Any]]) -> str: +def parse_operations(operations: List[Dict[str, Any]], namespace: str) -> str: """Parse operations block""" preprocessed = ["name", "doc", "title", "do", "dump"] + linkable = ["fixed-header", "attribute-set"] lines = [] for operation in operations: - lines.append(rst_section(operation["name"])) + lines.append(rst_section(namespace, 'operation', operation["name"])) lines.append(rst_paragraph(sanitize(operation["doc"])) + "\n") for key in operation.keys(): if key in preprocessed: # Skip the special fields continue - lines.append(rst_fields(key, operation[key], 0)) + value = operation[key] + if key in linkable: + value = rst_ref(namespace, key, value) + lines.append(rst_fields(key, value, 0)) if "do" in operation: lines.append(rst_paragraph(":do:", 0)) @@ -212,14 +227,14 @@ def parse_entries(entries: List[Dict[str, Any]], level: int) -> str: return "\n".join(lines) -def parse_definitions(defs: Dict[str, Any]) -> str: +def parse_definitions(defs: Dict[str, Any], namespace: str) -> str: """Parse definitions section""" preprocessed = ["name", "entries", "members"] ignored = ["render-max"] # This is not printed lines = 
[] for definition in defs: - lines.append(rst_section(definition["name"])) + lines.append(rst_section(namespace, 'definition', definition["name"])) for k in definition.keys(): if k in preprocessed + ignored: continue @@ -237,14 +252,15 @@ def parse_definitions(defs: Dict[str, Any]) -> str: return "\n".join(lines) -def parse_attr_sets(entries: List[Dict[str, Any]]) -> str: +def parse_attr_sets(entries: List[Dict[str, Any]], namespace: str) -> str: """Parse attribute from attribute-set""" preprocessed = ["name", "type"] + linkable = ["enum", "nested-attributes", "struct", "sub-message"] ignored = ["checks"] lines = [] for entry in entries: - lines.append(rst_section(entry["name"])) + lines.append(rst_section(namespace, 'attribute-set', entry["name"])) for attr in entry["attributes"]: type_ = attr.get("type") attr_line = attr["name"] @@ -257,25 +273,31 @@ def parse_attr_sets(entries: List[Dict[str, Any]]) -> str: for k in attr.keys(): if k in preprocessed + ignored: continue - lines.append(rst_fields(k, sanitize(attr[k]), 0)) + if k in linkable: + value = rst_ref(namespace, k, attr[k]) + else: + value = sanitize(attr[k]) + lines.append(rst_fields(k, value, 0)) lines.append("\n") return "\n".join(lines) -def parse_sub_messages(entries: List[Dict[str, Any]]) -> str: +def parse_sub_messages(entries: List[Dict[str, Any]], namespace: str) -> str: """Parse sub-message definitions""" lines = [] for entry in entries: - lines.append(rst_section(entry["name"])) + lines.append(rst_section(namespace, 'sub-message', entry["name"])) for fmt in entry["formats"]: value = fmt["value"] lines.append(rst_bullet(bold(value))) for attr in ['fixed-header', 'attribute-set']: if attr in fmt: - lines.append(rst_fields(attr, fmt[attr], 1)) + lines.append(rst_fields(attr, + rst_ref(namespace, attr, fmt[attr]), + 1)) lines.append("\n") return "\n".join(lines) @@ -289,9 +311,11 @@ def parse_yaml(obj: Dict[str, Any]) -> str: lines.append(rst_header()) - title = f"Family ``{obj['name']}`` netlink 
specification" + family = obj['name'] + + title = f"Family ``{family}`` netlink specification" lines.append(rst_title(title)) - lines.append(rst_paragraph(".. contents::\n")) + lines.append(rst_paragraph(".. contents:: :depth: 3\n")) if "doc" in obj: lines.append(rst_subtitle("Summary")) @@ -300,7 +324,7 @@ def parse_yaml(obj: Dict[str, Any]) -> str: # Operations if "operations" in obj: lines.append(rst_subtitle("Operations")) - lines.append(parse_operations(obj["operations"]["list"])) + lines.append(parse_operations(obj["operations"]["list"], family)) # Multicast groups if "mcast-groups" in obj: @@ -310,17 +334,17 @@ def parse_yaml(obj: Dict[str, Any]) -> str: # Definitions if "definitions" in obj: lines.append(rst_subtitle("Definitions")) - lines.append(parse_definitions(obj["definitions"])) + lines.append(parse_definitions(obj["definitions"], family)) # Attributes set if "attribute-sets" in obj: lines.append(rst_subtitle("Attribute sets")) - lines.append(parse_attr_sets(obj["attribute-sets"])) + lines.append(parse_attr_sets(obj["attribute-sets"], family)) # Sub-messages if "sub-messages" in obj: lines.append(rst_subtitle("Sub-messages")) - lines.append(parse_sub_messages(obj["sub-messages"])) + lines.append(parse_sub_messages(obj["sub-messages"], family)) return "\n".join(lines) diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h index 7ebf29c911..1e8141ef1b 100644 --- a/tools/objtool/noreturns.h +++ b/tools/objtool/noreturns.h @@ -7,12 +7,16 @@ * Yes, this is unfortunate. A better solution is in the works. 
*/ NORETURN(__fortify_panic) +NORETURN(__ia32_sys_exit) +NORETURN(__ia32_sys_exit_group) NORETURN(__kunit_abort) NORETURN(__module_put_and_kthread_exit) NORETURN(__reiserfs_panic) NORETURN(__stack_chk_fail) NORETURN(__tdx_hypercall_failed) NORETURN(__ubsan_handle_builtin_unreachable) +NORETURN(__x64_sys_exit) +NORETURN(__x64_sys_exit_group) NORETURN(arch_cpu_idle_dead) NORETURN(bch2_trans_in_restart_error) NORETURN(bch2_trans_restart_error) diff --git a/tools/perf/Build b/tools/perf/Build index aa76236228..b0cb7ad8e6 100644 --- a/tools/perf/Build +++ b/tools/perf/Build @@ -59,3 +59,17 @@ perf-y += ui/ perf-y += scripts/ gtk-y += ui/gtk/ + +ifdef SHELLCHECK + SHELL_TESTS := $(wildcard *.sh) + TEST_LOGS := $(SHELL_TESTS:%=%.shellcheck_log) +else + SHELL_TESTS := + TEST_LOGS := +endif + +$(OUTPUT)%.shellcheck_log: % + $(call rule_mkdir) + $(Q)$(call echo-cmd,test)shellcheck -s bash -a -S warning "$<" > $@ || (cat $@ && rm $@ && false) + +perf-y += $(TEST_LOGS) diff --git a/tools/perf/Documentation/perf-arm-spe.txt b/tools/perf/Documentation/perf-arm-spe.txt index bf03222e9a..0a3eda4823 100644 --- a/tools/perf/Documentation/perf-arm-spe.txt +++ b/tools/perf/Documentation/perf-arm-spe.txt @@ -116,6 +116,15 @@ Depending on CPU model, the kernel may need to be booted with page table isolati (kpti=off). If KPTI needs to be disabled, this will fail with a console message "profiling buffer inaccessible. Try passing 'kpti=off' on the kernel command line". +For the full criteria that determine whether KPTI needs to be forced off or not, see function +unmap_kernel_at_el0() in the kernel sources. Common cases where it's not required +are on the CPUs in kpti_safe_list, or on Arm v8.5+ where FEAT_E0PD is mandatory. + +The SPE interrupt must also be described by the firmware. 
If the module is loaded and KPTI is +disabled (or isn't required to be disabled) but the SPE PMU still doesn't show in +/sys/bus/event_source/devices/, then it's possible that the SPE interrupt isn't described by +ACPI or DT. In this case no warning will be printed by the driver. + Capturing SPE with perf command-line tools ------------------------------------------ @@ -199,7 +208,8 @@ Common errors - "Cannot find PMU `arm_spe'. Missing kernel support?" - Module not built or loaded, KPTI not disabled (see above), or running on a VM + Module not built or loaded, KPTI not disabled, interrupt not described by firmware, + or running on a VM. See 'Kernel Requirements' above. - "Arm SPE CONTEXT packets not found in the traces." diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index d8b863e01f..d2b1593ef7 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -121,6 +121,9 @@ OPTIONS - type: Data type of sample memory access. - typeoff: Offset in the data type of sample memory access. - symoff: Offset in the symbol. + - weight1: Average value of event specific weight (1st field of weight_struct). + - weight2: Average value of event specific weight (2nd field of weight_struct). + - weight3: Average value of event specific weight (3rd field of weight_struct). By default, comm, dso and symbol keys are used. (i.e. --sort comm,dso,symbol) @@ -198,7 +201,11 @@ OPTIONS --fields=:: Specify output field - multiple keys can be specified in CSV format. Following fields are available: - overhead, overhead_sys, overhead_us, overhead_children, sample and period. + overhead, overhead_sys, overhead_us, overhead_children, sample, period, + weight1, weight2, weight3, ins_lat, p_stage_cyc and retire_lat. The + last 3 names are alias for the corresponding weights. When the weight + fields are used, they will show the average value of the weight. + Also it can contain any sort key(s). 
By default, every sort keys not specified in -F will be appended diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt index 5fbe42bd59..a216d2991b 100644 --- a/tools/perf/Documentation/perf-sched.txt +++ b/tools/perf/Documentation/perf-sched.txt @@ -20,6 +20,26 @@ There are several variants of 'perf sched': 'perf sched latency' to report the per task scheduling latencies and other scheduling properties of the workload. + Example usage: + perf sched record -- sleep 1 + perf sched latency + + ------------------------------------------------------------------------------------------------------------------------------------------- + Task | Runtime ms | Count | Avg delay ms | Max delay ms | Max delay start | Max delay end | + ------------------------------------------------------------------------------------------------------------------------------------------- + perf:(2) | 2.804 ms | 66 | avg: 0.524 ms | max: 1.069 ms | max start: 254752.314960 s | max end: 254752.316029 s + NetworkManager:1343 | 0.372 ms | 13 | avg: 0.008 ms | max: 0.013 ms | max start: 254751.551153 s | max end: 254751.551166 s + kworker/1:2-xfs:4649 | 0.012 ms | 1 | avg: 0.008 ms | max: 0.008 ms | max start: 254751.519807 s | max end: 254751.519815 s + kworker/3:1-xfs:388 | 0.011 ms | 1 | avg: 0.006 ms | max: 0.006 ms | max start: 254751.519809 s | max end: 254751.519815 s + sleep:147736 | 0.938 ms | 3 | avg: 0.006 ms | max: 0.007 ms | max start: 254751.313817 s | max end: 254751.313824 s + + It shows Runtime(time that a task spent actually running on the CPU), + Count(number of times a delay was calculated) and delay(time that a + task was ready to run but was kept waiting). + + Tasks with the same command name are merged and the merge count is + given within (), However if -p option is used, pid is mentioned. + 'perf sched script' to see a detailed trace of the workload that was recorded (aliased to 'perf script' for now). 
@@ -78,6 +98,22 @@ OPTIONS --force:: Don't complain, do it. +OPTIONS for 'perf sched latency' +------------------------------- + +-C:: +--CPU :: + CPU to profile on. + +-p:: +--pids:: + latency stats per pid instead of per command name. + +-s:: +--sort :: + sort by key(s): runtime, switch, avg, max + by default it's sorted by "avg ,max ,switch ,runtime". + OPTIONS for 'perf sched map' ---------------------------- diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 005e51df85..ff086ef05a 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -132,9 +132,9 @@ OPTIONS Comma separated list of fields to print. Options are: comm, tid, pid, time, cpu, event, trace, ip, sym, dso, dsoff, addr, symoff, srcline, period, iregs, uregs, brstack, brstacksym, flags, bpf-output, - brstackinsn, brstackinsnlen, brstackoff, callindent, insn, disasm, + brstackinsn, brstackinsnlen, brstackdisasm, brstackoff, callindent, insn, disasm, insnlen, synth, phys_addr, metric, misc, srccode, ipc, data_page_size, - code_page_size, ins_lat, machine_pid, vcpu, cgroup, retire_lat. + code_page_size, ins_lat, machine_pid, vcpu, cgroup, retire_lat, Field list can be prepended with the type, trace, sw or hw, to indicate to which event type the field list applies. @@ -257,6 +257,9 @@ OPTIONS can’t know the next sequential instruction after an unconditional branch unless you calculate that based on its length. + brstackdisasm acts like brstackinsn, but will print disassembled instructions if + perf is built with the capstone library. + The brstackoff field will print an offset into a specific dso/binary. 
With the metric option perf script can compute metrics for diff --git a/tools/perf/Documentation/perf-test.txt b/tools/perf/Documentation/perf-test.txt index 951a2f2628..9acb8d1f65 100644 --- a/tools/perf/Documentation/perf-test.txt +++ b/tools/perf/Documentation/perf-test.txt @@ -31,9 +31,20 @@ OPTIONS --verbose:: Be more verbose. +-S:: +--sequential:: + Run tests one after the other, this is the default mode. + +-p:: +--parallel:: + Run tests in parallel, speeds up the whole process but is not safe with + the current infrastructure, where some tests that compete for some resources, + for instance, 'perf probe' tests that add/remove probes or clean all probes, etc. + -F:: --dont-fork:: - Do not fork child for each test, run all tests within single process. + Do not fork child for each test, run all tests within single process, this + sets sequential mode. --dso:: Specify a DSO for the "Symbols" test. diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 1fe8df97fe..7f1e016a92 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -182,6 +182,16 @@ endif FEATURE_CHECK_CFLAGS-libzstd := $(LIBZSTD_CFLAGS) FEATURE_CHECK_LDFLAGS-libzstd := $(LIBZSTD_LDFLAGS) +# for linking with debug library, run like: +# make DEBUG=1 LIBTRACEEVENT_DIR=/opt/libtraceevent/ +TRACEEVENTLIBS := -ltraceevent +ifdef LIBTRACEEVENT_DIR + LIBTRACEEVENT_CFLAGS := -I$(LIBTRACEEVENT_DIR)/include + LIBTRACEEVENT_LDFLAGS := -L$(LIBTRACEEVENT_DIR)/lib +endif +FEATURE_CHECK_CFLAGS-libtraceevent := $(LIBTRACEEVENT_CFLAGS) +FEATURE_CHECK_LDFLAGS-libtraceevent := $(LIBTRACEEVENT_LDFLAGS) $(TRACEEVENTLIBS) + FEATURE_CHECK_CFLAGS-bpf = -I. 
-I$(srctree)/tools/include -I$(srctree)/tools/arch/$(SRCARCH)/include/uapi -I$(srctree)/tools/include/uapi # include ARCH specific config -include $(src-perf)/arch/$(SRCARCH)/Makefile @@ -486,7 +496,10 @@ ifdef NO_DWARF endif ifeq ($(feature-scandirat), 1) - CFLAGS += -DHAVE_SCANDIRAT_SUPPORT + # Ignore having scandirat with memory sanitizer that lacks an interceptor. + ifeq ($(filter s% -fsanitize=memory%,$(EXTRA_CFLAGS),),) + CFLAGS += -DHAVE_SCANDIRAT_SUPPORT + endif endif ifeq ($(feature-sched_getcpu), 1) @@ -1165,9 +1178,10 @@ endif ifneq ($(NO_LIBTRACEEVENT),1) $(call feature_check,libtraceevent) ifeq ($(feature-libtraceevent), 1) - CFLAGS += -DHAVE_LIBTRACEEVENT - EXTLIBS += -ltraceevent - LIBTRACEEVENT_VERSION := $(shell $(PKG_CONFIG) --modversion libtraceevent) + CFLAGS += -DHAVE_LIBTRACEEVENT $(LIBTRACEEVENT_CFLAGS) + LDFLAGS += $(LIBTRACEEVENT_LDFLAGS) + EXTLIBS += ${TRACEEVENTLIBS} + LIBTRACEEVENT_VERSION := $(shell PKG_CONFIG_PATH=$(LIBTRACEEVENT_DIR) $(PKG_CONFIG) --modversion libtraceevent) LIBTRACEEVENT_VERSION_1 := $(word 1, $(subst ., ,$(LIBTRACEEVENT_VERSION))) LIBTRACEEVENT_VERSION_2 := $(word 2, $(subst ., ,$(LIBTRACEEVENT_VERSION))) LIBTRACEEVENT_VERSION_3 := $(word 3, $(subst ., ,$(LIBTRACEEVENT_VERSION))) @@ -1175,7 +1189,7 @@ ifneq ($(NO_LIBTRACEEVENT),1) CFLAGS += -DLIBTRACEEVENT_VERSION=$(LIBTRACEEVENT_VERSION_CPP) $(call detected,CONFIG_LIBTRACEEVENT) else - $(error ERROR: libtraceevent is missing. Please install libtraceevent-dev/libtraceevent-devel or build with NO_LIBTRACEEVENT=1) + $(error ERROR: libtraceevent is missing. 
Please install libtraceevent-dev/libtraceevent-devel and/or set LIBTRACEEVENT_DIR or build with NO_LIBTRACEEVENT=1) endif $(call feature_check,libtracefs) @@ -1301,6 +1315,7 @@ ifeq ($(VF),1) $(call print_var,LIBUNWIND_DIR) $(call print_var,LIBDW_DIR) $(call print_var,JDIR) + $(call print_var,LIBTRACEEVENT_DIR) ifeq ($(dwarf-post-unwind),1) $(call feature_print_text,"DWARF post unwind library", $(dwarf-post-unwind-text)) $(info $(MSG)) diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index d769aa447f..e6d56b5553 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -214,6 +214,7 @@ NON_CONFIG_TARGETS := clean python-clean TAGS tags cscope help ifdef MAKECMDGOALS ifeq ($(filter-out $(NON_CONFIG_TARGETS),$(MAKECMDGOALS)),) + VMLINUX_H=$(src-perf)/util/bpf_skel/vmlinux/vmlinux.h config := 0 endif endif @@ -473,21 +474,38 @@ arm64-sysreg-defs-clean: prefix= subdir= clean > /dev/null beauty_linux_dir := $(srctree)/tools/perf/trace/beauty/include/linux/ +beauty_uapi_linux_dir := $(srctree)/tools/perf/trace/beauty/include/uapi/linux/ +beauty_uapi_sound_dir := $(srctree)/tools/perf/trace/beauty/include/uapi/sound/ +beauty_arch_asm_dir := $(srctree)/tools/perf/trace/beauty/arch/x86/include/asm/ +beauty_x86_arch_asm_uapi_dir := $(srctree)/tools/perf/trace/beauty/arch/x86/include/uapi/asm/ + linux_uapi_dir := $(srctree)/tools/include/uapi/linux asm_generic_uapi_dir := $(srctree)/tools/include/uapi/asm-generic arch_asm_uapi_dir := $(srctree)/tools/arch/$(SRCARCH)/include/uapi/asm/ -x86_arch_asm_uapi_dir := $(srctree)/tools/arch/x86/include/uapi/asm/ x86_arch_asm_dir := $(srctree)/tools/arch/x86/include/asm/ beauty_outdir := $(OUTPUT)trace/beauty/generated beauty_ioctl_outdir := $(beauty_outdir)/ioctl -drm_ioctl_array := $(beauty_ioctl_outdir)/drm_ioctl_array.c -drm_hdr_dir := $(srctree)/tools/include/uapi/drm -drm_ioctl_tbl := $(srctree)/tools/perf/trace/beauty/drm_ioctl.sh # Create output directory if not already present $(shell [ -d 
'$(beauty_ioctl_outdir)' ] || mkdir -p '$(beauty_ioctl_outdir)') +fs_at_flags_array := $(beauty_outdir)/fs_at_flags_array.c +fs_at_flags_tbl := $(srctree)/tools/perf/trace/beauty/fs_at_flags.sh + +$(fs_at_flags_array): $(beauty_uapi_linux_dir)/fcntl.h $(fs_at_flags_tbl) + $(Q)$(SHELL) '$(fs_at_flags_tbl)' $(beauty_uapi_linux_dir) > $@ + +clone_flags_array := $(beauty_outdir)/clone_flags_array.c +clone_flags_tbl := $(srctree)/tools/perf/trace/beauty/clone.sh + +$(clone_flags_array): $(beauty_uapi_linux_dir)/sched.h $(clone_flags_tbl) + $(Q)$(SHELL) '$(clone_flags_tbl)' $(beauty_uapi_linux_dir) > $@ + +drm_ioctl_array := $(beauty_ioctl_outdir)/drm_ioctl_array.c +drm_hdr_dir := $(srctree)/tools/include/uapi/drm +drm_ioctl_tbl := $(srctree)/tools/perf/trace/beauty/drm_ioctl.sh + $(drm_ioctl_array): $(drm_hdr_dir)/drm.h $(drm_hdr_dir)/i915_drm.h $(drm_ioctl_tbl) $(Q)$(SHELL) '$(drm_ioctl_tbl)' $(drm_hdr_dir) > $@ @@ -500,20 +518,20 @@ $(fadvise_advice_array): $(linux_uapi_dir)/in.h $(fadvise_advice_tbl) fsmount_arrays := $(beauty_outdir)/fsmount_arrays.c fsmount_tbls := $(srctree)/tools/perf/trace/beauty/fsmount.sh -$(fsmount_arrays): $(linux_uapi_dir)/fs.h $(fsmount_tbls) - $(Q)$(SHELL) '$(fsmount_tbls)' $(linux_uapi_dir) > $@ +$(fsmount_arrays): $(beauty_uapi_linux_dir)/mount.h $(fsmount_tbls) + $(Q)$(SHELL) '$(fsmount_tbls)' $(beauty_uapi_linux_dir) > $@ fspick_arrays := $(beauty_outdir)/fspick_arrays.c fspick_tbls := $(srctree)/tools/perf/trace/beauty/fspick.sh -$(fspick_arrays): $(linux_uapi_dir)/fs.h $(fspick_tbls) - $(Q)$(SHELL) '$(fspick_tbls)' $(linux_uapi_dir) > $@ +$(fspick_arrays): $(beauty_uapi_linux_dir)/mount.h $(fspick_tbls) + $(Q)$(SHELL) '$(fspick_tbls)' $(beauty_uapi_linux_dir) > $@ fsconfig_arrays := $(beauty_outdir)/fsconfig_arrays.c fsconfig_tbls := $(srctree)/tools/perf/trace/beauty/fsconfig.sh -$(fsconfig_arrays): $(linux_uapi_dir)/fs.h $(fsconfig_tbls) - $(Q)$(SHELL) '$(fsconfig_tbls)' $(linux_uapi_dir) > $@ +$(fsconfig_arrays): 
$(beauty_uapi_linux_dir)/mount.h $(fsconfig_tbls) + $(Q)$(SHELL) '$(fsconfig_tbls)' $(beauty_uapi_linux_dir) > $@ pkey_alloc_access_rights_array := $(beauty_outdir)/pkey_alloc_access_rights_array.c asm_generic_hdr_dir := $(srctree)/tools/include/uapi/asm-generic/ @@ -526,15 +544,15 @@ sndrv_ctl_ioctl_array := $(beauty_ioctl_outdir)/sndrv_ctl_ioctl_array.c sndrv_ctl_hdr_dir := $(srctree)/tools/include/uapi/sound sndrv_ctl_ioctl_tbl := $(srctree)/tools/perf/trace/beauty/sndrv_ctl_ioctl.sh -$(sndrv_ctl_ioctl_array): $(sndrv_ctl_hdr_dir)/asound.h $(sndrv_ctl_ioctl_tbl) - $(Q)$(SHELL) '$(sndrv_ctl_ioctl_tbl)' $(sndrv_ctl_hdr_dir) > $@ +$(sndrv_ctl_ioctl_array): $(beauty_uapi_sound_dir)/asound.h $(sndrv_ctl_ioctl_tbl) + $(Q)$(SHELL) '$(sndrv_ctl_ioctl_tbl)' $(beauty_uapi_sound_dir) > $@ sndrv_pcm_ioctl_array := $(beauty_ioctl_outdir)/sndrv_pcm_ioctl_array.c sndrv_pcm_hdr_dir := $(srctree)/tools/include/uapi/sound sndrv_pcm_ioctl_tbl := $(srctree)/tools/perf/trace/beauty/sndrv_pcm_ioctl.sh -$(sndrv_pcm_ioctl_array): $(sndrv_pcm_hdr_dir)/asound.h $(sndrv_pcm_ioctl_tbl) - $(Q)$(SHELL) '$(sndrv_pcm_ioctl_tbl)' $(sndrv_pcm_hdr_dir) > $@ +$(sndrv_pcm_ioctl_array): $(beauty_uapi_sound_dir)/asound.h $(sndrv_pcm_ioctl_tbl) + $(Q)$(SHELL) '$(sndrv_pcm_ioctl_tbl)' $(beauty_uapi_sound_dir) > $@ kcmp_type_array := $(beauty_outdir)/kcmp_type_array.c kcmp_hdr_dir := $(srctree)/tools/include/uapi/linux/ @@ -563,11 +581,10 @@ $(sockaddr_arrays): $(beauty_linux_dir)/socket.h $(sockaddr_tbl) $(Q)$(SHELL) '$(sockaddr_tbl)' $(beauty_linux_dir) > $@ vhost_virtio_ioctl_array := $(beauty_ioctl_outdir)/vhost_virtio_ioctl_array.c -vhost_virtio_hdr_dir := $(srctree)/tools/include/uapi/linux vhost_virtio_ioctl_tbl := $(srctree)/tools/perf/trace/beauty/vhost_virtio_ioctl.sh -$(vhost_virtio_ioctl_array): $(vhost_virtio_hdr_dir)/vhost.h $(vhost_virtio_ioctl_tbl) - $(Q)$(SHELL) '$(vhost_virtio_ioctl_tbl)' $(vhost_virtio_hdr_dir) > $@ +$(vhost_virtio_ioctl_array): $(beauty_uapi_linux_dir)/vhost.h 
$(vhost_virtio_ioctl_tbl) + $(Q)$(SHELL) '$(vhost_virtio_ioctl_tbl)' $(beauty_uapi_linux_dir) > $@ perf_ioctl_array := $(beauty_ioctl_outdir)/perf_ioctl_array.c perf_hdr_dir := $(srctree)/tools/include/uapi/linux @@ -598,15 +615,14 @@ $(mremap_flags_array): $(linux_uapi_dir)/mman.h $(mremap_flags_tbl) mount_flags_array := $(beauty_outdir)/mount_flags_array.c mount_flags_tbl := $(srctree)/tools/perf/trace/beauty/mount_flags.sh -$(mount_flags_array): $(linux_uapi_dir)/fs.h $(mount_flags_tbl) - $(Q)$(SHELL) '$(mount_flags_tbl)' $(linux_uapi_dir) > $@ +$(mount_flags_array): $(beauty_uapi_linux_dir)/mount.h $(mount_flags_tbl) + $(Q)$(SHELL) '$(mount_flags_tbl)' $(beauty_uapi_linux_dir) > $@ move_mount_flags_array := $(beauty_outdir)/move_mount_flags_array.c move_mount_flags_tbl := $(srctree)/tools/perf/trace/beauty/move_mount_flags.sh -$(move_mount_flags_array): $(linux_uapi_dir)/fs.h $(move_mount_flags_tbl) - $(Q)$(SHELL) '$(move_mount_flags_tbl)' $(linux_uapi_dir) > $@ - +$(move_mount_flags_array): $(beauty_uapi_linux_dir)/mount.h $(move_mount_flags_tbl) + $(Q)$(SHELL) '$(move_mount_flags_tbl)' $(beauty_uapi_linux_dir) > $@ mmap_prot_array := $(beauty_outdir)/mmap_prot_array.c mmap_prot_tbl := $(srctree)/tools/perf/trace/beauty/mmap_prot.sh @@ -615,29 +631,28 @@ $(mmap_prot_array): $(asm_generic_uapi_dir)/mman.h $(asm_generic_uapi_dir)/mman- $(Q)$(SHELL) '$(mmap_prot_tbl)' $(asm_generic_uapi_dir) $(arch_asm_uapi_dir) > $@ prctl_option_array := $(beauty_outdir)/prctl_option_array.c -prctl_hdr_dir := $(srctree)/tools/include/uapi/linux/ prctl_option_tbl := $(srctree)/tools/perf/trace/beauty/prctl_option.sh -$(prctl_option_array): $(prctl_hdr_dir)/prctl.h $(prctl_option_tbl) - $(Q)$(SHELL) '$(prctl_option_tbl)' $(prctl_hdr_dir) > $@ +$(prctl_option_array): $(beauty_uapi_linux_dir)/prctl.h $(prctl_option_tbl) + $(Q)$(SHELL) '$(prctl_option_tbl)' $(beauty_uapi_linux_dir) > $@ usbdevfs_ioctl_array := $(beauty_ioctl_outdir)/usbdevfs_ioctl_array.c usbdevfs_ioctl_tbl := 
$(srctree)/tools/perf/trace/beauty/usbdevfs_ioctl.sh -$(usbdevfs_ioctl_array): $(linux_uapi_dir)/usbdevice_fs.h $(usbdevfs_ioctl_tbl) - $(Q)$(SHELL) '$(usbdevfs_ioctl_tbl)' $(linux_uapi_dir) > $@ +$(usbdevfs_ioctl_array): $(beauty_uapi_linux_dir)/usbdevice_fs.h $(usbdevfs_ioctl_tbl) + $(Q)$(SHELL) '$(usbdevfs_ioctl_tbl)' $(beauty_uapi_linux_dir) > $@ x86_arch_prctl_code_array := $(beauty_outdir)/x86_arch_prctl_code_array.c x86_arch_prctl_code_tbl := $(srctree)/tools/perf/trace/beauty/x86_arch_prctl.sh -$(x86_arch_prctl_code_array): $(x86_arch_asm_uapi_dir)/prctl.h $(x86_arch_prctl_code_tbl) - $(Q)$(SHELL) '$(x86_arch_prctl_code_tbl)' $(x86_arch_asm_uapi_dir) > $@ +$(x86_arch_prctl_code_array): $(beauty_x86_arch_asm_uapi_dir)/prctl.h $(x86_arch_prctl_code_tbl) + $(Q)$(SHELL) '$(x86_arch_prctl_code_tbl)' $(beauty_x86_arch_asm_uapi_dir) > $@ x86_arch_irq_vectors_array := $(beauty_outdir)/x86_arch_irq_vectors_array.c x86_arch_irq_vectors_tbl := $(srctree)/tools/perf/trace/beauty/tracepoints/x86_irq_vectors.sh -$(x86_arch_irq_vectors_array): $(x86_arch_asm_dir)/irq_vectors.h $(x86_arch_irq_vectors_tbl) - $(Q)$(SHELL) '$(x86_arch_irq_vectors_tbl)' $(x86_arch_asm_dir) > $@ +$(x86_arch_irq_vectors_array): $(beauty_arch_asm_dir)/irq_vectors.h $(x86_arch_irq_vectors_tbl) + $(Q)$(SHELL) '$(x86_arch_irq_vectors_tbl)' $(beauty_arch_asm_dir) > $@ x86_arch_MSRs_array := $(beauty_outdir)/x86_arch_MSRs_array.c x86_arch_MSRs_tbl := $(srctree)/tools/perf/trace/beauty/tracepoints/x86_msr.sh @@ -648,8 +663,8 @@ $(x86_arch_MSRs_array): $(x86_arch_asm_dir)/msr-index.h $(x86_arch_MSRs_tbl) rename_flags_array := $(beauty_outdir)/rename_flags_array.c rename_flags_tbl := $(srctree)/tools/perf/trace/beauty/rename_flags.sh -$(rename_flags_array): $(linux_uapi_dir)/fs.h $(rename_flags_tbl) - $(Q)$(SHELL) '$(rename_flags_tbl)' $(linux_uapi_dir) > $@ +$(rename_flags_array): $(beauty_uapi_linux_dir)/fs.h $(rename_flags_tbl) + $(Q)$(SHELL) '$(rename_flags_tbl)' $(beauty_uapi_linux_dir) > $@ 
arch_errno_name_array := $(beauty_outdir)/arch_errno_name_array.c arch_errno_hdr_dir := $(srctree)/tools @@ -658,11 +673,17 @@ arch_errno_tbl := $(srctree)/tools/perf/trace/beauty/arch_errno_names.sh $(arch_errno_name_array): $(arch_errno_tbl) $(Q)$(SHELL) '$(arch_errno_tbl)' '$(patsubst -%,,$(CC))' $(arch_errno_hdr_dir) > $@ +statx_mask_array := $(beauty_outdir)/statx_mask_array.c +statx_mask_tbl := $(srctree)/tools/perf/trace/beauty/statx_mask.sh + +$(statx_mask_array): $(beauty_uapi_linux_dir)/stat.h $(statx_mask_tbl) + $(Q)$(SHELL) '$(statx_mask_tbl)' $(beauty_uapi_linux_dir) > $@ + sync_file_range_arrays := $(beauty_outdir)/sync_file_range_arrays.c sync_file_range_tbls := $(srctree)/tools/perf/trace/beauty/sync_file_range.sh -$(sync_file_range_arrays): $(linux_uapi_dir)/fs.h $(sync_file_range_tbls) - $(Q)$(SHELL) '$(sync_file_range_tbls)' $(linux_uapi_dir) > $@ +$(sync_file_range_arrays): $(beauty_uapi_linux_dir)/fs.h $(sync_file_range_tbls) + $(Q)$(SHELL) '$(sync_file_range_tbls)' $(beauty_uapi_linux_dir) > $@ TESTS_CORESIGHT_DIR := $(srctree)/tools/perf/tests/shell/coresight @@ -763,6 +784,8 @@ build-dir = $(or $(__build-dir),.) 
prepare: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h archheaders \ arm64-sysreg-defs \ + $(fs_at_flags_array) \ + $(clone_flags_array) \ $(drm_ioctl_array) \ $(fadvise_advice_array) \ $(fsconfig_arrays) \ @@ -790,6 +813,7 @@ prepare: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h archheaders \ $(x86_arch_prctl_code_array) \ $(rename_flags_array) \ $(arch_errno_name_array) \ + $(statx_mask_array) \ $(sync_file_range_arrays) \ $(LIBAPI) \ $(LIBPERF) \ diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index 77e6663c17..da62313679 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -66,18 +66,30 @@ static const char * const metadata_ete_ro[] = { [CS_ETE_TS_SOURCE] = "ts_source", }; -static bool cs_etm_is_etmv4(struct auxtrace_record *itr, int cpu); -static bool cs_etm_is_ete(struct auxtrace_record *itr, int cpu); +enum cs_etm_version { CS_NOT_PRESENT, CS_ETMV3, CS_ETMV4, CS_ETE }; -static int cs_etm_validate_context_id(struct auxtrace_record *itr, - struct evsel *evsel, int cpu) +static bool cs_etm_is_ete(struct perf_pmu *cs_etm_pmu, struct perf_cpu cpu); +static int cs_etm_get_ro(struct perf_pmu *pmu, struct perf_cpu cpu, const char *path, __u64 *val); +static bool cs_etm_pmu_path_exists(struct perf_pmu *pmu, struct perf_cpu cpu, const char *path); + +static enum cs_etm_version cs_etm_get_version(struct perf_pmu *cs_etm_pmu, + struct perf_cpu cpu) +{ + if (cs_etm_is_ete(cs_etm_pmu, cpu)) + return CS_ETE; + else if (cs_etm_pmu_path_exists(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR0])) + return CS_ETMV4; + else if (cs_etm_pmu_path_exists(cs_etm_pmu, cpu, metadata_etmv3_ro[CS_ETM_ETMCCER])) + return CS_ETMV3; + + return CS_NOT_PRESENT; +} + +static int cs_etm_validate_context_id(struct perf_pmu *cs_etm_pmu, struct evsel *evsel, + struct perf_cpu cpu) { - struct cs_etm_recording *ptr = - container_of(itr, struct cs_etm_recording, itr); - struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu; - 
char path[PATH_MAX]; int err; - u32 val; + __u64 val; u64 contextid = evsel->core.attr.config & (perf_pmu__format_bits(cs_etm_pmu, "contextid") | perf_pmu__format_bits(cs_etm_pmu, "contextid1") | @@ -87,23 +99,16 @@ static int cs_etm_validate_context_id(struct auxtrace_record *itr, return 0; /* Not supported in etmv3 */ - if (!cs_etm_is_etmv4(itr, cpu)) { + if (cs_etm_get_version(cs_etm_pmu, cpu) == CS_ETMV3) { pr_err("%s: contextid not supported in ETMv3, disable with %s/contextid=0/\n", CORESIGHT_ETM_PMU_NAME, CORESIGHT_ETM_PMU_NAME); return -EINVAL; } /* Get a handle on TRCIDR2 */ - snprintf(path, PATH_MAX, "cpu%d/%s", - cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR2]); - err = perf_pmu__scan_file(cs_etm_pmu, path, "%x", &val); - - /* There was a problem reading the file, bailing out */ - if (err != 1) { - pr_err("%s: can't read file %s\n", CORESIGHT_ETM_PMU_NAME, - path); + err = cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR2], &val); + if (err) return err; - } if (contextid & perf_pmu__format_bits(cs_etm_pmu, "contextid1")) { @@ -140,37 +145,26 @@ static int cs_etm_validate_context_id(struct auxtrace_record *itr, return 0; } -static int cs_etm_validate_timestamp(struct auxtrace_record *itr, - struct evsel *evsel, int cpu) +static int cs_etm_validate_timestamp(struct perf_pmu *cs_etm_pmu, struct evsel *evsel, + struct perf_cpu cpu) { - struct cs_etm_recording *ptr = - container_of(itr, struct cs_etm_recording, itr); - struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu; - char path[PATH_MAX]; int err; - u32 val; + __u64 val; if (!(evsel->core.attr.config & perf_pmu__format_bits(cs_etm_pmu, "timestamp"))) return 0; - if (!cs_etm_is_etmv4(itr, cpu)) { + if (cs_etm_get_version(cs_etm_pmu, cpu) == CS_ETMV3) { pr_err("%s: timestamp not supported in ETMv3, disable with %s/timestamp=0/\n", CORESIGHT_ETM_PMU_NAME, CORESIGHT_ETM_PMU_NAME); return -EINVAL; } /* Get a handle on TRCIRD0 */ - snprintf(path, PATH_MAX, "cpu%d/%s", - cpu, 
metadata_etmv4_ro[CS_ETMV4_TRCIDR0]); - err = perf_pmu__scan_file(cs_etm_pmu, path, "%x", &val); - - /* There was a problem reading the file, bailing out */ - if (err != 1) { - pr_err("%s: can't read file %s\n", - CORESIGHT_ETM_PMU_NAME, path); + err = cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR0], &val); + if (err) return err; - } /* * TRCIDR0.TSSIZE, bit [28-24], indicates whether global timestamping @@ -187,6 +181,13 @@ static int cs_etm_validate_timestamp(struct auxtrace_record *itr, return 0; } +static struct perf_pmu *cs_etm_get_pmu(struct auxtrace_record *itr) +{ + struct cs_etm_recording *ptr = container_of(itr, struct cs_etm_recording, itr); + + return ptr->cs_etm_pmu; +} + /* * Check whether the requested timestamp and contextid options should be * available on all requested CPUs and if not, tell the user how to override. @@ -194,41 +195,45 @@ static int cs_etm_validate_timestamp(struct auxtrace_record *itr, * first is better. In theory the kernel could still disable the option for * some other reason so this is best effort only. */ -static int cs_etm_validate_config(struct auxtrace_record *itr, +static int cs_etm_validate_config(struct perf_pmu *cs_etm_pmu, struct evsel *evsel) { - int i, err = -EINVAL; + int idx, err = 0; struct perf_cpu_map *event_cpus = evsel->evlist->core.user_requested_cpus; - struct perf_cpu_map *online_cpus = perf_cpu_map__new_online_cpus(); + struct perf_cpu_map *intersect_cpus; + struct perf_cpu cpu; - /* Set option of each CPU we have */ - for (i = 0; i < cpu__max_cpu().cpu; i++) { - struct perf_cpu cpu = { .cpu = i, }; - - /* - * In per-cpu case, do the validation for CPUs to work with. - * In per-thread case, the CPU map is empty. Since the traced - * program can run on any CPUs in this case, thus don't skip - * validation. - */ - if (!perf_cpu_map__has_any_cpu_or_is_empty(event_cpus) && - !perf_cpu_map__has(event_cpus, cpu)) - continue; + /* + * Set option of each CPU we have. 
In per-cpu case, do the validation + * for CPUs to work with. In per-thread case, the CPU map has the "any" + * CPU value. Since the traced program can run on any CPUs in this case, + * thus don't skip validation. + */ + if (!perf_cpu_map__has_any_cpu(event_cpus)) { + struct perf_cpu_map *online_cpus = perf_cpu_map__new_online_cpus(); - if (!perf_cpu_map__has(online_cpus, cpu)) - continue; + intersect_cpus = perf_cpu_map__intersect(event_cpus, online_cpus); + perf_cpu_map__put(online_cpus); + } else { + intersect_cpus = perf_cpu_map__new_online_cpus(); + } - err = cs_etm_validate_context_id(itr, evsel, i); + perf_cpu_map__for_each_cpu_skip_any(cpu, idx, intersect_cpus) { + if (cs_etm_get_version(cs_etm_pmu, cpu) == CS_NOT_PRESENT) { + pr_err("%s: Not found on CPU %d. Check hardware and firmware support and that all Coresight drivers are loaded\n", + CORESIGHT_ETM_PMU_NAME, cpu.cpu); + return -EINVAL; + } + err = cs_etm_validate_context_id(cs_etm_pmu, evsel, cpu); if (err) - goto out; - err = cs_etm_validate_timestamp(itr, evsel, i); + break; + + err = cs_etm_validate_timestamp(cs_etm_pmu, evsel, cpu); if (err) - goto out; + break; } - err = 0; -out: - perf_cpu_map__put(online_cpus); + perf_cpu_map__put(intersect_cpus); return err; } @@ -435,7 +440,7 @@ static int cs_etm_recording_options(struct auxtrace_record *itr, * Also the case of per-cpu mmaps, need the contextID in order to be notified * when a context switch happened. 
*/ - if (!perf_cpu_map__has_any_cpu_or_is_empty(cpus)) { + if (!perf_cpu_map__is_any_cpu_or_is_empty(cpus)) { evsel__set_config_if_unset(cs_etm_pmu, cs_etm_evsel, "timestamp", 1); evsel__set_config_if_unset(cs_etm_pmu, cs_etm_evsel, @@ -461,10 +466,10 @@ static int cs_etm_recording_options(struct auxtrace_record *itr, evsel->core.attr.sample_period = 1; /* In per-cpu case, always need the time of mmap events etc */ - if (!perf_cpu_map__has_any_cpu_or_is_empty(cpus)) + if (!perf_cpu_map__is_any_cpu_or_is_empty(cpus)) evsel__set_sample_bit(evsel, TIME); - err = cs_etm_validate_config(itr, cs_etm_evsel); + err = cs_etm_validate_config(cs_etm_pmu, cs_etm_evsel); out: return err; } @@ -530,48 +535,35 @@ static u64 cs_etmv4_get_config(struct auxtrace_record *itr) } static size_t -cs_etm_info_priv_size(struct auxtrace_record *itr __maybe_unused, - struct evlist *evlist __maybe_unused) +cs_etm_info_priv_size(struct auxtrace_record *itr, + struct evlist *evlist) { - int i; + int idx; int etmv3 = 0, etmv4 = 0, ete = 0; struct perf_cpu_map *event_cpus = evlist->core.user_requested_cpus; - struct perf_cpu_map *online_cpus = perf_cpu_map__new_online_cpus(); - - /* cpu map is not empty, we have specific CPUs to work with */ - if (!perf_cpu_map__has_any_cpu_or_is_empty(event_cpus)) { - for (i = 0; i < cpu__max_cpu().cpu; i++) { - struct perf_cpu cpu = { .cpu = i, }; + struct perf_cpu_map *intersect_cpus; + struct perf_cpu cpu; + struct perf_pmu *cs_etm_pmu = cs_etm_get_pmu(itr); - if (!perf_cpu_map__has(event_cpus, cpu) || - !perf_cpu_map__has(online_cpus, cpu)) - continue; + if (!perf_cpu_map__has_any_cpu(event_cpus)) { + /* cpu map is not "any" CPU , we have specific CPUs to work with */ + struct perf_cpu_map *online_cpus = perf_cpu_map__new_online_cpus(); - if (cs_etm_is_ete(itr, i)) - ete++; - else if (cs_etm_is_etmv4(itr, i)) - etmv4++; - else - etmv3++; - } + intersect_cpus = perf_cpu_map__intersect(event_cpus, online_cpus); + perf_cpu_map__put(online_cpus); } else { - /* 
get configuration for all CPUs in the system */ - for (i = 0; i < cpu__max_cpu().cpu; i++) { - struct perf_cpu cpu = { .cpu = i, }; - - if (!perf_cpu_map__has(online_cpus, cpu)) - continue; - - if (cs_etm_is_ete(itr, i)) - ete++; - else if (cs_etm_is_etmv4(itr, i)) - etmv4++; - else - etmv3++; - } + /* Event can be "any" CPU so count all online CPUs. */ + intersect_cpus = perf_cpu_map__new_online_cpus(); } + /* Count number of each type of ETM. Don't count if that CPU has CS_NOT_PRESENT. */ + perf_cpu_map__for_each_cpu_skip_any(cpu, idx, intersect_cpus) { + enum cs_etm_version v = cs_etm_get_version(cs_etm_pmu, cpu); - perf_cpu_map__put(online_cpus); + ete += v == CS_ETE; + etmv4 += v == CS_ETMV4; + etmv3 += v == CS_ETMV3; + } + perf_cpu_map__put(intersect_cpus); return (CS_ETM_HEADER_SIZE + (ete * CS_ETE_PRIV_SIZE) + @@ -579,66 +571,49 @@ cs_etm_info_priv_size(struct auxtrace_record *itr __maybe_unused, (etmv3 * CS_ETMV3_PRIV_SIZE)); } -static bool cs_etm_is_etmv4(struct auxtrace_record *itr, int cpu) -{ - bool ret = false; - char path[PATH_MAX]; - int scan; - unsigned int val; - struct cs_etm_recording *ptr = - container_of(itr, struct cs_etm_recording, itr); - struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu; - - /* Take any of the RO files for ETMv4 and see if it present */ - snprintf(path, PATH_MAX, "cpu%d/%s", - cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR0]); - scan = perf_pmu__scan_file(cs_etm_pmu, path, "%x", &val); - - /* The file was read successfully, we have a winner */ - if (scan == 1) - ret = true; - - return ret; -} - -static int cs_etm_get_ro(struct perf_pmu *pmu, int cpu, const char *path) +static int cs_etm_get_ro(struct perf_pmu *pmu, struct perf_cpu cpu, const char *path, __u64 *val) { char pmu_path[PATH_MAX]; int scan; - unsigned int val = 0; /* Get RO metadata from sysfs */ - snprintf(pmu_path, PATH_MAX, "cpu%d/%s", cpu, path); + snprintf(pmu_path, PATH_MAX, "cpu%d/%s", cpu.cpu, path); - scan = perf_pmu__scan_file(pmu, pmu_path, "%x", &val); - if 
(scan != 1) + scan = perf_pmu__scan_file(pmu, pmu_path, "%llx", val); + if (scan != 1) { pr_err("%s: error reading: %s\n", __func__, pmu_path); + return -EINVAL; + } - return val; + return 0; } -static int cs_etm_get_ro_signed(struct perf_pmu *pmu, int cpu, const char *path) +static int cs_etm_get_ro_signed(struct perf_pmu *pmu, struct perf_cpu cpu, const char *path, + __u64 *out_val) { char pmu_path[PATH_MAX]; int scan; int val = 0; /* Get RO metadata from sysfs */ - snprintf(pmu_path, PATH_MAX, "cpu%d/%s", cpu, path); + snprintf(pmu_path, PATH_MAX, "cpu%d/%s", cpu.cpu, path); scan = perf_pmu__scan_file(pmu, pmu_path, "%d", &val); - if (scan != 1) + if (scan != 1) { pr_err("%s: error reading: %s\n", __func__, pmu_path); + return -EINVAL; + } - return val; + *out_val = (__u64) val; + return 0; } -static bool cs_etm_pmu_path_exists(struct perf_pmu *pmu, int cpu, const char *path) +static bool cs_etm_pmu_path_exists(struct perf_pmu *pmu, struct perf_cpu cpu, const char *path) { char pmu_path[PATH_MAX]; /* Get RO metadata from sysfs */ - snprintf(pmu_path, PATH_MAX, "cpu%d/%s", cpu, path); + snprintf(pmu_path, PATH_MAX, "cpu%d/%s", cpu.cpu, path); return perf_pmu__file_exists(pmu, pmu_path); } @@ -651,16 +626,14 @@ static bool cs_etm_pmu_path_exists(struct perf_pmu *pmu, int cpu, const char *pa #define TRCDEVARCH_ARCHVER_MASK GENMASK(15, 12) #define TRCDEVARCH_ARCHVER(x) (((x) & TRCDEVARCH_ARCHVER_MASK) >> TRCDEVARCH_ARCHVER_SHIFT) -static bool cs_etm_is_ete(struct auxtrace_record *itr, int cpu) +static bool cs_etm_is_ete(struct perf_pmu *cs_etm_pmu, struct perf_cpu cpu) { - struct cs_etm_recording *ptr = container_of(itr, struct cs_etm_recording, itr); - struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu; - int trcdevarch; + __u64 trcdevarch; if (!cs_etm_pmu_path_exists(cs_etm_pmu, cpu, metadata_ete_ro[CS_ETE_TRCDEVARCH])) return false; - trcdevarch = cs_etm_get_ro(cs_etm_pmu, cpu, metadata_ete_ro[CS_ETE_TRCDEVARCH]); + cs_etm_get_ro(cs_etm_pmu, cpu, 
metadata_ete_ro[CS_ETE_TRCDEVARCH], &trcdevarch); /* * ETE if ARCHVER is 5 (ARCHVER is 4 for ETM) and ARCHPART is 0xA13. * See ETM_DEVARCH_ETE_ARCH in coresight-etm4x.h @@ -668,7 +641,12 @@ static bool cs_etm_is_ete(struct auxtrace_record *itr, int cpu) return TRCDEVARCH_ARCHVER(trcdevarch) == 5 && TRCDEVARCH_ARCHPART(trcdevarch) == 0xA13; } -static void cs_etm_save_etmv4_header(__u64 data[], struct auxtrace_record *itr, int cpu) +static __u64 cs_etm_get_legacy_trace_id(struct perf_cpu cpu) +{ + return CORESIGHT_LEGACY_CPU_TRACE_ID(cpu.cpu); +} + +static void cs_etm_save_etmv4_header(__u64 data[], struct auxtrace_record *itr, struct perf_cpu cpu) { struct cs_etm_recording *ptr = container_of(itr, struct cs_etm_recording, itr); struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu; @@ -676,33 +654,32 @@ static void cs_etm_save_etmv4_header(__u64 data[], struct auxtrace_record *itr, /* Get trace configuration register */ data[CS_ETMV4_TRCCONFIGR] = cs_etmv4_get_config(itr); /* traceID set to legacy version, in case new perf running on older system */ - data[CS_ETMV4_TRCTRACEIDR] = - CORESIGHT_LEGACY_CPU_TRACE_ID(cpu) | CORESIGHT_TRACE_ID_UNUSED_FLAG; + data[CS_ETMV4_TRCTRACEIDR] = cs_etm_get_legacy_trace_id(cpu) | + CORESIGHT_TRACE_ID_UNUSED_FLAG; /* Get read-only information from sysFS */ - data[CS_ETMV4_TRCIDR0] = cs_etm_get_ro(cs_etm_pmu, cpu, - metadata_etmv4_ro[CS_ETMV4_TRCIDR0]); - data[CS_ETMV4_TRCIDR1] = cs_etm_get_ro(cs_etm_pmu, cpu, - metadata_etmv4_ro[CS_ETMV4_TRCIDR1]); - data[CS_ETMV4_TRCIDR2] = cs_etm_get_ro(cs_etm_pmu, cpu, - metadata_etmv4_ro[CS_ETMV4_TRCIDR2]); - data[CS_ETMV4_TRCIDR8] = cs_etm_get_ro(cs_etm_pmu, cpu, - metadata_etmv4_ro[CS_ETMV4_TRCIDR8]); - data[CS_ETMV4_TRCAUTHSTATUS] = cs_etm_get_ro(cs_etm_pmu, cpu, - metadata_etmv4_ro[CS_ETMV4_TRCAUTHSTATUS]); + cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR0], + &data[CS_ETMV4_TRCIDR0]); + cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR1], + &data[CS_ETMV4_TRCIDR1]); 
+ cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR2], + &data[CS_ETMV4_TRCIDR2]); + cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR8], + &data[CS_ETMV4_TRCIDR8]); + cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCAUTHSTATUS], + &data[CS_ETMV4_TRCAUTHSTATUS]); /* Kernels older than 5.19 may not expose ts_source */ - if (cs_etm_pmu_path_exists(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TS_SOURCE])) - data[CS_ETMV4_TS_SOURCE] = (__u64) cs_etm_get_ro_signed(cs_etm_pmu, cpu, - metadata_etmv4_ro[CS_ETMV4_TS_SOURCE]); - else { + if (!cs_etm_pmu_path_exists(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TS_SOURCE]) || + cs_etm_get_ro_signed(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TS_SOURCE], + &data[CS_ETMV4_TS_SOURCE])) { pr_debug3("[%03d] pmu file 'ts_source' not found. Fallback to safe value (-1)\n", - cpu); + cpu.cpu); data[CS_ETMV4_TS_SOURCE] = (__u64) -1; } } -static void cs_etm_save_ete_header(__u64 data[], struct auxtrace_record *itr, int cpu) +static void cs_etm_save_ete_header(__u64 data[], struct auxtrace_record *itr, struct perf_cpu cpu) { struct cs_etm_recording *ptr = container_of(itr, struct cs_etm_recording, itr); struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu; @@ -710,83 +687,85 @@ static void cs_etm_save_ete_header(__u64 data[], struct auxtrace_record *itr, in /* Get trace configuration register */ data[CS_ETE_TRCCONFIGR] = cs_etmv4_get_config(itr); /* traceID set to legacy version, in case new perf running on older system */ - data[CS_ETE_TRCTRACEIDR] = - CORESIGHT_LEGACY_CPU_TRACE_ID(cpu) | CORESIGHT_TRACE_ID_UNUSED_FLAG; + data[CS_ETE_TRCTRACEIDR] = cs_etm_get_legacy_trace_id(cpu) | CORESIGHT_TRACE_ID_UNUSED_FLAG; /* Get read-only information from sysFS */ - data[CS_ETE_TRCIDR0] = cs_etm_get_ro(cs_etm_pmu, cpu, - metadata_ete_ro[CS_ETE_TRCIDR0]); - data[CS_ETE_TRCIDR1] = cs_etm_get_ro(cs_etm_pmu, cpu, - metadata_ete_ro[CS_ETE_TRCIDR1]); - data[CS_ETE_TRCIDR2] = cs_etm_get_ro(cs_etm_pmu, cpu, - 
metadata_ete_ro[CS_ETE_TRCIDR2]); - data[CS_ETE_TRCIDR8] = cs_etm_get_ro(cs_etm_pmu, cpu, - metadata_ete_ro[CS_ETE_TRCIDR8]); - data[CS_ETE_TRCAUTHSTATUS] = cs_etm_get_ro(cs_etm_pmu, cpu, - metadata_ete_ro[CS_ETE_TRCAUTHSTATUS]); + cs_etm_get_ro(cs_etm_pmu, cpu, metadata_ete_ro[CS_ETE_TRCIDR0], &data[CS_ETE_TRCIDR0]); + cs_etm_get_ro(cs_etm_pmu, cpu, metadata_ete_ro[CS_ETE_TRCIDR1], &data[CS_ETE_TRCIDR1]); + cs_etm_get_ro(cs_etm_pmu, cpu, metadata_ete_ro[CS_ETE_TRCIDR2], &data[CS_ETE_TRCIDR2]); + cs_etm_get_ro(cs_etm_pmu, cpu, metadata_ete_ro[CS_ETE_TRCIDR8], &data[CS_ETE_TRCIDR8]); + cs_etm_get_ro(cs_etm_pmu, cpu, metadata_ete_ro[CS_ETE_TRCAUTHSTATUS], + &data[CS_ETE_TRCAUTHSTATUS]); /* ETE uses the same registers as ETMv4 plus TRCDEVARCH */ - data[CS_ETE_TRCDEVARCH] = cs_etm_get_ro(cs_etm_pmu, cpu, - metadata_ete_ro[CS_ETE_TRCDEVARCH]); + cs_etm_get_ro(cs_etm_pmu, cpu, metadata_ete_ro[CS_ETE_TRCDEVARCH], + &data[CS_ETE_TRCDEVARCH]); /* Kernels older than 5.19 may not expose ts_source */ - if (cs_etm_pmu_path_exists(cs_etm_pmu, cpu, metadata_ete_ro[CS_ETE_TS_SOURCE])) - data[CS_ETE_TS_SOURCE] = (__u64) cs_etm_get_ro_signed(cs_etm_pmu, cpu, - metadata_ete_ro[CS_ETE_TS_SOURCE]); - else { + if (!cs_etm_pmu_path_exists(cs_etm_pmu, cpu, metadata_ete_ro[CS_ETE_TS_SOURCE]) || + cs_etm_get_ro_signed(cs_etm_pmu, cpu, metadata_ete_ro[CS_ETE_TS_SOURCE], + &data[CS_ETE_TS_SOURCE])) { pr_debug3("[%03d] pmu file 'ts_source' not found. 
Fallback to safe value (-1)\n", - cpu); + cpu.cpu); data[CS_ETE_TS_SOURCE] = (__u64) -1; } } -static void cs_etm_get_metadata(int cpu, u32 *offset, +static void cs_etm_get_metadata(struct perf_cpu cpu, u32 *offset, struct auxtrace_record *itr, struct perf_record_auxtrace_info *info) { u32 increment, nr_trc_params; u64 magic; - struct cs_etm_recording *ptr = - container_of(itr, struct cs_etm_recording, itr); - struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu; + struct perf_pmu *cs_etm_pmu = cs_etm_get_pmu(itr); /* first see what kind of tracer this cpu is affined to */ - if (cs_etm_is_ete(itr, cpu)) { + switch (cs_etm_get_version(cs_etm_pmu, cpu)) { + case CS_ETE: magic = __perf_cs_ete_magic; cs_etm_save_ete_header(&info->priv[*offset], itr, cpu); /* How much space was used */ increment = CS_ETE_PRIV_MAX; nr_trc_params = CS_ETE_PRIV_MAX - CS_ETM_COMMON_BLK_MAX_V1; - } else if (cs_etm_is_etmv4(itr, cpu)) { + break; + + case CS_ETMV4: magic = __perf_cs_etmv4_magic; cs_etm_save_etmv4_header(&info->priv[*offset], itr, cpu); /* How much space was used */ increment = CS_ETMV4_PRIV_MAX; nr_trc_params = CS_ETMV4_PRIV_MAX - CS_ETMV4_TRCCONFIGR; - } else { + break; + + case CS_ETMV3: magic = __perf_cs_etmv3_magic; /* Get configuration register */ info->priv[*offset + CS_ETM_ETMCR] = cs_etm_get_config(itr); /* traceID set to legacy value in case new perf running on old system */ - info->priv[*offset + CS_ETM_ETMTRACEIDR] = - CORESIGHT_LEGACY_CPU_TRACE_ID(cpu) | CORESIGHT_TRACE_ID_UNUSED_FLAG; + info->priv[*offset + CS_ETM_ETMTRACEIDR] = cs_etm_get_legacy_trace_id(cpu) | + CORESIGHT_TRACE_ID_UNUSED_FLAG; /* Get read-only information from sysFS */ - info->priv[*offset + CS_ETM_ETMCCER] = - cs_etm_get_ro(cs_etm_pmu, cpu, - metadata_etmv3_ro[CS_ETM_ETMCCER]); - info->priv[*offset + CS_ETM_ETMIDR] = - cs_etm_get_ro(cs_etm_pmu, cpu, - metadata_etmv3_ro[CS_ETM_ETMIDR]); + cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv3_ro[CS_ETM_ETMCCER], + &info->priv[*offset + CS_ETM_ETMCCER]); + 
cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv3_ro[CS_ETM_ETMIDR], + &info->priv[*offset + CS_ETM_ETMIDR]); /* How much space was used */ increment = CS_ETM_PRIV_MAX; nr_trc_params = CS_ETM_PRIV_MAX - CS_ETM_ETMCR; + break; + + default: + case CS_NOT_PRESENT: + /* Unreachable, CPUs already validated in cs_etm_validate_config() */ + assert(true); + return; } /* Build generic header portion */ info->priv[*offset + CS_ETM_MAGIC] = magic; - info->priv[*offset + CS_ETM_CPU] = cpu; + info->priv[*offset + CS_ETM_CPU] = cpu.cpu; info->priv[*offset + CS_ETM_NR_TRC_PARAMS] = nr_trc_params; /* Where the next CPU entry should start from */ *offset += increment; @@ -806,6 +785,7 @@ static int cs_etm_info_fill(struct auxtrace_record *itr, struct cs_etm_recording *ptr = container_of(itr, struct cs_etm_recording, itr); struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu; + struct perf_cpu cpu; if (priv_size != cs_etm_info_priv_size(itr, session->evlist)) return -EINVAL; @@ -813,16 +793,13 @@ static int cs_etm_info_fill(struct auxtrace_record *itr, if (!session->evlist->core.nr_mmaps) return -EINVAL; - /* If the cpu_map is empty all online CPUs are involved */ - if (perf_cpu_map__has_any_cpu_or_is_empty(event_cpus)) { + /* If the cpu_map has the "any" CPU all online CPUs are involved */ + if (perf_cpu_map__has_any_cpu(event_cpus)) { cpu_map = online_cpus; } else { /* Make sure all specified CPUs are online */ - for (i = 0; i < perf_cpu_map__nr(event_cpus); i++) { - struct perf_cpu cpu = { .cpu = i, }; - - if (perf_cpu_map__has(event_cpus, cpu) && - !perf_cpu_map__has(online_cpus, cpu)) + perf_cpu_map__for_each_cpu(cpu, i, event_cpus) { + if (!perf_cpu_map__has(online_cpus, cpu)) return -EINVAL; } @@ -842,11 +819,9 @@ static int cs_etm_info_fill(struct auxtrace_record *itr, offset = CS_ETM_SNAPSHOT + 1; - for (i = 0; i < cpu__max_cpu().cpu && offset < priv_size; i++) { - struct perf_cpu cpu = { .cpu = i, }; - - if (perf_cpu_map__has(cpu_map, cpu)) - cs_etm_get_metadata(i, &offset, itr, 
info); + perf_cpu_map__for_each_cpu(cpu, i, cpu_map) { + assert(offset < priv_size); + cs_etm_get_metadata(cpu, &offset, itr, info); } perf_cpu_map__put(online_cpus); diff --git a/tools/perf/arch/arm64/util/arm-spe.c b/tools/perf/arch/arm64/util/arm-spe.c index 51ccbfd3d2..0b52e67edb 100644 --- a/tools/perf/arch/arm64/util/arm-spe.c +++ b/tools/perf/arch/arm64/util/arm-spe.c @@ -232,7 +232,7 @@ static int arm_spe_recording_options(struct auxtrace_record *itr, * In the case of per-cpu mmaps, sample CPU for AUX event; * also enable the timestamp tracing for samples correlation. */ - if (!perf_cpu_map__has_any_cpu_or_is_empty(cpus)) { + if (!perf_cpu_map__is_any_cpu_or_is_empty(cpus)) { evsel__set_sample_bit(arm_spe_evsel, CPU); evsel__set_config_if_unset(arm_spe_pmu, arm_spe_evsel, "ts_enable", 1); @@ -265,7 +265,7 @@ static int arm_spe_recording_options(struct auxtrace_record *itr, tracking_evsel->core.attr.sample_period = 1; /* In per-cpu case, always need the time of mmap events etc */ - if (!perf_cpu_map__has_any_cpu_or_is_empty(cpus)) { + if (!perf_cpu_map__is_any_cpu_or_is_empty(cpus)) { evsel__set_sample_bit(tracking_evsel, TIME); evsel__set_sample_bit(tracking_evsel, CPU); diff --git a/tools/perf/arch/arm64/util/header.c b/tools/perf/arch/arm64/util/header.c index 9703749915..741df3614a 100644 --- a/tools/perf/arch/arm64/util/header.c +++ b/tools/perf/arch/arm64/util/header.c @@ -4,8 +4,6 @@ #include #include #include -#include -#include #include #include #include "debug.h" @@ -19,20 +17,18 @@ static int _get_cpuid(char *buf, size_t sz, struct perf_cpu_map *cpus) { const char *sysfs = sysfs__mountpoint(); - int cpu; - int ret = EINVAL; + struct perf_cpu cpu; + int idx, ret = EINVAL; if (!sysfs || sz < MIDR_SIZE) return EINVAL; - cpus = perf_cpu_map__get(cpus); - - for (cpu = 0; cpu < perf_cpu_map__nr(cpus); cpu++) { + perf_cpu_map__for_each_cpu(cpu, idx, cpus) { char path[PATH_MAX]; FILE *file; scnprintf(path, PATH_MAX, "%s/devices/system/cpu/cpu%d" MIDR, - 
sysfs, RC_CHK_ACCESS(cpus)->map[cpu].cpu); + sysfs, cpu.cpu); file = fopen(path, "r"); if (!file) { @@ -51,7 +47,6 @@ static int _get_cpuid(char *buf, size_t sz, struct perf_cpu_map *cpus) break; } - perf_cpu_map__put(cpus); return ret; } diff --git a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl index 532b855df5..1464c6be6e 100644 --- a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl +++ b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl @@ -376,3 +376,4 @@ 459 n64 lsm_get_self_attr sys_lsm_get_self_attr 460 n64 lsm_set_self_attr sys_lsm_set_self_attr 461 n64 lsm_list_modules sys_lsm_list_modules +462 n64 mseal sys_mseal diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index 17173b82ca..3656f1ca7a 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -548,3 +548,4 @@ 459 common lsm_get_self_attr sys_lsm_get_self_attr 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules +462 common mseal sys_mseal diff --git a/tools/perf/arch/powerpc/util/skip-callchain-idx.c b/tools/perf/arch/powerpc/util/skip-callchain-idx.c index 5f3edb3004..356786432f 100644 --- a/tools/perf/arch/powerpc/util/skip-callchain-idx.c +++ b/tools/perf/arch/powerpc/util/skip-callchain-idx.c @@ -159,9 +159,9 @@ static int check_return_addr(struct dso *dso, u64 map_start, Dwarf_Addr pc) Dwarf_Addr start = pc; Dwarf_Addr end = pc; bool signalp; - const char *exec_file = dso->long_name; + const char *exec_file = dso__long_name(dso); - dwfl = dso->dwfl; + dwfl = RC_CHK_ACCESS(dso)->dwfl; if (!dwfl) { dwfl = dwfl_begin(&offline_callbacks); @@ -183,7 +183,7 @@ static int check_return_addr(struct dso *dso, u64 map_start, Dwarf_Addr pc) dwfl_end(dwfl); goto out; } - dso->dwfl = dwfl; + RC_CHK_ACCESS(dso)->dwfl = dwfl; } mod = dwfl_addrmodule(dwfl, pc); @@ -267,7 
+267,7 @@ int arch_skip_callchain_idx(struct thread *thread, struct ip_callchain *chain) rc = check_return_addr(dso, map__start(al.map), ip); pr_debug("[DSO %s, sym %s, ip 0x%" PRIx64 "] rc %d\n", - dso->long_name, al.sym->name, ip, rc); + dso__long_name(dso), al.sym->name, ip, rc); if (rc == 0) { /* diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl index 095bb86339..bd0fee24ad 100644 --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -464,3 +464,4 @@ 459 common lsm_get_self_attr sys_lsm_get_self_attr sys_lsm_get_self_attr 460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules +462 common mseal sys_mseal sys_mseal diff --git a/tools/perf/arch/x86/Build b/tools/perf/arch/x86/Build index a7dd46a5b6..ed37013b42 100644 --- a/tools/perf/arch/x86/Build +++ b/tools/perf/arch/x86/Build @@ -1,2 +1,16 @@ perf-y += util/ perf-y += tests/ + +ifdef SHELLCHECK + SHELL_TESTS := entry/syscalls/syscalltbl.sh + TEST_LOGS := $(SHELL_TESTS:%=%.shellcheck_log) +else + SHELL_TESTS := + TEST_LOGS := +endif + +$(OUTPUT)%.shellcheck_log: % + $(call rule_mkdir) + $(Q)$(call echo-cmd,test)shellcheck -a -S warning "$<" > $@ || (cat $@ && rm $@ && false) + +perf-y += $(TEST_LOGS) diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index 7e8d46f414..a396f6e6ab 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -374,7 +374,7 @@ 450 common set_mempolicy_home_node sys_set_mempolicy_home_node 451 common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 -453 64 map_shadow_stack sys_map_shadow_stack +453 common map_shadow_stack sys_map_shadow_stack 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait 456 common futex_requeue 
sys_futex_requeue @@ -383,6 +383,7 @@ 459 common lsm_get_self_attr sys_lsm_get_self_attr 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules +462 common mseal sys_mseal # # Due to a historical design error, certain syscalls are numbered differently diff --git a/tools/perf/arch/x86/tests/Build b/tools/perf/arch/x86/tests/Build index b87f46e5fe..c1e3b7d395 100644 --- a/tools/perf/arch/x86/tests/Build +++ b/tools/perf/arch/x86/tests/Build @@ -10,3 +10,17 @@ perf-$(CONFIG_AUXTRACE) += insn-x86.o endif perf-$(CONFIG_X86_64) += bp-modify.o perf-y += amd-ibs-via-core-pmu.o + +ifdef SHELLCHECK + SHELL_TESTS := gen-insn-x86-dat.sh + TEST_LOGS := $(SHELL_TESTS:%=%.shellcheck_log) +else + SHELL_TESTS := + TEST_LOGS := +endif + +$(OUTPUT)%.shellcheck_log: % + $(call rule_mkdir) + $(Q)$(call echo-cmd,test)shellcheck -a -S warning "$<" > $@ || (cat $@ && rm $@ && false) + +perf-y += $(TEST_LOGS) diff --git a/tools/perf/arch/x86/tests/gen-insn-x86-dat.sh b/tools/perf/arch/x86/tests/gen-insn-x86-dat.sh index 0d0a003a9c..89c46532cd 100755 --- a/tools/perf/arch/x86/tests/gen-insn-x86-dat.sh +++ b/tools/perf/arch/x86/tests/gen-insn-x86-dat.sh @@ -11,7 +11,7 @@ if [ "$(uname -m)" != "x86_64" ]; then exit 1 fi -cd $(dirname $0) +cd "$(dirname $0)" trap 'echo "Might need a more recent version of binutils"' EXIT diff --git a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c index af8ae46475..34696f3d3d 100644 --- a/tools/perf/arch/x86/util/intel-bts.c +++ b/tools/perf/arch/x86/util/intel-bts.c @@ -143,7 +143,7 @@ static int intel_bts_recording_options(struct auxtrace_record *itr, if (!opts->full_auxtrace) return 0; - if (opts->full_auxtrace && !perf_cpu_map__has_any_cpu_or_is_empty(cpus)) { + if (opts->full_auxtrace && !perf_cpu_map__is_any_cpu_or_is_empty(cpus)) { pr_err(INTEL_BTS_PMU_NAME " does not support per-cpu recording\n"); return -EINVAL; } @@ -224,7 +224,7 @@ static int intel_bts_recording_options(struct 
auxtrace_record *itr, * In the case of per-cpu mmaps, we need the CPU on the * AUX event. */ - if (!perf_cpu_map__has_any_cpu_or_is_empty(cpus)) + if (!perf_cpu_map__is_any_cpu_or_is_empty(cpus)) evsel__set_sample_bit(intel_bts_evsel, CPU); } diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index d199619df3..4b710e8759 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -32,6 +32,7 @@ #include "../../../util/tsc.h" #include // page_size #include "../../../util/intel-pt.h" +#include #define KiB(x) ((x) * 1024) #define MiB(x) ((x) * 1024 * 1024) @@ -369,7 +370,7 @@ static int intel_pt_info_fill(struct auxtrace_record *itr, ui__warning("Intel Processor Trace: TSC not available\n"); } - per_cpu_mmaps = !perf_cpu_map__has_any_cpu_or_is_empty(session->evlist->core.user_requested_cpus); + per_cpu_mmaps = !perf_cpu_map__is_any_cpu_or_is_empty(session->evlist->core.user_requested_cpus); auxtrace_info->type = PERF_AUXTRACE_INTEL_PT; auxtrace_info->priv[INTEL_PT_PMU_TYPE] = intel_pt_pmu->type; @@ -428,6 +429,16 @@ static int intel_pt_track_switches(struct evlist *evlist) } #endif +static bool intel_pt_exclude_guest(void) +{ + int pt_mode; + + if (sysfs__read_int("module/kvm_intel/parameters/pt_mode", &pt_mode)) + pt_mode = 0; + + return pt_mode == 1; +} + static void intel_pt_valid_str(char *str, size_t len, u64 valid) { unsigned int val, last = 0, state = 1; @@ -620,6 +631,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, } evsel->core.attr.freq = 0; evsel->core.attr.sample_period = 1; + evsel->core.attr.exclude_guest = intel_pt_exclude_guest(); evsel->no_aux_samples = true; evsel->needs_auxtrace_mmap = true; intel_pt_evsel = evsel; @@ -758,7 +770,8 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, } if (!opts->auxtrace_snapshot_mode && !opts->auxtrace_sample_mode) { - u32 aux_watermark = opts->auxtrace_mmap_pages * page_size / 4; + size_t aw = 
opts->auxtrace_mmap_pages * (size_t)page_size / 4; + u32 aux_watermark = aw > UINT_MAX ? UINT_MAX : aw; intel_pt_evsel->core.attr.aux_watermark = aux_watermark; } @@ -774,7 +787,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, * Per-cpu recording needs sched_switch events to distinguish different * threads. */ - if (have_timing_info && !perf_cpu_map__has_any_cpu_or_is_empty(cpus) && + if (have_timing_info && !perf_cpu_map__is_any_cpu_or_is_empty(cpus) && !record_opts__no_switch_events(opts)) { if (perf_can_record_switch_events()) { bool cpu_wide = !target__none(&opts->target) && @@ -832,7 +845,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, * In the case of per-cpu mmaps, we need the CPU on the * AUX event. */ - if (!perf_cpu_map__has_any_cpu_or_is_empty(cpus)) + if (!perf_cpu_map__is_any_cpu_or_is_empty(cpus)) evsel__set_sample_bit(intel_pt_evsel, CPU); } @@ -858,7 +871,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, tracking_evsel->immediate = true; /* In per-cpu case, always need the time of mmap events etc */ - if (!perf_cpu_map__has_any_cpu_or_is_empty(cpus)) { + if (!perf_cpu_map__is_any_cpu_or_is_empty(cpus)) { evsel__set_sample_bit(tracking_evsel, TIME); /* And the CPU for switch events */ evsel__set_sample_bit(tracking_evsel, CPU); @@ -870,7 +883,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, * Warn the user when we do not have enough information to decode i.e. * per-cpu with no sched_switch (except workload-only). 
*/ - if (!ptr->have_sched_switch && !perf_cpu_map__has_any_cpu_or_is_empty(cpus) && + if (!ptr->have_sched_switch && !perf_cpu_map__is_any_cpu_or_is_empty(cpus) && !target__none(&opts->target) && !intel_pt_evsel->core.attr.exclude_user) ui__warning("Intel Processor Trace decoding will not be possible except for kernel tracing!\n"); diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h index faa18e6d24..9f736423af 100644 --- a/tools/perf/bench/bench.h +++ b/tools/perf/bench/bench.h @@ -46,6 +46,8 @@ int bench_breakpoint_enable(int argc, const char **argv); int bench_uprobe_baseline(int argc, const char **argv); int bench_uprobe_empty(int argc, const char **argv); int bench_uprobe_trace_printk(int argc, const char **argv); +int bench_uprobe_empty_ret(int argc, const char **argv); +int bench_uprobe_trace_printk_ret(int argc, const char **argv); int bench_pmu_scan(int argc, const char **argv); #define BENCH_FORMAT_DEFAULT_STR "default" diff --git a/tools/perf/bench/uprobe.c b/tools/perf/bench/uprobe.c index b722ff88fe..0b90275862 100644 --- a/tools/perf/bench/uprobe.c +++ b/tools/perf/bench/uprobe.c @@ -26,9 +26,11 @@ static int loops = LOOPS_DEFAULT; enum bench_uprobe { - BENCH_UPROBE__BASELINE, - BENCH_UPROBE__EMPTY, - BENCH_UPROBE__TRACE_PRINTK, + BENCH_UPROBE__BASELINE, + BENCH_UPROBE__EMPTY, + BENCH_UPROBE__TRACE_PRINTK, + BENCH_UPROBE__EMPTY_RET, + BENCH_UPROBE__TRACE_PRINTK_RET, }; static const struct option options[] = { @@ -81,6 +83,8 @@ static int bench_uprobe__setup_bpf_skel(enum bench_uprobe bench) case BENCH_UPROBE__BASELINE: break; case BENCH_UPROBE__EMPTY: bench_uprobe__attach_uprobe(empty); break; case BENCH_UPROBE__TRACE_PRINTK: bench_uprobe__attach_uprobe(trace_printk); break; + case BENCH_UPROBE__EMPTY_RET: bench_uprobe__attach_uprobe(empty_ret); break; + case BENCH_UPROBE__TRACE_PRINTK_RET: bench_uprobe__attach_uprobe(trace_printk_ret); break; default: fprintf(stderr, "Invalid bench: %d\n", bench); goto cleanup; @@ -197,3 +201,13 @@ int 
bench_uprobe_trace_printk(int argc, const char **argv) { return bench_uprobe(argc, argv, BENCH_UPROBE__TRACE_PRINTK); } + +int bench_uprobe_empty_ret(int argc, const char **argv) +{ + return bench_uprobe(argc, argv, BENCH_UPROBE__EMPTY_RET); +} + +int bench_uprobe_trace_printk_ret(int argc, const char **argv) +{ + return bench_uprobe(argc, argv, BENCH_UPROBE__TRACE_PRINTK_RET); +} diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 9cd97fd76b..50d2fb222d 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -37,11 +37,13 @@ #include "util/map_symbol.h" #include "util/branch.h" #include "util/util.h" +#include "ui/progress.h" #include #include #include #include +#include struct perf_annotate { struct perf_tool tool; @@ -217,7 +219,7 @@ static int process_branch_callback(struct evsel *evsel, } if (a.map != NULL) - map__dso(a.map)->hit = 1; + dso__set_hit(map__dso(a.map)); hist__account_cycles(sample->branch_stack, al, sample, false, NULL); @@ -252,7 +254,7 @@ static int evsel__add_sample(struct evsel *evsel, struct perf_sample *sample, if (al->sym != NULL) { struct dso *dso = map__dso(al->map); - rb_erase_cached(&al->sym->rb_node, &dso->symbols); + rb_erase_cached(&al->sym->rb_node, dso__symbols(dso)); symbol__delete(al->sym); dso__reset_find_symbol_cache(dso); } @@ -327,77 +329,6 @@ static int hist_entry__tty_annotate(struct hist_entry *he, return symbol__tty_annotate2(&he->ms, evsel); } -static void print_annotated_data_header(struct hist_entry *he, struct evsel *evsel) -{ - struct dso *dso = map__dso(he->ms.map); - int nr_members = 1; - int nr_samples = he->stat.nr_events; - - if (evsel__is_group_event(evsel)) { - struct hist_entry *pair; - - list_for_each_entry(pair, &he->pairs.head, pairs.node) - nr_samples += pair->stat.nr_events; - } - - printf("Annotate type: '%s' in %s (%d samples):\n", - he->mem_type->self.type_name, dso->name, nr_samples); - - if (evsel__is_group_event(evsel)) { - struct evsel *pos; 
- int i = 0; - - for_each_group_evsel(pos, evsel) - printf(" event[%d] = %s\n", i++, pos->name); - - nr_members = evsel->core.nr_members; - } - - printf("============================================================================\n"); - printf("%*s %10s %10s %s\n", 11 * nr_members, "samples", "offset", "size", "field"); -} - -static void print_annotated_data_type(struct annotated_data_type *mem_type, - struct annotated_member *member, - struct evsel *evsel, int indent) -{ - struct annotated_member *child; - struct type_hist *h = mem_type->histograms[evsel->core.idx]; - int i, nr_events = 1, samples = 0; - - for (i = 0; i < member->size; i++) - samples += h->addr[member->offset + i].nr_samples; - printf(" %10d", samples); - - if (evsel__is_group_event(evsel)) { - struct evsel *pos; - - for_each_group_member(pos, evsel) { - h = mem_type->histograms[pos->core.idx]; - - samples = 0; - for (i = 0; i < member->size; i++) - samples += h->addr[member->offset + i].nr_samples; - printf(" %10d", samples); - } - nr_events = evsel->core.nr_members; - } - - printf(" %10d %10d %*s%s\t%s", - member->offset, member->size, indent, "", member->type_name, - member->var_name ?: ""); - - if (!list_empty(&member->children)) - printf(" {\n"); - - list_for_each_entry(child, &member->children, node) - print_annotated_data_type(mem_type, child, evsel, indent + 4); - - if (!list_empty(&member->children)) - printf("%*s}", 11 * nr_events + 24 + indent, ""); - printf(";\n"); -} - static void print_annotate_data_stat(struct annotated_data_stat *s) { #define PRINT_STAT(fld) if (s->fld) printf("%10d : %s\n", s->fld, #fld) @@ -430,6 +361,7 @@ static void print_annotate_data_stat(struct annotated_data_stat *s) PRINT_STAT(no_typeinfo); PRINT_STAT(invalid_size); PRINT_STAT(bad_offset); + PRINT_STAT(insn_track); printf("\n"); #undef PRINT_STAT @@ -487,7 +419,7 @@ static void hists__find_annotations(struct hists *hists, struct hist_entry *he = rb_entry(nd, struct hist_entry, rb_node); struct annotation 
*notes; - if (he->ms.sym == NULL || map__dso(he->ms.map)->annotate_warned) + if (he->ms.sym == NULL || dso__annotate_warned(map__dso(he->ms.map))) goto find_next; if (ann->sym_hist_filter && @@ -537,10 +469,32 @@ find_next: goto find_next; } - print_annotated_data_header(he, evsel); - print_annotated_data_type(he->mem_type, &he->mem_type->self, evsel, 0); - printf("\n"); - goto find_next; + if (use_browser == 1) + key = hist_entry__annotate_data_tui(he, evsel, NULL); + else + key = hist_entry__annotate_data_tty(he, evsel); + + switch (key) { + case -1: + if (!ann->skip_missing) + return; + /* fall through */ + case K_RIGHT: + case '>': + next = rb_next(nd); + break; + case K_LEFT: + case '<': + next = rb_prev(nd); + break; + default: + return; + } + + if (use_browser == 0 || next != NULL) + nd = next; + + continue; } if (use_browser == 2) { @@ -632,13 +586,23 @@ static int __cmd_annotate(struct perf_annotate *ann) evlist__for_each_entry(session->evlist, pos) { struct hists *hists = evsel__hists(pos); u32 nr_samples = hists->stats.nr_samples; + struct ui_progress prog; if (nr_samples > 0) { total_nr_samples += nr_samples; - hists__collapse_resort(hists, NULL); + + ui_progress__init(&prog, nr_samples, + "Merging related events..."); + hists__collapse_resort(hists, &prog); + ui_progress__finish(); + /* Don't sort callchain */ evsel__reset_sample_bit(pos, CALLCHAIN); - evsel__output_resort(pos, NULL); + + ui_progress__init(&prog, nr_samples, + "Sorting events for output..."); + evsel__output_resort(pos, &prog); + ui_progress__finish(); /* * An event group needs to display other events too. 
@@ -933,9 +897,7 @@ int cmd_annotate(int argc, const char **argv) use_browser = 2; #endif - /* FIXME: only support stdio for now */ if (annotate.data_type) { - use_browser = 0; annotate_opts.annotate_src = false; symbol_conf.annotate_data_member = true; symbol_conf.annotate_data_sample = true; diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c index 1a8898d5b5..2c1a9f3d84 100644 --- a/tools/perf/builtin-bench.c +++ b/tools/perf/builtin-bench.c @@ -109,6 +109,8 @@ static struct bench uprobe_benchmarks[] = { { "baseline", "Baseline libc usleep(1000) call", bench_uprobe_baseline, }, { "empty", "Attach empty BPF prog to uprobe on usleep, system wide", bench_uprobe_empty, }, { "trace_printk", "Attach trace_printk BPF prog to uprobe on usleep syswide", bench_uprobe_trace_printk, }, + { "empty_ret", "Attach empty BPF prog to uretprobe on usleep, system wide", bench_uprobe_empty_ret, }, + { "trace_printk_ret", "Attach trace_printk BPF prog to uretprobe on usleep syswide", bench_uprobe_trace_printk_ret,}, { NULL, NULL, NULL }, }; diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c index e2a40f1d92..b0511d16ae 100644 --- a/tools/perf/builtin-buildid-cache.c +++ b/tools/perf/builtin-buildid-cache.c @@ -286,7 +286,7 @@ static bool dso__missing_buildid_cache(struct dso *dso, int parm __maybe_unused) pr_warning("Problems with %s file, consider removing it from the cache\n", filename); - } else if (memcmp(dso->bid.data, bid.data, bid.size)) { + } else if (memcmp(dso__bid(dso)->data, bid.data, bid.size)) { pr_warning("Problems with %s file, consider removing it from the cache\n", filename); } diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c index c903747786..383d5de36c 100644 --- a/tools/perf/builtin-buildid-list.c +++ b/tools/perf/builtin-buildid-list.c @@ -26,16 +26,18 @@ static int buildid__map_cb(struct map *map, void *arg __maybe_unused) { const struct dso *dso = map__dso(map); char 
bid_buf[SBUILD_ID_SIZE]; + const char *dso_long_name = dso__long_name(dso); + const char *dso_short_name = dso__short_name(dso); memset(bid_buf, 0, sizeof(bid_buf)); - if (dso->has_build_id) - build_id__sprintf(&dso->bid, bid_buf); + if (dso__has_build_id(dso)) + build_id__sprintf(dso__bid_const(dso), bid_buf); printf("%s %16" PRIx64 " %16" PRIx64, bid_buf, map__start(map), map__end(map)); - if (dso->long_name != NULL) { - printf(" %s", dso->long_name); - } else if (dso->short_name != NULL) { - printf(" %s", dso->short_name); - } + if (dso_long_name != NULL) + printf(" %s", dso_long_name); + else if (dso_short_name != NULL) + printf(" %s", dso_short_name); + printf("\n"); return 0; @@ -76,7 +78,7 @@ static int filename__fprintf_build_id(const char *name, FILE *fp) static bool dso__skip_buildid(struct dso *dso, int with_hits) { - return with_hits && !dso->hit; + return with_hits && !dso__hit(dso); } static int perf_session__list_build_ids(bool force, bool with_hits) diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index 16b40f5d43..c157bd31f2 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -38,6 +38,7 @@ #include "ui/browsers/hists.h" #include "thread.h" #include "mem2node.h" +#include "mem-info.h" #include "symbol.h" #include "ui/ui.h" #include "ui/progress.h" @@ -529,7 +530,7 @@ static int dcacheline_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, char buf[20]; if (he->mem_info) - addr = cl_address(he->mem_info->daddr.addr, chk_double_cl); + addr = cl_address(mem_info__daddr(he->mem_info)->addr, chk_double_cl); return scnprintf(hpp->buf, hpp->size, "%*s", width, HEX_STR(buf, addr)); } @@ -567,7 +568,7 @@ static int offset_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, char buf[20]; if (he->mem_info) - addr = cl_offset(he->mem_info->daddr.al_addr, chk_double_cl); + addr = cl_offset(mem_info__daddr(he->mem_info)->al_addr, chk_double_cl); return scnprintf(hpp->buf, hpp->size, "%*s", width, HEX_STR(buf, addr)); } @@ 
-579,10 +580,10 @@ offset_cmp(struct perf_hpp_fmt *fmt __maybe_unused, uint64_t l = 0, r = 0; if (left->mem_info) - l = cl_offset(left->mem_info->daddr.addr, chk_double_cl); + l = cl_offset(mem_info__daddr(left->mem_info)->addr, chk_double_cl); if (right->mem_info) - r = cl_offset(right->mem_info->daddr.addr, chk_double_cl); + r = cl_offset(mem_info__daddr(right->mem_info)->addr, chk_double_cl); return (int64_t)(r - l); } @@ -596,7 +597,7 @@ iaddr_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, char buf[20]; if (he->mem_info) - addr = he->mem_info->iaddr.addr; + addr = mem_info__iaddr(he->mem_info)->addr; return scnprintf(hpp->buf, hpp->size, "%*s", width, HEX_STR(buf, addr)); } @@ -2050,7 +2051,7 @@ static int hpp_list__parse(struct perf_hpp_list *hpp_list, perf_hpp__setup_output_field(hpp_list); /* - * We dont need other sorting keys other than those + * We don't need other sorting keys other than those * we already specified. It also really slows down * the processing a lot with big number of output * fields, so switching this off for c2c. 
@@ -2319,11 +2320,7 @@ static int setup_nodes(struct perf_session *session) nodes[node] = set; - /* empty node, skip */ - if (perf_cpu_map__has_any_cpu_or_is_empty(map)) - continue; - - perf_cpu_map__for_each_cpu(cpu, idx, map) { + perf_cpu_map__for_each_cpu_skip_any(cpu, idx, map) { __set_bit(cpu.cpu, set); if (WARN_ONCE(cpu2node[cpu.cpu] != -1, "node/cpu topology bug")) @@ -2596,7 +2593,7 @@ perf_c2c_cacheline_browser__title(struct hist_browser *browser, he = cl_browser->he; if (he->mem_info) - addr = cl_address(he->mem_info->daddr.addr, chk_double_cl); + addr = cl_address(mem_info__daddr(he->mem_info)->addr, chk_double_cl); scnprintf(bf, size, "Cacheline 0x%lx", addr); return 0; diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index eb3ef5c24b..a212678d47 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -445,10 +445,9 @@ static struct dso *findnew_dso(int pid, int tid, const char *filename, } if (dso) { - mutex_lock(&dso->lock); - nsinfo__put(dso->nsinfo); - dso->nsinfo = nsi; - mutex_unlock(&dso->lock); + mutex_lock(dso__lock(dso)); + dso__set_nsinfo(dso, nsi); + mutex_unlock(dso__lock(dso)); } else nsinfo__put(nsi); @@ -466,8 +465,8 @@ static int perf_event__repipe_buildid_mmap(struct perf_tool *tool, dso = findnew_dso(event->mmap.pid, event->mmap.tid, event->mmap.filename, NULL, machine); - if (dso && !dso->hit) { - dso->hit = 1; + if (dso && !dso__hit(dso)) { + dso__set_hit(dso); dso__inject_build_id(dso, tool, machine, sample->cpumode, 0); } dso__put(dso); @@ -492,7 +491,7 @@ static int perf_event__repipe_mmap2(struct perf_tool *tool, event->mmap2.filename, NULL, machine); if (dso) { /* mark it not to inject build-id */ - dso->hit = 1; + dso__set_hit(dso); } dso__put(dso); } @@ -544,7 +543,7 @@ static int perf_event__repipe_buildid_mmap2(struct perf_tool *tool, event->mmap2.filename, NULL, machine); if (dso) { /* mark it not to inject build-id */ - dso->hit = 1; + dso__set_hit(dso); } dso__put(dso); 
perf_event__repipe(tool, event, sample, machine); @@ -554,8 +553,8 @@ static int perf_event__repipe_buildid_mmap2(struct perf_tool *tool, dso = findnew_dso(event->mmap2.pid, event->mmap2.tid, event->mmap2.filename, &dso_id, machine); - if (dso && !dso->hit) { - dso->hit = 1; + if (dso && !dso__hit(dso)) { + dso__set_hit(dso); dso__inject_build_id(dso, tool, machine, sample->cpumode, event->mmap2.flags); } @@ -631,24 +630,24 @@ static int dso__read_build_id(struct dso *dso) { struct nscookie nsc; - if (dso->has_build_id) + if (dso__has_build_id(dso)) return 0; - mutex_lock(&dso->lock); - nsinfo__mountns_enter(dso->nsinfo, &nsc); - if (filename__read_build_id(dso->long_name, &dso->bid) > 0) - dso->has_build_id = true; - else if (dso->nsinfo) { - char *new_name = dso__filename_with_chroot(dso, dso->long_name); + mutex_lock(dso__lock(dso)); + nsinfo__mountns_enter(dso__nsinfo(dso), &nsc); + if (filename__read_build_id(dso__long_name(dso), dso__bid(dso)) > 0) + dso__set_has_build_id(dso); + else if (dso__nsinfo(dso)) { + char *new_name = dso__filename_with_chroot(dso, dso__long_name(dso)); - if (new_name && filename__read_build_id(new_name, &dso->bid) > 0) - dso->has_build_id = true; + if (new_name && filename__read_build_id(new_name, dso__bid(dso)) > 0) + dso__set_has_build_id(dso); free(new_name); } nsinfo__mountns_exit(&nsc); - mutex_unlock(&dso->lock); + mutex_unlock(dso__lock(dso)); - return dso->has_build_id ? 0 : -1; + return dso__has_build_id(dso) ? 
0 : -1; } static struct strlist *perf_inject__parse_known_build_ids( @@ -700,14 +699,14 @@ static bool perf_inject__lookup_known_build_id(struct perf_inject *inject, dso_name = strchr(build_id, ' '); bid_len = dso_name - pos->s; dso_name = skip_spaces(dso_name); - if (strcmp(dso->long_name, dso_name)) + if (strcmp(dso__long_name(dso), dso_name)) continue; for (int ix = 0; 2 * ix + 1 < bid_len; ++ix) { - dso->bid.data[ix] = (hex(build_id[2 * ix]) << 4 | - hex(build_id[2 * ix + 1])); + dso__bid(dso)->data[ix] = (hex(build_id[2 * ix]) << 4 | + hex(build_id[2 * ix + 1])); } - dso->bid.size = bid_len / 2; - dso->has_build_id = 1; + dso__bid(dso)->size = bid_len / 2; + dso__set_has_build_id(dso); return true; } return false; @@ -720,9 +719,9 @@ static int dso__inject_build_id(struct dso *dso, struct perf_tool *tool, tool); int err; - if (is_anon_memory(dso->long_name) || flags & MAP_HUGETLB) + if (is_anon_memory(dso__long_name(dso)) || flags & MAP_HUGETLB) return 0; - if (is_no_dso_memory(dso->long_name)) + if (is_no_dso_memory(dso__long_name(dso))) return 0; if (inject->known_build_ids != NULL && @@ -730,14 +729,14 @@ static int dso__inject_build_id(struct dso *dso, struct perf_tool *tool, return 1; if (dso__read_build_id(dso) < 0) { - pr_debug("no build_id found for %s\n", dso->long_name); + pr_debug("no build_id found for %s\n", dso__long_name(dso)); return -1; } err = perf_event__synthesize_build_id(tool, dso, cpumode, perf_event__repipe, machine); if (err) { - pr_err("Can't synthesize build_id event for %s\n", dso->long_name); + pr_err("Can't synthesize build_id event for %s\n", dso__long_name(dso)); return -1; } @@ -763,8 +762,8 @@ int perf_event__inject_buildid(struct perf_tool *tool, union perf_event *event, if (thread__find_map(thread, sample->cpumode, sample->ip, &al)) { struct dso *dso = map__dso(al.map); - if (!dso->hit) { - dso->hit = 1; + if (!dso__hit(dso)) { + dso__set_hit(dso); dso__inject_build_id(dso, tool, machine, sample->cpumode, 
map__flags(al.map)); } @@ -1146,8 +1145,8 @@ static bool dso__is_in_kernel_space(struct dso *dso) return false; return dso__is_kcore(dso) || - dso->kernel || - is_kernel_module(dso->long_name, PERF_RECORD_MISC_CPUMODE_UNKNOWN); + dso__kernel(dso) || + is_kernel_module(dso__long_name(dso), PERF_RECORD_MISC_CPUMODE_UNKNOWN); } static u64 evlist__first_id(struct evlist *evlist) @@ -1181,29 +1180,34 @@ static int synthesize_build_id(struct perf_inject *inject, struct dso *dso, pid_ if (!machine) return -ENOMEM; - dso->hit = 1; + dso__set_hit(dso); return perf_event__synthesize_build_id(&inject->tool, dso, cpumode, process_build_id, machine); } +static int guest_session__add_build_ids_cb(struct dso *dso, void *data) +{ + struct guest_session *gs = data; + struct perf_inject *inject = container_of(gs, struct perf_inject, guest_session); + + if (!dso__has_build_id(dso)) + return 0; + + return synthesize_build_id(inject, dso, gs->machine_pid); + +} + static int guest_session__add_build_ids(struct guest_session *gs) { struct perf_inject *inject = container_of(gs, struct perf_inject, guest_session); - struct machine *machine = &gs->session->machines.host; - struct dso *dso; - int ret; /* Build IDs will be put in the Build ID feature section */ perf_header__set_feat(&inject->session->header, HEADER_BUILD_ID); - dsos__for_each_with_build_id(dso, &machine->dsos.head) { - ret = synthesize_build_id(inject, dso, gs->machine_pid); - if (ret) - return ret; - } - - return 0; + return dsos__for_each_dso(&gs->session->machines.host.dsos, + guest_session__add_build_ids_cb, + gs); } static int guest_session__ksymbol_event(struct perf_tool *tool, @@ -2122,7 +2126,7 @@ static int __cmd_inject(struct perf_inject *inject) */ if (perf_header__has_feat(&session->header, HEADER_BUILD_ID) && inject->have_auxtrace && !inject->itrace_synth_opts.set) - dsos__hit_all(session); + perf_session__dsos_hit_all(session); /* * The AUX areas have been removed and replaced with * synthesized hardware events, 
so clear the feature flag. diff --git a/tools/perf/builtin-kallsyms.c b/tools/perf/builtin-kallsyms.c index 7f75c5b73f..a3c2ffdc1a 100644 --- a/tools/perf/builtin-kallsyms.c +++ b/tools/perf/builtin-kallsyms.c @@ -38,7 +38,7 @@ static int __cmd_kallsyms(int argc, const char **argv) dso = map__dso(map); printf("%s: %s %s %#" PRIx64 "-%#" PRIx64 " (%#" PRIx64 "-%#" PRIx64")\n", - symbol->name, dso->short_name, dso->long_name, + symbol->name, dso__short_name(dso), dso__long_name(dso), map__unmap_ip(map, symbol->start), map__unmap_ip(map, symbol->end), symbol->start, symbol->end); } diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 9714327fd0..6fd95be503 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -1408,7 +1408,7 @@ static int __cmd_kmem(struct perf_session *session) } evlist__for_each_entry(session->evlist, evsel) { - if (!strcmp(evsel__name(evsel), "kmem:mm_page_alloc") && + if (evsel__name_is(evsel, "kmem:mm_page_alloc") && evsel__field(evsel, "pfn")) { use_pfn = true; break; diff --git a/tools/perf/builtin-kwork.c b/tools/perf/builtin-kwork.c index 0092b9b396..56e3f3a5e0 100644 --- a/tools/perf/builtin-kwork.c +++ b/tools/perf/builtin-kwork.c @@ -2230,7 +2230,7 @@ static int perf_kwork__top(struct perf_kwork *kwork) perf_kwork__top_report(kwork); out: - free(kwork->top_stat.cpus_runtime); + zfree(&kwork->top_stat.cpus_runtime); return ret; } diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c index 02bf608d58..5cab312315 100644 --- a/tools/perf/builtin-list.c +++ b/tools/perf/builtin-list.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -76,26 +77,38 @@ static void default_print_start(void *ps) static void default_print_end(void *print_state __maybe_unused) {} +static const char *skip_spaces_or_commas(const char *str) +{ + while (isspace(*str) || *str == ',') + ++str; + return str; +} + static void wordwrap(FILE *fp, const char *s, int start, int max, int corr) { int 
column = start; int n; bool saw_newline = false; + bool comma = false; while (*s) { - int wlen = strcspn(s, " \t\n"); + int wlen = strcspn(s, " ,\t\n"); + const char *sep = comma ? "," : " "; if ((column + wlen >= max && column > start) || saw_newline) { - fprintf(fp, "\n%*s", start, ""); + fprintf(fp, comma ? ",\n%*s" : "\n%*s", start, ""); column = start + corr; } - n = fprintf(fp, "%s%.*s", column > start ? " " : "", wlen, s); + if (column <= start) + sep = ""; + n = fprintf(fp, "%s%.*s", sep, wlen, s); if (n <= 0) break; saw_newline = s[wlen] == '\n'; s += wlen; + comma = s[0] == ','; column += n; - s = skip_spaces(s); + s = skip_spaces_or_commas(s); } } @@ -313,6 +326,9 @@ static void fix_escape_fprintf(FILE *fp, struct strbuf *buf, const char *fmt, .. case '\n': strbuf_addstr(buf, "\\n"); break; + case '\r': + strbuf_addstr(buf, "\\r"); + break; case '\\': fallthrough; case '\"': diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 230461280e..7007d26fe6 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -2275,23 +2275,13 @@ setup_args: return -ENOMEM; for (i = 0; i < ARRAY_SIZE(record_args); i++) - rec_argv[i] = strdup(record_args[i]); + rec_argv[i] = record_args[i]; for (j = 0; j < nr_tracepoints; j++) { - const char *ev_name; - - if (has_lock_stat) - ev_name = strdup(lock_tracepoints[j].name); - else - ev_name = strdup(contention_tracepoints[j].name); - - if (!ev_name) { - free(rec_argv); - return -ENOMEM; - } - rec_argv[i++] = "-e"; - rec_argv[i++] = ev_name; + rec_argv[i++] = has_lock_stat + ? 
lock_tracepoints[j].name + : contention_tracepoints[j].name; } for (j = 0; j < nr_callgraph_args; j++, i++) diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 5b851e64e4..863fcd735d 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -213,7 +213,7 @@ dump_raw_samples(struct perf_tool *tool, if (al.map != NULL) { dso = map__dso(al.map); if (dso) - dso->hit = 1; + dso__set_hit(dso); } field_sep = symbol_conf.field_sep; @@ -255,7 +255,7 @@ dump_raw_samples(struct perf_tool *tool, symbol_conf.field_sep, sample->data_src, symbol_conf.field_sep, - dso ? dso->long_name : "???", + dso ? dso__long_name(dso) : "???", al.sym ? al.sym->name : "???"); out_put: addr_location__exit(&al); diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c index 019fef8da6..003a3bcebf 100644 --- a/tools/perf/builtin-probe.c +++ b/tools/perf/builtin-probe.c @@ -325,7 +325,7 @@ static void cleanup_params(void) for (i = 0; i < params->nevents; i++) clear_perf_probe_event(params->events + i); line_range__clear(&params->line_range); - free(params->target); + zfree(&params->target); strfilter__delete(params->filter); nsinfo__put(params->nsi); zfree(&params); diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 45c755fb50..0a8ba1323d 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -332,7 +332,7 @@ static int record__aio_complete(struct mmap *md, struct aiocb *cblock) } else { /* * aio write request may require restart with the - * reminder if the kernel didn't write whole + * remainder if the kernel didn't write whole * chunk at once. */ rem_off = cblock->aio_offset + written; @@ -400,7 +400,7 @@ static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size * * Coping can be done in two steps in case the chunk of profiling data * crosses the upper bound of the kernel buffer.
In this case we first move - * part of data from map->start till the upper bound and then the reminder + * part of data from map->start till the upper bound and then the remainder * from the beginning of the kernel buffer till the end of the data chunk. */ @@ -1788,7 +1788,7 @@ record__finish_output(struct record *rec) process_buildids(rec); if (rec->buildid_all) - dsos__hit_all(rec->session); + perf_session__dsos_hit_all(rec->session); } perf_session__write_header(rec->session, rec->evlist, fd, true); diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 5b684d2ab4..69618fb011 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -31,6 +31,7 @@ #include "util/evsel.h" #include "util/evswitch.h" #include "util/header.h" +#include "util/mem-info.h" #include "util/session.h" #include "util/srcline.h" #include "util/tool.h" @@ -172,7 +173,7 @@ static int hist_iter__report_callback(struct hist_entry_iter *iter, struct mem_info *mi; struct branch_info *bi; - if (!ui__has_annotation() && !rep->symbol_ipc && !rep->data_type) + if (!ui__has_annotation() && !rep->symbol_ipc) return 0; if (sort__mode == SORT_MODE__BRANCH) { @@ -185,7 +186,7 @@ static int hist_iter__report_callback(struct hist_entry_iter *iter, } else if (rep->mem_mode) { mi = he->mem_info; - err = addr_map_symbol__inc_samples(&mi->daddr, sample, evsel); + err = addr_map_symbol__inc_samples(mem_info__daddr(mi), sample, evsel); if (err) goto out; @@ -322,7 +323,7 @@ static int process_sample_event(struct perf_tool *tool, } if (al.map != NULL) - map__dso(al.map)->hit = 1; + dso__set_hit(map__dso(al.map)); if (ui__has_annotation() || rep->symbol_ipc || rep->total_cycles_mode) { hist__account_cycles(sample->branch_stack, &al, sample, @@ -609,7 +610,7 @@ static void report__warn_kptr_restrict(const struct report *rep) return; if (kernel_map == NULL || - (map__dso(kernel_map)->hit && + (dso__hit(map__dso(kernel_map)) && (kernel_kmap->ref_reloc_sym == NULL || 
kernel_kmap->ref_reloc_sym->addr == 0))) { const char *desc = @@ -850,7 +851,7 @@ static int maps__fprintf_task_cb(struct map *map, void *data) prot & PROT_EXEC ? 'x' : '-', map__flags(map) ? 's' : 'p', map__pgoff(map), - dso->id.ino, dso->name); + dso__id_const(dso)->ino, dso__name(dso)); if (ret < 0) return ret; @@ -1694,6 +1695,11 @@ repeat: else use_browser = 0; + if (report.data_type && use_browser == 1) { + symbol_conf.annotate_data_member = true; + symbol_conf.annotate_data_sample = true; + } + if (sort_order && strstr(sort_order, "ipc")) { parse_options_usage(report_usage, options, "s", 1); goto error; diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 1bfb223473..5977c49ae2 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -2148,7 +2148,7 @@ static bool is_idle_sample(struct perf_sample *sample, struct evsel *evsel) { /* pid 0 == swapper == idle task */ - if (strcmp(evsel__name(evsel), "sched:sched_switch") == 0) + if (evsel__name_is(evsel, "sched:sched_switch")) return evsel__intval(evsel, sample, "prev_pid") == 0; return sample->pid == 0; @@ -2375,7 +2375,7 @@ static bool timehist_skip_sample(struct perf_sched *sched, } if (sched->idle_hist) { - if (strcmp(evsel__name(evsel), "sched:sched_switch")) + if (!evsel__name_is(evsel, "sched:sched_switch")) rc = true; else if (evsel__intval(evsel, sample, "prev_pid") != 0 && evsel__intval(evsel, sample, "next_pid") != 0) @@ -3213,7 +3213,7 @@ static int perf_sched__lat(struct perf_sched *sched) perf_sched__sort_lat(sched); printf("\n -------------------------------------------------------------------------------------------------------------------------------------------\n"); - printf(" Task | Runtime ms | Switches | Avg delay ms | Max delay ms | Max delay start | Max delay end |\n"); + printf(" Task | Runtime ms | Count | Avg delay ms | Max delay ms | Max delay start | Max delay end |\n"); printf(" 
-------------------------------------------------------------------------------------------------------------------------------------------\n"); next = rb_first_cached(&sched->sorted_atom_root); diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 2e7148d667..c16224b1fe 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -32,6 +32,7 @@ #include "util/time-utils.h" #include "util/path.h" #include "util/event.h" +#include "util/mem-info.h" #include "ui/ui.h" #include "print_binary.h" #include "print_insn.h" @@ -136,6 +137,7 @@ enum perf_output_field { PERF_OUTPUT_RETIRE_LAT = 1ULL << 40, PERF_OUTPUT_DSOFF = 1ULL << 41, PERF_OUTPUT_DISASM = 1ULL << 42, + PERF_OUTPUT_BRSTACKDISASM = 1ULL << 43, }; struct perf_script { @@ -210,6 +212,7 @@ struct output_option { {.str = "vcpu", .field = PERF_OUTPUT_VCPU}, {.str = "cgroup", .field = PERF_OUTPUT_CGROUP}, {.str = "retire_lat", .field = PERF_OUTPUT_RETIRE_LAT}, + {.str = "brstackdisasm", .field = PERF_OUTPUT_BRSTACKDISASM}, }; enum { @@ -510,7 +513,8 @@ static int evsel__check_attr(struct evsel *evsel, struct perf_session *session) "selected. 
Hence, no address to lookup the source line number.\n"); return -EINVAL; } - if ((PRINT_FIELD(BRSTACKINSN) || PRINT_FIELD(BRSTACKINSNLEN)) && !allow_user_set && + if ((PRINT_FIELD(BRSTACKINSN) || PRINT_FIELD(BRSTACKINSNLEN) || PRINT_FIELD(BRSTACKDISASM)) + && !allow_user_set && !(evlist__combined_branch_type(session->evlist) & PERF_SAMPLE_BRANCH_ANY)) { pr_err("Display of branch stack assembler requested, but non all-branch filter set\n" "Hint: run 'perf record -b ...'\n"); @@ -1014,11 +1018,11 @@ static int perf_sample__fprintf_brstackoff(struct perf_sample *sample, to = entries[i].to; if (thread__find_map_fb(thread, sample->cpumode, from, &alf) && - !map__dso(alf.map)->adjust_symbols) + !dso__adjust_symbols(map__dso(alf.map))) from = map__dso_map_ip(alf.map, from); if (thread__find_map_fb(thread, sample->cpumode, to, &alt) && - !map__dso(alt.map)->adjust_symbols) + !dso__adjust_symbols(map__dso(alt.map))) to = map__dso_map_ip(alt.map, to); printed += fprintf(fp, " 0x%"PRIx64, from); @@ -1079,7 +1083,7 @@ static int grab_bb(u8 *buffer, u64 start, u64 end, pr_debug("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n", start, end); goto out; } - if (dso->data.status == DSO_DATA_STATUS_ERROR) { + if (dso__data(dso)->status == DSO_DATA_STATUS_ERROR) { pr_debug("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n", start, end); goto out; } @@ -1091,7 +1095,7 @@ static int grab_bb(u8 *buffer, u64 start, u64 end, len = dso__data_read_offset(dso, machine, offset, (u8 *)buffer, end - start + MAXINSN); - *is64bit = dso->is_64_bit; + *is64bit = dso__is_64_bit(dso); if (len <= 0) pr_debug("\tcannot fetch code for block at %" PRIx64 "-%" PRIx64 "\n", start, end); @@ -1162,6 +1166,31 @@ out: return ret; } +static int any_dump_insn(struct perf_event_attr *attr __maybe_unused, + struct perf_insn *x, uint64_t ip, + u8 *inbuf, int inlen, int *lenp, + FILE *fp) +{ +#ifdef HAVE_LIBCAPSTONE_SUPPORT + if (PRINT_FIELD(BRSTACKDISASM)) { + int printed = fprintf_insn_asm(x->machine, x->thread, x->cpumode, 
x->is64bit, + (uint8_t *)inbuf, inlen, ip, lenp, + PRINT_INSN_IMM_HEX, fp); + + if (printed > 0) + return printed; + } +#endif + return fprintf(fp, "%s", dump_insn(x, ip, inbuf, inlen, lenp)); +} + +static int add_padding(FILE *fp, int printed, int padding) +{ + if (printed >= 0 && printed < padding) + printed += fprintf(fp, "%*s", padding - printed, ""); + return printed; +} + static int ip__fprintf_jump(uint64_t ip, struct branch_entry *en, struct perf_insn *x, u8 *inbuf, int len, int insn, FILE *fp, int *total_cycles, @@ -1169,8 +1198,10 @@ static int ip__fprintf_jump(uint64_t ip, struct branch_entry *en, struct thread *thread) { int ilen = 0; - int printed = fprintf(fp, "\t%016" PRIx64 "\t%-30s\t", ip, - dump_insn(x, ip, inbuf, len, &ilen)); + int printed = fprintf(fp, "\t%016" PRIx64 "\t", ip); + + printed += add_padding(fp, any_dump_insn(attr, x, ip, inbuf, len, &ilen, fp), 30); + printed += fprintf(fp, "\t"); if (PRINT_FIELD(BRSTACKINSNLEN)) printed += fprintf(fp, "ilen: %d\t", ilen); @@ -1262,6 +1293,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, nr = max_blocks + 1; x.thread = thread; + x.machine = machine; x.cpu = sample->cpu; printed += fprintf(fp, "%c", '\n'); @@ -1312,8 +1344,8 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, break; } else { ilen = 0; - printed += fprintf(fp, "\t%016" PRIx64 "\t%s", ip, - dump_insn(&x, ip, buffer + off, len - off, &ilen)); + printed += fprintf(fp, "\t%016" PRIx64 "\t", ip); + printed += any_dump_insn(attr, &x, ip, buffer + off, len - off, &ilen, fp); if (PRINT_FIELD(BRSTACKINSNLEN)) printed += fprintf(fp, "\tilen: %d", ilen); printed += fprintf(fp, "\n"); @@ -1360,8 +1392,8 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, if (len <= 0) goto out; ilen = 0; - printed += fprintf(fp, "\t%016" PRIx64 "\t%s", sample->ip, - dump_insn(&x, sample->ip, buffer, len, &ilen)); + printed += fprintf(fp, "\t%016" PRIx64 "\t", sample->ip); + printed += 
any_dump_insn(attr, &x, sample->ip, buffer, len, &ilen, fp); if (PRINT_FIELD(BRSTACKINSNLEN)) printed += fprintf(fp, "\tilen: %d", ilen); printed += fprintf(fp, "\n"); @@ -1371,8 +1403,8 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, } for (off = 0; off <= end - start; off += ilen) { ilen = 0; - printed += fprintf(fp, "\t%016" PRIx64 "\t%s", start + off, - dump_insn(&x, start + off, buffer + off, len - off, &ilen)); + printed += fprintf(fp, "\t%016" PRIx64 "\t", start + off); + printed += any_dump_insn(attr, &x, start + off, buffer + off, len - off, &ilen, fp); if (PRINT_FIELD(BRSTACKINSNLEN)) printed += fprintf(fp, "\tilen: %d", ilen); printed += fprintf(fp, "\n"); @@ -1517,7 +1549,8 @@ void script_fetch_insn(struct perf_sample *sample, struct thread *thread, static int perf_sample__fprintf_insn(struct perf_sample *sample, struct perf_event_attr *attr, struct thread *thread, - struct machine *machine, FILE *fp) + struct machine *machine, FILE *fp, + struct addr_location *al) { int printed = 0; @@ -1531,9 +1564,9 @@ static int perf_sample__fprintf_insn(struct perf_sample *sample, } if (PRINT_FIELD(DISASM) && sample->insn_len) { printed += fprintf(fp, "\t\t"); - printed += sample__fprintf_insn_asm(sample, thread, machine, fp); + printed += sample__fprintf_insn_asm(sample, thread, machine, fp, al); } - if (PRINT_FIELD(BRSTACKINSN) || PRINT_FIELD(BRSTACKINSNLEN)) + if (PRINT_FIELD(BRSTACKINSN) || PRINT_FIELD(BRSTACKINSNLEN) || PRINT_FIELD(BRSTACKDISASM)) printed += perf_sample__fprintf_brstackinsn(sample, thread, attr, machine, fp); return printed; @@ -1606,7 +1639,7 @@ static int perf_sample__fprintf_bts(struct perf_sample *sample, if (print_srcline_last) printed += map__fprintf_srcline(al->map, al->addr, "\n ", fp); - printed += perf_sample__fprintf_insn(sample, attr, thread, machine, fp); + printed += perf_sample__fprintf_insn(sample, attr, thread, machine, fp, al); printed += fprintf(fp, "\n"); if (PRINT_FIELD(SRCCODE)) { int ret = 
map__fprintf_srccode(al->map, al->addr, stdout, @@ -2018,13 +2051,18 @@ static int evlist__max_name_len(struct evlist *evlist) static int data_src__fprintf(u64 data_src, FILE *fp) { - struct mem_info mi = { .data_src.val = data_src }; + struct mem_info *mi = mem_info__new(); char decode[100]; char out[100]; static int maxlen; int len; - perf_script__meminfo_scnprintf(decode, 100, &mi); + if (!mi) + return -ENOMEM; + + mem_info__data_src(mi)->val = data_src; + perf_script__meminfo_scnprintf(decode, 100, mi); + mem_info__put(mi); len = scnprintf(out, 100, "%16" PRIx64 " %s", data_src, decode); if (maxlen < len) @@ -2259,7 +2297,7 @@ static void process_event(struct perf_script *script, if (evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT)) perf_sample__fprintf_bpf_output(sample, fp); - perf_sample__fprintf_insn(sample, attr, thread, machine, fp); + perf_sample__fprintf_insn(sample, attr, thread, machine, fp, al); if (PRINT_FIELD(PHYS_ADDR)) fprintf(fp, "%16" PRIx64, sample->phys_addr); @@ -2465,7 +2503,7 @@ static int process_attr(struct perf_tool *tool, union perf_event *event, evsel = evlist__last(*pevlist); if (!evsel->priv) { - if (scr->per_event_dump) { + if (scr->per_event_dump) { evsel->priv = evsel_script__new(evsel, scr->session->data); if (!evsel->priv) return -ENOMEM; @@ -3471,7 +3509,7 @@ static int check_ev_match(char *dir_name, char *scriptname, match = 0; evlist__for_each_entry(session->evlist, pos) { - if (!strcmp(evsel__name(pos), evname)) { + if (evsel__name_is(pos, evname)) { match = 1; break; } @@ -3939,7 +3977,7 @@ int cmd_script(int argc, const char **argv) "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,dsoff," "addr,symoff,srcline,period,iregs,uregs,brstack," "brstacksym,flags,data_src,weight,bpf-output,brstackinsn," - "brstackinsnlen,brstackoff,callindent,insn,disasm,insnlen,synth," + "brstackinsnlen,brstackdisasm,brstackoff,callindent,insn,disasm,insnlen,synth," "phys_addr,metric,misc,srccode,ipc,tod,data_page_size," 
"code_page_size,ins_lat,machine_pid,vcpu,cgroup,retire_lat", parse_output_fields), diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 6bba1a89d0..35f79b48e8 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -164,26 +164,6 @@ static struct perf_stat_config stat_config = { .iostat_run = false, }; -static bool cpus_map_matched(struct evsel *a, struct evsel *b) -{ - if (!a->core.cpus && !b->core.cpus) - return true; - - if (!a->core.cpus || !b->core.cpus) - return false; - - if (perf_cpu_map__nr(a->core.cpus) != perf_cpu_map__nr(b->core.cpus)) - return false; - - for (int i = 0; i < perf_cpu_map__nr(a->core.cpus); i++) { - if (perf_cpu_map__cpu(a->core.cpus, i).cpu != - perf_cpu_map__cpu(b->core.cpus, i).cpu) - return false; - } - - return true; -} - static void evlist__check_cpu_maps(struct evlist *evlist) { struct evsel *evsel, *warned_leader = NULL; @@ -194,7 +174,7 @@ static void evlist__check_cpu_maps(struct evlist *evlist) /* Check that leader matches cpus with each member. */ if (leader == evsel) continue; - if (cpus_map_matched(leader, evsel)) + if (perf_cpu_map__equal(leader->core.cpus, evsel->core.cpus)) continue; /* If there's mismatch disable the group and warn user. */ @@ -1319,10 +1299,9 @@ static int cpu__get_cache_id_from_map(struct perf_cpu cpu, char *map) * be the first online CPU in the cache domain else use the * first online CPU of the cache domain as the ID. */ - if (perf_cpu_map__has_any_cpu_or_is_empty(cpu_map)) + id = perf_cpu_map__min(cpu_map).cpu; + if (id == -1) id = cpu.cpu; - else - id = perf_cpu_map__cpu(cpu_map, 0).cpu; /* Free the perf_cpu_map used to find the cache ID */ perf_cpu_map__put(cpu_map); @@ -1642,7 +1621,7 @@ static int perf_stat_init_aggr_mode(void) * taking the highest cpu number to be the size of * the aggregation translate cpumap. 
*/ - if (!perf_cpu_map__has_any_cpu_or_is_empty(evsel_list->core.user_requested_cpus)) + if (!perf_cpu_map__is_any_cpu_or_is_empty(evsel_list->core.user_requested_cpus)) nr = perf_cpu_map__max(evsel_list->core.user_requested_cpus).cpu; else nr = 0; @@ -1652,23 +1631,13 @@ static int perf_stat_init_aggr_mode(void) static void cpu_aggr_map__delete(struct cpu_aggr_map *map) { - if (map) { - WARN_ONCE(refcount_read(&map->refcnt) != 0, - "cpu_aggr_map refcnt unbalanced\n"); - free(map); - } -} - -static void cpu_aggr_map__put(struct cpu_aggr_map *map) -{ - if (map && refcount_dec_and_test(&map->refcnt)) - cpu_aggr_map__delete(map); + free(map); } static void perf_stat__exit_aggr_mode(void) { - cpu_aggr_map__put(stat_config.aggr_map); - cpu_aggr_map__put(stat_config.cpus_aggr_map); + cpu_aggr_map__delete(stat_config.aggr_map); + cpu_aggr_map__delete(stat_config.cpus_aggr_map); stat_config.aggr_map = NULL; stat_config.cpus_aggr_map = NULL; } @@ -2106,6 +2075,7 @@ static int add_default_attributes(void) stat_config.metric_no_threshold, stat_config.user_requested_cpu_list, stat_config.system_wide, + stat_config.hardware_aware_grouping, &stat_config.metric_events); } @@ -2139,6 +2109,7 @@ static int add_default_attributes(void) stat_config.metric_no_threshold, stat_config.user_requested_cpu_list, stat_config.system_wide, + stat_config.hardware_aware_grouping, &stat_config.metric_events); } @@ -2173,6 +2144,7 @@ static int add_default_attributes(void) /*metric_no_threshold=*/true, stat_config.user_requested_cpu_list, stat_config.system_wide, + stat_config.hardware_aware_grouping, &stat_config.metric_events) < 0) return -1; } @@ -2214,6 +2186,7 @@ static int add_default_attributes(void) /*metric_no_threshold=*/true, stat_config.user_requested_cpu_list, stat_config.system_wide, + stat_config.hardware_aware_grouping, &stat_config.metric_events) < 0) return -1; @@ -2334,7 +2307,7 @@ int process_stat_config_event(struct perf_session *session, 
perf_event__read_stat_config(&stat_config, &event->stat_config); - if (perf_cpu_map__has_any_cpu_or_is_empty(st->cpus)) { + if (perf_cpu_map__is_empty(st->cpus)) { if (st->aggr_mode != AGGR_UNSET) pr_warning("warning: processing task data, aggregation mode not set\n"); } else if (st->aggr_mode != AGGR_UNSET) { @@ -2748,6 +2721,7 @@ int cmd_stat(int argc, const char **argv) stat_config.metric_no_threshold, stat_config.user_requested_cpu_list, stat_config.system_wide, + stat_config.hardware_aware_grouping, &stat_config.metric_events); zfree(&metrics); diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 5ac6dcc64c..1d6aef51c1 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -129,7 +129,7 @@ static int perf_top__parse_source(struct perf_top *top, struct hist_entry *he) /* * We can't annotate with just /proc/kallsyms */ - if (dso->symtab_type == DSO_BINARY_TYPE__KALLSYMS && !dso__is_kcore(dso)) { + if (dso__symtab_type(dso) == DSO_BINARY_TYPE__KALLSYMS && !dso__is_kcore(dso)) { pr_err("Can't annotate %s: No vmlinux file was found in the " "path\n", sym->name); sleep(1); @@ -182,7 +182,7 @@ static void ui__warn_map_erange(struct map *map, struct symbol *sym, u64 ip) "Tools: %s\n\n" "Not all samples will be on the annotation output.\n\n" "Please report to linux-kernel@vger.kernel.org\n", - ip, dso->long_name, dso__symtab_origin(dso), + ip, dso__long_name(dso), dso__symtab_origin(dso), map__start(map), map__end(map), sym->start, sym->end, sym->binding == STB_GLOBAL ? 'g' : sym->binding == STB_LOCAL ? 
'l' : 'w', sym->name, diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 90eaff8c0f..08a3a6effa 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -765,7 +765,7 @@ static const char *fcntl_cmds[] = { static DEFINE_STRARRAY(fcntl_cmds, "F_"); static const char *fcntl_linux_specific_cmds[] = { - "SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC", + "SETLEASE", "GETLEASE", "NOTIFY", "DUPFD_QUERY", [5] = "CANCELLK", "DUPFD_CLOEXEC", "SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS", "GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT", }; @@ -947,6 +947,15 @@ static const struct syscall_fmt syscall_fmts[] = { .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, }, { .name = "eventfd2", .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, }, + { .name = "faccessat", + .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, + [1] = { .scnprintf = SCA_FILENAME, /* pathname */ }, + [2] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, }, + { .name = "faccessat2", + .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, + [1] = { .scnprintf = SCA_FILENAME, /* pathname */ }, + [2] = { .scnprintf = SCA_ACCMODE, /* mode */ }, + [3] = { .scnprintf = SCA_FACCESSAT2_FLAGS, /* flags */ }, }, }, { .name = "fchmodat", .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, }, { .name = "fchownat", @@ -969,7 +978,6 @@ static const struct syscall_fmt syscall_fmts[] = { [1] = { .scnprintf = SCA_FILENAME, /* path */ }, [2] = { .scnprintf = SCA_FSPICK_FLAGS, /* flags */ }, }, }, { .name = "fstat", .alias = "newfstat", }, - { .name = "fstatat", .alias = "newfstatat", }, { .name = "futex", .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ }, [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, }, @@ -1049,8 +1057,12 @@ static const struct syscall_fmt syscall_fmts[] = { .arg = { [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ }, }, }, { .name = "name_to_handle_at", .arg = { [0] = { .scnprintf = 
SCA_FDAT, /* dfd */ }, }, }, - { .name = "newfstatat", - .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, + { .name = "nanosleep", + .arg = { [0] = { .scnprintf = SCA_TIMESPEC, /* req */ }, }, }, + { .name = "newfstatat", .alias = "fstatat", + .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, + [1] = { .scnprintf = SCA_FILENAME, /* pathname */ }, + [3] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ }, }, }, { .name = "open", .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, }, { .name = "open_by_handle_at", @@ -1142,7 +1154,7 @@ static const struct syscall_fmt syscall_fmts[] = { { .name = "stat", .alias = "newstat", }, { .name = "statx", .arg = { [0] = { .scnprintf = SCA_FDAT, /* fdat */ }, - [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } , + [2] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ } , [3] = { .scnprintf = SCA_STATX_MASK, /* mask */ }, }, }, { .name = "swapoff", .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, }, @@ -1160,7 +1172,9 @@ static const struct syscall_fmt syscall_fmts[] = { .arg = { [0] = { .scnprintf = SCA_FILENAME, /* name */ }, }, }, { .name = "uname", .alias = "newuname", }, { .name = "unlinkat", - .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, + .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, + [1] = { .scnprintf = SCA_FILENAME, /* pathname */ }, + [2] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ }, }, }, { .name = "utimensat", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, }, { .name = "wait4", .errpid = true, @@ -2903,7 +2917,7 @@ static void print_location(FILE *f, struct perf_sample *sample, { if ((verbose > 0 || print_dso) && al->map) - fprintf(f, "%s@", map__dso(al->map)->long_name); + fprintf(f, "%s@", dso__long_name(map__dso(al->map))); if ((verbose > 0 || print_sym) && al->sym) fprintf(f, "%s+0x%" PRIx64, al->sym->name, @@ -4869,6 +4883,11 @@ int cmd_trace(int argc, const char **argv) if (!trace.trace_syscalls) goto skip_augmentation; + if ((argc >= 
1) && (strcmp(argv[0], "record") == 0)) { + pr_debug("Syscall augmentation fails with record, disabling augmentation"); + goto skip_augmentation; + } + trace.skel = augmented_raw_syscalls_bpf__open(); if (!trace.skel) { pr_debug("Failed to open augmented syscalls BPF skeleton"); @@ -4902,7 +4921,7 @@ int cmd_trace(int argc, const char **argv) goto out; } trace.syscalls.events.bpf_output = evlist__last(trace.evlist); - assert(!strcmp(evsel__name(trace.syscalls.events.bpf_output), "__augmented_syscalls__")); + assert(evsel__name_is(trace.syscalls.events.bpf_output, "__augmented_syscalls__")); skip_augmentation: #endif err = -1; @@ -4959,7 +4978,7 @@ skip_augmentation: */ if (trace.syscalls.events.bpf_output) { evlist__for_each_entry(trace.evlist, evsel) { - bool raw_syscalls_sys_exit = strcmp(evsel__name(evsel), "raw_syscalls:sys_exit") == 0; + bool raw_syscalls_sys_exit = evsel__name_is(evsel, "raw_syscalls:sys_exit"); if (raw_syscalls_sys_exit) { trace.raw_augmented_syscalls = true; diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h index f2ab5bae21..f4375deabf 100644 --- a/tools/perf/builtin.h +++ b/tools/perf/builtin.h @@ -2,8 +2,10 @@ #ifndef BUILTIN_H #define BUILTIN_H +struct cmdnames; + void list_common_cmds_help(void); -const char *help_unknown_cmd(const char *cmd); +const char *help_unknown_cmd(const char *cmd, struct cmdnames *main_cmds); int cmd_annotate(int argc, const char **argv); int cmd_bench(int argc, const char **argv); diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 66ba33dbce..672421b858 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -9,23 +9,15 @@ FILES=( "include/uapi/linux/const.h" "include/uapi/drm/drm.h" "include/uapi/drm/i915_drm.h" + "include/uapi/linux/bits.h" "include/uapi/linux/fadvise.h" - "include/uapi/linux/fcntl.h" - "include/uapi/linux/fs.h" "include/uapi/linux/fscrypt.h" "include/uapi/linux/kcmp.h" "include/uapi/linux/kvm.h" "include/uapi/linux/in.h" - 
"include/uapi/linux/mount.h" - "include/uapi/linux/openat2.h" "include/uapi/linux/perf_event.h" - "include/uapi/linux/prctl.h" - "include/uapi/linux/sched.h" "include/uapi/linux/seccomp.h" "include/uapi/linux/stat.h" - "include/uapi/linux/usbdevice_fs.h" - "include/uapi/linux/vhost.h" - "include/uapi/sound/asound.h" "include/linux/bits.h" "include/vdso/bits.h" "include/linux/const.h" @@ -38,9 +30,7 @@ FILES=( "arch/x86/include/asm/cpufeatures.h" "arch/x86/include/asm/inat_types.h" "arch/x86/include/asm/emulate_prefix.h" - "arch/x86/include/asm/irq_vectors.h" "arch/x86/include/asm/msr-index.h" - "arch/x86/include/uapi/asm/prctl.h" "arch/x86/lib/x86-opcode-map.txt" "arch/x86/tools/gen-insn-attr-x86.awk" "arch/arm/include/uapi/asm/perf_regs.h" @@ -97,7 +87,18 @@ SYNC_CHECK_FILES=( declare -a BEAUTY_FILES BEAUTY_FILES=( + "arch/x86/include/asm/irq_vectors.h" + "arch/x86/include/uapi/asm/prctl.h" "include/linux/socket.h" + "include/uapi/linux/fcntl.h" + "include/uapi/linux/fs.h" + "include/uapi/linux/mount.h" + "include/uapi/linux/prctl.h" + "include/uapi/linux/sched.h" + "include/uapi/linux/stat.h" + "include/uapi/linux/usbdevice_fs.h" + "include/uapi/linux/vhost.h" + "include/uapi/sound/asound.h" ) declare -a FAILURES diff --git a/tools/perf/perf-archive.sh b/tools/perf/perf-archive.sh index f94795794b..6ed7e52ab8 100755 --- a/tools/perf/perf-archive.sh +++ b/tools/perf/perf-archive.sh @@ -34,7 +34,7 @@ if [ $UNPACK -eq 1 ]; then TARGET=`find . 
-regex "\./perf.*\.tar\.bz2"` TARGET_NUM=`echo -n "$TARGET" | grep -c '^'` - if [ -z "$TARGET" -o $TARGET_NUM -gt 1 ]; then + if [ -z "$TARGET" ] || [ $TARGET_NUM -gt 1 ]; then echo -e "Error: $TARGET_NUM files found for unpacking:\n$TARGET" echo "Provide the requested file as an argument" exit 1 diff --git a/tools/perf/perf-completion.sh b/tools/perf/perf-completion.sh index f224d79b89..69cba3c170 100644 --- a/tools/perf/perf-completion.sh +++ b/tools/perf/perf-completion.sh @@ -108,6 +108,8 @@ __perf__ltrim_colon_completions() __perfcomp () { + # Expansion of spaces to array is deliberate. + # shellcheck disable=SC2207 COMPREPLY=( $( compgen -W "$1" -- "$2" ) ) } @@ -127,13 +129,13 @@ __perf_prev_skip_opts () let i=cword-1 cmds_=$($cmd $1 --list-cmds) - prev_skip_opts=() + prev_skip_opts="" while [ $i -ge 0 ]; do - if [[ ${words[i]} == $1 ]]; then + if [[ ${words[i]} == "$1" ]]; then return fi for cmd_ in $cmds_; do - if [[ ${words[i]} == $cmd_ ]]; then + if [[ ${words[i]} == "$cmd_" ]]; then prev_skip_opts=${words[i]} return fi @@ -164,9 +166,10 @@ __perf_main () $prev_skip_opts == @(record|stat|top) ]]; then local cur1=${COMP_WORDS[COMP_CWORD]} - local raw_evts=$($cmd list --raw-dump hw sw cache tracepoint pmu sdt) + local raw_evts local arr s tmp result cpu_evts + raw_evts=$($cmd list --raw-dump hw sw cache tracepoint pmu sdt) # aarch64 doesn't have /sys/bus/event_source/devices/cpu/events if [[ `uname -m` != aarch64 ]]; then cpu_evts=$(ls /sys/bus/event_source/devices/cpu/events) @@ -175,10 +178,12 @@ __perf_main () if [[ "$cur1" == */* && ${cur1#*/} =~ ^[A-Z] ]]; then OLD_IFS="$IFS" IFS=" " + # Expansion of spaces to array is deliberate. 
+ # shellcheck disable=SC2206 arr=($raw_evts) IFS="$OLD_IFS" - for s in ${arr[@]} + for s in "${arr[@]}" do if [[ "$s" == *cpu/* ]]; then tmp=${s#*cpu/} @@ -200,11 +205,13 @@ __perf_main () fi elif [[ $prev == @("--pfm-events") && $prev_skip_opts == @(record|stat|top) ]]; then - local evts=$($cmd list --raw-dump pfm) + local evts + evts=$($cmd list --raw-dump pfm) __perfcomp "$evts" "$cur" elif [[ $prev == @("-M"|"--metrics") && $prev_skip_opts == @(stat) ]]; then - local metrics=$($cmd list --raw-dump metric metricgroup) + local metrics + metrics=$($cmd list --raw-dump metric metricgroup) __perfcomp "$metrics" "$cur" else # List subcommands for perf commands @@ -278,6 +285,8 @@ if [[ -n ${ZSH_VERSION-} ]]; then let cword=CURRENT-1 emulate ksh -c __perf_main let _ret && _default && _ret=0 + # _ret is only assigned 0 or 1, disable inaccurate analysis. + # shellcheck disable=SC2152 return _ret } diff --git a/tools/perf/perf.c b/tools/perf/perf.c index 921bee0a64..bd3f80b5bb 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -18,6 +18,7 @@ #include #include "util/parse-events.h" #include +#include #include "util/debug.h" #include "util/event.h" #include "util/util.h" // usage() @@ -458,7 +459,7 @@ static int libperf_print(enum libperf_print_level level, int main(int argc, const char **argv) { - int err; + int err, done_help = 0; const char *cmd; char sbuf[STRERR_BUFSIZE]; @@ -557,22 +558,32 @@ int main(int argc, const char **argv) pthread__block_sigwinch(); while (1) { - static int done_help; - run_argv(&argc, &argv); if (errno != ENOENT) break; if (!done_help) { - cmd = argv[0] = help_unknown_cmd(cmd); + struct cmdnames main_cmds = {}; + + for (unsigned int i = 0; i < ARRAY_SIZE(commands); i++) { + add_cmdname(&main_cmds, + commands[i].cmd, + strlen(commands[i].cmd)); + } + cmd = argv[0] = help_unknown_cmd(cmd, &main_cmds); + clean_cmdnames(&main_cmds); done_help = 1; + if (!cmd) + break; } else break; } - fprintf(stderr, "Failed to run command '%s': %s\n", - 
cmd, str_error_r(errno, sbuf, sizeof(sbuf))); + if (cmd) { + fprintf(stderr, "Failed to run command '%s': %s\n", + cmd, str_error_r(errno, sbuf, sizeof(sbuf))); + } out: if (debug_fp) fclose(debug_fp); diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereone/cache.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/cache.json index 7a2b7b200f..ac75f12e27 100644 --- a/tools/perf/pmu-events/arch/arm64/ampere/ampereone/cache.json +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/cache.json @@ -9,7 +9,9 @@ "ArchStdEvent": "L1D_CACHE_REFILL_RD" }, { - "ArchStdEvent": "L1D_CACHE_INVAL" + "ArchStdEvent": "L1D_CACHE_INVAL", + "Errata": "Errata AC03_CPU_41", + "BriefDescription": "L1D cache invalidate. Impacted by errata -" }, { "ArchStdEvent": "L1D_TLB_REFILL_RD" diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/cache.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/cache.json index c50d8e930b..f4bfe7083a 100644 --- a/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/cache.json +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/cache.json @@ -9,7 +9,9 @@ "ArchStdEvent": "L1D_CACHE_REFILL_RD" }, { - "ArchStdEvent": "L1D_CACHE_INVAL" + "ArchStdEvent": "L1D_CACHE_INVAL", + "Errata": "Errata AC04_CPU_1", + "BriefDescription": "L1D cache invalidate. Impacted by errata -" }, { "ArchStdEvent": "L1D_TLB_REFILL_RD" diff --git a/tools/perf/pmu-events/arch/x86/amdzen5/branch-prediction.json b/tools/perf/pmu-events/arch/x86/amdzen5/branch-prediction.json new file mode 100644 index 0000000000..2d8d18cb85 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen5/branch-prediction.json @@ -0,0 +1,93 @@ +[ + { + "EventName": "bp_l1_tlb_miss_l2_tlb_hit", + "EventCode": "0x84", + "BriefDescription": "Instruction fetches that miss in the L1 ITLB but hit in the L2 ITLB." 
+ }, + { + "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if4k", + "EventCode": "0x85", + "BriefDescription": "Instruction fetches that miss in both the L1 and L2 ITLBs (page-table walks are requested) for 4k pages.", + "UMask": "0x01" + }, + { + "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if2m", + "EventCode": "0x85", + "BriefDescription": "Instruction fetches that miss in both the L1 and L2 ITLBs (page-table walks are requested) for 2M pages.", + "UMask": "0x02" + }, + { + "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if1g", + "EventCode": "0x85", + "BriefDescription": "Instruction fetches that miss in both the L1 and L2 ITLBs (page-table walks are requested) for 1G pages.", + "UMask": "0x04" + }, + { + "EventName": "bp_l1_tlb_miss_l2_tlb_miss.coalesced_4k", + "EventCode": "0x85", + "BriefDescription": "Instruction fetches that miss in both the L1 and L2 ITLBs (page-table walks are requested) for coalesced pages. A coalesced page is a 16k page created from four adjacent 4k pages.", + "UMask": "0x08" + }, + { + "EventName": "bp_l1_tlb_miss_l2_tlb_miss.all", + "EventCode": "0x85", + "BriefDescription": "Instruction fetches that miss in both the L1 and L2 ITLBs (page-table walks are requested) for all page sizes.", + "UMask": "0x0f" + }, + { + "EventName": "bp_l2_btb_correct", + "EventCode": "0x8b", + "BriefDescription": "L2 branch prediction overrides existing prediction (speculative)." + }, + { + "EventName": "bp_dyn_ind_pred", + "EventCode": "0x8e", + "BriefDescription": "Dynamic indirect predictions (branch used the indirect predictor to make a prediction)." + }, + { + "EventName": "bp_de_redirect", + "EventCode": "0x91", + "BriefDescription": "Number of times an early redirect is sent to branch predictor. This happens when either the decoder or dispatch logic is able to detect that the branch predictor needs to be redirected." 
+ }, + { + "EventName": "bp_l1_tlb_fetch_hit.if4k", + "EventCode": "0x94", + "BriefDescription": "Instruction fetches that hit in the L1 ITLB for 4k or coalesced pages. A coalesced page is a 16k page created from four adjacent 4k pages.", + "UMask": "0x01" + }, + { + "EventName": "bp_l1_tlb_fetch_hit.if2m", + "EventCode": "0x94", + "BriefDescription": "Instruction fetches that hit in the L1 ITLB for 2M pages.", + "UMask": "0x02" + }, + { + "EventName": "bp_l1_tlb_fetch_hit.if1g", + "EventCode": "0x94", + "BriefDescription": "Instruction fetches that hit in the L1 ITLB for 1G pages.", + "UMask": "0x04" + }, + { + "EventName": "bp_l1_tlb_fetch_hit.all", + "EventCode": "0x94", + "BriefDescription": "Instruction fetches that hit in the L1 ITLB for all page sizes.", + "UMask": "0x07" + }, + { + "EventName": "bp_redirects.resync", + "EventCode": "0x9f", + "BriefDescription": "Redirects of the branch predictor caused by resyncs.", + "UMask": "0x01" + }, + { + "EventName": "bp_redirects.ex_redir", + "EventCode": "0x9f", + "BriefDescription": "Redirects of the branch predictor caused by mispredicts.", + "UMask": "0x02" + }, + { + "EventName": "bp_redirects.all", + "EventCode": "0x9f", + "BriefDescription": "Redirects of the branch predictor." + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen5/decode.json b/tools/perf/pmu-events/arch/x86/amdzen5/decode.json new file mode 100644 index 0000000000..d0eff7f2a3 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen5/decode.json @@ -0,0 +1,115 @@ +[ + { + "EventName": "de_op_queue_empty", + "EventCode": "0xa9", + "BriefDescription": "Cycles where the op queue is empty. Such cycles indicate that the front-end is not delivering instructions fast enough." 
+ }, + { + "EventName": "de_src_op_disp.x86_decoder", + "EventCode": "0xaa", + "BriefDescription": "Ops dispatched from x86 decoder.", + "UMask": "0x01" + }, + { + "EventName": "de_src_op_disp.op_cache", + "EventCode": "0xaa", + "BriefDescription": "Ops dispatched from op cache.", + "UMask": "0x02" + }, + { + "EventName": "de_src_op_disp.all", + "EventCode": "0xaa", + "BriefDescription": "Ops dispatched from any source.", + "UMask": "0x07" + }, + { + "EventName": "de_dis_ops_from_decoder.any_fp_dispatch", + "EventCode": "0xab", + "BriefDescription": "Number of ops dispatched to the floating-point unit.", + "UMask": "0x04" + }, + { + "EventName": "de_dis_ops_from_decoder.any_integer_dispatch", + "EventCode": "0xab", + "BriefDescription": "Number of ops dispatched to the integer execution unit.", + "UMask": "0x08" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part1.int_phy_reg_file_rsrc_stall", + "EventCode": "0xae", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to an integer physical register file resource stall.", + "UMask": "0x01" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part1.load_queue_rsrc_stall", + "EventCode": "0xae", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a lack of load queue tokens.", + "UMask": "0x02" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part1.store_queue_rsrc_stall", + "EventCode": "0xae", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a lack of store queue tokens.", + "UMask": "0x04" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part1.taken_brnch_buffer_rsrc", + "EventCode": "0xae", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a taken branch buffer resource stall.", + "UMask": "0x10" + }, + { + "EventName": 
"de_dispatch_stall_cycle_dynamic_tokens_part1.fp_sch_rsrc_stall", + "EventCode": "0xae", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a floating-point non-schedulable queue token stall.", + "UMask": "0x40" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.al_tokens", + "EventCode": "0xaf", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to unavailability of ALU tokens.", + "UMask": "0x01" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.ag_tokens", + "EventCode": "0xaf", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to unavailability of agen tokens.", + "UMask": "0x02" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.ex_flush_recovery", + "EventCode": "0xaf", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a pending integer execution flush recovery.", + "UMask": "0x04" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.retq", + "EventCode": "0xaf", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to unavailability of retire queue tokens.", + "UMask": "0x20" + }, + { + "EventName": "de_no_dispatch_per_slot.no_ops_from_frontend", + "EventCode": "0x1a0", + "BriefDescription": "In each cycle counts dispatch slots left empty because the front-end did not supply ops.", + "UMask": "0x01" + }, + { + "EventName": "de_no_dispatch_per_slot.backend_stalls", + "EventCode": "0x1a0", + "BriefDescription": "In each cycle counts ops unable to dispatch because of back-end stalls.", + "UMask": "0x1e" + }, + { + "EventName": "de_no_dispatch_per_slot.smt_contention", + "EventCode": "0x1a0", + "BriefDescription": "In each cycle counts ops unable to dispatch because the dispatch cycle was granted to the other SMT thread.", + "UMask": "0x60" + }, + { + "EventName": 
"de_additional_resource_stalls.dispatch_stalls", + "EventCode": "0x1a2", + "BriefDescription": "Counts additional cycles where dispatch is stalled due to a lack of dispatch resources.", + "UMask": "0x30" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen5/execution.json b/tools/perf/pmu-events/arch/x86/amdzen5/execution.json new file mode 100644 index 0000000000..5a46d3db74 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen5/execution.json @@ -0,0 +1,174 @@ +[ + { + "EventName": "ex_ret_instr", + "EventCode": "0xc0", + "BriefDescription": "Retired instructions." + }, + { + "EventName": "ex_ret_ops", + "EventCode": "0xc1", + "BriefDescription": "Retired macro-ops." + }, + { + "EventName": "ex_ret_brn", + "EventCode": "0xc2", + "BriefDescription": "Retired branch instructions (all types of architectural control flow changes, including exceptions and interrupts)." + }, + { + "EventName": "ex_ret_brn_misp", + "EventCode": "0xc3", + "BriefDescription": "Retired branch instructions mispredicted." + }, + { + "EventName": "ex_ret_brn_tkn", + "EventCode": "0xc4", + "BriefDescription": "Retired taken branch instructions (all types of architectural control flow changes, including exceptions and interrupts)." + }, + { + "EventName": "ex_ret_brn_tkn_misp", + "EventCode": "0xc5", + "BriefDescription": "Retired taken branch instructions mispredicted." + }, + { + "EventName": "ex_ret_brn_far", + "EventCode": "0xc6", + "BriefDescription": "Retired far control transfers (far call/jump/return, IRET, SYSCALL and SYSRET, plus exceptions and interrupts). Far control transfers are not subject to branch prediction." + }, + { + "EventName": "ex_ret_near_ret", + "EventCode": "0xc8", + "BriefDescription": "Retired near returns (RET or RET Iw)." + }, + { + "EventName": "ex_ret_near_ret_mispred", + "EventCode": "0xc9", + "BriefDescription": "Retired near returns mispredicted. Each misprediction incurs the same penalty as a mispredicted conditional branch instruction." 
+ }, + { + "EventName": "ex_ret_brn_ind_misp", + "EventCode": "0xca", + "BriefDescription": "Retired indirect branch instructions mispredicted (only EX mispredicts). Each misprediction incurs the same penalty as a mispredicted conditional branch instruction." + }, + { + "EventName": "ex_ret_mmx_fp_instr.x87", + "EventCode": "0xcb", + "BriefDescription": "Retired x87 instructions.", + "UMask": "0x01" + }, + { + "EventName": "ex_ret_mmx_fp_instr.mmx", + "EventCode": "0xcb", + "BriefDescription": "Retired MMX instructions.", + "UMask": "0x02" + }, + { + "EventName": "ex_ret_mmx_fp_instr.sse", + "EventCode": "0xcb", + "BriefDescription": "Retired SSE instructions (includes SSE, SSE2, SSE3, SSSE3, SSE4A, SSE41, SSE42 and AVX).", + "UMask": "0x04" + }, + { + "EventName": "ex_ret_ind_brch_instr", + "EventCode": "0xcc", + "BriefDescription": "Retired indirect branch instructions." + }, + { + "EventName": "ex_ret_cond", + "EventCode": "0xd1", + "BriefDescription": "Retired conditional branch instructions." + }, + { + "EventName": "ex_div_busy", + "EventCode": "0xd3", + "BriefDescription": "Number of cycles the divider is busy." + }, + { + "EventName": "ex_div_count", + "EventCode": "0xd4", + "BriefDescription": "Divide ops executed." 
+ }, + { + "EventName": "ex_no_retire.empty", + "EventCode": "0xd6", + "BriefDescription": "Cycles with no retire due to the lack of valid ops in the retire queue (may be caused by front-end bottlenecks or pipeline redirects).", + "UMask": "0x01" + }, + { + "EventName": "ex_no_retire.not_complete", + "EventCode": "0xd6", + "BriefDescription": "Cycles with no retire while the oldest op is waiting to be executed.", + "UMask": "0x02" + }, + { + "EventName": "ex_no_retire.other", + "EventCode": "0xd6", + "BriefDescription": "Cycles with no retire caused by other reasons (retire breaks, traps, faults, etc.).", + "UMask": "0x08" + }, + { + "EventName": "ex_no_retire.thread_not_selected", + "EventCode": "0xd6", + "BriefDescription": "Cycles with no retire because thread arbitration did not select the thread.", + "UMask": "0x10" + }, + { + "EventName": "ex_no_retire.load_not_complete", + "EventCode": "0xd6", + "BriefDescription": "Cycles with no retire while the oldest op is waiting for load data.", + "UMask": "0xa2" + }, + { + "EventName": "ex_no_retire.all", + "EventCode": "0xd6", + "BriefDescription": "Cycles with no retire for any reason.", + "UMask": "0x1b" + }, + { + "EventName": "ex_ret_ucode_instr", + "EventCode": "0x1c1", + "BriefDescription": "Retired microcoded instructions." + }, + { + "EventName": "ex_ret_ucode_ops", + "EventCode": "0x1c2", + "BriefDescription": "Retired microcode ops." + }, + { + "EventName": "ex_ret_msprd_brnch_instr_dir_msmtch", + "EventCode": "0x1c7", + "BriefDescription": "Retired branch instructions mispredicted due to direction mismatch." + }, + { + "EventName": "ex_ret_uncond_brnch_instr_mispred", + "EventCode": "0x1c8", + "BriefDescription": "Retired unconditional indirect branch instructions mispredicted." + }, + { + "EventName": "ex_ret_uncond_brnch_instr", + "EventCode": "0x1c9", + "BriefDescription": "Retired unconditional branch instructions." 
+ }, + { + "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops", + "EventCode": "0x1cf", + "BriefDescription": "Ops tagged by IBS.", + "UMask": "0x01" + }, + { + "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops_ret", + "EventCode": "0x1cf", + "BriefDescription": "Ops tagged by IBS that retired.", + "UMask": "0x02" + }, + { + "EventName": "ex_tagged_ibs_ops.ibs_count_rollover", + "EventCode": "0x1cf", + "BriefDescription": "Ops not tagged by IBS due to a previous tagged op that has not yet signaled interrupt.", + "UMask": "0x04" + }, + { + "EventName": "ex_ret_fused_instr", + "EventCode": "0x1d0", + "BriefDescription": "Retired fused instructions." + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen5/floating-point.json b/tools/perf/pmu-events/arch/x86/amdzen5/floating-point.json new file mode 100644 index 0000000000..9204bfb1d6 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen5/floating-point.json @@ -0,0 +1,812 @@ +[ + { + "EventName": "fp_ret_x87_fp_ops.add_sub_ops", + "EventCode": "0x02", + "BriefDescription": "Retired x87 floating-point add and subtract ops.", + "UMask": "0x01" + }, + { + "EventName": "fp_ret_x87_fp_ops.mul_ops", + "EventCode": "0x02", + "BriefDescription": "Retired x87 floating-point multiply ops.", + "UMask": "0x02" + }, + { + "EventName": "fp_ret_x87_fp_ops.div_sqrt_ops", + "EventCode": "0x02", + "BriefDescription": "Retired x87 floating-point divide and square root ops.", + "UMask": "0x04" + }, + { + "EventName": "fp_ret_x87_fp_ops.all", + "EventCode": "0x02", + "BriefDescription": "Retired x87 floating-point ops of all types.", + "UMask": "0x07" + }, + { + "EventName": "fp_ret_sse_avx_ops.add_sub_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX floating-point add and subtract ops.", + "UMask": "0x01" + }, + { + "EventName": "fp_ret_sse_avx_ops.mult_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX floating-point multiply ops.", + "UMask": "0x02" + }, + { + "EventName": 
"fp_ret_sse_avx_ops.div_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX floating-point divide and square root ops.", + "UMask": "0x04" + }, + { + "EventName": "fp_ret_sse_avx_ops.mac_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX floating-point multiply-accumulate ops (each operation is counted as 2 ops).", + "UMask": "0x08" + }, + { + "EventName": "fp_ret_sse_avx_ops.bfloat16_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX floating-point bfloat16 ops.", + "UMask": "0x20" + }, + { + "EventName": "fp_ret_sse_avx_ops.scalar_single_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX floating-point scalar single-precision ops.", + "UMask": "0x40" + }, + { + "EventName": "fp_ret_sse_avx_ops.packed_single_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX floating-point packed single-precision ops.", + "UMask": "0x60" + }, + { + "EventName": "fp_ret_sse_avx_ops.scalar_double_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX floating-point scalar double-precision ops.", + "UMask": "0x80" + }, + { + "EventName": "fp_ret_sse_avx_ops.packed_double_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX floating-point packed double-precision ops.", + "UMask": "0xa0" + }, + { + "EventName": "fp_ret_sse_avx_ops.all", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX floating-point ops of all types.", + "UMask": "0x0f" + }, + { + "EventName": "fp_ops_retired_by_width.x87_uops_retired", + "EventCode": "0x08", + "BriefDescription": "Retired x87 floating-point ops.", + "UMask": "0x01" + }, + { + "EventName": "fp_ops_retired_by_width.mmx_uops_retired", + "EventCode": "0x08", + "BriefDescription": "Retired MMX floating-point ops.", + "UMask": "0x02" + }, + { + "EventName": "fp_ops_retired_by_width.scalar_uops_retired", + "EventCode": "0x08", + "BriefDescription": "Retired scalar floating-point ops.", + 
"UMask": "0x04" + }, + { + "EventName": "fp_ops_retired_by_width.pack_128_uops_retired", + "EventCode": "0x08", + "BriefDescription": "Retired packed 128-bit floating-point ops.", + "UMask": "0x08" + }, + { + "EventName": "fp_ops_retired_by_width.pack_256_uops_retired", + "EventCode": "0x08", + "BriefDescription": "Retired packed 256-bit floating-point ops.", + "UMask": "0x10" + }, + { + "EventName": "fp_ops_retired_by_width.pack_512_uops_retired", + "EventCode": "0x08", + "BriefDescription": "Retired packed 512-bit floating-point ops.", + "UMask": "0x20" + }, + { + "EventName": "fp_ops_retired_by_width.all", + "EventCode": "0x08", + "BriefDescription": "Retired floating-point ops of all widths.", + "UMask": "0x3f" + }, + { + "EventName": "fp_ops_retired_by_type.scalar_add", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point add ops.", + "UMask": "0x01" + }, + { + "EventName": "fp_ops_retired_by_type.scalar_sub", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point subtract ops.", + "UMask": "0x02" + }, + { + "EventName": "fp_ops_retired_by_type.scalar_mul", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point multiply ops.", + "UMask": "0x03" + }, + { + "EventName": "fp_ops_retired_by_type.scalar_mac", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point multiply-accumulate ops.", + "UMask": "0x04" + }, + { + "EventName": "fp_ops_retired_by_type.scalar_div", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point divide ops.", + "UMask": "0x05" + }, + { + "EventName": "fp_ops_retired_by_type.scalar_sqrt", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point square root ops.", + "UMask": "0x06" + }, + { + "EventName": "fp_ops_retired_by_type.scalar_cmp", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point compare ops.", + "UMask": "0x07" + }, + { + "EventName": "fp_ops_retired_by_type.scalar_cvt", + 
"EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point convert ops.", + "UMask": "0x08" + }, + { + "EventName": "fp_ops_retired_by_type.scalar_blend", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point blend ops.", + "UMask": "0x09" + }, + { + "EventName": "fp_ops_retired_by_type.scalar_other", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point ops of other types.", + "UMask": "0x0e" + }, + { + "EventName": "fp_ops_retired_by_type.scalar_all", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point ops of all types.", + "UMask": "0x0f" + }, + { + "EventName": "fp_ops_retired_by_type.vector_add", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point add ops.", + "UMask": "0x10" + }, + { + "EventName": "fp_ops_retired_by_type.vector_sub", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point subtract ops.", + "UMask": "0x20" + }, + { + "EventName": "fp_ops_retired_by_type.vector_mul", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point multiply ops.", + "UMask": "0x30" + }, + { + "EventName": "fp_ops_retired_by_type.vector_mac", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point multiply-accumulate ops.", + "UMask": "0x40" + }, + { + "EventName": "fp_ops_retired_by_type.vector_div", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point divide ops.", + "UMask": "0x50" + }, + { + "EventName": "fp_ops_retired_by_type.vector_sqrt", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point square root ops.", + "UMask": "0x60" + }, + { + "EventName": "fp_ops_retired_by_type.vector_cmp", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point compare ops.", + "UMask": "0x70" + }, + { + "EventName": "fp_ops_retired_by_type.vector_cvt", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point convert ops.", + "UMask": 
"0x80" + }, + { + "EventName": "fp_ops_retired_by_type.vector_blend", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point blend ops.", + "UMask": "0x90" + }, + { + "EventName": "fp_ops_retired_by_type.vector_shuffle", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point shuffle ops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).", + "UMask": "0xb0" + }, + { + "EventName": "fp_ops_retired_by_type.vector_logical", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point logical ops.", + "UMask": "0xd0" + }, + { + "EventName": "fp_ops_retired_by_type.vector_other", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point ops of other types.", + "UMask": "0xe0" + }, + { + "EventName": "fp_ops_retired_by_type.vector_all", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point ops of all types.", + "UMask": "0xf0" + }, + { + "EventName": "fp_ops_retired_by_type.all", + "EventCode": "0x0a", + "BriefDescription": "Retired floating-point ops of all types.", + "UMask": "0xff" + }, + { + "EventName": "sse_avx_ops_retired.mmx_add", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer add ops.", + "UMask": "0x01" + }, + { + "EventName": "sse_avx_ops_retired.mmx_sub", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer subtract ops.", + "UMask": "0x02" + }, + { + "EventName": "sse_avx_ops_retired.mmx_mul", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer multiply ops.", + "UMask": "0x03" + }, + { + "EventName": "sse_avx_ops_retired.mmx_mac", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer multiply-accumulate ops.", + "UMask": "0x04" + }, + { + "EventName": "sse_avx_ops_retired.mmx_cmp", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer compare ops.", + "UMask": "0x07" + }, + { + "EventName": 
"sse_avx_ops_retired.mmx_shift", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer shift ops.", + "UMask": "0x09" + }, + { + "EventName": "sse_avx_ops_retired.mmx_mov", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer MOV ops.", + "UMask": "0x0a" + }, + { + "EventName": "sse_avx_ops_retired.mmx_shuffle", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer shuffle ops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).", + "UMask": "0x0b" + }, + { + "EventName": "sse_avx_ops_retired.mmx_pack", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer pack ops.", + "UMask": "0x0c" + }, + { + "EventName": "sse_avx_ops_retired.mmx_logical", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer logical ops.", + "UMask": "0x0d" + }, + { + "EventName": "sse_avx_ops_retired.mmx_other", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer ops of other types.", + "UMask": "0x0e" + }, + { + "EventName": "sse_avx_ops_retired.mmx_all", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer ops of all types.", + "UMask": "0x0f" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_add", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer add ops.", + "UMask": "0x10" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_sub", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer subtract ops.", + "UMask": "0x20" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_mul", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer multiply ops.", + "UMask": "0x30" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_mac", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer multiply-accumulate ops.", + "UMask": "0x40" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_aes", + "EventCode": "0x0b", + "BriefDescription": 
"Retired SSE and AVX integer AES ops.", + "UMask": "0x50" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_sha", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer SHA ops.", + "UMask": "0x60" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_cmp", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer compare ops.", + "UMask": "0x70" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_clm", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer CLM ops.", + "UMask": "0x80" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_shift", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer shift ops.", + "UMask": "0x90" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_mov", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer MOV ops.", + "UMask": "0xa0" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_shuffle", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer shuffle ops (may include instructions not necessarily thought of as including shuffles e.g. 
horizontal add, dot product, and certain MOV instructions).", + "UMask": "0xb0" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_pack", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer pack ops.", + "UMask": "0xc0" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_logical", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer logical ops.", + "UMask": "0xd0" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_other", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer ops of other types.", + "UMask": "0xe0" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_all", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer ops of all types.", + "UMask": "0xf0" + }, + { + "EventName": "sse_avx_ops_retired.all", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE, AVX and MMX integer ops of all types.", + "UMask": "0xff" + }, + { + "EventName": "fp_pack_ops_retired.fp128_add", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point add ops.", + "UMask": "0x01" + }, + { + "EventName": "fp_pack_ops_retired.fp128_sub", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point subtract ops.", + "UMask": "0x02" + }, + { + "EventName": "fp_pack_ops_retired.fp128_mul", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point multiply ops.", + "UMask": "0x03" + }, + { + "EventName": "fp_pack_ops_retired.fp128_mac", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point multiply-accumulate ops.", + "UMask": "0x04" + }, + { + "EventName": "fp_pack_ops_retired.fp128_div", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point divide ops.", + "UMask": "0x05" + }, + { + "EventName": "fp_pack_ops_retired.fp128_sqrt", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point square root ops.", + "UMask": "0x06" + }, + { + 
"EventName": "fp_pack_ops_retired.fp128_cmp", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point compare ops.", + "UMask": "0x07" + }, + { + "EventName": "fp_pack_ops_retired.fp128_cvt", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point convert ops.", + "UMask": "0x08" + }, + { + "EventName": "fp_pack_ops_retired.fp128_blend", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point blend ops.", + "UMask": "0x09" + }, + { + "EventName": "fp_pack_ops_retired.fp128_shuffle", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point shuffle ops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).", + "UMask": "0x0b" + }, + { + "EventName": "fp_pack_ops_retired.fp128_logical", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point logical ops.", + "UMask": "0x0d" + }, + { + "EventName": "fp_pack_ops_retired.fp128_other", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point ops of other types.", + "UMask": "0x0e" + }, + { + "EventName": "fp_pack_ops_retired.fp128_all", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point ops of all types.", + "UMask": "0x0f" + }, + { + "EventName": "fp_pack_ops_retired.fp256_add", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point add ops.", + "UMask": "0x10" + }, + { + "EventName": "fp_pack_ops_retired.fp256_sub", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point subtract ops.", + "UMask": "0x20" + }, + { + "EventName": "fp_pack_ops_retired.fp256_mul", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point multiply ops.", + "UMask": "0x30" + }, + { + "EventName": "fp_pack_ops_retired.fp256_mac", + "EventCode": "0x0c", + "BriefDescription": "Retired 
256-bit packed floating-point multiply-accumulate ops.", + "UMask": "0x40" + }, + { + "EventName": "fp_pack_ops_retired.fp256_div", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point divide ops.", + "UMask": "0x50" + }, + { + "EventName": "fp_pack_ops_retired.fp256_sqrt", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point square root ops.", + "UMask": "0x60" + }, + { + "EventName": "fp_pack_ops_retired.fp256_cmp", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point compare ops.", + "UMask": "0x70" + }, + { + "EventName": "fp_pack_ops_retired.fp256_cvt", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point convert ops.", + "UMask": "0x80" + }, + { + "EventName": "fp_pack_ops_retired.fp256_blend", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point blend ops.", + "UMask": "0x90" + }, + { + "EventName": "fp_pack_ops_retired.fp256_shuffle", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point shuffle ops (may include instructions not necessarily thought of as including shuffles e.g. 
horizontal add, dot product, and certain MOV instructions).", + "UMask": "0xb0" + }, + { + "EventName": "fp_pack_ops_retired.fp256_logical", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point logical ops.", + "UMask": "0xd0" + }, + { + "EventName": "fp_pack_ops_retired.fp256_other", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point ops of other types.", + "UMask": "0xe0" + }, + { + "EventName": "fp_pack_ops_retired.fp256_all", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point ops of all types.", + "UMask": "0xf0" + }, + { + "EventName": "fp_pack_ops_retired.all", + "EventCode": "0x0c", + "BriefDescription": "Retired packed floating-point ops of all types.", + "UMask": "0xff" + }, + { + "EventName": "packed_int_op_type.int128_add", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer add ops.", + "UMask": "0x01" + }, + { + "EventName": "packed_int_op_type.int128_sub", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer subtract ops.", + "UMask": "0x02" + }, + { + "EventName": "packed_int_op_type.int128_mul", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer multiply ops.", + "UMask": "0x03" + }, + { + "EventName": "packed_int_op_type.int128_mac", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer multiply-accumulate ops.", + "UMask": "0x04" + }, + { + "EventName": "packed_int_op_type.int128_aes", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer AES ops.", + "UMask": "0x05" + }, + { + "EventName": "packed_int_op_type.int128_sha", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer SHA ops.", + "UMask": "0x06" + }, + { + "EventName": "packed_int_op_type.int128_cmp", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer compare ops.", + "UMask": "0x07" + }, + { + "EventName": 
"packed_int_op_type.int128_clm", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer CLM ops.", + "UMask": "0x08" + }, + { + "EventName": "packed_int_op_type.int128_shift", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer shift ops.", + "UMask": "0x09" + }, + { + "EventName": "packed_int_op_type.int128_mov", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer MOV ops.", + "UMask": "0x0a" + }, + { + "EventName": "packed_int_op_type.int128_shuffle", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer shuffle ops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).", + "UMask": "0x0b" + }, + { + "EventName": "packed_int_op_type.int128_pack", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer pack ops.", + "UMask": "0x0c" + }, + { + "EventName": "packed_int_op_type.int128_logical", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer logical ops.", + "UMask": "0x0d" + }, + { + "EventName": "packed_int_op_type.int128_other", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer ops of other types.", + "UMask": "0x0e" + }, + { + "EventName": "packed_int_op_type.int128_all", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer ops of all types.", + "UMask": "0x0f" + }, + { + "EventName": "packed_int_op_type.int256_add", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer add ops.", + "UMask": "0x10" + }, + { + "EventName": "packed_int_op_type.int256_sub", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer subtract ops.", + "UMask": "0x20" + }, + { + "EventName": "packed_int_op_type.int256_mul", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer multiply ops.", + "UMask": "0x30" + }, + { + "EventName": 
"packed_int_op_type.int256_mac", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer multiply-accumulate ops.", + "UMask": "0x40" + }, + { + "EventName": "packed_int_op_type.int256_cmp", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer compare ops.", + "UMask": "0x70" + }, + { + "EventName": "packed_int_op_type.int256_shift", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer shift ops.", + "UMask": "0x90" + }, + { + "EventName": "packed_int_op_type.int256_mov", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer MOV ops.", + "UMask": "0xa0" + }, + { + "EventName": "packed_int_op_type.int256_shuffle", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer shuffle ops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).", + "UMask": "0xb0" + }, + { + "EventName": "packed_int_op_type.int256_pack", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer pack ops.", + "UMask": "0xc0" + }, + { + "EventName": "packed_int_op_type.int256_logical", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer logical ops.", + "UMask": "0xd0" + }, + { + "EventName": "packed_int_op_type.int256_other", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer ops of other types.", + "UMask": "0xe0" + }, + { + "EventName": "packed_int_op_type.int256_all", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer ops of all types.", + "UMask": "0xf0" + }, + { + "EventName": "packed_int_op_type.all", + "EventCode": "0x0d", + "BriefDescription": "Retired packed integer ops of all types.", + "UMask": "0xff" + }, + { + "EventName": "fp_disp_faults.x87_fill_fault", + "EventCode": "0x0e", + "BriefDescription": "Floating-point dispatch faults for x87 fills.", + "UMask": "0x01" + }, + { + "EventName": 
"fp_disp_faults.xmm_fill_fault", + "EventCode": "0x0e", + "BriefDescription": "Floating-point dispatch faults for XMM fills.", + "UMask": "0x02" + }, + { + "EventName": "fp_disp_faults.ymm_fill_fault", + "EventCode": "0x0e", + "BriefDescription": "Floating-point dispatch faults for YMM fills.", + "UMask": "0x04" + }, + { + "EventName": "fp_disp_faults.ymm_spill_fault", + "EventCode": "0x0e", + "BriefDescription": "Floating-point dispatch faults for YMM spills.", + "UMask": "0x08" + }, + { + "EventName": "fp_disp_faults.sse_avx_all", + "EventCode": "0x0e", + "BriefDescription": "Floating-point dispatch faults of all types for SSE and AVX ops.", + "UMask": "0x0e" + }, + { + "EventName": "fp_disp_faults.all", + "EventCode": "0x0e", + "BriefDescription": "Floating-point dispatch faults of all types.", + "UMask": "0x0f" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen5/inst-cache.json b/tools/perf/pmu-events/arch/x86/amdzen5/inst-cache.json new file mode 100644 index 0000000000..ad75e5bf95 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen5/inst-cache.json @@ -0,0 +1,72 @@ +[ + { + "EventName": "ic_cache_fill_l2", + "EventCode": "0x82", + "BriefDescription": "Instruction cache lines (64 bytes) fulfilled from the L2 cache." + }, + { + "EventName": "ic_cache_fill_sys", + "EventCode": "0x83", + "BriefDescription": "Instruction cache lines (64 bytes) fulfilled from system memory or another cache." + }, + { + "EventName": "ic_fetch_ibs_events.fetch_tagged", + "EventCode": "0x188", + "BriefDescription": "Fetches tagged by Fetch IBS. 
Not all tagged fetches result in a valid sample and an IBS interrupt.", + "UMask": "0x02" + }, + { + "EventName": "ic_fetch_ibs_events.sample_discarded", + "EventCode": "0x188", + "BriefDescription": "Fetches discarded after being tagged by Fetch IBS due to reasons other than IBS filtering.", + "UMask": "0x04" + }, + { + "EventName": "ic_fetch_ibs_events.sample_filtered", + "EventCode": "0x188", + "BriefDescription": "Fetches discarded after being tagged by Fetch IBS due to IBS filtering.", + "UMask": "0x08" + }, + { + "EventName": "ic_fetch_ibs_events.sample_valid", + "EventCode": "0x188", + "BriefDescription": "Fetches tagged by Fetch IBS that result in a valid sample and an IBS interrupt.", + "UMask": "0x10" + }, + { + "EventName": "ic_tag_hit_miss.instruction_cache_hit", + "EventCode": "0x18e", + "BriefDescription": "Instruction cache hits.", + "UMask": "0x07" + }, + { + "EventName": "ic_tag_hit_miss.instruction_cache_miss", + "EventCode": "0x18e", + "BriefDescription": "Instruction cache misses.", + "UMask": "0x18" + }, + { + "EventName": "ic_tag_hit_miss.all_instruction_cache_accesses", + "EventCode": "0x18e", + "BriefDescription": "Instruction cache accesses of all types.", + "UMask": "0x1f" + }, + { + "EventName": "op_cache_hit_miss.op_cache_hit", + "EventCode": "0x28f", + "BriefDescription": "Op cache hits.", + "UMask": "0x03" + }, + { + "EventName": "op_cache_hit_miss.op_cache_miss", + "EventCode": "0x28f", + "BriefDescription": "Op cache misses.", + "UMask": "0x04" + }, + { + "EventName": "op_cache_hit_miss.all_op_cache_accesses", + "EventCode": "0x28f", + "BriefDescription": "Op cache accesses of all types.", + "UMask": "0x07" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen5/l2-cache.json b/tools/perf/pmu-events/arch/x86/amdzen5/l2-cache.json new file mode 100644 index 0000000000..d1de51a029 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen5/l2-cache.json @@ -0,0 +1,266 @@ +[ + { + "EventName": "l2_request_g1.group2", + "EventCode": 
"0x60", + "BriefDescription": "L2 cache requests of non-cacheable type (non-cached data and instructions reads, self-modifying code checks).", + "UMask": "0x01" + }, + { + "EventName": "l2_request_g1.l2_hw_pf", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests: from hardware prefetchers to prefetch directly into L2 (hit or miss).", + "UMask": "0x02" + }, + { + "EventName": "l2_request_g1.prefetch_l2_cmd", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests: prefetch directly into L2.", + "UMask": "0x04" + }, + { + "EventName": "l2_request_g1.cacheable_ic_read", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests: instruction cache reads.", + "UMask": "0x10" + }, + { + "EventName": "l2_request_g1.ls_rd_blk_c_s", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests: data cache shared reads.", + "UMask": "0x20" + }, + { + "EventName": "l2_request_g1.rd_blk_x", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests: data cache stores.", + "UMask": "0x40" + }, + { + "EventName": "l2_request_g1.rd_blk_l", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests: data cache reads including hardware and software prefetch.", + "UMask": "0x80" + }, + { + "EventName": "l2_request_g1.all_dc", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests of common types from L1 data cache (including prefetches).", + "UMask": "0xe0" + }, + { + "EventName": "l2_request_g1.all_no_prefetch", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests of common types not including prefetches.", + "UMask": "0xf1" + }, + { + "EventName": "l2_request_g1.all", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests of all types.", + "UMask": "0xf7" + }, + { + "EventName": "l2_request_g2.ls_rd_sized_nc", + "EventCode": "0x61", + "BriefDescription": "L2 cache requests: non-coherent, non-cacheable LS sized reads.", + "UMask": "0x20" + }, + { + "EventName": "l2_request_g2.ls_rd_sized", + "EventCode": "0x61", + 
"BriefDescription": "L2 cache requests: coherent, non-cacheable LS sized reads.", + "UMask": "0x40" + }, + { + "EventName": "l2_wcb_req.wcb_close", + "EventCode": "0x63", + "BriefDescription": "Write Combining Buffer (WCB) closures.", + "UMask": "0x20" + }, + { + "EventName": "l2_cache_req_stat.ic_fill_miss", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) with status: instruction cache request miss in L2.", + "UMask": "0x01" + }, + { + "EventName": "l2_cache_req_stat.ic_fill_hit_s", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) with status: instruction cache hit non-modifiable line in L2.", + "UMask": "0x02" + }, + { + "EventName": "l2_cache_req_stat.ic_fill_hit_x", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) with status: instruction cache hit modifiable line in L2.", + "UMask": "0x04" + }, + { + "EventName": "l2_cache_req_stat.ic_hit_in_l2", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) for instruction cache hits.", + "UMask": "0x06" + }, + { + "EventName": "l2_cache_req_stat.ic_access_in_l2", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) for instruction cache access.", + "UMask": "0x07" + }, + { + "EventName": "l2_cache_req_stat.ls_rd_blk_c", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) with status: data cache request miss in L2.", + "UMask": "0x08" + }, + { + "EventName": "l2_cache_req_stat.ic_dc_miss_in_l2", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) for data and instruction cache misses.", + "UMask": "0x09" + }, + { + "EventName": "l2_cache_req_stat.ls_rd_blk_x", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) with status: data cache 
store or state change hit in L2.", + "UMask": "0x10" + }, + { + "EventName": "l2_cache_req_stat.ls_rd_blk_l_hit_s", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) with status: data cache read hit non-modifiable line in L2.", + "UMask": "0x20" + }, + { + "EventName": "l2_cache_req_stat.ls_rd_blk_l_hit_x", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) with status: data cache read hit modifiable line in L2.", + "UMask": "0x40" + }, + { + "EventName": "l2_cache_req_stat.ls_rd_blk_cs", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) with status: data cache shared read hit in L2.", + "UMask": "0x80" + }, + { + "EventName": "l2_cache_req_stat.dc_hit_in_l2", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) for data cache hits.", + "UMask": "0xf0" + }, + { + "EventName": "l2_cache_req_stat.ic_dc_hit_in_l2", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) for data and instruction cache hits.", + "UMask": "0xf6" + }, + { + "EventName": "l2_cache_req_stat.dc_access_in_l2", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) for data cache access.", + "UMask": "0xf8" + }, + { + "EventName": "l2_cache_req_stat.all", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) for data and instruction cache access.", + "UMask": "0xff" + }, + { + "EventName": "l2_pf_hit_l2.l2_hwpf", + "EventCode": "0x70", + "BriefDescription": "L2 prefetches accepted by the L2 pipeline which hit in the L2 cache and are generated from L2 hardware prefetchers.", + "UMask": "0x1f" + }, + { + "EventName": "l2_pf_hit_l2.l1_dc_hwpf", + "EventCode": "0x70", + "BriefDescription": "L2 prefetches accepted by the L2 pipeline which hit in the L2 cache and are 
generated from L1 data hardware prefetchers.", + "UMask": "0xe0" + }, + { + "EventName": "l2_pf_hit_l2.l1_dc_l2_hwpf", + "EventCode": "0x70", + "BriefDescription": "L2 prefetches accepted by the L2 pipeline which hit in the L2 cache and are generated from L1 data and L2 hardware prefetchers.", + "UMask": "0xff" + }, + { + "EventName": "l2_pf_miss_l2_hit_l3.l2_hwpf", + "EventCode": "0x71", + "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 cache but hit in the L3 cache and are generated from L2 hardware prefetchers.", + "UMask": "0x1f" + }, + { + "EventName": "l2_pf_miss_l2_hit_l3.l1_dc_hwpf", + "EventCode": "0x71", + "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 cache but hit in the L3 cache and are generated from L1 data hardware prefetchers.", + "UMask": "0xe0" + }, + { + "EventName": "l2_pf_miss_l2_hit_l3.l1_dc_l2_hwpf", + "EventCode": "0x71", + "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 cache but hit in the L3 cache and are generated from L1 data and L2 hardware prefetchers.", + "UMask": "0xff" + }, + { + "EventName": "l2_pf_miss_l2_l3.l2_hwpf", + "EventCode": "0x72", + "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 as well as the L3 caches and are generated from L2 hardware prefetchers.", + "UMask": "0x1f" + }, + { + "EventName": "l2_pf_miss_l2_l3.l1_dc_hwpf", + "EventCode": "0x72", + "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 as well as the L3 caches and are generated from L1 data hardware prefetchers.", + "UMask": "0xe0" + }, + { + "EventName": "l2_pf_miss_l2_l3.l1_dc_l2_hwpf", + "EventCode": "0x72", + "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 as well as the L3 caches and are generated from L1 data and L2 hardware prefetchers.", + "UMask": "0xff" + }, + { + "EventName": "l2_fill_rsp_src.local_ccx", + "EventCode": "0x165", + "BriefDescription": 
"L2 cache fills from L3 cache or different L2 cache in the same CCX.", + "UMask": "0x02" + }, + { + "EventName": "l2_fill_rsp_src.near_cache", + "EventCode": "0x165", + "BriefDescription": "L2 cache fills from cache of another CCX when the address was in the same NUMA node.", + "UMask": "0x04" + }, + { + "EventName": "l2_fill_rsp_src.dram_io_near", + "EventCode": "0x165", + "BriefDescription": "L2 cache fills from either DRAM or MMIO in the same NUMA node.", + "UMask": "0x08" + }, + { + "EventName": "l2_fill_rsp_src.far_cache", + "EventCode": "0x165", + "BriefDescription": "L2 cache fills from cache of another CCX when the address was in a different NUMA node.", + "UMask": "0x10" + }, + { + "EventName": "l2_fill_rsp_src.dram_io_far", + "EventCode": "0x165", + "BriefDescription": "L2 cache fills from either DRAM or MMIO in a different NUMA node (same or different socket).", + "UMask": "0x40" + }, + { + "EventName": "l2_fill_rsp_src.alternate_memories", + "EventCode": "0x165", + "BriefDescription": "L2 cache fills from extension memory.", + "UMask": "0x80" + }, + { + "EventName": "l2_fill_rsp_src.all", + "EventCode": "0x165", + "BriefDescription": "L2 cache fills from all types of data sources.", + "UMask": "0xde" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen5/l3-cache.json b/tools/perf/pmu-events/arch/x86/amdzen5/l3-cache.json new file mode 100644 index 0000000000..b50fe14d45 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen5/l3-cache.json @@ -0,0 +1,177 @@ +[ + { + "EventName": "l3_lookup_state.l3_miss", + "EventCode": "0x04", + "BriefDescription": "L3 cache misses.", + "UMask": "0x01", + "Unit": "L3PMC" + }, + { + "EventName": "l3_lookup_state.l3_hit", + "EventCode": "0x04", + "BriefDescription": "L3 cache hits.", + "UMask": "0xfe", + "Unit": "L3PMC" + }, + { + "EventName": "l3_lookup_state.all_coherent_accesses_to_l3", + "EventCode": "0x04", + "BriefDescription": "L3 cache requests for all coherent accesses.", + "UMask": "0xff", + "Unit": 
"L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency.dram_near", + "EventCode": "0xac", + "BriefDescription": "Average sampled latency when data is sourced from DRAM in the same NUMA node.", + "UMask": "0x01", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency.dram_far", + "EventCode": "0xac", + "BriefDescription": "Average sampled latency when data is sourced from DRAM in a different NUMA node.", + "UMask": "0x02", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency.near_cache", + "EventCode": "0xac", + "BriefDescription": "Average sampled latency when data is sourced from another CCX's cache when the address was in the same NUMA node.", + "UMask": "0x04", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency.far_cache", + "EventCode": "0xac", + "BriefDescription": "Average sampled latency when data is sourced from another CCX's cache when the address was in a different NUMA node.", + "UMask": "0x08", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency.ext_near", + "EventCode": "0xac", + "BriefDescription": "Average sampled latency when data is sourced from extension memory (CXL) in the same NUMA node.", + "UMask": "0x10", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency.ext_far", + "EventCode": "0xac", + "BriefDescription": "Average sampled latency when data is sourced from extension memory (CXL) in a different NUMA node.", + "UMask": "0x20", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, 
+ { + "EventName": "l3_xi_sampled_latency.all", + "EventCode": "0xac", + "BriefDescription": "Average sampled latency from all data sources.", + "UMask": "0x3f", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency_requests.dram_near", + "EventCode": "0xad", + "BriefDescription": "L3 cache fill requests sourced from DRAM in the same NUMA node.", + "UMask": "0x01", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency_requests.dram_far", + "EventCode": "0xad", + "BriefDescription": "L3 cache fill requests sourced from DRAM in a different NUMA node.", + "UMask": "0x02", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency_requests.near_cache", + "EventCode": "0xad", + "BriefDescription": "L3 cache fill requests sourced from another CCX's cache when the address was in the same NUMA node.", + "UMask": "0x04", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency_requests.far_cache", + "EventCode": "0xad", + "BriefDescription": "L3 cache fill requests sourced from another CCX's cache when the address was in a different NUMA node.", + "UMask": "0x08", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency_requests.ext_near", + "EventCode": "0xad", + "BriefDescription": "L3 cache fill requests sourced from extension memory (CXL) in the same NUMA node.", + "UMask": "0x10", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency_requests.ext_far", + "EventCode": "0xad", + 
"BriefDescription": "L3 cache fill requests sourced from extension memory (CXL) in a different NUMA node.", + "UMask": "0x20", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency_requests.all", + "EventCode": "0xad", + "BriefDescription": "L3 cache fill requests sourced from all data sources.", + "UMask": "0x3f", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen5/load-store.json b/tools/perf/pmu-events/arch/x86/amdzen5/load-store.json new file mode 100644 index 0000000000..af2fdf1f55 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen5/load-store.json @@ -0,0 +1,451 @@ +[ + { + "EventName": "ls_bad_status2.stli_other", + "EventCode": "0x24", + "BriefDescription": "Store-to-load conflicts (load unable to complete due to a non-forwardable conflict with an older store).", + "UMask": "0x02" + }, + { + "EventName": "ls_locks.bus_lock", + "EventCode": "0x25", + "BriefDescription": "Retired Lock instructions which caused a bus lock.", + "UMask": "0x01" + }, + { + "EventName": "ls_ret_cl_flush", + "EventCode": "0x26", + "BriefDescription": "Retired CLFLUSH instructions." + }, + { + "EventName": "ls_ret_cpuid", + "EventCode": "0x27", + "BriefDescription": "Retired CPUID instructions." 
+ }, + { + "EventName": "ls_dispatch.ld_dispatch", + "EventCode": "0x29", + "BriefDescription": "Number of memory load operations dispatched to the load-store unit.", + "UMask": "0x01" + }, + { + "EventName": "ls_dispatch.store_dispatch", + "EventCode": "0x29", + "BriefDescription": "Number of memory store operations dispatched to the load-store unit.", + "UMask": "0x02" + }, + { + "EventName": "ls_dispatch.ld_st_dispatch", + "EventCode": "0x29", + "BriefDescription": "Number of memory load-store operations dispatched to the load-store unit.", + "UMask": "0x04" + }, + { + "EventName": "ls_dispatch.all", + "EventCode": "0x29", + "BriefDescription": "Number of memory operations dispatched to the load-store unit.", + "UMask": "0x07" + }, + { + "EventName": "ls_smi_rx", + "EventCode": "0x2b", + "BriefDescription": "SMIs received." + }, + { + "EventName": "ls_int_taken", + "EventCode": "0x2c", + "BriefDescription": "Interrupts taken." + }, + { + "EventName": "ls_stlf", + "EventCode": "0x35", + "BriefDescription": "Store-to-load-forward (STLF) hits." 
+ }, + { + "EventName": "ls_st_commit_cancel2.st_commit_cancel_wcb_full", + "EventCode": "0x37", + "BriefDescription": "Non-cacheable store commits cancelled due to the non-cacheable commit buffer being full.", + "UMask": "0x01" + }, + { + "EventName": "ls_mab_alloc.load_store_allocations", + "EventCode": "0x41", + "BriefDescription": "Miss Address Buffer (MAB) entries allocated by a Load-Store (LS) pipe for load-store allocations.", + "UMask": "0x3f" + }, + { + "EventName": "ls_mab_alloc.hardware_prefetcher_allocations", + "EventCode": "0x41", + "BriefDescription": "Miss Address Buffer (MAB) entries allocated by a Load-Store (LS) pipe for hardware prefetcher allocations.", + "UMask": "0x40" + }, + { + "EventName": "ls_mab_alloc.all_allocations", + "EventCode": "0x41", + "BriefDescription": "Miss Address Buffer (MAB) entries allocated by a Load-Store (LS) pipe for all types of allocations.", + "UMask": "0x7f" + }, + { + "EventName": "ls_dmnd_fills_from_sys.local_l2", + "EventCode": "0x43", + "BriefDescription": "Demand data cache fills from local L2 cache.", + "UMask": "0x01" + }, + { + "EventName": "ls_dmnd_fills_from_sys.local_ccx", + "EventCode": "0x43", + "BriefDescription": "Demand data cache fills from L3 cache or different L2 cache in the same CCX.", + "UMask": "0x02" + }, + { + "EventName": "ls_dmnd_fills_from_sys.near_cache", + "EventCode": "0x43", + "BriefDescription": "Demand data cache fills from cache of another CCX when the address was in the same NUMA node.", + "UMask": "0x04" + }, + { + "EventName": "ls_dmnd_fills_from_sys.dram_io_near", + "EventCode": "0x43", + "BriefDescription": "Demand data cache fills from either DRAM or MMIO in the same NUMA node.", + "UMask": "0x08" + }, + { + "EventName": "ls_dmnd_fills_from_sys.far_cache", + "EventCode": "0x43", + "BriefDescription": "Demand data cache fills from cache of another CCX when the address was in a different NUMA node.", + "UMask": "0x10" + }, + { + "EventName": 
"ls_dmnd_fills_from_sys.dram_io_far", + "EventCode": "0x43", + "BriefDescription": "Demand data cache fills from either DRAM or MMIO in a different NUMA node (same or different socket).", + "UMask": "0x40" + }, + { + "EventName": "ls_dmnd_fills_from_sys.alternate_memories", + "EventCode": "0x43", + "BriefDescription": "Demand data cache fills from extension memory.", + "UMask": "0x80" + }, + { + "EventName": "ls_dmnd_fills_from_sys.all", + "EventCode": "0x43", + "BriefDescription": "Demand data cache fills from all types of data sources.", + "UMask": "0xff" + }, + { + "EventName": "ls_any_fills_from_sys.local_l2", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills from local L2 cache.", + "UMask": "0x01" + }, + { + "EventName": "ls_any_fills_from_sys.local_ccx", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills from L3 cache or different L2 cache in the same CCX.", + "UMask": "0x02" + }, + { + "EventName": "ls_any_fills_from_sys.local_all", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills from local L2 cache or L3 cache or different L2 cache in the same CCX.", + "UMask": "0x03" + }, + { + "EventName": "ls_any_fills_from_sys.near_cache", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills from cache of another CCX when the address was in the same NUMA node.", + "UMask": "0x04" + }, + { + "EventName": "ls_any_fills_from_sys.dram_io_near", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills from either DRAM or MMIO in the same NUMA node.", + "UMask": "0x08" + }, + { + "EventName": "ls_any_fills_from_sys.far_cache", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills from cache of another CCX when the address was in a different NUMA node.", + "UMask": "0x10" + }, + { + "EventName": "ls_any_fills_from_sys.remote_cache", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills from cache of another CCX when the address was in the same or a different NUMA node.", + 
"UMask": "0x14" + }, + { + "EventName": "ls_any_fills_from_sys.dram_io_far", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills from either DRAM or MMIO in a different NUMA node (same or different socket).", + "UMask": "0x40" + }, + { + "EventName": "ls_any_fills_from_sys.dram_io_all", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills from either DRAM or MMIO in any NUMA node (same or different socket).", + "UMask": "0x48" + }, + { + "EventName": "ls_any_fills_from_sys.far_all", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills from either cache of another CCX, DRAM or MMIO when the address was in a different NUMA node (same or different socket).", + "UMask": "0x50" + }, + { + "EventName": "ls_any_fills_from_sys.all_dram_io", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills from either DRAM or MMIO in any NUMA node (same or different socket).", + "UMask": "0x48" + }, + { + "EventName": "ls_any_fills_from_sys.alternate_memories", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills from extension memory.", + "UMask": "0x80" + }, + { + "EventName": "ls_any_fills_from_sys.all", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills from all types of data sources.", + "UMask": "0xff" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_4k_l2_hit", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses with L2 DTLB hits for 4k pages.", + "UMask": "0x01" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_coalesced_page_hit", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses with L2 DTLB hits for coalesced pages. 
A coalesced page is a 16k page created from four adjacent 4k pages.", + "UMask": "0x02" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_2m_l2_hit", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses with L2 DTLB hits for 2M pages.", + "UMask": "0x04" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_1g_l2_hit", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses with L2 DTLB hits for 1G pages.", + "UMask": "0x08" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_4k_l2_miss", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses with L2 DTLB misses (page-table walks are requested) for 4k pages.", + "UMask": "0x10" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_coalesced_page_miss", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses with L2 DTLB misses (page-table walks are requested) for coalesced pages. A coalesced page is a 16k page created from four adjacent 4k pages.", + "UMask": "0x20" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_2m_l2_miss", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses with L2 DTLB misses (page-table walks are requested) for 2M pages.", + "UMask": "0x40" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_1g_l2_miss", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses with L2 DTLB misses (page-table walks are requested) for 1G pages.", + "UMask": "0x80" + }, + { + "EventName": "ls_l1_d_tlb_miss.all_l2_miss", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses with L2 DTLB misses (page-table walks are requested) for all page sizes.", + "UMask": "0xf0" + }, + { + "EventName": "ls_l1_d_tlb_miss.all", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses for all page sizes.", + "UMask": "0xff" + }, + { + "EventName": "ls_misal_loads.ma64", + "EventCode": "0x47", + "BriefDescription": "64B misaligned (cacheline crossing) loads.", + "UMask": "0x01" + }, + { + "EventName": "ls_misal_loads.ma4k", + "EventCode": "0x47", + "BriefDescription": "4kB 
misaligned (page crossing) loads.", + "UMask": "0x02" + }, + { + "EventName": "ls_pref_instr_disp.prefetch", + "EventCode": "0x4b", + "BriefDescription": "Software prefetch instructions dispatched (speculative) of type PrefetchT0 (move data to all cache levels), T1 (move data to all cache levels except L1) and T2 (move data to all cache levels except L1 and L2).", + "UMask": "0x01" + }, + { + "EventName": "ls_pref_instr_disp.prefetch_w", + "EventCode": "0x4b", + "BriefDescription": "Software prefetch instructions dispatched (speculative) of type PrefetchW (move data to L1 cache and mark it modifiable).", + "UMask": "0x02" + }, + { + "EventName": "ls_pref_instr_disp.prefetch_nta", + "EventCode": "0x4b", + "BriefDescription": "Software prefetch instructions dispatched (speculative) of type PrefetchNTA (move data with minimum cache pollution i.e. non-temporal access).", + "UMask": "0x04" + }, + { + "EventName": "ls_pref_instr_disp.all", + "EventCode": "0x4b", + "BriefDescription": "Software prefetch instructions dispatched (speculative) of all types.", + "UMask": "0x07" + }, + { + "EventName": "wcb_close.full_line_64b", + "EventCode": "0x50", + "BriefDescription": "Number of events that caused a Write Combining Buffer (WCB) entry to close because all 64 bytes of the entry have been written to.", + "UMask": "0x01" + }, + { + "EventName": "ls_inef_sw_pref.data_pipe_sw_pf_dc_hit", + "EventCode": "0x52", + "BriefDescription": "Software prefetches that did not fetch data outside of the processor core as the PREFETCH instruction saw a data cache hit.", + "UMask": "0x01" + }, + { + "EventName": "ls_inef_sw_pref.mab_mch_cnt", + "EventCode": "0x52", + "BriefDescription": "Software prefetches that did not fetch data outside of the processor core as the PREFETCH instruction saw a match on an already allocated Miss Address Buffer (MAB).", + "UMask": "0x02" + }, + { + "EventName": "ls_inef_sw_pref.all", + "EventCode": "0x52", + "BriefDescription": "Software prefetches that did
not fetch data outside of the processor core for any reason.", + "UMask": "0x03" + }, + { + "EventName": "ls_sw_pf_dc_fills.local_l2", + "EventCode": "0x59", + "BriefDescription": "Software prefetch data cache fills from local L2 cache.", + "UMask": "0x01" + }, + { + "EventName": "ls_sw_pf_dc_fills.local_ccx", + "EventCode": "0x59", + "BriefDescription": "Software prefetch data cache fills from L3 cache or different L2 cache in the same CCX.", + "UMask": "0x02" + }, + { + "EventName": "ls_sw_pf_dc_fills.near_cache", + "EventCode": "0x59", + "BriefDescription": "Software prefetch data cache fills from cache of another CCX in the same NUMA node.", + "UMask": "0x04" + }, + { + "EventName": "ls_sw_pf_dc_fills.dram_io_near", + "EventCode": "0x59", + "BriefDescription": "Software prefetch data cache fills from either DRAM or MMIO in the same NUMA node.", + "UMask": "0x08" + }, + { + "EventName": "ls_sw_pf_dc_fills.far_cache", + "EventCode": "0x59", + "BriefDescription": "Software prefetch data cache fills from cache of another CCX in a different NUMA node.", + "UMask": "0x10" + }, + { + "EventName": "ls_sw_pf_dc_fills.dram_io_far", + "EventCode": "0x59", + "BriefDescription": "Software prefetch data cache fills from either DRAM or MMIO in a different NUMA node (same or different socket).", + "UMask": "0x40" + }, + { + "EventName": "ls_sw_pf_dc_fills.alternate_memories", + "EventCode": "0x59", + "BriefDescription": "Software prefetch data cache fills from extension memory.", + "UMask": "0x80" + }, + { + "EventName": "ls_sw_pf_dc_fills.all", + "EventCode": "0x59", + "BriefDescription": "Software prefetch data cache fills from all types of data sources.", + "UMask": "0xdf" + }, + { + "EventName": "ls_hw_pf_dc_fills.local_l2", + "EventCode": "0x5a", + "BriefDescription": "Hardware prefetch data cache fills from local L2 cache.", + "UMask": "0x01" + }, + { + "EventName": "ls_hw_pf_dc_fills.local_ccx", + "EventCode": "0x5a", + "BriefDescription": "Hardware prefetch data cache 
fills from L3 cache or different L2 cache in the same CCX.", + "UMask": "0x02" + }, + { + "EventName": "ls_hw_pf_dc_fills.near_cache", + "EventCode": "0x5a", + "BriefDescription": "Hardware prefetch data cache fills from cache of another CCX when the address was in the same NUMA node.", + "UMask": "0x04" + }, + { + "EventName": "ls_hw_pf_dc_fills.dram_io_near", + "EventCode": "0x5a", + "BriefDescription": "Hardware prefetch data cache fills from either DRAM or MMIO in the same NUMA node.", + "UMask": "0x08" + }, + { + "EventName": "ls_hw_pf_dc_fills.far_cache", + "EventCode": "0x5a", + "BriefDescription": "Hardware prefetch data cache fills from cache of another CCX when the address was in a different NUMA node.", + "UMask": "0x10" + }, + { + "EventName": "ls_hw_pf_dc_fills.dram_io_far", + "EventCode": "0x5a", + "BriefDescription": "Hardware prefetch data cache fills from either DRAM or MMIO in a different NUMA node (same or different socket).", + "UMask": "0x40" + }, + { + "EventName": "ls_hw_pf_dc_fills.alternate_memories", + "EventCode": "0x5a", + "BriefDescription": "Hardware prefetch data cache fills from extension memory.", + "UMask": "0x80" + }, + { + "EventName": "ls_hw_pf_dc_fills.all", + "EventCode": "0x5a", + "BriefDescription": "Hardware prefetch data cache fills from all types of data sources.", + "UMask": "0xdf" + }, + { + "EventName": "ls_alloc_mab_count", + "EventCode": "0x5f", + "BriefDescription": "In-flight L1 data cache misses i.e. Miss Address Buffer (MAB) allocations each cycle." + }, + { + "EventName": "ls_not_halted_cyc", + "EventCode": "0x76", + "BriefDescription": "Core cycles not in halt." 
+ }, + { + "EventName": "ls_tlb_flush.all", + "EventCode": "0x78", + "BriefDescription": "All TLB Flushes.", + "UMask": "0xff" + }, + { + "EventName": "ls_not_halted_p0_cyc.p0_freq_cyc", + "EventCode": "0x120", + "BriefDescription": "Reference cycles (P0 frequency) not in halt.", + "UMask": "0x1" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen5/memory-controller.json b/tools/perf/pmu-events/arch/x86/amdzen5/memory-controller.json new file mode 100644 index 0000000000..1a629fc947 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen5/memory-controller.json @@ -0,0 +1,101 @@ +[ + { + "EventName": "umc_mem_clk", + "PublicDescription": "Number of memory clock (MEMCLK) cycles.", + "EventCode": "0x00", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_act_cmd.all", + "PublicDescription": "Number of ACTIVATE commands sent.", + "EventCode": "0x05", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_act_cmd.rd", + "PublicDescription": "Number of ACTIVATE commands sent for reads.", + "EventCode": "0x05", + "RdWrMask": "0x1", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_act_cmd.wr", + "PublicDescription": "Number of ACTIVATE commands sent for writes.", + "EventCode": "0x05", + "RdWrMask": "0x2", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_pchg_cmd.all", + "PublicDescription": "Number of PRECHARGE commands sent.", + "EventCode": "0x06", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_pchg_cmd.rd", + "PublicDescription": "Number of PRECHARGE commands sent for reads.", + "EventCode": "0x06", + "RdWrMask": "0x1", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_pchg_cmd.wr", + "PublicDescription": "Number of PRECHARGE commands sent for writes.", + "EventCode": "0x06", + "RdWrMask": "0x2", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_cas_cmd.all", + "PublicDescription": "Number of CAS commands sent.", + "EventCode": "0x0a", + "PerPkg": "1", +
"Unit": "UMCPMC" + }, + { + "EventName": "umc_cas_cmd.rd", + "PublicDescription": "Number of CAS commands sent for reads.", + "EventCode": "0x0a", + "RdWrMask": "0x1", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_cas_cmd.wr", + "PublicDescription": "Number of CAS commands sent for writes.", + "EventCode": "0x0a", + "RdWrMask": "0x2", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_data_slot_clks.all", + "PublicDescription": "Number of clock cycles used by the data bus.", + "EventCode": "0x14", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_data_slot_clks.rd", + "PublicDescription": "Number of clock cycles used by the data bus for reads.", + "EventCode": "0x14", + "RdWrMask": "0x1", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_data_slot_clks.wr", + "PublicDescription": "Number of clock cycles used by the data bus for writes.", + "EventCode": "0x14", + "RdWrMask": "0x2", + "PerPkg": "1", + "Unit": "UMCPMC" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen5/pipeline.json b/tools/perf/pmu-events/arch/x86/amdzen5/pipeline.json new file mode 100644 index 0000000000..d860bf599c --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen5/pipeline.json @@ -0,0 +1,99 @@ +[ + { + "MetricName": "total_dispatch_slots", + "BriefDescription": "Total dispatch slots (up to 8 instructions can be dispatched in each cycle).", + "MetricExpr": "8 * ls_not_halted_cyc", + "ScaleUnit": "1slots" + }, + { + "MetricName": "frontend_bound", + "BriefDescription": "Percentage of dispatch slots that remained unused because the frontend did not supply enough instructions/ops.", + "MetricExpr": "d_ratio(de_no_dispatch_per_slot.no_ops_from_frontend, total_dispatch_slots)", + "MetricGroup": "PipelineL1", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "bad_speculation", + "BriefDescription": "Percentage of dispatched ops that did not retire.", + "MetricExpr": "d_ratio(de_src_op_disp.all - ex_ret_ops, 
total_dispatch_slots)", + "MetricGroup": "PipelineL1", + "ScaleUnit": "100%ops" + }, + { + "MetricName": "backend_bound", + "BriefDescription": "Percentage of dispatch slots that remained unused because of backend stalls.", + "MetricExpr": "d_ratio(de_no_dispatch_per_slot.backend_stalls, total_dispatch_slots)", + "MetricGroup": "PipelineL1", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "smt_contention", + "BriefDescription": "Percentage of dispatch slots that remained unused because the other thread was selected.", + "MetricExpr": "d_ratio(de_no_dispatch_per_slot.smt_contention, total_dispatch_slots)", + "MetricGroup": "PipelineL1", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "retiring", + "BriefDescription": "Percentage of dispatch slots used by ops that retired.", + "MetricExpr": "d_ratio(ex_ret_ops, total_dispatch_slots)", + "MetricGroup": "PipelineL1", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "frontend_bound_by_latency", + "BriefDescription": "Percentage of dispatch slots that remained unused because of a latency bottleneck in the frontend (such as instruction cache or TLB misses).", + "MetricExpr": "d_ratio((8 * cpu@de_no_dispatch_per_slot.no_ops_from_frontend\\,cmask\\=0x8@), total_dispatch_slots)", + "MetricGroup": "PipelineL2;frontend_bound_group", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "frontend_bound_by_bandwidth", + "BriefDescription": "Percentage of dispatch slots that remained unused because of a bandwidth bottleneck in the frontend (such as decode or op cache fetch bandwidth).", + "MetricExpr": "d_ratio(de_no_dispatch_per_slot.no_ops_from_frontend - (8 * cpu@de_no_dispatch_per_slot.no_ops_from_frontend\\,cmask\\=0x8@), total_dispatch_slots)", + "MetricGroup": "PipelineL2;frontend_bound_group", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "bad_speculation_from_mispredicts", + "BriefDescription": "Percentage of dispatched ops that were flushed due to branch mispredicts.", + "MetricExpr": 
"d_ratio(bad_speculation * ex_ret_brn_misp, ex_ret_brn_misp + bp_redirects.resync)", + "MetricGroup": "PipelineL2;bad_speculation_group", + "ScaleUnit": "100%ops" + }, + { + "MetricName": "bad_speculation_from_pipeline_restarts", + "BriefDescription": "Percentage of dispatched ops that were flushed due to pipeline restarts (resyncs).", + "MetricExpr": "d_ratio(bad_speculation * bp_redirects.resync, ex_ret_brn_misp + bp_redirects.resync)", + "MetricGroup": "PipelineL2;bad_speculation_group", + "ScaleUnit": "100%ops" + }, + { + "MetricName": "backend_bound_by_memory", + "BriefDescription": "Percentage of dispatch slots that remained unused because of stalls due to the memory subsystem.", + "MetricExpr": "backend_bound * d_ratio(ex_no_retire.load_not_complete, ex_no_retire.not_complete)", + "MetricGroup": "PipelineL2;backend_bound_group", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "backend_bound_by_cpu", + "BriefDescription": "Percentage of dispatch slots that remained unused because of stalls not related to the memory subsystem.", + "MetricExpr": "backend_bound * (1 - d_ratio(ex_no_retire.load_not_complete, ex_no_retire.not_complete))", + "MetricGroup": "PipelineL2;backend_bound_group", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "retiring_from_fastpath", + "BriefDescription": "Percentage of dispatch slots used by fastpath ops that retired.", + "MetricExpr": "retiring * (1 - d_ratio(ex_ret_ucode_ops, ex_ret_ops))", + "MetricGroup": "PipelineL2;retiring_group", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "retiring_from_microcode", + "BriefDescription": "Percentage of dispatch slots used by microcode ops that retired.", + "MetricExpr": "retiring * d_ratio(ex_ret_ucode_ops, ex_ret_ops)", + "MetricGroup": "PipelineL2;retiring_group", + "ScaleUnit": "100%slots" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen5/recommended.json b/tools/perf/pmu-events/arch/x86/amdzen5/recommended.json new file mode 100644 index 0000000000..c97874039c --- 
/dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen5/recommended.json @@ -0,0 +1,345 @@ +[ + { + "MetricName": "branch_misprediction_rate", + "BriefDescription": "Execution-time branch misprediction rate (non-speculative).", + "MetricExpr": "d_ratio(ex_ret_brn_misp, ex_ret_brn)", + "MetricGroup": "branch_prediction", + "ScaleUnit": "1per_branch" + }, + { + "MetricName": "all_data_cache_accesses_pti", + "BriefDescription": "All data cache accesses per thousand instructions.", + "MetricExpr": "ls_dispatch.all / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "all_l2_cache_accesses_pti", + "BriefDescription": "All L2 cache accesses per thousand instructions.", + "MetricExpr": "(l2_request_g1.all_no_prefetch + l2_pf_hit_l2.l2_hwpf + l2_pf_miss_l2_hit_l3.l2_hwpf + l2_pf_miss_l2_l3.l2_hwpf) / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_accesses_from_l1_ic_misses_pti", + "BriefDescription": "L2 cache accesses from L1 instruction cache misses (including prefetch) per thousand instructions.", + "MetricExpr": "l2_request_g1.cacheable_ic_read / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_accesses_from_l1_dc_misses_pti", + "BriefDescription": "L2 cache accesses from L1 data cache misses (including prefetch) per thousand instructions.", + "MetricExpr": "l2_request_g1.all_dc / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_accesses_from_l2_hwpf_pti", + "BriefDescription": "L2 cache accesses from L2 cache hardware prefetcher per thousand instructions.", + "MetricExpr": "(l2_pf_hit_l2.l1_dc_l2_hwpf + l2_pf_miss_l2_hit_l3.l1_dc_l2_hwpf + l2_pf_miss_l2_l3.l1_dc_l2_hwpf) / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "all_l2_cache_misses_pti", + "BriefDescription": "All L2 
cache misses per thousand instructions.", + "MetricExpr": "(l2_cache_req_stat.ic_dc_miss_in_l2 + l2_pf_miss_l2_hit_l3.l2_hwpf + l2_pf_miss_l2_l3.l2_hwpf) / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_misses_from_l1_ic_miss_pti", + "BriefDescription": "L2 cache misses from L1 instruction cache misses per thousand instructions.", + "MetricExpr": "l2_cache_req_stat.ic_fill_miss / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_misses_from_l1_dc_miss_pti", + "BriefDescription": "L2 cache misses from L1 data cache misses per thousand instructions.", + "MetricExpr": "l2_cache_req_stat.ls_rd_blk_c / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_misses_from_l2_hwpf_pti", + "BriefDescription": "L2 cache misses from L2 cache hardware prefetcher per thousand instructions.", + "MetricExpr": "(l2_pf_miss_l2_hit_l3.l1_dc_l2_hwpf + l2_pf_miss_l2_l3.l1_dc_l2_hwpf) / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "all_l2_cache_hits_pti", + "BriefDescription": "All L2 cache hits per thousand instructions.", + "MetricExpr": "(l2_cache_req_stat.ic_dc_hit_in_l2 + l2_pf_hit_l2.l2_hwpf) / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_hits_from_l1_ic_miss_pti", + "BriefDescription": "L2 cache hits from L1 instruction cache misses per thousand instructions.", + "MetricExpr": "l2_cache_req_stat.ic_hit_in_l2 / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_hits_from_l1_dc_miss_pti", + "BriefDescription": "L2 cache hits from L1 data cache misses per thousand instructions.", + "MetricExpr": "l2_cache_req_stat.dc_hit_in_l2 / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + 
"MetricName": "l2_cache_hits_from_l2_hwpf_pti", + "BriefDescription": "L2 cache hits from L2 cache hardware prefetcher per thousand instructions.", + "MetricExpr": "l2_pf_hit_l2.l1_dc_l2_hwpf / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l3_cache_accesses", + "BriefDescription": "L3 cache accesses.", + "MetricExpr": "l3_lookup_state.all_coherent_accesses_to_l3", + "MetricGroup": "l3_cache" + }, + { + "MetricName": "l3_misses", + "BriefDescription": "L3 misses (including cacheline state change requests).", + "MetricExpr": "l3_lookup_state.l3_miss", + "MetricGroup": "l3_cache" + }, + { + "MetricName": "l3_read_miss_latency", + "BriefDescription": "Average L3 read miss latency (in core clocks).", + "MetricExpr": "(l3_xi_sampled_latency.all * 10) / l3_xi_sampled_latency_requests.all", + "MetricGroup": "l3_cache", + "ScaleUnit": "1ns" + }, + { + "MetricName": "l3_read_miss_latency_for_local_dram", + "BriefDescription": "Average L3 read miss latency (in core clocks) for local DRAM.", + "MetricExpr": "(l3_xi_sampled_latency.dram_near * 10) / l3_xi_sampled_latency_requests.dram_near", + "MetricGroup": "l3_cache", + "ScaleUnit": "1ns" + }, + { + "MetricName": "l3_read_miss_latency_for_remote_dram", + "BriefDescription": "Average L3 read miss latency (in core clocks) for remote DRAM.", + "MetricExpr": "(l3_xi_sampled_latency.dram_far * 10) / l3_xi_sampled_latency_requests.dram_far", + "MetricGroup": "l3_cache", + "ScaleUnit": "1ns" + }, + { + "MetricName": "op_cache_fetch_miss_ratio", + "BriefDescription": "Op cache miss ratio for all fetches.", + "MetricExpr": "d_ratio(op_cache_hit_miss.op_cache_miss, op_cache_hit_miss.all_op_cache_accesses)", + "ScaleUnit": "100%" + }, + { + "MetricName": "ic_fetch_miss_ratio", + "BriefDescription": "Instruction cache miss ratio for all fetches. 
An instruction cache miss will not be counted by this metric if it is an OC hit.", + "MetricExpr": "d_ratio(ic_tag_hit_miss.instruction_cache_miss, ic_tag_hit_miss.all_instruction_cache_accesses)", + "ScaleUnit": "100%" + }, + { + "MetricName": "l1_data_cache_fills_from_memory_pti", + "BriefDescription": "L1 data cache fills from DRAM or MMIO in any NUMA node per thousand instructions.", + "MetricExpr": "ls_any_fills_from_sys.dram_io_all / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_data_cache_fills_from_remote_node_pti", + "BriefDescription": "L1 data cache fills from a different NUMA node per thousand instructions.", + "MetricExpr": "ls_any_fills_from_sys.far_all / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_data_cache_fills_from_same_ccx_pti", + "BriefDescription": "L1 data cache fills from within the same CCX per thousand instructions.", + "MetricExpr": "ls_any_fills_from_sys.local_all / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_data_cache_fills_from_different_ccx_pti", + "BriefDescription": "L1 data cache fills from another CCX cache in any NUMA node per thousand instructions.", + "MetricExpr": "ls_any_fills_from_sys.remote_cache / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "all_l1_data_cache_fills_pti", + "BriefDescription": "All L1 data cache fills per thousand instructions.", + "MetricExpr": "ls_any_fills_from_sys.all / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_demand_data_cache_fills_from_local_l2_pti", + "BriefDescription": "L1 demand data cache fills from local L2 cache per thousand instructions.", + "MetricExpr": "ls_dmnd_fills_from_sys.local_l2 / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + 
"MetricName": "l1_demand_data_cache_fills_from_same_ccx_pti", + "BriefDescription": "L1 demand data cache fills from within the same CCX per thousand instructions.", + "MetricExpr": "ls_dmnd_fills_from_sys.local_ccx / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_demand_data_cache_fills_from_near_cache_pti", + "BriefDescription": "L1 demand data cache fills from another CCX cache in the same NUMA node per thousand instructions.", + "MetricExpr": "ls_dmnd_fills_from_sys.near_cache / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_demand_data_cache_fills_from_near_memory_pti", + "BriefDescription": "L1 demand data cache fills from DRAM or MMIO in the same NUMA node per thousand instructions.", + "MetricExpr": "ls_dmnd_fills_from_sys.dram_io_near / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_demand_data_cache_fills_from_far_cache_pti", + "BriefDescription": "L1 demand data cache fills from another CCX cache in a different NUMA node per thousand instructions.", + "MetricExpr": "ls_dmnd_fills_from_sys.far_cache / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_demand_data_cache_fills_from_far_memory_pti", + "BriefDescription": "L1 demand data cache fills from DRAM or MMIO in a different NUMA node per thousand instructions.", + "MetricExpr": "ls_dmnd_fills_from_sys.dram_io_far / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_itlb_misses_pti", + "BriefDescription": "L1 instruction TLB misses per thousand instructions.", + "MetricExpr": "(bp_l1_tlb_miss_l2_tlb_hit + bp_l1_tlb_miss_l2_tlb_miss.all) / instructions", + "MetricGroup": "tlb", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_itlb_misses_pti", + "BriefDescription": "L2 instruction TLB misses and 
instruction page walks per thousand instructions.", + "MetricExpr": "bp_l1_tlb_miss_l2_tlb_miss.all / instructions", + "MetricGroup": "tlb", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_dtlb_misses_pti", + "BriefDescription": "L1 data TLB misses per thousand instructions.", + "MetricExpr": "ls_l1_d_tlb_miss.all / instructions", + "MetricGroup": "tlb", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_dtlb_misses_pti", + "BriefDescription": "L2 data TLB misses and data page walks per thousand instructions.", + "MetricExpr": "ls_l1_d_tlb_miss.all_l2_miss / instructions", + "MetricGroup": "tlb", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "all_tlbs_flushed_pti", + "BriefDescription": "All TLBs flushed per thousand instructions.", + "MetricExpr": "ls_tlb_flush.all / instructions", + "MetricGroup": "tlb", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "macro_ops_dispatched", + "BriefDescription": "Macro-ops dispatched.", + "MetricExpr": "de_src_op_disp.all", + "MetricGroup": "decoder" + }, + { + "MetricName": "sse_avx_stalls", + "BriefDescription": "Mixed SSE/AVX stalls.", + "MetricExpr": "fp_disp_faults.sse_avx_all" + }, + { + "MetricName": "macro_ops_retired", + "BriefDescription": "Macro-ops retired.", + "MetricExpr": "ex_ret_ops" + }, + { + "MetricName": "umc_data_bus_utilization", + "BriefDescription": "Memory controller data bus utilization.", + "MetricExpr": "d_ratio(umc_data_slot_clks.all / 2, umc_mem_clk)", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "100%" + }, + { + "MetricName": "umc_cas_cmd_rate", + "BriefDescription": "Memory controller CAS command rate.", + "MetricExpr": "d_ratio(umc_cas_cmd.all * 1000, umc_mem_clk)", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "1per_memclk" + }, + { + "MetricName": "umc_cas_cmd_read_ratio", + "BriefDescription": "Ratio of memory controller CAS commands for reads.", + "MetricExpr": "d_ratio(umc_cas_cmd.rd, 
umc_cas_cmd.all)", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "100%" + }, + { + "MetricName": "umc_cas_cmd_write_ratio", + "BriefDescription": "Ratio of memory controller CAS commands for writes.", + "MetricExpr": "d_ratio(umc_cas_cmd.wr, umc_cas_cmd.all)", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "100%" + }, + { + "MetricName": "umc_mem_read_bandwidth", + "BriefDescription": "Estimated memory read bandwidth.", + "MetricExpr": "(umc_cas_cmd.rd * 64) / 1e6 / duration_time", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "1MB/s" + }, + { + "MetricName": "umc_mem_write_bandwidth", + "BriefDescription": "Estimated memory write bandwidth.", + "MetricExpr": "(umc_cas_cmd.wr * 64) / 1e6 / duration_time", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "1MB/s" + }, + { + "MetricName": "umc_mem_bandwidth", + "BriefDescription": "Estimated combined memory bandwidth.", + "MetricExpr": "(umc_cas_cmd.all * 64) / 1e6 / duration_time", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "1MB/s" + }, + { + "MetricName": "umc_activate_cmd_rate", + "BriefDescription": "Memory controller ACTIVATE command rate.", + "MetricExpr": "d_ratio(umc_act_cmd.all * 1000, umc_mem_clk)", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "1per_memclk" + }, + { + "MetricName": "umc_precharge_cmd_rate", + "BriefDescription": "Memory controller PRECHARGE command rate.", + "MetricExpr": "d_ratio(umc_pchg_cmd.all * 1000, umc_mem_clk)", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "1per_memclk" + } +] diff --git a/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json b/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json index f2d378c9d6..0aed533da8 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json @@ -732,9 +732,8 @@ { "BriefDescription": 
"Average Parallel L2 cache miss data reads", "MetricExpr": "tma_info_memory_latency_data_l2_mlp", - "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_data_l2_mlp", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_data_l2_mlp" }, { "BriefDescription": "", @@ -745,9 +744,8 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", @@ -764,9 +762,8 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l2_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw_2t" }, { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", @@ -807,9 +804,8 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l3_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw_2t" }, { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", @@ -838,16 +834,14 @@ { "BriefDescription": "Average Latency for L2 cache miss 
demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", - "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_l2_miss_latency", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_Lat;Offcore", + "MetricName": "tma_info_memory_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", - "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_l2_mlp", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_load_l2_mlp" }, { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", @@ -867,9 +861,8 @@ { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricExpr": "tma_info_memory_tlb_page_walks_utilization", - "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_page_walks_utilization", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_page_walks_utilization" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json index 7f88b156f7..297046818e 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json @@ -670,23 +670,20 @@ { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", "MetricExpr": "(100 * (1 - tma_core_bound / (((EXE_ACTIVITY.EXE_BOUND_0_PORTS + tma_core_bound * 
RS_EVENTS.EMPTY_CYCLES) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) if tma_core_bound < (((EXE_ACTIVITY.EXE_BOUND_0_PORTS + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", - "MetricGroup": "Cor;SMT;TopdownL1;tma_L1_group", - "MetricName": "tma_info_botlnk_core_bound_likely", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Cor;SMT", + "MetricName": "tma_info_botlnk_core_bound_likely" }, { "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.", "MetricExpr": "100 * (100 * (tma_fetch_latency * (DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) / ((ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=0x1\\,edge\\=0x1@) / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + 9 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) + min(2 * IDQ.MS_SWITCHES / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) + tma_fetch_bandwidth * tma_mite / (tma_mite + tma_dsb)))", - "MetricGroup": 
"DSBmiss;Fed;TopdownL1;tma_L1_group", - "MetricName": "tma_info_botlnk_dsb_misses", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "DSBmiss;Fed", + "MetricName": "tma_info_botlnk_dsb_misses" }, { "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck.", "MetricExpr": "100 * (100 * (tma_fetch_latency * ((ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=0x1\\,edge\\=0x1@) / CPU_CLK_UNHALTED.THREAD) / ((ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=0x1\\,edge\\=0x1@) / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + 9 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) + min(2 * IDQ.MS_SWITCHES / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD)))", - "MetricGroup": "Fed;FetchLat;IcMiss;TopdownL1;tma_L1_group", - "MetricName": "tma_info_botlnk_ic_misses", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Fed;FetchLat;IcMiss", + "MetricName": "tma_info_botlnk_ic_misses" }, { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", @@ -1045,9 +1042,8 @@ { "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "tma_info_memory_tlb_code_stlb_mpki", - "MetricGroup": "Fed;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_code_stlb_mpki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Fed;MemoryTLB", + "MetricName": "tma_info_memory_code_stlb_mpki" }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", @@ -1088,9 +1084,8 @@ { "BriefDescription": "Average Parallel L2 cache miss data reads", "MetricExpr": "tma_info_memory_latency_data_l2_mlp", - "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", - "MetricName": 
"tma_info_memory_data_l2_mlp", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_data_l2_mlp" }, { "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", @@ -1107,9 +1102,8 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", @@ -1132,23 +1126,20 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l2_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw_2t" }, { "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / INST_RETIRED.ANY", - "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l2_evictions_nonsilent_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "L2Evicts;Mem;Server", + "MetricName": "tma_info_memory_l2_evictions_nonsilent_pki" }, { "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)", "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / INST_RETIRED.ANY", - "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group", - "MetricName": 
"tma_info_memory_l2_evictions_silent_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "L2Evicts;Mem;Server", + "MetricName": "tma_info_memory_l2_evictions_silent_pki" }, { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", @@ -1189,9 +1180,8 @@ { "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l3_cache_access_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_l3_cache_access_bw_2t" }, { "BriefDescription": "", @@ -1202,9 +1192,8 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l3_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw_2t" }, { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", @@ -1233,16 +1222,14 @@ { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", - "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_l2_miss_latency", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_Lat;Offcore", + "MetricName": "tma_info_memory_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", - "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", - "MetricName": 
"tma_info_memory_load_l2_mlp", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_load_l2_mlp" }, { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", @@ -1253,9 +1240,8 @@ { "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "tma_info_memory_tlb_load_stlb_mpki", - "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_stlb_mpki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_load_stlb_mpki" }, { "BriefDescription": "Un-cacheable retired load per kilo instruction", @@ -1273,16 +1259,14 @@ { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricExpr": "tma_info_memory_tlb_page_walks_utilization", - "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_page_walks_utilization", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_page_walks_utilization" }, { "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "tma_info_memory_tlb_store_stlb_mpki", - "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_store_stlb_mpki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_store_stlb_mpki" }, { "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", @@ -1313,9 +1297,8 @@ { "BriefDescription": "Un-cacheable retired load per kilo instruction", "MetricExpr": "1e3 * MEM_LOAD_MISC_RETIRED.UC / INST_RETIRED.ANY", - 
"MetricGroup": "Mem;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_uc_load_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_uc_load_pki" }, { "BriefDescription": "", diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/frontend.json b/tools/perf/pmu-events/arch/x86/cascadelakex/frontend.json index 095904c770..d6f543471b 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/frontend.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/frontend.json @@ -19,7 +19,7 @@ "BriefDescription": "Decode Stream Buffer (DSB)-to-MITE switches", "EventCode": "0xAB", "EventName": "DSB2MITE_SWITCHES.COUNT", - "PublicDescription": "This event counts the number of the Decode Stream Buffer (DSB)-to-MITE switches including all misses because of missing Decode Stream Buffer (DSB) cache and u-arch forced misses.\nNote: Invoking MITE requires two or three cycles delay.", + "PublicDescription": "This event counts the number of the Decode Stream Buffer (DSB)-to-MITE switches including all misses because of missing Decode Stream Buffer (DSB) cache and u-arch forced misses. Note: Invoking MITE requires two or three cycles delay.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -267,11 +267,11 @@ "UMask": "0x4" }, { - "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering 4 Uops [This event is alias to IDQ.DSB_CYCLES_OK]", + "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering 4 or more Uops [This event is alias to IDQ.DSB_CYCLES_OK]", "CounterMask": "4", "EventCode": "0x79", "EventName": "IDQ.ALL_DSB_CYCLES_4_UOPS", - "PublicDescription": "Counts the number of cycles 4 uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Count includes uops that may 'bypass' the IDQ. 
[This event is alias to IDQ.DSB_CYCLES_OK]", + "PublicDescription": "Counts the number of cycles 4 or more uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Count includes uops that may 'bypass' the IDQ. [This event is alias to IDQ.DSB_CYCLES_OK]", "SampleAfterValue": "2000003", "UMask": "0x18" }, @@ -321,11 +321,11 @@ "UMask": "0x18" }, { - "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering 4 Uops [This event is alias to IDQ.ALL_DSB_CYCLES_4_UOPS]", + "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering 4 or more Uops [This event is alias to IDQ.ALL_DSB_CYCLES_4_UOPS]", "CounterMask": "4", "EventCode": "0x79", "EventName": "IDQ.DSB_CYCLES_OK", - "PublicDescription": "Counts the number of cycles 4 uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Count includes uops that may 'bypass' the IDQ. [This event is alias to IDQ.ALL_DSB_CYCLES_4_UOPS]", + "PublicDescription": "Counts the number of cycles 4 or more uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Count includes uops that may 'bypass' the IDQ. 
[This event is alias to IDQ.ALL_DSB_CYCLES_4_UOPS]", "SampleAfterValue": "2000003", "UMask": "0x18" }, diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/memory.json b/tools/perf/pmu-events/arch/x86/cascadelakex/memory.json index a00ad0aaf1..c69b2c3333 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/memory.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/memory.json @@ -6866,7 +6866,7 @@ "BriefDescription": "Number of times an RTM execution aborted due to any reasons (multiple categories may count as one).", "EventCode": "0xC9", "EventName": "RTM_RETIRED.ABORTED", - "PEBS": "1", + "PEBS": "2", "PublicDescription": "Number of times RTM abort was triggered.", "SampleAfterValue": "2000003", "UMask": "0x4" diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/other.json b/tools/perf/pmu-events/arch/x86/cascadelakex/other.json index 3ab5e91a4c..95d42ac367 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/other.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/other.json @@ -19,7 +19,7 @@ "BriefDescription": "Core cycles where the core was running in a manner where Turbo may be clipped to the AVX512 turbo schedule.", "EventCode": "0x28", "EventName": "CORE_POWER.LVL2_TURBO_LICENSE", - "PublicDescription": "Core cycles where the core was running with power-delivery for license level 2 (introduced in Skylake Server michroarchtecture). This includes high current AVX 512-bit instructions.", + "PublicDescription": "Core cycles where the core was running with power-delivery for license level 2 (introduced in Skylake Server microarchitecture). 
This includes high current AVX 512-bit instructions.", "SampleAfterValue": "200003", "UMask": "0x20" }, diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/pipeline.json b/tools/perf/pmu-events/arch/x86/cascadelakex/pipeline.json index 66d686cc93..c50ddf5b40 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/pipeline.json @@ -396,7 +396,7 @@ "Errata": "SKL091, SKL044", "EventCode": "0xC0", "EventName": "INST_RETIRED.NOP", - "PEBS": "1", + "PEBS": "2", "SampleAfterValue": "2000003", "UMask": "0x2" }, diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-interconnect.json index 1a342dff15..3fe9ce483b 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-interconnect.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-interconnect.json @@ -38,7 +38,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.CLFLUSH", "PerPkg": "1", - "PublicDescription": "Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Counts the number of coherency related operations serviced by the IRP", "UMask": "0x80", "Unit": "IRP" }, @@ -47,7 +47,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.CRD", "PerPkg": "1", - "PublicDescription": "Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Counts the number of coherency related operations serviced by the IRP", "UMask": "0x2", "Unit": "IRP" }, @@ -56,7 +56,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.DRD", "PerPkg": "1", - "PublicDescription": "Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Counts the number of coherency related operations serviced by the IRP", "UMask": "0x4", "Unit": "IRP" }, @@ -65,7 +65,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.PCIDCAHINT", "PerPkg": "1", - "PublicDescription": 
"Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Counts the number of coherency related operations serviced by the IRP", "UMask": "0x20", "Unit": "IRP" }, @@ -74,7 +74,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.PCIRDCUR", "PerPkg": "1", - "PublicDescription": "Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Counts the number of coherency related operations serviced by the IRP", "UMask": "0x1", "Unit": "IRP" }, @@ -101,7 +101,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.WBMTOI", "PerPkg": "1", - "PublicDescription": "Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Counts the number of coherency related operations serviced by the IRP", "UMask": "0x40", "Unit": "IRP" }, @@ -500,7 +500,7 @@ "EventCode": "0x11", "EventName": "UNC_I_TRANSACTIONS.WRITES", "PerPkg": "1", - "PublicDescription": "Counts the number of Inbound transactions from the IRP to the Uncore. This can be filtered based on request type in addition to the source queue. Note the special filtering equation. We do OR-reduction on the request type. If the SOURCE bit is set, then we also do AND qualification based on the source portID.; Trackes only write requests. Each write request should have a prefetch, so there is no need to explicitly track these requests. For writes that are tickled and have to retry, the counter will be incremented for each retry.", + "PublicDescription": "Counts the number of Inbound transactions from the IRP to the Uncore. This can be filtered based on request type in addition to the source queue. Note the special filtering equation. We do OR-reduction on the request type. If the SOURCE bit is set, then we also do AND qualification based on the source portID.; Tracks only write requests. Each write request should have a prefetch, so there is no need to explicitly track these requests. 
For writes that are tickled and have to retry, the counter will be incremented for each retry.", "UMask": "0x2", "Unit": "IRP" }, diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/virtual-memory.json b/tools/perf/pmu-events/arch/x86/cascadelakex/virtual-memory.json index f59405877a..73feadaf76 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/virtual-memory.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/virtual-memory.json @@ -205,7 +205,7 @@ "BriefDescription": "Counts 1 per cycle for each PMH that is busy with a page walk for an instruction fetch request. EPT page walk duration are excluded in Skylake.", "EventCode": "0x85", "EventName": "ITLB_MISSES.WALK_PENDING", - "PublicDescription": "Counts 1 per cycle for each PMH (Page Miss Handler) that is busy with a page walk for an instruction fetch request. EPT page walk duration are excluded in Skylake michroarchitecture.", + "PublicDescription": "Counts 1 per cycle for each PMH (Page Miss Handler) that is busy with a page walk for an instruction fetch request. EPT page walk duration are excluded in Skylake microarchitecture.", "SampleAfterValue": "100003", "UMask": "0x10" }, diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/frontend.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/frontend.json index 9e53da55d0..93d99318a6 100644 --- a/tools/perf/pmu-events/arch/x86/emeraldrapids/frontend.json +++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/frontend.json @@ -267,7 +267,7 @@ "CounterMask": "6", "EventCode": "0x79", "EventName": "IDQ.DSB_CYCLES_OK", - "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB).", + "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the DSB (Decode Stream Buffer) path. 
Count includes uops that may 'bypass' the IDQ.", "SampleAfterValue": "2000003", "UMask": "0x8" }, diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/memory.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/memory.json index e8bf7c9c44..5420f529f4 100644 --- a/tools/perf/pmu-events/arch/x86/emeraldrapids/memory.json +++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/memory.json @@ -264,6 +264,7 @@ "BriefDescription": "Number of times an RTM execution aborted.", "EventCode": "0xc9", "EventName": "RTM_RETIRED.ABORTED", + "PEBS": "1", "PublicDescription": "Counts the number of times RTM abort was triggered.", "SampleAfterValue": "100003", "UMask": "0x4" diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/pipeline.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/pipeline.json index 1f8200fb89..e2086bedec 100644 --- a/tools/perf/pmu-events/arch/x86/emeraldrapids/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/pipeline.json @@ -428,6 +428,7 @@ "BriefDescription": "INST_RETIRED.MACRO_FUSED", "EventCode": "0xc0", "EventName": "INST_RETIRED.MACRO_FUSED", + "PEBS": "1", "SampleAfterValue": "2000003", "UMask": "0x10" }, @@ -435,6 +436,7 @@ "BriefDescription": "Retired NOP instructions.", "EventCode": "0xc0", "EventName": "INST_RETIRED.NOP", + "PEBS": "1", "PublicDescription": "Counts all retired NOP or ENDBR32/64 instructions", "SampleAfterValue": "2000003", "UMask": "0x2" @@ -451,6 +453,7 @@ "BriefDescription": "Iterations of Repeat string retired instructions.", "EventCode": "0xc0", "EventName": "INST_RETIRED.REP_ITERATION", + "PEBS": "1", "PublicDescription": "Number of iterations of Repeat (REP) string retired instructions such as MOVS, CMPS, and SCAS. Each has a byte, word, and doubleword version and string instructions can be repeated using a repetition prefix, REP, that allows their architectural execution to be repeated a number of times as specified by the RCX register. 
Note the number of iterations is implementation-dependent.", "SampleAfterValue": "2000003", "UMask": "0x8" diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-cache.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-cache.json index 86a8f3b7fe..141dab4668 100644 --- a/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-cache.json +++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-cache.json @@ -4197,6 +4197,42 @@ "UMask": "0xcd43ff04", "Unit": "CHA" }, + { + "BriefDescription": "ItoMCacheNear (partial write) transactions from an IO device that addresses memory on the local socket", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_ITOMCACHENEAR_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcd42ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoMCacheNear (partial write) transactions from an IO device that addresses memory on a remote socket", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_ITOMCACHENEAR_REMOTE", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. 
Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcd437f04", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoM (write) transactions from an IO device that addresses memory on the local socket", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_ITOM_LOCAL", + "PerPkg": "1", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcc42ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoM (write) transactions from an IO device that addresses memory on a remote socket", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_ITOM_REMOTE", + "PerPkg": "1", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcc437f04", + "Unit": "CHA" + }, { "BriefDescription": "TOR Inserts; Misses from local IO", "EventCode": "0x35", @@ -4207,7 +4243,7 @@ "Unit": "CHA" }, { - "BriefDescription": "TOR Inserts; ItoM misses from local IO", + "BriefDescription": "TOR Inserts : ItoM, indicating a full cacheline write request, from IO Devices that missed the LLC", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOM", "PerPkg": "1", @@ -4225,7 +4261,7 @@ "Unit": "CHA" }, { - "BriefDescription": "TOR Inserts; RdCur and FsRdCur misses from local IO", + "BriefDescription": "TOR Inserts; RdCur and FsRdCur requests from local IO that miss LLC", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_PCIRDCUR", "PerPkg": "1", @@ -4251,6 +4287,24 @@ "UMask": "0xc8f3ff04", "Unit": "CHA" }, + { + "BriefDescription": "PCIRDCUR (read) transactions from an IO device that addresses memory on the local socket", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR_LOCAL", + 
"PerPkg": "1", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xc8f2ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "PCIRDCUR (read) transactions from an IO device that addresses memory on a remote socket", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR_REMOTE", + "PerPkg": "1", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xc8f37f04", + "Unit": "CHA" + }, { "BriefDescription": "TOR Inserts; RFO from local IO", "EventCode": "0x35", @@ -5713,6 +5767,42 @@ "UMask": "0xcd43fe04", "Unit": "CHA" }, + { + "BriefDescription": "TOR Occupancy : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC and targets local memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOMCACHENEAR_LOCAL", + "PerPkg": "1", + "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. 
Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcd437e04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy; ITOM misses from local IO and targets local memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOM_LOCAL", + "PerPkg": "1", + "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcc42fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy; ITOM misses from local IO and targets remote memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOM_REMOTE", + "PerPkg": "1", + "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcc437e04", + "Unit": "CHA" + }, { "BriefDescription": "TOR Occupancy; RdCur and FsRdCur misses from local IO", "EventCode": "0x36", @@ -5722,6 +5812,24 @@ "UMask": "0xc8f3fe04", "Unit": "CHA" }, + { + "BriefDescription": "TOR Occupancy; RdCur and FsRdCur misses from local IO and targets local memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_PCIRDCUR_LOCAL", + "PerPkg": "1", + "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. 
Does not include addressless requests such as locks and interrupts.", + "UMask": "0xc8f2fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy; RdCur and FsRdCur misses from local IO and targets remote memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_PCIRDCUR_REMOTE", + "PerPkg": "1", + "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xc8f37e04", + "Unit": "CHA" + }, { "BriefDescription": "TOR Occupancy; RFO misses from local IO", "EventCode": "0x36", diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-interconnect.json index 65d088556b..22bb490e96 100644 --- a/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-interconnect.json +++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-interconnect.json @@ -4888,7 +4888,7 @@ "Unit": "MDF" }, { - "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO\r\nIngress (V-EMIB) (AD)", + "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO Ingress (V-EMIB) (AD)", "EventCode": "0x4B", "EventName": "UNC_MDF_CRS_TxR_V_BOUNCES.AD", "PerPkg": "1", @@ -4897,7 +4897,7 @@ "Unit": "MDF" }, { - "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO\r\nIngress (V-EMIB) (AK)", + "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO Ingress (V-EMIB) (AK)", "EventCode": "0x4B", "EventName": "UNC_MDF_CRS_TxR_V_BOUNCES.AK", "PerPkg": "1", @@ -4906,7 +4906,7 @@ "Unit": "MDF" }, { - "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO\r\nIngress (V-EMIB) (AKC)", + 
"BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO Ingress (V-EMIB) (AKC)", "EventCode": "0x4B", "EventName": "UNC_MDF_CRS_TxR_V_BOUNCES.AKC", "PerPkg": "1", @@ -4915,7 +4915,7 @@ "Unit": "MDF" }, { - "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO\r\nIngress (V-EMIB) (BL)", + "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO Ingress (V-EMIB) (BL)", "EventCode": "0x4B", "EventName": "UNC_MDF_CRS_TxR_V_BOUNCES.BL", "PerPkg": "1", @@ -4924,7 +4924,7 @@ "Unit": "MDF" }, { - "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO\r\nIngress (V-EMIB) (IV)", + "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO Ingress (V-EMIB) (IV)", "EventCode": "0x4B", "EventName": "UNC_MDF_CRS_TxR_V_BOUNCES.IV", "PerPkg": "1", @@ -5291,7 +5291,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCB", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Bypass : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Bypass : Matches on Receive path of a UPI port. 
Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xe", "Unit": "UPI" }, @@ -5300,7 +5300,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCB_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Bypass, Match Opcode : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Bypass, Match Opcode : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10e", "Unit": "UPI" }, @@ -5309,7 +5309,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCS", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Standard : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Standard : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xf", "Unit": "UPI" }, @@ -5318,7 +5318,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCS_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Standard, Match Opcode : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Standard, Match Opcode : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10f", "Unit": "UPI" }, @@ -5763,7 +5763,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCB", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Bypass : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Bypass : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xe", "Unit": "UPI" }, @@ -5772,7 +5772,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCB_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Bypass, Match Opcode : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Bypass, Match Opcode : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10e", "Unit": "UPI" }, @@ -5781,7 +5781,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCS", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Standard : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Standard : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xf", "Unit": "UPI" }, @@ -5790,7 +5790,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCS_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Standard, Match Opcode : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Standard, Match Opcode : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10f", "Unit": "UPI" }, diff --git a/tools/perf/pmu-events/arch/x86/grandridge/pipeline.json b/tools/perf/pmu-events/arch/x86/grandridge/pipeline.json index daa0639bb1..90292dc03d 100644 --- a/tools/perf/pmu-events/arch/x86/grandridge/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/grandridge/pipeline.json @@ -249,10 +249,17 @@ "UMask": "0x1" }, { - "BriefDescription": "Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear.", + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. [This event is alias to TOPDOWN_BAD_SPECULATION.ALL_P]", "EventCode": "0x73", "EventName": "TOPDOWN_BAD_SPECULATION.ALL", - "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window, including relevant microcode flows, and while uops are not yet available in the instruction queue (IQ) or until an FE_BOUND event occurs besides OTHER and CISC. Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear.", + "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. 
Counts all issue slots blocked during this recovery window, including relevant microcode flows, and while uops are not yet available in the instruction queue (IQ) or until an FE_BOUND event occurs besides OTHER and CISC. Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear. [This event is alias to TOPDOWN_BAD_SPECULATION.ALL_P]", + "SampleAfterValue": "1000003" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. [This event is alias to TOPDOWN_BAD_SPECULATION.ALL]", + "EventCode": "0x73", + "EventName": "TOPDOWN_BAD_SPECULATION.ALL_P", + "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window, including relevant microcode flows, and while uops are not yet available in the instruction queue (IQ) or until an FE_BOUND event occurs besides OTHER and CISC. Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear. 
[This event is alias to TOPDOWN_BAD_SPECULATION.ALL]", "SampleAfterValue": "1000003" }, { @@ -284,7 +291,7 @@ "UMask": "0x1" }, { - "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls", + "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls [This event is alias to TOPDOWN_BE_BOUND.ALL_P]", "EventCode": "0x74", "EventName": "TOPDOWN_BE_BOUND.ALL", "SampleAfterValue": "1000003" @@ -296,6 +303,12 @@ "SampleAfterValue": "1000003", "UMask": "0x1" }, + { + "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls [This event is alias to TOPDOWN_BE_BOUND.ALL]", + "EventCode": "0x74", + "EventName": "TOPDOWN_BE_BOUND.ALL_P", + "SampleAfterValue": "1000003" + }, { "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to memory reservation stall (scheduler not being able to accept another uop). This could be caused by RSV full or load/store buffer block.", "EventCode": "0x74", @@ -317,6 +330,13 @@ "SampleAfterValue": "1000003", "UMask": "0x20" }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to ROB full", + "EventCode": "0x74", + "EventName": "TOPDOWN_BE_BOUND.REORDER_BUFFER", + "SampleAfterValue": "1000003", + "UMask": "0x40" + }, { "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to iq/jeu scoreboards or ms scb", "EventCode": "0x74", @@ -325,11 +345,17 @@ "UMask": "0x10" }, { - "BriefDescription": "Counts the number of retirement slots not consumed due to front end stalls", + "BriefDescription": "Counts the number of retirement slots not consumed due to front end stalls [This event is alias to TOPDOWN_FE_BOUND.ALL_P]", "EventCode": "0x71", "EventName": "TOPDOWN_FE_BOUND.ALL", "SampleAfterValue": "1000003" }, + { + "BriefDescription": "Counts the number of retirement slots not 
consumed due to front end stalls [This event is alias to TOPDOWN_FE_BOUND.ALL]", + "EventCode": "0x71", + "EventName": "TOPDOWN_FE_BOUND.ALL_P", + "SampleAfterValue": "1000003" + }, { "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to BAClear", "EventCode": "0x71", @@ -402,12 +428,19 @@ "UMask": "0x4" }, { - "BriefDescription": "Counts the number of consumed retirement slots. Similar to UOPS_RETIRED.ALL", + "BriefDescription": "Counts the number of consumed retirement slots. Similar to UOPS_RETIRED.ALL [This event is alias to TOPDOWN_RETIRING.ALL_P]", "EventCode": "0x72", "EventName": "TOPDOWN_RETIRING.ALL", "PEBS": "1", "SampleAfterValue": "1000003" }, + { + "BriefDescription": "Counts the number of consumed retirement slots. Similar to UOPS_RETIRED.ALL [This event is alias to TOPDOWN_RETIRING.ALL]", + "EventCode": "0x72", + "EventName": "TOPDOWN_RETIRING.ALL_P", + "PEBS": "1", + "SampleAfterValue": "1000003" + }, { "BriefDescription": "Counts the number of uops issued by the front end every cycle.", "EventCode": "0x0e", diff --git a/tools/perf/pmu-events/arch/x86/grandridge/uncore-cache.json b/tools/perf/pmu-events/arch/x86/grandridge/uncore-cache.json index 74dfd9272c..36614429dd 100644 --- a/tools/perf/pmu-events/arch/x86/grandridge/uncore-cache.json +++ b/tools/perf/pmu-events/arch/x86/grandridge/uncore-cache.json @@ -5,7 +5,6 @@ "EventName": "UNC_CHACMS_CLOCKTICKS", "PerPkg": "1", "PortMask": "0x000", - "PublicDescription": "UNC_CHACMS_CLOCKTICKS", "Unit": "CHACMS" }, { @@ -1216,6 +1215,15 @@ "UMask": "0xc88fff01", "Unit": "CHA" }, + { + "BriefDescription": "TOR Occupancy for Data read opt from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_DRD_OPT", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : DRd_Opts issued by iA Cores", + "UMask": "0xc827ff01", + "Unit": "CHA" + }, { "BriefDescription": "TOR Occupancy for Data read opt prefetch from 
local IA that miss the cache", "EventCode": "0x36", @@ -1252,6 +1260,15 @@ "UMask": "0xc88ffd01", "Unit": "CHA" }, + { + "BriefDescription": "TOR Occupancy for Data read opt from local IA that hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_DRD_OPT", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : DRd_Opts issued by iA Cores that hit the LLC", + "UMask": "0xc827fd01", + "Unit": "CHA" + }, { "BriefDescription": "TOR Occupancy for Data read opt prefetch from local IA that hit the cache", "EventCode": "0x36", @@ -1405,6 +1422,15 @@ "UMask": "0xc88efe01", "Unit": "CHA" }, + { + "BriefDescription": "TOR Occupancy for Data read opt from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_OPT", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : DRd_Opt issued by iA Cores that missed the LLC", + "UMask": "0xc827fe01", + "Unit": "CHA" + }, { "BriefDescription": "TOR Occupancy for Data read opt prefetch from local IA that miss the cache", "EventCode": "0x36", diff --git a/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json b/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json index 21e2cb5e31..83d50d80a1 100644 --- a/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json @@ -618,9 +618,8 @@ { "BriefDescription": "Average Parallel L2 cache miss data reads", "MetricExpr": "tma_info_memory_latency_data_l2_mlp", - "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_data_l2_mlp", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_data_l2_mlp" }, { "BriefDescription": "", @@ -631,9 +630,8 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", 
- "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", @@ -650,9 +648,8 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l2_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw_2t" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", @@ -669,9 +666,8 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l3_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw_2t" }, { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", @@ -700,16 +696,14 @@ { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", - "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_l2_miss_latency", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_Lat;Offcore", + "MetricName": "tma_info_memory_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", - "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", - 
"MetricName": "tma_info_memory_load_l2_mlp", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_load_l2_mlp" }, { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", @@ -729,9 +723,8 @@ { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricExpr": "tma_info_memory_tlb_page_walks_utilization", - "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_page_walks_utilization", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_page_walks_utilization" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", diff --git a/tools/perf/pmu-events/arch/x86/icelakex/frontend.json b/tools/perf/pmu-events/arch/x86/icelakex/frontend.json index f6edc4222f..66669d062e 100644 --- a/tools/perf/pmu-events/arch/x86/icelakex/frontend.json +++ b/tools/perf/pmu-events/arch/x86/icelakex/frontend.json @@ -282,7 +282,7 @@ "CounterMask": "5", "EventCode": "0x79", "EventName": "IDQ.DSB_CYCLES_OK", - "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB).", + "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the DSB (Decode Stream Buffer) path. 
Count includes uops that may 'bypass' the IDQ.", "SampleAfterValue": "2000003", "UMask": "0x8" }, diff --git a/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json b/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json index c015b8277d..769ba12bef 100644 --- a/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json @@ -667,23 +667,20 @@ { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", "MetricExpr": "(100 * (1 - max(0, topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots - (CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES) * (topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots)) / (((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + max(0, topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots - (CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES) * (topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots)) * RS_EVENTS.EMPTY_CYCLES) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + 
(EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) if max(0, topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots - (CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES) * (topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots)) < (((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + max(0, topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots - (CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES) * (topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots)) * RS_EVENTS.EMPTY_CYCLES) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound 
+ topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", - "MetricGroup": "Cor;SMT;TopdownL1;tma_L1_group", - "MetricName": "tma_info_botlnk_core_bound_likely", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Cor;SMT", + "MetricName": "tma_info_botlnk_core_bound_likely" }, { "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.", "MetricExpr": "100 * (100 * ((5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING) / slots * (DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) / (ICACHE_DATA.STALLS / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + 10 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) + min(3 * IDQ.MS_SWITCHES / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) + max(0, topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / slots - (5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING) / slots) * ((IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD) / 2) / ((IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD) / 2 + (IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD) / 2)))", - "MetricGroup": 
"DSBmiss;Fed;TopdownL1;tma_L1_group", - "MetricName": "tma_info_botlnk_dsb_misses", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "DSBmiss;Fed", + "MetricName": "tma_info_botlnk_dsb_misses" }, { "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck.", "MetricExpr": "100 * (100 * ((5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING) / slots * (ICACHE_DATA.STALLS / CPU_CLK_UNHALTED.THREAD) / (ICACHE_DATA.STALLS / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + 10 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) + min(3 * IDQ.MS_SWITCHES / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD)))", - "MetricGroup": "Fed;FetchLat;IcMiss;TopdownL1;tma_L1_group", - "MetricName": "tma_info_botlnk_ic_misses", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Fed;FetchLat;IcMiss", + "MetricName": "tma_info_botlnk_ic_misses" }, { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", @@ -1045,16 +1042,14 @@ { "BriefDescription": "\"Bus lock\" per kilo instruction", "MetricExpr": "tma_info_memory_mix_bus_lock_pki", - "MetricGroup": "Mem;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_bus_lock_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_bus_lock_pki" }, { "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "tma_info_memory_tlb_code_stlb_mpki", - "MetricGroup": "Fed;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_code_stlb_mpki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Fed;MemoryTLB", + "MetricName": "tma_info_memory_code_stlb_mpki" }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 
data cache [GB / sec]", @@ -1095,9 +1090,8 @@ { "BriefDescription": "Average Parallel L2 cache miss data reads", "MetricExpr": "tma_info_memory_latency_data_l2_mlp", - "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_data_l2_mlp", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_data_l2_mlp" }, { "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", @@ -1114,9 +1108,8 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", @@ -1139,23 +1132,20 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l2_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw_2t" }, { "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / INST_RETIRED.ANY", - "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l2_evictions_nonsilent_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "L2Evicts;Mem;Server", + "MetricName": "tma_info_memory_l2_evictions_nonsilent_pki" }, { "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo 
instruction where the evicted lines are dropped (no writeback to L3 or memory)", "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / INST_RETIRED.ANY", - "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l2_evictions_silent_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "L2Evicts;Mem;Server", + "MetricName": "tma_info_memory_l2_evictions_silent_pki" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", @@ -1190,9 +1180,8 @@ { "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l3_cache_access_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_l3_cache_access_bw_2t" }, { "BriefDescription": "", @@ -1203,9 +1192,8 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l3_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw_2t" }, { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", @@ -1240,23 +1228,20 @@ { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", - "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_l2_miss_latency", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_Lat;Offcore", + "MetricName": "tma_info_memory_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 
cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=0x1@", - "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_l2_mlp", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_load_l2_mlp" }, { "BriefDescription": "Average Latency for L3 cache miss demand Loads", "MetricExpr": "cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,umask\\=0x10@ / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", - "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_l3_miss_latency", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_Lat;Offcore", + "MetricName": "tma_info_memory_load_l3_miss_latency" }, { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", @@ -1267,9 +1252,8 @@ { "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "tma_info_memory_tlb_load_stlb_mpki", - "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_stlb_mpki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_load_stlb_mpki" }, { "BriefDescription": "\"Bus lock\" per kilo instruction", @@ -1293,16 +1277,14 @@ { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (2 * (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD))", - "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_page_walks_utilization", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryTLB", + "MetricName": 
"tma_info_memory_page_walks_utilization" }, { "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "tma_info_memory_tlb_store_stlb_mpki", - "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_store_stlb_mpki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_store_stlb_mpki" }, { "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", @@ -1332,9 +1314,8 @@ { "BriefDescription": "Un-cacheable retired load per kilo instruction", "MetricExpr": "1e3 * MEM_LOAD_MISC_RETIRED.UC / INST_RETIRED.ANY", - "MetricGroup": "Mem;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_uc_load_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_uc_load_pki" }, { "BriefDescription": "", diff --git a/tools/perf/pmu-events/arch/x86/icelakex/memory.json b/tools/perf/pmu-events/arch/x86/icelakex/memory.json index f36ac04f8d..875b584b84 100644 --- a/tools/perf/pmu-events/arch/x86/icelakex/memory.json +++ b/tools/perf/pmu-events/arch/x86/icelakex/memory.json @@ -319,6 +319,7 @@ "BriefDescription": "Number of times an RTM execution aborted.", "EventCode": "0xc9", "EventName": "RTM_RETIRED.ABORTED", + "PEBS": "1", "PublicDescription": "Counts the number of times RTM abort was triggered.", "SampleAfterValue": "100003", "UMask": "0x4" diff --git a/tools/perf/pmu-events/arch/x86/icelakex/uncore-cache.json b/tools/perf/pmu-events/arch/x86/icelakex/uncore-cache.json index b6ce14ebf8..a950ba3ddc 100644 --- a/tools/perf/pmu-events/arch/x86/icelakex/uncore-cache.json +++ b/tools/perf/pmu-events/arch/x86/icelakex/uncore-cache.json @@ -1580,7 +1580,7 @@ "Unit": "CHA" }, { - "BriefDescription": "This event is deprecated. 
Refer to new event UNC_CHA_LLC_LOOKUP.CODE_READ", + "BriefDescription": "This event is deprecated.", "Deprecated": "1", "EventCode": "0x34", "EventName": "UNC_CHA_LLC_LOOKUP.CODE", @@ -1677,7 +1677,7 @@ "Unit": "CHA" }, { - "BriefDescription": "This event is deprecated. Refer to new event UNC_CHA_LLC_LOOKUP.DATA_READ", + "BriefDescription": "This event is deprecated.", "Deprecated": "1", "EventCode": "0x34", "EventName": "UNC_CHA_LLC_LOOKUP.DATA_READ_ALL", @@ -6782,6 +6782,24 @@ "UMask": "0xc8f3ff04", "Unit": "CHA" }, + { + "BriefDescription": "PCIRDCUR (read) transactions from an IO device that addresses memory on the local socket", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : PCIRdCurs issued by IO Devices and targets local memory : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xc8f2ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "PCIRDCUR (read) transactions from an IO device that addresses memory on a remote socket", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR_REMOTE", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : PCIRdCurs issued by IO Devices and targets remote memory : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. 
Does not include addressless requests such as locks and interrupts.", + "UMask": "0xc8f37f04", + "Unit": "CHA" + }, { "BriefDescription": "TOR Inserts : RFOs issued by IO Devices", "EventCode": "0x35", diff --git a/tools/perf/pmu-events/arch/x86/icelakex/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/icelakex/uncore-interconnect.json index a066a009c5..6997e6f7d3 100644 --- a/tools/perf/pmu-events/arch/x86/icelakex/uncore-interconnect.json +++ b/tools/perf/pmu-events/arch/x86/icelakex/uncore-interconnect.json @@ -13523,7 +13523,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCB", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Bypass : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Bypass : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xe", "Unit": "UPI" }, @@ -13532,7 +13532,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCB_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Bypass, Match Opcode : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Bypass, Match Opcode : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10e", "Unit": "UPI" }, @@ -13541,7 +13541,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCS", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Standard : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Standard : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xf", "Unit": "UPI" }, @@ -13550,7 +13550,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCS_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Standard, Match Opcode : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Standard, Match Opcode : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10f", "Unit": "UPI" }, @@ -13559,7 +13559,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.REQ", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Request : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Request : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x8", "Unit": "UPI" }, @@ -13568,7 +13568,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.REQ_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Request, Match Opcode : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Request, Match Opcode : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x108", "Unit": "UPI" }, @@ -13577,7 +13577,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.RSPCNFLT", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Response - Conflict : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Response - Conflict : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x1aa", "Unit": "UPI" }, @@ -13586,7 +13586,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.RSPI", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Response - Invalid : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Response - Invalid : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x12a", "Unit": "UPI" }, @@ -13595,7 +13595,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.RSP_DATA", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Response - Data : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Response - Data : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xc", "Unit": "UPI" }, @@ -13604,7 +13604,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.RSP_DATA_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Response - Data, Match Opcode : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Response - Data, Match Opcode : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10c", "Unit": "UPI" }, @@ -13613,7 +13613,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.RSP_NODATA", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Response - No Data : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Response - No Data : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xa", "Unit": "UPI" }, @@ -13622,7 +13622,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.RSP_NODATA_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Response - No Data, Match Opcode : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Response - No Data, Match Opcode : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10a", "Unit": "UPI" }, @@ -13631,7 +13631,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.SNP", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Snoop : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Snoop : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x9", "Unit": "UPI" }, @@ -13640,7 +13640,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.SNP_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Snoop, Match Opcode : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Snoop, Match Opcode : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x109", "Unit": "UPI" }, @@ -13649,7 +13649,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.WB", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Writeback : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Writeback : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xd", "Unit": "UPI" }, @@ -13658,7 +13658,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.WB_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Writeback, Match Opcode : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Writeback, Match Opcode : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10d", "Unit": "UPI" }, @@ -14038,7 +14038,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCB", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Bypass : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Bypass : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xe", "Unit": "UPI" }, @@ -14047,7 +14047,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCB_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Bypass, Match Opcode : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Bypass, Match Opcode : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10e", "Unit": "UPI" }, @@ -14056,7 +14056,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCS", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Standard : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Standard : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xf", "Unit": "UPI" }, @@ -14065,7 +14065,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCS_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Standard, Match Opcode : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Standard, Match Opcode : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10f", "Unit": "UPI" }, @@ -14074,7 +14074,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.REQ", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Request : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Request : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x8", "Unit": "UPI" }, @@ -14083,7 +14083,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.REQ_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Request, Match Opcode : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Request, Match Opcode : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x108", "Unit": "UPI" }, @@ -14092,7 +14092,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.RSPCNFLT", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Response - Conflict : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Response - Conflict : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x1aa", "Unit": "UPI" }, @@ -14101,7 +14101,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.RSPI", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Response - Invalid : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Response - Invalid : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x12a", "Unit": "UPI" }, @@ -14110,7 +14110,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.RSP_DATA", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Response - Data : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Response - Data : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xc", "Unit": "UPI" }, @@ -14119,7 +14119,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.RSP_DATA_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Response - Data, Match Opcode : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Response - Data, Match Opcode : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10c", "Unit": "UPI" }, @@ -14128,7 +14128,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.RSP_NODATA", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Response - No Data : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Response - No Data : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xa", "Unit": "UPI" }, @@ -14137,7 +14137,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.RSP_NODATA_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Response - No Data, Match Opcode : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Response - No Data, Match Opcode : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10a", "Unit": "UPI" }, @@ -14146,7 +14146,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.SNP", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Snoop : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Snoop : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x9", "Unit": "UPI" }, @@ -14155,7 +14155,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.SNP_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Snoop, Match Opcode : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Snoop, Match Opcode : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x109", "Unit": "UPI" }, @@ -14164,7 +14164,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.WB", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Writeback : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Writeback : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xd", "Unit": "UPI" }, @@ -14173,7 +14173,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.WB_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Writeback, Match Opcode : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Writeback, Match Opcode : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10d", "Unit": "UPI" }, diff --git a/tools/perf/pmu-events/arch/x86/icelakex/uncore-io.json b/tools/perf/pmu-events/arch/x86/icelakex/uncore-io.json index 9cef8862c4..1b8a719b81 100644 --- a/tools/perf/pmu-events/arch/x86/icelakex/uncore-io.json +++ b/tools/perf/pmu-events/arch/x86/icelakex/uncore-io.json @@ -2476,17 +2476,6 @@ "UMask": "0x10", "Unit": "IIO" }, - { - "BriefDescription": "Number requests sent to PCIe from main die : From IRP", - "EventCode": "0xC2", - "EventName": "UNC_IIO_NUM_REQ_FROM_CPU.IRP", - "FCMask": "0x07", - "PerPkg": "1", - "PortMask": "0xFF", - "PublicDescription": "Number requests sent to PCIe from main die : From IRP : Captures Posted/Non-posted allocations from IRP. i.e. either non-confined P2P traffic or from the CPU", - "UMask": "0x1", - "Unit": "IIO" - }, { "BriefDescription": "Number requests sent to PCIe from main die : From ITC", "EventCode": "0xC2", diff --git a/tools/perf/pmu-events/arch/x86/lunarlake/cache.json b/tools/perf/pmu-events/arch/x86/lunarlake/cache.json index 1823149067..fb48be357c 100644 --- a/tools/perf/pmu-events/arch/x86/lunarlake/cache.json +++ b/tools/perf/pmu-events/arch/x86/lunarlake/cache.json @@ -31,7 +31,7 @@ "EventCode": "0x2e", "EventName": "LONGEST_LAT_CACHE.REFERENCE", "PublicDescription": "Counts the number of cacheable memory requests that access the Last Level Cache (LLC). Requests include demand loads, reads for ownership (RFO), instruction fetches and L1 HW prefetches. If the platform has an L3 cache, the LLC is the L3 cache, otherwise it is the L2 cache. 
Counts on a per core basis.", - "SampleAfterValue": "1000003", + "SampleAfterValue": "200003", "UMask": "0x4f", "Unit": "cpu_atom" }, @@ -94,7 +94,7 @@ "MSRIndex": "0x3F6", "MSRValue": "0x400", "PEBS": "2", - "SampleAfterValue": "1000003", + "SampleAfterValue": "200003", "UMask": "0x5", "Unit": "cpu_atom" }, @@ -106,7 +106,7 @@ "MSRIndex": "0x3F6", "MSRValue": "0x80", "PEBS": "2", - "SampleAfterValue": "1000003", + "SampleAfterValue": "200003", "UMask": "0x5", "Unit": "cpu_atom" }, @@ -118,7 +118,7 @@ "MSRIndex": "0x3F6", "MSRValue": "0x10", "PEBS": "2", - "SampleAfterValue": "1000003", + "SampleAfterValue": "200003", "UMask": "0x5", "Unit": "cpu_atom" }, @@ -130,7 +130,7 @@ "MSRIndex": "0x3F6", "MSRValue": "0x800", "PEBS": "2", - "SampleAfterValue": "1000003", + "SampleAfterValue": "200003", "UMask": "0x5", "Unit": "cpu_atom" }, @@ -142,7 +142,7 @@ "MSRIndex": "0x3F6", "MSRValue": "0x100", "PEBS": "2", - "SampleAfterValue": "1000003", + "SampleAfterValue": "200003", "UMask": "0x5", "Unit": "cpu_atom" }, @@ -154,7 +154,7 @@ "MSRIndex": "0x3F6", "MSRValue": "0x20", "PEBS": "2", - "SampleAfterValue": "1000003", + "SampleAfterValue": "200003", "UMask": "0x5", "Unit": "cpu_atom" }, @@ -166,7 +166,7 @@ "MSRIndex": "0x3F6", "MSRValue": "0x4", "PEBS": "2", - "SampleAfterValue": "1000003", + "SampleAfterValue": "200003", "UMask": "0x5", "Unit": "cpu_atom" }, @@ -178,7 +178,7 @@ "MSRIndex": "0x3F6", "MSRValue": "0x200", "PEBS": "2", - "SampleAfterValue": "1000003", + "SampleAfterValue": "200003", "UMask": "0x5", "Unit": "cpu_atom" }, @@ -190,7 +190,7 @@ "MSRIndex": "0x3F6", "MSRValue": "0x40", "PEBS": "2", - "SampleAfterValue": "1000003", + "SampleAfterValue": "200003", "UMask": "0x5", "Unit": "cpu_atom" }, @@ -202,7 +202,7 @@ "MSRIndex": "0x3F6", "MSRValue": "0x8", "PEBS": "2", - "SampleAfterValue": "1000003", + "SampleAfterValue": "200003", "UMask": "0x5", "Unit": "cpu_atom" }, @@ -212,7 +212,7 @@ "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.STORE_LATENCY", "PEBS": 
"2", - "SampleAfterValue": "1000003", + "SampleAfterValue": "200003", "UMask": "0x6", "Unit": "cpu_atom" } diff --git a/tools/perf/pmu-events/arch/x86/lunarlake/frontend.json b/tools/perf/pmu-events/arch/x86/lunarlake/frontend.json index 5e4ef81b43..3a24934e8d 100644 --- a/tools/perf/pmu-events/arch/x86/lunarlake/frontend.json +++ b/tools/perf/pmu-events/arch/x86/lunarlake/frontend.json @@ -19,7 +19,7 @@ "BriefDescription": "This event counts a subset of the Topdown Slots event that were no operation was delivered to the back-end pipeline due to instruction fetch limitations when the back-end could have accepted more operations. Common examples include instruction cache misses or x86 instruction decode limitations.", "EventCode": "0x9c", "EventName": "IDQ_BUBBLES.CORE", - "PublicDescription": "This event counts a subset of the Topdown Slots event that were no operation was delivered to the back-end pipeline due to instruction fetch limitations when the back-end could have accepted more operations. Common examples include instruction cache misses or x86 instruction decode limitations.\nSoftware can use this event as the numerator for the Frontend Bound metric (or top-level category) of the Top-down Microarchitecture Analysis method.", + "PublicDescription": "This event counts a subset of the Topdown Slots event that were no operation was delivered to the back-end pipeline due to instruction fetch limitations when the back-end could have accepted more operations. Common examples include instruction cache misses or x86 instruction decode limitations. 
Software can use this event as the numerator for the Frontend Bound metric (or top-level category) of the Top-down Microarchitecture Analysis method.", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/lunarlake/memory.json b/tools/perf/pmu-events/arch/x86/lunarlake/memory.json index 51d70ba00b..9c188d80b7 100644 --- a/tools/perf/pmu-events/arch/x86/lunarlake/memory.json +++ b/tools/perf/pmu-events/arch/x86/lunarlake/memory.json @@ -155,7 +155,7 @@ "EventCode": "0x2A,0x2B", "EventName": "OCR.DEMAND_DATA_RD.L3_MISS", "MSRIndex": "0x1a6,0x1a7", - "MSRValue": "0x3FBFC00001", + "MSRValue": "0xFE7F8000001", "SampleAfterValue": "100003", "UMask": "0x1", "Unit": "cpu_core" @@ -175,7 +175,7 @@ "EventCode": "0x2A,0x2B", "EventName": "OCR.DEMAND_RFO.L3_MISS", "MSRIndex": "0x1a6,0x1a7", - "MSRValue": "0x3FBFC00002", + "MSRValue": "0xFE7F8000002", "SampleAfterValue": "100003", "UMask": "0x1", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/lunarlake/other.json b/tools/perf/pmu-events/arch/x86/lunarlake/other.json index 69adaed568..377f717db6 100644 --- a/tools/perf/pmu-events/arch/x86/lunarlake/other.json +++ b/tools/perf/pmu-events/arch/x86/lunarlake/other.json @@ -24,7 +24,7 @@ "EventCode": "0xB7", "EventName": "OCR.DEMAND_DATA_RD.DRAM", "MSRIndex": "0x1a6,0x1a7", - "MSRValue": "0x184000001", + "MSRValue": "0x1FBC000001", "SampleAfterValue": "100003", "UMask": "0x1", "Unit": "cpu_atom" @@ -34,7 +34,7 @@ "EventCode": "0x2A,0x2B", "EventName": "OCR.DEMAND_DATA_RD.DRAM", "MSRIndex": "0x1a6,0x1a7", - "MSRValue": "0x184000001", + "MSRValue": "0x1E780000001", "SampleAfterValue": "100003", "UMask": "0x1", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/lunarlake/pipeline.json b/tools/perf/pmu-events/arch/x86/lunarlake/pipeline.json index 2bde664fdc..2c9f85ec8c 100644 --- a/tools/perf/pmu-events/arch/x86/lunarlake/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/lunarlake/pipeline.json @@ 
-38,10 +38,18 @@ { "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles", "EventName": "CPU_CLK_UNHALTED.CORE", - "SampleAfterValue": "1000003", + "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_atom" }, + { + "BriefDescription": "Core cycles when the core is not in a halt state.", + "EventName": "CPU_CLK_UNHALTED.CORE", + "PublicDescription": "Counts the number of core cycles while the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. This event is a component in many key event ratios. The core frequency may change from time to time due to transitions associated with Enhanced Intel SpeedStep Technology or TM2. For this reason this event may have a changing ratio with regards to time. When the core frequency is constant, this event can approximate elapsed time while the core was not in the halt state. It is counted on a dedicated fixed counter, leaving the programmable counters available for other events.", + "SampleAfterValue": "2000003", + "UMask": "0x2", + "Unit": "cpu_core" + }, { "BriefDescription": "Counts the number of unhalted core clock cycles [This event is alias to CPU_CLK_UNHALTED.THREAD_P]", "EventCode": "0x3c", @@ -49,10 +57,18 @@ "SampleAfterValue": "2000003", "Unit": "cpu_atom" }, + { + "BriefDescription": "Thread cycles when thread is not in halt state [This event is alias to CPU_CLK_UNHALTED.THREAD_P]", + "EventCode": "0x3c", + "EventName": "CPU_CLK_UNHALTED.CORE_P", + "PublicDescription": "This is an architectural event that counts the number of thread cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. The core frequency may change from time to time due to power or thermal throttling. For this reason, this event may have a changing ratio with regards to wall clock time. 
[This event is alias to CPU_CLK_UNHALTED.THREAD_P]", + "SampleAfterValue": "2000003", + "Unit": "cpu_core" + }, { "BriefDescription": "Fixed Counter: Counts the number of unhalted reference clock cycles", "EventName": "CPU_CLK_UNHALTED.REF_TSC", - "SampleAfterValue": "1000003", + "SampleAfterValue": "2000003", "UMask": "0x3", "Unit": "cpu_atom" }, @@ -64,6 +80,15 @@ "UMask": "0x3", "Unit": "cpu_core" }, + { + "BriefDescription": "Counts the number of unhalted reference clock cycles", + "EventCode": "0x3c", + "EventName": "CPU_CLK_UNHALTED.REF_TSC_P", + "PublicDescription": "Counts the number of reference cycles that the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. This event is not affected by core frequency changes and increments at a fixed frequency that is also used for the Time Stamp Counter (TSC). This event uses a programmable general purpose performance counter.", + "SampleAfterValue": "2000003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, { "BriefDescription": "Reference cycles when the core is not in halt state.", "EventCode": "0x3c", @@ -74,9 +99,16 @@ "Unit": "cpu_core" }, { - "BriefDescription": "Core cycles when the thread is not in halt state", + "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles", + "EventName": "CPU_CLK_UNHALTED.THREAD", + "SampleAfterValue": "2000003", + "UMask": "0x2", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Core cycles when the thread is not in a halt state.", "EventName": "CPU_CLK_UNHALTED.THREAD", - "PublicDescription": "Counts the number of core cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. This event is a component in many key event ratios. The core frequency may change from time to time due to transitions associated with Enhanced Intel SpeedStep Technology or TM2. For this reason this event may have a changing ratio with regards to time. 
When the core frequency is constant, this event can approximate elapsed time while the core was not in the halt state. It is counted on a dedicated fixed counter, leaving the eight programmable counters available for other events.", + "PublicDescription": "Counts the number of core cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. This event is a component in many key event ratios. The core frequency may change from time to time due to transitions associated with Enhanced Intel SpeedStep Technology or TM2. For this reason this event may have a changing ratio with regards to time. When the core frequency is constant, this event can approximate elapsed time while the core was not in the halt state. It is counted on a dedicated fixed counter, leaving the programmable counters available for other events.", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -89,10 +121,10 @@ "Unit": "cpu_atom" }, { - "BriefDescription": "Thread cycles when thread is not in halt state", + "BriefDescription": "Thread cycles when thread is not in halt state [This event is alias to CPU_CLK_UNHALTED.CORE_P]", "EventCode": "0x3c", "EventName": "CPU_CLK_UNHALTED.THREAD_P", - "PublicDescription": "This is an architectural event that counts the number of thread cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. The core frequency may change from time to time due to power or thermal throttling. For this reason, this event may have a changing ratio with regards to wall clock time.", + "PublicDescription": "This is an architectural event that counts the number of thread cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. The core frequency may change from time to time due to power or thermal throttling. 
For this reason, this event may have a changing ratio with regards to wall clock time. [This event is alias to CPU_CLK_UNHALTED.CORE_P]", "SampleAfterValue": "2000003", "Unit": "cpu_core" }, @@ -100,7 +132,7 @@ "BriefDescription": "Fixed Counter: Counts the number of instructions retired", "EventName": "INST_RETIRED.ANY", "PEBS": "1", - "SampleAfterValue": "1000003", + "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_atom" }, @@ -148,11 +180,29 @@ "UMask": "0x82", "Unit": "cpu_core" }, + { + "BriefDescription": "Counts the number of LBR entries recorded. Requires LBRs to be enabled in IA32_LBR_CTL.", + "EventCode": "0xe4", + "EventName": "MISC_RETIRED.LBR_INSERTS", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "LBR record is inserted", + "EventCode": "0xe4", + "EventName": "MISC_RETIRED.LBR_INSERTS", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, { "BriefDescription": "This event counts a subset of the Topdown Slots event that were not consumed by the back-end pipeline due to lack of back-end resources, as a result of memory subsystem delays, execution units limitations, or other conditions.", "EventCode": "0xa4", "EventName": "TOPDOWN.BACKEND_BOUND_SLOTS", - "PublicDescription": "This event counts a subset of the Topdown Slots event that were not consumed by the back-end pipeline due to lack of back-end resources, as a result of memory subsystem delays, execution units limitations, or other conditions.\nSoftware can use this event as the numerator for the Backend Bound metric (or top-level category) of the Top-down Microarchitecture Analysis method.", + "PublicDescription": "This event counts a subset of the Topdown Slots event that were not consumed by the back-end pipeline due to lack of back-end resources, as a result of memory subsystem delays, execution units limitations, or other conditions. 
Software can use this event as the numerator for the Backend Bound metric (or top-level category) of the Top-down Microarchitecture Analysis method.", "SampleAfterValue": "10000003", "UMask": "0x2", "Unit": "cpu_core" @@ -175,21 +225,35 @@ "Unit": "cpu_core" }, { - "BriefDescription": "Fixed Counter: Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear.", + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. [This event is alias to TOPDOWN_BAD_SPECULATION.ALL_P]", + "EventCode": "0x73", "EventName": "TOPDOWN_BAD_SPECULATION.ALL", - "PublicDescription": "Fixed Counter: Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Counts all issue slots blocked during this recovery window including relevant microcode flows and while uops are not yet available in the IQ. Also, includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear.", "SampleAfterValue": "1000003", - "UMask": "0x5", "Unit": "cpu_atom" }, { - "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls", + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. 
[This event is alias to TOPDOWN_BAD_SPECULATION.ALL]", + "EventCode": "0x73", + "EventName": "TOPDOWN_BAD_SPECULATION.ALL_P", + "SampleAfterValue": "1000003", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls [This event is alias to TOPDOWN_BE_BOUND.ALL_P]", "EventCode": "0xa4", "EventName": "TOPDOWN_BE_BOUND.ALL", "SampleAfterValue": "1000003", "UMask": "0x2", "Unit": "cpu_atom" }, + { + "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls [This event is alias to TOPDOWN_BE_BOUND.ALL]", + "EventCode": "0xa4", + "EventName": "TOPDOWN_BE_BOUND.ALL_P", + "SampleAfterValue": "1000003", + "UMask": "0x2", + "Unit": "cpu_atom" + }, { "BriefDescription": "Fixed Counter: Counts the number of retirement slots not consumed due to front end stalls", "EventName": "TOPDOWN_FE_BOUND.ALL", @@ -198,18 +262,35 @@ "Unit": "cpu_atom" }, { - "BriefDescription": "Fixed Counter: Counts the number of consumed retirement slots. Similar to UOPS_RETIRED.ALL", + "BriefDescription": "Counts the number of retirement slots not consumed due to front end stalls", + "EventCode": "0x9c", + "EventName": "TOPDOWN_FE_BOUND.ALL_P", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Fixed Counter: Counts the number of consumed retirement slots.", "EventName": "TOPDOWN_RETIRING.ALL", "PEBS": "1", "SampleAfterValue": "1000003", "UMask": "0x7", "Unit": "cpu_atom" }, + { + "BriefDescription": "Counts the number of consumed retirement slots.", + "EventCode": "0xc2", + "EventName": "TOPDOWN_RETIRING.ALL_P", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x2", + "Unit": "cpu_atom" + }, { "BriefDescription": "This event counts a subset of the Topdown Slots event that are utilized by operations that eventually get retired (committed) by the processor pipeline. 
Usually, this event positively correlates with higher performance for example, as measured by the instructions-per-cycle metric.", "EventCode": "0xc2", "EventName": "UOPS_RETIRED.SLOTS", - "PublicDescription": "This event counts a subset of the Topdown Slots event that are utilized by operations that eventually get retired (committed) by the processor pipeline. Usually, this event positively correlates with higher performance for example, as measured by the instructions-per-cycle metric.\nSoftware can use this event as the numerator for the Retiring metric (or top-level category) of the Top-down Microarchitecture Analysis method.", + "PublicDescription": "This event counts a subset of the Topdown Slots event that are utilized by operations that eventually get retired (committed) by the processor pipeline. Usually, this event positively correlates with higher performance for example, as measured by the instructions-per-cycle metric. Software can use this event as the numerator for the Retiring metric (or top-level category) of the Top-down Microarchitecture Analysis method.", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 5297d25f4e..c9891630be 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -5,33 +5,33 @@ GenuineIntel-6-(1C|26|27|35|36),v5,bonnell,core GenuineIntel-6-(3D|47),v29,broadwell,core GenuineIntel-6-56,v11,broadwellde,core GenuineIntel-6-4F,v22,broadwellx,core -GenuineIntel-6-55-[56789ABCDEF],v1.20,cascadelakex,core +GenuineIntel-6-55-[56789ABCDEF],v1.21,cascadelakex,core GenuineIntel-6-9[6C],v1.04,elkhartlake,core -GenuineIntel-6-CF,v1.03,emeraldrapids,core +GenuineIntel-6-CF,v1.06,emeraldrapids,core GenuineIntel-6-5[CF],v13,goldmont,core GenuineIntel-6-7A,v1.01,goldmontplus,core -GenuineIntel-6-B6,v1.01,grandridge,core +GenuineIntel-6-B6,v1.02,grandridge,core 
GenuineIntel-6-A[DE],v1.01,graniterapids,core GenuineIntel-6-(3C|45|46),v35,haswell,core GenuineIntel-6-3F,v28,haswellx,core GenuineIntel-6-7[DE],v1.21,icelake,core -GenuineIntel-6-6[AC],v1.23,icelakex,core +GenuineIntel-6-6[AC],v1.24,icelakex,core GenuineIntel-6-3A,v24,ivybridge,core GenuineIntel-6-3E,v24,ivytown,core GenuineIntel-6-2D,v24,jaketown,core GenuineIntel-6-(57|85),v16,knightslanding,core -GenuineIntel-6-BD,v1.00,lunarlake,core -GenuineIntel-6-A[AC],v1.07,meteorlake,core +GenuineIntel-6-BD,v1.01,lunarlake,core +GenuineIntel-6-A[AC],v1.08,meteorlake,core GenuineIntel-6-1[AEF],v4,nehalemep,core GenuineIntel-6-2E,v4,nehalemex,core GenuineIntel-6-A7,v1.02,rocketlake,core GenuineIntel-6-2A,v19,sandybridge,core -GenuineIntel-6-8F,v1.17,sapphirerapids,core -GenuineIntel-6-AF,v1.01,sierraforest,core +GenuineIntel-6-8F,v1.20,sapphirerapids,core +GenuineIntel-6-AF,v1.02,sierraforest,core GenuineIntel-6-(37|4A|4C|4D|5A),v15,silvermont,core GenuineIntel-6-(4E|5E|8E|9E|A5|A6),v58,skylake,core -GenuineIntel-6-55-[01234],v1.32,skylakex,core -GenuineIntel-6-86,v1.21,snowridgex,core +GenuineIntel-6-55-[01234],v1.33,skylakex,core +GenuineIntel-6-86,v1.22,snowridgex,core GenuineIntel-6-8[CD],v1.15,tigerlake,core GenuineIntel-6-2C,v5,westmereep-dp,core GenuineIntel-6-25,v4,westmereep-sp,core @@ -40,3 +40,4 @@ AuthenticAMD-23-([12][0-9A-F]|[0-9A-F]),v2,amdzen1,core AuthenticAMD-23-[[:xdigit:]]+,v1,amdzen2,core AuthenticAMD-25-([245][[:xdigit:]]|[[:xdigit:]]),v1,amdzen3,core AuthenticAMD-25-[[:xdigit:]]+,v1,amdzen4,core +AuthenticAMD-26-[[:xdigit:]]+,v1,amdzen5,core diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/cache.json b/tools/perf/pmu-events/arch/x86/meteorlake/cache.json index 47861a6dd8..af7acb15f6 100644 --- a/tools/perf/pmu-events/arch/x86/meteorlake/cache.json +++ b/tools/perf/pmu-events/arch/x86/meteorlake/cache.json @@ -966,6 +966,16 @@ "UMask": "0x3", "Unit": "cpu_core" }, + { + "BriefDescription": "Counts demand data reads that were supplied by the L3 
cache.", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_DATA_RD.L3_HIT", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x3F803C0001", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, { "BriefDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop was sent, the snoop hit, and modified data was forwarded.", "EventCode": "0xB7", @@ -986,6 +996,16 @@ "UMask": "0x1", "Unit": "cpu_core" }, + { + "BriefDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop was sent, the snoop hit, but no data was forwarded.", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_NO_FWD", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x4003C0001", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, { "BriefDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop was sent, the snoop hit, and non-modified data was forwarded.", "EventCode": "0xB7", @@ -1006,6 +1026,16 @@ "UMask": "0x1", "Unit": "cpu_core" }, + { + "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were supplied by the L3 cache.", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_RFO.L3_HIT", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x3F803C0002", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, { "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were supplied by the L3 cache where a snoop was sent, the snoop hit, and modified data was forwarded.", "EventCode": "0xB7", diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/frontend.json b/tools/perf/pmu-events/arch/x86/meteorlake/frontend.json index 9da8689eda..f3b7b211af 100644 --- a/tools/perf/pmu-events/arch/x86/meteorlake/frontend.json +++ b/tools/perf/pmu-events/arch/x86/meteorlake/frontend.json @@ -378,7 +378,7 @@ 
"CounterMask": "6", "EventCode": "0x79", "EventName": "IDQ.DSB_CYCLES_OK", - "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB).", + "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the DSB (Decode Stream Buffer) path. Count includes uops that may 'bypass' the IDQ.", "SampleAfterValue": "2000003", "UMask": "0x8", "Unit": "cpu_core" @@ -455,7 +455,7 @@ "BriefDescription": "This event counts a subset of the Topdown Slots event that were no operation was delivered to the back-end pipeline due to instruction fetch limitations when the back-end could have accepted more operations. Common examples include instruction cache misses or x86 instruction decode limitations.", "EventCode": "0x9c", "EventName": "IDQ_BUBBLES.CORE", - "PublicDescription": "This event counts a subset of the Topdown Slots event that were no operation was delivered to the back-end pipeline due to instruction fetch limitations when the back-end could have accepted more operations. Common examples include instruction cache misses or x86 instruction decode limitations.\nThe count may be distributed among unhalted logical processors (hyper-threads) who share the same physical core, in processors that support Intel Hyper-Threading Technology. Software can use this event as the numerator for the Frontend Bound metric (or top-level category) of the Top-down Microarchitecture Analysis method.", + "PublicDescription": "This event counts a subset of the Topdown Slots event that were no operation was delivered to the back-end pipeline due to instruction fetch limitations when the back-end could have accepted more operations. Common examples include instruction cache misses or x86 instruction decode limitations. 
The count may be distributed among unhalted logical processors (hyper-threads) who share the same physical core, in processors that support Intel Hyper-Threading Technology. Software can use this event as the numerator for the Frontend Bound metric (or top-level category) of the Top-down Microarchitecture Analysis method.", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/memory.json b/tools/perf/pmu-events/arch/x86/meteorlake/memory.json index a5b83293f1..617d0e255f 100644 --- a/tools/perf/pmu-events/arch/x86/meteorlake/memory.json +++ b/tools/perf/pmu-events/arch/x86/meteorlake/memory.json @@ -296,6 +296,16 @@ "UMask": "0x4", "Unit": "cpu_atom" }, + { + "BriefDescription": "Counts demand data reads that were not supplied by the L3 cache.", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_DATA_RD.L3_MISS", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x3FBFC00001", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, { "BriefDescription": "Counts demand data reads that were not supplied by the L3 cache.", "EventCode": "0x2A,0x2B", @@ -306,6 +316,16 @@ "UMask": "0x1", "Unit": "cpu_core" }, + { + "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were not supplied by the L3 cache.", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_RFO.L3_MISS", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x3FBFC00002", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, { "BriefDescription": "Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that were not supplied by the L3 cache.", "EventCode": "0x2A,0x2B", diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/other.json b/tools/perf/pmu-events/arch/x86/meteorlake/other.json index 7effc1f271..0bc2cb2eab 100644 --- a/tools/perf/pmu-events/arch/x86/meteorlake/other.json +++ 
b/tools/perf/pmu-events/arch/x86/meteorlake/other.json @@ -17,6 +17,16 @@ "UMask": "0x1", "Unit": "cpu_atom" }, + { + "BriefDescription": "Counts demand data reads that have any type of response.", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_DATA_RD.ANY_RESPONSE", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x10001", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, { "BriefDescription": "Counts demand data reads that have any type of response.", "EventCode": "0x2A,0x2B", @@ -27,6 +37,16 @@ "UMask": "0x1", "Unit": "cpu_core" }, + { + "BriefDescription": "Counts demand data reads that were supplied by DRAM.", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_DATA_RD.DRAM", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x184000001", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, { "BriefDescription": "Counts demand data reads that were supplied by DRAM.", "EventCode": "0x2A,0x2B", @@ -37,6 +57,16 @@ "UMask": "0x1", "Unit": "cpu_core" }, + { + "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that have any type of response.", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_RFO.ANY_RESPONSE", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x10002", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, { "BriefDescription": "Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that have any type of response.", "EventCode": "0x2A,0x2B", @@ -47,6 +77,16 @@ "UMask": "0x1", "Unit": "cpu_core" }, + { + "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were supplied by DRAM.", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_RFO.DRAM", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x184000002", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, { "BriefDescription": "Counts 
streaming stores that have any type of response.", "EventCode": "0xB7", @@ -97,7 +137,7 @@ "Unit": "cpu_core" }, { - "BriefDescription": "Counts the number of issue slots in a UMWAIT or TPAUSE instruction where no uop issues due to the instruction putting the CPU into the C0.1 activity state. For Tremont, UMWAIT and TPAUSE will only put the CPU into C0.1 activity state (not C0.2 activity state)", + "BriefDescription": "Counts the number of issue slots in a UMWAIT or TPAUSE instruction where no uop issues due to the instruction putting the CPU into the C0.1 activity state.", "EventCode": "0x75", "EventName": "SERIALIZATION.C01_MS_SCB", "SampleAfterValue": "200003", diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/pipeline.json b/tools/perf/pmu-events/arch/x86/meteorlake/pipeline.json index 24bbfcebd2..5ff4a7a322 100644 --- a/tools/perf/pmu-events/arch/x86/meteorlake/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/meteorlake/pipeline.json @@ -1067,7 +1067,7 @@ "BriefDescription": "This event counts a subset of the Topdown Slots event that were not consumed by the back-end pipeline due to lack of back-end resources, as a result of memory subsystem delays, execution units limitations, or other conditions.", "EventCode": "0xa4", "EventName": "TOPDOWN.BACKEND_BOUND_SLOTS", - "PublicDescription": "This event counts a subset of the Topdown Slots event that were not consumed by the back-end pipeline due to lack of back-end resources, as a result of memory subsystem delays, execution units limitations, or other conditions.\nThe count is distributed among unhalted logical processors (hyper-threads) who share the same physical core, in processors that support Intel Hyper-Threading Technology. 
Software can use this event as the numerator for the Backend Bound metric (or top-level category) of the Top-down Microarchitecture Analysis method.", + "PublicDescription": "This event counts a subset of the Topdown Slots event that were not consumed by the back-end pipeline due to lack of back-end resources, as a result of memory subsystem delays, execution units limitations, or other conditions. The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core, in processors that support Intel Hyper-Threading Technology. Software can use this event as the numerator for the Backend Bound metric (or top-level category) of the Top-down Microarchitecture Analysis method.", "SampleAfterValue": "10000003", "UMask": "0x2", "Unit": "cpu_core" @@ -1116,10 +1116,18 @@ "Unit": "cpu_core" }, { - "BriefDescription": "Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear.", + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. [This event is alias to TOPDOWN_BAD_SPECULATION.ALL_P]", "EventCode": "0x73", "EventName": "TOPDOWN_BAD_SPECULATION.ALL", - "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window, including relevant microcode flows, and while uops are not yet available in the instruction queue (IQ) or until an FE_BOUND event occurs besides OTHER and CISC. 
Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear.", + "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window, including relevant microcode flows, and while uops are not yet available in the instruction queue (IQ) or until an FE_BOUND event occurs besides OTHER and CISC. Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear. [This event is alias to TOPDOWN_BAD_SPECULATION.ALL_P]", + "SampleAfterValue": "1000003", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. [This event is alias to TOPDOWN_BAD_SPECULATION.ALL]", + "EventCode": "0x73", + "EventName": "TOPDOWN_BAD_SPECULATION.ALL_P", + "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window, including relevant microcode flows, and while uops are not yet available in the instruction queue (IQ) or until an FE_BOUND event occurs besides OTHER and CISC. Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear. 
[This event is alias to TOPDOWN_BAD_SPECULATION.ALL]", "SampleAfterValue": "1000003", "Unit": "cpu_atom" }, @@ -1156,7 +1164,7 @@ "Unit": "cpu_atom" }, { - "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls", + "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls [This event is alias to TOPDOWN_BE_BOUND.ALL_P]", "EventCode": "0x74", "EventName": "TOPDOWN_BE_BOUND.ALL", "SampleAfterValue": "1000003", @@ -1170,6 +1178,13 @@ "UMask": "0x1", "Unit": "cpu_atom" }, + { + "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls [This event is alias to TOPDOWN_BE_BOUND.ALL]", + "EventCode": "0x74", + "EventName": "TOPDOWN_BE_BOUND.ALL_P", + "SampleAfterValue": "1000003", + "Unit": "cpu_atom" + }, { "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to memory reservation stall (scheduler not being able to accept another uop). 
This could be caused by RSV full or load/store buffer block.", "EventCode": "0x74", @@ -1211,12 +1226,19 @@ "Unit": "cpu_atom" }, { - "BriefDescription": "Counts the number of retirement slots not consumed due to front end stalls", + "BriefDescription": "Counts the number of retirement slots not consumed due to front end stalls [This event is alias to TOPDOWN_FE_BOUND.ALL_P]", "EventCode": "0x71", "EventName": "TOPDOWN_FE_BOUND.ALL", "SampleAfterValue": "1000003", "Unit": "cpu_atom" }, + { + "BriefDescription": "Counts the number of retirement slots not consumed due to front end stalls [This event is alias to TOPDOWN_FE_BOUND.ALL]", + "EventCode": "0x71", + "EventName": "TOPDOWN_FE_BOUND.ALL_P", + "SampleAfterValue": "1000003", + "Unit": "cpu_atom" + }, { "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to BAClear", "EventCode": "0x71", @@ -1299,13 +1321,21 @@ "Unit": "cpu_atom" }, { - "BriefDescription": "Counts the number of consumed retirement slots. Similar to UOPS_RETIRED.ALL", + "BriefDescription": "Counts the number of consumed retirement slots. Similar to UOPS_RETIRED.ALL [This event is alias to TOPDOWN_RETIRING.ALL_P]", "EventCode": "0x72", "EventName": "TOPDOWN_RETIRING.ALL", "PEBS": "1", "SampleAfterValue": "1000003", "Unit": "cpu_atom" }, + { + "BriefDescription": "Counts the number of consumed retirement slots. Similar to UOPS_RETIRED.ALL [This event is alias to TOPDOWN_RETIRING.ALL]", + "EventCode": "0x72", + "EventName": "TOPDOWN_RETIRING.ALL_P", + "PEBS": "1", + "SampleAfterValue": "1000003", + "Unit": "cpu_atom" + }, { "BriefDescription": "Number of non dec-by-all uops decoded by decoder", "EventCode": "0x76", @@ -1591,7 +1621,7 @@ "BriefDescription": "This event counts a subset of the Topdown Slots event that are utilized by operations that eventually get retired (committed) by the processor pipeline. 
Usually, this event positively correlates with higher performance for example, as measured by the instructions-per-cycle metric.", "EventCode": "0xc2", "EventName": "UOPS_RETIRED.SLOTS", - "PublicDescription": "This event counts a subset of the Topdown Slots event that are utilized by operations that eventually get retired (committed) by the processor pipeline. Usually, this event positively correlates with higher performance for example, as measured by the instructions-per-cycle metric.\nSoftware can use this event as the numerator for the Retiring metric (or top-level category) of the Top-down Microarchitecture Analysis method.", + "PublicDescription": "This event counts a subset of the Topdown Slots event that are utilized by operations that eventually get retired (committed) by the processor pipeline. Usually, this event positively correlates with higher performance for example, as measured by the instructions-per-cycle metric. Software can use this event as the numerator for the Retiring metric (or top-level category) of the Top-down Microarchitecture Analysis method.", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/meteorlake/uncore-interconnect.json index 08b5c7574c..901d8510f9 100644 --- a/tools/perf/pmu-events/arch/x86/meteorlake/uncore-interconnect.json +++ b/tools/perf/pmu-events/arch/x86/meteorlake/uncore-interconnect.json @@ -1,4 +1,20 @@ [ + { + "BriefDescription": "Each cycle counts number of coherent reads pending on data return from memory controller that were issued by any core.", + "EventCode": "0x85", + "EventName": "UNC_ARB_DAT_OCCUPANCY.RD", + "PerPkg": "1", + "UMask": "0x2", + "Unit": "ARB" + }, + { + "BriefDescription": "Number of entries allocated. Account for Any type: e.g. 
Snoop, etc.", + "EventCode": "0x84", + "EventName": "UNC_HAC_ARB_COH_TRK_REQUESTS.ALL", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "HAC_ARB" + }, { "BriefDescription": "Number of all coherent Data Read entries. Doesn't include prefetches", "EventCode": "0x81", @@ -9,7 +25,7 @@ }, { "BriefDescription": "Number of all CMI transactions", - "EventCode": "0x8a", + "EventCode": "0x8A", "EventName": "UNC_HAC_ARB_TRANSACTIONS.ALL", "PerPkg": "1", "UMask": "0x1", @@ -17,7 +33,7 @@ }, { "BriefDescription": "Number of all CMI reads", - "EventCode": "0x8a", + "EventCode": "0x8A", "EventName": "UNC_HAC_ARB_TRANSACTIONS.READS", "PerPkg": "1", "UMask": "0x2", @@ -25,7 +41,7 @@ }, { "BriefDescription": "Number of all CMI writes not including Mflush", - "EventCode": "0x8a", + "EventCode": "0x8A", "EventName": "UNC_HAC_ARB_TRANSACTIONS.WRITES", "PerPkg": "1", "UMask": "0x4", diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/cache.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/cache.json index 9606e76b98..b0447aad0d 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/cache.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/cache.json @@ -432,6 +432,7 @@ "BriefDescription": "Retired load instructions with remote Intel(R) Optane(TM) DC persistent memory as the data source where the data request missed all caches.", "EventCode": "0xd3", "EventName": "MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", + "PEBS": "1", "PublicDescription": "Counts retired load instructions with remote Intel(R) Optane(TM) DC persistent memory as the data source and the data request missed L3.", "SampleAfterValue": "100007", "UMask": "0x10" diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/frontend.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/frontend.json index 9e53da55d0..93d99318a6 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/frontend.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/frontend.json @@ -267,7 +267,7 @@ "CounterMask": "6", "EventCode": 
"0x79", "EventName": "IDQ.DSB_CYCLES_OK", - "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB).", + "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the DSB (Decode Stream Buffer) path. Count includes uops that may 'bypass' the IDQ.", "SampleAfterValue": "2000003", "UMask": "0x8" }, diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/memory.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/memory.json index e8bf7c9c44..5420f529f4 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/memory.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/memory.json @@ -264,6 +264,7 @@ "BriefDescription": "Number of times an RTM execution aborted.", "EventCode": "0xc9", "EventName": "RTM_RETIRED.ABORTED", + "PEBS": "1", "PublicDescription": "Counts the number of times RTM abort was triggered.", "SampleAfterValue": "100003", "UMask": "0x4" diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/pipeline.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/pipeline.json index 2cfe814d20..e2086bedec 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/pipeline.json @@ -1,20 +1,4 @@ [ - { - "BriefDescription": "AMX retired arithmetic BF16 operations.", - "EventCode": "0xce", - "EventName": "AMX_OPS_RETIRED.BF16", - "PublicDescription": "Number of AMX-based retired arithmetic bfloat16 (BF16) floating-point operations. Counts TDPBF16PS FP instructions. 
SW to use operation multiplier of 4", - "SampleAfterValue": "1000003", - "UMask": "0x2" - }, - { - "BriefDescription": "AMX retired arithmetic integer 8-bit operations.", - "EventCode": "0xce", - "EventName": "AMX_OPS_RETIRED.INT8", - "PublicDescription": "Number of AMX-based retired arithmetic integer operations of 8-bit width source operands. Counts TDPB[SS,UU,US,SU]D instructions. SW should use operation multiplier of 8.", - "SampleAfterValue": "1000003", - "UMask": "0x1" - }, { "BriefDescription": "This event is deprecated. Refer to new event ARITH.DIV_ACTIVE", "CounterMask": "1", @@ -444,6 +428,7 @@ "BriefDescription": "INST_RETIRED.MACRO_FUSED", "EventCode": "0xc0", "EventName": "INST_RETIRED.MACRO_FUSED", + "PEBS": "1", "SampleAfterValue": "2000003", "UMask": "0x10" }, @@ -451,6 +436,7 @@ "BriefDescription": "Retired NOP instructions.", "EventCode": "0xc0", "EventName": "INST_RETIRED.NOP", + "PEBS": "1", "PublicDescription": "Counts all retired NOP or ENDBR32/64 instructions", "SampleAfterValue": "2000003", "UMask": "0x2" @@ -467,6 +453,7 @@ "BriefDescription": "Iterations of Repeat string retired instructions.", "EventCode": "0xc0", "EventName": "INST_RETIRED.REP_ITERATION", + "PEBS": "1", "PublicDescription": "Number of iterations of Repeat (REP) string retired instructions such as MOVS, CMPS, and SCAS. Each has a byte, word, and doubleword version and string instructions can be repeated using a repetition prefix, REP, that allows their architectural execution to be repeated a number of times as specified by the RCX register. 
Note the number of iterations is implementation-dependent.", "SampleAfterValue": "2000003", "UMask": "0x8" diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json index 6f0e6360e9..f8c0eac8b8 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json @@ -727,23 +727,20 @@ { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", "MetricExpr": "(100 * (1 - max(0, topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - topdown\\-mem\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound)) / (((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + cpu@RS.EMPTY\\,umask\\=0x1@) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIV_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS else (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / CPU_CLK_UNHALTED.THREAD) if max(0, topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - topdown\\-mem\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound)) < (((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + cpu@RS.EMPTY\\,umask\\=0x1@) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) / CPU_CLK_UNHALTED.THREAD * 
CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIV_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS else (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / CPU_CLK_UNHALTED.THREAD) else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0) + 0 * slots", - "MetricGroup": "Cor;SMT;TopdownL1;tma_L1_group", - "MetricName": "tma_info_botlnk_core_bound_likely", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Cor;SMT", + "MetricName": "tma_info_botlnk_core_bound_likely" }, { "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.", "MetricExpr": "100 * (100 * ((topdown\\-fetch\\-lat / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / slots) * (DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) / (ICACHE_DATA.STALLS / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + INT_MISC.UNKNOWN_BRANCH_CYCLES / CPU_CLK_UNHALTED.THREAD) + min(3 * cpu@UOPS_RETIRED.MS\\,cmask\\=0x1\\,edge\\=0x1@ / (UOPS_RETIRED.SLOTS / UOPS_ISSUED.ANY) / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) + max(0, topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / slots - (topdown\\-fetch\\-lat / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / slots)) * ((IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / 
(CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD) / 2) / ((IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD) / 2 + (IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD) / 2)))", - "MetricGroup": "DSBmiss;Fed;TopdownL1;tma_L1_group", - "MetricName": "tma_info_botlnk_dsb_misses", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "DSBmiss;Fed", + "MetricName": "tma_info_botlnk_dsb_misses" }, { "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck.", "MetricExpr": "100 * (100 * ((topdown\\-fetch\\-lat / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / slots) * (ICACHE_DATA.STALLS / CPU_CLK_UNHALTED.THREAD) / (ICACHE_DATA.STALLS / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + INT_MISC.UNKNOWN_BRANCH_CYCLES / CPU_CLK_UNHALTED.THREAD) + min(3 * cpu@UOPS_RETIRED.MS\\,cmask\\=0x1\\,edge\\=0x1@ / (UOPS_RETIRED.SLOTS / UOPS_ISSUED.ANY) / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD)))", - "MetricGroup": "Fed;FetchLat;IcMiss;TopdownL1;tma_L1_group", - "MetricName": "tma_info_botlnk_ic_misses", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Fed;FetchLat;IcMiss", + "MetricName": "tma_info_botlnk_ic_misses" }, { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", @@ -1113,16 +1110,14 @@ { "BriefDescription": "\"Bus lock\" per kilo instruction", "MetricExpr": "tma_info_memory_mix_bus_lock_pki", - "MetricGroup": "Mem;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_bus_lock_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_bus_lock_pki" }, { 
"BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "tma_info_memory_tlb_code_stlb_mpki", - "MetricGroup": "Fed;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_code_stlb_mpki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Fed;MemoryTLB", + "MetricName": "tma_info_memory_code_stlb_mpki" }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", @@ -1163,9 +1158,8 @@ { "BriefDescription": "Average Parallel L2 cache miss data reads", "MetricExpr": "tma_info_memory_latency_data_l2_mlp", - "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_data_l2_mlp", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_data_l2_mlp" }, { "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", @@ -1182,9 +1176,8 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", @@ -1207,23 +1200,20 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l2_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw_2t" }, { 
"BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / INST_RETIRED.ANY", - "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l2_evictions_nonsilent_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "L2Evicts;Mem;Server", + "MetricName": "tma_info_memory_l2_evictions_nonsilent_pki" }, { "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)", "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / INST_RETIRED.ANY", - "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l2_evictions_silent_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "L2Evicts;Mem;Server", + "MetricName": "tma_info_memory_l2_evictions_silent_pki" }, { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", @@ -1264,9 +1254,8 @@ { "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l3_cache_access_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_l3_cache_access_bw_2t" }, { "BriefDescription": "", @@ -1277,9 +1266,8 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l3_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw_2t" }, { "BriefDescription": "L3 cache true misses per kilo instruction for retired 
demand loads", @@ -1314,23 +1302,20 @@ { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", - "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_l2_miss_latency", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_Lat;Offcore", + "MetricName": "tma_info_memory_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=0x1@", - "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_l2_mlp", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_load_l2_mlp" }, { "BriefDescription": "Average Latency for L3 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", - "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_l3_miss_latency", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_Lat;Offcore", + "MetricName": "tma_info_memory_load_l3_miss_latency" }, { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", @@ -1341,9 +1326,8 @@ { "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "tma_info_memory_tlb_load_stlb_mpki", - "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_stlb_mpki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_load_stlb_mpki" }, { "BriefDescription": "\"Bus lock\" per kilo instruction", @@ -1385,53 +1369,46 @@ { "BriefDescription": "Off-core 
accesses per kilo instruction for modified write requests", "MetricExpr": "1e3 * OCR.MODIFIED_WRITE.ANY_RESPONSE / INST_RETIRED.ANY", - "MetricGroup": "Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_offcore_mwrite_any_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Offcore", + "MetricName": "tma_info_memory_offcore_mwrite_any_pki" }, { "BriefDescription": "Off-core accesses per kilo instruction for reads-to-core requests (speculative; including in-core HW prefetches)", "MetricExpr": "1e3 * OCR.READS_TO_CORE.ANY_RESPONSE / INST_RETIRED.ANY", - "MetricGroup": "CacheHits;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_offcore_read_any_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "CacheHits;Offcore", + "MetricName": "tma_info_memory_offcore_read_any_pki" }, { "BriefDescription": "L3 cache misses per kilo instruction for reads-to-core requests (speculative; including in-core HW prefetches)", "MetricExpr": "1e3 * OCR.READS_TO_CORE.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_offcore_read_l3m_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Offcore", + "MetricName": "tma_info_memory_offcore_read_l3m_pki" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (4 * (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD))", - "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_page_walks_utilization", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_page_walks_utilization" }, { "BriefDescription": "Average DRAM BW for Reads-to-Core (R2C) covering for memory attached to local- and remote-socket", "MetricExpr": "64 * OCR.READS_TO_CORE.DRAM / 1e9 / 
(duration_time * 1e3 / 1e3)", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;TopdownL1;tma_L1_group", + "MetricGroup": "HPC;Mem;MemoryBW;SoC", "MetricName": "tma_info_memory_r2c_dram_bw", - "MetricgroupNoGroup": "TopdownL1", "PublicDescription": "Average DRAM BW for Reads-to-Core (R2C) covering for memory attached to local- and remote-socket. See R2C_Offcore_BW." }, { "BriefDescription": "Average L3-cache miss BW for Reads-to-Core (R2C)", "MetricExpr": "64 * OCR.READS_TO_CORE.L3_MISS / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;TopdownL1;tma_L1_group", + "MetricGroup": "HPC;Mem;MemoryBW;SoC", "MetricName": "tma_info_memory_r2c_l3m_bw", - "MetricgroupNoGroup": "TopdownL1", "PublicDescription": "Average L3-cache miss BW for Reads-to-Core (R2C). This covering going to DRAM or other memory off-chip memory tears. See R2C_Offcore_BW." }, { "BriefDescription": "Average Off-core access BW for Reads-to-Core (R2C)", "MetricExpr": "64 * OCR.READS_TO_CORE.ANY_RESPONSE / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;TopdownL1;tma_L1_group", + "MetricGroup": "HPC;Mem;MemoryBW;SoC", "MetricName": "tma_info_memory_r2c_offcore_bw", - "MetricgroupNoGroup": "TopdownL1", "PublicDescription": "Average Off-core access BW for Reads-to-Core (R2C). R2C account for demand or prefetch load/RFO/code access that fill data into the Core caches." 
}, { @@ -1458,9 +1435,8 @@ { "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "tma_info_memory_tlb_store_stlb_mpki", - "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_store_stlb_mpki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_store_stlb_mpki" }, { "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", @@ -1490,9 +1466,8 @@ { "BriefDescription": "Un-cacheable retired load per kilo instruction", "MetricExpr": "1e3 * MEM_LOAD_MISC_RETIRED.UC / INST_RETIRED.ANY", - "MetricGroup": "Mem;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_uc_load_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_uc_load_pki" }, { "BriefDescription": "", diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-cache.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-cache.json index cf6fa70f37..25a2b96951 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-cache.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-cache.json @@ -4143,6 +4143,42 @@ "UMask": "0xcd43ff04", "Unit": "CHA" }, + { + "BriefDescription": "ItoMCacheNear (partial write) transactions from an IO device that addresses memory on the local socket", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_ITOMCACHENEAR_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. 
Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcd42ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoMCacheNear (partial write) transactions from an IO device that addresses memory on a remote socket", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_ITOMCACHENEAR_REMOTE", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcd437f04", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoM (write) transactions from an IO device that addresses memory on the local socket", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_ITOM_LOCAL", + "PerPkg": "1", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcc42ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoM (write) transactions from an IO device that addresses memory on a remote socket", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_ITOM_REMOTE", + "PerPkg": "1", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. 
Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcc437f04", + "Unit": "CHA" + }, { "BriefDescription": "TOR Inserts; Misses from local IO", "EventCode": "0x35", @@ -4153,7 +4189,7 @@ "Unit": "CHA" }, { - "BriefDescription": "TOR Inserts; ItoM misses from local IO", + "BriefDescription": "TOR Inserts : ItoM, indicating a full cacheline write request, from IO Devices that missed the LLC", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOM", "PerPkg": "1", @@ -4171,7 +4207,7 @@ "Unit": "CHA" }, { - "BriefDescription": "TOR Inserts; RdCur and FsRdCur misses from local IO", + "BriefDescription": "TOR Inserts; RdCur and FsRdCur requests from local IO that miss LLC", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_PCIRDCUR", "PerPkg": "1", @@ -4197,6 +4233,24 @@ "UMask": "0xc8f3ff04", "Unit": "CHA" }, + { + "BriefDescription": "PCIRDCUR (read) transactions from an IO device that addresses memory on the local socket", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR_LOCAL", + "PerPkg": "1", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xc8f2ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "PCIRDCUR (read) transactions from an IO device that addresses memory on a remote socket", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR_REMOTE", + "PerPkg": "1", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent.
Does not include addressless requests such as locks and interrupts.", + "UMask": "0xc8f37f04", + "Unit": "CHA" + }, { "BriefDescription": "TOR Inserts; RFO from local IO", "EventCode": "0x35", @@ -5565,6 +5619,42 @@ "UMask": "0xcd43fe04", "Unit": "CHA" }, + { + "BriefDescription": "TOR Occupancy : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC and targets local memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOMCACHENEAR_LOCAL", + "PerPkg": "1", + "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcd42fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC and targets remote memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOMCACHENEAR_REMOTE", + "PerPkg": "1", + "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcd437e04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy; ITOM misses from local IO and targets local memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOM_LOCAL", + "PerPkg": "1", + "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. 
Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcc42fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy; ITOM misses from local IO and targets remote memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOM_REMOTE", + "PerPkg": "1", + "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcc437e04", + "Unit": "CHA" + }, { "BriefDescription": "TOR Occupancy; RdCur and FsRdCur misses from local IO", "EventCode": "0x36", @@ -5574,6 +5664,24 @@ "UMask": "0xc8f3fe04", "Unit": "CHA" }, + { + "BriefDescription": "TOR Occupancy; RdCur and FsRdCur misses from local IO and targets local memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_PCIRDCUR_LOCAL", + "PerPkg": "1", + "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xc8f2fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy; RdCur and FsRdCur misses from local IO and targets remote memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_PCIRDCUR_REMOTE", + "PerPkg": "1", + "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. 
Does not include addressless requests such as locks and interrupts.", + "UMask": "0xc8f37e04", + "Unit": "CHA" + }, { "BriefDescription": "TOR Occupancy; RFO misses from local IO", "EventCode": "0x36", diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-interconnect.json index 65d088556b..22bb490e96 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-interconnect.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-interconnect.json @@ -4888,7 +4888,7 @@ "Unit": "MDF" }, { - "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO\r\nIngress (V-EMIB) (AD)", + "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO Ingress (V-EMIB) (AD)", "EventCode": "0x4B", "EventName": "UNC_MDF_CRS_TxR_V_BOUNCES.AD", "PerPkg": "1", @@ -4897,7 +4897,7 @@ "Unit": "MDF" }, { - "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO\r\nIngress (V-EMIB) (AK)", + "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO Ingress (V-EMIB) (AK)", "EventCode": "0x4B", "EventName": "UNC_MDF_CRS_TxR_V_BOUNCES.AK", "PerPkg": "1", @@ -4906,7 +4906,7 @@ "Unit": "MDF" }, { - "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO\r\nIngress (V-EMIB) (AKC)", + "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO Ingress (V-EMIB) (AKC)", "EventCode": "0x4B", "EventName": "UNC_MDF_CRS_TxR_V_BOUNCES.AKC", "PerPkg": "1", @@ -4915,7 +4915,7 @@ "Unit": "MDF" }, { - "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO\r\nIngress (V-EMIB) (BL)", + "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced 
at the SBO Ingress (V-EMIB) (BL)", "EventCode": "0x4B", "EventName": "UNC_MDF_CRS_TxR_V_BOUNCES.BL", "PerPkg": "1", @@ -4924,7 +4924,7 @@ "Unit": "MDF" }, { - "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO\r\nIngress (V-EMIB) (IV)", + "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO Ingress (V-EMIB) (IV)", "EventCode": "0x4B", "EventName": "UNC_MDF_CRS_TxR_V_BOUNCES.IV", "PerPkg": "1", @@ -5291,7 +5291,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCB", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Bypass : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Bypass : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xe", "Unit": "UPI" }, @@ -5300,7 +5300,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCB_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Bypass, Match Opcode : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Bypass, Match Opcode : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10e", "Unit": "UPI" }, @@ -5309,7 +5309,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCS", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Standard : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Standard : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xf", "Unit": "UPI" }, @@ -5318,7 +5318,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCS_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Standard, Match Opcode : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Standard, Match Opcode : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10f", "Unit": "UPI" }, @@ -5763,7 +5763,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCB", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Bypass : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Bypass : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xe", "Unit": "UPI" }, @@ -5772,7 +5772,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCB_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Bypass, Match Opcode : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Bypass, Match Opcode : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10e", "Unit": "UPI" }, @@ -5781,7 +5781,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCS", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Standard : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Standard : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xf", "Unit": "UPI" }, @@ -5790,7 +5790,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCS_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Standard, Match Opcode : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Standard, Match Opcode : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. 
Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10f", "Unit": "UPI" }, diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/pipeline.json b/tools/perf/pmu-events/arch/x86/sierraforest/pipeline.json index ba9843110f..90292dc03d 100644 --- a/tools/perf/pmu-events/arch/x86/sierraforest/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/sierraforest/pipeline.json @@ -249,10 +249,17 @@ "UMask": "0x1" }, { - "BriefDescription": "Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear.", + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. [This event is alias to TOPDOWN_BAD_SPECULATION.ALL_P]", "EventCode": "0x73", "EventName": "TOPDOWN_BAD_SPECULATION.ALL", - "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window, including relevant microcode flows, and while uops are not yet available in the instruction queue (IQ) or until an FE_BOUND event occurs besides OTHER and CISC. Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear.", + "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. 
Counts all issue slots blocked during this recovery window, including relevant microcode flows, and while uops are not yet available in the instruction queue (IQ) or until an FE_BOUND event occurs besides OTHER and CISC. Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear. [This event is alias to TOPDOWN_BAD_SPECULATION.ALL_P]", + "SampleAfterValue": "1000003" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. [This event is alias to TOPDOWN_BAD_SPECULATION.ALL]", + "EventCode": "0x73", + "EventName": "TOPDOWN_BAD_SPECULATION.ALL_P", + "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window, including relevant microcode flows, and while uops are not yet available in the instruction queue (IQ) or until an FE_BOUND event occurs besides OTHER and CISC. Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear. 
[This event is alias to TOPDOWN_BAD_SPECULATION.ALL]", "SampleAfterValue": "1000003" }, { @@ -284,7 +291,7 @@ "UMask": "0x1" }, { - "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls", + "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls [This event is alias to TOPDOWN_BE_BOUND.ALL_P]", "EventCode": "0x74", "EventName": "TOPDOWN_BE_BOUND.ALL", "SampleAfterValue": "1000003" @@ -296,6 +303,12 @@ "SampleAfterValue": "1000003", "UMask": "0x1" }, + { + "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls [This event is alias to TOPDOWN_BE_BOUND.ALL]", + "EventCode": "0x74", + "EventName": "TOPDOWN_BE_BOUND.ALL_P", + "SampleAfterValue": "1000003" + }, { "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to memory reservation stall (scheduler not being able to accept another uop). This could be caused by RSV full or load/store buffer block.", "EventCode": "0x74", @@ -332,11 +345,17 @@ "UMask": "0x10" }, { - "BriefDescription": "Counts the number of retirement slots not consumed due to front end stalls", + "BriefDescription": "Counts the number of retirement slots not consumed due to front end stalls [This event is alias to TOPDOWN_FE_BOUND.ALL_P]", "EventCode": "0x71", "EventName": "TOPDOWN_FE_BOUND.ALL", "SampleAfterValue": "1000003" }, + { + "BriefDescription": "Counts the number of retirement slots not consumed due to front end stalls [This event is alias to TOPDOWN_FE_BOUND.ALL]", + "EventCode": "0x71", + "EventName": "TOPDOWN_FE_BOUND.ALL_P", + "SampleAfterValue": "1000003" + }, { "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to BAClear", "EventCode": "0x71", @@ -409,12 +428,19 @@ "UMask": "0x4" }, { - "BriefDescription": "Counts the number of consumed retirement slots. 
Similar to UOPS_RETIRED.ALL", + "BriefDescription": "Counts the number of consumed retirement slots. Similar to UOPS_RETIRED.ALL [This event is alias to TOPDOWN_RETIRING.ALL_P]", "EventCode": "0x72", "EventName": "TOPDOWN_RETIRING.ALL", "PEBS": "1", "SampleAfterValue": "1000003" }, + { + "BriefDescription": "Counts the number of consumed retirement slots. Similar to UOPS_RETIRED.ALL [This event is alias to TOPDOWN_RETIRING.ALL]", + "EventCode": "0x72", + "EventName": "TOPDOWN_RETIRING.ALL_P", + "PEBS": "1", + "SampleAfterValue": "1000003" + }, { "BriefDescription": "Counts the number of uops issued by the front end every cycle.", "EventCode": "0x0e", diff --git a/tools/perf/pmu-events/arch/x86/skylake/frontend.json b/tools/perf/pmu-events/arch/x86/skylake/frontend.json index 095904c770..d6f543471b 100644 --- a/tools/perf/pmu-events/arch/x86/skylake/frontend.json +++ b/tools/perf/pmu-events/arch/x86/skylake/frontend.json @@ -19,7 +19,7 @@ "BriefDescription": "Decode Stream Buffer (DSB)-to-MITE switches", "EventCode": "0xAB", "EventName": "DSB2MITE_SWITCHES.COUNT", - "PublicDescription": "This event counts the number of the Decode Stream Buffer (DSB)-to-MITE switches including all misses because of missing Decode Stream Buffer (DSB) cache and u-arch forced misses.\nNote: Invoking MITE requires two or three cycles delay.", + "PublicDescription": "This event counts the number of the Decode Stream Buffer (DSB)-to-MITE switches including all misses because of missing Decode Stream Buffer (DSB) cache and u-arch forced misses. 
Note: Invoking MITE requires two or three cycles delay.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -267,11 +267,11 @@ "UMask": "0x4" }, { - "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering 4 Uops [This event is alias to IDQ.DSB_CYCLES_OK]", + "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering 4 or more Uops [This event is alias to IDQ.DSB_CYCLES_OK]", "CounterMask": "4", "EventCode": "0x79", "EventName": "IDQ.ALL_DSB_CYCLES_4_UOPS", - "PublicDescription": "Counts the number of cycles 4 uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Count includes uops that may 'bypass' the IDQ. [This event is alias to IDQ.DSB_CYCLES_OK]", + "PublicDescription": "Counts the number of cycles 4 or more uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Count includes uops that may 'bypass' the IDQ. [This event is alias to IDQ.DSB_CYCLES_OK]", "SampleAfterValue": "2000003", "UMask": "0x18" }, @@ -321,11 +321,11 @@ "UMask": "0x18" }, { - "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering 4 Uops [This event is alias to IDQ.ALL_DSB_CYCLES_4_UOPS]", + "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering 4 or more Uops [This event is alias to IDQ.ALL_DSB_CYCLES_4_UOPS]", "CounterMask": "4", "EventCode": "0x79", "EventName": "IDQ.DSB_CYCLES_OK", - "PublicDescription": "Counts the number of cycles 4 uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Count includes uops that may 'bypass' the IDQ. [This event is alias to IDQ.ALL_DSB_CYCLES_4_UOPS]", + "PublicDescription": "Counts the number of cycles 4 or more uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Count includes uops that may 'bypass' the IDQ. 
[This event is alias to IDQ.ALL_DSB_CYCLES_4_UOPS]", "SampleAfterValue": "2000003", "UMask": "0x18" }, diff --git a/tools/perf/pmu-events/arch/x86/skylakex/cache.json b/tools/perf/pmu-events/arch/x86/skylakex/cache.json index d28d8822a5..14229f4b29 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/cache.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/cache.json @@ -763,6 +763,15 @@ "SampleAfterValue": "100003", "UMask": "0x1" }, + { + "BriefDescription": "OFFCORE_RESPONSE.ALL_READS.L3_HIT.HIT_OTHER_CORE_FWD hit in the L3 and the snoop to one of the sibling cores hits the line in E/S/F state and the line is forwarded.", + "EventCode": "0xB7, 0xBB", + "EventName": "OFFCORE_RESPONSE.ALL_READS.L3_HIT.HIT_OTHER_CORE_FWD", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x8003C07F7", + "SampleAfterValue": "100003", + "UMask": "0x1" + }, { "BriefDescription": "Counts all demand & prefetch RFOs that have any response type.", "EventCode": "0xB7, 0xBB", diff --git a/tools/perf/pmu-events/arch/x86/skylakex/frontend.json b/tools/perf/pmu-events/arch/x86/skylakex/frontend.json index 095904c770..d6f543471b 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/frontend.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/frontend.json @@ -19,7 +19,7 @@ "BriefDescription": "Decode Stream Buffer (DSB)-to-MITE switches", "EventCode": "0xAB", "EventName": "DSB2MITE_SWITCHES.COUNT", - "PublicDescription": "This event counts the number of the Decode Stream Buffer (DSB)-to-MITE switches including all misses because of missing Decode Stream Buffer (DSB) cache and u-arch forced misses.\nNote: Invoking MITE requires two or three cycles delay.", + "PublicDescription": "This event counts the number of the Decode Stream Buffer (DSB)-to-MITE switches including all misses because of missing Decode Stream Buffer (DSB) cache and u-arch forced misses. 
Note: Invoking MITE requires two or three cycles delay.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -267,11 +267,11 @@ "UMask": "0x4" }, { - "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering 4 Uops [This event is alias to IDQ.DSB_CYCLES_OK]", + "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering 4 or more Uops [This event is alias to IDQ.DSB_CYCLES_OK]", "CounterMask": "4", "EventCode": "0x79", "EventName": "IDQ.ALL_DSB_CYCLES_4_UOPS", - "PublicDescription": "Counts the number of cycles 4 uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Count includes uops that may 'bypass' the IDQ. [This event is alias to IDQ.DSB_CYCLES_OK]", + "PublicDescription": "Counts the number of cycles 4 or more uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Count includes uops that may 'bypass' the IDQ. [This event is alias to IDQ.DSB_CYCLES_OK]", "SampleAfterValue": "2000003", "UMask": "0x18" }, @@ -321,11 +321,11 @@ "UMask": "0x18" }, { - "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering 4 Uops [This event is alias to IDQ.ALL_DSB_CYCLES_4_UOPS]", + "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering 4 or more Uops [This event is alias to IDQ.ALL_DSB_CYCLES_4_UOPS]", "CounterMask": "4", "EventCode": "0x79", "EventName": "IDQ.DSB_CYCLES_OK", - "PublicDescription": "Counts the number of cycles 4 uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Count includes uops that may 'bypass' the IDQ. [This event is alias to IDQ.ALL_DSB_CYCLES_4_UOPS]", + "PublicDescription": "Counts the number of cycles 4 or more uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Count includes uops that may 'bypass' the IDQ. 
[This event is alias to IDQ.ALL_DSB_CYCLES_4_UOPS]", "SampleAfterValue": "2000003", "UMask": "0x18" }, diff --git a/tools/perf/pmu-events/arch/x86/skylakex/memory.json b/tools/perf/pmu-events/arch/x86/skylakex/memory.json index 2b797dbc75..dba3cd6b36 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/memory.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/memory.json @@ -864,7 +864,7 @@ "BriefDescription": "Number of times an RTM execution aborted due to any reasons (multiple categories may count as one).", "EventCode": "0xC9", "EventName": "RTM_RETIRED.ABORTED", - "PEBS": "1", + "PEBS": "2", "PublicDescription": "Number of times RTM abort was triggered.", "SampleAfterValue": "2000003", "UMask": "0x4" diff --git a/tools/perf/pmu-events/arch/x86/skylakex/other.json b/tools/perf/pmu-events/arch/x86/skylakex/other.json index cda8a7a45f..2511d72232 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/other.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/other.json @@ -19,7 +19,7 @@ "BriefDescription": "Core cycles where the core was running in a manner where Turbo may be clipped to the AVX512 turbo schedule.", "EventCode": "0x28", "EventName": "CORE_POWER.LVL2_TURBO_LICENSE", - "PublicDescription": "Core cycles where the core was running with power-delivery for license level 2 (introduced in Skylake Server michroarchtecture). This includes high current AVX 512-bit instructions.", + "PublicDescription": "Core cycles where the core was running with power-delivery for license level 2 (introduced in Skylake Server microarchitecture). 
This includes high current AVX 512-bit instructions.", "SampleAfterValue": "200003", "UMask": "0x20" }, diff --git a/tools/perf/pmu-events/arch/x86/skylakex/pipeline.json b/tools/perf/pmu-events/arch/x86/skylakex/pipeline.json index 66d686cc93..c50ddf5b40 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/pipeline.json @@ -396,7 +396,7 @@ "Errata": "SKL091, SKL044", "EventCode": "0xC0", "EventName": "INST_RETIRED.NOP", - "PEBS": "1", + "PEBS": "2", "SampleAfterValue": "2000003", "UMask": "0x2" }, diff --git a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json index 025e836a1c..8126f952a3 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json @@ -652,23 +652,20 @@ { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", "MetricExpr": "(100 * (1 - tma_core_bound / (((EXE_ACTIVITY.EXE_BOUND_0_PORTS + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) if tma_core_bound < (((EXE_ACTIVITY.EXE_BOUND_0_PORTS + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * 
EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", - "MetricGroup": "Cor;SMT;TopdownL1;tma_L1_group", - "MetricName": "tma_info_botlnk_core_bound_likely", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Cor;SMT", + "MetricName": "tma_info_botlnk_core_bound_likely" }, { "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.", "MetricExpr": "100 * (100 * (tma_fetch_latency * (DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) / ((ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=0x1\\,edge\\=0x1@) / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + 9 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) + min(2 * IDQ.MS_SWITCHES / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) + tma_fetch_bandwidth * tma_mite / (tma_mite + tma_dsb)))", - "MetricGroup": "DSBmiss;Fed;TopdownL1;tma_L1_group", - "MetricName": "tma_info_botlnk_dsb_misses", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "DSBmiss;Fed", + "MetricName": "tma_info_botlnk_dsb_misses" }, { "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck.", "MetricExpr": "100 * (100 * (tma_fetch_latency * ((ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=0x1\\,edge\\=0x1@) / CPU_CLK_UNHALTED.THREAD) / ((ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=0x1\\,edge\\=0x1@) / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + 9 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) + min(2 * IDQ.MS_SWITCHES / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD)))", - "MetricGroup": 
"Fed;FetchLat;IcMiss;TopdownL1;tma_L1_group", - "MetricName": "tma_info_botlnk_ic_misses", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Fed;FetchLat;IcMiss", + "MetricName": "tma_info_botlnk_ic_misses" }, { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", @@ -1021,9 +1018,8 @@ { "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "tma_info_memory_tlb_code_stlb_mpki", - "MetricGroup": "Fed;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_code_stlb_mpki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Fed;MemoryTLB", + "MetricName": "tma_info_memory_code_stlb_mpki" }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", @@ -1064,9 +1060,8 @@ { "BriefDescription": "Average Parallel L2 cache miss data reads", "MetricExpr": "tma_info_memory_latency_data_l2_mlp", - "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_data_l2_mlp", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_data_l2_mlp" }, { "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", @@ -1083,9 +1078,8 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", @@ -1108,23 +1102,20 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache 
[GB / sec]", "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l2_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw_2t" }, { "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / INST_RETIRED.ANY", - "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l2_evictions_nonsilent_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "L2Evicts;Mem;Server", + "MetricName": "tma_info_memory_l2_evictions_nonsilent_pki" }, { "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)", "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / INST_RETIRED.ANY", - "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l2_evictions_silent_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "L2Evicts;Mem;Server", + "MetricName": "tma_info_memory_l2_evictions_silent_pki" }, { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", @@ -1165,9 +1156,8 @@ { "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l3_cache_access_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_l3_cache_access_bw_2t" }, { "BriefDescription": "", @@ -1178,9 +1168,8 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / (duration_time * 1e3 / 
1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l3_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw_2t" }, { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", @@ -1209,16 +1198,14 @@ { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", - "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_l2_miss_latency", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_Lat;Offcore", + "MetricName": "tma_info_memory_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", - "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_l2_mlp", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_load_l2_mlp" }, { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", @@ -1229,9 +1216,8 @@ { "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "tma_info_memory_tlb_load_stlb_mpki", - "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_stlb_mpki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_load_stlb_mpki" }, { "BriefDescription": "Un-cacheable retired load per kilo instruction", @@ -1249,16 +1235,14 @@ { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", 
"MetricExpr": "tma_info_memory_tlb_page_walks_utilization", - "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_page_walks_utilization", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_page_walks_utilization" }, { "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "tma_info_memory_tlb_store_stlb_mpki", - "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_store_stlb_mpki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_store_stlb_mpki" }, { "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", @@ -1289,9 +1273,8 @@ { "BriefDescription": "Un-cacheable retired load per kilo instruction", "MetricExpr": "1e3 * MEM_LOAD_MISC_RETIRED.UC / INST_RETIRED.ANY", - "MetricGroup": "Mem;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_uc_load_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_uc_load_pki" }, { "BriefDescription": "", diff --git a/tools/perf/pmu-events/arch/x86/skylakex/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/skylakex/uncore-interconnect.json index 3eece8a728..f32d4d9d28 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/uncore-interconnect.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/uncore-interconnect.json @@ -38,7 +38,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.CLFLUSH", "PerPkg": "1", - "PublicDescription": "Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Counts the number of coherency related operations serviced by the IRP", "UMask": "0x80", "Unit": "IRP" }, @@ -47,7 +47,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.CRD", 
"PerPkg": "1", - "PublicDescription": "Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Counts the number of coherency related operations serviced by the IRP", "UMask": "0x2", "Unit": "IRP" }, @@ -56,7 +56,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.DRD", "PerPkg": "1", - "PublicDescription": "Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Counts the number of coherency related operations serviced by the IRP", "UMask": "0x4", "Unit": "IRP" }, @@ -65,7 +65,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.PCIDCAHINT", "PerPkg": "1", - "PublicDescription": "Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Counts the number of coherency related operations serviced by the IRP", "UMask": "0x20", "Unit": "IRP" }, @@ -74,7 +74,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.PCIRDCUR", "PerPkg": "1", - "PublicDescription": "Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Counts the number of coherency related operations serviced by the IRP", "UMask": "0x1", "Unit": "IRP" }, @@ -101,7 +101,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.WBMTOI", "PerPkg": "1", - "PublicDescription": "Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Counts the number of coherency related operations serviced by the IRP", "UMask": "0x40", "Unit": "IRP" }, @@ -500,7 +500,7 @@ "EventCode": "0x11", "EventName": "UNC_I_TRANSACTIONS.WRITES", "PerPkg": "1", - "PublicDescription": "Counts the number of Inbound transactions from the IRP to the Uncore. This can be filtered based on request type in addition to the source queue. Note the special filtering equation. We do OR-reduction on the request type. If the SOURCE bit is set, then we also do AND qualification based on the source portID.; Trackes only write requests. 
Each write request should have a prefetch, so there is no need to explicitly track these requests. For writes that are tickled and have to retry, the counter will be incremented for each retry.", + "PublicDescription": "Counts the number of Inbound transactions from the IRP to the Uncore. This can be filtered based on request type in addition to the source queue. Note the special filtering equation. We do OR-reduction on the request type. If the SOURCE bit is set, then we also do AND qualification based on the source portID.; Tracks only write requests. Each write request should have a prefetch, so there is no need to explicitly track these requests. For writes that are tickled and have to retry, the counter will be incremented for each retry.", "UMask": "0x2", "Unit": "IRP" }, diff --git a/tools/perf/pmu-events/arch/x86/skylakex/uncore-io.json b/tools/perf/pmu-events/arch/x86/skylakex/uncore-io.json index 2a3a709018..743c91f3d2 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/uncore-io.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/uncore-io.json @@ -34,7 +34,7 @@ "EventCode": "0x1", "EventName": "UNC_IIO_CLOCKTICKS", "PerPkg": "1", - "PublicDescription": "Counts clockticks of the 1GHz trafiic controller clock in the IIO unit.", + "PublicDescription": "Counts clockticks of the 1GHz traffic controller clock in the IIO unit.", "Unit": "IIO" }, { diff --git a/tools/perf/pmu-events/arch/x86/skylakex/virtual-memory.json b/tools/perf/pmu-events/arch/x86/skylakex/virtual-memory.json index f59405877a..73feadaf76 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/virtual-memory.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/virtual-memory.json @@ -205,7 +205,7 @@ "BriefDescription": "Counts 1 per cycle for each PMH that is busy with a page walk for an instruction fetch request. 
EPT page walk duration are excluded in Skylake.", "EventCode": "0x85", "EventName": "ITLB_MISSES.WALK_PENDING", - "PublicDescription": "Counts 1 per cycle for each PMH (Page Miss Handler) that is busy with a page walk for an instruction fetch request. EPT page walk duration are excluded in Skylake michroarchitecture.", + "PublicDescription": "Counts 1 per cycle for each PMH (Page Miss Handler) that is busy with a page walk for an instruction fetch request. EPT page walk duration are excluded in Skylake microarchitecture.", "SampleAfterValue": "100003", "UMask": "0x10" }, diff --git a/tools/perf/pmu-events/arch/x86/snowridgex/uncore-cache.json b/tools/perf/pmu-events/arch/x86/snowridgex/uncore-cache.json index a68a5bb05c..4090e4da1b 100644 --- a/tools/perf/pmu-events/arch/x86/snowridgex/uncore-cache.json +++ b/tools/perf/pmu-events/arch/x86/snowridgex/uncore-cache.json @@ -1444,7 +1444,7 @@ "Unit": "CHA" }, { - "BriefDescription": "This event is deprecated. Refer to new event UNC_CHA_LLC_LOOKUP.DATA_READ_LOCAL", + "BriefDescription": "This event is deprecated.", "Deprecated": "1", "EventCode": "0x34", "EventName": "UNC_CHA_LLC_LOOKUP.DMND_READ_LOCAL", @@ -1638,7 +1638,7 @@ "Unit": "CHA" }, { - "BriefDescription": "This event is deprecated. 
Refer to new event UNC_CHA_LLC_LOOKUP.RFO_LOCAL", + "BriefDescription": "This event is deprecated.", "Deprecated": "1", "EventCode": "0x34", "EventName": "UNC_CHA_LLC_LOOKUP.RFO_PREF_LOCAL", diff --git a/tools/perf/pmu-events/arch/x86/snowridgex/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/snowridgex/uncore-interconnect.json index 7e2895f7fe..7cc3635b11 100644 --- a/tools/perf/pmu-events/arch/x86/snowridgex/uncore-interconnect.json +++ b/tools/perf/pmu-events/arch/x86/snowridgex/uncore-interconnect.json @@ -38,7 +38,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.CLFLUSH", "PerPkg": "1", - "PublicDescription": "Coherent Ops : CLFlush : Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Coherent Ops : CLFlush : Counts the number of coherency related operations serviced by the IRP", "UMask": "0x80", "Unit": "IRP" }, @@ -65,7 +65,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.WBMTOI", "PerPkg": "1", - "PublicDescription": "Coherent Ops : WbMtoI : Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Coherent Ops : WbMtoI : Counts the number of coherency related operations serviced by the IRP", "UMask": "0x40", "Unit": "IRP" }, @@ -454,7 +454,7 @@ "EventCode": "0x11", "EventName": "UNC_I_TRANSACTIONS.WRITES", "PerPkg": "1", - "PublicDescription": "Inbound Transaction Count : Writes : Counts the number of Inbound transactions from the IRP to the Uncore. This can be filtered based on request type in addition to the source queue. Note the special filtering equation. We do OR-reduction on the request type. If the SOURCE bit is set, then we also do AND qualification based on the source portID. : Trackes only write requests. Each write request should have a prefetch, so there is no need to explicitly track these requests. 
For writes that are tickled and have to retry, the counter will be incremented for each retry.", + "PublicDescription": "Inbound Transaction Count : Writes : Counts the number of Inbound transactions from the IRP to the Uncore. This can be filtered based on request type in addition to the source queue. Note the special filtering equation. We do OR-reduction on the request type. If the SOURCE bit is set, then we also do AND qualification based on the source portID. : Tracks only write requests. Each write request should have a prefetch, so there is no need to explicitly track these requests. For writes that are tickled and have to retry, the counter will be incremented for each retry.", "UMask": "0x2", "Unit": "IRP" }, diff --git a/tools/perf/pmu-events/arch/x86/snowridgex/uncore-io.json b/tools/perf/pmu-events/arch/x86/snowridgex/uncore-io.json index ecdd6f0f8e..de156e499f 100644 --- a/tools/perf/pmu-events/arch/x86/snowridgex/uncore-io.json +++ b/tools/perf/pmu-events/arch/x86/snowridgex/uncore-io.json @@ -2505,17 +2505,6 @@ "UMask": "0x10", "Unit": "IIO" }, - { - "BriefDescription": "Number requests sent to PCIe from main die : From IRP", - "EventCode": "0xC2", - "EventName": "UNC_IIO_NUM_REQ_FROM_CPU.IRP", - "FCMask": "0x07", - "PerPkg": "1", - "PortMask": "0xFF", - "PublicDescription": "Number requests sent to PCIe from main die : From IRP : Captures Posted/Non-posted allocations from IRP. i.e. 
either non-confined P2P traffic or from the CPU", - "UMask": "0x1", - "Unit": "IIO" - }, { "BriefDescription": "Number requests sent to PCIe from main die : From ITC", "EventCode": "0xC2", diff --git a/tools/perf/scripts/python/parallel-perf.py b/tools/perf/scripts/python/parallel-perf.py new file mode 100755 index 0000000000..21f32ec5ed --- /dev/null +++ b/tools/perf/scripts/python/parallel-perf.py @@ -0,0 +1,988 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# +# Run a perf script command multiple times in parallel, using perf script +# options --cpu and --time so that each job processes a different chunk +# of the data. +# +# Copyright (c) 2024, Intel Corporation. + +import subprocess +import argparse +import pathlib +import shlex +import time +import copy +import sys +import os +import re + +glb_prog_name = "parallel-perf.py" +glb_min_interval = 10.0 +glb_min_samples = 64 + +class Verbosity(): + + def __init__(self, quiet=False, verbose=False, debug=False): + self.normal = True + self.verbose = verbose + self.debug = debug + self.self_test = True + if self.debug: + self.verbose = True + if self.verbose: + quiet = False + if quiet: + self.normal = False + +# Manage work (Start/Wait/Kill), as represented by a subprocess.Popen command +class Work(): + + def __init__(self, cmd, pipe_to, output_dir="."): + self.popen = None + self.consumer = None + self.cmd = cmd + self.pipe_to = pipe_to + self.output_dir = output_dir + self.cmdout_name = f"{output_dir}/cmd.txt" + self.stdout_name = f"{output_dir}/out.txt" + self.stderr_name = f"{output_dir}/err.txt" + + def Command(self): + sh_cmd = [ shlex.quote(x) for x in self.cmd ] + return " ".join(self.cmd) + + def Stdout(self): + return open(self.stdout_name, "w") + + def Stderr(self): + return open(self.stderr_name, "w") + + def CreateOutputDir(self): + pathlib.Path(self.output_dir).mkdir(parents=True, exist_ok=True) + + def Start(self): + if self.popen: + return + self.CreateOutputDir() + with 
open(self.cmdout_name, "w") as f: + f.write(self.Command()) + f.write("\n") + stdout = self.Stdout() + stderr = self.Stderr() + if self.pipe_to: + self.popen = subprocess.Popen(self.cmd, stdout=subprocess.PIPE, stderr=stderr) + args = shlex.split(self.pipe_to) + self.consumer = subprocess.Popen(args, stdin=self.popen.stdout, stdout=stdout, stderr=stderr) + else: + self.popen = subprocess.Popen(self.cmd, stdout=stdout, stderr=stderr) + + def RemoveEmptyErrFile(self): + if os.path.exists(self.stderr_name): + if os.path.getsize(self.stderr_name) == 0: + os.unlink(self.stderr_name) + + def Errors(self): + if os.path.exists(self.stderr_name): + if os.path.getsize(self.stderr_name) != 0: + return [ f"Non-empty error file {self.stderr_name}" ] + return [] + + def TidyUp(self): + self.RemoveEmptyErrFile() + + def RawPollWait(self, p, wait): + if wait: + return p.wait() + return p.poll() + + def Poll(self, wait=False): + if not self.popen: + return None + result = self.RawPollWait(self.popen, wait) + if self.consumer: + res = result + result = self.RawPollWait(self.consumer, wait) + if result != None and res == None: + self.popen.kill() + result = None + elif result == 0 and res != None and res != 0: + result = res + if result != None: + self.TidyUp() + return result + + def Wait(self): + return self.Poll(wait=True) + + def Kill(self): + if not self.popen: + return + self.popen.kill() + if self.consumer: + self.consumer.kill() + +def KillWork(worklist, verbosity): + for w in worklist: + w.Kill() + for w in worklist: + w.Wait() + +def NumberOfCPUs(): + return os.sysconf("SC_NPROCESSORS_ONLN") + +def NanoSecsToSecsStr(x): + if x == None: + return "" + x = str(x) + if len(x) < 10: + x = "0" * (10 - len(x)) + x + return x[:len(x) - 9] + "." 
+ x[-9:] + +def InsertOptionAfter(cmd, option, after): + try: + pos = cmd.index(after) + cmd.insert(pos + 1, option) + except: + cmd.append(option) + +def CreateWorkList(cmd, pipe_to, output_dir, cpus, time_ranges_by_cpu): + max_len = len(str(cpus[-1])) + cpu_dir_fmt = f"cpu-%.{max_len}u" + worklist = [] + pos = 0 + for cpu in cpus: + if cpu >= 0: + cpu_dir = os.path.join(output_dir, cpu_dir_fmt % cpu) + cpu_option = f"--cpu={cpu}" + else: + cpu_dir = output_dir + cpu_option = None + + tr_dir_fmt = "time-range" + + if len(time_ranges_by_cpu) > 1: + time_ranges = time_ranges_by_cpu[pos] + tr_dir_fmt += f"-{pos}" + pos += 1 + else: + time_ranges = time_ranges_by_cpu[0] + + max_len = len(str(len(time_ranges))) + tr_dir_fmt += f"-%.{max_len}u" + + i = 0 + for r in time_ranges: + if r == [None, None]: + time_option = None + work_output_dir = cpu_dir + else: + time_option = "--time=" + NanoSecsToSecsStr(r[0]) + "," + NanoSecsToSecsStr(r[1]) + work_output_dir = os.path.join(cpu_dir, tr_dir_fmt % i) + i += 1 + work_cmd = list(cmd) + if time_option != None: + InsertOptionAfter(work_cmd, time_option, "script") + if cpu_option != None: + InsertOptionAfter(work_cmd, cpu_option, "script") + w = Work(work_cmd, pipe_to, work_output_dir) + worklist.append(w) + return worklist + +def DoRunWork(worklist, nr_jobs, verbosity): + nr_to_do = len(worklist) + not_started = list(worklist) + running = [] + done = [] + chg = False + while True: + nr_done = len(done) + if chg and verbosity.normal: + nr_run = len(running) + print(f"\rThere are {nr_to_do} jobs: {nr_done} completed, {nr_run} running", flush=True, end=" ") + if verbosity.verbose: + print() + chg = False + if nr_done == nr_to_do: + break + while len(running) < nr_jobs and len(not_started): + w = not_started.pop(0) + running.append(w) + if verbosity.verbose: + print("Starting:", w.Command()) + w.Start() + chg = True + if len(running): + time.sleep(0.1) + finished = [] + not_finished = [] + while len(running): + w = running.pop(0) + 
r = w.Poll() + if r == None: + not_finished.append(w) + continue + if r == 0: + if verbosity.verbose: + print("Finished:", w.Command()) + finished.append(w) + chg = True + continue + if verbosity.normal and not verbosity.verbose: + print() + print("Job failed!\n return code:", r, "\n command: ", w.Command()) + if w.pipe_to: + print(" piped to: ", w.pipe_to) + print("Killing outstanding jobs") + KillWork(not_finished, verbosity) + KillWork(running, verbosity) + return False + running = not_finished + done += finished + errorlist = [] + for w in worklist: + errorlist += w.Errors() + if len(errorlist): + print("Errors:") + for e in errorlist: + print(e) + elif verbosity.normal: + print("\r"," "*50, "\rAll jobs finished successfully", flush=True) + return True + +def RunWork(worklist, nr_jobs=NumberOfCPUs(), verbosity=Verbosity()): + try: + return DoRunWork(worklist, nr_jobs, verbosity) + except: + for w in worklist: + w.Kill() + raise + return True + +def ReadHeader(perf, file_name): + return subprocess.Popen([perf, "script", "--header-only", "--input", file_name], stdout=subprocess.PIPE).stdout.read().decode("utf-8") + +def ParseHeader(hdr): + result = {} + lines = hdr.split("\n") + for line in lines: + if ":" in line and line[0] == "#": + pos = line.index(":") + name = line[1:pos-1].strip() + value = line[pos+1:].strip() + if name in result: + orig_name = name + nr = 2 + while True: + name = f"{orig_name} {nr}" + if name not in result: + break + nr += 1 + result[name] = value + return result + +def HeaderField(hdr_dict, hdr_fld): + if hdr_fld not in hdr_dict: + raise Exception(f"'{hdr_fld}' missing from header information") + return hdr_dict[hdr_fld] + +# Represent the position of an option within a command string +# and provide the option value and/or remove the option +class OptPos(): + + def Init(self, opt_element=-1, value_element=-1, opt_pos=-1, value_pos=-1, error=None): + self.opt_element = opt_element # list element that contains option + self.value_element 
= value_element # list element that contains option value + self.opt_pos = opt_pos # string position of option + self.value_pos = value_pos # string position of value + self.error = error # error message string + + def __init__(self, args, short_name, long_name, default=None): + self.args = list(args) + self.default = default + n = 2 + len(long_name) + m = len(short_name) + pos = -1 + for opt in args: + pos += 1 + if m and opt[:2] == f"-{short_name}": + if len(opt) == 2: + if pos + 1 < len(args): + self.Init(pos, pos + 1, 0, 0) + else: + self.Init(error = f"-{short_name} option missing value") + else: + self.Init(pos, pos, 0, 2) + return + if opt[:n] == f"--{long_name}": + if len(opt) == n: + if pos + 1 < len(args): + self.Init(pos, pos + 1, 0, 0) + else: + self.Init(error = f"--{long_name} option missing value") + elif opt[n] == "=": + self.Init(pos, pos, 0, n + 1) + else: + self.Init(error = f"--{long_name} option expected '='") + return + if m and opt[:1] == "-" and opt[:2] != "--" and short_name in opt: + ipos = opt.index(short_name) + if "-" in opt[1:]: + hpos = opt[1:].index("-") + if hpos < ipos: + continue + if ipos + 1 == len(opt): + if pos + 1 < len(args): + self.Init(pos, pos + 1, ipos, 0) + else: + self.Init(error = f"-{short_name} option missing value") + else: + self.Init(pos, pos, ipos, ipos + 1) + return + self.Init() + + def Value(self): + if self.opt_element >= 0: + if self.opt_element != self.value_element: + return self.args[self.value_element] + else: + return self.args[self.value_element][self.value_pos:] + return self.default + + def Remove(self, args): + if self.opt_element == -1: + return + if self.opt_element != self.value_element: + del args[self.value_element] + if self.opt_pos: + args[self.opt_element] = args[self.opt_element][:self.opt_pos] + else: + del args[self.opt_element] + +def DetermineInputFileName(cmd): + p = OptPos(cmd, "i", "input", "perf.data") + if p.error: + raise Exception(f"perf command {p.error}") + file_name = 
p.Value() + if not os.path.exists(file_name): + raise Exception(f"perf command input file '{file_name}' not found") + return file_name + +def ReadOption(args, short_name, long_name, err_prefix, remove=False): + p = OptPos(args, short_name, long_name) + if p.error: + raise Exception(f"{err_prefix}{p.error}") + value = p.Value() + if remove: + p.Remove(args) + return value + +def ExtractOption(args, short_name, long_name, err_prefix): + return ReadOption(args, short_name, long_name, err_prefix, True) + +def ReadPerfOption(args, short_name, long_name): + return ReadOption(args, short_name, long_name, "perf command ") + +def ExtractPerfOption(args, short_name, long_name): + return ExtractOption(args, short_name, long_name, "perf command ") + +def PerfDoubleQuickCommands(cmd, file_name): + cpu_str = ReadPerfOption(cmd, "C", "cpu") + time_str = ReadPerfOption(cmd, "", "time") + # Use double-quick sampling to determine trace data density + times_cmd = ["perf", "script", "--ns", "--input", file_name, "--itrace=qqi"] + if cpu_str != None and cpu_str != "": + times_cmd.append(f"--cpu={cpu_str}") + if time_str != None and time_str != "": + times_cmd.append(f"--time={time_str}") + cnts_cmd = list(times_cmd) + cnts_cmd.append("-Fcpu") + times_cmd.append("-Fcpu,time") + return cnts_cmd, times_cmd + +class CPUTimeRange(): + def __init__(self, cpu): + self.cpu = cpu + self.sample_cnt = 0 + self.time_ranges = None + self.interval = 0 + self.interval_remaining = 0 + self.remaining = 0 + self.tr_pos = 0 + +def CalcTimeRangesByCPU(line, cpu, cpu_time_ranges, max_time): + cpu_time_range = cpu_time_ranges[cpu] + cpu_time_range.remaining -= 1 + cpu_time_range.interval_remaining -= 1 + if cpu_time_range.remaining == 0: + cpu_time_range.time_ranges[cpu_time_range.tr_pos][1] = max_time + return + if cpu_time_range.interval_remaining == 0: + time = TimeVal(line[1][:-1], 0) + time_ranges = cpu_time_range.time_ranges + time_ranges[cpu_time_range.tr_pos][1] = time - 1 + 
time_ranges.append([time, max_time]) + cpu_time_range.tr_pos += 1 + cpu_time_range.interval_remaining = cpu_time_range.interval + +def CountSamplesByCPU(line, cpu, cpu_time_ranges): + try: + cpu_time_ranges[cpu].sample_cnt += 1 + except: + print("exception") + print("cpu", cpu) + print("len(cpu_time_ranges)", len(cpu_time_ranges)) + raise + +def ProcessCommandOutputLines(cmd, per_cpu, fn, *x): + # Assume CPU number is at beginning of line and enclosed by [] + pat = re.compile(r"\s*\[[0-9]+\]") + p = subprocess.Popen(cmd, stdout=subprocess.PIPE) + while True: + if line := p.stdout.readline(): + line = line.decode("utf-8") + if pat.match(line): + line = line.split() + if per_cpu: + # Assumes CPU number is enclosed by [] + cpu = int(line[0][1:-1]) + else: + cpu = 0 + fn(line, cpu, *x) + else: + break + p.wait() + +def IntersectTimeRanges(new_time_ranges, time_ranges): + pos = 0 + new_pos = 0 + # Can assume len(time_ranges) != 0 and len(new_time_ranges) != 0 + # Note also, there *must* be at least one intersection. 
+ while pos < len(time_ranges) and new_pos < len(new_time_ranges): + # new end < old start => no intersection, remove new + if new_time_ranges[new_pos][1] < time_ranges[pos][0]: + del new_time_ranges[new_pos] + continue + # new start > old end => no intersection, check next + if new_time_ranges[new_pos][0] > time_ranges[pos][1]: + pos += 1 + if pos < len(time_ranges): + continue + # no next, so remove remaining + while new_pos < len(new_time_ranges): + del new_time_ranges[new_pos] + return + # Found an intersection + # new start < old start => adjust new start = old start + if new_time_ranges[new_pos][0] < time_ranges[pos][0]: + new_time_ranges[new_pos][0] = time_ranges[pos][0] + # new end > old end => keep the overlap, insert the remainder + if new_time_ranges[new_pos][1] > time_ranges[pos][1]: + r = [ time_ranges[pos][1] + 1, new_time_ranges[new_pos][1] ] + new_time_ranges[new_pos][1] = time_ranges[pos][1] + new_pos += 1 + new_time_ranges.insert(new_pos, r) + continue + # new [start, end] is within old [start, end] + new_pos += 1 + +def SplitTimeRangesByTraceDataDensity(time_ranges, cpus, nr, cmd, file_name, per_cpu, min_size, min_interval, verbosity): + if verbosity.normal: + print("\rAnalyzing...", flush=True, end=" ") + if verbosity.verbose: + print() + cnts_cmd, times_cmd = PerfDoubleQuickCommands(cmd, file_name) + + nr_cpus = cpus[-1] + 1 if per_cpu else 1 + if per_cpu: + nr_cpus = cpus[-1] + 1 + cpu_time_ranges = [ CPUTimeRange(cpu) for cpu in range(nr_cpus) ] + else: + nr_cpus = 1 + cpu_time_ranges = [ CPUTimeRange(-1) ] + + if verbosity.debug: + print("nr_cpus", nr_cpus) + print("cnts_cmd", cnts_cmd) + print("times_cmd", times_cmd) + + # Count the number of "double quick" samples per CPU + ProcessCommandOutputLines(cnts_cmd, per_cpu, CountSamplesByCPU, cpu_time_ranges) + + tot = 0 + mx = 0 + for cpu_time_range in cpu_time_ranges: + cnt = cpu_time_range.sample_cnt + tot += cnt + if cnt > mx: + mx = cnt + if verbosity.debug: + print("cpu:", 
cpu_time_range.cpu, "sample_cnt", cnt) + + if min_size < 1: + min_size = 1 + + if mx < min_size: + # Too little data to be worth splitting + if verbosity.debug: + print("Too little data to split by time") + if nr == 0: + nr = 1 + return [ SplitTimeRangesIntoN(time_ranges, nr, min_interval) ] + + if nr: + divisor = nr + min_size = 1 + else: + divisor = NumberOfCPUs() + + interval = int(round(tot / divisor, 0)) + if interval < min_size: + interval = min_size + + if verbosity.debug: + print("divisor", divisor) + print("min_size", min_size) + print("interval", interval) + + min_time = time_ranges[0][0] + max_time = time_ranges[-1][1] + + for cpu_time_range in cpu_time_ranges: + cnt = cpu_time_range.sample_cnt + if cnt == 0: + cpu_time_range.time_ranges = copy.deepcopy(time_ranges) + continue + # Adjust target interval for CPU to give approximately equal interval sizes + # Determine number of intervals, rounding to nearest integer + n = int(round(cnt / interval, 0)) + if n < 1: + n = 1 + # Determine interval size, rounding up + d, m = divmod(cnt, n) + if m: + d += 1 + cpu_time_range.interval = d + cpu_time_range.interval_remaining = d + cpu_time_range.remaining = cnt + # Init. time ranges for each CPU with the start time + cpu_time_range.time_ranges = [ [min_time, max_time] ] + + # Set time ranges so that the same number of "double quick" samples + # will fall into each time range. 
+ ProcessCommandOutputLines(times_cmd, per_cpu, CalcTimeRangesByCPU, cpu_time_ranges, max_time) + + for cpu_time_range in cpu_time_ranges: + if cpu_time_range.sample_cnt: + IntersectTimeRanges(cpu_time_range.time_ranges, time_ranges) + + return [cpu_time_ranges[cpu].time_ranges for cpu in cpus] + +def SplitSingleTimeRangeIntoN(time_range, n): + if n <= 1: + return [time_range] + start = time_range[0] + end = time_range[1] + duration = int((end - start + 1) / n) + if duration < 1: + return [time_range] + time_ranges = [] + for i in range(n): + time_ranges.append([start, start + duration - 1]) + start += duration + time_ranges[-1][1] = end + return time_ranges + +def TimeRangeDuration(r): + return r[1] - r[0] + 1 + +def TotalDuration(time_ranges): + duration = 0 + for r in time_ranges: + duration += TimeRangeDuration(r) + return duration + +def SplitTimeRangesByInterval(time_ranges, interval): + new_ranges = [] + for r in time_ranges: + duration = TimeRangeDuration(r) + n = duration / interval + n = int(round(n, 0)) + new_ranges += SplitSingleTimeRangeIntoN(r, n) + return new_ranges + +def SplitTimeRangesIntoN(time_ranges, n, min_interval): + if n <= len(time_ranges): + return time_ranges + duration = TotalDuration(time_ranges) + interval = duration / n + if interval < min_interval: + interval = min_interval + return SplitTimeRangesByInterval(time_ranges, interval) + +def RecombineTimeRanges(tr): + new_tr = copy.deepcopy(tr) + n = len(new_tr) + i = 1 + while i < len(new_tr): + # if prev end + 1 == cur start, combine them + if new_tr[i - 1][1] + 1 == new_tr[i][0]: + new_tr[i][0] = new_tr[i - 1][0] + del new_tr[i - 1] + else: + i += 1 + return new_tr + +def OpenTimeRangeEnds(time_ranges, min_time, max_time): + if time_ranges[0][0] <= min_time: + time_ranges[0][0] = None + if time_ranges[-1][1] >= max_time: + time_ranges[-1][1] = None + +def BadTimeStr(time_str): + raise Exception(f"perf command bad time option: '{time_str}'\nCheck also 'time of first sample' and 'time 
of last sample' in perf script --header-only") + +def ValidateTimeRanges(time_ranges, time_str): + n = len(time_ranges) + for i in range(n): + start = time_ranges[i][0] + end = time_ranges[i][1] + if i != 0 and start <= time_ranges[i - 1][1]: + BadTimeStr(time_str) + if start > end: + BadTimeStr(time_str) + +def TimeVal(s, dflt): + s = s.strip() + if s == "": + return dflt + a = s.split(".") + if len(a) > 2: + raise Exception(f"Bad time value'{s}'") + x = int(a[0]) + if x < 0: + raise Exception("Negative time not allowed") + x *= 1000000000 + if len(a) > 1: + x += int((a[1] + "000000000")[:9]) + return x + +def BadCPUStr(cpu_str): + raise Exception(f"perf command bad cpu option: '{cpu_str}'\nCheck also 'nrcpus avail' in perf script --header-only") + +def ParseTimeStr(time_str, min_time, max_time): + if time_str == None or time_str == "": + return [[min_time, max_time]] + time_ranges = [] + for r in time_str.split(): + a = r.split(",") + if len(a) != 2: + BadTimeStr(time_str) + try: + start = TimeVal(a[0], min_time) + end = TimeVal(a[1], max_time) + except: + BadTimeStr(time_str) + time_ranges.append([start, end]) + ValidateTimeRanges(time_ranges, time_str) + return time_ranges + +def ParseCPUStr(cpu_str, nr_cpus): + if cpu_str == None or cpu_str == "": + return [-1] + cpus = [] + for r in cpu_str.split(","): + a = r.split("-") + if len(a) < 1 or len(a) > 2: + BadCPUStr(cpu_str) + try: + start = int(a[0].strip()) + if len(a) > 1: + end = int(a[1].strip()) + else: + end = start + except: + BadCPUStr(cpu_str) + if start < 0 or end < 0 or end < start or end >= nr_cpus: + BadCPUStr(cpu_str) + cpus.extend(range(start, end + 1)) + cpus = list(set(cpus)) # Remove duplicates + cpus.sort() + return cpus + +class ParallelPerf(): + + def __init__(self, a): + for arg_name in vars(a): + setattr(self, arg_name, getattr(a, arg_name)) + self.orig_nr = self.nr + self.orig_cmd = list(self.cmd) + self.perf = self.cmd[0] + if os.path.exists(self.output_dir): + raise Exception(f"Output 
'{self.output_dir}' already exists") + if self.jobs < 0 or self.nr < 0 or self.interval < 0: + raise Exception("Bad options (negative values): try -h option for help") + if self.nr != 0 and self.interval != 0: + raise Exception("Cannot specify number of time subdivisions and time interval") + if self.jobs == 0: + self.jobs = NumberOfCPUs() + if self.nr == 0 and self.interval == 0: + if self.per_cpu: + self.nr = 1 + else: + self.nr = self.jobs + + def Init(self): + if self.verbosity.debug: + print("cmd", self.cmd) + self.file_name = DetermineInputFileName(self.cmd) + self.hdr = ReadHeader(self.perf, self.file_name) + self.hdr_dict = ParseHeader(self.hdr) + self.cmd_line = HeaderField(self.hdr_dict, "cmdline") + + def ExtractTimeInfo(self): + self.min_time = TimeVal(HeaderField(self.hdr_dict, "time of first sample"), 0) + self.max_time = TimeVal(HeaderField(self.hdr_dict, "time of last sample"), 0) + self.time_str = ExtractPerfOption(self.cmd, "", "time") + self.time_ranges = ParseTimeStr(self.time_str, self.min_time, self.max_time) + if self.verbosity.debug: + print("time_ranges", self.time_ranges) + + def ExtractCPUInfo(self): + if self.per_cpu: + nr_cpus = int(HeaderField(self.hdr_dict, "nrcpus avail")) + self.cpu_str = ExtractPerfOption(self.cmd, "C", "cpu") + if self.cpu_str == None or self.cpu_str == "": + self.cpus = [ x for x in range(nr_cpus) ] + else: + self.cpus = ParseCPUStr(self.cpu_str, nr_cpus) + else: + self.cpu_str = None + self.cpus = [-1] + if self.verbosity.debug: + print("cpus", self.cpus) + + def IsIntelPT(self): + return self.cmd_line.find("intel_pt") >= 0 + + def SplitTimeRanges(self): + if self.IsIntelPT() and self.interval == 0: + self.split_time_ranges_for_each_cpu = \ + SplitTimeRangesByTraceDataDensity(self.time_ranges, self.cpus, self.orig_nr, + self.orig_cmd, self.file_name, self.per_cpu, + self.min_size, self.min_interval, self.verbosity) + elif self.nr: + self.split_time_ranges_for_each_cpu = [ SplitTimeRangesIntoN(self.time_ranges, 
self.nr, self.min_interval) ] + else: + self.split_time_ranges_for_each_cpu = [ SplitTimeRangesByInterval(self.time_ranges, self.interval) ] + + def CheckTimeRanges(self): + for tr in self.split_time_ranges_for_each_cpu: + # Re-combined time ranges should be the same + new_tr = RecombineTimeRanges(tr) + if new_tr != self.time_ranges: + if self.verbosity.debug: + print("tr", tr) + print("new_tr", new_tr) + raise Exception("Self test failed!") + + def OpenTimeRangeEnds(self): + for time_ranges in self.split_time_ranges_for_each_cpu: + OpenTimeRangeEnds(time_ranges, self.min_time, self.max_time) + + def CreateWorkList(self): + self.worklist = CreateWorkList(self.cmd, self.pipe_to, self.output_dir, self.cpus, self.split_time_ranges_for_each_cpu) + + def PerfDataRecordedPerCPU(self): + if "--per-thread" in self.cmd_line.split(): + return False + return True + + def DefaultToPerCPU(self): + # --no-per-cpu option takes precedence + if self.no_per_cpu: + return False + if not self.PerfDataRecordedPerCPU(): + return False + # Default to per-cpu for Intel PT data that was recorded per-cpu, + # because decoding can be done for each CPU separately. + if self.IsIntelPT(): + return True + return False + + def Config(self): + self.Init() + self.ExtractTimeInfo() + if not self.per_cpu: + self.per_cpu = self.DefaultToPerCPU() + if self.verbosity.debug: + print("per_cpu", self.per_cpu) + self.ExtractCPUInfo() + self.SplitTimeRanges() + if self.verbosity.self_test: + self.CheckTimeRanges() + # Prefer open-ended time range to starting / ending with min_time / max_time resp. 
+ self.OpenTimeRangeEnds() + self.CreateWorkList() + + def Run(self): + if self.dry_run: + print(len(self.worklist),"jobs:") + for w in self.worklist: + print(w.Command()) + return True + result = RunWork(self.worklist, self.jobs, verbosity=self.verbosity) + if self.verbosity.verbose: + print(glb_prog_name, "done") + return result + +def RunParallelPerf(a): + pp = ParallelPerf(a) + pp.Config() + return pp.Run() + +def Main(args): + ap = argparse.ArgumentParser( + prog=glb_prog_name, formatter_class = argparse.RawDescriptionHelpFormatter, + description = +""" +Run a perf script command multiple times in parallel, using perf script options +--cpu and --time so that each job processes a different chunk of the data. +""", + epilog = +""" +Follow the options by '--' and then the perf script command e.g. + + $ perf record -a -- sleep 10 + $ parallel-perf.py --nr=4 -- perf script --ns + All jobs finished successfully + $ tree parallel-perf-output/ + parallel-perf-output/ + ├── time-range-0 + │   ├── cmd.txt + │   └── out.txt + ├── time-range-1 + │   ├── cmd.txt + │   └── out.txt + ├── time-range-2 + │   ├── cmd.txt + │   └── out.txt + └── time-range-3 + ├── cmd.txt + └── out.txt + $ find parallel-perf-output -name cmd.txt | sort | xargs grep -H . + parallel-perf-output/time-range-0/cmd.txt:perf script --time=,9466.504461499 --ns + parallel-perf-output/time-range-1/cmd.txt:perf script --time=9466.504461500,9469.005396999 --ns + parallel-perf-output/time-range-2/cmd.txt:perf script --time=9469.005397000,9471.506332499 --ns + parallel-perf-output/time-range-3/cmd.txt:perf script --time=9471.506332500, --ns + +Any perf script command can be used, including the use of perf script options +--dlfilter and --script, so that the benefit of running parallel jobs +naturally extends to them also. + +If option --pipe-to is used, standard output is first piped through that +command. Beware, if the command fails (e.g. grep with no matches), it will be +considered a fatal error. 
+ +Final standard output is redirected to files named out.txt in separate +subdirectories under the output directory. Similarly, standard error is +written to files named err.txt. In addition, files named cmd.txt contain the +corresponding perf script command. After processing, err.txt files are removed +if they are empty. + +If any job exits with a non-zero exit code, then all jobs are killed and no +more are started. A message is printed if any job results in a non-empty +err.txt file. + +There is a separate output subdirectory for each time range. If the --per-cpu +option is used, these are further grouped under cpu-n subdirectories, e.g. + + $ parallel-perf.py --per-cpu --nr=2 -- perf script --ns --cpu=0,1 + All jobs finished successfully + $ tree parallel-perf-output + parallel-perf-output/ + ├── cpu-0 + │   ├── time-range-0 + │   │   ├── cmd.txt + │   │   └── out.txt + │   └── time-range-1 + │   ├── cmd.txt + │   └── out.txt + └── cpu-1 + ├── time-range-0 + │   ├── cmd.txt + │   └── out.txt + └── time-range-1 + ├── cmd.txt + └── out.txt + $ find parallel-perf-output -name cmd.txt | sort | xargs grep -H . + parallel-perf-output/cpu-0/time-range-0/cmd.txt:perf script --cpu=0 --time=,9469.005396999 --ns + parallel-perf-output/cpu-0/time-range-1/cmd.txt:perf script --cpu=0 --time=9469.005397000, --ns + parallel-perf-output/cpu-1/time-range-0/cmd.txt:perf script --cpu=1 --time=,9469.005396999 --ns + parallel-perf-output/cpu-1/time-range-1/cmd.txt:perf script --cpu=1 --time=9469.005397000, --ns + +Subdivisions of time range, and cpus if the --per-cpu option is used, are +expressed by the --time and --cpu perf script options respectively. If the +supplied perf script command has a --time option, then that time range is +subdivided, otherwise the time range given by 'time of first sample' to +'time of last sample' is used (refer perf script --header-only). Similarly, the +supplied perf script command may provide a --cpu option, and only those CPUs +will be processed. 
+ +To prevent time intervals becoming too small, the --min-interval option can +be used. + +Note there is special handling for processing Intel PT traces. If an interval is +not specified and the perf record command contained the intel_pt event, then the +time range will be subdivided in order to produce subdivisions that contain +approximately the same amount of trace data. That is accomplished by counting +double-quick (--itrace=qqi) samples, and choosing time ranges that encompass +approximately the same number of samples. In that case, time ranges may not be +the same for each CPU processed. For Intel PT, --per-cpu is the default, but +that can be overridden by --no-per-cpu. Note, for Intel PT, double-quick +decoding produces 1 sample for each PSB synchronization packet, which in turn +come after a certain number of bytes output, determined by psb_period (refer +perf Intel PT documentation). The minimum number of double-quick samples that +will define a time range can be set by the --min_size option, which defaults to +64. +""") + ap.add_argument("-o", "--output-dir", default="parallel-perf-output", help="output directory (default 'parallel-perf-output')") + ap.add_argument("-j", "--jobs", type=int, default=0, help="maximum number of jobs to run in parallel at one time (default is the number of CPUs)") + ap.add_argument("-n", "--nr", type=int, default=0, help="number of time subdivisions (default is the number of jobs)") + ap.add_argument("-i", "--interval", type=float, default=0, help="subdivide the time range using this time interval (in seconds e.g. 
0.1 for a tenth of a second)") + ap.add_argument("-c", "--per-cpu", action="store_true", help="process data for each CPU in parallel") + ap.add_argument("-m", "--min-interval", type=float, default=glb_min_interval, help=f"minimum interval (default {glb_min_interval} seconds)") + ap.add_argument("-p", "--pipe-to", help="command to pipe output to (optional)") + ap.add_argument("-N", "--no-per-cpu", action="store_true", help="do not process data for each CPU in parallel") + ap.add_argument("-b", "--min_size", type=int, default=glb_min_samples, help="minimum data size (for Intel PT in PSBs)") + ap.add_argument("-D", "--dry-run", action="store_true", help="do not run any jobs, just show the perf script commands") + ap.add_argument("-q", "--quiet", action="store_true", help="do not print any messages except errors") + ap.add_argument("-v", "--verbose", action="store_true", help="print more messages") + ap.add_argument("-d", "--debug", action="store_true", help="print debugging messages") + cmd_line = list(args) + try: + split_pos = cmd_line.index("--") + cmd = cmd_line[split_pos + 1:] + args = cmd_line[:split_pos] + except: + cmd = None + args = cmd_line + a = ap.parse_args(args=args[1:]) + a.cmd = cmd + a.verbosity = Verbosity(a.quiet, a.verbose, a.debug) + try: + if a.cmd == None: + if len(args) <= 1: + ap.print_help() + return True + raise Exception("Command line must contain '--' before perf command") + return RunParallelPerf(a) + except Exception as e: + print("Fatal error: ", str(e)) + if a.debug: + raise + return False + +if __name__ == "__main__": + if not Main(sys.argv): + sys.exit(1) diff --git a/tools/perf/tests/bitmap.c b/tools/perf/tests/bitmap.c index 0173f5402a..98956e0e07 100644 --- a/tools/perf/tests/bitmap.c +++ b/tools/perf/tests/bitmap.c @@ -11,18 +11,19 @@ static unsigned long *get_bitmap(const char *str, int nbits) { struct perf_cpu_map *map = perf_cpu_map__new(str); - unsigned long *bm = NULL; - int i; + unsigned long *bm; bm = 
bitmap_zalloc(nbits); if (map && bm) { - for (i = 0; i < perf_cpu_map__nr(map); i++) - __set_bit(perf_cpu_map__cpu(map, i).cpu, bm); + int i; + struct perf_cpu cpu; + + perf_cpu_map__for_each_cpu(cpu, i, map) + __set_bit(cpu.cpu, bm); } - if (map) - perf_cpu_map__put(map); + perf_cpu_map__put(map); return bm; } diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index e05b370b1e..c3d84b67ca 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -39,7 +39,10 @@ * making them easier to debug. */ static bool dont_fork; -/* Fork the tests in parallel and then wait for their completion. */ +/* Don't fork the tests in parallel and wait for their completion. */ +static bool sequential = true; +/* Do it in parallel, lacks infrastructure to avoid running tests that clash for resources, + * So leave it as the developers choice to enable while working on the needed infra */ static bool parallel; const char *dso_to_test; const char *test_objdump_path = "objdump"; @@ -307,8 +310,8 @@ static int finish_test(struct child_test *child_test, int width) char buf[512]; ssize_t len; - /* Poll to avoid excessive spinning, timeout set for 1000ms. */ - poll(pfds, ARRAY_SIZE(pfds), /*timeout=*/1000); + /* Poll to avoid excessive spinning, timeout set for 100ms. */ + poll(pfds, ARRAY_SIZE(pfds), /*timeout=*/100); if (!err_done && pfds[0].revents) { errno = 0; len = read(err, buf, sizeof(buf) - 1); @@ -374,7 +377,7 @@ static int start_test(struct test_suite *test, int i, int subi, struct child_tes } (*child)->process.no_exec_cmd = run_test_child; err = start_command(&(*child)->process); - if (err || parallel) + if (err || !sequential) return err; return finish_test(*child, width); } @@ -440,7 +443,7 @@ static int __cmd_test(int argc, const char *argv[], struct intlist *skiplist) int err = start_test(t, curr, -1, &child_tests[child_test_num++], width); if (err) { - /* TODO: if parallel waitpid the already forked children. 
*/ + /* TODO: if !sequential waitpid the already forked children. */ free(child_tests); return err; } @@ -460,7 +463,7 @@ static int __cmd_test(int argc, const char *argv[], struct intlist *skiplist) } } for (i = 0; i < child_test_num; i++) { - if (parallel) { + if (!sequential) { int ret = finish_test(child_tests[i], width); if (ret) @@ -536,8 +539,9 @@ int cmd_test(int argc, const char **argv) "be more verbose (show symbol address, etc)"), OPT_BOOLEAN('F', "dont-fork", &dont_fork, "Do not fork for testcase"), - OPT_BOOLEAN('p', "parallel", ¶llel, - "Run the tests altogether in parallel"), + OPT_BOOLEAN('p', "parallel", ¶llel, "Run the tests in parallel"), + OPT_BOOLEAN('S', "sequential", &sequential, + "Run the tests one after another rather than in parallel"), OPT_STRING('w', "workload", &workload, "work", "workload to run for testing"), OPT_STRING(0, "dso", &dso_to_test, "dso", "dso to test"), OPT_STRING(0, "objdump", &test_objdump_path, "path", @@ -564,6 +568,11 @@ int cmd_test(int argc, const char **argv) if (workload) return run_workload(workload, argc, argv); + if (dont_fork) + sequential = true; + else if (parallel) + sequential = false; + symbol_conf.priv_size = sizeof(int); symbol_conf.try_vmlinux_path = true; diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index 29d2f3ee4e..27c82cfb7e 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -253,9 +253,9 @@ static int read_object_code(u64 addr, size_t len, u8 cpumode, goto out; } dso = map__dso(al.map); - pr_debug("File is: %s\n", dso->long_name); + pr_debug("File is: %s\n", dso__long_name(dso)); - if (dso->symtab_type == DSO_BINARY_TYPE__KALLSYMS && !dso__is_kcore(dso)) { + if (dso__symtab_type(dso) == DSO_BINARY_TYPE__KALLSYMS && !dso__is_kcore(dso)) { pr_debug("Unexpected kernel address - skipping\n"); goto out; } @@ -274,7 +274,7 @@ static int read_object_code(u64 addr, size_t len, u8 cpumode, * modules to manage long jumps. 
Check if the ip offset falls in stubs * sections for kernel modules. And skip module address after text end */ - if (dso->is_kmod && al.addr > dso->text_end) { + if (dso__is_kmod(dso) && al.addr > dso__text_end(dso)) { pr_debug("skipping the module address %#"PRIx64" after text end\n", al.addr); goto out; } @@ -315,7 +315,7 @@ static int read_object_code(u64 addr, size_t len, u8 cpumode, state->done[state->done_cnt++] = map__start(al.map); } - objdump_name = dso->long_name; + objdump_name = dso__long_name(dso); if (dso__needs_decompress(dso)) { if (dso__decompress_kmodule_path(dso, objdump_name, decomp_name, diff --git a/tools/perf/tests/config-fragments/config b/tools/perf/tests/config-fragments/config index c340b3195f..4fca128510 100644 --- a/tools/perf/tests/config-fragments/config +++ b/tools/perf/tests/config-fragments/config @@ -9,3 +9,6 @@ CONFIG_GENERIC_TRACER=y CONFIG_FTRACE=y CONFIG_FTRACE_SYSCALLS=y CONFIG_BRANCH_PROFILE_NONE=y +CONFIG_KPROBES=y +CONFIG_KPROBE_EVENTS=y +CONFIG_UPROBE_EVENTS=y diff --git a/tools/perf/tests/dso-data.c b/tools/perf/tests/dso-data.c index 2d67422c12..5286ae8bd2 100644 --- a/tools/perf/tests/dso-data.c +++ b/tools/perf/tests/dso-data.c @@ -10,6 +10,7 @@ #include #include #include "dso.h" +#include "dsos.h" #include "machine.h" #include "symbol.h" #include "tests.h" @@ -123,9 +124,10 @@ static int test__dso_data(struct test_suite *test __maybe_unused, int subtest __ TEST_ASSERT_VAL("No test file", file); memset(&machine, 0, sizeof(machine)); + dsos__init(&machine.dsos); - dso = dso__new((const char *)file); - + dso = dso__new(file); + TEST_ASSERT_VAL("Failed to add dso", !dsos__add(&machine.dsos, dso)); TEST_ASSERT_VAL("Failed to access to dso", dso__data_fd(dso, &machine) >= 0); @@ -170,6 +172,7 @@ static int test__dso_data(struct test_suite *test __maybe_unused, int subtest __ } dso__put(dso); + dsos__exit(&machine.dsos); unlink(file); return 0; } @@ -199,40 +202,35 @@ static long open_files_cnt(void) return nr - 1; } 
-static struct dso **dsos; - -static int dsos__create(int cnt, int size) +static int dsos__create(int cnt, int size, struct dsos *dsos) { int i; - dsos = malloc(sizeof(*dsos) * cnt); - TEST_ASSERT_VAL("failed to alloc dsos array", dsos); + dsos__init(dsos); for (i = 0; i < cnt; i++) { - char *file; + struct dso *dso; + char *file = test_file(size); - file = test_file(size); TEST_ASSERT_VAL("failed to get dso file", file); - - dsos[i] = dso__new(file); - TEST_ASSERT_VAL("failed to get dso", dsos[i]); + dso = dso__new(file); + TEST_ASSERT_VAL("failed to get dso", dso); + TEST_ASSERT_VAL("failed to add dso", !dsos__add(dsos, dso)); + dso__put(dso); } return 0; } -static void dsos__delete(int cnt) +static void dsos__delete(struct dsos *dsos) { - int i; + for (unsigned int i = 0; i < dsos->cnt; i++) { + struct dso *dso = dsos->dsos[i]; - for (i = 0; i < cnt; i++) { - struct dso *dso = dsos[i]; - - unlink(dso->name); - dso__put(dso); + dso__data_close(dso); + unlink(dso__name(dso)); } - - free(dsos); + dsos__exit(dsos); } static int set_fd_limit(int n) @@ -266,10 +264,10 @@ static int test__dso_data_cache(struct test_suite *test __maybe_unused, int subt /* and this is now our dso open FDs limit */ dso_cnt = limit / 2; TEST_ASSERT_VAL("failed to create dsos\n", - !dsos__create(dso_cnt, TEST_FILE_SIZE)); + !dsos__create(dso_cnt, TEST_FILE_SIZE, &machine.dsos)); for (i = 0; i < (dso_cnt - 1); i++) { - struct dso *dso = dsos[i]; + struct dso *dso = machine.dsos.dsos[i]; /* * Open dsos via dso__data_fd(), it opens the data @@ -289,17 +287,17 @@ static int test__dso_data_cache(struct test_suite *test __maybe_unused, int subt } /* verify the first one is already open */ - TEST_ASSERT_VAL("dsos[0] is not open", dsos[0]->data.fd != -1); + TEST_ASSERT_VAL("dsos[0] is not open", dso__data(machine.dsos.dsos[0])->fd != -1); /* open +1 dso to reach the allowed limit */ - fd = dso__data_fd(dsos[i], &machine); + fd = dso__data_fd(machine.dsos.dsos[i], &machine); TEST_ASSERT_VAL("failed 
to get fd", fd > 0); /* should force the first one to be closed */ - TEST_ASSERT_VAL("failed to close dsos[0]", dsos[0]->data.fd == -1); + TEST_ASSERT_VAL("failed to close dsos[0]", dso__data(machine.dsos.dsos[0])->fd == -1); /* cleanup everything */ - dsos__delete(dso_cnt); + dsos__delete(&machine.dsos); /* Make sure we did not leak any file descriptor. */ nr_end = open_files_cnt(); @@ -324,9 +322,9 @@ static int test__dso_data_reopen(struct test_suite *test __maybe_unused, int sub long nr_end, nr = open_files_cnt(), lim = new_limit(3); int fd, fd_extra; -#define dso_0 (dsos[0]) -#define dso_1 (dsos[1]) -#define dso_2 (dsos[2]) +#define dso_0 (machine.dsos.dsos[0]) +#define dso_1 (machine.dsos.dsos[1]) +#define dso_2 (machine.dsos.dsos[2]) /* Rest the internal dso open counter limit. */ reset_fd_limit(); @@ -346,7 +344,8 @@ static int test__dso_data_reopen(struct test_suite *test __maybe_unused, int sub TEST_ASSERT_VAL("failed to set file limit", !set_fd_limit((lim))); - TEST_ASSERT_VAL("failed to create dsos\n", !dsos__create(3, TEST_FILE_SIZE)); + TEST_ASSERT_VAL("failed to create dsos\n", + !dsos__create(3, TEST_FILE_SIZE, &machine.dsos)); /* open dso_0 */ fd = dso__data_fd(dso_0, &machine); @@ -371,7 +370,7 @@ static int test__dso_data_reopen(struct test_suite *test __maybe_unused, int sub * dso_0 should get closed, because we reached * the file descriptor limit */ - TEST_ASSERT_VAL("failed to close dso_0", dso_0->data.fd == -1); + TEST_ASSERT_VAL("failed to close dso_0", dso__data(dso_0)->fd == -1); /* open dso_0 */ fd = dso__data_fd(dso_0, &machine); @@ -381,11 +380,11 @@ static int test__dso_data_reopen(struct test_suite *test __maybe_unused, int sub * dso_1 should get closed, because we reached * the file descriptor limit */ - TEST_ASSERT_VAL("failed to close dso_1", dso_1->data.fd == -1); + TEST_ASSERT_VAL("failed to close dso_1", dso__data(dso_1)->fd == -1); /* cleanup everything */ close(fd_extra); - dsos__delete(3); + dsos__delete(&machine.dsos); /* 
Make sure we did not leak any file descriptor. */ nr_end = open_files_cnt(); diff --git a/tools/perf/tests/evsel-roundtrip-name.c b/tools/perf/tests/evsel-roundtrip-name.c index 15ff86f9da..1922cac13a 100644 --- a/tools/perf/tests/evsel-roundtrip-name.c +++ b/tools/perf/tests/evsel-roundtrip-name.c @@ -37,7 +37,7 @@ static int perf_evsel__roundtrip_cache_name_test(void) continue; } evlist__for_each_entry(evlist, evsel) { - if (strcmp(evsel__name(evsel), name)) { + if (!evsel__name_is(evsel, name)) { pr_debug("%s != %s\n", evsel__name(evsel), name); ret = TEST_FAIL; } @@ -71,7 +71,7 @@ static int perf_evsel__name_array_test(const char *const names[], int nr_names) continue; } evlist__for_each_entry(evlist, evsel) { - if (strcmp(evsel__name(evsel), names[i])) { + if (!evsel__name_is(evsel, names[i])) { pr_debug("%s != %s\n", evsel__name(evsel), names[i]); ret = TEST_FAIL; } diff --git a/tools/perf/tests/hists_common.c b/tools/perf/tests/hists_common.c index d08add0f4d..187f12f5bc 100644 --- a/tools/perf/tests/hists_common.c +++ b/tools/perf/tests/hists_common.c @@ -146,7 +146,7 @@ struct machine *setup_fake_machine(struct machines *machines) goto out; } - symbols__insert(&dso->symbols, sym); + symbols__insert(dso__symbols(dso), sym); } dso__put(dso); @@ -183,7 +183,7 @@ void print_hists_in(struct hists *hists) pr_info("%2d: entry: %-8s [%-8s] %20s: period = %"PRIu64"\n", i, thread__comm_str(he->thread), - dso->short_name, + dso__short_name(dso), he->ms.sym->name, he->stat.period); } @@ -212,7 +212,7 @@ void print_hists_out(struct hists *hists) pr_info("%2d: entry: %8s:%5d [%-8s] %20s: period = %"PRIu64"/%"PRIu64"\n", i, thread__comm_str(he->thread), thread__tid(he->thread), - dso->short_name, + dso__short_name(dso), he->ms.sym->name, he->stat.period, he->stat_acc ? 
he->stat_acc->period : 0); } diff --git a/tools/perf/tests/hists_cumulate.c b/tools/perf/tests/hists_cumulate.c index 71dacb0fec..1e0f5a310f 100644 --- a/tools/perf/tests/hists_cumulate.c +++ b/tools/perf/tests/hists_cumulate.c @@ -164,11 +164,11 @@ static void put_fake_samples(void) typedef int (*test_fn_t)(struct evsel *, struct machine *); #define COMM(he) (thread__comm_str(he->thread)) -#define DSO(he) (map__dso(he->ms.map)->short_name) +#define DSO(he) (dso__short_name(map__dso(he->ms.map))) #define SYM(he) (he->ms.sym->name) #define CPU(he) (he->cpu) #define DEPTH(he) (he->callchain->max_depth) -#define CDSO(cl) (map__dso(cl->ms.map)->short_name) +#define CDSO(cl) (dso__short_name(map__dso(cl->ms.map))) #define CSYM(cl) (cl->ms.sym->name) struct result { diff --git a/tools/perf/tests/hists_output.c b/tools/perf/tests/hists_output.c index ba1cccf570..33b5cc8352 100644 --- a/tools/perf/tests/hists_output.c +++ b/tools/perf/tests/hists_output.c @@ -129,7 +129,7 @@ static void put_fake_samples(void) typedef int (*test_fn_t)(struct evsel *, struct machine *); #define COMM(he) (thread__comm_str(he->thread)) -#define DSO(he) (map__dso(he->ms.map)->short_name) +#define DSO(he) (dso__short_name(map__dso(he->ms.map))) #define SYM(he) (he->ms.sym->name) #define CPU(he) (he->cpu) #define PID(he) (thread__tid(he->thread)) diff --git a/tools/perf/tests/maps.c b/tools/perf/tests/maps.c index b15417a0d6..4f1f9385ea 100644 --- a/tools/perf/tests/maps.c +++ b/tools/perf/tests/maps.c @@ -26,7 +26,7 @@ static int check_maps_cb(struct map *map, void *data) if (map__start(map) != merged->start || map__end(map) != merged->end || - strcmp(map__dso(map)->name, merged->name) || + strcmp(dso__name(map__dso(map)), merged->name) || refcount_read(map__refcnt(map)) != 1) { return 1; } @@ -39,7 +39,7 @@ static int failed_cb(struct map *map, void *data __maybe_unused) pr_debug("\tstart: %" PRIu64 " end: %" PRIu64 " name: '%s' refcnt: %d\n", map__start(map), map__end(map), - 
map__dso(map)->name, + dso__name(map__dso(map)), refcount_read(map__refcnt(map))); return 0; diff --git a/tools/perf/tests/mem.c b/tools/perf/tests/mem.c index 56014ec7d4..cb3d749e15 100644 --- a/tools/perf/tests/mem.c +++ b/tools/perf/tests/mem.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "util/map_symbol.h" #include "util/mem-events.h" +#include "util/mem-info.h" #include "util/symbol.h" #include "linux/perf_event.h" #include "util/debug.h" @@ -12,12 +13,14 @@ static int check(union perf_mem_data_src data_src, { char out[100]; char failure[100]; - struct mem_info mi = { .data_src = data_src }; - + struct mem_info *mi = mem_info__new(); int n; - n = perf_mem__snp_scnprintf(out, sizeof out, &mi); - n += perf_mem__lvl_scnprintf(out + n, sizeof out - n, &mi); + TEST_ASSERT_VAL("Memory allocation failed", mi); + *mem_info__data_src(mi) = data_src; + n = perf_mem__snp_scnprintf(out, sizeof out, mi); + n += perf_mem__lvl_scnprintf(out + n, sizeof out - n, mi); + mem_info__put(mi); scnprintf(failure, sizeof failure, "unexpected %s", out); TEST_ASSERT_VAL(failure, !strcmp(string, out)); return 0; diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index feb5727584..edc2adcf1b 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -470,8 +470,7 @@ static int test__checkevent_breakpoint_modifier(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong name", - !strcmp(evsel__name(evsel), "mem:0:u")); + TEST_ASSERT_VAL("wrong name", evsel__name_is(evsel, "mem:0:u")); return test__checkevent_breakpoint(evlist); } @@ -484,8 +483,7 @@ static int test__checkevent_breakpoint_x_modifier(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong 
exclude_hv", evsel->core.attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong name", - !strcmp(evsel__name(evsel), "mem:0:x:k")); + TEST_ASSERT_VAL("wrong name", evsel__name_is(evsel, "mem:0:x:k")); return test__checkevent_breakpoint_x(evlist); } @@ -498,8 +496,7 @@ static int test__checkevent_breakpoint_r_modifier(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong name", - !strcmp(evsel__name(evsel), "mem:0:r:hp")); + TEST_ASSERT_VAL("wrong name", evsel__name_is(evsel, "mem:0:r:hp")); return test__checkevent_breakpoint_r(evlist); } @@ -512,8 +509,7 @@ static int test__checkevent_breakpoint_w_modifier(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong name", - !strcmp(evsel__name(evsel), "mem:0:w:up")); + TEST_ASSERT_VAL("wrong name", evsel__name_is(evsel, "mem:0:w:up")); return test__checkevent_breakpoint_w(evlist); } @@ -526,8 +522,7 @@ static int test__checkevent_breakpoint_rw_modifier(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong name", - !strcmp(evsel__name(evsel), "mem:0:rw:kp")); + TEST_ASSERT_VAL("wrong name", evsel__name_is(evsel, "mem:0:rw:kp")); return test__checkevent_breakpoint_rw(evlist); } @@ -540,8 +535,7 @@ static int test__checkevent_breakpoint_modifier_name(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong 
exclude_hv", evsel->core.attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong name", - !strcmp(evsel__name(evsel), "breakpoint")); + TEST_ASSERT_VAL("wrong name", evsel__name_is(evsel, "breakpoint")); return test__checkevent_breakpoint(evlist); } @@ -554,8 +548,7 @@ static int test__checkevent_breakpoint_x_modifier_name(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong name", - !strcmp(evsel__name(evsel), "breakpoint")); + TEST_ASSERT_VAL("wrong name", evsel__name_is(evsel, "breakpoint")); return test__checkevent_breakpoint_x(evlist); } @@ -568,8 +561,7 @@ static int test__checkevent_breakpoint_r_modifier_name(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong name", - !strcmp(evsel__name(evsel), "breakpoint")); + TEST_ASSERT_VAL("wrong name", evsel__name_is(evsel, "breakpoint")); return test__checkevent_breakpoint_r(evlist); } @@ -582,8 +574,7 @@ static int test__checkevent_breakpoint_w_modifier_name(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong name", - !strcmp(evsel__name(evsel), "breakpoint")); + TEST_ASSERT_VAL("wrong name", evsel__name_is(evsel, "breakpoint")); return test__checkevent_breakpoint_w(evlist); } @@ -596,8 +587,7 @@ static int test__checkevent_breakpoint_rw_modifier_name(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); 
TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong name", - !strcmp(evsel__name(evsel), "breakpoint")); + TEST_ASSERT_VAL("wrong name", evsel__name_is(evsel, "breakpoint")); return test__checkevent_breakpoint_rw(evlist); } @@ -609,12 +599,12 @@ static int test__checkevent_breakpoint_2_events(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_BREAKPOINT == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong name", !strcmp(evsel__name(evsel), "breakpoint1")); + TEST_ASSERT_VAL("wrong name", evsel__name_is(evsel, "breakpoint1")); evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_BREAKPOINT == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong name", !strcmp(evsel__name(evsel), "breakpoint2")); + TEST_ASSERT_VAL("wrong name", evsel__name_is(evsel, "breakpoint2")); return TEST_OK; } @@ -691,15 +681,14 @@ static int test__checkevent_pmu_name(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", test_config(evsel, 1)); - TEST_ASSERT_VAL("wrong name", !strcmp(evsel__name(evsel), "krava")); + TEST_ASSERT_VAL("wrong name", evsel__name_is(evsel, "krava")); /* cpu/config=2/u" */ evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", test_config(evsel, 2)); - TEST_ASSERT_VAL("wrong name", - !strcmp(evsel__name(evsel), "cpu/config=2/u")); + TEST_ASSERT_VAL("wrong name", evsel__name_is(evsel, "cpu/config=2/u")); return TEST_OK; } @@ -953,8 +942,8 @@ static int test__group2(struct evlist *evlist) continue; } if (evsel->core.attr.type == PERF_TYPE_HARDWARE && - test_config(evsel, 
PERF_COUNT_HW_CACHE_REFERENCES)) { - /* cache-references + :u modifier */ + test_config(evsel, PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) { + /* branches + :u modifier */ TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -2043,7 +2032,7 @@ static const struct evlist_test test__events[] = { /* 8 */ }, { - .name = "{faults:k,cache-references}:u,cycles:k", + .name = "{faults:k,branches}:u,cycles:k", .check = test__group2, /* 9 */ }, @@ -2280,6 +2269,13 @@ static const struct evlist_test test__events[] = { .check = test__checkevent_breakpoint_2_events, /* 3 */ }, +#ifdef HAVE_LIBTRACEEVENT + { + .name = "9p:9p_client_req", + .check = test__checkevent_tracepoint, + /* 4 */ + }, +#endif }; static const struct evlist_test test__events_pmu[] = { @@ -2504,7 +2500,8 @@ static int test_event(const struct evlist_test *e) return TEST_FAIL; } parse_events_error__init(&err); - ret = parse_events(evlist, e->name, &err); + ret = __parse_events(evlist, e->name, /*pmu_filter=*/NULL, &err, /*fake_pmu=*/NULL, + /*warn_if_reordered=*/true, /*fake_tp=*/true); if (ret) { pr_debug("failed to parse event '%s', err %d\n", e->name, ret); parse_events_error__print(&err, e->name); @@ -2532,7 +2529,8 @@ static int test_event_fake_pmu(const char *str) parse_events_error__init(&err); ret = __parse_events(evlist, str, /*pmu_filter=*/NULL, &err, - &perf_pmu__fake, /*warn_if_reordered=*/true); + &perf_pmu__fake, /*warn_if_reordered=*/true, + /*fake_tp=*/true); if (ret) { pr_debug("failed to parse event '%s', err %d\n", str, ret); diff --git a/tools/perf/tests/pmu-events.c b/tools/perf/tests/pmu-events.c index 47a7c32775..ff3e7bc0a7 100644 --- a/tools/perf/tests/pmu-events.c +++ b/tools/perf/tests/pmu-events.c @@ -842,7 +842,7 @@ static int check_parse_id(const char *id, struct parse_events_error *error, *cur = '/'; ret = __parse_events(evlist, 
dup, /*pmu_filter=*/NULL, error, fake_pmu, - /*warn_if_reordered=*/true); + /*warn_if_reordered=*/true, /*fake_tp=*/false); free(dup); evlist__delete(evlist); @@ -1105,6 +1105,6 @@ static struct test_case pmu_events_tests[] = { }; struct test_suite suite__pmu_events = { - .desc = "PMU events", + .desc = "PMU JSON event tests", .test_cases = pmu_events_tests, }; diff --git a/tools/perf/tests/pmu.c b/tools/perf/tests/pmu.c index 8f18127d87..06cc0e46cb 100644 --- a/tools/perf/tests/pmu.c +++ b/tools/perf/tests/pmu.c @@ -1,204 +1,353 @@ // SPDX-License-Identifier: GPL-2.0 +#include "evlist.h" +#include "evsel.h" #include "parse-events.h" #include "pmu.h" #include "tests.h" +#include "debug.h" +#include "fncache.h" +#include +#include +#include #include #include #include -#include -#include -#include - -/* Simulated format definitions. */ -static struct test_format { - const char *name; - const char *value; -} test_formats[] = { - { "krava01", "config:0-1,62-63\n", }, - { "krava02", "config:10-17\n", }, - { "krava03", "config:5\n", }, - { "krava11", "config1:0,2,4,6,8,20-28\n", }, - { "krava12", "config1:63\n", }, - { "krava13", "config1:45-47\n", }, - { "krava21", "config2:0-3,10-13,20-23,30-33,40-43,50-53,60-63\n", }, - { "krava22", "config2:8,18,48,58\n", }, - { "krava23", "config2:28-29,38\n", }, -}; +#include +#include +#include +#include -/* Simulated users input. 
*/ -static struct parse_events_term test_terms[] = { - { - .config = "krava01", - .val.num = 15, - .type_val = PARSE_EVENTS__TERM_TYPE_NUM, - .type_term = PARSE_EVENTS__TERM_TYPE_USER, - }, - { - .config = "krava02", - .val.num = 170, - .type_val = PARSE_EVENTS__TERM_TYPE_NUM, - .type_term = PARSE_EVENTS__TERM_TYPE_USER, - }, - { - .config = "krava03", - .val.num = 1, - .type_val = PARSE_EVENTS__TERM_TYPE_NUM, - .type_term = PARSE_EVENTS__TERM_TYPE_USER, - }, - { - .config = "krava11", - .val.num = 27, - .type_val = PARSE_EVENTS__TERM_TYPE_NUM, - .type_term = PARSE_EVENTS__TERM_TYPE_USER, - }, - { - .config = "krava12", - .val.num = 1, - .type_val = PARSE_EVENTS__TERM_TYPE_NUM, - .type_term = PARSE_EVENTS__TERM_TYPE_USER, - }, - { - .config = "krava13", - .val.num = 2, - .type_val = PARSE_EVENTS__TERM_TYPE_NUM, - .type_term = PARSE_EVENTS__TERM_TYPE_USER, - }, - { - .config = "krava21", - .val.num = 119, - .type_val = PARSE_EVENTS__TERM_TYPE_NUM, - .type_term = PARSE_EVENTS__TERM_TYPE_USER, - }, - { - .config = "krava22", - .val.num = 11, - .type_val = PARSE_EVENTS__TERM_TYPE_NUM, - .type_term = PARSE_EVENTS__TERM_TYPE_USER, - }, - { - .config = "krava23", - .val.num = 2, - .type_val = PARSE_EVENTS__TERM_TYPE_NUM, - .type_term = PARSE_EVENTS__TERM_TYPE_USER, - }, -}; +/* Fake PMUs created in temp directory. */ +static LIST_HEAD(test_pmus); + +/* Cleanup test PMU directory. */ +static int test_pmu_put(const char *dir, struct perf_pmu *pmu) +{ + char buf[PATH_MAX + 20]; + int ret; + + if (scnprintf(buf, sizeof(buf), "rm -fr %s", dir) < 0) { + pr_err("Failure to set up buffer for \"%s\"\n", dir); + return -EINVAL; + } + ret = system(buf); + if (ret) + pr_err("Failure to \"%s\"\n", buf); + + list_del(&pmu->list); + perf_pmu__delete(pmu); + return ret; +} /* - * Prepare format directory data, exported by kernel - * at /sys/bus/event_source/devices//format. + * Prepare test PMU directory data, normally exported by kernel at + * /sys/bus/event_source/devices//. 
Give as input a buffer to hold the file + * path, the result is PMU loaded using that directory. */ -static char *test_format_dir_get(char *dir, size_t sz) +static struct perf_pmu *test_pmu_get(char *dir, size_t sz) { - unsigned int i; + /* Simulated format definitions. */ + const struct test_format { + const char *name; + const char *value; + } test_formats[] = { + { "krava01", "config:0-1,62-63\n", }, + { "krava02", "config:10-17\n", }, + { "krava03", "config:5\n", }, + { "krava11", "config1:0,2,4,6,8,20-28\n", }, + { "krava12", "config1:63\n", }, + { "krava13", "config1:45-47\n", }, + { "krava21", "config2:0-3,10-13,20-23,30-33,40-43,50-53,60-63\n", }, + { "krava22", "config2:8,18,48,58\n", }, + { "krava23", "config2:28-29,38\n", }, + }; + const char *test_event = "krava01=15,krava02=170,krava03=1,krava11=27,krava12=1," + "krava13=2,krava21=119,krava22=11,krava23=2\n"; + + char name[PATH_MAX]; + int dirfd, file; + struct perf_pmu *pmu = NULL; + ssize_t len; - snprintf(dir, sz, "/tmp/perf-pmu-test-format-XXXXXX"); - if (!mkdtemp(dir)) + /* Create equivalent of sysfs mount point. */ + scnprintf(dir, sz, "/tmp/perf-pmu-test-XXXXXX"); + if (!mkdtemp(dir)) { + pr_err("mkdtemp failed\n"); + dir[0] = '\0'; return NULL; + } + dirfd = open(dir, O_DIRECTORY); + if (dirfd < 0) { + pr_err("Failed to open test directory \"%s\"\n", dir); + goto err_out; + } - for (i = 0; i < ARRAY_SIZE(test_formats); i++) { - char name[PATH_MAX]; - struct test_format *format = &test_formats[i]; - FILE *file; + /* Create the test PMU directory and give it a perf_event_attr type number. 
*/ + if (mkdirat(dirfd, "perf-pmu-test", 0755) < 0) { + pr_err("Failed to mkdir PMU directory\n"); + goto err_out; + } + file = openat(dirfd, "perf-pmu-test/type", O_WRONLY | O_CREAT, 0600); + if (!file) { + pr_err("Failed to open for writing file \"type\"\n"); + goto err_out; + } + len = strlen("9999"); + if (write(file, "9999\n", len) < len) { + close(file); + pr_err("Failed to write to 'type' file\n"); + goto err_out; + } + close(file); - scnprintf(name, PATH_MAX, "%s/%s", dir, format->name); + /* Create format directory and files. */ + if (mkdirat(dirfd, "perf-pmu-test/format", 0755) < 0) { + pr_err("Failed to mkdir PMU format directory\n)"); + goto err_out; + } + for (size_t i = 0; i < ARRAY_SIZE(test_formats); i++) { + const struct test_format *format = &test_formats[i]; - file = fopen(name, "w"); - if (!file) - return NULL; + if (scnprintf(name, PATH_MAX, "perf-pmu-test/format/%s", format->name) < 0) { + pr_err("Failure to set up path for \"%s\"\n", format->name); + goto err_out; + } + file = openat(dirfd, name, O_WRONLY | O_CREAT, 0600); + if (!file) { + pr_err("Failed to open for writing file \"%s\"\n", name); + goto err_out; + } - if (1 != fwrite(format->value, strlen(format->value), 1, file)) - break; + if (write(file, format->value, strlen(format->value)) < 0) { + pr_err("Failed to write to file \"%s\"\n", name); + close(file); + goto err_out; + } + close(file); + } - fclose(file); + /* Create test event. */ + if (mkdirat(dirfd, "perf-pmu-test/events", 0755) < 0) { + pr_err("Failed to mkdir PMU events directory\n"); + goto err_out; + } + file = openat(dirfd, "perf-pmu-test/events/test-event", O_WRONLY | O_CREAT, 0600); + if (!file) { + pr_err("Failed to open for writing file \"type\"\n"); + goto err_out; + } + len = strlen(test_event); + if (write(file, test_event, len) < len) { + close(file); + pr_err("Failed to write to 'test-event' file\n"); + goto err_out; } + close(file); - return dir; + /* Make the PMU reading the files created above. 
*/ + pmu = perf_pmus__add_test_pmu(dirfd, "perf-pmu-test"); + if (!pmu) + pr_err("Test PMU creation failed\n"); + +err_out: + if (!pmu) + test_pmu_put(dir, pmu); + if (dirfd >= 0) + close(dirfd); + return pmu; } -/* Cleanup format directory. */ -static int test_format_dir_put(char *dir) +static int test__pmu_format(struct test_suite *test __maybe_unused, int subtest __maybe_unused) { - char buf[PATH_MAX + 20]; + char dir[PATH_MAX]; + struct perf_event_attr attr; + struct parse_events_terms terms; + int ret = TEST_FAIL; + struct perf_pmu *pmu = test_pmu_get(dir, sizeof(dir)); - snprintf(buf, sizeof(buf), "rm -f %s/*\n", dir); - if (system(buf)) - return -1; + if (!pmu) + return TEST_FAIL; - snprintf(buf, sizeof(buf), "rmdir %s\n", dir); - return system(buf); + parse_events_terms__init(&terms); + if (parse_events_terms(&terms, + "krava01=15,krava02=170,krava03=1,krava11=27,krava12=1," + "krava13=2,krava21=119,krava22=11,krava23=2", + NULL)) { + pr_err("Term parsing failed\n"); + goto err_out; + } + + memset(&attr, 0, sizeof(attr)); + ret = perf_pmu__config_terms(pmu, &attr, &terms, /*zero=*/false, /*err=*/NULL); + if (ret) { + pr_err("perf_pmu__config_terms failed"); + goto err_out; + } + + if (attr.config != 0xc00000000002a823) { + pr_err("Unexpected config value %llx\n", attr.config); + goto err_out; + } + if (attr.config1 != 0x8000400000000145) { + pr_err("Unexpected config1 value %llx\n", attr.config1); + goto err_out; + } + if (attr.config2 != 0x0400000020041d07) { + pr_err("Unexpected config2 value %llx\n", attr.config2); + goto err_out; + } + + ret = TEST_OK; +err_out: + parse_events_terms__exit(&terms); + test_pmu_put(dir, pmu); + return ret; } -static void add_test_terms(struct parse_events_terms *terms) +static int test__pmu_events(struct test_suite *test __maybe_unused, int subtest __maybe_unused) { - unsigned int i; + char dir[PATH_MAX]; + struct parse_events_error err; + struct evlist *evlist; + struct evsel *evsel; + struct perf_event_attr *attr; + int 
ret = TEST_FAIL; + struct perf_pmu *pmu = test_pmu_get(dir, sizeof(dir)); + const char *event = "perf-pmu-test/test-event/"; - for (i = 0; i < ARRAY_SIZE(test_terms); i++) { - struct parse_events_term *clone; - parse_events_term__clone(&clone, &test_terms[i]); - list_add_tail(&clone->list, &terms->terms); + if (!pmu) + return TEST_FAIL; + + evlist = evlist__new(); + if (evlist == NULL) { + pr_err("Failed allocation"); + goto err_out; + } + parse_events_error__init(&err); + ret = parse_events(evlist, event, &err); + if (ret) { + pr_debug("failed to parse event '%s', err %d\n", event, ret); + parse_events_error__print(&err, event); + if (parse_events_error__contains(&err, "can't access trace events")) + ret = TEST_SKIP; + goto err_out; + } + evsel = evlist__first(evlist); + attr = &evsel->core.attr; + if (attr->config != 0xc00000000002a823) { + pr_err("Unexpected config value %llx\n", attr->config); + goto err_out; + } + if (attr->config1 != 0x8000400000000145) { + pr_err("Unexpected config1 value %llx\n", attr->config1); + goto err_out; + } + if (attr->config2 != 0x0400000020041d07) { + pr_err("Unexpected config2 value %llx\n", attr->config2); + goto err_out; } + + ret = TEST_OK; +err_out: + parse_events_error__exit(&err); + evlist__delete(evlist); + test_pmu_put(dir, pmu); + return ret; } -static int test__pmu(struct test_suite *test __maybe_unused, int subtest __maybe_unused) +static bool permitted_event_name(const char *name) { - char dir[PATH_MAX]; - char *format; - struct parse_events_terms terms; - struct perf_event_attr attr; - struct perf_pmu *pmu; - int fd; - int ret; + bool has_lower = false, has_upper = false; - parse_events_terms__init(&terms); - add_test_terms(&terms); - pmu = zalloc(sizeof(*pmu)); - if (!pmu) { - parse_events_terms__exit(&terms); - return -ENOMEM; - } - - INIT_LIST_HEAD(&pmu->format); - INIT_LIST_HEAD(&pmu->aliases); - INIT_LIST_HEAD(&pmu->caps); - format = test_format_dir_get(dir, sizeof(dir)); - if (!format) { - free(pmu); - 
parse_events_terms__exit(&terms); - return -EINVAL; + for (size_t i = 0; i < strlen(name); i++) { + char c = name[i]; + + if (islower(c)) { + if (has_upper) + return false; + has_lower = true; + continue; + } + if (isupper(c)) { + if (has_lower) + return false; + has_upper = true; + continue; + } + if (!isdigit(c) && c != '.' && c != '_' && c != '-') + return false; } + return true; +} - memset(&attr, 0, sizeof(attr)); +static int test__pmu_event_names(struct test_suite *test __maybe_unused, + int subtest __maybe_unused) +{ + char path[PATH_MAX]; + DIR *pmu_dir, *event_dir; + struct dirent *pmu_dent, *event_dent; + const char *sysfs = sysfs__mountpoint(); + int ret = TEST_OK; - fd = open(format, O_DIRECTORY); - if (fd < 0) { - ret = fd; - goto out; + if (!sysfs) { + pr_err("Sysfs not mounted\n"); + return TEST_FAIL; } - pmu->name = strdup("perf-pmu-test"); - ret = perf_pmu__format_parse(pmu, fd, /*eager_load=*/true); - if (ret) - goto out; + snprintf(path, sizeof(path), "%s/bus/event_source/devices/", sysfs); + pmu_dir = opendir(path); + if (!pmu_dir) { + pr_err("Error opening \"%s\"\n", path); + return TEST_FAIL; + } + while ((pmu_dent = readdir(pmu_dir))) { + if (!strcmp(pmu_dent->d_name, ".") || + !strcmp(pmu_dent->d_name, "..")) + continue; - ret = perf_pmu__config_terms(pmu, &attr, &terms, /*zero=*/false, /*err=*/NULL); - if (ret) - goto out; - - ret = -EINVAL; - if (attr.config != 0xc00000000002a823) - goto out; - if (attr.config1 != 0x8000400000000145) - goto out; - if (attr.config2 != 0x0400000020041d07) - goto out; - - ret = 0; -out: - test_format_dir_put(format); - perf_pmu__delete(pmu); - parse_events_terms__exit(&terms); + snprintf(path, sizeof(path), "%s/bus/event_source/devices/%s/type", + sysfs, pmu_dent->d_name); + + /* Does it look like a PMU? */ + if (!file_available(path)) + continue; + + /* Process events. 
*/ + snprintf(path, sizeof(path), "%s/bus/event_source/devices/%s/events", + sysfs, pmu_dent->d_name); + + event_dir = opendir(path); + if (!event_dir) { + pr_debug("Skipping as no event directory \"%s\"\n", path); + continue; + } + while ((event_dent = readdir(event_dir))) { + const char *event_name = event_dent->d_name; + + if (!strcmp(event_name, ".") || !strcmp(event_name, "..")) + continue; + + if (!permitted_event_name(event_name)) { + pr_err("Invalid sysfs event name: %s/%s\n", + pmu_dent->d_name, event_name); + ret = TEST_FAIL; + } + } + closedir(event_dir); + } + closedir(pmu_dir); return ret; } -DEFINE_SUITE("Parse perf pmu format", pmu); +static struct test_case tests__pmu[] = { + TEST_CASE("Parsing with PMU format directory", pmu_format), + TEST_CASE("Parsing with PMU event", pmu_events), + TEST_CASE("PMU event names", pmu_event_names), + { .name = NULL, } +}; + +struct test_suite suite__pmu = { + .desc = "Sysfs PMU tests", + .test_cases = tests__pmu, +}; diff --git a/tools/perf/tests/shell/annotate.sh b/tools/perf/tests/shell/annotate.sh new file mode 100755 index 0000000000..1db1e8113d --- /dev/null +++ b/tools/perf/tests/shell/annotate.sh @@ -0,0 +1,83 @@ +#!/bin/sh +# perf annotate basic tests +# SPDX-License-Identifier: GPL-2.0 + +set -e + +shelldir=$(dirname "$0") + +# shellcheck source=lib/perf_has_symbol.sh +. "${shelldir}"/lib/perf_has_symbol.sh + +testsym="noploop" + +skip_test_missing_symbol ${testsym} + +err=0 +perfdata=$(mktemp /tmp/__perf_test.perf.data.XXXXX) +testprog="perf test -w noploop" +# disassembly format: "percent : offset: instruction (operands ...)" +disasm_regex="[0-9]*\.[0-9]* *: *\w*: *\w*" + +cleanup() { + rm -rf "${perfdata}" + rm -rf "${perfdata}".old + + trap - EXIT TERM INT +} + +trap_cleanup() { + cleanup + exit 1 +} +trap trap_cleanup EXIT TERM INT + +test_basic() { + echo "Basic perf annotate test" + if ! 
perf record -o "${perfdata}" ${testprog} 2> /dev/null + then + echo "Basic annotate [Failed: perf record]" + err=1 + return + fi + + # check if it has the target symbol + if ! perf annotate -i "${perfdata}" 2> /dev/null | grep "${testsym}" + then + echo "Basic annotate [Failed: missing target symbol]" + err=1 + return + fi + + # check if it has the disassembly lines + if ! perf annotate -i "${perfdata}" 2> /dev/null | grep "${disasm_regex}" + then + echo "Basic annotate [Failed: missing disasm output from default disassembler]" + err=1 + return + fi + + # check again with a target symbol name + if ! perf annotate -i "${perfdata}" "${testsym}" 2> /dev/null | \ + grep -m 3 "${disasm_regex}" + then + echo "Basic annotate [Failed: missing disasm output when specifying the target symbol]" + err=1 + return + fi + + # check one more with external objdump tool (forced by --objdump option) + if ! perf annotate -i "${perfdata}" --objdump=objdump 2> /dev/null | \ + grep -m 3 "${disasm_regex}" + then + echo "Basic annotate [Failed: missing disasm output from non default disassembler (using --objdump)]" + err=1 + return + fi + echo "Basic annotate test [Success]" +} + +test_basic + +cleanup +exit $err diff --git a/tools/perf/tests/shell/base_probe/test_adding_kernel.sh b/tools/perf/tests/shell/base_probe/test_adding_kernel.sh index a5d707efad..63bb8974b3 100755 --- a/tools/perf/tests/shell/base_probe/test_adding_kernel.sh +++ b/tools/perf/tests/shell/base_probe/test_adding_kernel.sh @@ -1,4 +1,5 @@ #!/bin/bash +# Add 'perf probe's, list and remove them # SPDX-License-Identifier: GPL-2.0 # diff --git a/tools/perf/tests/shell/lib/stat_output.sh b/tools/perf/tests/shell/lib/stat_output.sh index c81d6a9f79..9a176ceae4 100644 --- a/tools/perf/tests/shell/lib/stat_output.sh +++ b/tools/perf/tests/shell/lib/stat_output.sh @@ -79,7 +79,7 @@ check_per_thread() echo "[Skip] paranoid and not root" return fi - perf stat --per-thread -a $2 true + perf stat --per-thread -p $$ $2 true 
commachecker --per-thread echo "[Success]" } diff --git a/tools/perf/tests/shell/script.sh b/tools/perf/tests/shell/script.sh index fa4d71e2e7..c1a6036536 100755 --- a/tools/perf/tests/shell/script.sh +++ b/tools/perf/tests/shell/script.sh @@ -17,7 +17,7 @@ cleanup() sane=$(echo "${temp_dir}" | cut -b 1-21) if [ "${sane}" = "/tmp/perf-test-script" ] ; then echo "--- Cleaning up ---" - rm -f "${temp_dir}/"* + rm -rf "${temp_dir:?}/"* rmdir "${temp_dir}" fi } @@ -65,7 +65,31 @@ _end_of_file_ echo "DB test [Success]" } +test_parallel_perf() +{ + echo "parallel-perf test" + if ! python3 --version >/dev/null 2>&1 ; then + echo "SKIP: no python3" + err=2 + return + fi + pp=$(dirname "$0")/../../scripts/python/parallel-perf.py + if [ ! -f "${pp}" ] ; then + echo "SKIP: parallel-perf.py script not found " + err=2 + return + fi + perf_data="${temp_dir}/pp-perf.data" + output1_dir="${temp_dir}/output1" + output2_dir="${temp_dir}/output2" + perf record -o "${perf_data}" --sample-cpu uname + python3 "${pp}" -o "${output1_dir}" --jobs 4 --verbose -- perf script -i "${perf_data}" + python3 "${pp}" -o "${output2_dir}" --jobs 4 --verbose --per-cpu -- perf script -i "${perf_data}" + echo "parallel-perf test [Success]" +} + test_db +test_parallel_perf cleanup diff --git a/tools/perf/tests/shell/stat+json_output.sh b/tools/perf/tests/shell/stat+json_output.sh index 2b9c6212df..6b630d33c3 100755 --- a/tools/perf/tests/shell/stat+json_output.sh +++ b/tools/perf/tests/shell/stat+json_output.sh @@ -105,7 +105,7 @@ check_per_thread() echo "[Skip] paranoia and not root" return fi - perf stat -j --per-thread -a -o "${stat_output}" true + perf stat -j --per-thread -p $$ -o "${stat_output}" true $PYTHON $pythonchecker --per-thread --file "${stat_output}" echo "[Success]" } diff --git a/tools/perf/tests/shell/stat_bpf_counters.sh b/tools/perf/tests/shell/stat_bpf_counters.sh index 2d92098747..61f8149d85 100755 --- a/tools/perf/tests/shell/stat_bpf_counters.sh +++ 
b/tools/perf/tests/shell/stat_bpf_counters.sh @@ -4,21 +4,59 @@ set -e +workload="perf bench sched messaging -g 1 -l 100 -t" + # check whether $2 is within +/- 20% of $1 compare_number() { - first_num=$1 - second_num=$2 - - # upper bound is first_num * 120% - upper=$(expr $first_num + $first_num / 5 ) - # lower bound is first_num * 80% - lower=$(expr $first_num - $first_num / 5 ) - - if [ $second_num -gt $upper ] || [ $second_num -lt $lower ]; then - echo "The difference between $first_num and $second_num are greater than 20%." - exit 1 - fi + first_num=$1 + second_num=$2 + + # upper bound is first_num * 120% + upper=$(expr $first_num + $first_num / 5 ) + # lower bound is first_num * 80% + lower=$(expr $first_num - $first_num / 5 ) + + if [ $second_num -gt $upper ] || [ $second_num -lt $lower ]; then + echo "The difference between $first_num and $second_num are greater than 20%." + exit 1 + fi +} + +check_counts() +{ + base_cycles=$1 + bpf_cycles=$2 + + if [ "$base_cycles" = "&1 | awk '/cycles/ {print $1}') + bpf_cycles=$(perf stat --no-big-num --bpf-counters -e cycles -- $workload 2>&1 | awk '/cycles/ {print $1}') + check_counts $base_cycles $bpf_cycles + compare_number $base_cycles $bpf_cycles + echo "[Success]" +} + +test_bpf_modifier() +{ + printf "Testing bpf event modifier " + stat_output=$(perf stat --no-big-num -e cycles/name=base_cycles/,cycles/name=bpf_cycles/b -- $workload 2>&1) + base_cycles=$(echo "$stat_output"| awk '/base_cycles/ {print $1}') + bpf_cycles=$(echo "$stat_output"| awk '/bpf_cycles/ {print $1}') + check_counts $base_cycles $bpf_cycles + compare_number $base_cycles $bpf_cycles + echo "[Success]" } # skip if --bpf-counters is not supported @@ -30,16 +68,7 @@ if ! 
perf stat -e cycles --bpf-counters true > /dev/null 2>&1; then exit 2 fi -base_cycles=$(perf stat --no-big-num -e cycles -- perf bench sched messaging -g 1 -l 100 -t 2>&1 | awk '/cycles/ {print $1}') -if [ "$base_cycles" = "&1 | awk '/cycles/ {print $1}') -if [ "$bpf_cycles" = " /dev/null & -PID=$! - -echo " + Recording (PID=$PID)..." -sleep 2 -echo " + Stopping perf-record..." +perf record -o "$PERF_DATA" --call-graph fp -e cycles//u --user-callchains -- $TEST_PROGRAM -kill $PID -wait $PID +# Try opening the file so any immediate errors are visible in the log +perf script -i "$PERF_DATA" -F comm,ip,sym | head -n4 -# expected perf-script output: +# expected perf-script output if 'leaf' has been inserted correctly: # -# program +# perf # 728 leaf # 753 parent # 76c leafloop -# ... +# ... remaining stack to main() ... -perf script -i "$PERF_DATA" -F comm,ip,sym | head -n4 -perf script -i "$PERF_DATA" -F comm,ip,sym | head -n4 | \ - awk '{ if ($2 != "") sym[i++] = $2 } END { if (sym[0] != "leaf" || - sym[1] != "parent" || - sym[2] != "leafloop") exit 1 }' +# Each frame is separated by a tab, some spaces and an address +SEP="[[:space:]]+ [[:xdigit:]]+" +perf script -i "$PERF_DATA" -F comm,ip,sym | tr '\n' ' ' | \ + grep -E -q "perf $SEP leaf $SEP parent $SEP leafloop" diff --git a/tools/perf/tests/symbols.c b/tools/perf/tests/symbols.c index d208105919..ee20a366f3 100644 --- a/tools/perf/tests/symbols.c +++ b/tools/perf/tests/symbols.c @@ -81,7 +81,7 @@ static int create_map(struct test_info *ti, char *filename, struct map **map_p) * If 'filename' matches a current kernel module, must use a kernel * map. Find the one that already exists. 
*/ - if (dso && dso->kernel) { + if (dso && dso__kernel(dso) != DSO_SPACE__USER) { *map_p = find_module_map(ti->machine, dso); dso__put(dso); if (!*map_p) { @@ -116,7 +116,7 @@ static int test_dso(struct dso *dso) if (verbose > 1) dso__fprintf(dso, stderr); - for (nd = rb_first_cached(&dso->symbols); nd; nd = rb_next(nd)) { + for (nd = rb_first_cached(dso__symbols(dso)); nd; nd = rb_next(nd)) { struct symbol *sym = rb_entry(nd, struct symbol, rb_node); if (sym->type != STT_FUNC && sym->type != STT_GNU_IFUNC) @@ -145,7 +145,7 @@ static int subdivided_dso_cb(struct dso *dso, struct machine *machine __maybe_un { struct dso *text_dso = d; - if (dso != text_dso && strstarts(dso->short_name, text_dso->short_name)) + if (dso != text_dso && strstarts(dso__short_name(dso), dso__short_name(text_dso))) if (test_dso(dso) != TEST_OK) return -1; @@ -190,7 +190,7 @@ static int test_file(struct test_info *ti, char *filename) ret = test_dso(dso); /* Module dso is split into many dsos by section */ - if (ret == TEST_OK && dso->kernel) + if (ret == TEST_OK && dso__kernel(dso) != DSO_SPACE__USER) ret = process_subdivided_dso(ti->machine, dso); out_put: map__put(map); diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 2a842f53fb..a8cb5ba898 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -68,6 +68,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) }; int i; struct aggr_cpu_id id; + struct perf_cpu cpu; session = perf_session__new(&data, NULL); TEST_ASSERT_VAL("can't get session", !IS_ERR(session)); @@ -113,8 +114,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) TEST_ASSERT_VAL("Session header CPU map not set", session->header.env.cpu); for (i = 0; i < session->header.env.nr_cpus_avail; i++) { - struct perf_cpu cpu = { .cpu = i }; - + cpu.cpu = i; if (!perf_cpu_map__has(map, cpu)) continue; pr_debug("CPU %d, core %d, socket %d\n", i, @@ -123,48 +123,48 @@ static int 
check_cpu_topology(char *path, struct perf_cpu_map *map) } // Test that CPU ID contains socket, die, core and CPU - for (i = 0; i < perf_cpu_map__nr(map); i++) { - id = aggr_cpu_id__cpu(perf_cpu_map__cpu(map, i), NULL); + perf_cpu_map__for_each_cpu(cpu, i, map) { + id = aggr_cpu_id__cpu(cpu, NULL); TEST_ASSERT_VAL("Cpu map - CPU ID doesn't match", - perf_cpu_map__cpu(map, i).cpu == id.cpu.cpu); + cpu.cpu == id.cpu.cpu); TEST_ASSERT_VAL("Cpu map - Core ID doesn't match", - session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].core_id == id.core); + session->header.env.cpu[cpu.cpu].core_id == id.core); TEST_ASSERT_VAL("Cpu map - Socket ID doesn't match", - session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].socket_id == + session->header.env.cpu[cpu.cpu].socket_id == id.socket); TEST_ASSERT_VAL("Cpu map - Die ID doesn't match", - session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].die_id == id.die); + session->header.env.cpu[cpu.cpu].die_id == id.die); TEST_ASSERT_VAL("Cpu map - Node ID is set", id.node == -1); TEST_ASSERT_VAL("Cpu map - Thread IDX is set", id.thread_idx == -1); } // Test that core ID contains socket, die and core - for (i = 0; i < perf_cpu_map__nr(map); i++) { - id = aggr_cpu_id__core(perf_cpu_map__cpu(map, i), NULL); + perf_cpu_map__for_each_cpu(cpu, i, map) { + id = aggr_cpu_id__core(cpu, NULL); TEST_ASSERT_VAL("Core map - Core ID doesn't match", - session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].core_id == id.core); + session->header.env.cpu[cpu.cpu].core_id == id.core); TEST_ASSERT_VAL("Core map - Socket ID doesn't match", - session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].socket_id == + session->header.env.cpu[cpu.cpu].socket_id == id.socket); TEST_ASSERT_VAL("Core map - Die ID doesn't match", - session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].die_id == id.die); + session->header.env.cpu[cpu.cpu].die_id == id.die); TEST_ASSERT_VAL("Core map - Node ID is set", id.node == -1); TEST_ASSERT_VAL("Core map - Thread IDX is 
set", id.thread_idx == -1); } // Test that die ID contains socket and die - for (i = 0; i < perf_cpu_map__nr(map); i++) { - id = aggr_cpu_id__die(perf_cpu_map__cpu(map, i), NULL); + perf_cpu_map__for_each_cpu(cpu, i, map) { + id = aggr_cpu_id__die(cpu, NULL); TEST_ASSERT_VAL("Die map - Socket ID doesn't match", - session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].socket_id == + session->header.env.cpu[cpu.cpu].socket_id == id.socket); TEST_ASSERT_VAL("Die map - Die ID doesn't match", - session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].die_id == id.die); + session->header.env.cpu[cpu.cpu].die_id == id.die); TEST_ASSERT_VAL("Die map - Node ID is set", id.node == -1); TEST_ASSERT_VAL("Die map - Core is set", id.core == -1); @@ -173,10 +173,10 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) } // Test that socket ID contains only socket - for (i = 0; i < perf_cpu_map__nr(map); i++) { - id = aggr_cpu_id__socket(perf_cpu_map__cpu(map, i), NULL); + perf_cpu_map__for_each_cpu(cpu, i, map) { + id = aggr_cpu_id__socket(cpu, NULL); TEST_ASSERT_VAL("Socket map - Socket ID doesn't match", - session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].socket_id == + session->header.env.cpu[cpu.cpu].socket_id == id.socket); TEST_ASSERT_VAL("Socket map - Node ID is set", id.node == -1); @@ -187,10 +187,10 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) } // Test that node ID contains only node - for (i = 0; i < perf_cpu_map__nr(map); i++) { - id = aggr_cpu_id__node(perf_cpu_map__cpu(map, i), NULL); + perf_cpu_map__for_each_cpu(cpu, i, map) { + id = aggr_cpu_id__node(cpu, NULL); TEST_ASSERT_VAL("Node map - Node ID doesn't match", - cpu__get_node(perf_cpu_map__cpu(map, i)) == id.node); + cpu__get_node(cpu) == id.node); TEST_ASSERT_VAL("Node map - Socket is set", id.socket == -1); TEST_ASSERT_VAL("Node map - Die ID is set", id.die == -1); TEST_ASSERT_VAL("Node map - Core is set", id.core == -1); diff --git 
a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c index fecbf851bb..e30fd55f8e 100644 --- a/tools/perf/tests/vmlinux-kallsyms.c +++ b/tools/perf/tests/vmlinux-kallsyms.c @@ -129,7 +129,7 @@ static int test__vmlinux_matches_kallsyms_cb1(struct map *map, void *data) * cases. */ struct map *pair = maps__find_by_name(args->kallsyms.kmaps, - (dso->kernel ? dso->short_name : dso->name)); + (dso__kernel(dso) ? dso__short_name(dso) : dso__name(dso))); if (pair) { map__set_priv(pair, 1); @@ -162,11 +162,11 @@ static int test__vmlinux_matches_kallsyms_cb2(struct map *map, void *data) } pr_info("WARN: %" PRIx64 "-%" PRIx64 " %" PRIx64 " %s in kallsyms as", - map__start(map), map__end(map), map__pgoff(map), dso->name); + map__start(map), map__end(map), map__pgoff(map), dso__name(dso)); if (mem_end != map__end(pair)) pr_info(":\nWARN: *%" PRIx64 "-%" PRIx64 " %" PRIx64, map__start(pair), map__end(pair), map__pgoff(pair)); - pr_info(" %s\n", dso->name); + pr_info(" %s\n", dso__name(dso)); map__set_priv(pair, 1); } map__put(pair); diff --git a/tools/perf/tests/workloads/leafloop.c b/tools/perf/tests/workloads/leafloop.c index 1bf5cc9764..f7561767e3 100644 --- a/tools/perf/tests/workloads/leafloop.c +++ b/tools/perf/tests/workloads/leafloop.c @@ -1,6 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include #include #include +#include #include "../tests.h" /* We want to check these symbols in perf script */ @@ -8,10 +10,16 @@ noinline void leaf(volatile int b); noinline void parent(volatile int b); static volatile int a; +static volatile sig_atomic_t done; + +static void sighandler(int sig __maybe_unused) +{ + done = 1; +} noinline void leaf(volatile int b) { - for (;;) + while (!done) a += b; } @@ -22,12 +30,16 @@ noinline void parent(volatile int b) static int leafloop(int argc, const char **argv) { - int c = 1; + int sec = 1; if (argc > 0) - c = atoi(argv[0]); + sec = atoi(argv[0]); + + signal(SIGINT, sighandler); + signal(SIGALRM, sighandler); + 
alarm(sec); - parent(c); + parent(sec); return 0; } diff --git a/tools/perf/trace/beauty/Build b/tools/perf/trace/beauty/Build index d11ce256f5..cb3c1399ff 100644 --- a/tools/perf/trace/beauty/Build +++ b/tools/perf/trace/beauty/Build @@ -1,6 +1,7 @@ perf-y += clone.o perf-y += fcntl.o perf-y += flock.o +perf-y += fs_at_flags.o perf-y += fsmount.o perf-y += fspick.o ifeq ($(SRCARCH),$(filter $(SRCARCH),x86)) @@ -19,3 +20,17 @@ perf-y += statx.o perf-y += sync_file_range.o perf-y += timespec.o perf-y += tracepoints/ + +ifdef SHELLCHECK + SHELL_TESTS := $(wildcard trace/beauty/*.sh) + TEST_LOGS := $(SHELL_TESTS:trace/beauty/%=%.shellcheck_log) +else + SHELL_TESTS := + TEST_LOGS := +endif + +$(OUTPUT)%.shellcheck_log: % + $(call rule_mkdir) + $(Q)$(call echo-cmd,test)shellcheck -s bash -a -S warning "$<" > $@ || (cat $@ && rm $@ && false) + +perf-y += $(TEST_LOGS) diff --git a/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h b/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h new file mode 100644 index 0000000000..13aea8fc3d --- /dev/null +++ b/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h @@ -0,0 +1,146 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_IRQ_VECTORS_H +#define _ASM_X86_IRQ_VECTORS_H + +#include +/* + * Linux IRQ vector layout. + * + * There are 256 IDT entries (per CPU - each entry is 8 bytes) which can + * be defined by Linux. They are used as a jump table by the CPU when a + * given vector is triggered - by a CPU-external, CPU-internal or + * software-triggered event. + * + * Linux sets the kernel code address each entry jumps to early during + * bootup, and never changes them. This is the general layout of the + * IDT entries: + * + * Vectors 0 ... 31 : system traps and exceptions - hardcoded events + * Vectors 32 ... 127 : device interrupts + * Vector 128 : legacy int80 syscall interface + * Vectors 129 ... LOCAL_TIMER_VECTOR-1 + * Vectors LOCAL_TIMER_VECTOR ... 
255 : special interrupts + * + * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. + * + * This file enumerates the exact layout of them: + */ + +/* This is used as an interrupt vector when programming the APIC. */ +#define NMI_VECTOR 0x02 + +/* + * IDT vectors usable for external interrupt sources start at 0x20. + * (0x80 is the syscall vector, 0x30-0x3f are for ISA) + */ +#define FIRST_EXTERNAL_VECTOR 0x20 + +#define IA32_SYSCALL_VECTOR 0x80 + +/* + * Vectors 0x30-0x3f are used for ISA interrupts. + * round up to the next 16-vector boundary + */ +#define ISA_IRQ_VECTOR(irq) (((FIRST_EXTERNAL_VECTOR + 16) & ~15) + irq) + +/* + * Special IRQ vectors used by the SMP architecture, 0xf0-0xff + * + * some of the following vectors are 'rare', they are merged + * into a single vector (CALL_FUNCTION_VECTOR) to save vector space. + * TLB, reschedule and local APIC vectors are performance-critical. + */ + +#define SPURIOUS_APIC_VECTOR 0xff +/* + * Sanity check + */ +#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F) +# error SPURIOUS_APIC_VECTOR definition error +#endif + +#define ERROR_APIC_VECTOR 0xfe +#define RESCHEDULE_VECTOR 0xfd +#define CALL_FUNCTION_VECTOR 0xfc +#define CALL_FUNCTION_SINGLE_VECTOR 0xfb +#define THERMAL_APIC_VECTOR 0xfa +#define THRESHOLD_APIC_VECTOR 0xf9 +#define REBOOT_VECTOR 0xf8 + +/* + * Generic system vector for platform specific use + */ +#define X86_PLATFORM_IPI_VECTOR 0xf7 + +/* + * IRQ work vector: + */ +#define IRQ_WORK_VECTOR 0xf6 + +/* 0xf5 - unused, was UV_BAU_MESSAGE */ +#define DEFERRED_ERROR_VECTOR 0xf4 + +/* Vector on which hypervisor callbacks will be delivered */ +#define HYPERVISOR_CALLBACK_VECTOR 0xf3 + +/* Vector for KVM to deliver posted interrupt IPI */ +#define POSTED_INTR_VECTOR 0xf2 +#define POSTED_INTR_WAKEUP_VECTOR 0xf1 +#define POSTED_INTR_NESTED_VECTOR 0xf0 + +#define MANAGED_IRQ_SHUTDOWN_VECTOR 0xef + +#if IS_ENABLED(CONFIG_HYPERV) +#define HYPERV_REENLIGHTENMENT_VECTOR 0xee +#define HYPERV_STIMER0_VECTOR 
0xed +#endif + +#define LOCAL_TIMER_VECTOR 0xec + +/* + * Posted interrupt notification vector for all device MSIs delivered to + * the host kernel. + */ +#define POSTED_MSI_NOTIFICATION_VECTOR 0xeb + +#define NR_VECTORS 256 + +#ifdef CONFIG_X86_LOCAL_APIC +#define FIRST_SYSTEM_VECTOR POSTED_MSI_NOTIFICATION_VECTOR +#else +#define FIRST_SYSTEM_VECTOR NR_VECTORS +#endif + +#define NR_EXTERNAL_VECTORS (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) +#define NR_SYSTEM_VECTORS (NR_VECTORS - FIRST_SYSTEM_VECTOR) + +/* + * Size the maximum number of interrupts. + * + * If the irq_desc[] array has a sparse layout, we can size things + * generously - it scales up linearly with the maximum number of CPUs, + * and the maximum number of IO-APICs, whichever is higher. + * + * In other cases we size more conservatively, to not create too large + * static arrays. + */ + +#define NR_IRQS_LEGACY 16 + +#define CPU_VECTOR_LIMIT (64 * NR_CPUS) +#define IO_APIC_VECTOR_LIMIT (32 * MAX_IO_APICS) + +#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_PCI_MSI) +#define NR_IRQS \ + (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? 
\ + (NR_VECTORS + CPU_VECTOR_LIMIT) : \ + (NR_VECTORS + IO_APIC_VECTOR_LIMIT)) +#elif defined(CONFIG_X86_IO_APIC) +#define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT) +#elif defined(CONFIG_PCI_MSI) +#define NR_IRQS (NR_VECTORS + CPU_VECTOR_LIMIT) +#else +#define NR_IRQS NR_IRQS_LEGACY +#endif + +#endif /* _ASM_X86_IRQ_VECTORS_H */ diff --git a/tools/perf/trace/beauty/arch/x86/include/uapi/asm/prctl.h b/tools/perf/trace/beauty/arch/x86/include/uapi/asm/prctl.h new file mode 100644 index 0000000000..384e2cc6ac --- /dev/null +++ b/tools/perf/trace/beauty/arch/x86/include/uapi/asm/prctl.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _ASM_X86_PRCTL_H +#define _ASM_X86_PRCTL_H + +#define ARCH_SET_GS 0x1001 +#define ARCH_SET_FS 0x1002 +#define ARCH_GET_FS 0x1003 +#define ARCH_GET_GS 0x1004 + +#define ARCH_GET_CPUID 0x1011 +#define ARCH_SET_CPUID 0x1012 + +#define ARCH_GET_XCOMP_SUPP 0x1021 +#define ARCH_GET_XCOMP_PERM 0x1022 +#define ARCH_REQ_XCOMP_PERM 0x1023 +#define ARCH_GET_XCOMP_GUEST_PERM 0x1024 +#define ARCH_REQ_XCOMP_GUEST_PERM 0x1025 + +#define ARCH_XCOMP_TILECFG 17 +#define ARCH_XCOMP_TILEDATA 18 + +#define ARCH_MAP_VDSO_X32 0x2001 +#define ARCH_MAP_VDSO_32 0x2002 +#define ARCH_MAP_VDSO_64 0x2003 + +/* Don't use 0x3001-0x3004 because of old glibcs */ + +#define ARCH_GET_UNTAG_MASK 0x4001 +#define ARCH_ENABLE_TAGGED_ADDR 0x4002 +#define ARCH_GET_MAX_TAG_BITS 0x4003 +#define ARCH_FORCE_TAGGED_SVA 0x4004 + +#define ARCH_SHSTK_ENABLE 0x5001 +#define ARCH_SHSTK_DISABLE 0x5002 +#define ARCH_SHSTK_LOCK 0x5003 +#define ARCH_SHSTK_UNLOCK 0x5004 +#define ARCH_SHSTK_STATUS 0x5005 + +/* ARCH_SHSTK_ features bits */ +#define ARCH_SHSTK_SHSTK (1ULL << 0) +#define ARCH_SHSTK_WRSS (1ULL << 1) + +#endif /* _ASM_X86_PRCTL_H */ diff --git a/tools/perf/trace/beauty/arch_errno_names.sh b/tools/perf/trace/beauty/arch_errno_names.sh index 7df4bf5b55..30d3889b29 100755 --- a/tools/perf/trace/beauty/arch_errno_names.sh +++ 
b/tools/perf/trace/beauty/arch_errno_names.sh @@ -60,10 +60,12 @@ create_arch_errno_table_func() printf 'arch_syscalls__strerrno_t *arch_syscalls__strerrno_function(const char *arch)\n' printf '{\n' for arch in $archlist; do - printf '\tif (!strcmp(arch, "%s"))\n' $(arch_string "$arch") - printf '\t\treturn errno_to_name__%s;\n' $(arch_string "$arch") + arch_str=$(arch_string "$arch") + printf '\tif (!strcmp(arch, "%s"))\n' "$arch_str" + printf '\t\treturn errno_to_name__%s;\n' "$arch_str" done - printf '\treturn errno_to_name__%s;\n' $(arch_string "$default") + arch_str=$(arch_string "$default") + printf '\treturn errno_to_name__%s;\n' "$arch_str" printf '}\n' } diff --git a/tools/perf/trace/beauty/beauty.h b/tools/perf/trace/beauty/beauty.h index 9feb794f5c..78d10d92d3 100644 --- a/tools/perf/trace/beauty/beauty.h +++ b/tools/perf/trace/beauty/beauty.h @@ -234,8 +234,11 @@ size_t syscall_arg__scnprintf_socket_protocol(char *bf, size_t size, struct sysc size_t syscall_arg__scnprintf_socket_level(char *bf, size_t size, struct syscall_arg *arg); #define SCA_SK_LEVEL syscall_arg__scnprintf_socket_level -size_t syscall_arg__scnprintf_statx_flags(char *bf, size_t size, struct syscall_arg *arg); -#define SCA_STATX_FLAGS syscall_arg__scnprintf_statx_flags +size_t syscall_arg__scnprintf_fs_at_flags(char *bf, size_t size, struct syscall_arg *arg); +#define SCA_FS_AT_FLAGS syscall_arg__scnprintf_fs_at_flags + +size_t syscall_arg__scnprintf_faccessat2_flags(char *bf, size_t size, struct syscall_arg *arg); +#define SCA_FACCESSAT2_FLAGS syscall_arg__scnprintf_faccessat2_flags size_t syscall_arg__scnprintf_statx_mask(char *bf, size_t size, struct syscall_arg *arg); #define SCA_STATX_MASK syscall_arg__scnprintf_statx_mask diff --git a/tools/perf/trace/beauty/clone.c b/tools/perf/trace/beauty/clone.c index f4db894e0a..c9fa8f7e82 100644 --- a/tools/perf/trace/beauty/clone.c +++ b/tools/perf/trace/beauty/clone.c @@ -7,52 +7,16 @@ #include "trace/beauty/beauty.h" #include +#include 
#include -#include +#include static size_t clone__scnprintf_flags(unsigned long flags, char *bf, size_t size, bool show_prefix) { - const char *prefix = "CLONE_"; - int printed = 0; +#include "trace/beauty/generated/clone_flags_array.c" + static DEFINE_STRARRAY(clone_flags, "CLONE_"); -#define P_FLAG(n) \ - if (flags & CLONE_##n) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \ - flags &= ~CLONE_##n; \ - } - - P_FLAG(VM); - P_FLAG(FS); - P_FLAG(FILES); - P_FLAG(SIGHAND); - P_FLAG(PIDFD); - P_FLAG(PTRACE); - P_FLAG(VFORK); - P_FLAG(PARENT); - P_FLAG(THREAD); - P_FLAG(NEWNS); - P_FLAG(SYSVSEM); - P_FLAG(SETTLS); - P_FLAG(PARENT_SETTID); - P_FLAG(CHILD_CLEARTID); - P_FLAG(DETACHED); - P_FLAG(UNTRACED); - P_FLAG(CHILD_SETTID); - P_FLAG(NEWCGROUP); - P_FLAG(NEWUTS); - P_FLAG(NEWIPC); - P_FLAG(NEWUSER); - P_FLAG(NEWPID); - P_FLAG(NEWNET); - P_FLAG(IO); - P_FLAG(CLEAR_SIGHAND); - P_FLAG(INTO_CGROUP); -#undef P_FLAG - - if (flags) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? 
"|" : "", flags); - - return printed; + return strarray__scnprintf_flags(&strarray__clone_flags, bf, size, show_prefix, flags); } size_t syscall_arg__scnprintf_clone_flags(char *bf, size_t size, struct syscall_arg *arg) diff --git a/tools/perf/trace/beauty/clone.sh b/tools/perf/trace/beauty/clone.sh new file mode 100755 index 0000000000..18b6c0d756 --- /dev/null +++ b/tools/perf/trace/beauty/clone.sh @@ -0,0 +1,17 @@ +#!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 + +if [ $# -ne 1 ] ; then + beauty_uapi_linux_dir=tools/perf/trace/beauty/include/uapi/linux/ +else + beauty_uapi_linux_dir=$1 +fi + +linux_sched=${beauty_uapi_linux_dir}/sched.h + +printf "static const char *clone_flags[] = {\n" +regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+CLONE_([^_]+[[:alnum:]_]+)[[:space:]]+(0x[[:xdigit:]]+)[[:space:]]*.*' +grep -E $regex ${linux_sched} | \ + sed -r "s/$regex/\2 \1/g" | \ + xargs printf "\t[ilog2(%s) + 1] = \"%s\",\n" +printf "};\n" diff --git a/tools/perf/trace/beauty/fcntl.c b/tools/perf/trace/beauty/fcntl.c index 56ef83b3d1..d075904dcc 100644 --- a/tools/perf/trace/beauty/fcntl.c +++ b/tools/perf/trace/beauty/fcntl.c @@ -7,7 +7,7 @@ #include "trace/beauty/beauty.h" #include -#include +#include static size_t fcntl__scnprintf_getfd(unsigned long val, char *bf, size_t size, bool show_prefix) { diff --git a/tools/perf/trace/beauty/flock.c b/tools/perf/trace/beauty/flock.c index c14274edd6..a6514a6f07 100644 --- a/tools/perf/trace/beauty/flock.c +++ b/tools/perf/trace/beauty/flock.c @@ -2,7 +2,7 @@ #include "trace/beauty/beauty.h" #include -#include +#include #ifndef LOCK_MAND #define LOCK_MAND 32 diff --git a/tools/perf/trace/beauty/fs_at_flags.c b/tools/perf/trace/beauty/fs_at_flags.c new file mode 100644 index 0000000000..c200669cb9 --- /dev/null +++ b/tools/perf/trace/beauty/fs_at_flags.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: LGPL-2.1 +/* + * trace/beauty/fs_at_flags.c + * + * Copyright (C) 2017, Red Hat Inc, Arnaldo Carvalho de Melo + */ + +#include 
"trace/beauty/beauty.h" +#include +#include +#include + +/* + * uapi/linux/fcntl.h does not keep a copy in tools headers directory, + * for system with kernel versions before v5.8, need to sync AT_EACCESS macro. + */ +#ifndef AT_EACCESS +#define AT_EACCESS 0x200 +#endif + +#include "trace/beauty/generated/fs_at_flags_array.c" +static DEFINE_STRARRAY(fs_at_flags, "AT_"); + +static size_t fs_at__scnprintf_flags(unsigned long flags, char *bf, size_t size, bool show_prefix) +{ + return strarray__scnprintf_flags(&strarray__fs_at_flags, bf, size, show_prefix, flags); +} + +size_t syscall_arg__scnprintf_fs_at_flags(char *bf, size_t size, struct syscall_arg *arg) +{ + bool show_prefix = arg->show_string_prefix; + int flags = arg->val; + + return fs_at__scnprintf_flags(flags, bf, size, show_prefix); +} + +static size_t faccessat2__scnprintf_flags(unsigned long flags, char *bf, size_t size, bool show_prefix) +{ + int printed = 0; + + // AT_EACCESS is the same as AT_REMOVEDIR, that is in fs_at_flags_array, + // special case it here. + if (flags & AT_EACCESS) { + flags &= ~AT_EACCESS; + printed += scnprintf(bf + printed, size - printed, "%sEACCESS%s", + show_prefix ? strarray__fs_at_flags.prefix : "", flags ? 
"|" : ""); + } + + return strarray__scnprintf_flags(&strarray__fs_at_flags, bf + printed, size - printed, show_prefix, flags); +} + +size_t syscall_arg__scnprintf_faccessat2_flags(char *bf, size_t size, struct syscall_arg *arg) +{ + bool show_prefix = arg->show_string_prefix; + int flags = arg->val; + + return faccessat2__scnprintf_flags(flags, bf, size, show_prefix); +} diff --git a/tools/perf/trace/beauty/fs_at_flags.sh b/tools/perf/trace/beauty/fs_at_flags.sh new file mode 100755 index 0000000000..456f59addf --- /dev/null +++ b/tools/perf/trace/beauty/fs_at_flags.sh @@ -0,0 +1,21 @@ +#!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 + +if [ $# -ne 1 ] ; then + beauty_uapi_linux_dir=tools/perf/trace/beauty/include/uapi/linux/ +else + beauty_uapi_linux_dir=$1 +fi + +linux_fcntl=${beauty_uapi_linux_dir}/fcntl.h + +printf "static const char *fs_at_flags[] = {\n" +regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+AT_([^_]+[[:alnum:]_]+)[[:space:]]+(0x[[:xdigit:]]+)[[:space:]]*.*' +# AT_EACCESS is only meaningful to faccessat, so we will special case it there... 
+# AT_STATX_SYNC_TYPE is not a bit, its a mask of AT_STATX_SYNC_AS_STAT, AT_STATX_FORCE_SYNC and AT_STATX_DONT_SYNC +grep -E $regex ${linux_fcntl} | \ + grep -v AT_EACCESS | \ + grep -v AT_STATX_SYNC_TYPE | \ + sed -r "s/$regex/\2 \1/g" | \ + xargs printf "\t[ilog2(%s) + 1] = \"%s\",\n" +printf "};\n" diff --git a/tools/perf/trace/beauty/fsconfig.sh b/tools/perf/trace/beauty/fsconfig.sh index bc6ef7bb7a..09cee79de0 100755 --- a/tools/perf/trace/beauty/fsconfig.sh +++ b/tools/perf/trace/beauty/fsconfig.sh @@ -2,12 +2,12 @@ # SPDX-License-Identifier: LGPL-2.1 if [ $# -ne 1 ] ; then - linux_header_dir=tools/include/uapi/linux + beauty_uapi_linux_dir=tools/perf/trace/beauty/include/uapi/linux/ else - linux_header_dir=$1 + beauty_uapi_linux_dir=$1 fi -linux_mount=${linux_header_dir}/mount.h +linux_mount=${beauty_uapi_linux_dir}/mount.h printf "static const char *fsconfig_cmds[] = {\n" ms='[[:space:]]*' diff --git a/tools/perf/trace/beauty/fsmount.c b/tools/perf/trace/beauty/fsmount.c index 30c8c082a3..28c2c16fc1 100644 --- a/tools/perf/trace/beauty/fsmount.c +++ b/tools/perf/trace/beauty/fsmount.c @@ -7,7 +7,14 @@ #include "trace/beauty/beauty.h" #include -#include +#include + +#ifndef MOUNT_ATTR__ATIME +#define MOUNT_ATTR__ATIME 0x00000070 /* Setting on how atime should be updated */ +#endif +#ifndef MOUNT_ATTR_RELATIME +#define MOUNT_ATTR_RELATIME 0x00000000 /* - Update atime relative to mtime/ctime. 
*/ +#endif static size_t fsmount__scnprintf_attr_flags(unsigned long flags, char *bf, size_t size, bool show_prefix) { diff --git a/tools/perf/trace/beauty/fsmount.sh b/tools/perf/trace/beauty/fsmount.sh index cba8897a75..6b67a54cde 100755 --- a/tools/perf/trace/beauty/fsmount.sh +++ b/tools/perf/trace/beauty/fsmount.sh @@ -2,12 +2,12 @@ # SPDX-License-Identifier: LGPL-2.1 if [ $# -ne 1 ] ; then - linux_header_dir=tools/include/uapi/linux + beauty_uapi_linux_dir=tools/perf/trace/beauty/include/uapi/linux/ else - linux_header_dir=$1 + beauty_uapi_linux_dir=$1 fi -linux_mount=${linux_header_dir}/mount.h +linux_mount=${beauty_uapi_linux_dir}/mount.h # Remove MOUNT_ATTR_RELATIME as it is zeros, handle it a special way in the beautifier # Only handle MOUNT_ATTR_ followed by a capital letter/num as __ is special case diff --git a/tools/perf/trace/beauty/fspick.sh b/tools/perf/trace/beauty/fspick.sh index 1f088329b9..0d9951c22b 100755 --- a/tools/perf/trace/beauty/fspick.sh +++ b/tools/perf/trace/beauty/fspick.sh @@ -2,12 +2,12 @@ # SPDX-License-Identifier: LGPL-2.1 if [ $# -ne 1 ] ; then - linux_header_dir=tools/include/uapi/linux + beauty_uapi_linux_dir=tools/perf/trace/beauty/include/uapi/linux/ else - linux_header_dir=$1 + beauty_uapi_linux_dir=$1 fi -linux_mount=${linux_header_dir}/mount.h +linux_mount=${beauty_uapi_linux_dir}/mount.h printf "static const char *fspick_flags[] = {\n" regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+FSPICK_([[:alnum:]_]+)[[:space:]]+(0x[[:xdigit:]]+)[[:space:]]*.*' diff --git a/tools/perf/trace/beauty/include/linux/socket.h b/tools/perf/trace/beauty/include/linux/socket.h index 139c330ccf..89d16b9037 100644 --- a/tools/perf/trace/beauty/include/linux/socket.h +++ b/tools/perf/trace/beauty/include/linux/socket.h @@ -16,6 +16,7 @@ struct cred; struct socket; struct sock; struct sk_buff; +struct proto_accept_arg; #define __sockaddr_check_size(size) \ BUILD_BUG_ON(((size) > sizeof(struct __kernel_sockaddr_storage))) @@ -433,7 +434,7 @@ 
extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size, extern int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, struct sockaddr __user *addr, int addr_len); -extern struct file *do_accept(struct file *file, unsigned file_flags, +extern struct file *do_accept(struct file *file, struct proto_accept_arg *arg, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags); extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, diff --git a/tools/perf/trace/beauty/include/uapi/linux/fcntl.h b/tools/perf/trace/beauty/include/uapi/linux/fcntl.h new file mode 100644 index 0000000000..c0bcc185fa --- /dev/null +++ b/tools/perf/trace/beauty/include/uapi/linux/fcntl.h @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_FCNTL_H +#define _UAPI_LINUX_FCNTL_H + +#include +#include + +#define F_SETLEASE (F_LINUX_SPECIFIC_BASE + 0) +#define F_GETLEASE (F_LINUX_SPECIFIC_BASE + 1) + +/* + * Request nofications on a directory. + * See below for events that may be notified. + */ +#define F_NOTIFY (F_LINUX_SPECIFIC_BASE + 2) + +#define F_DUPFD_QUERY (F_LINUX_SPECIFIC_BASE + 3) + +/* + * Cancel a blocking posix lock; internal use only until we expose an + * asynchronous lock api to userspace: + */ +#define F_CANCELLK (F_LINUX_SPECIFIC_BASE + 5) + +/* Create a file descriptor with FD_CLOEXEC set. 
*/ +#define F_DUPFD_CLOEXEC (F_LINUX_SPECIFIC_BASE + 6) + +/* + * Set and get of pipe page size array + */ +#define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7) +#define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8) + +/* + * Set/Get seals + */ +#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) + +/* + * Types of seals + */ +#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +#define F_SEAL_GROW 0x0004 /* prevent file from growing */ +#define F_SEAL_WRITE 0x0008 /* prevent writes */ +#define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */ +#define F_SEAL_EXEC 0x0020 /* prevent chmod modifying exec bits */ +/* (1U << 31) is reserved for signed error codes */ + +/* + * Set/Get write life time hints. {GET,SET}_RW_HINT operate on the + * underlying inode, while {GET,SET}_FILE_RW_HINT operate only on + * the specific file. + */ +#define F_GET_RW_HINT (F_LINUX_SPECIFIC_BASE + 11) +#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12) +#define F_GET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 13) +#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14) + +/* + * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be + * used to clear any hints previously set. + */ +#define RWH_WRITE_LIFE_NOT_SET 0 +#define RWH_WRITE_LIFE_NONE 1 +#define RWH_WRITE_LIFE_SHORT 2 +#define RWH_WRITE_LIFE_MEDIUM 3 +#define RWH_WRITE_LIFE_LONG 4 +#define RWH_WRITE_LIFE_EXTREME 5 + +/* + * The originally introduced spelling is remained from the first + * versions of the patch set that introduced the feature, see commit + * v4.13-rc1~212^2~51. + */ +#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET + +/* + * Types of directory notifications that may be requested. 
+ */ +#define DN_ACCESS 0x00000001 /* File accessed */ +#define DN_MODIFY 0x00000002 /* File modified */ +#define DN_CREATE 0x00000004 /* File created */ +#define DN_DELETE 0x00000008 /* File removed */ +#define DN_RENAME 0x00000010 /* File renamed */ +#define DN_ATTRIB 0x00000020 /* File changed attibutes */ +#define DN_MULTISHOT 0x80000000 /* Don't remove notifier */ + +/* + * The constants AT_REMOVEDIR and AT_EACCESS have the same value. AT_EACCESS is + * meaningful only to faccessat, while AT_REMOVEDIR is meaningful only to + * unlinkat. The two functions do completely different things and therefore, + * the flags can be allowed to overlap. For example, passing AT_REMOVEDIR to + * faccessat would be undefined behavior and thus treating it equivalent to + * AT_EACCESS is valid undefined behavior. + */ +#define AT_FDCWD -100 /* Special value used to indicate + openat should use the current + working directory. */ +#define AT_SYMLINK_NOFOLLOW 0x100 /* Do not follow symbolic links. */ +#define AT_EACCESS 0x200 /* Test access permitted for + effective IDs, not real IDs. */ +#define AT_REMOVEDIR 0x200 /* Remove directory instead of + unlinking file. */ +#define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. */ +#define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount traversal */ +#define AT_EMPTY_PATH 0x1000 /* Allow empty relative pathname */ + +#define AT_STATX_SYNC_TYPE 0x6000 /* Type of synchronisation required from statx() */ +#define AT_STATX_SYNC_AS_STAT 0x0000 /* - Do whatever stat() does */ +#define AT_STATX_FORCE_SYNC 0x2000 /* - Force the attributes to be sync'd with the server */ +#define AT_STATX_DONT_SYNC 0x4000 /* - Don't sync attributes with the server */ + +#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */ + +/* Flags for name_to_handle_at(2). We reuse AT_ flag space to save bits... 
*/ +#define AT_HANDLE_FID AT_REMOVEDIR /* file handle is needed to + compare object identity and may not + be usable to open_by_handle_at(2) */ +#if defined(__KERNEL__) +#define AT_GETATTR_NOSEC 0x80000000 +#endif + +#endif /* _UAPI_LINUX_FCNTL_H */ diff --git a/tools/perf/trace/beauty/include/uapi/linux/fs.h b/tools/perf/trace/beauty/include/uapi/linux/fs.h new file mode 100644 index 0000000000..45e4e64fd6 --- /dev/null +++ b/tools/perf/trace/beauty/include/uapi/linux/fs.h @@ -0,0 +1,396 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_FS_H +#define _UAPI_LINUX_FS_H + +/* + * This file has definitions for some important file table structures + * and constants and structures used by various generic file system + * ioctl's. Please do not make any changes in this file before + * sending patches for review to linux-fsdevel@vger.kernel.org and + * linux-api@vger.kernel.org. + */ + +#include +#include +#include +#ifndef __KERNEL__ +#include +#endif + +/* Use of MS_* flags within the kernel is restricted to core mount(2) code. */ +#if !defined(__KERNEL__) +#include +#endif + +/* + * It's silly to have NR_OPEN bigger than NR_FILE, but you can change + * the file limit at runtime and only root can increase the per-process + * nr_file rlimit, so it's safe to set up a ridiculously high absolute + * upper limit on files-per-process. + * + * Some programs (notably those using select()) may have to be + * recompiled to take full advantage of the new limits.. + */ + +/* Fixed constants first: */ +#undef NR_OPEN +#define INR_OPEN_CUR 1024 /* Initial setting for nfile rlimits */ +#define INR_OPEN_MAX 4096 /* Hard limit for nfile rlimits */ + +#define BLOCK_SIZE_BITS 10 +#define BLOCK_SIZE (1< + +/* + * These are the fs-independent mount-flags: up to 32 flags are supported + * + * Usage of these is restricted within the kernel to core mount(2) code and + * callers of sys_mount() only. 
Filesystems should be using the SB_* + * equivalent instead. + */ +#define MS_RDONLY 1 /* Mount read-only */ +#define MS_NOSUID 2 /* Ignore suid and sgid bits */ +#define MS_NODEV 4 /* Disallow access to device special files */ +#define MS_NOEXEC 8 /* Disallow program execution */ +#define MS_SYNCHRONOUS 16 /* Writes are synced at once */ +#define MS_REMOUNT 32 /* Alter flags of a mounted FS */ +#define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */ +#define MS_DIRSYNC 128 /* Directory modifications are synchronous */ +#define MS_NOSYMFOLLOW 256 /* Do not follow symlinks */ +#define MS_NOATIME 1024 /* Do not update access times. */ +#define MS_NODIRATIME 2048 /* Do not update directory access times */ +#define MS_BIND 4096 +#define MS_MOVE 8192 +#define MS_REC 16384 +#define MS_VERBOSE 32768 /* War is peace. Verbosity is silence. + MS_VERBOSE is deprecated. */ +#define MS_SILENT 32768 +#define MS_POSIXACL (1<<16) /* VFS does not apply the umask */ +#define MS_UNBINDABLE (1<<17) /* change to unbindable */ +#define MS_PRIVATE (1<<18) /* change to private */ +#define MS_SLAVE (1<<19) /* change to slave */ +#define MS_SHARED (1<<20) /* change to shared */ +#define MS_RELATIME (1<<21) /* Update atime relative to mtime/ctime. 
*/ +#define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ +#define MS_I_VERSION (1<<23) /* Update inode I_version field */ +#define MS_STRICTATIME (1<<24) /* Always perform atime updates */ +#define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */ + +/* These sb flags are internal to the kernel */ +#define MS_SUBMOUNT (1<<26) +#define MS_NOREMOTELOCK (1<<27) +#define MS_NOSEC (1<<28) +#define MS_BORN (1<<29) +#define MS_ACTIVE (1<<30) +#define MS_NOUSER (1<<31) + +/* + * Superblock flags that can be altered by MS_REMOUNT + */ +#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION|\ + MS_LAZYTIME) + +/* + * Old magic mount flag and mask + */ +#define MS_MGC_VAL 0xC0ED0000 +#define MS_MGC_MSK 0xffff0000 + +/* + * open_tree() flags. + */ +#define OPEN_TREE_CLONE 1 /* Clone the target tree and attach the clone */ +#define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */ + +/* + * move_mount() flags. + */ +#define MOVE_MOUNT_F_SYMLINKS 0x00000001 /* Follow symlinks on from path */ +#define MOVE_MOUNT_F_AUTOMOUNTS 0x00000002 /* Follow automounts on from path */ +#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */ +#define MOVE_MOUNT_T_SYMLINKS 0x00000010 /* Follow symlinks on to path */ +#define MOVE_MOUNT_T_AUTOMOUNTS 0x00000020 /* Follow automounts on to path */ +#define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */ +#define MOVE_MOUNT_SET_GROUP 0x00000100 /* Set sharing group instead */ +#define MOVE_MOUNT_BENEATH 0x00000200 /* Mount beneath top mount */ +#define MOVE_MOUNT__MASK 0x00000377 + +/* + * fsopen() flags. + */ +#define FSOPEN_CLOEXEC 0x00000001 + +/* + * fspick() flags. + */ +#define FSPICK_CLOEXEC 0x00000001 +#define FSPICK_SYMLINK_NOFOLLOW 0x00000002 +#define FSPICK_NO_AUTOMOUNT 0x00000004 +#define FSPICK_EMPTY_PATH 0x00000008 + +/* + * The type of fsconfig() call made. 
+ */ +enum fsconfig_command { + FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */ + FSCONFIG_SET_STRING = 1, /* Set parameter, supplying a string value */ + FSCONFIG_SET_BINARY = 2, /* Set parameter, supplying a binary blob value */ + FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by path */ + FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */ + FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */ + FSCONFIG_CMD_CREATE = 6, /* Create new or reuse existing superblock */ + FSCONFIG_CMD_RECONFIGURE = 7, /* Invoke superblock reconfiguration */ + FSCONFIG_CMD_CREATE_EXCL = 8, /* Create new superblock, fail if reusing existing superblock */ +}; + +/* + * fsmount() flags. + */ +#define FSMOUNT_CLOEXEC 0x00000001 + +/* + * Mount attributes. + */ +#define MOUNT_ATTR_RDONLY 0x00000001 /* Mount read-only */ +#define MOUNT_ATTR_NOSUID 0x00000002 /* Ignore suid and sgid bits */ +#define MOUNT_ATTR_NODEV 0x00000004 /* Disallow access to device special files */ +#define MOUNT_ATTR_NOEXEC 0x00000008 /* Disallow program execution */ +#define MOUNT_ATTR__ATIME 0x00000070 /* Setting on how atime should be updated */ +#define MOUNT_ATTR_RELATIME 0x00000000 /* - Update atime relative to mtime/ctime. */ +#define MOUNT_ATTR_NOATIME 0x00000010 /* - Do not update access times. */ +#define MOUNT_ATTR_STRICTATIME 0x00000020 /* - Always perform atime updates */ +#define MOUNT_ATTR_NODIRATIME 0x00000080 /* Do not update directory access times */ +#define MOUNT_ATTR_IDMAP 0x00100000 /* Idmap mount to @userns_fd in struct mount_attr. */ +#define MOUNT_ATTR_NOSYMFOLLOW 0x00200000 /* Do not follow symlinks */ + +/* + * mount_setattr() + */ +struct mount_attr { + __u64 attr_set; + __u64 attr_clr; + __u64 propagation; + __u64 userns_fd; +}; + +/* List of all mount_attr versions. 
*/ +#define MOUNT_ATTR_SIZE_VER0 32 /* sizeof first published struct */ + + +/* + * Structure for getting mount/superblock/filesystem info with statmount(2). + * + * The interface is similar to statx(2): individual fields or groups can be + * selected with the @mask argument of statmount(). Kernel will set the @mask + * field according to the supported fields. + * + * If string fields are selected, then the caller needs to pass a buffer that + * has space after the fixed part of the structure. Nul terminated strings are + * copied there and offsets relative to @str are stored in the relevant fields. + * If the buffer is too small, then EOVERFLOW is returned. The actually used + * size is returned in @size. + */ +struct statmount { + __u32 size; /* Total size, including strings */ + __u32 __spare1; + __u64 mask; /* What results were written */ + __u32 sb_dev_major; /* Device ID */ + __u32 sb_dev_minor; + __u64 sb_magic; /* ..._SUPER_MAGIC */ + __u32 sb_flags; /* SB_{RDONLY,SYNCHRONOUS,DIRSYNC,LAZYTIME} */ + __u32 fs_type; /* [str] Filesystem type */ + __u64 mnt_id; /* Unique ID of mount */ + __u64 mnt_parent_id; /* Unique ID of parent (for root == mnt_id) */ + __u32 mnt_id_old; /* Reused IDs used in proc/.../mountinfo */ + __u32 mnt_parent_id_old; + __u64 mnt_attr; /* MOUNT_ATTR_... */ + __u64 mnt_propagation; /* MS_{SHARED,SLAVE,PRIVATE,UNBINDABLE} */ + __u64 mnt_peer_group; /* ID of shared peer group */ + __u64 mnt_master; /* Mount receives propagation from this ID */ + __u64 propagate_from; /* Propagation from in current namespace */ + __u32 mnt_root; /* [str] Root of mount relative to root of fs */ + __u32 mnt_point; /* [str] Mountpoint relative to current root */ + __u64 __spare2[50]; + char str[]; /* Variable size part containing strings */ +}; + +/* + * Structure for passing mount ID and miscellaneous parameters to statmount(2) + * and listmount(2). + * + * For statmount(2) @param represents the request mask. 
+ * For listmount(2) @param represents the last listed mount id (or zero). + */ +struct mnt_id_req { + __u32 size; + __u32 spare; + __u64 mnt_id; + __u64 param; +}; + +/* List of all mnt_id_req versions. */ +#define MNT_ID_REQ_SIZE_VER0 24 /* sizeof first published struct */ + +/* + * @mask bits for statmount(2) + */ +#define STATMOUNT_SB_BASIC 0x00000001U /* Want/got sb_... */ +#define STATMOUNT_MNT_BASIC 0x00000002U /* Want/got mnt_... */ +#define STATMOUNT_PROPAGATE_FROM 0x00000004U /* Want/got propagate_from */ +#define STATMOUNT_MNT_ROOT 0x00000008U /* Want/got mnt_root */ +#define STATMOUNT_MNT_POINT 0x00000010U /* Want/got mnt_point */ +#define STATMOUNT_FS_TYPE 0x00000020U /* Want/got fs_type */ + +/* + * Special @mnt_id values that can be passed to listmount + */ +#define LSMT_ROOT 0xffffffffffffffff /* root mount */ + +#endif /* _UAPI_LINUX_MOUNT_H */ diff --git a/tools/perf/trace/beauty/include/uapi/linux/prctl.h b/tools/perf/trace/beauty/include/uapi/linux/prctl.h new file mode 100644 index 0000000000..35791791a8 --- /dev/null +++ b/tools/perf/trace/beauty/include/uapi/linux/prctl.h @@ -0,0 +1,331 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _LINUX_PRCTL_H +#define _LINUX_PRCTL_H + +#include + +/* Values to pass as first argument to prctl() */ + +#define PR_SET_PDEATHSIG 1 /* Second arg is a signal */ +#define PR_GET_PDEATHSIG 2 /* Second arg is a ptr to return the signal */ + +/* Get/set current->mm->dumpable */ +#define PR_GET_DUMPABLE 3 +#define PR_SET_DUMPABLE 4 + +/* Get/set unaligned access control bits (if meaningful) */ +#define PR_GET_UNALIGN 5 +#define PR_SET_UNALIGN 6 +# define PR_UNALIGN_NOPRINT 1 /* silently fix up unaligned user accesses */ +# define PR_UNALIGN_SIGBUS 2 /* generate SIGBUS on unaligned user access */ + +/* Get/set whether or not to drop capabilities on setuid() away from + * uid 0 (as per security/commoncap.c) */ +#define PR_GET_KEEPCAPS 7 +#define PR_SET_KEEPCAPS 8 + +/* Get/set 
floating-point emulation control bits (if meaningful) */ +#define PR_GET_FPEMU 9 +#define PR_SET_FPEMU 10 +# define PR_FPEMU_NOPRINT 1 /* silently emulate fp operations accesses */ +# define PR_FPEMU_SIGFPE 2 /* don't emulate fp operations, send SIGFPE instead */ + +/* Get/set floating-point exception mode (if meaningful) */ +#define PR_GET_FPEXC 11 +#define PR_SET_FPEXC 12 +# define PR_FP_EXC_SW_ENABLE 0x80 /* Use FPEXC for FP exception enables */ +# define PR_FP_EXC_DIV 0x010000 /* floating point divide by zero */ +# define PR_FP_EXC_OVF 0x020000 /* floating point overflow */ +# define PR_FP_EXC_UND 0x040000 /* floating point underflow */ +# define PR_FP_EXC_RES 0x080000 /* floating point inexact result */ +# define PR_FP_EXC_INV 0x100000 /* floating point invalid operation */ +# define PR_FP_EXC_DISABLED 0 /* FP exceptions disabled */ +# define PR_FP_EXC_NONRECOV 1 /* async non-recoverable exc. mode */ +# define PR_FP_EXC_ASYNC 2 /* async recoverable exception mode */ +# define PR_FP_EXC_PRECISE 3 /* precise exception mode */ + +/* Get/set whether we use statistical process timing or accurate timestamp + * based process timing */ +#define PR_GET_TIMING 13 +#define PR_SET_TIMING 14 +# define PR_TIMING_STATISTICAL 0 /* Normal, traditional, + statistical process timing */ +# define PR_TIMING_TIMESTAMP 1 /* Accurate timestamp based + process timing */ + +#define PR_SET_NAME 15 /* Set process name */ +#define PR_GET_NAME 16 /* Get process name */ + +/* Get/set process endian */ +#define PR_GET_ENDIAN 19 +#define PR_SET_ENDIAN 20 +# define PR_ENDIAN_BIG 0 +# define PR_ENDIAN_LITTLE 1 /* True little endian mode */ +# define PR_ENDIAN_PPC_LITTLE 2 /* "PowerPC" pseudo little endian */ + +/* Get/set process seccomp mode */ +#define PR_GET_SECCOMP 21 +#define PR_SET_SECCOMP 22 + +/* Get/set the capability bounding set (as per security/commoncap.c) */ +#define PR_CAPBSET_READ 23 +#define PR_CAPBSET_DROP 24 + +/* Get/set the process' ability to use the timestamp counter 
instruction */ +#define PR_GET_TSC 25 +#define PR_SET_TSC 26 +# define PR_TSC_ENABLE 1 /* allow the use of the timestamp counter */ +# define PR_TSC_SIGSEGV 2 /* throw a SIGSEGV instead of reading the TSC */ + +/* Get/set securebits (as per security/commoncap.c) */ +#define PR_GET_SECUREBITS 27 +#define PR_SET_SECUREBITS 28 + +/* + * Get/set the timerslack as used by poll/select/nanosleep + * A value of 0 means "use default" + */ +#define PR_SET_TIMERSLACK 29 +#define PR_GET_TIMERSLACK 30 + +#define PR_TASK_PERF_EVENTS_DISABLE 31 +#define PR_TASK_PERF_EVENTS_ENABLE 32 + +/* + * Set early/late kill mode for hwpoison memory corruption. + * This influences when the process gets killed on a memory corruption. + */ +#define PR_MCE_KILL 33 +# define PR_MCE_KILL_CLEAR 0 +# define PR_MCE_KILL_SET 1 + +# define PR_MCE_KILL_LATE 0 +# define PR_MCE_KILL_EARLY 1 +# define PR_MCE_KILL_DEFAULT 2 + +#define PR_MCE_KILL_GET 34 + +/* + * Tune up process memory map specifics. + */ +#define PR_SET_MM 35 +# define PR_SET_MM_START_CODE 1 +# define PR_SET_MM_END_CODE 2 +# define PR_SET_MM_START_DATA 3 +# define PR_SET_MM_END_DATA 4 +# define PR_SET_MM_START_STACK 5 +# define PR_SET_MM_START_BRK 6 +# define PR_SET_MM_BRK 7 +# define PR_SET_MM_ARG_START 8 +# define PR_SET_MM_ARG_END 9 +# define PR_SET_MM_ENV_START 10 +# define PR_SET_MM_ENV_END 11 +# define PR_SET_MM_AUXV 12 +# define PR_SET_MM_EXE_FILE 13 +# define PR_SET_MM_MAP 14 +# define PR_SET_MM_MAP_SIZE 15 + +/* + * This structure provides new memory descriptor + * map which mostly modifies /proc/pid/stat[m] + * output for a task. This mostly done in a + * sake of checkpoint/restore functionality. 
+ */ +struct prctl_mm_map { + __u64 start_code; /* code section bounds */ + __u64 end_code; + __u64 start_data; /* data section bounds */ + __u64 end_data; + __u64 start_brk; /* heap for brk() syscall */ + __u64 brk; + __u64 start_stack; /* stack starts at */ + __u64 arg_start; /* command line arguments bounds */ + __u64 arg_end; + __u64 env_start; /* environment variables bounds */ + __u64 env_end; + __u64 *auxv; /* auxiliary vector */ + __u32 auxv_size; /* vector size */ + __u32 exe_fd; /* /proc/$pid/exe link file */ +}; + +/* + * Set specific pid that is allowed to ptrace the current task. + * A value of 0 mean "no process". + */ +#define PR_SET_PTRACER 0x59616d61 +# define PR_SET_PTRACER_ANY ((unsigned long)-1) + +#define PR_SET_CHILD_SUBREAPER 36 +#define PR_GET_CHILD_SUBREAPER 37 + +/* + * If no_new_privs is set, then operations that grant new privileges (i.e. + * execve) will either fail or not grant them. This affects suid/sgid, + * file capabilities, and LSMs. + * + * Operations that merely manipulate or drop existing privileges (setresuid, + * capset, etc.) will still work. Drop those privileges if you want them gone. + * + * Changing LSM security domain is considered a new privilege. So, for example, + * asking selinux for a specific new context (e.g. with runcon) will result + * in execve returning -EPERM. + * + * See Documentation/userspace-api/no_new_privs.rst for more details. 
+ */ +#define PR_SET_NO_NEW_PRIVS 38 +#define PR_GET_NO_NEW_PRIVS 39 + +#define PR_GET_TID_ADDRESS 40 + +#define PR_SET_THP_DISABLE 41 +#define PR_GET_THP_DISABLE 42 + +/* + * No longer implemented, but left here to ensure the numbers stay reserved: + */ +#define PR_MPX_ENABLE_MANAGEMENT 43 +#define PR_MPX_DISABLE_MANAGEMENT 44 + +#define PR_SET_FP_MODE 45 +#define PR_GET_FP_MODE 46 +# define PR_FP_MODE_FR (1 << 0) /* 64b FP registers */ +# define PR_FP_MODE_FRE (1 << 1) /* 32b compatibility */ + +/* Control the ambient capability set */ +#define PR_CAP_AMBIENT 47 +# define PR_CAP_AMBIENT_IS_SET 1 +# define PR_CAP_AMBIENT_RAISE 2 +# define PR_CAP_AMBIENT_LOWER 3 +# define PR_CAP_AMBIENT_CLEAR_ALL 4 + +/* arm64 Scalable Vector Extension controls */ +/* Flag values must be kept in sync with ptrace NT_ARM_SVE interface */ +#define PR_SVE_SET_VL 50 /* set task vector length */ +# define PR_SVE_SET_VL_ONEXEC (1 << 18) /* defer effect until exec */ +#define PR_SVE_GET_VL 51 /* get task vector length */ +/* Bits common to PR_SVE_SET_VL and PR_SVE_GET_VL */ +# define PR_SVE_VL_LEN_MASK 0xffff +# define PR_SVE_VL_INHERIT (1 << 17) /* inherit across exec */ + +/* Per task speculation control */ +#define PR_GET_SPECULATION_CTRL 52 +#define PR_SET_SPECULATION_CTRL 53 +/* Speculation control variants */ +# define PR_SPEC_STORE_BYPASS 0 +# define PR_SPEC_INDIRECT_BRANCH 1 +# define PR_SPEC_L1D_FLUSH 2 +/* Return and control values for PR_SET/GET_SPECULATION_CTRL */ +# define PR_SPEC_NOT_AFFECTED 0 +# define PR_SPEC_PRCTL (1UL << 0) +# define PR_SPEC_ENABLE (1UL << 1) +# define PR_SPEC_DISABLE (1UL << 2) +# define PR_SPEC_FORCE_DISABLE (1UL << 3) +# define PR_SPEC_DISABLE_NOEXEC (1UL << 4) + +/* Reset arm64 pointer authentication keys */ +#define PR_PAC_RESET_KEYS 54 +# define PR_PAC_APIAKEY (1UL << 0) +# define PR_PAC_APIBKEY (1UL << 1) +# define PR_PAC_APDAKEY (1UL << 2) +# define PR_PAC_APDBKEY (1UL << 3) +# define PR_PAC_APGAKEY (1UL << 4) + +/* Tagged user address controls 
for arm64 */ +#define PR_SET_TAGGED_ADDR_CTRL 55 +#define PR_GET_TAGGED_ADDR_CTRL 56 +# define PR_TAGGED_ADDR_ENABLE (1UL << 0) +/* MTE tag check fault modes */ +# define PR_MTE_TCF_NONE 0UL +# define PR_MTE_TCF_SYNC (1UL << 1) +# define PR_MTE_TCF_ASYNC (1UL << 2) +# define PR_MTE_TCF_MASK (PR_MTE_TCF_SYNC | PR_MTE_TCF_ASYNC) +/* MTE tag inclusion mask */ +# define PR_MTE_TAG_SHIFT 3 +# define PR_MTE_TAG_MASK (0xffffUL << PR_MTE_TAG_SHIFT) +/* Unused; kept only for source compatibility */ +# define PR_MTE_TCF_SHIFT 1 + +/* Control reclaim behavior when allocating memory */ +#define PR_SET_IO_FLUSHER 57 +#define PR_GET_IO_FLUSHER 58 + +/* Dispatch syscalls to a userspace handler */ +#define PR_SET_SYSCALL_USER_DISPATCH 59 +# define PR_SYS_DISPATCH_OFF 0 +# define PR_SYS_DISPATCH_ON 1 +/* The control values for the user space selector when dispatch is enabled */ +# define SYSCALL_DISPATCH_FILTER_ALLOW 0 +# define SYSCALL_DISPATCH_FILTER_BLOCK 1 + +/* Set/get enabled arm64 pointer authentication keys */ +#define PR_PAC_SET_ENABLED_KEYS 60 +#define PR_PAC_GET_ENABLED_KEYS 61 + +/* Request the scheduler to share a core */ +#define PR_SCHED_CORE 62 +# define PR_SCHED_CORE_GET 0 +# define PR_SCHED_CORE_CREATE 1 /* create unique core_sched cookie */ +# define PR_SCHED_CORE_SHARE_TO 2 /* push core_sched cookie to pid */ +# define PR_SCHED_CORE_SHARE_FROM 3 /* pull core_sched cookie to pid */ +# define PR_SCHED_CORE_MAX 4 +# define PR_SCHED_CORE_SCOPE_THREAD 0 +# define PR_SCHED_CORE_SCOPE_THREAD_GROUP 1 +# define PR_SCHED_CORE_SCOPE_PROCESS_GROUP 2 + +/* arm64 Scalable Matrix Extension controls */ +/* Flag values must be in sync with SVE versions */ +#define PR_SME_SET_VL 63 /* set task vector length */ +# define PR_SME_SET_VL_ONEXEC (1 << 18) /* defer effect until exec */ +#define PR_SME_GET_VL 64 /* get task vector length */ +/* Bits common to PR_SME_SET_VL and PR_SME_GET_VL */ +# define PR_SME_VL_LEN_MASK 0xffff +# define PR_SME_VL_INHERIT (1 << 17) /* inherit across 
exec */ + +/* Memory deny write / execute */ +#define PR_SET_MDWE 65 +# define PR_MDWE_REFUSE_EXEC_GAIN (1UL << 0) +# define PR_MDWE_NO_INHERIT (1UL << 1) + +#define PR_GET_MDWE 66 + +#define PR_SET_VMA 0x53564d41 +# define PR_SET_VMA_ANON_NAME 0 + +#define PR_GET_AUXV 0x41555856 + +#define PR_SET_MEMORY_MERGE 67 +#define PR_GET_MEMORY_MERGE 68 + +#define PR_RISCV_V_SET_CONTROL 69 +#define PR_RISCV_V_GET_CONTROL 70 +# define PR_RISCV_V_VSTATE_CTRL_DEFAULT 0 +# define PR_RISCV_V_VSTATE_CTRL_OFF 1 +# define PR_RISCV_V_VSTATE_CTRL_ON 2 +# define PR_RISCV_V_VSTATE_CTRL_INHERIT (1 << 4) +# define PR_RISCV_V_VSTATE_CTRL_CUR_MASK 0x3 +# define PR_RISCV_V_VSTATE_CTRL_NEXT_MASK 0xc +# define PR_RISCV_V_VSTATE_CTRL_MASK 0x1f + +#define PR_RISCV_SET_ICACHE_FLUSH_CTX 71 +# define PR_RISCV_CTX_SW_FENCEI_ON 0 +# define PR_RISCV_CTX_SW_FENCEI_OFF 1 +# define PR_RISCV_SCOPE_PER_PROCESS 0 +# define PR_RISCV_SCOPE_PER_THREAD 1 + +/* PowerPC Dynamic Execution Control Register (DEXCR) controls */ +#define PR_PPC_GET_DEXCR 72 +#define PR_PPC_SET_DEXCR 73 +/* DEXCR aspect to act on */ +# define PR_PPC_DEXCR_SBHE 0 /* Speculative branch hint enable */ +# define PR_PPC_DEXCR_IBRTPD 1 /* Indirect branch recurrent target prediction disable */ +# define PR_PPC_DEXCR_SRAPD 2 /* Subroutine return address prediction disable */ +# define PR_PPC_DEXCR_NPHIE 3 /* Non-privileged hash instruction enable */ +/* Action to apply / return */ +# define PR_PPC_DEXCR_CTRL_EDITABLE 0x1 /* Aspect can be modified with PR_PPC_SET_DEXCR */ +# define PR_PPC_DEXCR_CTRL_SET 0x2 /* Set the aspect for this process */ +# define PR_PPC_DEXCR_CTRL_CLEAR 0x4 /* Clear the aspect for this process */ +# define PR_PPC_DEXCR_CTRL_SET_ONEXEC 0x8 /* Set the aspect on exec */ +# define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC 0x10 /* Clear the aspect on exec */ +# define PR_PPC_DEXCR_CTRL_MASK 0x1f + +#endif /* _LINUX_PRCTL_H */ diff --git a/tools/perf/trace/beauty/include/uapi/linux/sched.h 
b/tools/perf/trace/beauty/include/uapi/linux/sched.h new file mode 100644 index 0000000000..3bac0a8cea --- /dev/null +++ b/tools/perf/trace/beauty/include/uapi/linux/sched.h @@ -0,0 +1,148 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_SCHED_H +#define _UAPI_LINUX_SCHED_H + +#include + +/* + * cloning flags: + */ +#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ +#define CLONE_VM 0x00000100 /* set if VM shared between processes */ +#define CLONE_FS 0x00000200 /* set if fs info shared between processes */ +#define CLONE_FILES 0x00000400 /* set if open files shared between processes */ +#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ +#define CLONE_PIDFD 0x00001000 /* set if a pidfd should be placed in parent */ +#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ +#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ +#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ +#define CLONE_THREAD 0x00010000 /* Same thread group? 
*/ +#define CLONE_NEWNS 0x00020000 /* New mount namespace group */ +#define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */ +#define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */ +#define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */ +#define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */ +#define CLONE_DETACHED 0x00400000 /* Unused, ignored */ +#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ +#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ +#define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */ +#define CLONE_NEWUTS 0x04000000 /* New utsname namespace */ +#define CLONE_NEWIPC 0x08000000 /* New ipc namespace */ +#define CLONE_NEWUSER 0x10000000 /* New user namespace */ +#define CLONE_NEWPID 0x20000000 /* New pid namespace */ +#define CLONE_NEWNET 0x40000000 /* New network namespace */ +#define CLONE_IO 0x80000000 /* Clone io context */ + +/* Flags for the clone3() syscall. */ +#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */ +#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */ + +/* + * cloning flags intersect with CSIGNAL so can be used with unshare and clone3 + * syscalls only: + */ +#define CLONE_NEWTIME 0x00000080 /* New time namespace */ + +#ifndef __ASSEMBLY__ +/** + * struct clone_args - arguments for the clone3 syscall + * @flags: Flags for the new process as listed above. + * All flags are valid except for CSIGNAL and + * CLONE_DETACHED. + * @pidfd: If CLONE_PIDFD is set, a pidfd will be + * returned in this argument. + * @child_tid: If CLONE_CHILD_SETTID is set, the TID of the + * child process will be returned in the child's + * memory. + * @parent_tid: If CLONE_PARENT_SETTID is set, the TID of + * the child process will be returned in the + * parent's memory. 
+ * @exit_signal: The exit_signal the parent process will be + * sent when the child exits. + * @stack: Specify the location of the stack for the + * child process. + * Note, @stack is expected to point to the + * lowest address. The stack direction will be + * determined by the kernel and set up + * appropriately based on @stack_size. + * @stack_size: The size of the stack for the child process. + * @tls: If CLONE_SETTLS is set, the tls descriptor + * is set to tls. + * @set_tid: Pointer to an array of type *pid_t. The size + * of the array is defined using @set_tid_size. + * This array is used to select PIDs/TIDs for + * newly created processes. The first element in + * this defines the PID in the most nested PID + * namespace. Each additional element in the array + * defines the PID in the parent PID namespace of + * the original PID namespace. If the array has + * less entries than the number of currently + * nested PID namespaces only the PIDs in the + * corresponding namespaces are set. + * @set_tid_size: This defines the size of the array referenced + * in @set_tid. This cannot be larger than the + * kernel's limit of nested PID namespaces. + * @cgroup: If CLONE_INTO_CGROUP is specified set this to + * a file descriptor for the cgroup. + * + * The structure is versioned by size and thus extensible. + * New struct members must go at the end of the struct and + * must be properly 64bit aligned. 
+ */ +struct clone_args { + __aligned_u64 flags; + __aligned_u64 pidfd; + __aligned_u64 child_tid; + __aligned_u64 parent_tid; + __aligned_u64 exit_signal; + __aligned_u64 stack; + __aligned_u64 stack_size; + __aligned_u64 tls; + __aligned_u64 set_tid; + __aligned_u64 set_tid_size; + __aligned_u64 cgroup; +}; +#endif + +#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ +#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */ +#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */ + +/* + * Scheduling policies + */ +#define SCHED_NORMAL 0 +#define SCHED_FIFO 1 +#define SCHED_RR 2 +#define SCHED_BATCH 3 +/* SCHED_ISO: reserved but not implemented yet */ +#define SCHED_IDLE 5 +#define SCHED_DEADLINE 6 + +/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ +#define SCHED_RESET_ON_FORK 0x40000000 + +/* + * For the sched_{set,get}attr() calls + */ +#define SCHED_FLAG_RESET_ON_FORK 0x01 +#define SCHED_FLAG_RECLAIM 0x02 +#define SCHED_FLAG_DL_OVERRUN 0x04 +#define SCHED_FLAG_KEEP_POLICY 0x08 +#define SCHED_FLAG_KEEP_PARAMS 0x10 +#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 +#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 + +#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ + SCHED_FLAG_KEEP_PARAMS) + +#define SCHED_FLAG_UTIL_CLAMP (SCHED_FLAG_UTIL_CLAMP_MIN | \ + SCHED_FLAG_UTIL_CLAMP_MAX) + +#define SCHED_FLAG_ALL (SCHED_FLAG_RESET_ON_FORK | \ + SCHED_FLAG_RECLAIM | \ + SCHED_FLAG_DL_OVERRUN | \ + SCHED_FLAG_KEEP_ALL | \ + SCHED_FLAG_UTIL_CLAMP) + +#endif /* _UAPI_LINUX_SCHED_H */ diff --git a/tools/perf/trace/beauty/include/uapi/linux/stat.h b/tools/perf/trace/beauty/include/uapi/linux/stat.h new file mode 100644 index 0000000000..67626d5353 --- /dev/null +++ b/tools/perf/trace/beauty/include/uapi/linux/stat.h @@ -0,0 +1,197 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_STAT_H +#define _UAPI_LINUX_STAT_H + +#include + +#if defined(__KERNEL__) || 
!defined(__GLIBC__) || (__GLIBC__ < 2) + +#define S_IFMT 00170000 +#define S_IFSOCK 0140000 +#define S_IFLNK 0120000 +#define S_IFREG 0100000 +#define S_IFBLK 0060000 +#define S_IFDIR 0040000 +#define S_IFCHR 0020000 +#define S_IFIFO 0010000 +#define S_ISUID 0004000 +#define S_ISGID 0002000 +#define S_ISVTX 0001000 + +#define S_ISLNK(m) (((m) & S_IFMT) == S_IFLNK) +#define S_ISREG(m) (((m) & S_IFMT) == S_IFREG) +#define S_ISDIR(m) (((m) & S_IFMT) == S_IFDIR) +#define S_ISCHR(m) (((m) & S_IFMT) == S_IFCHR) +#define S_ISBLK(m) (((m) & S_IFMT) == S_IFBLK) +#define S_ISFIFO(m) (((m) & S_IFMT) == S_IFIFO) +#define S_ISSOCK(m) (((m) & S_IFMT) == S_IFSOCK) + +#define S_IRWXU 00700 +#define S_IRUSR 00400 +#define S_IWUSR 00200 +#define S_IXUSR 00100 + +#define S_IRWXG 00070 +#define S_IRGRP 00040 +#define S_IWGRP 00020 +#define S_IXGRP 00010 + +#define S_IRWXO 00007 +#define S_IROTH 00004 +#define S_IWOTH 00002 +#define S_IXOTH 00001 + +#endif + +/* + * Timestamp structure for the timestamps in struct statx. + * + * tv_sec holds the number of seconds before (negative) or after (positive) + * 00:00:00 1st January 1970 UTC. + * + * tv_nsec holds a number of nanoseconds (0..999,999,999) after the tv_sec time. + * + * __reserved is held in case we need a yet finer resolution. + */ +struct statx_timestamp { + __s64 tv_sec; + __u32 tv_nsec; + __s32 __reserved; +}; + +/* + * Structures for the extended file attribute retrieval system call + * (statx()). + * + * The caller passes a mask of what they're specifically interested in as a + * parameter to statx(). What statx() actually got will be indicated in + * st_mask upon return. + * + * For each bit in the mask argument: + * + * - if the datum is not supported: + * + * - the bit will be cleared, and + * + * - the datum will be set to an appropriate fabricated value if one is + * available (eg. 
CIFS can take a default uid and gid), otherwise + * + * - the field will be cleared; + * + * - otherwise, if explicitly requested: + * + * - the datum will be synchronised to the server if AT_STATX_FORCE_SYNC is + * set or if the datum is considered out of date, and + * + * - the field will be filled in and the bit will be set; + * + * - otherwise, if not requested, but available in approximate form without any + * effort, it will be filled in anyway, and the bit will be set upon return + * (it might not be up to date, however, and no attempt will be made to + * synchronise the internal state first); + * + * - otherwise the field and the bit will be cleared before returning. + * + * Items in STATX_BASIC_STATS may be marked unavailable on return, but they + * will have values installed for compatibility purposes so that stat() and + * co. can be emulated in userspace. + */ +struct statx { + /* 0x00 */ + __u32 stx_mask; /* What results were written [uncond] */ + __u32 stx_blksize; /* Preferred general I/O size [uncond] */ + __u64 stx_attributes; /* Flags conveying information about the file [uncond] */ + /* 0x10 */ + __u32 stx_nlink; /* Number of hard links */ + __u32 stx_uid; /* User ID of owner */ + __u32 stx_gid; /* Group ID of owner */ + __u16 stx_mode; /* File mode */ + __u16 __spare0[1]; + /* 0x20 */ + __u64 stx_ino; /* Inode number */ + __u64 stx_size; /* File size */ + __u64 stx_blocks; /* Number of 512-byte blocks allocated */ + __u64 stx_attributes_mask; /* Mask to show what's supported in stx_attributes */ + /* 0x40 */ + struct statx_timestamp stx_atime; /* Last access time */ + struct statx_timestamp stx_btime; /* File creation time */ + struct statx_timestamp stx_ctime; /* Last attribute change time */ + struct statx_timestamp stx_mtime; /* Last data modification time */ + /* 0x80 */ + __u32 stx_rdev_major; /* Device ID of special file [if bdev/cdev] */ + __u32 stx_rdev_minor; + __u32 stx_dev_major; /* ID of device containing file [uncond] */ + __u32 
stx_dev_minor; + /* 0x90 */ + __u64 stx_mnt_id; + __u32 stx_dio_mem_align; /* Memory buffer alignment for direct I/O */ + __u32 stx_dio_offset_align; /* File offset alignment for direct I/O */ + __u64 stx_subvol; /* Subvolume identifier */ + /* 0xa0 */ + __u64 __spare3[11]; /* Spare space for future expansion */ + /* 0x100 */ +}; + +/* + * Flags to be stx_mask + * + * Query request/result mask for statx() and struct statx::stx_mask. + * + * These bits should be set in the mask argument of statx() to request + * particular items when calling statx(). + */ +#define STATX_TYPE 0x00000001U /* Want/got stx_mode & S_IFMT */ +#define STATX_MODE 0x00000002U /* Want/got stx_mode & ~S_IFMT */ +#define STATX_NLINK 0x00000004U /* Want/got stx_nlink */ +#define STATX_UID 0x00000008U /* Want/got stx_uid */ +#define STATX_GID 0x00000010U /* Want/got stx_gid */ +#define STATX_ATIME 0x00000020U /* Want/got stx_atime */ +#define STATX_MTIME 0x00000040U /* Want/got stx_mtime */ +#define STATX_CTIME 0x00000080U /* Want/got stx_ctime */ +#define STATX_INO 0x00000100U /* Want/got stx_ino */ +#define STATX_SIZE 0x00000200U /* Want/got stx_size */ +#define STATX_BLOCKS 0x00000400U /* Want/got stx_blocks */ +#define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */ +#define STATX_BTIME 0x00000800U /* Want/got stx_btime */ +#define STATX_MNT_ID 0x00001000U /* Got stx_mnt_id */ +#define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */ +#define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */ +#define STATX_SUBVOL 0x00008000U /* Want/got stx_subvol */ + +#define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ + +#ifndef __KERNEL__ +/* + * This is deprecated, and shall remain the same value in the future. To avoid + * confusion please use the equivalent (STATX_BASIC_STATS | STATX_BTIME) + * instead. 
+ */ +#define STATX_ALL 0x00000fffU +#endif + +/* + * Attributes to be found in stx_attributes and masked in stx_attributes_mask. + * + * These give information about the features or the state of a file that might + * be of use to ordinary userspace programs such as GUIs or ls rather than + * specialised tools. + * + * Note that the flags marked [I] correspond to the FS_IOC_SETFLAGS flags + * semantically. Where possible, the numerical value is picked to correspond + * also. Note that the DAX attribute indicates that the file is in the CPU + * direct access state. It does not correspond to the per-inode flag that + * some filesystems support. + * + */ +#define STATX_ATTR_COMPRESSED 0x00000004 /* [I] File is compressed by the fs */ +#define STATX_ATTR_IMMUTABLE 0x00000010 /* [I] File is marked immutable */ +#define STATX_ATTR_APPEND 0x00000020 /* [I] File is append-only */ +#define STATX_ATTR_NODUMP 0x00000040 /* [I] File is not to be dumped */ +#define STATX_ATTR_ENCRYPTED 0x00000800 /* [I] File requires key to decrypt in fs */ +#define STATX_ATTR_AUTOMOUNT 0x00001000 /* Dir: Automount trigger */ +#define STATX_ATTR_MOUNT_ROOT 0x00002000 /* Root of a mount */ +#define STATX_ATTR_VERITY 0x00100000 /* [I] Verity protected file */ +#define STATX_ATTR_DAX 0x00200000 /* File is currently in DAX state */ + + +#endif /* _UAPI_LINUX_STAT_H */ diff --git a/tools/perf/trace/beauty/include/uapi/linux/usbdevice_fs.h b/tools/perf/trace/beauty/include/uapi/linux/usbdevice_fs.h new file mode 100644 index 0000000000..74a84e0242 --- /dev/null +++ b/tools/perf/trace/beauty/include/uapi/linux/usbdevice_fs.h @@ -0,0 +1,231 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/*****************************************************************************/ + +/* + * usbdevice_fs.h -- USB device file system. 
+ * + * Copyright (C) 2000 + * Thomas Sailer (sailer@ife.ee.ethz.ch) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * History: + * 0.1 04.01.2000 Created + */ + +/*****************************************************************************/ + +#ifndef _UAPI_LINUX_USBDEVICE_FS_H +#define _UAPI_LINUX_USBDEVICE_FS_H + +#include +#include + +/* --------------------------------------------------------------------- */ + +/* usbdevfs ioctl codes */ + +struct usbdevfs_ctrltransfer { + __u8 bRequestType; + __u8 bRequest; + __u16 wValue; + __u16 wIndex; + __u16 wLength; + __u32 timeout; /* in milliseconds */ + void __user *data; +}; + +struct usbdevfs_bulktransfer { + unsigned int ep; + unsigned int len; + unsigned int timeout; /* in milliseconds */ + void __user *data; +}; + +struct usbdevfs_setinterface { + unsigned int interface; + unsigned int altsetting; +}; + +struct usbdevfs_disconnectsignal { + unsigned int signr; + void __user *context; +}; + +#define USBDEVFS_MAXDRIVERNAME 255 + +struct usbdevfs_getdriver { + unsigned int interface; + char driver[USBDEVFS_MAXDRIVERNAME + 1]; +}; + +struct usbdevfs_connectinfo { + unsigned int devnum; + unsigned char slow; +}; + +struct usbdevfs_conninfo_ex { + __u32 size; /* Size of the structure from the kernel's */ + /* point of view. 
Can be used by userspace */ + /* to determine how much data can be */ + /* used/trusted. */ + __u32 busnum; /* USB bus number, as enumerated by the */ + /* kernel, the device is connected to. */ + __u32 devnum; /* Device address on the bus. */ + __u32 speed; /* USB_SPEED_* constants from ch9.h */ + __u8 num_ports; /* Number of ports the device is connected */ + /* to on the way to the root hub. It may */ + /* be bigger than size of 'ports' array so */ + /* userspace can detect overflows. */ + __u8 ports[7]; /* List of ports on the way from the root */ + /* hub to the device. Current limit in */ + /* USB specification is 7 tiers (root hub, */ + /* 5 intermediate hubs, device), which */ + /* gives at most 6 port entries. */ +}; + +#define USBDEVFS_URB_SHORT_NOT_OK 0x01 +#define USBDEVFS_URB_ISO_ASAP 0x02 +#define USBDEVFS_URB_BULK_CONTINUATION 0x04 +#define USBDEVFS_URB_NO_FSBR 0x20 /* Not used */ +#define USBDEVFS_URB_ZERO_PACKET 0x40 +#define USBDEVFS_URB_NO_INTERRUPT 0x80 + +#define USBDEVFS_URB_TYPE_ISO 0 +#define USBDEVFS_URB_TYPE_INTERRUPT 1 +#define USBDEVFS_URB_TYPE_CONTROL 2 +#define USBDEVFS_URB_TYPE_BULK 3 + +struct usbdevfs_iso_packet_desc { + unsigned int length; + unsigned int actual_length; + unsigned int status; +}; + +struct usbdevfs_urb { + unsigned char type; + unsigned char endpoint; + int status; + unsigned int flags; + void __user *buffer; + int buffer_length; + int actual_length; + int start_frame; + union { + int number_of_packets; /* Only used for isoc urbs */ + unsigned int stream_id; /* Only used with bulk streams */ + }; + int error_count; + unsigned int signr; /* signal to be sent on completion, + or 0 if none should be sent. 
*/ + void __user *usercontext; + struct usbdevfs_iso_packet_desc iso_frame_desc[]; +}; + +/* ioctls for talking directly to drivers */ +struct usbdevfs_ioctl { + int ifno; /* interface 0..N ; negative numbers reserved */ + int ioctl_code; /* MUST encode size + direction of data so the + * macros in <asm/ioctl.h> give correct values */ + void __user *data; /* param buffer (in, or out) */ +}; + +/* You can do most things with hubs just through control messages, + * except find out what device connects to what port. */ +struct usbdevfs_hub_portinfo { + char nports; /* number of downstream ports in this hub */ + char port [127]; /* e.g. port 3 connects to device 27 */ +}; + +/* System and bus capability flags */ +#define USBDEVFS_CAP_ZERO_PACKET 0x01 +#define USBDEVFS_CAP_BULK_CONTINUATION 0x02 +#define USBDEVFS_CAP_NO_PACKET_SIZE_LIM 0x04 +#define USBDEVFS_CAP_BULK_SCATTER_GATHER 0x08 +#define USBDEVFS_CAP_REAP_AFTER_DISCONNECT 0x10 +#define USBDEVFS_CAP_MMAP 0x20 +#define USBDEVFS_CAP_DROP_PRIVILEGES 0x40 +#define USBDEVFS_CAP_CONNINFO_EX 0x80 +#define USBDEVFS_CAP_SUSPEND 0x100 + +/* USBDEVFS_DISCONNECT_CLAIM flags & struct */ + +/* disconnect-and-claim if the driver matches the driver field */ +#define USBDEVFS_DISCONNECT_CLAIM_IF_DRIVER 0x01 +/* disconnect-and-claim except when the driver matches the driver field */ +#define USBDEVFS_DISCONNECT_CLAIM_EXCEPT_DRIVER 0x02 + +struct usbdevfs_disconnect_claim { + unsigned int interface; + unsigned int flags; + char driver[USBDEVFS_MAXDRIVERNAME + 1]; +}; + +struct usbdevfs_streams { + unsigned int num_streams; /* Not used by USBDEVFS_FREE_STREAMS */ + unsigned int num_eps; + unsigned char eps[]; +}; + +/* + * USB_SPEED_* values returned by USBDEVFS_GET_SPEED are defined in + * linux/usb/ch9.h + */ + +#define USBDEVFS_CONTROL _IOWR('U', 0, struct usbdevfs_ctrltransfer) +#define USBDEVFS_CONTROL32 _IOWR('U', 0, struct usbdevfs_ctrltransfer32) +#define USBDEVFS_BULK _IOWR('U', 2, struct usbdevfs_bulktransfer) +#define USBDEVFS_BULK32 
_IOWR('U', 2, struct usbdevfs_bulktransfer32) +#define USBDEVFS_RESETEP _IOR('U', 3, unsigned int) +#define USBDEVFS_SETINTERFACE _IOR('U', 4, struct usbdevfs_setinterface) +#define USBDEVFS_SETCONFIGURATION _IOR('U', 5, unsigned int) +#define USBDEVFS_GETDRIVER _IOW('U', 8, struct usbdevfs_getdriver) +#define USBDEVFS_SUBMITURB _IOR('U', 10, struct usbdevfs_urb) +#define USBDEVFS_SUBMITURB32 _IOR('U', 10, struct usbdevfs_urb32) +#define USBDEVFS_DISCARDURB _IO('U', 11) +#define USBDEVFS_REAPURB _IOW('U', 12, void *) +#define USBDEVFS_REAPURB32 _IOW('U', 12, __u32) +#define USBDEVFS_REAPURBNDELAY _IOW('U', 13, void *) +#define USBDEVFS_REAPURBNDELAY32 _IOW('U', 13, __u32) +#define USBDEVFS_DISCSIGNAL _IOR('U', 14, struct usbdevfs_disconnectsignal) +#define USBDEVFS_DISCSIGNAL32 _IOR('U', 14, struct usbdevfs_disconnectsignal32) +#define USBDEVFS_CLAIMINTERFACE _IOR('U', 15, unsigned int) +#define USBDEVFS_RELEASEINTERFACE _IOR('U', 16, unsigned int) +#define USBDEVFS_CONNECTINFO _IOW('U', 17, struct usbdevfs_connectinfo) +#define USBDEVFS_IOCTL _IOWR('U', 18, struct usbdevfs_ioctl) +#define USBDEVFS_IOCTL32 _IOWR('U', 18, struct usbdevfs_ioctl32) +#define USBDEVFS_HUB_PORTINFO _IOR('U', 19, struct usbdevfs_hub_portinfo) +#define USBDEVFS_RESET _IO('U', 20) +#define USBDEVFS_CLEAR_HALT _IOR('U', 21, unsigned int) +#define USBDEVFS_DISCONNECT _IO('U', 22) +#define USBDEVFS_CONNECT _IO('U', 23) +#define USBDEVFS_CLAIM_PORT _IOR('U', 24, unsigned int) +#define USBDEVFS_RELEASE_PORT _IOR('U', 25, unsigned int) +#define USBDEVFS_GET_CAPABILITIES _IOR('U', 26, __u32) +#define USBDEVFS_DISCONNECT_CLAIM _IOR('U', 27, struct usbdevfs_disconnect_claim) +#define USBDEVFS_ALLOC_STREAMS _IOR('U', 28, struct usbdevfs_streams) +#define USBDEVFS_FREE_STREAMS _IOR('U', 29, struct usbdevfs_streams) +#define USBDEVFS_DROP_PRIVILEGES _IOW('U', 30, __u32) +#define USBDEVFS_GET_SPEED _IO('U', 31) +/* + * Returns struct usbdevfs_conninfo_ex; length is variable to allow + * extending size 
of the data returned. + */ +#define USBDEVFS_CONNINFO_EX(len) _IOC(_IOC_READ, 'U', 32, len) +#define USBDEVFS_FORBID_SUSPEND _IO('U', 33) +#define USBDEVFS_ALLOW_SUSPEND _IO('U', 34) +#define USBDEVFS_WAIT_FOR_RESUME _IO('U', 35) + +#endif /* _UAPI_LINUX_USBDEVICE_FS_H */ diff --git a/tools/perf/trace/beauty/include/uapi/linux/vhost.h b/tools/perf/trace/beauty/include/uapi/linux/vhost.h new file mode 100644 index 0000000000..b95dd84eef --- /dev/null +++ b/tools/perf/trace/beauty/include/uapi/linux/vhost.h @@ -0,0 +1,238 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _LINUX_VHOST_H +#define _LINUX_VHOST_H +/* Userspace interface for in-kernel virtio accelerators. */ + +/* vhost is used to reduce the number of system calls involved in virtio. + * + * Existing virtio net code is used in the guest without modification. + * + * This header includes interface used by userspace hypervisor for + * device configuration. + */ + +#include +#include +#include + +#define VHOST_FILE_UNBIND -1 + +/* ioctls */ + +#define VHOST_VIRTIO 0xAF + +/* Features bitmask for forward compatibility. Transport bits are used for + * vhost specific features. */ +#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64) +#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64) + +/* Set current process as the (exclusive) owner of this file descriptor. This + * must be called before any other vhost command. Further calls to + * VHOST_SET_OWNER fail until VHOST_RESET_OWNER is called. */ +#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01) +/* Give up ownership, and reset the device to default values. + * Allows subsequent call to VHOST_SET_OWNER to succeed. */ +#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02) + +/* Set up/modify memory layout */ +#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory) + +/* Write logging setup. 
*/ +/* Memory writes can optionally be logged by setting bit at an offset + * (calculated from the physical address) from specified log base. + * The bit is set using an atomic 32 bit operation. */ +/* Set base address for logging. */ +#define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64) +/* Specify an eventfd file descriptor to signal on log write. */ +#define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int) +/* By default, a device gets one vhost_worker that its virtqueues share. This + * command allows the owner of the device to create an additional vhost_worker + * for the device. It can later be bound to 1 or more of its virtqueues using + * the VHOST_ATTACH_VRING_WORKER command. + * + * This must be called after VHOST_SET_OWNER and the caller must be the owner + * of the device. The new thread will inherit caller's cgroups and namespaces, + * and will share the caller's memory space. The new thread will also be + * counted against the caller's RLIMIT_NPROC value. + * + * The worker's ID used in other commands will be returned in + * vhost_worker_state. + */ +#define VHOST_NEW_WORKER _IOR(VHOST_VIRTIO, 0x8, struct vhost_worker_state) +/* Free a worker created with VHOST_NEW_WORKER if it's not attached to any + * virtqueue. If userspace is not able to call this for workers it created, + * the kernel will free all the device's workers when the device is closed. + */ +#define VHOST_FREE_WORKER _IOW(VHOST_VIRTIO, 0x9, struct vhost_worker_state) + +/* Ring setup. */ +/* Set number of descriptors in ring. This parameter can not + * be modified while ring is running (bound to a device). */ +#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state) +/* Set addresses for the ring. 
*/ +#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr) +/* Base value where queue looks for available descriptors */ +#define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_state) +/* Get accessor: reads index, writes value in num */ +#define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x12, struct vhost_vring_state) + +/* Set the vring byte order in num. Valid values are VHOST_VRING_LITTLE_ENDIAN + * or VHOST_VRING_BIG_ENDIAN (other values return -EINVAL). + * The byte order cannot be changed while the device is active: trying to do so + * returns -EBUSY. + * This is a legacy only API that is simply ignored when VIRTIO_F_VERSION_1 is + * set. + * Not all kernel configurations support this ioctl, but all configurations that + * support SET also support GET. + */ +#define VHOST_VRING_LITTLE_ENDIAN 0 +#define VHOST_VRING_BIG_ENDIAN 1 +#define VHOST_SET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_state) +#define VHOST_GET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state) +/* Attach a vhost_worker created with VHOST_NEW_WORKER to one of the device's + * virtqueues. + * + * This will replace the virtqueue's existing worker. If the replaced worker + * is no longer attached to any virtqueues, it can be freed with + * VHOST_FREE_WORKER. + */ +#define VHOST_ATTACH_VRING_WORKER _IOW(VHOST_VIRTIO, 0x15, \ + struct vhost_vring_worker) +/* Return the vring worker's ID */ +#define VHOST_GET_VRING_WORKER _IOWR(VHOST_VIRTIO, 0x16, \ + struct vhost_vring_worker) + +/* The following ioctls use eventfd file descriptors to signal and poll + * for events. 
*/ + +/* Set eventfd to poll for added buffers */ +#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file) +/* Set eventfd to signal when buffers have been used */ +#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file) +/* Set eventfd to signal an error */ +#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file) +/* Set busy loop timeout (in us) */ +#define VHOST_SET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x23, \ + struct vhost_vring_state) +/* Get busy loop timeout (in us) */ +#define VHOST_GET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x24, \ + struct vhost_vring_state) + +/* Set or get vhost backend capability */ + +#define VHOST_SET_BACKEND_FEATURES _IOW(VHOST_VIRTIO, 0x25, __u64) +#define VHOST_GET_BACKEND_FEATURES _IOR(VHOST_VIRTIO, 0x26, __u64) + +/* VHOST_NET specific defines */ + +/* Attach virtio net ring to a raw socket, or tap device. + * The socket must be already bound to an ethernet device, this device will be + * used for transmit. Pass fd -1 to unbind from the socket and the transmit + * device. This can be used to stop the ring (e.g. for migration). */ +#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file) + +/* VHOST_SCSI specific defines */ + +#define VHOST_SCSI_SET_ENDPOINT _IOW(VHOST_VIRTIO, 0x40, struct vhost_scsi_target) +#define VHOST_SCSI_CLEAR_ENDPOINT _IOW(VHOST_VIRTIO, 0x41, struct vhost_scsi_target) +/* Changing this breaks userspace. */ +#define VHOST_SCSI_GET_ABI_VERSION _IOW(VHOST_VIRTIO, 0x42, int) +/* Set and get the events missed flag */ +#define VHOST_SCSI_SET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x43, __u32) +#define VHOST_SCSI_GET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x44, __u32) + +/* VHOST_VSOCK specific defines */ + +#define VHOST_VSOCK_SET_GUEST_CID _IOW(VHOST_VIRTIO, 0x60, __u64) +#define VHOST_VSOCK_SET_RUNNING _IOW(VHOST_VIRTIO, 0x61, int) + +/* VHOST_VDPA specific defines */ + +/* Get the device id. 
The device ids follow the same definition of + * the device id defined in virtio-spec. + */ +#define VHOST_VDPA_GET_DEVICE_ID _IOR(VHOST_VIRTIO, 0x70, __u32) +/* Get and set the status. The status bits follow the same definition + * of the device status defined in virtio-spec. + */ +#define VHOST_VDPA_GET_STATUS _IOR(VHOST_VIRTIO, 0x71, __u8) +#define VHOST_VDPA_SET_STATUS _IOW(VHOST_VIRTIO, 0x72, __u8) +/* Get and set the device config. The device config follows the same + * definition of the device config defined in virtio-spec. + */ +#define VHOST_VDPA_GET_CONFIG _IOR(VHOST_VIRTIO, 0x73, \ + struct vhost_vdpa_config) +#define VHOST_VDPA_SET_CONFIG _IOW(VHOST_VIRTIO, 0x74, \ + struct vhost_vdpa_config) +/* Enable/disable the ring. */ +#define VHOST_VDPA_SET_VRING_ENABLE _IOW(VHOST_VIRTIO, 0x75, \ + struct vhost_vring_state) +/* Get the max ring size. */ +#define VHOST_VDPA_GET_VRING_NUM _IOR(VHOST_VIRTIO, 0x76, __u16) + +/* Set event fd for config interrupt*/ +#define VHOST_VDPA_SET_CONFIG_CALL _IOW(VHOST_VIRTIO, 0x77, int) + +/* Get the valid iova range */ +#define VHOST_VDPA_GET_IOVA_RANGE _IOR(VHOST_VIRTIO, 0x78, \ + struct vhost_vdpa_iova_range) +/* Get the config size */ +#define VHOST_VDPA_GET_CONFIG_SIZE _IOR(VHOST_VIRTIO, 0x79, __u32) + +/* Get the number of address spaces. */ +#define VHOST_VDPA_GET_AS_NUM _IOR(VHOST_VIRTIO, 0x7A, unsigned int) + +/* Get the group for a virtqueue: read index, write group in num, + * The virtqueue index is stored in the index field of + * vhost_vring_state. The group for this specific virtqueue is + * returned via num field of vhost_vring_state. + */ +#define VHOST_VDPA_GET_VRING_GROUP _IOWR(VHOST_VIRTIO, 0x7B, \ + struct vhost_vring_state) +/* Set the ASID for a virtqueue group. The group index is stored in + * the index field of vhost_vring_state, the ASID associated with this + * group is stored at num field of vhost_vring_state. 
+ */ +#define VHOST_VDPA_SET_GROUP_ASID _IOW(VHOST_VIRTIO, 0x7C, \ + struct vhost_vring_state) + +/* Suspend a device so it does not process virtqueue requests anymore + * + * After the return of ioctl the device must preserve all the necessary state + * (the virtqueue vring base plus the possible device specific states) that is + * required for restoring in the future. The device must not change its + * configuration after that point. + */ +#define VHOST_VDPA_SUSPEND _IO(VHOST_VIRTIO, 0x7D) + +/* Resume a device so it can resume processing virtqueue requests + * + * After the return of this ioctl the device will have restored all the + * necessary states and it is fully operational to continue processing the + * virtqueue descriptors. + */ +#define VHOST_VDPA_RESUME _IO(VHOST_VIRTIO, 0x7E) + +/* Get the group for the descriptor table including driver & device areas + * of a virtqueue: read index, write group in num. + * The virtqueue index is stored in the index field of vhost_vring_state. + * The group ID of the descriptor table for this specific virtqueue + * is returned via num field of vhost_vring_state. + */ +#define VHOST_VDPA_GET_VRING_DESC_GROUP _IOWR(VHOST_VIRTIO, 0x7F, \ + struct vhost_vring_state) + + +/* Get the count of all virtqueues */ +#define VHOST_VDPA_GET_VQS_COUNT _IOR(VHOST_VIRTIO, 0x80, __u32) + +/* Get the number of virtqueue groups. */ +#define VHOST_VDPA_GET_GROUP_NUM _IOR(VHOST_VIRTIO, 0x81, __u32) + +/* Get the queue size of a specific virtqueue. 
+ * userspace set the vring index in vhost_vring_state.index + * kernel set the queue size in vhost_vring_state.num + */ +#define VHOST_VDPA_GET_VRING_SIZE _IOWR(VHOST_VIRTIO, 0x82, \ + struct vhost_vring_state) +#endif diff --git a/tools/perf/trace/beauty/include/uapi/sound/asound.h b/tools/perf/trace/beauty/include/uapi/sound/asound.h new file mode 100644 index 0000000000..628d46a0da --- /dev/null +++ b/tools/perf/trace/beauty/include/uapi/sound/asound.h @@ -0,0 +1,1252 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * Advanced Linux Sound Architecture - ALSA - Driver + * Copyright (c) 1994-2003 by Jaroslav Kysela , + * Abramo Bagnara + */ + +#ifndef _UAPI__SOUND_ASOUND_H +#define _UAPI__SOUND_ASOUND_H + +#if defined(__KERNEL__) || defined(__linux__) +#include +#include +#else +#include +#include +#endif + +#ifndef __KERNEL__ +#include +#include +#endif + +/* + * protocol version + */ + +#define SNDRV_PROTOCOL_VERSION(major, minor, subminor) (((major)<<16)|((minor)<<8)|(subminor)) +#define SNDRV_PROTOCOL_MAJOR(version) (((version)>>16)&0xffff) +#define SNDRV_PROTOCOL_MINOR(version) (((version)>>8)&0xff) +#define SNDRV_PROTOCOL_MICRO(version) ((version)&0xff) +#define SNDRV_PROTOCOL_INCOMPATIBLE(kversion, uversion) \ + (SNDRV_PROTOCOL_MAJOR(kversion) != SNDRV_PROTOCOL_MAJOR(uversion) || \ + (SNDRV_PROTOCOL_MAJOR(kversion) == SNDRV_PROTOCOL_MAJOR(uversion) && \ + SNDRV_PROTOCOL_MINOR(kversion) != SNDRV_PROTOCOL_MINOR(uversion))) + +/**************************************************************************** + * * + * Digital audio interface * + * * + ****************************************************************************/ + +#define AES_IEC958_STATUS_SIZE 24 + +struct snd_aes_iec958 { + unsigned char status[AES_IEC958_STATUS_SIZE]; /* AES/IEC958 channel status bits */ + unsigned char subcode[147]; /* AES/IEC958 subcode bits */ + unsigned char pad; /* nothing */ + unsigned char dig_subframe[4]; /* AES/IEC958 subframe bits */ +}; + 
+/**************************************************************************** + * * + * CEA-861 Audio InfoFrame. Used in HDMI and DisplayPort * + * * + ****************************************************************************/ + +struct snd_cea_861_aud_if { + unsigned char db1_ct_cc; /* coding type and channel count */ + unsigned char db2_sf_ss; /* sample frequency and size */ + unsigned char db3; /* not used, all zeros */ + unsigned char db4_ca; /* channel allocation code */ + unsigned char db5_dminh_lsv; /* downmix inhibit & level-shift values */ +}; + +/**************************************************************************** + * * + * Section for driver hardware dependent interface - /dev/snd/hw? * + * * + ****************************************************************************/ + +#define SNDRV_HWDEP_VERSION SNDRV_PROTOCOL_VERSION(1, 0, 1) + +enum { + SNDRV_HWDEP_IFACE_OPL2 = 0, + SNDRV_HWDEP_IFACE_OPL3, + SNDRV_HWDEP_IFACE_OPL4, + SNDRV_HWDEP_IFACE_SB16CSP, /* Creative Signal Processor */ + SNDRV_HWDEP_IFACE_EMU10K1, /* FX8010 processor in EMU10K1 chip */ + SNDRV_HWDEP_IFACE_YSS225, /* Yamaha FX processor */ + SNDRV_HWDEP_IFACE_ICS2115, /* Wavetable synth */ + SNDRV_HWDEP_IFACE_SSCAPE, /* Ensoniq SoundScape ISA card (MC68EC000) */ + SNDRV_HWDEP_IFACE_VX, /* Digigram VX cards */ + SNDRV_HWDEP_IFACE_MIXART, /* Digigram miXart cards */ + SNDRV_HWDEP_IFACE_USX2Y, /* Tascam US122, US224 & US428 usb */ + SNDRV_HWDEP_IFACE_EMUX_WAVETABLE, /* EmuX wavetable */ + SNDRV_HWDEP_IFACE_BLUETOOTH, /* Bluetooth audio */ + SNDRV_HWDEP_IFACE_USX2Y_PCM, /* Tascam US122, US224 & US428 rawusb pcm */ + SNDRV_HWDEP_IFACE_PCXHR, /* Digigram PCXHR */ + SNDRV_HWDEP_IFACE_SB_RC, /* SB Extigy/Audigy2NX remote control */ + SNDRV_HWDEP_IFACE_HDA, /* HD-audio */ + SNDRV_HWDEP_IFACE_USB_STREAM, /* direct access to usb stream */ + SNDRV_HWDEP_IFACE_FW_DICE, /* TC DICE FireWire device */ + SNDRV_HWDEP_IFACE_FW_FIREWORKS, /* Echo Audio Fireworks based device */ + 
SNDRV_HWDEP_IFACE_FW_BEBOB, /* BridgeCo BeBoB based device */ + SNDRV_HWDEP_IFACE_FW_OXFW, /* Oxford OXFW970/971 based device */ + SNDRV_HWDEP_IFACE_FW_DIGI00X, /* Digidesign Digi 002/003 family */ + SNDRV_HWDEP_IFACE_FW_TASCAM, /* TASCAM FireWire series */ + SNDRV_HWDEP_IFACE_LINE6, /* Line6 USB processors */ + SNDRV_HWDEP_IFACE_FW_MOTU, /* MOTU FireWire series */ + SNDRV_HWDEP_IFACE_FW_FIREFACE, /* RME Fireface series */ + + /* Don't forget to change the following: */ + SNDRV_HWDEP_IFACE_LAST = SNDRV_HWDEP_IFACE_FW_FIREFACE +}; + +struct snd_hwdep_info { + unsigned int device; /* WR: device number */ + int card; /* R: card number */ + unsigned char id[64]; /* ID (user selectable) */ + unsigned char name[80]; /* hwdep name */ + int iface; /* hwdep interface */ + unsigned char reserved[64]; /* reserved for future */ +}; + +/* generic DSP loader */ +struct snd_hwdep_dsp_status { + unsigned int version; /* R: driver-specific version */ + unsigned char id[32]; /* R: driver-specific ID string */ + unsigned int num_dsps; /* R: number of DSP images to transfer */ + unsigned int dsp_loaded; /* R: bit flags indicating the loaded DSPs */ + unsigned int chip_ready; /* R: 1 = initialization finished */ + unsigned char reserved[16]; /* reserved for future use */ +}; + +struct snd_hwdep_dsp_image { + unsigned int index; /* W: DSP index */ + unsigned char name[64]; /* W: ID (e.g. 
file name) */ + unsigned char __user *image; /* W: binary image */ + size_t length; /* W: size of image in bytes */ + unsigned long driver_data; /* W: driver-specific data */ +}; + +#define SNDRV_HWDEP_IOCTL_PVERSION _IOR ('H', 0x00, int) +#define SNDRV_HWDEP_IOCTL_INFO _IOR ('H', 0x01, struct snd_hwdep_info) +#define SNDRV_HWDEP_IOCTL_DSP_STATUS _IOR('H', 0x02, struct snd_hwdep_dsp_status) +#define SNDRV_HWDEP_IOCTL_DSP_LOAD _IOW('H', 0x03, struct snd_hwdep_dsp_image) + +/***************************************************************************** + * * + * Digital Audio (PCM) interface - /dev/snd/pcm?? * + * * + *****************************************************************************/ + +#define SNDRV_PCM_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 17) + +typedef unsigned long snd_pcm_uframes_t; +typedef signed long snd_pcm_sframes_t; + +enum { + SNDRV_PCM_CLASS_GENERIC = 0, /* standard mono or stereo device */ + SNDRV_PCM_CLASS_MULTI, /* multichannel device */ + SNDRV_PCM_CLASS_MODEM, /* software modem class */ + SNDRV_PCM_CLASS_DIGITIZER, /* digitizer class */ + /* Don't forget to change the following: */ + SNDRV_PCM_CLASS_LAST = SNDRV_PCM_CLASS_DIGITIZER, +}; + +enum { + SNDRV_PCM_SUBCLASS_GENERIC_MIX = 0, /* mono or stereo subdevices are mixed together */ + SNDRV_PCM_SUBCLASS_MULTI_MIX, /* multichannel subdevices are mixed together */ + /* Don't forget to change the following: */ + SNDRV_PCM_SUBCLASS_LAST = SNDRV_PCM_SUBCLASS_MULTI_MIX, +}; + +enum { + SNDRV_PCM_STREAM_PLAYBACK = 0, + SNDRV_PCM_STREAM_CAPTURE, + SNDRV_PCM_STREAM_LAST = SNDRV_PCM_STREAM_CAPTURE, +}; + +typedef int __bitwise snd_pcm_access_t; +#define SNDRV_PCM_ACCESS_MMAP_INTERLEAVED ((__force snd_pcm_access_t) 0) /* interleaved mmap */ +#define SNDRV_PCM_ACCESS_MMAP_NONINTERLEAVED ((__force snd_pcm_access_t) 1) /* noninterleaved mmap */ +#define SNDRV_PCM_ACCESS_MMAP_COMPLEX ((__force snd_pcm_access_t) 2) /* complex mmap */ +#define SNDRV_PCM_ACCESS_RW_INTERLEAVED ((__force snd_pcm_access_t) 
3) /* readi/writei */ +#define SNDRV_PCM_ACCESS_RW_NONINTERLEAVED ((__force snd_pcm_access_t) 4) /* readn/writen */ +#define SNDRV_PCM_ACCESS_LAST SNDRV_PCM_ACCESS_RW_NONINTERLEAVED + +typedef int __bitwise snd_pcm_format_t; +#define SNDRV_PCM_FORMAT_S8 ((__force snd_pcm_format_t) 0) +#define SNDRV_PCM_FORMAT_U8 ((__force snd_pcm_format_t) 1) +#define SNDRV_PCM_FORMAT_S16_LE ((__force snd_pcm_format_t) 2) +#define SNDRV_PCM_FORMAT_S16_BE ((__force snd_pcm_format_t) 3) +#define SNDRV_PCM_FORMAT_U16_LE ((__force snd_pcm_format_t) 4) +#define SNDRV_PCM_FORMAT_U16_BE ((__force snd_pcm_format_t) 5) +#define SNDRV_PCM_FORMAT_S24_LE ((__force snd_pcm_format_t) 6) /* low three bytes */ +#define SNDRV_PCM_FORMAT_S24_BE ((__force snd_pcm_format_t) 7) /* low three bytes */ +#define SNDRV_PCM_FORMAT_U24_LE ((__force snd_pcm_format_t) 8) /* low three bytes */ +#define SNDRV_PCM_FORMAT_U24_BE ((__force snd_pcm_format_t) 9) /* low three bytes */ +/* + * For S32/U32 formats, 'msbits' hardware parameter is often used to deliver information about the + * available bit count in most significant bit. It's for the case of so-called 'left-justified' or + * `right-padding` sample which has less width than 32 bit. 
+ */ +#define SNDRV_PCM_FORMAT_S32_LE ((__force snd_pcm_format_t) 10) +#define SNDRV_PCM_FORMAT_S32_BE ((__force snd_pcm_format_t) 11) +#define SNDRV_PCM_FORMAT_U32_LE ((__force snd_pcm_format_t) 12) +#define SNDRV_PCM_FORMAT_U32_BE ((__force snd_pcm_format_t) 13) +#define SNDRV_PCM_FORMAT_FLOAT_LE ((__force snd_pcm_format_t) 14) /* 4-byte float, IEEE-754 32-bit, range -1.0 to 1.0 */ +#define SNDRV_PCM_FORMAT_FLOAT_BE ((__force snd_pcm_format_t) 15) /* 4-byte float, IEEE-754 32-bit, range -1.0 to 1.0 */ +#define SNDRV_PCM_FORMAT_FLOAT64_LE ((__force snd_pcm_format_t) 16) /* 8-byte float, IEEE-754 64-bit, range -1.0 to 1.0 */ +#define SNDRV_PCM_FORMAT_FLOAT64_BE ((__force snd_pcm_format_t) 17) /* 8-byte float, IEEE-754 64-bit, range -1.0 to 1.0 */ +#define SNDRV_PCM_FORMAT_IEC958_SUBFRAME_LE ((__force snd_pcm_format_t) 18) /* IEC-958 subframe, Little Endian */ +#define SNDRV_PCM_FORMAT_IEC958_SUBFRAME_BE ((__force snd_pcm_format_t) 19) /* IEC-958 subframe, Big Endian */ +#define SNDRV_PCM_FORMAT_MU_LAW ((__force snd_pcm_format_t) 20) +#define SNDRV_PCM_FORMAT_A_LAW ((__force snd_pcm_format_t) 21) +#define SNDRV_PCM_FORMAT_IMA_ADPCM ((__force snd_pcm_format_t) 22) +#define SNDRV_PCM_FORMAT_MPEG ((__force snd_pcm_format_t) 23) +#define SNDRV_PCM_FORMAT_GSM ((__force snd_pcm_format_t) 24) +#define SNDRV_PCM_FORMAT_S20_LE ((__force snd_pcm_format_t) 25) /* in four bytes, LSB justified */ +#define SNDRV_PCM_FORMAT_S20_BE ((__force snd_pcm_format_t) 26) /* in four bytes, LSB justified */ +#define SNDRV_PCM_FORMAT_U20_LE ((__force snd_pcm_format_t) 27) /* in four bytes, LSB justified */ +#define SNDRV_PCM_FORMAT_U20_BE ((__force snd_pcm_format_t) 28) /* in four bytes, LSB justified */ +/* gap in the numbering for a future standard linear format */ +#define SNDRV_PCM_FORMAT_SPECIAL ((__force snd_pcm_format_t) 31) +#define SNDRV_PCM_FORMAT_S24_3LE ((__force snd_pcm_format_t) 32) /* in three bytes */ +#define SNDRV_PCM_FORMAT_S24_3BE ((__force snd_pcm_format_t) 33) /* in 
three bytes */ +#define SNDRV_PCM_FORMAT_U24_3LE ((__force snd_pcm_format_t) 34) /* in three bytes */ +#define SNDRV_PCM_FORMAT_U24_3BE ((__force snd_pcm_format_t) 35) /* in three bytes */ +#define SNDRV_PCM_FORMAT_S20_3LE ((__force snd_pcm_format_t) 36) /* in three bytes */ +#define SNDRV_PCM_FORMAT_S20_3BE ((__force snd_pcm_format_t) 37) /* in three bytes */ +#define SNDRV_PCM_FORMAT_U20_3LE ((__force snd_pcm_format_t) 38) /* in three bytes */ +#define SNDRV_PCM_FORMAT_U20_3BE ((__force snd_pcm_format_t) 39) /* in three bytes */ +#define SNDRV_PCM_FORMAT_S18_3LE ((__force snd_pcm_format_t) 40) /* in three bytes */ +#define SNDRV_PCM_FORMAT_S18_3BE ((__force snd_pcm_format_t) 41) /* in three bytes */ +#define SNDRV_PCM_FORMAT_U18_3LE ((__force snd_pcm_format_t) 42) /* in three bytes */ +#define SNDRV_PCM_FORMAT_U18_3BE ((__force snd_pcm_format_t) 43) /* in three bytes */ +#define SNDRV_PCM_FORMAT_G723_24 ((__force snd_pcm_format_t) 44) /* 8 samples in 3 bytes */ +#define SNDRV_PCM_FORMAT_G723_24_1B ((__force snd_pcm_format_t) 45) /* 1 sample in 1 byte */ +#define SNDRV_PCM_FORMAT_G723_40 ((__force snd_pcm_format_t) 46) /* 8 Samples in 5 bytes */ +#define SNDRV_PCM_FORMAT_G723_40_1B ((__force snd_pcm_format_t) 47) /* 1 sample in 1 byte */ +#define SNDRV_PCM_FORMAT_DSD_U8 ((__force snd_pcm_format_t) 48) /* DSD, 1-byte samples DSD (x8) */ +#define SNDRV_PCM_FORMAT_DSD_U16_LE ((__force snd_pcm_format_t) 49) /* DSD, 2-byte samples DSD (x16), little endian */ +#define SNDRV_PCM_FORMAT_DSD_U32_LE ((__force snd_pcm_format_t) 50) /* DSD, 4-byte samples DSD (x32), little endian */ +#define SNDRV_PCM_FORMAT_DSD_U16_BE ((__force snd_pcm_format_t) 51) /* DSD, 2-byte samples DSD (x16), big endian */ +#define SNDRV_PCM_FORMAT_DSD_U32_BE ((__force snd_pcm_format_t) 52) /* DSD, 4-byte samples DSD (x32), big endian */ +#define SNDRV_PCM_FORMAT_LAST SNDRV_PCM_FORMAT_DSD_U32_BE +#define SNDRV_PCM_FORMAT_FIRST SNDRV_PCM_FORMAT_S8 + +#ifdef SNDRV_LITTLE_ENDIAN +#define 
SNDRV_PCM_FORMAT_S16 SNDRV_PCM_FORMAT_S16_LE +#define SNDRV_PCM_FORMAT_U16 SNDRV_PCM_FORMAT_U16_LE +#define SNDRV_PCM_FORMAT_S24 SNDRV_PCM_FORMAT_S24_LE +#define SNDRV_PCM_FORMAT_U24 SNDRV_PCM_FORMAT_U24_LE +#define SNDRV_PCM_FORMAT_S32 SNDRV_PCM_FORMAT_S32_LE +#define SNDRV_PCM_FORMAT_U32 SNDRV_PCM_FORMAT_U32_LE +#define SNDRV_PCM_FORMAT_FLOAT SNDRV_PCM_FORMAT_FLOAT_LE +#define SNDRV_PCM_FORMAT_FLOAT64 SNDRV_PCM_FORMAT_FLOAT64_LE +#define SNDRV_PCM_FORMAT_IEC958_SUBFRAME SNDRV_PCM_FORMAT_IEC958_SUBFRAME_LE +#define SNDRV_PCM_FORMAT_S20 SNDRV_PCM_FORMAT_S20_LE +#define SNDRV_PCM_FORMAT_U20 SNDRV_PCM_FORMAT_U20_LE +#endif +#ifdef SNDRV_BIG_ENDIAN +#define SNDRV_PCM_FORMAT_S16 SNDRV_PCM_FORMAT_S16_BE +#define SNDRV_PCM_FORMAT_U16 SNDRV_PCM_FORMAT_U16_BE +#define SNDRV_PCM_FORMAT_S24 SNDRV_PCM_FORMAT_S24_BE +#define SNDRV_PCM_FORMAT_U24 SNDRV_PCM_FORMAT_U24_BE +#define SNDRV_PCM_FORMAT_S32 SNDRV_PCM_FORMAT_S32_BE +#define SNDRV_PCM_FORMAT_U32 SNDRV_PCM_FORMAT_U32_BE +#define SNDRV_PCM_FORMAT_FLOAT SNDRV_PCM_FORMAT_FLOAT_BE +#define SNDRV_PCM_FORMAT_FLOAT64 SNDRV_PCM_FORMAT_FLOAT64_BE +#define SNDRV_PCM_FORMAT_IEC958_SUBFRAME SNDRV_PCM_FORMAT_IEC958_SUBFRAME_BE +#define SNDRV_PCM_FORMAT_S20 SNDRV_PCM_FORMAT_S20_BE +#define SNDRV_PCM_FORMAT_U20 SNDRV_PCM_FORMAT_U20_BE +#endif + +typedef int __bitwise snd_pcm_subformat_t; +#define SNDRV_PCM_SUBFORMAT_STD ((__force snd_pcm_subformat_t) 0) +#define SNDRV_PCM_SUBFORMAT_MSBITS_MAX ((__force snd_pcm_subformat_t) 1) +#define SNDRV_PCM_SUBFORMAT_MSBITS_20 ((__force snd_pcm_subformat_t) 2) +#define SNDRV_PCM_SUBFORMAT_MSBITS_24 ((__force snd_pcm_subformat_t) 3) +#define SNDRV_PCM_SUBFORMAT_LAST SNDRV_PCM_SUBFORMAT_MSBITS_24 + +#define SNDRV_PCM_INFO_MMAP 0x00000001 /* hardware supports mmap */ +#define SNDRV_PCM_INFO_MMAP_VALID 0x00000002 /* period data are valid during transfer */ +#define SNDRV_PCM_INFO_DOUBLE 0x00000004 /* Double buffering needed for PCM start/stop */ +#define SNDRV_PCM_INFO_BATCH 0x00000010 /* double 
buffering */ +#define SNDRV_PCM_INFO_SYNC_APPLPTR 0x00000020 /* need the explicit sync of appl_ptr update */ +#define SNDRV_PCM_INFO_PERFECT_DRAIN 0x00000040 /* silencing at the end of stream is not required */ +#define SNDRV_PCM_INFO_INTERLEAVED 0x00000100 /* channels are interleaved */ +#define SNDRV_PCM_INFO_NONINTERLEAVED 0x00000200 /* channels are not interleaved */ +#define SNDRV_PCM_INFO_COMPLEX 0x00000400 /* complex frame organization (mmap only) */ +#define SNDRV_PCM_INFO_BLOCK_TRANSFER 0x00010000 /* hardware transfer block of samples */ +#define SNDRV_PCM_INFO_OVERRANGE 0x00020000 /* hardware supports ADC (capture) overrange detection */ +#define SNDRV_PCM_INFO_RESUME 0x00040000 /* hardware supports stream resume after suspend */ +#define SNDRV_PCM_INFO_PAUSE 0x00080000 /* pause ioctl is supported */ +#define SNDRV_PCM_INFO_HALF_DUPLEX 0x00100000 /* only half duplex */ +#define SNDRV_PCM_INFO_JOINT_DUPLEX 0x00200000 /* playback and capture stream are somewhat correlated */ +#define SNDRV_PCM_INFO_SYNC_START 0x00400000 /* pcm support some kind of sync go */ +#define SNDRV_PCM_INFO_NO_PERIOD_WAKEUP 0x00800000 /* period wakeup can be disabled */ +#define SNDRV_PCM_INFO_HAS_WALL_CLOCK 0x01000000 /* (Deprecated)has audio wall clock for audio/system time sync */ +#define SNDRV_PCM_INFO_HAS_LINK_ATIME 0x01000000 /* report hardware link audio time, reset on startup */ +#define SNDRV_PCM_INFO_HAS_LINK_ABSOLUTE_ATIME 0x02000000 /* report absolute hardware link audio time, not reset on startup */ +#define SNDRV_PCM_INFO_HAS_LINK_ESTIMATED_ATIME 0x04000000 /* report estimated link audio time */ +#define SNDRV_PCM_INFO_HAS_LINK_SYNCHRONIZED_ATIME 0x08000000 /* report synchronized audio/system time */ +#define SNDRV_PCM_INFO_EXPLICIT_SYNC 0x10000000 /* needs explicit sync of pointers and data */ +#define SNDRV_PCM_INFO_NO_REWINDS 0x20000000 /* hardware can only support monotonic changes of appl_ptr */ +#define SNDRV_PCM_INFO_DRAIN_TRIGGER 0x40000000 /* internal kernel 
flag - trigger in drain */ +#define SNDRV_PCM_INFO_FIFO_IN_FRAMES 0x80000000 /* internal kernel flag - FIFO size is in frames */ + +#if (__BITS_PER_LONG == 32 && defined(__USE_TIME_BITS64)) || defined __KERNEL__ +#define __SND_STRUCT_TIME64 +#endif + +typedef int __bitwise snd_pcm_state_t; +#define SNDRV_PCM_STATE_OPEN ((__force snd_pcm_state_t) 0) /* stream is open */ +#define SNDRV_PCM_STATE_SETUP ((__force snd_pcm_state_t) 1) /* stream has a setup */ +#define SNDRV_PCM_STATE_PREPARED ((__force snd_pcm_state_t) 2) /* stream is ready to start */ +#define SNDRV_PCM_STATE_RUNNING ((__force snd_pcm_state_t) 3) /* stream is running */ +#define SNDRV_PCM_STATE_XRUN ((__force snd_pcm_state_t) 4) /* stream reached an xrun */ +#define SNDRV_PCM_STATE_DRAINING ((__force snd_pcm_state_t) 5) /* stream is draining */ +#define SNDRV_PCM_STATE_PAUSED ((__force snd_pcm_state_t) 6) /* stream is paused */ +#define SNDRV_PCM_STATE_SUSPENDED ((__force snd_pcm_state_t) 7) /* hardware is suspended */ +#define SNDRV_PCM_STATE_DISCONNECTED ((__force snd_pcm_state_t) 8) /* hardware is disconnected */ +#define SNDRV_PCM_STATE_LAST SNDRV_PCM_STATE_DISCONNECTED + +enum { + SNDRV_PCM_MMAP_OFFSET_DATA = 0x00000000, + SNDRV_PCM_MMAP_OFFSET_STATUS_OLD = 0x80000000, + SNDRV_PCM_MMAP_OFFSET_CONTROL_OLD = 0x81000000, + SNDRV_PCM_MMAP_OFFSET_STATUS_NEW = 0x82000000, + SNDRV_PCM_MMAP_OFFSET_CONTROL_NEW = 0x83000000, +#ifdef __SND_STRUCT_TIME64 + SNDRV_PCM_MMAP_OFFSET_STATUS = SNDRV_PCM_MMAP_OFFSET_STATUS_NEW, + SNDRV_PCM_MMAP_OFFSET_CONTROL = SNDRV_PCM_MMAP_OFFSET_CONTROL_NEW, +#else + SNDRV_PCM_MMAP_OFFSET_STATUS = SNDRV_PCM_MMAP_OFFSET_STATUS_OLD, + SNDRV_PCM_MMAP_OFFSET_CONTROL = SNDRV_PCM_MMAP_OFFSET_CONTROL_OLD, +#endif +}; + +union snd_pcm_sync_id { + unsigned char id[16]; + unsigned short id16[8]; + unsigned int id32[4]; +}; + +struct snd_pcm_info { + unsigned int device; /* RO/WR (control): device number */ + unsigned int subdevice; /* RO/WR (control): subdevice number */ + int stream; /* 
RO/WR (control): stream direction */ + int card; /* R: card number */ + unsigned char id[64]; /* ID (user selectable) */ + unsigned char name[80]; /* name of this device */ + unsigned char subname[32]; /* subdevice name */ + int dev_class; /* SNDRV_PCM_CLASS_* */ + int dev_subclass; /* SNDRV_PCM_SUBCLASS_* */ + unsigned int subdevices_count; + unsigned int subdevices_avail; + union snd_pcm_sync_id sync; /* hardware synchronization ID */ + unsigned char reserved[64]; /* reserved for future... */ +}; + +typedef int snd_pcm_hw_param_t; +#define SNDRV_PCM_HW_PARAM_ACCESS 0 /* Access type */ +#define SNDRV_PCM_HW_PARAM_FORMAT 1 /* Format */ +#define SNDRV_PCM_HW_PARAM_SUBFORMAT 2 /* Subformat */ +#define SNDRV_PCM_HW_PARAM_FIRST_MASK SNDRV_PCM_HW_PARAM_ACCESS +#define SNDRV_PCM_HW_PARAM_LAST_MASK SNDRV_PCM_HW_PARAM_SUBFORMAT + +#define SNDRV_PCM_HW_PARAM_SAMPLE_BITS 8 /* Bits per sample */ +#define SNDRV_PCM_HW_PARAM_FRAME_BITS 9 /* Bits per frame */ +#define SNDRV_PCM_HW_PARAM_CHANNELS 10 /* Channels */ +#define SNDRV_PCM_HW_PARAM_RATE 11 /* Approx rate */ +#define SNDRV_PCM_HW_PARAM_PERIOD_TIME 12 /* Approx distance between + * interrupts in us + */ +#define SNDRV_PCM_HW_PARAM_PERIOD_SIZE 13 /* Approx frames between + * interrupts + */ +#define SNDRV_PCM_HW_PARAM_PERIOD_BYTES 14 /* Approx bytes between + * interrupts + */ +#define SNDRV_PCM_HW_PARAM_PERIODS 15 /* Approx interrupts per + * buffer + */ +#define SNDRV_PCM_HW_PARAM_BUFFER_TIME 16 /* Approx duration of buffer + * in us + */ +#define SNDRV_PCM_HW_PARAM_BUFFER_SIZE 17 /* Size of buffer in frames */ +#define SNDRV_PCM_HW_PARAM_BUFFER_BYTES 18 /* Size of buffer in bytes */ +#define SNDRV_PCM_HW_PARAM_TICK_TIME 19 /* Approx tick duration in us */ +#define SNDRV_PCM_HW_PARAM_FIRST_INTERVAL SNDRV_PCM_HW_PARAM_SAMPLE_BITS +#define SNDRV_PCM_HW_PARAM_LAST_INTERVAL SNDRV_PCM_HW_PARAM_TICK_TIME + +#define SNDRV_PCM_HW_PARAMS_NORESAMPLE (1<<0) /* avoid rate resampling */ +#define SNDRV_PCM_HW_PARAMS_EXPORT_BUFFER 
(1<<1) /* export buffer */ +#define SNDRV_PCM_HW_PARAMS_NO_PERIOD_WAKEUP (1<<2) /* disable period wakeups */ +#define SNDRV_PCM_HW_PARAMS_NO_DRAIN_SILENCE (1<<3) /* suppress drain with the filling + * of the silence samples + */ + +struct snd_interval { + unsigned int min, max; + unsigned int openmin:1, + openmax:1, + integer:1, + empty:1; +}; + +#define SNDRV_MASK_MAX 256 + +struct snd_mask { + __u32 bits[(SNDRV_MASK_MAX+31)/32]; +}; + +struct snd_pcm_hw_params { + unsigned int flags; + struct snd_mask masks[SNDRV_PCM_HW_PARAM_LAST_MASK - + SNDRV_PCM_HW_PARAM_FIRST_MASK + 1]; + struct snd_mask mres[5]; /* reserved masks */ + struct snd_interval intervals[SNDRV_PCM_HW_PARAM_LAST_INTERVAL - + SNDRV_PCM_HW_PARAM_FIRST_INTERVAL + 1]; + struct snd_interval ires[9]; /* reserved intervals */ + unsigned int rmask; /* W: requested masks */ + unsigned int cmask; /* R: changed masks */ + unsigned int info; /* R: Info flags for returned setup */ + unsigned int msbits; /* R: used most significant bits (in sample bit-width) */ + unsigned int rate_num; /* R: rate numerator */ + unsigned int rate_den; /* R: rate denominator */ + snd_pcm_uframes_t fifo_size; /* R: chip FIFO size in frames */ + unsigned char reserved[64]; /* reserved for future */ +}; + +enum { + SNDRV_PCM_TSTAMP_NONE = 0, + SNDRV_PCM_TSTAMP_ENABLE, + SNDRV_PCM_TSTAMP_LAST = SNDRV_PCM_TSTAMP_ENABLE, +}; + +struct snd_pcm_sw_params { + int tstamp_mode; /* timestamp mode */ + unsigned int period_step; + unsigned int sleep_min; /* min ticks to sleep */ + snd_pcm_uframes_t avail_min; /* min avail frames for wakeup */ + snd_pcm_uframes_t xfer_align; /* obsolete: xfer size need to be a multiple */ + snd_pcm_uframes_t start_threshold; /* min hw_avail frames for automatic start */ + /* + * The following two thresholds alleviate playback buffer underruns; when + * hw_avail drops below the threshold, the respective action is triggered: + */ + snd_pcm_uframes_t stop_threshold; /* - stop playback */ + snd_pcm_uframes_t 
silence_threshold; /* - pre-fill buffer with silence */ + snd_pcm_uframes_t silence_size; /* max size of silence pre-fill; when >= boundary, + * fill played area with silence immediately */ + snd_pcm_uframes_t boundary; /* pointers wrap point */ + unsigned int proto; /* protocol version */ + unsigned int tstamp_type; /* timestamp type (req. proto >= 2.0.12) */ + unsigned char reserved[56]; /* reserved for future */ +}; + +struct snd_pcm_channel_info { + unsigned int channel; + __kernel_off_t offset; /* mmap offset */ + unsigned int first; /* offset to first sample in bits */ + unsigned int step; /* samples distance in bits */ +}; + +enum { + /* + * first definition for backwards compatibility only, + * maps to wallclock/link time for HDAudio playback and DEFAULT/DMA time for everything else + */ + SNDRV_PCM_AUDIO_TSTAMP_TYPE_COMPAT = 0, + + /* timestamp definitions */ + SNDRV_PCM_AUDIO_TSTAMP_TYPE_DEFAULT = 1, /* DMA time, reported as per hw_ptr */ + SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK = 2, /* link time reported by sample or wallclock counter, reset on startup */ + SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK_ABSOLUTE = 3, /* link time reported by sample or wallclock counter, not reset on startup */ + SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK_ESTIMATED = 4, /* link time estimated indirectly */ + SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK_SYNCHRONIZED = 5, /* link time synchronized with system time */ + SNDRV_PCM_AUDIO_TSTAMP_TYPE_LAST = SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK_SYNCHRONIZED +}; + +#ifndef __KERNEL__ +/* explicit padding avoids incompatibility between i386 and x86-64 */ +typedef struct { unsigned char pad[sizeof(time_t) - sizeof(int)]; } __time_pad; + +struct snd_pcm_status { + snd_pcm_state_t state; /* stream state */ + __time_pad pad1; /* align to timespec */ + struct timespec trigger_tstamp; /* time when stream was started/stopped/paused */ + struct timespec tstamp; /* reference timestamp */ + snd_pcm_uframes_t appl_ptr; /* appl ptr */ + snd_pcm_uframes_t hw_ptr; /* hw ptr */ + 
snd_pcm_sframes_t delay; /* current delay in frames */ + snd_pcm_uframes_t avail; /* number of frames available */ + snd_pcm_uframes_t avail_max; /* max frames available on hw since last status */ + snd_pcm_uframes_t overrange; /* count of ADC (capture) overrange detections from last status */ + snd_pcm_state_t suspended_state; /* suspended stream state */ + __u32 audio_tstamp_data; /* needed for 64-bit alignment, used for configs/report to/from userspace */ + struct timespec audio_tstamp; /* sample counter, wall clock, PHC or on-demand sync'ed */ + struct timespec driver_tstamp; /* useful in case reference system tstamp is reported with delay */ + __u32 audio_tstamp_accuracy; /* in ns units, only valid if indicated in audio_tstamp_data */ + unsigned char reserved[52-2*sizeof(struct timespec)]; /* must be filled with zero */ +}; +#endif + +/* + * For mmap operations, we need the 64-bit layout, both for compat mode, + * and for y2038 compatibility. For 64-bit applications, the two definitions + * are identical, so we keep the traditional version. 
+ */ +#ifdef __SND_STRUCT_TIME64 +#define __snd_pcm_mmap_status64 snd_pcm_mmap_status +#define __snd_pcm_mmap_control64 snd_pcm_mmap_control +#define __snd_pcm_sync_ptr64 snd_pcm_sync_ptr +#ifdef __KERNEL__ +#define __snd_timespec64 __kernel_timespec +#else +#define __snd_timespec64 timespec +#endif +struct __snd_timespec { + __s32 tv_sec; + __s32 tv_nsec; +}; +#else +#define __snd_pcm_mmap_status snd_pcm_mmap_status +#define __snd_pcm_mmap_control snd_pcm_mmap_control +#define __snd_pcm_sync_ptr snd_pcm_sync_ptr +#define __snd_timespec timespec +struct __snd_timespec64 { + __s64 tv_sec; + __s64 tv_nsec; +}; + +#endif + +struct __snd_pcm_mmap_status { + snd_pcm_state_t state; /* RO: state - SNDRV_PCM_STATE_XXXX */ + int pad1; /* Needed for 64 bit alignment */ + snd_pcm_uframes_t hw_ptr; /* RO: hw ptr (0...boundary-1) */ + struct __snd_timespec tstamp; /* Timestamp */ + snd_pcm_state_t suspended_state; /* RO: suspended stream state */ + struct __snd_timespec audio_tstamp; /* from sample counter or wall clock */ +}; + +struct __snd_pcm_mmap_control { + snd_pcm_uframes_t appl_ptr; /* RW: appl ptr (0...boundary-1) */ + snd_pcm_uframes_t avail_min; /* RW: min available frames for wakeup */ +}; + +#define SNDRV_PCM_SYNC_PTR_HWSYNC (1<<0) /* execute hwsync */ +#define SNDRV_PCM_SYNC_PTR_APPL (1<<1) /* get appl_ptr from driver (r/w op) */ +#define SNDRV_PCM_SYNC_PTR_AVAIL_MIN (1<<2) /* get avail_min from driver */ + +struct __snd_pcm_sync_ptr { + unsigned int flags; + union { + struct __snd_pcm_mmap_status status; + unsigned char reserved[64]; + } s; + union { + struct __snd_pcm_mmap_control control; + unsigned char reserved[64]; + } c; +}; + +#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __BIG_ENDIAN : defined(__BIG_ENDIAN) +typedef char __pad_before_uframe[sizeof(__u64) - sizeof(snd_pcm_uframes_t)]; +typedef char __pad_after_uframe[0]; +#endif + +#if defined(__BYTE_ORDER) ? 
__BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN) +typedef char __pad_before_uframe[0]; +typedef char __pad_after_uframe[sizeof(__u64) - sizeof(snd_pcm_uframes_t)]; +#endif + +struct __snd_pcm_mmap_status64 { + snd_pcm_state_t state; /* RO: state - SNDRV_PCM_STATE_XXXX */ + __u32 pad1; /* Needed for 64 bit alignment */ + __pad_before_uframe __pad1; + snd_pcm_uframes_t hw_ptr; /* RO: hw ptr (0...boundary-1) */ + __pad_after_uframe __pad2; + struct __snd_timespec64 tstamp; /* Timestamp */ + snd_pcm_state_t suspended_state;/* RO: suspended stream state */ + __u32 pad3; /* Needed for 64 bit alignment */ + struct __snd_timespec64 audio_tstamp; /* sample counter or wall clock */ +}; + +struct __snd_pcm_mmap_control64 { + __pad_before_uframe __pad1; + snd_pcm_uframes_t appl_ptr; /* RW: appl ptr (0...boundary-1) */ + __pad_before_uframe __pad2; // This should be __pad_after_uframe, but binary + // backwards compatibility constraints prevent a fix. + + __pad_before_uframe __pad3; + snd_pcm_uframes_t avail_min; /* RW: min available frames for wakeup */ + __pad_after_uframe __pad4; +}; + +struct __snd_pcm_sync_ptr64 { + __u32 flags; + __u32 pad1; + union { + struct __snd_pcm_mmap_status64 status; + unsigned char reserved[64]; + } s; + union { + struct __snd_pcm_mmap_control64 control; + unsigned char reserved[64]; + } c; +}; + +struct snd_xferi { + snd_pcm_sframes_t result; + void __user *buf; + snd_pcm_uframes_t frames; +}; + +struct snd_xfern { + snd_pcm_sframes_t result; + void __user * __user *bufs; + snd_pcm_uframes_t frames; +}; + +enum { + SNDRV_PCM_TSTAMP_TYPE_GETTIMEOFDAY = 0, /* gettimeofday equivalent */ + SNDRV_PCM_TSTAMP_TYPE_MONOTONIC, /* posix_clock_monotonic equivalent */ + SNDRV_PCM_TSTAMP_TYPE_MONOTONIC_RAW, /* monotonic_raw (no NTP) */ + SNDRV_PCM_TSTAMP_TYPE_LAST = SNDRV_PCM_TSTAMP_TYPE_MONOTONIC_RAW, +}; + +/* channel positions */ +enum { + SNDRV_CHMAP_UNKNOWN = 0, + SNDRV_CHMAP_NA, /* N/A, silent */ + SNDRV_CHMAP_MONO, /* mono stream */ + /* 
this follows the alsa-lib mixer channel value + 3 */ + SNDRV_CHMAP_FL, /* front left */ + SNDRV_CHMAP_FR, /* front right */ + SNDRV_CHMAP_RL, /* rear left */ + SNDRV_CHMAP_RR, /* rear right */ + SNDRV_CHMAP_FC, /* front center */ + SNDRV_CHMAP_LFE, /* LFE */ + SNDRV_CHMAP_SL, /* side left */ + SNDRV_CHMAP_SR, /* side right */ + SNDRV_CHMAP_RC, /* rear center */ + /* new definitions */ + SNDRV_CHMAP_FLC, /* front left center */ + SNDRV_CHMAP_FRC, /* front right center */ + SNDRV_CHMAP_RLC, /* rear left center */ + SNDRV_CHMAP_RRC, /* rear right center */ + SNDRV_CHMAP_FLW, /* front left wide */ + SNDRV_CHMAP_FRW, /* front right wide */ + SNDRV_CHMAP_FLH, /* front left high */ + SNDRV_CHMAP_FCH, /* front center high */ + SNDRV_CHMAP_FRH, /* front right high */ + SNDRV_CHMAP_TC, /* top center */ + SNDRV_CHMAP_TFL, /* top front left */ + SNDRV_CHMAP_TFR, /* top front right */ + SNDRV_CHMAP_TFC, /* top front center */ + SNDRV_CHMAP_TRL, /* top rear left */ + SNDRV_CHMAP_TRR, /* top rear right */ + SNDRV_CHMAP_TRC, /* top rear center */ + /* new definitions for UAC2 */ + SNDRV_CHMAP_TFLC, /* top front left center */ + SNDRV_CHMAP_TFRC, /* top front right center */ + SNDRV_CHMAP_TSL, /* top side left */ + SNDRV_CHMAP_TSR, /* top side right */ + SNDRV_CHMAP_LLFE, /* left LFE */ + SNDRV_CHMAP_RLFE, /* right LFE */ + SNDRV_CHMAP_BC, /* bottom center */ + SNDRV_CHMAP_BLC, /* bottom left center */ + SNDRV_CHMAP_BRC, /* bottom right center */ + SNDRV_CHMAP_LAST = SNDRV_CHMAP_BRC, +}; + +#define SNDRV_CHMAP_POSITION_MASK 0xffff +#define SNDRV_CHMAP_PHASE_INVERSE (0x01 << 16) +#define SNDRV_CHMAP_DRIVER_SPEC (0x02 << 16) + +#define SNDRV_PCM_IOCTL_PVERSION _IOR('A', 0x00, int) +#define SNDRV_PCM_IOCTL_INFO _IOR('A', 0x01, struct snd_pcm_info) +#define SNDRV_PCM_IOCTL_TSTAMP _IOW('A', 0x02, int) +#define SNDRV_PCM_IOCTL_TTSTAMP _IOW('A', 0x03, int) +#define SNDRV_PCM_IOCTL_USER_PVERSION _IOW('A', 0x04, int) +#define SNDRV_PCM_IOCTL_HW_REFINE _IOWR('A', 0x10, struct 
snd_pcm_hw_params) +#define SNDRV_PCM_IOCTL_HW_PARAMS _IOWR('A', 0x11, struct snd_pcm_hw_params) +#define SNDRV_PCM_IOCTL_HW_FREE _IO('A', 0x12) +#define SNDRV_PCM_IOCTL_SW_PARAMS _IOWR('A', 0x13, struct snd_pcm_sw_params) +#define SNDRV_PCM_IOCTL_STATUS _IOR('A', 0x20, struct snd_pcm_status) +#define SNDRV_PCM_IOCTL_DELAY _IOR('A', 0x21, snd_pcm_sframes_t) +#define SNDRV_PCM_IOCTL_HWSYNC _IO('A', 0x22) +#define __SNDRV_PCM_IOCTL_SYNC_PTR _IOWR('A', 0x23, struct __snd_pcm_sync_ptr) +#define __SNDRV_PCM_IOCTL_SYNC_PTR64 _IOWR('A', 0x23, struct __snd_pcm_sync_ptr64) +#define SNDRV_PCM_IOCTL_SYNC_PTR _IOWR('A', 0x23, struct snd_pcm_sync_ptr) +#define SNDRV_PCM_IOCTL_STATUS_EXT _IOWR('A', 0x24, struct snd_pcm_status) +#define SNDRV_PCM_IOCTL_CHANNEL_INFO _IOR('A', 0x32, struct snd_pcm_channel_info) +#define SNDRV_PCM_IOCTL_PREPARE _IO('A', 0x40) +#define SNDRV_PCM_IOCTL_RESET _IO('A', 0x41) +#define SNDRV_PCM_IOCTL_START _IO('A', 0x42) +#define SNDRV_PCM_IOCTL_DROP _IO('A', 0x43) +#define SNDRV_PCM_IOCTL_DRAIN _IO('A', 0x44) +#define SNDRV_PCM_IOCTL_PAUSE _IOW('A', 0x45, int) +#define SNDRV_PCM_IOCTL_REWIND _IOW('A', 0x46, snd_pcm_uframes_t) +#define SNDRV_PCM_IOCTL_RESUME _IO('A', 0x47) +#define SNDRV_PCM_IOCTL_XRUN _IO('A', 0x48) +#define SNDRV_PCM_IOCTL_FORWARD _IOW('A', 0x49, snd_pcm_uframes_t) +#define SNDRV_PCM_IOCTL_WRITEI_FRAMES _IOW('A', 0x50, struct snd_xferi) +#define SNDRV_PCM_IOCTL_READI_FRAMES _IOR('A', 0x51, struct snd_xferi) +#define SNDRV_PCM_IOCTL_WRITEN_FRAMES _IOW('A', 0x52, struct snd_xfern) +#define SNDRV_PCM_IOCTL_READN_FRAMES _IOR('A', 0x53, struct snd_xfern) +#define SNDRV_PCM_IOCTL_LINK _IOW('A', 0x60, int) +#define SNDRV_PCM_IOCTL_UNLINK _IO('A', 0x61) + +/***************************************************************************** + * * + * MIDI v1.0 interface * + * * + *****************************************************************************/ + +/* + * Raw MIDI section - /dev/snd/midi?? 
+ */ + +#define SNDRV_RAWMIDI_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 4) + +enum { + SNDRV_RAWMIDI_STREAM_OUTPUT = 0, + SNDRV_RAWMIDI_STREAM_INPUT, + SNDRV_RAWMIDI_STREAM_LAST = SNDRV_RAWMIDI_STREAM_INPUT, +}; + +#define SNDRV_RAWMIDI_INFO_OUTPUT 0x00000001 +#define SNDRV_RAWMIDI_INFO_INPUT 0x00000002 +#define SNDRV_RAWMIDI_INFO_DUPLEX 0x00000004 +#define SNDRV_RAWMIDI_INFO_UMP 0x00000008 + +struct snd_rawmidi_info { + unsigned int device; /* RO/WR (control): device number */ + unsigned int subdevice; /* RO/WR (control): subdevice number */ + int stream; /* WR: stream */ + int card; /* R: card number */ + unsigned int flags; /* SNDRV_RAWMIDI_INFO_XXXX */ + unsigned char id[64]; /* ID (user selectable) */ + unsigned char name[80]; /* name of device */ + unsigned char subname[32]; /* name of active or selected subdevice */ + unsigned int subdevices_count; + unsigned int subdevices_avail; + unsigned char reserved[64]; /* reserved for future use */ +}; + +#define SNDRV_RAWMIDI_MODE_FRAMING_MASK (7<<0) +#define SNDRV_RAWMIDI_MODE_FRAMING_SHIFT 0 +#define SNDRV_RAWMIDI_MODE_FRAMING_NONE (0<<0) +#define SNDRV_RAWMIDI_MODE_FRAMING_TSTAMP (1<<0) +#define SNDRV_RAWMIDI_MODE_CLOCK_MASK (7<<3) +#define SNDRV_RAWMIDI_MODE_CLOCK_SHIFT 3 +#define SNDRV_RAWMIDI_MODE_CLOCK_NONE (0<<3) +#define SNDRV_RAWMIDI_MODE_CLOCK_REALTIME (1<<3) +#define SNDRV_RAWMIDI_MODE_CLOCK_MONOTONIC (2<<3) +#define SNDRV_RAWMIDI_MODE_CLOCK_MONOTONIC_RAW (3<<3) + +#define SNDRV_RAWMIDI_FRAMING_DATA_LENGTH 16 + +struct snd_rawmidi_framing_tstamp { + /* For now, frame_type is always 0. Midi 2.0 is expected to add new + * types here. Applications are expected to skip unknown frame types. 
+ */ + __u8 frame_type; + __u8 length; /* number of valid bytes in data field */ + __u8 reserved[2]; + __u32 tv_nsec; /* nanoseconds */ + __u64 tv_sec; /* seconds */ + __u8 data[SNDRV_RAWMIDI_FRAMING_DATA_LENGTH]; +} __packed; + +struct snd_rawmidi_params { + int stream; + size_t buffer_size; /* queue size in bytes */ + size_t avail_min; /* minimum avail bytes for wakeup */ + unsigned int no_active_sensing: 1; /* do not send active sensing byte in close() */ + unsigned int mode; /* For input data only, frame incoming data */ + unsigned char reserved[12]; /* reserved for future use */ +}; + +#ifndef __KERNEL__ +struct snd_rawmidi_status { + int stream; + __time_pad pad1; + struct timespec tstamp; /* Timestamp */ + size_t avail; /* available bytes */ + size_t xruns; /* count of overruns since last status (in bytes) */ + unsigned char reserved[16]; /* reserved for future use */ +}; +#endif + +/* UMP EP info flags */ +#define SNDRV_UMP_EP_INFO_STATIC_BLOCKS 0x01 + +/* UMP EP Protocol / JRTS capability bits */ +#define SNDRV_UMP_EP_INFO_PROTO_MIDI_MASK 0x0300 +#define SNDRV_UMP_EP_INFO_PROTO_MIDI1 0x0100 /* MIDI 1.0 */ +#define SNDRV_UMP_EP_INFO_PROTO_MIDI2 0x0200 /* MIDI 2.0 */ +#define SNDRV_UMP_EP_INFO_PROTO_JRTS_MASK 0x0003 +#define SNDRV_UMP_EP_INFO_PROTO_JRTS_TX 0x0001 /* JRTS Transmit */ +#define SNDRV_UMP_EP_INFO_PROTO_JRTS_RX 0x0002 /* JRTS Receive */ + +/* UMP Endpoint information */ +struct snd_ump_endpoint_info { + int card; /* card number */ + int device; /* device number */ + unsigned int flags; /* additional info */ + unsigned int protocol_caps; /* protocol capabilities */ + unsigned int protocol; /* current protocol */ + unsigned int num_blocks; /* # of function blocks */ + unsigned short version; /* UMP major/minor version */ + unsigned short family_id; /* MIDI device family ID */ + unsigned short model_id; /* MIDI family model ID */ + unsigned int manufacturer_id; /* MIDI manufacturer ID */ + unsigned char sw_revision[4]; /* software revision */ + 
unsigned short padding; + unsigned char name[128]; /* endpoint name string */ + unsigned char product_id[128]; /* unique product id string */ + unsigned char reserved[32]; +} __packed; + +/* UMP direction */ +#define SNDRV_UMP_DIR_INPUT 0x01 +#define SNDRV_UMP_DIR_OUTPUT 0x02 +#define SNDRV_UMP_DIR_BIDIRECTION 0x03 + +/* UMP block info flags */ +#define SNDRV_UMP_BLOCK_IS_MIDI1 (1U << 0) /* MIDI 1.0 port w/o restrict */ +#define SNDRV_UMP_BLOCK_IS_LOWSPEED (1U << 1) /* 31.25Kbps B/W MIDI1 port */ + +/* UMP block user-interface hint */ +#define SNDRV_UMP_BLOCK_UI_HINT_UNKNOWN 0x00 +#define SNDRV_UMP_BLOCK_UI_HINT_RECEIVER 0x01 +#define SNDRV_UMP_BLOCK_UI_HINT_SENDER 0x02 +#define SNDRV_UMP_BLOCK_UI_HINT_BOTH 0x03 + +/* UMP groups and blocks */ +#define SNDRV_UMP_MAX_GROUPS 16 +#define SNDRV_UMP_MAX_BLOCKS 32 + +/* UMP Block information */ +struct snd_ump_block_info { + int card; /* card number */ + int device; /* device number */ + unsigned char block_id; /* block ID (R/W) */ + unsigned char direction; /* UMP direction */ + unsigned char active; /* Activeness */ + unsigned char first_group; /* first group ID */ + unsigned char num_groups; /* number of groups */ + unsigned char midi_ci_version; /* MIDI-CI support version */ + unsigned char sysex8_streams; /* max number of sysex8 streams */ + unsigned char ui_hint; /* user interface hint */ + unsigned int flags; /* various info flags */ + unsigned char name[128]; /* block name string */ + unsigned char reserved[32]; +} __packed; + +#define SNDRV_RAWMIDI_IOCTL_PVERSION _IOR('W', 0x00, int) +#define SNDRV_RAWMIDI_IOCTL_INFO _IOR('W', 0x01, struct snd_rawmidi_info) +#define SNDRV_RAWMIDI_IOCTL_USER_PVERSION _IOW('W', 0x02, int) +#define SNDRV_RAWMIDI_IOCTL_PARAMS _IOWR('W', 0x10, struct snd_rawmidi_params) +#define SNDRV_RAWMIDI_IOCTL_STATUS _IOWR('W', 0x20, struct snd_rawmidi_status) +#define SNDRV_RAWMIDI_IOCTL_DROP _IOW('W', 0x30, int) +#define SNDRV_RAWMIDI_IOCTL_DRAIN _IOW('W', 0x31, int) +/* Additional ioctls for 
UMP rawmidi devices */ +#define SNDRV_UMP_IOCTL_ENDPOINT_INFO _IOR('W', 0x40, struct snd_ump_endpoint_info) +#define SNDRV_UMP_IOCTL_BLOCK_INFO _IOR('W', 0x41, struct snd_ump_block_info) + +/* + * Timer section - /dev/snd/timer + */ + +#define SNDRV_TIMER_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 7) + +enum { + SNDRV_TIMER_CLASS_NONE = -1, + SNDRV_TIMER_CLASS_SLAVE = 0, + SNDRV_TIMER_CLASS_GLOBAL, + SNDRV_TIMER_CLASS_CARD, + SNDRV_TIMER_CLASS_PCM, + SNDRV_TIMER_CLASS_LAST = SNDRV_TIMER_CLASS_PCM, +}; + +/* slave timer classes */ +enum { + SNDRV_TIMER_SCLASS_NONE = 0, + SNDRV_TIMER_SCLASS_APPLICATION, + SNDRV_TIMER_SCLASS_SEQUENCER, /* alias */ + SNDRV_TIMER_SCLASS_OSS_SEQUENCER, /* alias */ + SNDRV_TIMER_SCLASS_LAST = SNDRV_TIMER_SCLASS_OSS_SEQUENCER, +}; + +/* global timers (device member) */ +#define SNDRV_TIMER_GLOBAL_SYSTEM 0 +#define SNDRV_TIMER_GLOBAL_RTC 1 /* unused */ +#define SNDRV_TIMER_GLOBAL_HPET 2 +#define SNDRV_TIMER_GLOBAL_HRTIMER 3 + +/* info flags */ +#define SNDRV_TIMER_FLG_SLAVE (1<<0) /* cannot be controlled */ + +struct snd_timer_id { + int dev_class; + int dev_sclass; + int card; + int device; + int subdevice; +}; + +struct snd_timer_ginfo { + struct snd_timer_id tid; /* requested timer ID */ + unsigned int flags; /* timer flags - SNDRV_TIMER_FLG_* */ + int card; /* card number */ + unsigned char id[64]; /* timer identification */ + unsigned char name[80]; /* timer name */ + unsigned long reserved0; /* reserved for future use */ + unsigned long resolution; /* average period resolution in ns */ + unsigned long resolution_min; /* minimal period resolution in ns */ + unsigned long resolution_max; /* maximal period resolution in ns */ + unsigned int clients; /* active timer clients */ + unsigned char reserved[32]; +}; + +struct snd_timer_gparams { + struct snd_timer_id tid; /* requested timer ID */ + unsigned long period_num; /* requested precise period duration (in seconds) - numerator */ + unsigned long period_den; /* requested precise period 
duration (in seconds) - denominator */ + unsigned char reserved[32]; +}; + +struct snd_timer_gstatus { + struct snd_timer_id tid; /* requested timer ID */ + unsigned long resolution; /* current period resolution in ns */ + unsigned long resolution_num; /* precise current period resolution (in seconds) - numerator */ + unsigned long resolution_den; /* precise current period resolution (in seconds) - denominator */ + unsigned char reserved[32]; +}; + +struct snd_timer_select { + struct snd_timer_id id; /* bind to timer ID */ + unsigned char reserved[32]; /* reserved */ +}; + +struct snd_timer_info { + unsigned int flags; /* timer flags - SNDRV_TIMER_FLG_* */ + int card; /* card number */ + unsigned char id[64]; /* timer identificator */ + unsigned char name[80]; /* timer name */ + unsigned long reserved0; /* reserved for future use */ + unsigned long resolution; /* average period resolution in ns */ + unsigned char reserved[64]; /* reserved */ +}; + +#define SNDRV_TIMER_PSFLG_AUTO (1<<0) /* auto start, otherwise one-shot */ +#define SNDRV_TIMER_PSFLG_EXCLUSIVE (1<<1) /* exclusive use, precise start/stop/pause/continue */ +#define SNDRV_TIMER_PSFLG_EARLY_EVENT (1<<2) /* write early event to the poll queue */ + +struct snd_timer_params { + unsigned int flags; /* flags - SNDRV_TIMER_PSFLG_* */ + unsigned int ticks; /* requested resolution in ticks */ + unsigned int queue_size; /* total size of queue (32-1024) */ + unsigned int reserved0; /* reserved, was: failure locations */ + unsigned int filter; /* event filter (bitmask of SNDRV_TIMER_EVENT_*) */ + unsigned char reserved[60]; /* reserved */ +}; + +#ifndef __KERNEL__ +struct snd_timer_status { + struct timespec tstamp; /* Timestamp - last update */ + unsigned int resolution; /* current period resolution in ns */ + unsigned int lost; /* counter of master tick lost */ + unsigned int overrun; /* count of read queue overruns */ + unsigned int queue; /* used queue size */ + unsigned char reserved[64]; /* reserved */ +}; 
+#endif + +#define SNDRV_TIMER_IOCTL_PVERSION _IOR('T', 0x00, int) +#define SNDRV_TIMER_IOCTL_NEXT_DEVICE _IOWR('T', 0x01, struct snd_timer_id) +#define SNDRV_TIMER_IOCTL_TREAD_OLD _IOW('T', 0x02, int) +#define SNDRV_TIMER_IOCTL_GINFO _IOWR('T', 0x03, struct snd_timer_ginfo) +#define SNDRV_TIMER_IOCTL_GPARAMS _IOW('T', 0x04, struct snd_timer_gparams) +#define SNDRV_TIMER_IOCTL_GSTATUS _IOWR('T', 0x05, struct snd_timer_gstatus) +#define SNDRV_TIMER_IOCTL_SELECT _IOW('T', 0x10, struct snd_timer_select) +#define SNDRV_TIMER_IOCTL_INFO _IOR('T', 0x11, struct snd_timer_info) +#define SNDRV_TIMER_IOCTL_PARAMS _IOW('T', 0x12, struct snd_timer_params) +#define SNDRV_TIMER_IOCTL_STATUS _IOR('T', 0x14, struct snd_timer_status) +/* The following four ioctls are changed since 1.0.9 due to confliction */ +#define SNDRV_TIMER_IOCTL_START _IO('T', 0xa0) +#define SNDRV_TIMER_IOCTL_STOP _IO('T', 0xa1) +#define SNDRV_TIMER_IOCTL_CONTINUE _IO('T', 0xa2) +#define SNDRV_TIMER_IOCTL_PAUSE _IO('T', 0xa3) +#define SNDRV_TIMER_IOCTL_TREAD64 _IOW('T', 0xa4, int) + +#if __BITS_PER_LONG == 64 +#define SNDRV_TIMER_IOCTL_TREAD SNDRV_TIMER_IOCTL_TREAD_OLD +#else +#define SNDRV_TIMER_IOCTL_TREAD ((sizeof(__kernel_long_t) >= sizeof(time_t)) ? 
\ + SNDRV_TIMER_IOCTL_TREAD_OLD : \ + SNDRV_TIMER_IOCTL_TREAD64) +#endif + +struct snd_timer_read { + unsigned int resolution; + unsigned int ticks; +}; + +enum { + SNDRV_TIMER_EVENT_RESOLUTION = 0, /* val = resolution in ns */ + SNDRV_TIMER_EVENT_TICK, /* val = ticks */ + SNDRV_TIMER_EVENT_START, /* val = resolution in ns */ + SNDRV_TIMER_EVENT_STOP, /* val = 0 */ + SNDRV_TIMER_EVENT_CONTINUE, /* val = resolution in ns */ + SNDRV_TIMER_EVENT_PAUSE, /* val = 0 */ + SNDRV_TIMER_EVENT_EARLY, /* val = 0, early event */ + SNDRV_TIMER_EVENT_SUSPEND, /* val = 0 */ + SNDRV_TIMER_EVENT_RESUME, /* val = resolution in ns */ + /* master timer events for slave timer instances */ + SNDRV_TIMER_EVENT_MSTART = SNDRV_TIMER_EVENT_START + 10, + SNDRV_TIMER_EVENT_MSTOP = SNDRV_TIMER_EVENT_STOP + 10, + SNDRV_TIMER_EVENT_MCONTINUE = SNDRV_TIMER_EVENT_CONTINUE + 10, + SNDRV_TIMER_EVENT_MPAUSE = SNDRV_TIMER_EVENT_PAUSE + 10, + SNDRV_TIMER_EVENT_MSUSPEND = SNDRV_TIMER_EVENT_SUSPEND + 10, + SNDRV_TIMER_EVENT_MRESUME = SNDRV_TIMER_EVENT_RESUME + 10, +}; + +#ifndef __KERNEL__ +struct snd_timer_tread { + int event; + __time_pad pad1; + struct timespec tstamp; + unsigned int val; + __time_pad pad2; +}; +#endif + +/**************************************************************************** + * * + * Section for driver control interface - /dev/snd/control? 
* + * * + ****************************************************************************/ + +#define SNDRV_CTL_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 9) + +struct snd_ctl_card_info { + int card; /* card number */ + int pad; /* reserved for future (was type) */ + unsigned char id[16]; /* ID of card (user selectable) */ + unsigned char driver[16]; /* Driver name */ + unsigned char name[32]; /* Short name of soundcard */ + unsigned char longname[80]; /* name + info text about soundcard */ + unsigned char reserved_[16]; /* reserved for future (was ID of mixer) */ + unsigned char mixername[80]; /* visual mixer identification */ + unsigned char components[128]; /* card components / fine identification, delimited with one space (AC97 etc..) */ +}; + +typedef int __bitwise snd_ctl_elem_type_t; +#define SNDRV_CTL_ELEM_TYPE_NONE ((__force snd_ctl_elem_type_t) 0) /* invalid */ +#define SNDRV_CTL_ELEM_TYPE_BOOLEAN ((__force snd_ctl_elem_type_t) 1) /* boolean type */ +#define SNDRV_CTL_ELEM_TYPE_INTEGER ((__force snd_ctl_elem_type_t) 2) /* integer type */ +#define SNDRV_CTL_ELEM_TYPE_ENUMERATED ((__force snd_ctl_elem_type_t) 3) /* enumerated type */ +#define SNDRV_CTL_ELEM_TYPE_BYTES ((__force snd_ctl_elem_type_t) 4) /* byte array */ +#define SNDRV_CTL_ELEM_TYPE_IEC958 ((__force snd_ctl_elem_type_t) 5) /* IEC958 (S/PDIF) setup */ +#define SNDRV_CTL_ELEM_TYPE_INTEGER64 ((__force snd_ctl_elem_type_t) 6) /* 64-bit integer type */ +#define SNDRV_CTL_ELEM_TYPE_LAST SNDRV_CTL_ELEM_TYPE_INTEGER64 + +typedef int __bitwise snd_ctl_elem_iface_t; +#define SNDRV_CTL_ELEM_IFACE_CARD ((__force snd_ctl_elem_iface_t) 0) /* global control */ +#define SNDRV_CTL_ELEM_IFACE_HWDEP ((__force snd_ctl_elem_iface_t) 1) /* hardware dependent device */ +#define SNDRV_CTL_ELEM_IFACE_MIXER ((__force snd_ctl_elem_iface_t) 2) /* virtual mixer device */ +#define SNDRV_CTL_ELEM_IFACE_PCM ((__force snd_ctl_elem_iface_t) 3) /* PCM device */ +#define SNDRV_CTL_ELEM_IFACE_RAWMIDI ((__force snd_ctl_elem_iface_t) 4) /* 
RawMidi device */ +#define SNDRV_CTL_ELEM_IFACE_TIMER ((__force snd_ctl_elem_iface_t) 5) /* timer device */ +#define SNDRV_CTL_ELEM_IFACE_SEQUENCER ((__force snd_ctl_elem_iface_t) 6) /* sequencer client */ +#define SNDRV_CTL_ELEM_IFACE_LAST SNDRV_CTL_ELEM_IFACE_SEQUENCER + +#define SNDRV_CTL_ELEM_ACCESS_READ (1<<0) +#define SNDRV_CTL_ELEM_ACCESS_WRITE (1<<1) +#define SNDRV_CTL_ELEM_ACCESS_READWRITE (SNDRV_CTL_ELEM_ACCESS_READ|SNDRV_CTL_ELEM_ACCESS_WRITE) +#define SNDRV_CTL_ELEM_ACCESS_VOLATILE (1<<2) /* control value may be changed without a notification */ +/* (1 << 3) is unused. */ +#define SNDRV_CTL_ELEM_ACCESS_TLV_READ (1<<4) /* TLV read is possible */ +#define SNDRV_CTL_ELEM_ACCESS_TLV_WRITE (1<<5) /* TLV write is possible */ +#define SNDRV_CTL_ELEM_ACCESS_TLV_READWRITE (SNDRV_CTL_ELEM_ACCESS_TLV_READ|SNDRV_CTL_ELEM_ACCESS_TLV_WRITE) +#define SNDRV_CTL_ELEM_ACCESS_TLV_COMMAND (1<<6) /* TLV command is possible */ +#define SNDRV_CTL_ELEM_ACCESS_INACTIVE (1<<8) /* control does actually nothing, but may be updated */ +#define SNDRV_CTL_ELEM_ACCESS_LOCK (1<<9) /* write lock */ +#define SNDRV_CTL_ELEM_ACCESS_OWNER (1<<10) /* write lock owner */ +#define SNDRV_CTL_ELEM_ACCESS_TLV_CALLBACK (1<<28) /* kernel use a TLV callback */ +#define SNDRV_CTL_ELEM_ACCESS_USER (1<<29) /* user space element */ +/* bits 30 and 31 are obsoleted (for indirect access) */ + +/* for further details see the ACPI and PCI power management specification */ +#define SNDRV_CTL_POWER_D0 0x0000 /* full On */ +#define SNDRV_CTL_POWER_D1 0x0100 /* partial On */ +#define SNDRV_CTL_POWER_D2 0x0200 /* partial On */ +#define SNDRV_CTL_POWER_D3 0x0300 /* Off */ +#define SNDRV_CTL_POWER_D3hot (SNDRV_CTL_POWER_D3|0x0000) /* Off, with power */ +#define SNDRV_CTL_POWER_D3cold (SNDRV_CTL_POWER_D3|0x0001) /* Off, without power */ + +#define SNDRV_CTL_ELEM_ID_NAME_MAXLEN 44 + +struct snd_ctl_elem_id { + unsigned int numid; /* numeric identifier, zero = invalid */ + snd_ctl_elem_iface_t iface; /* interface 
identifier */ + unsigned int device; /* device/client number */ + unsigned int subdevice; /* subdevice (substream) number */ + unsigned char name[SNDRV_CTL_ELEM_ID_NAME_MAXLEN]; /* ASCII name of item */ + unsigned int index; /* index of item */ +}; + +struct snd_ctl_elem_list { + unsigned int offset; /* W: first element ID to get */ + unsigned int space; /* W: count of element IDs to get */ + unsigned int used; /* R: count of element IDs set */ + unsigned int count; /* R: count of all elements */ + struct snd_ctl_elem_id __user *pids; /* R: IDs */ + unsigned char reserved[50]; +}; + +struct snd_ctl_elem_info { + struct snd_ctl_elem_id id; /* W: element ID */ + snd_ctl_elem_type_t type; /* R: value type - SNDRV_CTL_ELEM_TYPE_* */ + unsigned int access; /* R: value access (bitmask) - SNDRV_CTL_ELEM_ACCESS_* */ + unsigned int count; /* count of values */ + __kernel_pid_t owner; /* owner's PID of this control */ + union { + struct { + long min; /* R: minimum value */ + long max; /* R: maximum value */ + long step; /* R: step (0 variable) */ + } integer; + struct { + long long min; /* R: minimum value */ + long long max; /* R: maximum value */ + long long step; /* R: step (0 variable) */ + } integer64; + struct { + unsigned int items; /* R: number of items */ + unsigned int item; /* W: item number */ + char name[64]; /* R: value name */ + __u64 names_ptr; /* W: names list (ELEM_ADD only) */ + unsigned int names_length; + } enumerated; + unsigned char reserved[128]; + } value; + unsigned char reserved[64]; +}; + +struct snd_ctl_elem_value { + struct snd_ctl_elem_id id; /* W: element ID */ + unsigned int indirect: 1; /* W: indirect access - obsoleted */ + union { + union { + long value[128]; + long *value_ptr; /* obsoleted */ + } integer; + union { + long long value[64]; + long long *value_ptr; /* obsoleted */ + } integer64; + union { + unsigned int item[128]; + unsigned int *item_ptr; /* obsoleted */ + } enumerated; + union { + unsigned char data[512]; + unsigned char 
*data_ptr; /* obsoleted */ + } bytes; + struct snd_aes_iec958 iec958; + } value; /* RO */ + unsigned char reserved[128]; +}; + +struct snd_ctl_tlv { + unsigned int numid; /* control element numeric identification */ + unsigned int length; /* in bytes aligned to 4 */ + unsigned int tlv[]; /* first TLV */ +}; + +#define SNDRV_CTL_IOCTL_PVERSION _IOR('U', 0x00, int) +#define SNDRV_CTL_IOCTL_CARD_INFO _IOR('U', 0x01, struct snd_ctl_card_info) +#define SNDRV_CTL_IOCTL_ELEM_LIST _IOWR('U', 0x10, struct snd_ctl_elem_list) +#define SNDRV_CTL_IOCTL_ELEM_INFO _IOWR('U', 0x11, struct snd_ctl_elem_info) +#define SNDRV_CTL_IOCTL_ELEM_READ _IOWR('U', 0x12, struct snd_ctl_elem_value) +#define SNDRV_CTL_IOCTL_ELEM_WRITE _IOWR('U', 0x13, struct snd_ctl_elem_value) +#define SNDRV_CTL_IOCTL_ELEM_LOCK _IOW('U', 0x14, struct snd_ctl_elem_id) +#define SNDRV_CTL_IOCTL_ELEM_UNLOCK _IOW('U', 0x15, struct snd_ctl_elem_id) +#define SNDRV_CTL_IOCTL_SUBSCRIBE_EVENTS _IOWR('U', 0x16, int) +#define SNDRV_CTL_IOCTL_ELEM_ADD _IOWR('U', 0x17, struct snd_ctl_elem_info) +#define SNDRV_CTL_IOCTL_ELEM_REPLACE _IOWR('U', 0x18, struct snd_ctl_elem_info) +#define SNDRV_CTL_IOCTL_ELEM_REMOVE _IOWR('U', 0x19, struct snd_ctl_elem_id) +#define SNDRV_CTL_IOCTL_TLV_READ _IOWR('U', 0x1a, struct snd_ctl_tlv) +#define SNDRV_CTL_IOCTL_TLV_WRITE _IOWR('U', 0x1b, struct snd_ctl_tlv) +#define SNDRV_CTL_IOCTL_TLV_COMMAND _IOWR('U', 0x1c, struct snd_ctl_tlv) +#define SNDRV_CTL_IOCTL_HWDEP_NEXT_DEVICE _IOWR('U', 0x20, int) +#define SNDRV_CTL_IOCTL_HWDEP_INFO _IOR('U', 0x21, struct snd_hwdep_info) +#define SNDRV_CTL_IOCTL_PCM_NEXT_DEVICE _IOR('U', 0x30, int) +#define SNDRV_CTL_IOCTL_PCM_INFO _IOWR('U', 0x31, struct snd_pcm_info) +#define SNDRV_CTL_IOCTL_PCM_PREFER_SUBDEVICE _IOW('U', 0x32, int) +#define SNDRV_CTL_IOCTL_RAWMIDI_NEXT_DEVICE _IOWR('U', 0x40, int) +#define SNDRV_CTL_IOCTL_RAWMIDI_INFO _IOWR('U', 0x41, struct snd_rawmidi_info) +#define SNDRV_CTL_IOCTL_RAWMIDI_PREFER_SUBDEVICE _IOW('U', 0x42, int) +#define 
SNDRV_CTL_IOCTL_UMP_NEXT_DEVICE _IOWR('U', 0x43, int) +#define SNDRV_CTL_IOCTL_UMP_ENDPOINT_INFO _IOWR('U', 0x44, struct snd_ump_endpoint_info) +#define SNDRV_CTL_IOCTL_UMP_BLOCK_INFO _IOWR('U', 0x45, struct snd_ump_block_info) +#define SNDRV_CTL_IOCTL_POWER _IOWR('U', 0xd0, int) +#define SNDRV_CTL_IOCTL_POWER_STATE _IOR('U', 0xd1, int) + +/* + * Read interface. + */ + +enum sndrv_ctl_event_type { + SNDRV_CTL_EVENT_ELEM = 0, + SNDRV_CTL_EVENT_LAST = SNDRV_CTL_EVENT_ELEM, +}; + +#define SNDRV_CTL_EVENT_MASK_VALUE (1<<0) /* element value was changed */ +#define SNDRV_CTL_EVENT_MASK_INFO (1<<1) /* element info was changed */ +#define SNDRV_CTL_EVENT_MASK_ADD (1<<2) /* element was added */ +#define SNDRV_CTL_EVENT_MASK_TLV (1<<3) /* element TLV tree was changed */ +#define SNDRV_CTL_EVENT_MASK_REMOVE (~0U) /* element was removed */ + +struct snd_ctl_event { + int type; /* event type - SNDRV_CTL_EVENT_* */ + union { + struct { + unsigned int mask; + struct snd_ctl_elem_id id; + } elem; + unsigned char data8[60]; + } data; +}; + +/* + * Control names + */ + +#define SNDRV_CTL_NAME_NONE "" +#define SNDRV_CTL_NAME_PLAYBACK "Playback " +#define SNDRV_CTL_NAME_CAPTURE "Capture " + +#define SNDRV_CTL_NAME_IEC958_NONE "" +#define SNDRV_CTL_NAME_IEC958_SWITCH "Switch" +#define SNDRV_CTL_NAME_IEC958_VOLUME "Volume" +#define SNDRV_CTL_NAME_IEC958_DEFAULT "Default" +#define SNDRV_CTL_NAME_IEC958_MASK "Mask" +#define SNDRV_CTL_NAME_IEC958_CON_MASK "Con Mask" +#define SNDRV_CTL_NAME_IEC958_PRO_MASK "Pro Mask" +#define SNDRV_CTL_NAME_IEC958_PCM_STREAM "PCM Stream" +#define SNDRV_CTL_NAME_IEC958(expl,direction,what) "IEC958 " expl SNDRV_CTL_NAME_##direction SNDRV_CTL_NAME_IEC958_##what + +#endif /* _UAPI__SOUND_ASOUND_H */ diff --git a/tools/perf/trace/beauty/mount_flags.sh b/tools/perf/trace/beauty/mount_flags.sh index 730099a9a6..ff578f7b45 100755 --- a/tools/perf/trace/beauty/mount_flags.sh +++ b/tools/perf/trace/beauty/mount_flags.sh @@ -1,15 +1,15 @@ #!/bin/sh # 
SPDX-License-Identifier: LGPL-2.1 -[ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/ +[ $# -eq 1 ] && beauty_uapi_linux_dir=$1 || beauty_uapi_linux_dir=tools/perf/trace/beauty/include/uapi/linux/ printf "static const char *mount_flags[] = {\n" regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MS_([[:alnum:]_]+)[[:space:]]+([[:digit:]]+)[[:space:]]*.*' -grep -E $regex ${header_dir}/mount.h | grep -E -v '(MSK|VERBOSE|MGC_VAL)\>' | \ +grep -E $regex ${beauty_uapi_linux_dir}/mount.h | grep -E -v '(MSK|VERBOSE|MGC_VAL)\>' | \ sed -r "s/$regex/\2 \2 \1/g" | sort -n | \ xargs printf "\t[%s ? (ilog2(%s) + 1) : 0] = \"%s\",\n" regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MS_([[:alnum:]_]+)[[:space:]]+\(1<<([[:digit:]]+)\)[[:space:]]*.*' -grep -E $regex ${header_dir}/mount.h | \ +grep -E $regex ${beauty_uapi_linux_dir}/mount.h | \ sed -r "s/$regex/\2 \1/g" | \ xargs printf "\t[%s + 1] = \"%s\",\n" printf "};\n" diff --git a/tools/perf/trace/beauty/move_mount_flags.sh b/tools/perf/trace/beauty/move_mount_flags.sh index ce5e632d14..c0dde9020b 100755 --- a/tools/perf/trace/beauty/move_mount_flags.sh +++ b/tools/perf/trace/beauty/move_mount_flags.sh @@ -2,12 +2,12 @@ # SPDX-License-Identifier: LGPL-2.1 if [ $# -ne 1 ] ; then - linux_header_dir=tools/include/uapi/linux + beauty_uapi_linux_dir=tools/perf/trace/beauty/include/uapi/linux/ else - linux_header_dir=$1 + beauty_uapi_linux_dir=$1 fi -linux_mount=${linux_header_dir}/mount.h +linux_mount=${beauty_uapi_linux_dir}/mount.h printf "static const char *move_mount_flags[] = {\n" regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MOVE_MOUNT_([^_]+[[:alnum:]_]+)[[:space:]]+(0x[[:xdigit:]]+)[[:space:]]*.*' diff --git a/tools/perf/trace/beauty/prctl.c b/tools/perf/trace/beauty/prctl.c index 6fe5ad5f5d..7d1aa9fd03 100644 --- a/tools/perf/trace/beauty/prctl.c +++ b/tools/perf/trace/beauty/prctl.c @@ -7,7 +7,7 @@ #include "trace/beauty/beauty.h" #include -#include +#include #include 
"trace/beauty/generated/prctl_option_array.c" diff --git a/tools/perf/trace/beauty/prctl_option.sh b/tools/perf/trace/beauty/prctl_option.sh index 9455d9672f..e049f5e9c0 100755 --- a/tools/perf/trace/beauty/prctl_option.sh +++ b/tools/perf/trace/beauty/prctl_option.sh @@ -1,18 +1,18 @@ #!/bin/sh # SPDX-License-Identifier: LGPL-2.1 -[ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/ +[ $# -eq 1 ] && beauty_uapi_linux_dir=$1 || beauty_uapi_linux_dir=tools/perf/trace/beauty/include/uapi/linux/ printf "static const char *prctl_options[] = {\n" regex='^#define[[:space:]]{1}PR_(\w+)[[:space:]]*([[:xdigit:]]+)([[:space:]]*/.*)?$' -grep -E $regex ${header_dir}/prctl.h | grep -v PR_SET_PTRACER | \ +grep -E $regex ${beauty_uapi_linux_dir}/prctl.h | grep -v PR_SET_PTRACER | \ sed -E "s%$regex%\2 \1%g" | \ sort -n | xargs printf "\t[%s] = \"%s\",\n" printf "};\n" printf "static const char *prctl_set_mm_options[] = {\n" regex='^#[[:space:]]+define[[:space:]]+PR_SET_MM_(\w+)[[:space:]]*([[:digit:]]+).*' -grep -E $regex ${header_dir}/prctl.h | \ +grep -E $regex ${beauty_uapi_linux_dir}/prctl.h | \ sed -r "s/$regex/\2 \1/g" | \ sort -n | xargs printf "\t[%s] = \"%s\",\n" printf "};\n" diff --git a/tools/perf/trace/beauty/rename_flags.sh b/tools/perf/trace/beauty/rename_flags.sh index 94bf7f45d2..702411dd7a 100755 --- a/tools/perf/trace/beauty/rename_flags.sh +++ b/tools/perf/trace/beauty/rename_flags.sh @@ -2,7 +2,7 @@ # Copyright (C) 2018, Red Hat Inc, Arnaldo Carvalho de Melo # SPDX-License-Identifier: LGPL-2.1 -[ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/ +[ $# -eq 1 ] && header_dir=$1 || header_dir=tools/perf/trace/beauty/include/uapi/linux/ fs_header=${header_dir}/fs.h diff --git a/tools/perf/trace/beauty/sndrv_ctl_ioctl.sh b/tools/perf/trace/beauty/sndrv_ctl_ioctl.sh index e0803b9575..572939a128 100755 --- a/tools/perf/trace/beauty/sndrv_ctl_ioctl.sh +++ b/tools/perf/trace/beauty/sndrv_ctl_ioctl.sh @@ -1,9 +1,9 @@ #!/bin/sh # 
SPDX-License-Identifier: LGPL-2.1 -[ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/sound/ +[ $# -eq 1 ] && beauty_uapi_sound_dir=$1 || beauty_uapi_sound_dir=tools/perf/trace/beauty/include/uapi/sound/ printf "static const char *sndrv_ctl_ioctl_cmds[] = {\n" -grep "^#define[\t ]\+SNDRV_CTL_IOCTL_" $header_dir/asound.h | \ +grep "^#define[\t ]\+SNDRV_CTL_IOCTL_" $beauty_uapi_sound_dir/asound.h | \ sed -r 's/^#define +SNDRV_CTL_IOCTL_([A-Z0-9_]+)[\t ]+_IO[RW]*\( *.U., *(0x[[:xdigit:]]+),?.*/\t[\2] = \"\1\",/g' printf "};\n" diff --git a/tools/perf/trace/beauty/sndrv_pcm_ioctl.sh b/tools/perf/trace/beauty/sndrv_pcm_ioctl.sh index 7a464a7bf9..33afae9a1c 100755 --- a/tools/perf/trace/beauty/sndrv_pcm_ioctl.sh +++ b/tools/perf/trace/beauty/sndrv_pcm_ioctl.sh @@ -1,9 +1,9 @@ #!/bin/sh # SPDX-License-Identifier: LGPL-2.1 -[ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/sound/ +[ $# -eq 1 ] && beauty_uapi_sound_dir=$1 || beauty_uapi_sound_dir=tools/perf/trace/beauty/include/uapi/sound/ printf "static const char *sndrv_pcm_ioctl_cmds[] = {\n" -grep "^#define[\t ]\+SNDRV_PCM_IOCTL_" $header_dir/asound.h | \ +grep "^#define[\t ]\+SNDRV_PCM_IOCTL_" $beauty_uapi_sound_dir/asound.h | \ sed -r 's/^#define +SNDRV_PCM_IOCTL_([A-Z0-9_]+)[\t ]+_IO[RW]*\( *.A., *(0x[[:xdigit:]]+),?.*/\t[\2] = \"\1\",/g' printf "};\n" diff --git a/tools/perf/trace/beauty/statx.c b/tools/perf/trace/beauty/statx.c index dc5943a635..24843e614b 100644 --- a/tools/perf/trace/beauty/statx.c +++ b/tools/perf/trace/beauty/statx.c @@ -6,73 +6,20 @@ */ #include "trace/beauty/beauty.h" -#include #include -#include -#include +#include -size_t syscall_arg__scnprintf_statx_flags(char *bf, size_t size, struct syscall_arg *arg) +static size_t statx__scnprintf_mask(unsigned long mask, char *bf, size_t size, bool show_prefix) { - bool show_prefix = arg->show_string_prefix; - const char *prefix = "AT_"; - int printed = 0, flags = arg->val; - - if (flags == 0) - return scnprintf(bf, size, 
"%s%s", show_prefix ? "AT_STATX_" : "", "SYNC_AS_STAT"); -#define P_FLAG(n) \ - if (flags & AT_##n) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \ - flags &= ~AT_##n; \ - } - - P_FLAG(SYMLINK_NOFOLLOW); - P_FLAG(REMOVEDIR); - P_FLAG(SYMLINK_FOLLOW); - P_FLAG(NO_AUTOMOUNT); - P_FLAG(EMPTY_PATH); - P_FLAG(STATX_FORCE_SYNC); - P_FLAG(STATX_DONT_SYNC); - -#undef P_FLAG - - if (flags) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); - - return printed; + #include "trace/beauty/generated/statx_mask_array.c" + static DEFINE_STRARRAY(statx_mask, "STATX_"); + return strarray__scnprintf_flags(&strarray__statx_mask, bf, size, show_prefix, mask); } size_t syscall_arg__scnprintf_statx_mask(char *bf, size_t size, struct syscall_arg *arg) { bool show_prefix = arg->show_string_prefix; - const char *prefix = "STATX_"; - int printed = 0, flags = arg->val; - -#define P_FLAG(n) \ - if (flags & STATX_##n) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \ - flags &= ~STATX_##n; \ - } - - P_FLAG(TYPE); - P_FLAG(MODE); - P_FLAG(NLINK); - P_FLAG(UID); - P_FLAG(GID); - P_FLAG(ATIME); - P_FLAG(MTIME); - P_FLAG(CTIME); - P_FLAG(INO); - P_FLAG(SIZE); - P_FLAG(BLOCKS); - P_FLAG(BTIME); - P_FLAG(MNT_ID); - P_FLAG(DIOALIGN); - P_FLAG(MNT_ID_UNIQUE); - -#undef P_FLAG - - if (flags) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? 
"|" : "", flags); + int mask = arg->val; - return printed; + return statx__scnprintf_mask(mask, bf, size, show_prefix); } diff --git a/tools/perf/trace/beauty/statx_mask.sh b/tools/perf/trace/beauty/statx_mask.sh new file mode 100755 index 0000000000..18c802ed0c --- /dev/null +++ b/tools/perf/trace/beauty/statx_mask.sh @@ -0,0 +1,23 @@ +#!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 + +if [ $# -ne 1 ] ; then + beauty_uapi_linux_dir=tools/perf/trace/beauty/include/uapi/linux/ +else + beauty_uapi_linux_dir=$1 +fi + +linux_stat=${beauty_uapi_linux_dir}/stat.h + +printf "static const char *statx_mask[] = {\n" +regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+STATX_([^_]+[[:alnum:]_]+)[[:space:]]+(0x[[:xdigit:]]+)[[:space:]]*.*' +# STATX_BASIC_STATS its a bitmask formed by the mask in the normal stat struct +# STATX_ALL is another bitmask and deprecated +# STATX_ATTR_*: Attributes to be found in stx_attributes and masked in stx_attributes_mask +grep -E $regex ${linux_stat} | \ + grep -v STATX_ALL | \ + grep -v STATX_BASIC_STATS | \ + grep -v '\ -#include +#include + +#ifndef SYNC_FILE_RANGE_WRITE_AND_WAIT +#define SYNC_FILE_RANGE_WAIT_BEFORE 1 +#define SYNC_FILE_RANGE_WRITE 2 +#define SYNC_FILE_RANGE_WAIT_AFTER 4 +#define SYNC_FILE_RANGE_WRITE_AND_WAIT (SYNC_FILE_RANGE_WRITE | \ + SYNC_FILE_RANGE_WAIT_BEFORE | \ + SYNC_FILE_RANGE_WAIT_AFTER) +#endif static size_t sync_file_range__scnprintf_flags(unsigned long flags, char *bf, size_t size, bool show_prefix) { diff --git a/tools/perf/trace/beauty/sync_file_range.sh b/tools/perf/trace/beauty/sync_file_range.sh index 90bf633be8..b1084c4cab 100755 --- a/tools/perf/trace/beauty/sync_file_range.sh +++ b/tools/perf/trace/beauty/sync_file_range.sh @@ -2,7 +2,7 @@ # SPDX-License-Identifier: LGPL-2.1 if [ $# -ne 1 ] ; then - linux_header_dir=tools/include/uapi/linux + linux_header_dir=tools/perf/trace/beauty/include/uapi/linux/ else linux_header_dir=$1 fi diff --git a/tools/perf/trace/beauty/tracepoints/x86_irq_vectors.sh 
b/tools/perf/trace/beauty/tracepoints/x86_irq_vectors.sh index 87dc68c7de..d8e927dd2b 100755 --- a/tools/perf/trace/beauty/tracepoints/x86_irq_vectors.sh +++ b/tools/perf/trace/beauty/tracepoints/x86_irq_vectors.sh @@ -3,12 +3,12 @@ # (C) 2019, Arnaldo Carvalho de Melo if [ $# -ne 1 ] ; then - arch_x86_header_dir=tools/arch/x86/include/asm/ + beauty_arch_asm_dir=tools/perf/trace/beauty/arch/x86/include/asm/ else - arch_x86_header_dir=$1 + beauty_arch_asm_dir=$1 fi -x86_irq_vectors=${arch_x86_header_dir}/irq_vectors.h +x86_irq_vectors=${beauty_arch_asm_dir}/irq_vectors.h # FIRST_EXTERNAL_VECTOR is not that useful, find what is its number # and then replace whatever is using it and that is useful, which at diff --git a/tools/perf/trace/beauty/usbdevfs_ioctl.sh b/tools/perf/trace/beauty/usbdevfs_ioctl.sh index b39cfb3720..12a30a9a8e 100755 --- a/tools/perf/trace/beauty/usbdevfs_ioctl.sh +++ b/tools/perf/trace/beauty/usbdevfs_ioctl.sh @@ -1,21 +1,21 @@ #!/bin/sh # SPDX-License-Identifier: LGPL-2.1 -[ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/ +[ $# -eq 1 ] && beauty_uapi_linux_dir=$1 || beauty_uapi_linux_dir=tools/perf/trace/beauty/include/uapi/linux/ # also as: # #define USBDEVFS_CONNINFO_EX(len) _IOC(_IOC_READ, 'U', 32, len) printf "static const char *usbdevfs_ioctl_cmds[] = {\n" regex="^#[[:space:]]*define[[:space:]]+USBDEVFS_(\w+)(\(\w+\))?[[:space:]]+_IO[CWR]{0,2}\([[:space:]]*(_IOC_\w+,[[:space:]]*)?'U'[[:space:]]*,[[:space:]]*([[:digit:]]+).*" -grep -E "$regex" ${header_dir}/usbdevice_fs.h | grep -E -v 'USBDEVFS_\w+32[[:space:]]' | \ +grep -E "$regex" ${beauty_uapi_linux_dir}/usbdevice_fs.h | grep -E -v 'USBDEVFS_\w+32[[:space:]]' | \ sed -r "s/$regex/\4 \1/g" | \ sort | xargs printf "\t[%s] = \"%s\",\n" printf "};\n\n" printf "#if 0\n" printf "static const char *usbdevfs_ioctl_32_cmds[] = {\n" regex="^#[[:space:]]*define[[:space:]]+USBDEVFS_(\w+)[[:space:]]+_IO[WR]{0,2}\([[:space:]]*'U'[[:space:]]*,[[:space:]]*([[:digit:]]+).*" -grep -E 
$regex ${header_dir}/usbdevice_fs.h | grep -E 'USBDEVFS_\w+32[[:space:]]' | \ +grep -E $regex ${beauty_uapi_linux_dir}/usbdevice_fs.h | grep -E 'USBDEVFS_\w+32[[:space:]]' | \ sed -r "s/$regex/\2 \1/g" | \ sort | xargs printf "\t[%s] = \"%s\",\n" printf "};\n" diff --git a/tools/perf/trace/beauty/vhost_virtio_ioctl.sh b/tools/perf/trace/beauty/vhost_virtio_ioctl.sh index 2dd0a3b1f5..e4f395e765 100755 --- a/tools/perf/trace/beauty/vhost_virtio_ioctl.sh +++ b/tools/perf/trace/beauty/vhost_virtio_ioctl.sh @@ -1,18 +1,18 @@ #!/bin/sh # SPDX-License-Identifier: LGPL-2.1 -[ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/ +[ $# -eq 1 ] && beauty_uapi_linux_dir=$1 || beauty_uapi_linux_dir=tools/perf/trace/beauty/include/uapi/linux printf "static const char *vhost_virtio_ioctl_cmds[] = {\n" regex='^#[[:space:]]*define[[:space:]]+VHOST_(\w+)[[:space:]]+_IOW?\([[:space:]]*VHOST_VIRTIO[[:space:]]*,[[:space:]]*(0x[[:xdigit:]]+).*' -grep -E $regex ${header_dir}/vhost.h | \ +grep -E $regex ${beauty_uapi_linux_dir}/vhost.h | \ sed -r "s/$regex/\2 \1/g" | \ sort | xargs printf "\t[%s] = \"%s\",\n" printf "};\n" printf "static const char *vhost_virtio_ioctl_read_cmds[] = {\n" regex='^#[[:space:]]*define[[:space:]]+VHOST_(\w+)[[:space:]]+_IOW?R\([[:space:]]*VHOST_VIRTIO[[:space:]]*,[[:space:]]*(0x[[:xdigit:]]+).*' -grep -E $regex ${header_dir}/vhost.h | \ +grep -E $regex ${beauty_uapi_linux_dir}/vhost.h | \ sed -r "s/$regex/\2 \1/g" | \ sort | xargs printf "\t[%s] = \"%s\",\n" printf "};\n" diff --git a/tools/perf/trace/beauty/x86_arch_prctl.sh b/tools/perf/trace/beauty/x86_arch_prctl.sh index b1596df251..b714ffa3cb 100755 --- a/tools/perf/trace/beauty/x86_arch_prctl.sh +++ b/tools/perf/trace/beauty/x86_arch_prctl.sh @@ -2,9 +2,9 @@ # Copyright (C) 2018, Red Hat Inc, Arnaldo Carvalho de Melo # SPDX-License-Identifier: LGPL-2.1 -[ $# -eq 1 ] && x86_header_dir=$1 || x86_header_dir=tools/arch/x86/include/uapi/asm/ +[ $# -eq 1 ] && beauty_x86_arch_asm_uapi_dir=$1 || 
beauty_x86_arch_asm_uapi_dir=tools/perf/trace/beauty/arch/x86/include/uapi/asm/ -prctl_arch_header=${x86_header_dir}/prctl.h +prctl_arch_header=${beauty_x86_arch_asm_uapi_dir}/prctl.h print_range () { idx=$1 diff --git a/tools/perf/ui/browsers/Build b/tools/perf/ui/browsers/Build index 7a1d5ddaf6..2608b5da31 100644 --- a/tools/perf/ui/browsers/Build +++ b/tools/perf/ui/browsers/Build @@ -1,4 +1,5 @@ perf-y += annotate.o +perf-y += annotate-data.o perf-y += hists.o perf-y += map.o perf-y += scripts.o diff --git a/tools/perf/ui/browsers/annotate-data.c b/tools/perf/ui/browsers/annotate-data.c new file mode 100644 index 0000000000..8d6bf08d37 --- /dev/null +++ b/tools/perf/ui/browsers/annotate-data.c @@ -0,0 +1,313 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +#include "ui/browser.h" +#include "ui/helpline.h" +#include "ui/keysyms.h" +#include "ui/ui.h" +#include "util/annotate.h" +#include "util/annotate-data.h" +#include "util/evsel.h" +#include "util/evlist.h" +#include "util/sort.h" + +struct annotated_data_browser { + struct ui_browser b; + struct list_head entries; + int nr_events; +}; + +struct browser_entry { + struct list_head node; + struct annotated_member *data; + struct type_hist_entry *hists; + int indent; +}; + +static struct annotated_data_browser *get_browser(struct ui_browser *uib) +{ + return container_of(uib, struct annotated_data_browser, b); +} + +static void update_hist_entry(struct type_hist_entry *dst, + struct type_hist_entry *src) +{ + dst->nr_samples += src->nr_samples; + dst->period += src->period; +} + +static int get_member_overhead(struct annotated_data_type *adt, + struct browser_entry *entry, + struct evsel *leader) +{ + struct annotated_member *member = entry->data; + int i, k; + + for (i = 0; i < member->size; i++) { + struct type_hist *h; + struct evsel *evsel; + int offset = member->offset + i; + + for_each_group_evsel(evsel, leader) { + h = adt->histograms[evsel->core.idx]; + k = 
evsel__group_idx(evsel); + update_hist_entry(&entry->hists[k], &h->addr[offset]); + } + } + return 0; +} + +static int add_child_entries(struct annotated_data_browser *browser, + struct annotated_data_type *adt, + struct annotated_member *member, + struct evsel *evsel, int indent) +{ + struct annotated_member *pos; + struct browser_entry *entry; + int nr_entries = 0; + + entry = zalloc(sizeof(*entry)); + if (entry == NULL) + return -1; + + entry->hists = calloc(browser->nr_events, sizeof(*entry->hists)); + if (entry->hists == NULL) { + free(entry); + return -1; + } + + entry->data = member; + entry->indent = indent; + if (get_member_overhead(adt, entry, evsel) < 0) { + free(entry); + return -1; + } + + list_add_tail(&entry->node, &browser->entries); + nr_entries++; + + list_for_each_entry(pos, &member->children, node) { + int nr = add_child_entries(browser, adt, pos, evsel, indent + 1); + + if (nr < 0) + return nr; + + nr_entries += nr; + } + + /* add an entry for the closing bracket ("}") */ + if (!list_empty(&member->children)) { + entry = zalloc(sizeof(*entry)); + if (entry == NULL) + return -1; + + entry->indent = indent; + list_add_tail(&entry->node, &browser->entries); + nr_entries++; + } + + return nr_entries; +} + +static int annotated_data_browser__collect_entries(struct annotated_data_browser *browser) +{ + struct hist_entry *he = browser->b.priv; + struct annotated_data_type *adt = he->mem_type; + struct evsel *evsel = hists_to_evsel(he->hists); + + INIT_LIST_HEAD(&browser->entries); + browser->b.entries = &browser->entries; + browser->b.nr_entries = add_child_entries(browser, adt, &adt->self, + evsel, /*indent=*/0); + return 0; +} + +static void annotated_data_browser__delete_entries(struct annotated_data_browser *browser) +{ + struct browser_entry *pos, *tmp; + + list_for_each_entry_safe(pos, tmp, &browser->entries, node) { + list_del_init(&pos->node); + zfree(&pos->hists); + free(pos); + } +} + +static unsigned int browser__refresh(struct ui_browser 
*uib) +{ + return ui_browser__list_head_refresh(uib); +} + +static int browser__show(struct ui_browser *uib) +{ + struct hist_entry *he = uib->priv; + struct annotated_data_type *adt = he->mem_type; + struct annotated_data_browser *browser = get_browser(uib); + const char *help = "Press 'h' for help on key bindings"; + char title[256]; + + snprintf(title, sizeof(title), "Annotate type: '%s' (%d samples)", + adt->self.type_name, he->stat.nr_events); + + if (ui_browser__show(uib, title, help) < 0) + return -1; + + /* second line header */ + ui_browser__gotorc_title(uib, 0, 0); + ui_browser__set_color(uib, HE_COLORSET_ROOT); + + if (symbol_conf.show_total_period) + strcpy(title, "Period"); + else if (symbol_conf.show_nr_samples) + strcpy(title, "Samples"); + else + strcpy(title, "Percent"); + + ui_browser__printf(uib, "%*s %10s %10s %10s %s", + 11 * (browser->nr_events - 1), "", + title, "Offset", "Size", "Field"); + ui_browser__write_nstring(uib, "", uib->width); + return 0; +} + +static void browser__write_overhead(struct ui_browser *uib, + struct type_hist *total, + struct type_hist_entry *hist, int row) +{ + u64 period = hist->period; + double percent = total->period ? 
(100.0 * period / total->period) : 0; + bool current = ui_browser__is_current_entry(uib, row); + int nr_samples = 0; + + ui_browser__set_percent_color(uib, percent, current); + + if (symbol_conf.show_total_period) + ui_browser__printf(uib, " %10" PRIu64, period); + else if (symbol_conf.show_nr_samples) + ui_browser__printf(uib, " %10d", nr_samples); + else + ui_browser__printf(uib, " %10.2f", percent); + + ui_browser__set_percent_color(uib, 0, current); +} + +static void browser__write(struct ui_browser *uib, void *entry, int row) +{ + struct annotated_data_browser *browser = get_browser(uib); + struct browser_entry *be = entry; + struct annotated_member *member = be->data; + struct hist_entry *he = uib->priv; + struct annotated_data_type *adt = he->mem_type; + struct evsel *leader = hists_to_evsel(he->hists); + struct evsel *evsel; + + if (member == NULL) { + bool current = ui_browser__is_current_entry(uib, row); + + /* print the closing bracket */ + ui_browser__set_percent_color(uib, 0, current); + ui_browser__write_nstring(uib, "", 11 * browser->nr_events); + ui_browser__printf(uib, " %10s %10s %*s};", + "", "", be->indent * 4, ""); + ui_browser__write_nstring(uib, "", uib->width); + return; + } + + /* print the number */ + for_each_group_evsel(evsel, leader) { + struct type_hist *h = adt->histograms[evsel->core.idx]; + int idx = evsel__group_idx(evsel); + + browser__write_overhead(uib, h, &be->hists[idx], row); + } + + /* print type info */ + if (be->indent == 0 && !member->var_name) { + ui_browser__printf(uib, " %10d %10d %s%s", + member->offset, member->size, + member->type_name, + list_empty(&member->children) ? ";" : " {"); + } else { + ui_browser__printf(uib, " %10d %10d %*s%s\t%s%s", + member->offset, member->size, + be->indent * 4, "", member->type_name, + member->var_name ?: "", + list_empty(&member->children) ? 
";" : " {"); + } + /* fill the rest */ + ui_browser__write_nstring(uib, "", uib->width); +} + +static int annotated_data_browser__run(struct annotated_data_browser *browser, + struct evsel *evsel __maybe_unused, + struct hist_browser_timer *hbt) +{ + int delay_secs = hbt ? hbt->refresh : 0; + int key; + + if (browser__show(&browser->b) < 0) + return -1; + + while (1) { + key = ui_browser__run(&browser->b, delay_secs); + + switch (key) { + case K_TIMER: + if (hbt) + hbt->timer(hbt->arg); + continue; + case K_F1: + case 'h': + ui_browser__help_window(&browser->b, + "UP/DOWN/PGUP\n" + "PGDN/SPACE Navigate\n" + " Move to prev/next symbol\n" + "q/ESC/CTRL+C Exit\n\n"); + continue; + case K_LEFT: + case '<': + case '>': + case K_ESC: + case 'q': + case CTRL('c'): + goto out; + default: + continue; + } + } +out: + ui_browser__hide(&browser->b); + return key; +} + +int hist_entry__annotate_data_tui(struct hist_entry *he, struct evsel *evsel, + struct hist_browser_timer *hbt) +{ + struct annotated_data_browser browser = { + .b = { + .refresh = browser__refresh, + .seek = ui_browser__list_head_seek, + .write = browser__write, + .priv = he, + .extra_title_lines = 1, + }, + .nr_events = 1, + }; + int ret; + + ui_helpline__push("Press ESC to exit"); + + if (evsel__is_group_event(evsel)) + browser.nr_events = evsel->core.nr_members; + + ret = annotated_data_browser__collect_entries(&browser); + if (ret == 0) + ret = annotated_data_browser__run(&browser, evsel, hbt); + + annotated_data_browser__delete_entries(&browser); + + return ret; +} diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index 4790c73559..ea98643024 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -49,7 +49,7 @@ static int ui_browser__jumps_percent_color(struct ui_browser *browser, int nr, b if (current && (!browser->use_navkeypressed || browser->navkeypressed)) return HE_COLORSET_SELECTED; - if (nr == notes->max_jump_sources) + if (nr == 
notes->src->max_jump_sources) return HE_COLORSET_TOP; if (nr > 1) return HE_COLORSET_MEDIUM; @@ -186,7 +186,7 @@ static void annotate_browser__draw_current_jump(struct ui_browser *browser) * name right after the '<' token and probably treating this like a * 'call' instruction. */ - target = notes->src->offsets[cursor->ops.target.offset]; + target = annotated_source__get_line(notes->src, cursor->ops.target.offset); if (target == NULL) { ui_helpline__printf("WARN: jump target inconsistency, press 'o', notes->offsets[%#x] = NULL\n", cursor->ops.target.offset); @@ -205,13 +205,13 @@ static void annotate_browser__draw_current_jump(struct ui_browser *browser) ui_browser__set_color(browser, HE_COLORSET_JUMP_ARROWS); __ui_browser__line_arrow(browser, - pcnt_width + 2 + notes->widths.addr + width, + pcnt_width + 2 + notes->src->widths.addr + width, from, to); diff = is_fused(ab, cursor); if (diff > 0) { ui_browser__mark_fused(browser, - pcnt_width + 3 + notes->widths.addr + width, + pcnt_width + 3 + notes->src->widths.addr + width, from - diff, diff, to > from); } } @@ -438,7 +438,7 @@ static int sym_title(struct symbol *sym, struct map *map, char *title, size_t sz, int percent_type) { return snprintf(title, sz, "%s %s [Percent: %s]", sym->name, - map__dso(map)->long_name, + dso__long_name(map__dso(map)), percent_type_str(percent_type)); } @@ -967,23 +967,23 @@ int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, return -1; dso = map__dso(ms->map); - if (dso->annotate_warned) + if (dso__annotate_warned(dso)) return -1; if (not_annotated || !sym->annotate2) { err = symbol__annotate2(ms, evsel, &browser.arch); if (err) { char msg[BUFSIZ]; - dso->annotate_warned = true; + dso__set_annotate_warned(dso); symbol__strerror_disassemble(ms, err, msg, sizeof(msg)); ui__error("Couldn't annotate %s:\n%s", sym->name, msg); - goto out_free_offsets; + return -1; } } ui_helpline__push("Press ESC to exit"); - browser.b.width = notes->src->max_line_len; + browser.b.width = 
notes->src->widths.max_line_len; browser.b.nr_entries = notes->src->nr_entries; browser.b.entries = ¬es->src->source, browser.b.width += 18; /* Percentage */ @@ -996,8 +996,5 @@ int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, if(not_annotated) annotated_source__purge(notes->src); -out_free_offsets: - if(not_annotated) - zfree(¬es->src->offsets); return ret; } diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 0c02b3a8e1..b7219df512 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -38,6 +38,7 @@ #include "../ui.h" #include "map.h" #include "annotate.h" +#include "annotate-data.h" #include "srcline.h" #include "string2.h" #include "units.h" @@ -2488,7 +2489,7 @@ add_annotate_opt(struct hist_browser *browser __maybe_unused, { struct dso *dso; - if (!ms->map || (dso = map__dso(ms->map)) == NULL || dso->annotate_warned) + if (!ms->map || (dso = map__dso(ms->map)) == NULL || dso__annotate_warned(dso)) return 0; if (!ms->sym) @@ -2505,6 +2506,32 @@ add_annotate_opt(struct hist_browser *browser __maybe_unused, return 1; } +static int +do_annotate_type(struct hist_browser *browser, struct popup_action *act) +{ + struct hist_entry *he = browser->he_selection; + + hist_entry__annotate_data_tui(he, act->evsel, browser->hbt); + ui_browser__handle_resize(&browser->b); + return 0; +} + +static int +add_annotate_type_opt(struct hist_browser *browser, + struct popup_action *act, char **optstr, + struct hist_entry *he) +{ + if (he == NULL || he->mem_type == NULL || he->mem_type->histograms == NULL) + return 0; + + if (asprintf(optstr, "Annotate type %s", he->mem_type->self.type_name) < 0) + return 0; + + act->evsel = hists_to_evsel(browser->hists); + act->fn = do_annotate_type; + return 1; +} + static int do_zoom_thread(struct hist_browser *browser, struct popup_action *act) { @@ -2581,7 +2608,7 @@ static int hists_browser__zoom_map(struct hist_browser *browser, struct map *map } else { struct 
dso *dso = map__dso(map); ui_helpline__fpush("To zoom out press ESC or ENTER + \"Zoom out of %s DSO\"", - __map__is_kernel(map) ? "the Kernel" : dso->short_name); + __map__is_kernel(map) ? "the Kernel" : dso__short_name(dso)); browser->hists->dso_filter = dso; perf_hpp__set_elide(HISTC_DSO, true); pstack__push(browser->pstack, &browser->hists->dso_filter); @@ -2607,7 +2634,7 @@ add_dso_opt(struct hist_browser *browser, struct popup_action *act, if (asprintf(optstr, "Zoom %s %s DSO (use the 'k' hotkey to zoom directly into the kernel)", browser->hists->dso_filter ? "out of" : "into", - __map__is_kernel(map) ? "the Kernel" : map__dso(map)->short_name) < 0) + __map__is_kernel(map) ? "the Kernel" : dso__short_name(map__dso(map))) < 0) return 0; act->ms.map = map; @@ -3083,7 +3110,7 @@ do_hotkey: // key came straight from options ui__popup_menu() if (!browser->selection || !browser->selection->map || !map__dso(browser->selection->map) || - map__dso(browser->selection->map)->annotate_warned) { + dso__annotate_warned(map__dso(browser->selection->map))) { continue; } @@ -3307,6 +3334,10 @@ do_hotkey: // key came straight from options ui__popup_menu() browser->he_selection->ip); } skip_annotation: + nr_options += add_annotate_type_opt(browser, + &actions[nr_options], + &options[nr_options], + browser->he_selection); nr_options += add_thread_opt(browser, &actions[nr_options], &options[nr_options], thread); nr_options += add_dso_opt(browser, &actions[nr_options], diff --git a/tools/perf/ui/browsers/map.c b/tools/perf/ui/browsers/map.c index 3d1b958d88..fba55175a9 100644 --- a/tools/perf/ui/browsers/map.c +++ b/tools/perf/ui/browsers/map.c @@ -76,7 +76,7 @@ static int map_browser__run(struct map_browser *browser) { int key; - if (ui_browser__show(&browser->b, map__dso(browser->map)->long_name, + if (ui_browser__show(&browser->b, dso__long_name(map__dso(browser->map)), "Press ESC to exit, %s / to search", verbose > 0 ? 
"" : "restart with -v to use") < 0) return -1; @@ -106,7 +106,7 @@ int map__browse(struct map *map) { struct map_browser mb = { .b = { - .entries = &map__dso(map)->symbols, + .entries = dso__symbols(map__dso(map)), .refresh = ui_browser__rb_tree_refresh, .seek = ui_browser__rb_tree_seek, .write = map_browser__write, diff --git a/tools/perf/ui/gtk/annotate.c b/tools/perf/ui/gtk/annotate.c index 93ce3d47e4..6da24aa039 100644 --- a/tools/perf/ui/gtk/annotate.c +++ b/tools/perf/ui/gtk/annotate.c @@ -180,13 +180,14 @@ static int symbol__gtk_annotate(struct map_symbol *ms, struct evsel *evsel, GtkWidget *tab_label; int err; - if (dso->annotate_warned) + if (dso__annotate_warned(dso)) return -1; err = symbol__annotate(ms, evsel, NULL); if (err) { char msg[BUFSIZ]; - dso->annotate_warned = true; + + dso__set_annotate_warned(dso); symbol__strerror_disassemble(ms, err, msg, sizeof(msg)); ui__error("Couldn't annotate %s: %s\n", sym->name, msg); return -1; diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 2bf959d083..685ba2a54f 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -25,7 +25,7 @@ static int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he, hpp_field_fn get_field, const char *fmt, int len, - hpp_snprint_fn print_fn, bool fmt_percent) + hpp_snprint_fn print_fn, enum perf_hpp_fmt_type fmtype) { int ret; struct hists *hists = he->hists; @@ -33,7 +33,7 @@ static int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he, char *buf = hpp->buf; size_t size = hpp->size; - if (fmt_percent) { + if (fmtype == PERF_HPP_FMT_TYPE__PERCENT) { double percent = 0.0; u64 total = hists__total_period(hists); @@ -41,8 +41,16 @@ static int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he, percent = 100.0 * get_field(he) / total; ret = hpp__call_print_fn(hpp, print_fn, fmt, len, percent); - } else + } else if (fmtype == PERF_HPP_FMT_TYPE__AVERAGE) { + double average = 0; + + if (he->stat.nr_events) + average = 1.0 * get_field(he) / 
he->stat.nr_events; + + ret = hpp__call_print_fn(hpp, print_fn, fmt, len, average); + } else { ret = hpp__call_print_fn(hpp, print_fn, fmt, len, get_field(he)); + } if (evsel__is_group_event(evsel)) { int prev_idx, idx_delta; @@ -54,6 +62,7 @@ static int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he, list_for_each_entry(pair, &he->pairs.head, pairs.node) { u64 period = get_field(pair); u64 total = hists__total_period(pair->hists); + int nr_samples = pair->stat.nr_events; if (!total) continue; @@ -66,7 +75,7 @@ static int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he, * zero-fill group members in the middle which * have no sample */ - if (fmt_percent) { + if (fmtype != PERF_HPP_FMT_TYPE__RAW) { ret += hpp__call_print_fn(hpp, print_fn, fmt, len, 0.0); } else { @@ -75,9 +84,14 @@ static int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he, } } - if (fmt_percent) { + if (fmtype == PERF_HPP_FMT_TYPE__PERCENT) { ret += hpp__call_print_fn(hpp, print_fn, fmt, len, 100.0 * period / total); + } else if (fmtype == PERF_HPP_FMT_TYPE__AVERAGE) { + double avg = nr_samples ? 
(period / nr_samples) : 0; + + ret += hpp__call_print_fn(hpp, print_fn, fmt, + len, avg); } else { ret += hpp__call_print_fn(hpp, print_fn, fmt, len, period); @@ -92,7 +106,7 @@ static int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he, /* * zero-fill group members at last which have no sample */ - if (fmt_percent) { + if (fmtype != PERF_HPP_FMT_TYPE__RAW) { ret += hpp__call_print_fn(hpp, print_fn, fmt, len, 0.0); } else { @@ -114,33 +128,35 @@ static int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he, int hpp__fmt(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, struct hist_entry *he, hpp_field_fn get_field, - const char *fmtstr, hpp_snprint_fn print_fn, bool fmt_percent) + const char *fmtstr, hpp_snprint_fn print_fn, + enum perf_hpp_fmt_type fmtype) { int len = fmt->user_len ?: fmt->len; if (symbol_conf.field_sep) { return __hpp__fmt(hpp, he, get_field, fmtstr, 1, - print_fn, fmt_percent); + print_fn, fmtype); } - if (fmt_percent) + if (fmtype == PERF_HPP_FMT_TYPE__PERCENT) len -= 2; /* 2 for a space and a % sign */ else len -= 1; - return __hpp__fmt(hpp, he, get_field, fmtstr, len, print_fn, fmt_percent); + return __hpp__fmt(hpp, he, get_field, fmtstr, len, print_fn, fmtype); } int hpp__fmt_acc(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, struct hist_entry *he, hpp_field_fn get_field, - const char *fmtstr, hpp_snprint_fn print_fn, bool fmt_percent) + const char *fmtstr, hpp_snprint_fn print_fn, + enum perf_hpp_fmt_type fmtype) { if (!symbol_conf.cumulate_callchain) { int len = fmt->user_len ?: fmt->len; return snprintf(hpp->buf, hpp->size, " %*s", len - 1, "N/A"); } - return hpp__fmt(fmt, hpp, he, get_field, fmtstr, print_fn, fmt_percent); + return hpp__fmt(fmt, hpp, he, get_field, fmtstr, print_fn, fmtype); } static int field_cmp(u64 field_a, u64 field_b) @@ -350,7 +366,7 @@ static int hpp__color_##_type(struct perf_hpp_fmt *fmt, \ struct perf_hpp *hpp, struct hist_entry *he) \ { \ return hpp__fmt(fmt, hpp, he, he_get_##_field, " %*.2f%%", \ - 
hpp_color_scnprintf, true); \ + hpp_color_scnprintf, PERF_HPP_FMT_TYPE__PERCENT); \ } #define __HPP_ENTRY_PERCENT_FN(_type, _field) \ @@ -358,7 +374,7 @@ static int hpp__entry_##_type(struct perf_hpp_fmt *fmt, \ struct perf_hpp *hpp, struct hist_entry *he) \ { \ return hpp__fmt(fmt, hpp, he, he_get_##_field, " %*.2f%%", \ - hpp_entry_scnprintf, true); \ + hpp_entry_scnprintf, PERF_HPP_FMT_TYPE__PERCENT); \ } #define __HPP_SORT_FN(_type, _field) \ @@ -378,7 +394,7 @@ static int hpp__color_##_type(struct perf_hpp_fmt *fmt, \ struct perf_hpp *hpp, struct hist_entry *he) \ { \ return hpp__fmt_acc(fmt, hpp, he, he_get_acc_##_field, " %*.2f%%", \ - hpp_color_scnprintf, true); \ + hpp_color_scnprintf, PERF_HPP_FMT_TYPE__PERCENT); \ } #define __HPP_ENTRY_ACC_PERCENT_FN(_type, _field) \ @@ -386,7 +402,7 @@ static int hpp__entry_##_type(struct perf_hpp_fmt *fmt, \ struct perf_hpp *hpp, struct hist_entry *he) \ { \ return hpp__fmt_acc(fmt, hpp, he, he_get_acc_##_field, " %*.2f%%", \ - hpp_entry_scnprintf, true); \ + hpp_entry_scnprintf, PERF_HPP_FMT_TYPE__PERCENT); \ } #define __HPP_SORT_ACC_FN(_type, _field) \ @@ -406,7 +422,7 @@ static int hpp__entry_##_type(struct perf_hpp_fmt *fmt, \ struct perf_hpp *hpp, struct hist_entry *he) \ { \ return hpp__fmt(fmt, hpp, he, he_get_raw_##_field, " %*"PRIu64, \ - hpp_entry_scnprintf, false); \ + hpp_entry_scnprintf, PERF_HPP_FMT_TYPE__RAW); \ } #define __HPP_SORT_RAW_FN(_type, _field) \ @@ -416,6 +432,26 @@ static int64_t hpp__sort_##_type(struct perf_hpp_fmt *fmt __maybe_unused, \ return __hpp__sort(a, b, he_get_raw_##_field); \ } +#define __HPP_ENTRY_AVERAGE_FN(_type, _field) \ +static u64 he_get_##_field(struct hist_entry *he) \ +{ \ + return he->stat._field; \ +} \ + \ +static int hpp__entry_##_type(struct perf_hpp_fmt *fmt, \ + struct perf_hpp *hpp, struct hist_entry *he) \ +{ \ + return hpp__fmt(fmt, hpp, he, he_get_##_field, " %*.1f", \ + hpp_entry_scnprintf, PERF_HPP_FMT_TYPE__AVERAGE); \ +} + +#define 
__HPP_SORT_AVERAGE_FN(_type, _field) \ +static int64_t hpp__sort_##_type(struct perf_hpp_fmt *fmt __maybe_unused, \ + struct hist_entry *a, struct hist_entry *b) \ +{ \ + return __hpp__sort(a, b, he_get_##_field); \ +} + #define HPP_PERCENT_FNS(_type, _field) \ __HPP_COLOR_PERCENT_FN(_type, _field) \ @@ -431,6 +467,10 @@ __HPP_SORT_ACC_FN(_type, _field) __HPP_ENTRY_RAW_FN(_type, _field) \ __HPP_SORT_RAW_FN(_type, _field) +#define HPP_AVERAGE_FNS(_type, _field) \ +__HPP_ENTRY_AVERAGE_FN(_type, _field) \ +__HPP_SORT_AVERAGE_FN(_type, _field) + HPP_PERCENT_FNS(overhead, period) HPP_PERCENT_FNS(overhead_sys, period_sys) HPP_PERCENT_FNS(overhead_us, period_us) @@ -441,6 +481,10 @@ HPP_PERCENT_ACC_FNS(overhead_acc, period) HPP_RAW_FNS(samples, nr_events) HPP_RAW_FNS(period, period) +HPP_AVERAGE_FNS(weight1, weight1) +HPP_AVERAGE_FNS(weight2, weight2) +HPP_AVERAGE_FNS(weight3, weight3) + static int64_t hpp__nop_cmp(struct perf_hpp_fmt *fmt __maybe_unused, struct hist_entry *a __maybe_unused, struct hist_entry *b __maybe_unused) @@ -510,7 +554,10 @@ struct perf_hpp_fmt perf_hpp__format[] = { HPP__COLOR_PRINT_FNS("guest usr", overhead_guest_us, OVERHEAD_GUEST_US), HPP__COLOR_ACC_PRINT_FNS("Children", overhead_acc, OVERHEAD_ACC), HPP__PRINT_FNS("Samples", samples, SAMPLES), - HPP__PRINT_FNS("Period", period, PERIOD) + HPP__PRINT_FNS("Period", period, PERIOD), + HPP__PRINT_FNS("Weight1", weight1, WEIGHT1), + HPP__PRINT_FNS("Weight2", weight2, WEIGHT2), + HPP__PRINT_FNS("Weight3", weight3, WEIGHT3), }; struct perf_hpp_list perf_hpp_list = { @@ -526,6 +573,7 @@ struct perf_hpp_list perf_hpp_list = { #undef HPP_PERCENT_FNS #undef HPP_PERCENT_ACC_FNS #undef HPP_RAW_FNS +#undef HPP_AVERAGE_FNS #undef __HPP_HEADER_FN #undef __HPP_WIDTH_FN @@ -534,9 +582,11 @@ struct perf_hpp_list perf_hpp_list = { #undef __HPP_COLOR_ACC_PERCENT_FN #undef __HPP_ENTRY_ACC_PERCENT_FN #undef __HPP_ENTRY_RAW_FN +#undef __HPP_ENTRY_AVERAGE_FN #undef __HPP_SORT_FN #undef __HPP_SORT_ACC_FN #undef 
__HPP_SORT_RAW_FN +#undef __HPP_SORT_AVERAGE_FN static void fmt_free(struct perf_hpp_fmt *fmt) { @@ -785,6 +835,12 @@ void perf_hpp__reset_width(struct perf_hpp_fmt *fmt, struct hists *hists) fmt->len = 12; break; + case PERF_HPP__WEIGHT1: + case PERF_HPP__WEIGHT2: + case PERF_HPP__WEIGHT3: + fmt->len = 8; + break; + default: break; } diff --git a/tools/perf/util/Build b/tools/perf/util/Build index e0a723e245..da64efd871 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -12,6 +12,7 @@ perf-y += config.o perf-y += copyfile.o perf-y += ctype.o perf-y += db-export.o +perf-y += disasm.o perf-y += env.o perf-y += event.o perf-y += evlist.o @@ -140,6 +141,7 @@ perf-y += term.o perf-y += help-unknown-cmd.o perf-y += dlfilter.o perf-y += mem-events.o +perf-y += mem-info.o perf-y += vsprintf.o perf-y += units.o perf-y += time-utils.o @@ -388,3 +390,17 @@ $(OUTPUT)util/vsprintf.o: ../lib/vsprintf.c FORCE $(OUTPUT)util/list_sort.o: ../lib/list_sort.c FORCE $(call rule_mkdir) $(call if_changed_dep,cc_o_c) + +ifdef SHELLCHECK + SHELL_TESTS := generate-cmdlist.sh + TEST_LOGS := $(SHELL_TESTS:%=%.shellcheck_log) +else + SHELL_TESTS := + TEST_LOGS := +endif + +$(OUTPUT)%.shellcheck_log: % + $(call rule_mkdir) + $(Q)$(call echo-cmd,test)shellcheck -a -S warning "$<" > $@ || (cat $@ && rm $@ && false) + +perf-y += $(TEST_LOGS) diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 30c4d19fcf..965da6c0b5 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "annotate.h" #include "annotate-data.h" @@ -19,9 +20,202 @@ #include "evlist.h" #include "map.h" #include "map_symbol.h" +#include "sort.h" #include "strbuf.h" #include "symbol.h" #include "symbol_conf.h" +#include "thread.h" + +/* register number of the stack pointer */ +#define X86_REG_SP 7 + +static void delete_var_types(struct die_var_type *var_types); + +enum type_state_kind { + 
TSR_KIND_INVALID = 0, + TSR_KIND_TYPE, + TSR_KIND_PERCPU_BASE, + TSR_KIND_CONST, + TSR_KIND_POINTER, + TSR_KIND_CANARY, +}; + +#define pr_debug_dtp(fmt, ...) \ +do { \ + if (debug_type_profile) \ + pr_info(fmt, ##__VA_ARGS__); \ + else \ + pr_debug3(fmt, ##__VA_ARGS__); \ +} while (0) + +static void pr_debug_type_name(Dwarf_Die *die, enum type_state_kind kind) +{ + struct strbuf sb; + char *str; + Dwarf_Word size = 0; + + if (!debug_type_profile && verbose < 3) + return; + + switch (kind) { + case TSR_KIND_INVALID: + pr_info("\n"); + return; + case TSR_KIND_PERCPU_BASE: + pr_info(" percpu base\n"); + return; + case TSR_KIND_CONST: + pr_info(" constant\n"); + return; + case TSR_KIND_POINTER: + pr_info(" pointer"); + /* it also prints the type info */ + break; + case TSR_KIND_CANARY: + pr_info(" stack canary\n"); + return; + case TSR_KIND_TYPE: + default: + break; + } + + dwarf_aggregate_size(die, &size); + + strbuf_init(&sb, 32); + die_get_typename_from_type(die, &sb); + str = strbuf_detach(&sb, NULL); + pr_info(" type='%s' size=%#lx (die:%#lx)\n", + str, (long)size, (long)dwarf_dieoffset(die)); + free(str); +} + +static void pr_debug_location(Dwarf_Die *die, u64 pc, int reg) +{ + ptrdiff_t off = 0; + Dwarf_Attribute attr; + Dwarf_Addr base, start, end; + Dwarf_Op *ops; + size_t nops; + + if (!debug_type_profile && verbose < 3) + return; + + if (dwarf_attr(die, DW_AT_location, &attr) == NULL) + return; + + while ((off = dwarf_getlocations(&attr, off, &base, &start, &end, &ops, &nops)) > 0) { + if (reg != DWARF_REG_PC && end < pc) + continue; + if (reg != DWARF_REG_PC && start > pc) + break; + + pr_info(" variable location: "); + switch (ops->atom) { + case DW_OP_reg0 ...DW_OP_reg31: + pr_info("reg%d\n", ops->atom - DW_OP_reg0); + break; + case DW_OP_breg0 ...DW_OP_breg31: + pr_info("base=reg%d, offset=%#lx\n", + ops->atom - DW_OP_breg0, (long)ops->number); + break; + case DW_OP_regx: + pr_info("reg%ld\n", (long)ops->number); + break; + case DW_OP_bregx: + 
pr_info("base=reg%ld, offset=%#lx\n", + (long)ops->number, (long)ops->number2); + break; + case DW_OP_fbreg: + pr_info("use frame base, offset=%#lx\n", (long)ops->number); + break; + case DW_OP_addr: + pr_info("address=%#lx\n", (long)ops->number); + break; + default: + pr_info("unknown: code=%#x, number=%#lx\n", + ops->atom, (long)ops->number); + break; + } + break; + } +} + +/* + * Type information in a register, valid when @ok is true. + * The @caller_saved registers are invalidated after a function call. + */ +struct type_state_reg { + Dwarf_Die type; + u32 imm_value; + bool ok; + bool caller_saved; + u8 kind; +}; + +/* Type information in a stack location, dynamically allocated */ +struct type_state_stack { + struct list_head list; + Dwarf_Die type; + int offset; + int size; + bool compound; + u8 kind; +}; + +/* FIXME: This should be arch-dependent */ +#define TYPE_STATE_MAX_REGS 16 + +/* + * State table to maintain type info in each register and stack location. + * It'll be updated when new variable is allocated or type info is moved + * to a new location (register or stack). As it'd be used with the + * shortest path of basic blocks, it only maintains a single table. 
+ */ +struct type_state { + /* state of general purpose registers */ + struct type_state_reg regs[TYPE_STATE_MAX_REGS]; + /* state of stack location */ + struct list_head stack_vars; + /* return value register */ + int ret_reg; + /* stack pointer register */ + int stack_reg; +}; + +static bool has_reg_type(struct type_state *state, int reg) +{ + return (unsigned)reg < ARRAY_SIZE(state->regs); +} + +static void init_type_state(struct type_state *state, struct arch *arch) +{ + memset(state, 0, sizeof(*state)); + INIT_LIST_HEAD(&state->stack_vars); + + if (arch__is(arch, "x86")) { + state->regs[0].caller_saved = true; + state->regs[1].caller_saved = true; + state->regs[2].caller_saved = true; + state->regs[4].caller_saved = true; + state->regs[5].caller_saved = true; + state->regs[8].caller_saved = true; + state->regs[9].caller_saved = true; + state->regs[10].caller_saved = true; + state->regs[11].caller_saved = true; + state->ret_reg = 0; + state->stack_reg = X86_REG_SP; + } +} + +static void exit_type_state(struct type_state *state) +{ + struct type_state_stack *stack, *tmp; + + list_for_each_entry_safe(stack, tmp, &state->stack_vars, list) { + list_del(&stack->list); + free(stack); + } +} /* * Compare type name and size to maintain them in a tree. 
@@ -118,8 +312,8 @@ static void delete_members(struct annotated_member *member) list_for_each_entry_safe(child, tmp, &member->children, node) { list_del(&child->node); delete_members(child); - free(child->type_name); - free(child->var_name); + zfree(&child->type_name); + zfree(&child->var_name); free(child); } } @@ -143,7 +337,7 @@ static struct annotated_data_type *dso__findnew_data_type(struct dso *dso, /* Check existing nodes in dso->data_types tree */ key.self.type_name = type_name; key.self.size = size; - node = rb_find(&key, &dso->data_types, data_type_cmp); + node = rb_find(&key, dso__data_types(dso), data_type_cmp); if (node) { result = rb_entry(node, struct annotated_data_type, node); free(type_name); @@ -164,7 +358,7 @@ static struct annotated_data_type *dso__findnew_data_type(struct dso *dso, if (symbol_conf.annotate_data_member) add_member_types(result, type_die); - rb_add(&result->node, &dso->data_types, data_type_less); + rb_add(&result->node, dso__data_types(dso), data_type_less); return result; } @@ -194,14 +388,22 @@ static bool find_cu_die(struct debuginfo *di, u64 pc, Dwarf_Die *cu_die) } /* The type info will be saved in @type_die */ -static int check_variable(Dwarf_Die *var_die, Dwarf_Die *type_die, int offset, - bool is_pointer) +static int check_variable(struct data_loc_info *dloc, Dwarf_Die *var_die, + Dwarf_Die *type_die, int reg, int offset, bool is_fbreg) { Dwarf_Word size; + bool is_pointer = true; + + if (reg == DWARF_REG_PC) + is_pointer = false; + else if (reg == dloc->fbreg || is_fbreg) + is_pointer = false; + else if (arch__is(dloc->arch, "x86") && reg == X86_REG_SP) + is_pointer = false; /* Get the type of the variable */ if (die_get_real_type(var_die, type_die) == NULL) { - pr_debug("variable has no type\n"); + pr_debug_dtp("variable has no type\n"); ann_data_stat.no_typeinfo++; return -1; } @@ -215,7 +417,7 @@ static int check_variable(Dwarf_Die *var_die, Dwarf_Die *type_die, int offset, if ((dwarf_tag(type_die) != 
DW_TAG_pointer_type && dwarf_tag(type_die) != DW_TAG_array_type) || die_get_real_type(type_die, type_die) == NULL) { - pr_debug("no pointer or no type\n"); + pr_debug_dtp("no pointer or no type\n"); ann_data_stat.no_typeinfo++; return -1; } @@ -223,14 +425,15 @@ static int check_variable(Dwarf_Die *var_die, Dwarf_Die *type_die, int offset, /* Get the size of the actual type */ if (dwarf_aggregate_size(type_die, &size) < 0) { - pr_debug("type size is unknown\n"); + pr_debug_dtp("type size is unknown\n"); ann_data_stat.invalid_size++; return -1; } /* Minimal sanity check */ if ((unsigned)offset >= size) { - pr_debug("offset: %d is bigger than size: %" PRIu64 "\n", offset, size); + pr_debug_dtp("offset: %d is bigger than size: %"PRIu64"\n", + offset, size); ann_data_stat.bad_offset++; return -1; } @@ -238,23 +441,1191 @@ static int check_variable(Dwarf_Die *var_die, Dwarf_Die *type_die, int offset, return 0; } +static struct type_state_stack *find_stack_state(struct type_state *state, + int offset) +{ + struct type_state_stack *stack; + + list_for_each_entry(stack, &state->stack_vars, list) { + if (offset == stack->offset) + return stack; + + if (stack->compound && stack->offset < offset && + offset < stack->offset + stack->size) + return stack; + } + return NULL; +} + +static void set_stack_state(struct type_state_stack *stack, int offset, u8 kind, + Dwarf_Die *type_die) +{ + int tag; + Dwarf_Word size; + + if (dwarf_aggregate_size(type_die, &size) < 0) + size = 0; + + tag = dwarf_tag(type_die); + + stack->type = *type_die; + stack->size = size; + stack->offset = offset; + stack->kind = kind; + + switch (tag) { + case DW_TAG_structure_type: + case DW_TAG_union_type: + stack->compound = (kind != TSR_KIND_POINTER); + break; + default: + stack->compound = false; + break; + } +} + +static struct type_state_stack *findnew_stack_state(struct type_state *state, + int offset, u8 kind, + Dwarf_Die *type_die) +{ + struct type_state_stack *stack = find_stack_state(state, 
offset); + + if (stack) { + set_stack_state(stack, offset, kind, type_die); + return stack; + } + + stack = malloc(sizeof(*stack)); + if (stack) { + set_stack_state(stack, offset, kind, type_die); + list_add(&stack->list, &state->stack_vars); + } + return stack; +} + +/* Maintain a cache for quick global variable lookup */ +struct global_var_entry { + struct rb_node node; + char *name; + u64 start; + u64 end; + u64 die_offset; +}; + +static int global_var_cmp(const void *_key, const struct rb_node *node) +{ + const u64 addr = (uintptr_t)_key; + struct global_var_entry *gvar; + + gvar = rb_entry(node, struct global_var_entry, node); + + if (gvar->start <= addr && addr < gvar->end) + return 0; + return gvar->start > addr ? -1 : 1; +} + +static bool global_var_less(struct rb_node *node_a, const struct rb_node *node_b) +{ + struct global_var_entry *gvar_a, *gvar_b; + + gvar_a = rb_entry(node_a, struct global_var_entry, node); + gvar_b = rb_entry(node_b, struct global_var_entry, node); + + return gvar_a->start < gvar_b->start; +} + +static struct global_var_entry *global_var__find(struct data_loc_info *dloc, u64 addr) +{ + struct dso *dso = map__dso(dloc->ms->map); + struct rb_node *node; + + node = rb_find((void *)(uintptr_t)addr, dso__global_vars(dso), global_var_cmp); + if (node == NULL) + return NULL; + + return rb_entry(node, struct global_var_entry, node); +} + +static bool global_var__add(struct data_loc_info *dloc, u64 addr, + const char *name, Dwarf_Die *type_die) +{ + struct dso *dso = map__dso(dloc->ms->map); + struct global_var_entry *gvar; + Dwarf_Word size; + + if (dwarf_aggregate_size(type_die, &size) < 0) + return false; + + gvar = malloc(sizeof(*gvar)); + if (gvar == NULL) + return false; + + gvar->name = name ? 
strdup(name) : NULL; + if (name && gvar->name == NULL) { + free(gvar); + return false; + } + + gvar->start = addr; + gvar->end = addr + size; + gvar->die_offset = dwarf_dieoffset(type_die); + + rb_add(&gvar->node, dso__global_vars(dso), global_var_less); + return true; +} + +void global_var_type__tree_delete(struct rb_root *root) +{ + struct global_var_entry *gvar; + + while (!RB_EMPTY_ROOT(root)) { + struct rb_node *node = rb_first(root); + + rb_erase(node, root); + gvar = rb_entry(node, struct global_var_entry, node); + zfree(&gvar->name); + free(gvar); + } +} + +static bool get_global_var_info(struct data_loc_info *dloc, u64 addr, + const char **var_name, int *var_offset) +{ + struct addr_location al; + struct symbol *sym; + u64 mem_addr; + + /* Kernel symbols might be relocated */ + mem_addr = addr + map__reloc(dloc->ms->map); + + addr_location__init(&al); + sym = thread__find_symbol_fb(dloc->thread, dloc->cpumode, + mem_addr, &al); + if (sym) { + *var_name = sym->name; + /* Calculate type offset from the start of variable */ + *var_offset = mem_addr - map__unmap_ip(al.map, sym->start); + } else { + *var_name = NULL; + } + addr_location__exit(&al); + if (*var_name == NULL) + return false; + + return true; +} + +static void global_var__collect(struct data_loc_info *dloc) +{ + Dwarf *dwarf = dloc->di->dbg; + Dwarf_Off off, next_off; + Dwarf_Die cu_die, type_die; + size_t header_size; + + /* Iterate all CU and collect global variables that have no location in a register. 
*/ + off = 0; + while (dwarf_nextcu(dwarf, off, &next_off, &header_size, + NULL, NULL, NULL) == 0) { + struct die_var_type *var_types = NULL; + struct die_var_type *pos; + + if (dwarf_offdie(dwarf, off + header_size, &cu_die) == NULL) { + off = next_off; + continue; + } + + die_collect_global_vars(&cu_die, &var_types); + + for (pos = var_types; pos; pos = pos->next) { + const char *var_name = NULL; + int var_offset = 0; + + if (pos->reg != -1) + continue; + + if (!dwarf_offdie(dwarf, pos->die_off, &type_die)) + continue; + + if (!get_global_var_info(dloc, pos->addr, &var_name, + &var_offset)) + continue; + + if (var_offset != 0) + continue; + + global_var__add(dloc, pos->addr, var_name, &type_die); + } + + delete_var_types(var_types); + + off = next_off; + } +} + +static bool get_global_var_type(Dwarf_Die *cu_die, struct data_loc_info *dloc, + u64 ip, u64 var_addr, int *var_offset, + Dwarf_Die *type_die) +{ + u64 pc; + int offset; + const char *var_name = NULL; + struct global_var_entry *gvar; + struct dso *dso = map__dso(dloc->ms->map); + Dwarf_Die var_die; + + if (RB_EMPTY_ROOT(dso__global_vars(dso))) + global_var__collect(dloc); + + gvar = global_var__find(dloc, var_addr); + if (gvar) { + if (!dwarf_offdie(dloc->di->dbg, gvar->die_offset, type_die)) + return false; + + *var_offset = var_addr - gvar->start; + return true; + } + + /* Try to get the variable by address first */ + if (die_find_variable_by_addr(cu_die, var_addr, &var_die, &offset) && + check_variable(dloc, &var_die, type_die, DWARF_REG_PC, offset, + /*is_fbreg=*/false) == 0) { + var_name = dwarf_diename(&var_die); + *var_offset = offset; + goto ok; + } + + if (!get_global_var_info(dloc, var_addr, &var_name, var_offset)) + return false; + + pc = map__rip_2objdump(dloc->ms->map, ip); + + /* Try to get the name of global variable */ + if (die_find_variable_at(cu_die, var_name, pc, &var_die) && + check_variable(dloc, &var_die, type_die, DWARF_REG_PC, *var_offset, + /*is_fbreg=*/false) == 0) + goto ok; + 
+ return false; + +ok: + /* The address should point to the start of the variable */ + global_var__add(dloc, var_addr - *var_offset, var_name, type_die); + return true; +} + +/** + * update_var_state - Update type state using given variables + * @state: type state table + * @dloc: data location info + * @addr: instruction address to match with variable + * @insn_offset: instruction offset (for debug) + * @var_types: list of variables with type info + * + * This function fills the @state table using @var_types info. Each variable + * is used only at the given location and updates an entry in the table. + */ +static void update_var_state(struct type_state *state, struct data_loc_info *dloc, + u64 addr, u64 insn_offset, struct die_var_type *var_types) +{ + Dwarf_Die mem_die; + struct die_var_type *var; + int fbreg = dloc->fbreg; + int fb_offset = 0; + + if (dloc->fb_cfa) { + if (die_get_cfa(dloc->di->dbg, addr, &fbreg, &fb_offset) < 0) + fbreg = -1; + } + + for (var = var_types; var != NULL; var = var->next) { + if (var->addr != addr) + continue; + /* Get the type DIE using the offset */ + if (!dwarf_offdie(dloc->di->dbg, var->die_off, &mem_die)) + continue; + + if (var->reg == DWARF_REG_FB) { + findnew_stack_state(state, var->offset, TSR_KIND_TYPE, + &mem_die); + + pr_debug_dtp("var [%"PRIx64"] -%#x(stack)", + insn_offset, -var->offset); + pr_debug_type_name(&mem_die, TSR_KIND_TYPE); + } else if (var->reg == fbreg) { + findnew_stack_state(state, var->offset - fb_offset, + TSR_KIND_TYPE, &mem_die); + + pr_debug_dtp("var [%"PRIx64"] -%#x(stack)", + insn_offset, -var->offset + fb_offset); + pr_debug_type_name(&mem_die, TSR_KIND_TYPE); + } else if (has_reg_type(state, var->reg) && var->offset == 0) { + struct type_state_reg *reg; + + reg = &state->regs[var->reg]; + reg->type = mem_die; + reg->kind = TSR_KIND_TYPE; + reg->ok = true; + + pr_debug_dtp("var [%"PRIx64"] reg%d", + insn_offset, var->reg); + pr_debug_type_name(&mem_die, TSR_KIND_TYPE); + } + } +} + +static void 
update_insn_state_x86(struct type_state *state, + struct data_loc_info *dloc, Dwarf_Die *cu_die, + struct disasm_line *dl) +{ + struct annotated_insn_loc loc; + struct annotated_op_loc *src = &loc.ops[INSN_OP_SOURCE]; + struct annotated_op_loc *dst = &loc.ops[INSN_OP_TARGET]; + struct type_state_reg *tsr; + Dwarf_Die type_die; + u32 insn_offset = dl->al.offset; + int fbreg = dloc->fbreg; + int fboff = 0; + + if (annotate_get_insn_location(dloc->arch, dl, &loc) < 0) + return; + + if (ins__is_call(&dl->ins)) { + struct symbol *func = dl->ops.target.sym; + + if (func == NULL) + return; + + /* __fentry__ will preserve all registers */ + if (!strcmp(func->name, "__fentry__")) + return; + + pr_debug_dtp("call [%x] %s\n", insn_offset, func->name); + + /* Otherwise invalidate caller-saved registers after call */ + for (unsigned i = 0; i < ARRAY_SIZE(state->regs); i++) { + if (state->regs[i].caller_saved) + state->regs[i].ok = false; + } + + /* Update register with the return type (if any) */ + if (die_find_func_rettype(cu_die, func->name, &type_die)) { + tsr = &state->regs[state->ret_reg]; + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->ok = true; + + pr_debug_dtp("call [%x] return -> reg%d", + insn_offset, state->ret_reg); + pr_debug_type_name(&type_die, tsr->kind); + } + return; + } + + if (!strncmp(dl->ins.name, "add", 3)) { + u64 imm_value = -1ULL; + int offset; + const char *var_name = NULL; + struct map_symbol *ms = dloc->ms; + u64 ip = ms->sym->start + dl->al.offset; + + if (!has_reg_type(state, dst->reg1)) + return; + + tsr = &state->regs[dst->reg1]; + + if (src->imm) + imm_value = src->offset; + else if (has_reg_type(state, src->reg1) && + state->regs[src->reg1].kind == TSR_KIND_CONST) + imm_value = state->regs[src->reg1].imm_value; + else if (src->reg1 == DWARF_REG_PC) { + u64 var_addr = annotate_calc_pcrel(dloc->ms, ip, + src->offset, dl); + + if (get_global_var_info(dloc, var_addr, + &var_name, &offset) && + !strcmp(var_name, "this_cpu_off") && + 
tsr->kind == TSR_KIND_CONST) { + tsr->kind = TSR_KIND_PERCPU_BASE; + imm_value = tsr->imm_value; + } + } + else + return; + + if (tsr->kind != TSR_KIND_PERCPU_BASE) + return; + + if (get_global_var_type(cu_die, dloc, ip, imm_value, &offset, + &type_die) && offset == 0) { + /* + * This is not a pointer type, but it should be treated + * as a pointer. + */ + tsr->type = type_die; + tsr->kind = TSR_KIND_POINTER; + tsr->ok = true; + + pr_debug_dtp("add [%x] percpu %#"PRIx64" -> reg%d", + insn_offset, imm_value, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + } + return; + } + + if (strncmp(dl->ins.name, "mov", 3)) + return; + + if (dloc->fb_cfa) { + u64 ip = dloc->ms->sym->start + dl->al.offset; + u64 pc = map__rip_2objdump(dloc->ms->map, ip); + + if (die_get_cfa(dloc->di->dbg, pc, &fbreg, &fboff) < 0) + fbreg = -1; + } + + /* Case 1. register to register or segment:offset to register transfers */ + if (!src->mem_ref && !dst->mem_ref) { + if (!has_reg_type(state, dst->reg1)) + return; + + tsr = &state->regs[dst->reg1]; + if (dso__kernel(map__dso(dloc->ms->map)) && + src->segment == INSN_SEG_X86_GS && src->imm) { + u64 ip = dloc->ms->sym->start + dl->al.offset; + u64 var_addr; + int offset; + + /* + * In kernel, %gs points to a per-cpu region for the + * current CPU. Access with a constant offset should + * be treated as a global variable access. 
+ */ + var_addr = src->offset; + + if (var_addr == 40) { + tsr->kind = TSR_KIND_CANARY; + tsr->ok = true; + + pr_debug_dtp("mov [%x] stack canary -> reg%d\n", + insn_offset, dst->reg1); + return; + } + + if (!get_global_var_type(cu_die, dloc, ip, var_addr, + &offset, &type_die) || + !die_get_member_type(&type_die, offset, &type_die)) { + tsr->ok = false; + return; + } + + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->ok = true; + + pr_debug_dtp("mov [%x] this-cpu addr=%#"PRIx64" -> reg%d", + insn_offset, var_addr, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + return; + } + + if (src->imm) { + tsr->kind = TSR_KIND_CONST; + tsr->imm_value = src->offset; + tsr->ok = true; + + pr_debug_dtp("mov [%x] imm=%#x -> reg%d\n", + insn_offset, tsr->imm_value, dst->reg1); + return; + } + + if (!has_reg_type(state, src->reg1) || + !state->regs[src->reg1].ok) { + tsr->ok = false; + return; + } + + tsr->type = state->regs[src->reg1].type; + tsr->kind = state->regs[src->reg1].kind; + tsr->ok = true; + + pr_debug_dtp("mov [%x] reg%d -> reg%d", + insn_offset, src->reg1, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + } + /* Case 2. 
memory to register transers */ + if (src->mem_ref && !dst->mem_ref) { + int sreg = src->reg1; + + if (!has_reg_type(state, dst->reg1)) + return; + + tsr = &state->regs[dst->reg1]; + +retry: + /* Check stack variables with offset */ + if (sreg == fbreg) { + struct type_state_stack *stack; + int offset = src->offset - fboff; + + stack = find_stack_state(state, offset); + if (stack == NULL) { + tsr->ok = false; + return; + } else if (!stack->compound) { + tsr->type = stack->type; + tsr->kind = stack->kind; + tsr->ok = true; + } else if (die_get_member_type(&stack->type, + offset - stack->offset, + &type_die)) { + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->ok = true; + } else { + tsr->ok = false; + return; + } + + pr_debug_dtp("mov [%x] -%#x(stack) -> reg%d", + insn_offset, -offset, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + } + /* And then dereference the pointer if it has one */ + else if (has_reg_type(state, sreg) && state->regs[sreg].ok && + state->regs[sreg].kind == TSR_KIND_TYPE && + die_deref_ptr_type(&state->regs[sreg].type, + src->offset, &type_die)) { + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->ok = true; + + pr_debug_dtp("mov [%x] %#x(reg%d) -> reg%d", + insn_offset, src->offset, sreg, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + } + /* Or check if it's a global variable */ + else if (sreg == DWARF_REG_PC) { + struct map_symbol *ms = dloc->ms; + u64 ip = ms->sym->start + dl->al.offset; + u64 addr; + int offset; + + addr = annotate_calc_pcrel(ms, ip, src->offset, dl); + + if (!get_global_var_type(cu_die, dloc, ip, addr, &offset, + &type_die) || + !die_get_member_type(&type_die, offset, &type_die)) { + tsr->ok = false; + return; + } + + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->ok = true; + + pr_debug_dtp("mov [%x] global addr=%"PRIx64" -> reg%d", + insn_offset, addr, dst->reg1); + pr_debug_type_name(&type_die, tsr->kind); + } + /* And check percpu access with base register */ + else 
if (has_reg_type(state, sreg) && + state->regs[sreg].kind == TSR_KIND_PERCPU_BASE) { + u64 ip = dloc->ms->sym->start + dl->al.offset; + u64 var_addr = src->offset; + int offset; + + if (src->multi_regs) { + int reg2 = (sreg == src->reg1) ? src->reg2 : src->reg1; + + if (has_reg_type(state, reg2) && state->regs[reg2].ok && + state->regs[reg2].kind == TSR_KIND_CONST) + var_addr += state->regs[reg2].imm_value; + } + + /* + * In kernel, %gs points to a per-cpu region for the + * current CPU. Access with a constant offset should + * be treated as a global variable access. + */ + if (get_global_var_type(cu_die, dloc, ip, var_addr, + &offset, &type_die) && + die_get_member_type(&type_die, offset, &type_die)) { + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->ok = true; + + if (src->multi_regs) { + pr_debug_dtp("mov [%x] percpu %#x(reg%d,reg%d) -> reg%d", + insn_offset, src->offset, src->reg1, + src->reg2, dst->reg1); + } else { + pr_debug_dtp("mov [%x] percpu %#x(reg%d) -> reg%d", + insn_offset, src->offset, sreg, dst->reg1); + } + pr_debug_type_name(&tsr->type, tsr->kind); + } else { + tsr->ok = false; + } + } + /* And then dereference the calculated pointer if it has one */ + else if (has_reg_type(state, sreg) && state->regs[sreg].ok && + state->regs[sreg].kind == TSR_KIND_POINTER && + die_get_member_type(&state->regs[sreg].type, + src->offset, &type_die)) { + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->ok = true; + + pr_debug_dtp("mov [%x] pointer %#x(reg%d) -> reg%d", + insn_offset, src->offset, sreg, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + } + /* Or try another register if any */ + else if (src->multi_regs && sreg == src->reg1 && + src->reg1 != src->reg2) { + sreg = src->reg2; + goto retry; + } + else { + int offset; + const char *var_name = NULL; + + /* it might be per-cpu variable (in kernel) access */ + if (src->offset < 0) { + if (get_global_var_info(dloc, (s64)src->offset, + &var_name, &offset) && + !strcmp(var_name, 
"__per_cpu_offset")) { + tsr->kind = TSR_KIND_PERCPU_BASE; + + pr_debug_dtp("mov [%x] percpu base reg%d\n", + insn_offset, dst->reg1); + } + } + + tsr->ok = false; + } + } + /* Case 3. register to memory transfers */ + if (!src->mem_ref && dst->mem_ref) { + if (!has_reg_type(state, src->reg1) || + !state->regs[src->reg1].ok) + return; + + /* Check stack variables with offset */ + if (dst->reg1 == fbreg) { + struct type_state_stack *stack; + int offset = dst->offset - fboff; + + tsr = &state->regs[src->reg1]; + + stack = find_stack_state(state, offset); + if (stack) { + /* + * The source register is likely to hold a type + * of member if it's a compound type. Do not + * update the stack variable type since we can + * get the member type later by using the + * die_get_member_type(). + */ + if (!stack->compound) + set_stack_state(stack, offset, tsr->kind, + &tsr->type); + } else { + findnew_stack_state(state, offset, tsr->kind, + &tsr->type); + } + + pr_debug_dtp("mov [%x] reg%d -> -%#x(stack)", + insn_offset, src->reg1, -offset); + pr_debug_type_name(&tsr->type, tsr->kind); + } + /* + * Ignore other transfers since it'd set a value in a struct + * and won't change the type. + */ + } + /* Case 4. memory to memory transfers (not handled for now) */ +} + +/** + * update_insn_state - Update type state for an instruction + * @state: type state table + * @dloc: data location info + * @cu_die: compile unit debug entry + * @dl: disasm line for the instruction + * + * This function updates the @state table for the target operand of the + * instruction at @dl if it transfers the type like MOV on x86. Since it + * tracks the type, it won't care about the values like in arithmetic + * instructions like ADD/SUB/MUL/DIV and INC/DEC. + * + * Note that ops->reg2 is only available when both mem_ref and multi_regs + * are true. 
+ */ +static void update_insn_state(struct type_state *state, struct data_loc_info *dloc, + Dwarf_Die *cu_die, struct disasm_line *dl) +{ + if (arch__is(dloc->arch, "x86")) + update_insn_state_x86(state, dloc, cu_die, dl); +} + +/* + * Prepend this_blocks (from the outer scope) to full_blocks, removing + * duplicate disasm line. + */ +static void prepend_basic_blocks(struct list_head *this_blocks, + struct list_head *full_blocks) +{ + struct annotated_basic_block *first_bb, *last_bb; + + last_bb = list_last_entry(this_blocks, typeof(*last_bb), list); + first_bb = list_first_entry(full_blocks, typeof(*first_bb), list); + + if (list_empty(full_blocks)) + goto out; + + /* Last insn in this_blocks should be same as first insn in full_blocks */ + if (last_bb->end != first_bb->begin) { + pr_debug("prepend basic blocks: mismatched disasm line %"PRIx64" -> %"PRIx64"\n", + last_bb->end->al.offset, first_bb->begin->al.offset); + goto out; + } + + /* Is the basic block have only one disasm_line? */ + if (last_bb->begin == last_bb->end) { + list_del(&last_bb->list); + free(last_bb); + goto out; + } + + /* Point to the insn before the last when adding this block to full_blocks */ + last_bb->end = list_prev_entry(last_bb->end, al.node); + +out: + list_splice(this_blocks, full_blocks); +} + +static void delete_basic_blocks(struct list_head *basic_blocks) +{ + struct annotated_basic_block *bb, *tmp; + + list_for_each_entry_safe(bb, tmp, basic_blocks, list) { + list_del(&bb->list); + free(bb); + } +} + +/* Make sure all variables have a valid start address */ +static void fixup_var_address(struct die_var_type *var_types, u64 addr) +{ + while (var_types) { + /* + * Some variables have no address range meaning it's always + * available in the whole scope. Let's adjust the start + * address to the start of the scope. 
+ */ + if (var_types->addr == 0) + var_types->addr = addr; + + var_types = var_types->next; + } +} + +static void delete_var_types(struct die_var_type *var_types) +{ + while (var_types) { + struct die_var_type *next = var_types->next; + + free(var_types); + var_types = next; + } +} + +/* should match to is_stack_canary() in util/annotate.c */ +static void setup_stack_canary(struct data_loc_info *dloc) +{ + if (arch__is(dloc->arch, "x86")) { + dloc->op->segment = INSN_SEG_X86_GS; + dloc->op->imm = true; + dloc->op->offset = 40; + } +} + +/* + * It's at the target address, check if it has a matching type. + * It returns 1 if found, 0 if not or -1 if not found but no need to + * repeat the search. The last case is for per-cpu variables which + * are similar to global variables and no additional info is needed. + */ +static int check_matching_type(struct type_state *state, + struct data_loc_info *dloc, + Dwarf_Die *cu_die, Dwarf_Die *type_die) +{ + Dwarf_Word size; + u32 insn_offset = dloc->ip - dloc->ms->sym->start; + int reg = dloc->op->reg1; + + pr_debug_dtp("chk [%x] reg%d offset=%#x ok=%d kind=%d", + insn_offset, reg, dloc->op->offset, + state->regs[reg].ok, state->regs[reg].kind); + + if (state->regs[reg].ok && state->regs[reg].kind == TSR_KIND_TYPE) { + int tag = dwarf_tag(&state->regs[reg].type); + + /* + * Normal registers should hold a pointer (or array) to + * dereference a memory location. 
+ */ + if (tag != DW_TAG_pointer_type && tag != DW_TAG_array_type) { + if (dloc->op->offset < 0 && reg != state->stack_reg) + goto check_kernel; + + pr_debug_dtp("\n"); + return -1; + } + + pr_debug_dtp("\n"); + + /* Remove the pointer and get the target type */ + if (die_get_real_type(&state->regs[reg].type, type_die) == NULL) + return -1; + + dloc->type_offset = dloc->op->offset; + + /* Get the size of the actual type */ + if (dwarf_aggregate_size(type_die, &size) < 0 || + (unsigned)dloc->type_offset >= size) + return -1; + + return 1; + } + + if (reg == dloc->fbreg) { + struct type_state_stack *stack; + + pr_debug_dtp(" fbreg\n"); + + stack = find_stack_state(state, dloc->type_offset); + if (stack == NULL) + return 0; + + if (stack->kind == TSR_KIND_CANARY) { + setup_stack_canary(dloc); + return -1; + } + + if (stack->kind != TSR_KIND_TYPE) + return 0; + + *type_die = stack->type; + /* Update the type offset from the start of slot */ + dloc->type_offset -= stack->offset; + + return 1; + } + + if (dloc->fb_cfa) { + struct type_state_stack *stack; + u64 pc = map__rip_2objdump(dloc->ms->map, dloc->ip); + int fbreg, fboff; + + pr_debug_dtp(" cfa\n"); + + if (die_get_cfa(dloc->di->dbg, pc, &fbreg, &fboff) < 0) + fbreg = -1; + + if (reg != fbreg) + return 0; + + stack = find_stack_state(state, dloc->type_offset - fboff); + if (stack == NULL) + return 0; + + if (stack->kind == TSR_KIND_CANARY) { + setup_stack_canary(dloc); + return -1; + } + + if (stack->kind != TSR_KIND_TYPE) + return 0; + + *type_die = stack->type; + /* Update the type offset from the start of slot */ + dloc->type_offset -= fboff + stack->offset; + + return 1; + } + + if (state->regs[reg].kind == TSR_KIND_PERCPU_BASE) { + u64 var_addr = dloc->op->offset; + int var_offset; + + pr_debug_dtp(" percpu var\n"); + + if (dloc->op->multi_regs) { + int reg2 = dloc->op->reg2; + + if (dloc->op->reg2 == reg) + reg2 = dloc->op->reg1; + + if (has_reg_type(state, reg2) && state->regs[reg2].ok && + 
state->regs[reg2].kind == TSR_KIND_CONST) + var_addr += state->regs[reg2].imm_value; + } + + if (get_global_var_type(cu_die, dloc, dloc->ip, var_addr, + &var_offset, type_die)) { + dloc->type_offset = var_offset; + return 1; + } + /* No need to retry per-cpu (global) variables */ + return -1; + } + + if (state->regs[reg].ok && state->regs[reg].kind == TSR_KIND_POINTER) { + pr_debug_dtp(" percpu ptr\n"); + + /* + * It's actaully pointer but the address was calculated using + * some arithmetic. So it points to the actual type already. + */ + *type_die = state->regs[reg].type; + + dloc->type_offset = dloc->op->offset; + + /* Get the size of the actual type */ + if (dwarf_aggregate_size(type_die, &size) < 0 || + (unsigned)dloc->type_offset >= size) + return -1; + + return 1; + } + + if (state->regs[reg].ok && state->regs[reg].kind == TSR_KIND_CANARY) { + pr_debug_dtp(" stack canary\n"); + + /* + * This is a saved value of the stack canary which will be handled + * in the outer logic when it returns failure here. Pretend it's + * from the stack canary directly. 
+ */ + setup_stack_canary(dloc); + + return -1; + } + +check_kernel: + if (dso__kernel(map__dso(dloc->ms->map))) { + u64 addr; + int offset; + + /* Direct this-cpu access like "%gs:0x34740" */ + if (dloc->op->segment == INSN_SEG_X86_GS && dloc->op->imm && + arch__is(dloc->arch, "x86")) { + pr_debug_dtp(" this-cpu var\n"); + + addr = dloc->op->offset; + + if (get_global_var_type(cu_die, dloc, dloc->ip, addr, + &offset, type_die)) { + dloc->type_offset = offset; + return 1; + } + return -1; + } + + /* Access to global variable like "-0x7dcf0500(,%rdx,8)" */ + if (dloc->op->offset < 0 && reg != state->stack_reg) { + addr = (s64) dloc->op->offset; + + if (get_global_var_type(cu_die, dloc, dloc->ip, addr, + &offset, type_die)) { + pr_debug_dtp(" global var\n"); + + dloc->type_offset = offset; + return 1; + } + pr_debug_dtp(" negative offset\n"); + return -1; + } + } + + pr_debug_dtp("\n"); + return 0; +} + +/* Iterate instructions in basic blocks and update type table */ +static int find_data_type_insn(struct data_loc_info *dloc, + struct list_head *basic_blocks, + struct die_var_type *var_types, + Dwarf_Die *cu_die, Dwarf_Die *type_die) +{ + struct type_state state; + struct symbol *sym = dloc->ms->sym; + struct annotation *notes = symbol__annotation(sym); + struct annotated_basic_block *bb; + int ret = 0; + + init_type_state(&state, dloc->arch); + + list_for_each_entry(bb, basic_blocks, list) { + struct disasm_line *dl = bb->begin; + + BUG_ON(bb->begin->al.offset == -1 || bb->end->al.offset == -1); + + pr_debug_dtp("bb: [%"PRIx64" - %"PRIx64"]\n", + bb->begin->al.offset, bb->end->al.offset); + + list_for_each_entry_from(dl, ¬es->src->source, al.node) { + u64 this_ip = sym->start + dl->al.offset; + u64 addr = map__rip_2objdump(dloc->ms->map, this_ip); + + /* Skip comment or debug info lines */ + if (dl->al.offset == -1) + continue; + + /* Update variable type at this address */ + update_var_state(&state, dloc, addr, dl->al.offset, var_types); + + if (this_ip == 
dloc->ip) { + ret = check_matching_type(&state, dloc, + cu_die, type_die); + goto out; + } + + /* Update type table after processing the instruction */ + update_insn_state(&state, dloc, cu_die, dl); + if (dl == bb->end) + break; + } + } + +out: + exit_type_state(&state); + return ret; +} + +/* + * Construct a list of basic blocks for each scope with variables and try to find + * the data type by updating a type state table through instructions. + */ +static int find_data_type_block(struct data_loc_info *dloc, + Dwarf_Die *cu_die, Dwarf_Die *scopes, + int nr_scopes, Dwarf_Die *type_die) +{ + LIST_HEAD(basic_blocks); + struct die_var_type *var_types = NULL; + u64 src_ip, dst_ip, prev_dst_ip; + int ret = -1; + + /* TODO: other architecture support */ + if (!arch__is(dloc->arch, "x86")) + return -1; + + prev_dst_ip = dst_ip = dloc->ip; + for (int i = nr_scopes - 1; i >= 0; i--) { + Dwarf_Addr base, start, end; + LIST_HEAD(this_blocks); + int found; + + if (dwarf_ranges(&scopes[i], 0, &base, &start, &end) < 0) + break; + + pr_debug_dtp("scope: [%d/%d] (die:%lx)\n", + i + 1, nr_scopes, (long)dwarf_dieoffset(&scopes[i])); + src_ip = map__objdump_2rip(dloc->ms->map, start); + +again: + /* Get basic blocks for this scope */ + if (annotate_get_basic_blocks(dloc->ms->sym, src_ip, dst_ip, + &this_blocks) < 0) { + /* Try previous block if they are not connected */ + if (prev_dst_ip != dst_ip) { + dst_ip = prev_dst_ip; + goto again; + } + + pr_debug_dtp("cannot find a basic block from %"PRIx64" to %"PRIx64"\n", + src_ip - dloc->ms->sym->start, + dst_ip - dloc->ms->sym->start); + continue; + } + prepend_basic_blocks(&this_blocks, &basic_blocks); + + /* Get variable info for this scope and add to var_types list */ + die_collect_vars(&scopes[i], &var_types); + fixup_var_address(var_types, start); + + /* Find from start of this scope to the target instruction */ + found = find_data_type_insn(dloc, &basic_blocks, var_types, + cu_die, type_die); + if (found > 0) { + char buf[64]; + + 
if (dloc->op->multi_regs) + snprintf(buf, sizeof(buf), "reg%d, reg%d", + dloc->op->reg1, dloc->op->reg2); + else + snprintf(buf, sizeof(buf), "reg%d", dloc->op->reg1); + + pr_debug_dtp("found by insn track: %#x(%s) type-offset=%#x\n", + dloc->op->offset, buf, dloc->type_offset); + pr_debug_type_name(type_die, TSR_KIND_TYPE); + ret = 0; + break; + } + + if (found < 0) + break; + + /* Go up to the next scope and find blocks to the start */ + prev_dst_ip = dst_ip; + dst_ip = src_ip; + } + + delete_basic_blocks(&basic_blocks); + delete_var_types(var_types); + return ret; +} + /* The result will be saved in @type_die */ -static int find_data_type_die(struct debuginfo *di, u64 pc, u64 addr, - const char *var_name, struct annotated_op_loc *loc, - Dwarf_Die *type_die) +static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die) { + struct annotated_op_loc *loc = dloc->op; Dwarf_Die cu_die, var_die; Dwarf_Die *scopes = NULL; int reg, offset; int ret = -1; int i, nr_scopes; int fbreg = -1; - bool is_fbreg = false; int fb_offset = 0; + bool is_fbreg = false; + u64 pc; + char buf[64]; + + if (dloc->op->multi_regs) + snprintf(buf, sizeof(buf), "reg%d, reg%d", dloc->op->reg1, dloc->op->reg2); + else if (dloc->op->reg1 == DWARF_REG_PC) + snprintf(buf, sizeof(buf), "PC"); + else + snprintf(buf, sizeof(buf), "reg%d", dloc->op->reg1); + + pr_debug_dtp("-----------------------------------------------------------\n"); + pr_debug_dtp("find data type for %#x(%s) at %s+%#"PRIx64"\n", + dloc->op->offset, buf, dloc->ms->sym->name, + dloc->ip - dloc->ms->sym->start); + + /* + * IP is a relative instruction address from the start of the map, as + * it can be randomized/relocated, it needs to translate to PC which is + * a file address for DWARF processing. 
+ */ + pc = map__rip_2objdump(dloc->ms->map, dloc->ip); /* Get a compile_unit for this address */ - if (!find_cu_die(di, pc, &cu_die)) { - pr_debug("cannot find CU for address %" PRIx64 "\n", pc); + if (!find_cu_die(dloc->di, pc, &cu_die)) { + pr_debug_dtp("cannot find CU for address %"PRIx64"\n", pc); ann_data_stat.no_cuinfo++; return -1; } @@ -262,19 +1633,18 @@ static int find_data_type_die(struct debuginfo *di, u64 pc, u64 addr, reg = loc->reg1; offset = loc->offset; - if (reg == DWARF_REG_PC) { - if (die_find_variable_by_addr(&cu_die, pc, addr, &var_die, &offset)) { - ret = check_variable(&var_die, type_die, offset, - /*is_pointer=*/false); - loc->offset = offset; - goto out; - } + pr_debug_dtp("CU for %s (die:%#lx)\n", + dwarf_diename(&cu_die), (long)dwarf_dieoffset(&cu_die)); - if (var_name && die_find_variable_at(&cu_die, var_name, pc, - &var_die)) { - ret = check_variable(&var_die, type_die, 0, - /*is_pointer=*/false); - /* loc->offset will be updated by the caller */ + if (reg == DWARF_REG_PC) { + if (get_global_var_type(&cu_die, dloc, dloc->ip, dloc->var_addr, + &offset, type_die)) { + dloc->type_offset = offset; + + pr_debug_dtp("found by addr=%#"PRIx64" type_offset=%#x\n", + dloc->var_addr, offset); + pr_debug_type_name(type_die, TSR_KIND_TYPE); + ret = 0; goto out; } } @@ -291,16 +1661,20 @@ static int find_data_type_die(struct debuginfo *di, u64 pc, u64 addr, dwarf_formblock(&attr, &block) == 0 && block.length == 1) { switch (*block.data) { case DW_OP_reg0 ... 
DW_OP_reg31: - fbreg = *block.data - DW_OP_reg0; + fbreg = dloc->fbreg = *block.data - DW_OP_reg0; break; case DW_OP_call_frame_cfa: - if (die_get_cfa(di->dbg, pc, &fbreg, + dloc->fb_cfa = true; + if (die_get_cfa(dloc->di->dbg, pc, &fbreg, &fb_offset) < 0) fbreg = -1; break; default: break; } + + pr_debug_dtp("frame base: cfa=%d fbreg=%d\n", + dloc->fb_cfa, fbreg); } } @@ -312,7 +1686,7 @@ retry: /* Search from the inner-most scope to the outer */ for (i = nr_scopes - 1; i >= 0; i--) { if (reg == DWARF_REG_PC) { - if (!die_find_variable_by_addr(&scopes[i], pc, addr, + if (!die_find_variable_by_addr(&scopes[i], dloc->var_addr, &var_die, &offset)) continue; } else { @@ -323,9 +1697,30 @@ retry: } /* Found a variable, see if it's correct */ - ret = check_variable(&var_die, type_die, offset, - reg != DWARF_REG_PC && !is_fbreg); - loc->offset = offset; + ret = check_variable(dloc, &var_die, type_die, reg, offset, is_fbreg); + if (ret == 0) { + pr_debug_dtp("found \"%s\" in scope=%d/%d (die: %#lx) ", + dwarf_diename(&var_die), i+1, nr_scopes, + (long)dwarf_dieoffset(&scopes[i])); + if (reg == DWARF_REG_PC) { + pr_debug_dtp("addr=%#"PRIx64" type_offset=%#x\n", + dloc->var_addr, offset); + } else if (reg == DWARF_REG_FB || is_fbreg) { + pr_debug_dtp("stack_offset=%#x type_offset=%#x\n", + fb_offset, offset); + } else { + pr_debug_dtp("type_offset=%#x\n", offset); + } + pr_debug_location(&var_die, pc, reg); + pr_debug_type_name(type_die, TSR_KIND_TYPE); + } else { + pr_debug_dtp("check variable \"%s\" failed (die: %#lx)\n", + dwarf_diename(&var_die), + (long)dwarf_dieoffset(&var_die)); + pr_debug_location(&var_die, pc, reg); + pr_debug_type_name(type_die, TSR_KIND_TYPE); + } + dloc->type_offset = offset; goto out; } @@ -334,8 +1729,19 @@ retry: goto retry; } - if (ret < 0) + if (reg != DWARF_REG_PC) { + ret = find_data_type_block(dloc, &cu_die, scopes, + nr_scopes, type_die); + if (ret == 0) { + ann_data_stat.insn_track++; + goto out; + } + } + + if (ret < 0) { + 
pr_debug_dtp("no variable found\n"); ann_data_stat.no_var++; + } out: free(scopes); @@ -344,50 +1750,45 @@ out: /** * find_data_type - Return a data type at the location - * @ms: map and symbol at the location - * @ip: instruction address of the memory access - * @loc: instruction operand location - * @addr: data address of the memory access - * @var_name: global variable name + * @dloc: data location * * This functions searches the debug information of the binary to get the data - * type it accesses. The exact location is expressed by (@ip, reg, offset) - * for pointer variables or (@ip, @addr) for global variables. Note that global - * variables might update the @loc->offset after finding the start of the variable. - * If it cannot find a global variable by address, it tried to fine a declaration - * of the variable using @var_name. In that case, @loc->offset won't be updated. + * type it accesses. The exact location is expressed by (ip, reg, offset) + * for pointer variables or (ip, addr) for global variables. Note that global + * variables might update the @dloc->type_offset after finding the start of the + * variable. If it cannot find a global variable by address, it tried to find + * a declaration of the variable using var_name. In that case, @dloc->offset + * won't be updated. * * It return %NULL if not found. 
*/ -struct annotated_data_type *find_data_type(struct map_symbol *ms, u64 ip, - struct annotated_op_loc *loc, u64 addr, - const char *var_name) +struct annotated_data_type *find_data_type(struct data_loc_info *dloc) { struct annotated_data_type *result = NULL; - struct dso *dso = map__dso(ms->map); - struct debuginfo *di; + struct dso *dso = map__dso(dloc->ms->map); Dwarf_Die type_die; - u64 pc; - di = debuginfo__new(dso->long_name); - if (di == NULL) { - pr_debug("cannot get the debug info\n"); + dloc->di = debuginfo__new(dso__long_name(dso)); + if (dloc->di == NULL) { + pr_debug_dtp("cannot get the debug info\n"); return NULL; } /* - * IP is a relative instruction address from the start of the map, as - * it can be randomized/relocated, it needs to translate to PC which is - * a file address for DWARF processing. + * The type offset is the same as instruction offset by default. + * But when finding a global variable, the offset won't be valid. */ - pc = map__rip_2objdump(ms->map, ip); - if (find_data_type_die(di, pc, addr, var_name, loc, &type_die) < 0) + dloc->type_offset = dloc->op->offset; + + dloc->fbreg = -1; + + if (find_data_type_die(dloc, &type_die) < 0) goto out; result = dso__findnew_data_type(dso, &type_die); out: - debuginfo__delete(di); + debuginfo__delete(dloc->di); return result; } @@ -399,7 +1800,6 @@ static int alloc_data_type_histograms(struct annotated_data_type *adt, int nr_en sz += sizeof(struct type_hist_entry) * adt->self.size; /* Allocate a table of pointers for each event */ - adt->nr_histograms = nr_entries; adt->histograms = calloc(nr_entries, sizeof(*adt->histograms)); if (adt->histograms == NULL) return -ENOMEM; @@ -413,20 +1813,24 @@ static int alloc_data_type_histograms(struct annotated_data_type *adt, int nr_en if (adt->histograms[i] == NULL) goto err; } + + adt->nr_histograms = nr_entries; return 0; err: while (--i >= 0) - free(adt->histograms[i]); - free(adt->histograms); + zfree(&(adt->histograms[i])); + zfree(&adt->histograms); 
return -ENOMEM; } static void delete_data_type_histograms(struct annotated_data_type *adt) { for (int i = 0; i < adt->nr_histograms; i++) - free(adt->histograms[i]); - free(adt->histograms); + zfree(&(adt->histograms[i])); + + zfree(&adt->histograms); + adt->nr_histograms = 0; } void annotated_data_type__tree_delete(struct rb_root *root) @@ -440,7 +1844,7 @@ void annotated_data_type__tree_delete(struct rb_root *root) pos = rb_entry(node, struct annotated_data_type, node); delete_members(&pos->self); delete_data_type_histograms(pos); - free(pos->self.type_name); + zfree(&pos->self.type_name); free(pos); } } @@ -484,3 +1888,115 @@ int annotated_data_type__update_samples(struct annotated_data_type *adt, h->addr[offset].period += period; return 0; } + +static void print_annotated_data_header(struct hist_entry *he, struct evsel *evsel) +{ + struct dso *dso = map__dso(he->ms.map); + int nr_members = 1; + int nr_samples = he->stat.nr_events; + int width = 7; + const char *val_hdr = "Percent"; + + if (evsel__is_group_event(evsel)) { + struct hist_entry *pair; + + list_for_each_entry(pair, &he->pairs.head, pairs.node) + nr_samples += pair->stat.nr_events; + } + + printf("Annotate type: '%s' in %s (%d samples):\n", + he->mem_type->self.type_name, dso__name(dso), nr_samples); + + if (evsel__is_group_event(evsel)) { + struct evsel *pos; + int i = 0; + + for_each_group_evsel(pos, evsel) + printf(" event[%d] = %s\n", i++, pos->name); + + nr_members = evsel->core.nr_members; + } + + if (symbol_conf.show_total_period) { + width = 11; + val_hdr = "Period"; + } else if (symbol_conf.show_nr_samples) { + width = 7; + val_hdr = "Samples"; + } + + printf("============================================================================\n"); + printf("%*s %10s %10s %s\n", (width + 1) * nr_members, val_hdr, + "offset", "size", "field"); +} + +static void print_annotated_data_value(struct type_hist *h, u64 period, int nr_samples) +{ + double percent = h->period ? 
(100.0 * period / h->period) : 0; + const char *color = get_percent_color(percent); + + if (symbol_conf.show_total_period) + color_fprintf(stdout, color, " %11" PRIu64, period); + else if (symbol_conf.show_nr_samples) + color_fprintf(stdout, color, " %7d", nr_samples); + else + color_fprintf(stdout, color, " %7.2f", percent); +} + +static void print_annotated_data_type(struct annotated_data_type *mem_type, + struct annotated_member *member, + struct evsel *evsel, int indent) +{ + struct annotated_member *child; + struct type_hist *h = mem_type->histograms[evsel->core.idx]; + int i, nr_events = 1, samples = 0; + u64 period = 0; + int width = symbol_conf.show_total_period ? 11 : 7; + + for (i = 0; i < member->size; i++) { + samples += h->addr[member->offset + i].nr_samples; + period += h->addr[member->offset + i].period; + } + print_annotated_data_value(h, period, samples); + + if (evsel__is_group_event(evsel)) { + struct evsel *pos; + + for_each_group_member(pos, evsel) { + h = mem_type->histograms[pos->core.idx]; + + samples = 0; + period = 0; + for (i = 0; i < member->size; i++) { + samples += h->addr[member->offset + i].nr_samples; + period += h->addr[member->offset + i].period; + } + print_annotated_data_value(h, period, samples); + } + nr_events = evsel->core.nr_members; + } + + printf(" %10d %10d %*s%s\t%s", + member->offset, member->size, indent, "", member->type_name, + member->var_name ?: ""); + + if (!list_empty(&member->children)) + printf(" {\n"); + + list_for_each_entry(child, &member->children, node) + print_annotated_data_type(mem_type, child, evsel, indent + 4); + + if (!list_empty(&member->children)) + printf("%*s}", (width + 1) * nr_events + 24 + indent, ""); + printf(";\n"); +} + +int hist_entry__annotate_data_tty(struct hist_entry *he, struct evsel *evsel) +{ + print_annotated_data_header(he, evsel); + print_annotated_data_type(he->mem_type, &he->mem_type->self, evsel, 0); + printf("\n"); + + /* move to the next entry */ + return '>'; +} diff 
--git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h index 1b0db8e8c4..0a57d9f5ee 100644 --- a/tools/perf/util/annotate-data.h +++ b/tools/perf/util/annotate-data.h @@ -8,8 +8,12 @@ #include struct annotated_op_loc; +struct debuginfo; struct evsel; +struct hist_browser_timer; +struct hist_entry; struct map_symbol; +struct thread; /** * struct annotated_member - Type of member field @@ -71,6 +75,40 @@ struct annotated_data_type { extern struct annotated_data_type unknown_type; extern struct annotated_data_type stackop_type; +extern struct annotated_data_type canary_type; + +/** + * struct data_loc_info - Data location information + * @arch: CPU architecture info + * @thread: Thread info + * @ms: Map and Symbol info + * @ip: Instruction address + * @var_addr: Data address (for global variables) + * @cpumode: CPU execution mode + * @op: Instruction operand location (regs and offset) + * @di: Debug info + * @fbreg: Frame base register + * @fb_cfa: Whether the frame needs to check CFA + * @type_offset: Final offset in the type + */ +struct data_loc_info { + /* These are input field, should be filled by caller */ + struct arch *arch; + struct thread *thread; + struct map_symbol *ms; + u64 ip; + u64 var_addr; + u8 cpumode; + struct annotated_op_loc *op; + + /* These are used internally */ + struct debuginfo *di; + int fbreg; + bool fb_cfa; + + /* This is for the result */ + int type_offset; +}; /** * struct annotated_data_stat - Debug statistics @@ -100,15 +138,14 @@ struct annotated_data_stat { int no_typeinfo; int invalid_size; int bad_offset; + int insn_track; }; extern struct annotated_data_stat ann_data_stat; #ifdef HAVE_DWARF_SUPPORT /* Returns data type at the location (ip, reg, offset) */ -struct annotated_data_type *find_data_type(struct map_symbol *ms, u64 ip, - struct annotated_op_loc *loc, u64 addr, - const char *var_name); +struct annotated_data_type *find_data_type(struct data_loc_info *dloc); /* Update type access histogram at the given 
offset */ int annotated_data_type__update_samples(struct annotated_data_type *adt, @@ -118,12 +155,15 @@ int annotated_data_type__update_samples(struct annotated_data_type *adt, /* Release all data type information in the tree */ void annotated_data_type__tree_delete(struct rb_root *root); +/* Release all global variable information in the tree */ +void global_var_type__tree_delete(struct rb_root *root); + +int hist_entry__annotate_data_tty(struct hist_entry *he, struct evsel *evsel); + #else /* HAVE_DWARF_SUPPORT */ static inline struct annotated_data_type * -find_data_type(struct map_symbol *ms __maybe_unused, u64 ip __maybe_unused, - struct annotated_op_loc *loc __maybe_unused, - u64 addr __maybe_unused, const char *var_name __maybe_unused) +find_data_type(struct data_loc_info *dloc __maybe_unused) { return NULL; } @@ -142,6 +182,28 @@ static inline void annotated_data_type__tree_delete(struct rb_root *root __maybe { } +static inline void global_var_type__tree_delete(struct rb_root *root __maybe_unused) +{ +} + +static inline int hist_entry__annotate_data_tty(struct hist_entry *he __maybe_unused, + struct evsel *evsel __maybe_unused) +{ + return -1; +} + #endif /* HAVE_DWARF_SUPPORT */ +#ifdef HAVE_SLANG_SUPPORT +int hist_entry__annotate_data_tui(struct hist_entry *he, struct evsel *evsel, + struct hist_browser_timer *hbt); +#else +static inline int hist_entry__annotate_data_tui(struct hist_entry *he __maybe_unused, + struct evsel *evsel __maybe_unused, + struct hist_browser_timer *hbt __maybe_unused) +{ + return -1; +} +#endif /* HAVE_SLANG_SUPPORT */ + #endif /* _PERF_ANNOTATE_DATA_H */ diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 79d082155c..1451caf25e 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -16,6 +16,7 @@ #include "build-id.h" #include "color.h" #include "config.h" +#include "disasm.h" #include "dso.h" #include "env.h" #include "map.h" @@ -64,47 +65,6 @@ /* global annotation options */ struct 
annotation_options annotate_opts; -static regex_t file_lineno; - -static struct ins_ops *ins__find(struct arch *arch, const char *name); -static void ins__sort(struct arch *arch); -static int disasm_line__parse(char *line, const char **namep, char **rawp); -static int call__scnprintf(struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name); -static int jump__scnprintf(struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name); - -struct arch { - const char *name; - struct ins *instructions; - size_t nr_instructions; - size_t nr_instructions_allocated; - struct ins_ops *(*associate_instruction_ops)(struct arch *arch, const char *name); - bool sorted_instructions; - bool initialized; - const char *insn_suffix; - void *priv; - unsigned int model; - unsigned int family; - int (*init)(struct arch *arch, char *cpuid); - bool (*ins_is_fused)(struct arch *arch, const char *ins1, - const char *ins2); - struct { - char comment_char; - char skip_functions_char; - char register_char; - char memory_ref_char; - } objdump; -}; - -static struct ins_ops call_ops; -static struct ins_ops dec_ops; -static struct ins_ops jump_ops; -static struct ins_ops mov_ops; -static struct ins_ops nop_ops; -static struct ins_ops lock_ops; -static struct ins_ops ret_ops; - /* Data type collection debug statistics */ struct annotated_data_stat ann_data_stat; LIST_HEAD(ann_insn_stat); @@ -117,753 +77,13 @@ struct annotated_data_type stackop_type = { }, }; -static int arch__grow_instructions(struct arch *arch) -{ - struct ins *new_instructions; - size_t new_nr_allocated; - - if (arch->nr_instructions_allocated == 0 && arch->instructions) - goto grow_from_non_allocated_table; - - new_nr_allocated = arch->nr_instructions_allocated + 128; - new_instructions = realloc(arch->instructions, new_nr_allocated * sizeof(struct ins)); - if (new_instructions == NULL) - return -1; - -out_update_instructions: - arch->instructions = new_instructions; - 
arch->nr_instructions_allocated = new_nr_allocated; - return 0; - -grow_from_non_allocated_table: - new_nr_allocated = arch->nr_instructions + 128; - new_instructions = calloc(new_nr_allocated, sizeof(struct ins)); - if (new_instructions == NULL) - return -1; - - memcpy(new_instructions, arch->instructions, arch->nr_instructions); - goto out_update_instructions; -} - -static int arch__associate_ins_ops(struct arch* arch, const char *name, struct ins_ops *ops) -{ - struct ins *ins; - - if (arch->nr_instructions == arch->nr_instructions_allocated && - arch__grow_instructions(arch)) - return -1; - - ins = &arch->instructions[arch->nr_instructions]; - ins->name = strdup(name); - if (!ins->name) - return -1; - - ins->ops = ops; - arch->nr_instructions++; - - ins__sort(arch); - return 0; -} - -#include "arch/arc/annotate/instructions.c" -#include "arch/arm/annotate/instructions.c" -#include "arch/arm64/annotate/instructions.c" -#include "arch/csky/annotate/instructions.c" -#include "arch/loongarch/annotate/instructions.c" -#include "arch/mips/annotate/instructions.c" -#include "arch/x86/annotate/instructions.c" -#include "arch/powerpc/annotate/instructions.c" -#include "arch/riscv64/annotate/instructions.c" -#include "arch/s390/annotate/instructions.c" -#include "arch/sparc/annotate/instructions.c" - -static struct arch architectures[] = { - { - .name = "arc", - .init = arc__annotate_init, - }, - { - .name = "arm", - .init = arm__annotate_init, - }, - { - .name = "arm64", - .init = arm64__annotate_init, - }, - { - .name = "csky", - .init = csky__annotate_init, - }, - { - .name = "mips", - .init = mips__annotate_init, - .objdump = { - .comment_char = '#', - }, - }, - { - .name = "x86", - .init = x86__annotate_init, - .instructions = x86__instructions, - .nr_instructions = ARRAY_SIZE(x86__instructions), - .insn_suffix = "bwlq", - .objdump = { - .comment_char = '#', - .register_char = '%', - .memory_ref_char = '(', - }, - }, - { - .name = "powerpc", - .init = 
powerpc__annotate_init, - }, - { - .name = "riscv64", - .init = riscv64__annotate_init, - }, - { - .name = "s390", - .init = s390__annotate_init, - .objdump = { - .comment_char = '#', - }, - }, - { - .name = "sparc", - .init = sparc__annotate_init, - .objdump = { - .comment_char = '#', - }, - }, - { - .name = "loongarch", - .init = loongarch__annotate_init, - .objdump = { - .comment_char = '#', - }, +struct annotated_data_type canary_type = { + .self = { + .type_name = (char *)"(stack canary)", + .children = LIST_HEAD_INIT(canary_type.self.children), }, }; -static void ins__delete(struct ins_operands *ops) -{ - if (ops == NULL) - return; - zfree(&ops->source.raw); - zfree(&ops->source.name); - zfree(&ops->target.raw); - zfree(&ops->target.name); -} - -static int ins__raw_scnprintf(struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name) -{ - return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, ops->raw); -} - -int ins__scnprintf(struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name) -{ - if (ins->ops->scnprintf) - return ins->ops->scnprintf(ins, bf, size, ops, max_ins_name); - - return ins__raw_scnprintf(ins, bf, size, ops, max_ins_name); -} - -bool ins__is_fused(struct arch *arch, const char *ins1, const char *ins2) -{ - if (!arch || !arch->ins_is_fused) - return false; - - return arch->ins_is_fused(arch, ins1, ins2); -} - -static int call__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms) -{ - char *endptr, *tok, *name; - struct map *map = ms->map; - struct addr_map_symbol target = { - .ms = { .map = map, }, - }; - - ops->target.addr = strtoull(ops->raw, &endptr, 16); - - name = strchr(endptr, '<'); - if (name == NULL) - goto indirect_call; - - name++; - - if (arch->objdump.skip_functions_char && - strchr(name, arch->objdump.skip_functions_char)) - return -1; - - tok = strchr(name, '>'); - if (tok == NULL) - return -1; - - *tok = '\0'; - ops->target.name = strdup(name); 
- *tok = '>'; - - if (ops->target.name == NULL) - return -1; -find_target: - target.addr = map__objdump_2mem(map, ops->target.addr); - - if (maps__find_ams(ms->maps, &target) == 0 && - map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr) - ops->target.sym = target.ms.sym; - - return 0; - -indirect_call: - tok = strchr(endptr, '*'); - if (tok != NULL) { - endptr++; - - /* Indirect call can use a non-rip register and offset: callq *0x8(%rbx). - * Do not parse such instruction. */ - if (strstr(endptr, "(%r") == NULL) - ops->target.addr = strtoull(endptr, NULL, 16); - } - goto find_target; -} - -static int call__scnprintf(struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name) -{ - if (ops->target.sym) - return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, ops->target.sym->name); - - if (ops->target.addr == 0) - return ins__raw_scnprintf(ins, bf, size, ops, max_ins_name); - - if (ops->target.name) - return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, ops->target.name); - - return scnprintf(bf, size, "%-*s *%" PRIx64, max_ins_name, ins->name, ops->target.addr); -} - -static struct ins_ops call_ops = { - .parse = call__parse, - .scnprintf = call__scnprintf, -}; - -bool ins__is_call(const struct ins *ins) -{ - return ins->ops == &call_ops || ins->ops == &s390_call_ops || ins->ops == &loongarch_call_ops; -} - -/* - * Prevents from matching commas in the comment section, e.g.: - * ffff200008446e70: b.cs ffff2000084470f4 // b.hs, b.nlast - * - * and skip comma as part of function arguments, e.g.: - * 1d8b4ac - */ -static inline const char *validate_comma(const char *c, struct ins_operands *ops) -{ - if (ops->jump.raw_comment && c > ops->jump.raw_comment) - return NULL; - - if (ops->jump.raw_func_start && c > ops->jump.raw_func_start) - return NULL; - - return c; -} - -static int jump__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms) -{ - struct map *map = 
ms->map; - struct symbol *sym = ms->sym; - struct addr_map_symbol target = { - .ms = { .map = map, }, - }; - const char *c = strchr(ops->raw, ','); - u64 start, end; - - ops->jump.raw_comment = strchr(ops->raw, arch->objdump.comment_char); - ops->jump.raw_func_start = strchr(ops->raw, '<'); - - c = validate_comma(c, ops); - - /* - * Examples of lines to parse for the _cpp_lex_token@@Base - * function: - * - * 1159e6c: jne 115aa32 <_cpp_lex_token@@Base+0xf92> - * 1159e8b: jne c469be - * - * The first is a jump to an offset inside the same function, - * the second is to another function, i.e. that 0xa72 is an - * offset in the cpp_named_operator2name@@base function. - */ - /* - * skip over possible up to 2 operands to get to address, e.g.: - * tbnz w0, #26, ffff0000083cd190 - */ - if (c++ != NULL) { - ops->target.addr = strtoull(c, NULL, 16); - if (!ops->target.addr) { - c = strchr(c, ','); - c = validate_comma(c, ops); - if (c++ != NULL) - ops->target.addr = strtoull(c, NULL, 16); - } - } else { - ops->target.addr = strtoull(ops->raw, NULL, 16); - } - - target.addr = map__objdump_2mem(map, ops->target.addr); - start = map__unmap_ip(map, sym->start); - end = map__unmap_ip(map, sym->end); - - ops->target.outside = target.addr < start || target.addr > end; - - /* - * FIXME: things like this in _cpp_lex_token (gcc's cc1 program): - - cpp_named_operator2name@@Base+0xa72 - - * Point to a place that is after the cpp_named_operator2name - * boundaries, i.e. in the ELF symbol table for cc1 - * cpp_named_operator2name is marked as being 32-bytes long, but it in - * fact is much larger than that, so we seem to need a symbols__find() - * routine that looks for >= current->start and < next_symbol->start, - * possibly just for C++ objects? - * - * For now lets just make some progress by marking jumps to outside the - * current function as call like. - * - * Actual navigation will come next, with further understanding of how - * the symbol searching and disassembly should be done. 
- */ - if (maps__find_ams(ms->maps, &target) == 0 && - map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr) - ops->target.sym = target.ms.sym; - - if (!ops->target.outside) { - ops->target.offset = target.addr - start; - ops->target.offset_avail = true; - } else { - ops->target.offset_avail = false; - } - - return 0; -} - -static int jump__scnprintf(struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name) -{ - const char *c; - - if (!ops->target.addr || ops->target.offset < 0) - return ins__raw_scnprintf(ins, bf, size, ops, max_ins_name); - - if (ops->target.outside && ops->target.sym != NULL) - return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, ops->target.sym->name); - - c = strchr(ops->raw, ','); - c = validate_comma(c, ops); - - if (c != NULL) { - const char *c2 = strchr(c + 1, ','); - - c2 = validate_comma(c2, ops); - /* check for 3-op insn */ - if (c2 != NULL) - c = c2; - c++; - - /* mirror arch objdump's space-after-comma style */ - if (*c == ' ') - c++; - } - - return scnprintf(bf, size, "%-*s %.*s%" PRIx64, max_ins_name, - ins->name, c ? c - ops->raw : 0, ops->raw, - ops->target.offset); -} - -static void jump__delete(struct ins_operands *ops __maybe_unused) -{ - /* - * The ops->jump.raw_comment and ops->jump.raw_func_start belong to the - * raw string, don't free them. 
- */ -} - -static struct ins_ops jump_ops = { - .free = jump__delete, - .parse = jump__parse, - .scnprintf = jump__scnprintf, -}; - -bool ins__is_jump(const struct ins *ins) -{ - return ins->ops == &jump_ops || ins->ops == &loongarch_jump_ops; -} - -static int comment__symbol(char *raw, char *comment, u64 *addrp, char **namep) -{ - char *endptr, *name, *t; - - if (strstr(raw, "(%rip)") == NULL) - return 0; - - *addrp = strtoull(comment, &endptr, 16); - if (endptr == comment) - return 0; - name = strchr(endptr, '<'); - if (name == NULL) - return -1; - - name++; - - t = strchr(name, '>'); - if (t == NULL) - return 0; - - *t = '\0'; - *namep = strdup(name); - *t = '>'; - - return 0; -} - -static int lock__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms) -{ - ops->locked.ops = zalloc(sizeof(*ops->locked.ops)); - if (ops->locked.ops == NULL) - return 0; - - if (disasm_line__parse(ops->raw, &ops->locked.ins.name, &ops->locked.ops->raw) < 0) - goto out_free_ops; - - ops->locked.ins.ops = ins__find(arch, ops->locked.ins.name); - - if (ops->locked.ins.ops == NULL) - goto out_free_ops; - - if (ops->locked.ins.ops->parse && - ops->locked.ins.ops->parse(arch, ops->locked.ops, ms) < 0) - goto out_free_ops; - - return 0; - -out_free_ops: - zfree(&ops->locked.ops); - return 0; -} - -static int lock__scnprintf(struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name) -{ - int printed; - - if (ops->locked.ins.ops == NULL) - return ins__raw_scnprintf(ins, bf, size, ops, max_ins_name); - - printed = scnprintf(bf, size, "%-*s ", max_ins_name, ins->name); - return printed + ins__scnprintf(&ops->locked.ins, bf + printed, - size - printed, ops->locked.ops, max_ins_name); -} - -static void lock__delete(struct ins_operands *ops) -{ - struct ins *ins = &ops->locked.ins; - - if (ins->ops && ins->ops->free) - ins->ops->free(ops->locked.ops); - else - ins__delete(ops->locked.ops); - - zfree(&ops->locked.ops); - zfree(&ops->target.raw); - 
zfree(&ops->target.name); -} - -static struct ins_ops lock_ops = { - .free = lock__delete, - .parse = lock__parse, - .scnprintf = lock__scnprintf, -}; - -/* - * Check if the operand has more than one registers like x86 SIB addressing: - * 0x1234(%rax, %rbx, 8) - * - * But it doesn't care segment selectors like %gs:0x5678(%rcx), so just check - * the input string after 'memory_ref_char' if exists. - */ -static bool check_multi_regs(struct arch *arch, const char *op) -{ - int count = 0; - - if (arch->objdump.register_char == 0) - return false; - - if (arch->objdump.memory_ref_char) { - op = strchr(op, arch->objdump.memory_ref_char); - if (op == NULL) - return false; - } - - while ((op = strchr(op, arch->objdump.register_char)) != NULL) { - count++; - op++; - } - - return count > 1; -} - -static int mov__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms __maybe_unused) -{ - char *s = strchr(ops->raw, ','), *target, *comment, prev; - - if (s == NULL) - return -1; - - *s = '\0'; - - /* - * x86 SIB addressing has something like 0x8(%rax, %rcx, 1) - * then it needs to have the closing parenthesis. 
- */ - if (strchr(ops->raw, '(')) { - *s = ','; - s = strchr(ops->raw, ')'); - if (s == NULL || s[1] != ',') - return -1; - *++s = '\0'; - } - - ops->source.raw = strdup(ops->raw); - *s = ','; - - if (ops->source.raw == NULL) - return -1; - - ops->source.multi_regs = check_multi_regs(arch, ops->source.raw); - - target = skip_spaces(++s); - comment = strchr(s, arch->objdump.comment_char); - - if (comment != NULL) - s = comment - 1; - else - s = strchr(s, '\0') - 1; - - while (s > target && isspace(s[0])) - --s; - s++; - prev = *s; - *s = '\0'; - - ops->target.raw = strdup(target); - *s = prev; - - if (ops->target.raw == NULL) - goto out_free_source; - - ops->target.multi_regs = check_multi_regs(arch, ops->target.raw); - - if (comment == NULL) - return 0; - - comment = skip_spaces(comment); - comment__symbol(ops->source.raw, comment + 1, &ops->source.addr, &ops->source.name); - comment__symbol(ops->target.raw, comment + 1, &ops->target.addr, &ops->target.name); - - return 0; - -out_free_source: - zfree(&ops->source.raw); - return -1; -} - -static int mov__scnprintf(struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name) -{ - return scnprintf(bf, size, "%-*s %s,%s", max_ins_name, ins->name, - ops->source.name ?: ops->source.raw, - ops->target.name ?: ops->target.raw); -} - -static struct ins_ops mov_ops = { - .parse = mov__parse, - .scnprintf = mov__scnprintf, -}; - -static int dec__parse(struct arch *arch __maybe_unused, struct ins_operands *ops, struct map_symbol *ms __maybe_unused) -{ - char *target, *comment, *s, prev; - - target = s = ops->raw; - - while (s[0] != '\0' && !isspace(s[0])) - ++s; - prev = *s; - *s = '\0'; - - ops->target.raw = strdup(target); - *s = prev; - - if (ops->target.raw == NULL) - return -1; - - comment = strchr(s, arch->objdump.comment_char); - if (comment == NULL) - return 0; - - comment = skip_spaces(comment); - comment__symbol(ops->target.raw, comment + 1, &ops->target.addr, &ops->target.name); - - return 
0; -} - -static int dec__scnprintf(struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name) -{ - return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, - ops->target.name ?: ops->target.raw); -} - -static struct ins_ops dec_ops = { - .parse = dec__parse, - .scnprintf = dec__scnprintf, -}; - -static int nop__scnprintf(struct ins *ins __maybe_unused, char *bf, size_t size, - struct ins_operands *ops __maybe_unused, int max_ins_name) -{ - return scnprintf(bf, size, "%-*s", max_ins_name, "nop"); -} - -static struct ins_ops nop_ops = { - .scnprintf = nop__scnprintf, -}; - -static struct ins_ops ret_ops = { - .scnprintf = ins__raw_scnprintf, -}; - -bool ins__is_ret(const struct ins *ins) -{ - return ins->ops == &ret_ops; -} - -bool ins__is_lock(const struct ins *ins) -{ - return ins->ops == &lock_ops; -} - -static int ins__key_cmp(const void *name, const void *insp) -{ - const struct ins *ins = insp; - - return strcmp(name, ins->name); -} - -static int ins__cmp(const void *a, const void *b) -{ - const struct ins *ia = a; - const struct ins *ib = b; - - return strcmp(ia->name, ib->name); -} - -static void ins__sort(struct arch *arch) -{ - const int nmemb = arch->nr_instructions; - - qsort(arch->instructions, nmemb, sizeof(struct ins), ins__cmp); -} - -static struct ins_ops *__ins__find(struct arch *arch, const char *name) -{ - struct ins *ins; - const int nmemb = arch->nr_instructions; - - if (!arch->sorted_instructions) { - ins__sort(arch); - arch->sorted_instructions = true; - } - - ins = bsearch(name, arch->instructions, nmemb, sizeof(struct ins), ins__key_cmp); - if (ins) - return ins->ops; - - if (arch->insn_suffix) { - char tmp[32]; - char suffix; - size_t len = strlen(name); - - if (len == 0 || len >= sizeof(tmp)) - return NULL; - - suffix = name[len - 1]; - if (strchr(arch->insn_suffix, suffix) == NULL) - return NULL; - - strcpy(tmp, name); - tmp[len - 1] = '\0'; /* remove the suffix and check again */ - - ins = bsearch(tmp, 
arch->instructions, nmemb, sizeof(struct ins), ins__key_cmp); - } - return ins ? ins->ops : NULL; -} - -static struct ins_ops *ins__find(struct arch *arch, const char *name) -{ - struct ins_ops *ops = __ins__find(arch, name); - - if (!ops && arch->associate_instruction_ops) - ops = arch->associate_instruction_ops(arch, name); - - return ops; -} - -static int arch__key_cmp(const void *name, const void *archp) -{ - const struct arch *arch = archp; - - return strcmp(name, arch->name); -} - -static int arch__cmp(const void *a, const void *b) -{ - const struct arch *aa = a; - const struct arch *ab = b; - - return strcmp(aa->name, ab->name); -} - -static void arch__sort(void) -{ - const int nmemb = ARRAY_SIZE(architectures); - - qsort(architectures, nmemb, sizeof(struct arch), arch__cmp); -} - -static struct arch *arch__find(const char *name) -{ - const int nmemb = ARRAY_SIZE(architectures); - static bool sorted; - - if (!sorted) { - arch__sort(); - sorted = true; - } - - return bsearch(name, architectures, nmemb, sizeof(struct arch), arch__key_cmp); -} - -bool arch__is(struct arch *arch, const char *name) -{ - return !strcmp(arch->name, name); -} - /* symbol histogram: key = offset << 16 | evsel->core.idx */ static size_t sym_hist_hash(long key, void *ctx __maybe_unused) { @@ -1156,14 +376,33 @@ int addr_map_symbol__account_cycles(struct addr_map_symbol *ams, return err; } +struct annotation_line *annotated_source__get_line(struct annotated_source *src, + s64 offset) +{ + struct annotation_line *al; + + list_for_each_entry(al, &src->source, node) { + if (al->offset == offset) + return al; + } + return NULL; +} + static unsigned annotation__count_insn(struct annotation *notes, u64 start, u64 end) { + struct annotation_line *al; unsigned n_insn = 0; - u64 offset; - for (offset = start; offset <= end; offset++) { - if (notes->src->offsets[offset]) - n_insn++; + al = annotated_source__get_line(notes->src, start); + if (al == NULL) + return 0; + + 
list_for_each_entry_from(al, &notes->src->source, node) { + if (al->offset == -1) + continue; + if ((u64)al->offset > end) + break; + n_insn++; } return n_insn; } @@ -1180,10 +419,10 @@ static void annotation__count_and_fill(struct annotation *notes, u64 start, u64 { unsigned n_insn; unsigned int cover_insn = 0; - u64 offset; n_insn = annotation__count_insn(notes, start, end); if (n_insn && ch->num && ch->cycles) { + struct annotation_line *al; struct annotated_branch *branch; float ipc = n_insn / ((double)ch->cycles / (double)ch->num); @@ -1191,10 +430,16 @@ static void annotation__count_and_fill(struct annotation *notes, u64 start, u64 if (ch->reset >= 0x7fff) return; - for (offset = start; offset <= end; offset++) { - struct annotation_line *al = notes->src->offsets[offset]; + al = annotated_source__get_line(notes->src, start); + if (al == NULL) + return; - if (al && al->cycles && al->cycles->ipc == 0.0) { + list_for_each_entry_from(al, &notes->src->source, node) { + if (al->offset == -1) + continue; + if ((u64)al->offset > end) + break; + if (al->cycles && al->cycles->ipc == 0.0) { al->cycles->ipc = ipc; cover_insn++; } @@ -1230,7 +475,7 @@ static int annotation__compute_ipc(struct annotation *notes, size_t size) if (ch && ch->cycles) { struct annotation_line *al; - al = notes->src->offsets[offset]; + al = annotated_source__get_line(notes->src, offset); if (al && al->cycles == NULL) { al->cycles = zalloc(sizeof(*al->cycles)); if (al->cycles == NULL) { @@ -1241,178 +486,44 @@ static int annotation__compute_ipc(struct annotation *notes, size_t size) if (ch->have_start) annotation__count_and_fill(notes, ch->start, offset, ch); if (al && ch->num_aggr) { - al->cycles->avg = ch->cycles_aggr / ch->num_aggr; - al->cycles->max = ch->cycles_max; - al->cycles->min = ch->cycles_min; - } - } - } - - if (err) { - while (++offset < (s64)size) { - struct cyc_hist *ch = &notes->branch->cycles_hist[offset]; - - if (ch && ch->cycles) { - struct annotation_line *al =
notes->src->offsets[offset]; - if (al) - zfree(&al->cycles); - } - } - } - - annotation__unlock(notes); - return 0; -} - -int addr_map_symbol__inc_samples(struct addr_map_symbol *ams, struct perf_sample *sample, - struct evsel *evsel) -{ - return symbol__inc_addr_samples(&ams->ms, evsel, ams->al_addr, sample); -} - -int hist_entry__inc_addr_samples(struct hist_entry *he, struct perf_sample *sample, - struct evsel *evsel, u64 ip) -{ - return symbol__inc_addr_samples(&he->ms, evsel, ip, sample); -} - -static void disasm_line__init_ins(struct disasm_line *dl, struct arch *arch, struct map_symbol *ms) -{ - dl->ins.ops = ins__find(arch, dl->ins.name); - - if (!dl->ins.ops) - return; - - if (dl->ins.ops->parse && dl->ins.ops->parse(arch, &dl->ops, ms) < 0) - dl->ins.ops = NULL; -} - -static int disasm_line__parse(char *line, const char **namep, char **rawp) -{ - char tmp, *name = skip_spaces(line); - - if (name[0] == '\0') - return -1; - - *rawp = name + 1; - - while ((*rawp)[0] != '\0' && !isspace((*rawp)[0])) - ++*rawp; - - tmp = (*rawp)[0]; - (*rawp)[0] = '\0'; - *namep = strdup(name); - - if (*namep == NULL) - goto out; - - (*rawp)[0] = tmp; - *rawp = strim(*rawp); - - return 0; - -out: - return -1; -} - -struct annotate_args { - struct arch *arch; - struct map_symbol ms; - struct evsel *evsel; - struct annotation_options *options; - s64 offset; - char *line; - int line_nr; - char *fileloc; -}; - -static void annotation_line__init(struct annotation_line *al, - struct annotate_args *args, - int nr) -{ - al->offset = args->offset; - al->line = strdup(args->line); - al->line_nr = args->line_nr; - al->fileloc = args->fileloc; - al->data_nr = nr; -} - -static void annotation_line__exit(struct annotation_line *al) -{ - zfree_srcline(&al->path); - zfree(&al->line); - zfree(&al->cycles); -} - -static size_t disasm_line_size(int nr) -{ - struct annotation_line *al; - - return (sizeof(struct disasm_line) + (sizeof(al->data[0]) * nr)); -} - -/* - * Allocating the disasm 
annotation line data with - * following structure: - * - * ------------------------------------------- - * struct disasm_line | struct annotation_line - * ------------------------------------------- - * - * We have 'struct annotation_line' member as last member - * of 'struct disasm_line' to have an easy access. - */ -static struct disasm_line *disasm_line__new(struct annotate_args *args) -{ - struct disasm_line *dl = NULL; - int nr = 1; - - if (evsel__is_group_event(args->evsel)) - nr = args->evsel->core.nr_members; - - dl = zalloc(disasm_line_size(nr)); - if (!dl) - return NULL; + al->cycles->avg = ch->cycles_aggr / ch->num_aggr; + al->cycles->max = ch->cycles_max; + al->cycles->min = ch->cycles_min; + } + } + } - annotation_line__init(&dl->al, args, nr); - if (dl->al.line == NULL) - goto out_delete; + if (err) { + while (++offset < (s64)size) { + struct cyc_hist *ch = &notes->branch->cycles_hist[offset]; - if (args->offset != -1) { - if (disasm_line__parse(dl->al.line, &dl->ins.name, &dl->ops.raw) < 0) - goto out_free_line; + if (ch && ch->cycles) { + struct annotation_line *al; - disasm_line__init_ins(dl, args->arch, &args->ms); + al = annotated_source__get_line(notes->src, offset); + if (al) + zfree(&al->cycles); + } + } } - return dl; - -out_free_line: - zfree(&dl->al.line); -out_delete: - free(dl); - return NULL; + annotation__unlock(notes); + return 0; } -void disasm_line__free(struct disasm_line *dl) +int addr_map_symbol__inc_samples(struct addr_map_symbol *ams, struct perf_sample *sample, + struct evsel *evsel) { - if (dl->ins.ops && dl->ins.ops->free) - dl->ins.ops->free(&dl->ops); - else - ins__delete(&dl->ops); - zfree(&dl->ins.name); - annotation_line__exit(&dl->al); - free(dl); + return symbol__inc_addr_samples(&ams->ms, evsel, ams->al_addr, sample); } -int disasm_line__scnprintf(struct disasm_line *dl, char *bf, size_t size, bool raw, int max_ins_name) +int hist_entry__inc_addr_samples(struct hist_entry *he, struct perf_sample *sample, + struct evsel
*evsel, u64 ip) { - if (raw || !dl->ins.ops) - return scnprintf(bf, size, "%-*s %s", max_ins_name, dl->ins.name, dl->ops.raw); - - return ins__scnprintf(&dl->ins, bf, size, &dl->ops, max_ins_name); + return symbol__inc_addr_samples(&he->ms, evsel, ip, sample); } + void annotation__exit(struct annotation *notes) { annotated_source__delete(notes->src); @@ -1471,8 +582,7 @@ bool annotation__trylock(struct annotation *notes) return mutex_trylock(mutex); } - -static void annotation_line__add(struct annotation_line *al, struct list_head *head) +void annotation_line__add(struct annotation_line *al, struct list_head *head) { list_add_tail(&al->node, head); } @@ -1613,740 +723,73 @@ annotation_line__print(struct annotation_line *al, struct symbol *sym, u64 start if (max_percent < min_pcnt) return -1; - if (max_lines && printed >= max_lines) - return 1; - - if (queue != NULL) { - list_for_each_entry_from(queue, &notes->src->source, node) { - if (queue == al) - break; - annotation_line__print(queue, sym, start, evsel, len, - 0, 0, 1, NULL, addr_fmt_width, - percent_type); - } - } - - color = get_percent_color(max_percent); - - for (i = 0; i < nr_percent; i++) { - struct annotation_data *data = &al->data[i]; - double percent; - - percent = annotation_data__percent(data, percent_type); - color = get_percent_color(percent); - - if (symbol_conf.show_total_period) - color_fprintf(stdout, color, " %11" PRIu64, - data->he.period); - else if (symbol_conf.show_nr_samples) - color_fprintf(stdout, color, " %7" PRIu64, - data->he.nr_samples); - else - color_fprintf(stdout, color, " %7.2f", percent); - } - - printf(" : "); - - disasm_line__print(dl, start, addr_fmt_width); - - /* - * Also color the filename and line if needed, with - * the same color than the percentage.
Don't print it - * twice for close colored addr with the same filename:line - */ - if (al->path) { - if (!prev_line || strcmp(prev_line, al->path)) { - color_fprintf(stdout, color, " // %s", al->path); - prev_line = al->path; - } - } - - printf("\n"); - } else if (max_lines && printed >= max_lines) - return 1; - else { - int width = symbol_conf.show_total_period ? 12 : 8; - - if (queue) - return -1; - - if (evsel__is_group_event(evsel)) - width *= evsel->core.nr_members; - - if (!*al->line) - printf(" %*s:\n", width, " "); - else - printf(" %*s: %-*d %s\n", width, " ", addr_fmt_width, al->line_nr, al->line); - } - - return 0; -} - -/* - * symbol__parse_objdump_line() parses objdump output (with -d --no-show-raw) - * which looks like following - * - * 0000000000415500 <_init>: - * 415500: sub $0x8,%rsp - * 415504: mov 0x2f5ad5(%rip),%rax # 70afe0 <_DYNAMIC+0x2f8> - * 41550b: test %rax,%rax - * 41550e: je 415515 <_init+0x15> - * 415510: callq 416e70 <__gmon_start__@plt> - * 415515: add $0x8,%rsp - * 415519: retq - * - * it will be parsed and saved into struct disasm_line as - * - * - * The offset will be a relative offset from the start of the symbol and -1 - * means that it's not a disassembly line so should be treated differently. - * The ops.raw part will be parsed further according to type of the instruction. - */ -static int symbol__parse_objdump_line(struct symbol *sym, - struct annotate_args *args, - char *parsed_line, int *line_nr, char **fileloc) -{ - struct map *map = args->ms.map; - struct annotation *notes = symbol__annotation(sym); - struct disasm_line *dl; - char *tmp; - s64 line_ip, offset = -1; - regmatch_t match[2]; - - /* /filename:linenr ? Save line number and ignore. */ - if (regexec(&file_lineno, parsed_line, 2, match, 0) == 0) { - *line_nr = atoi(parsed_line + match[1].rm_so); - free(*fileloc); - *fileloc = strdup(parsed_line); - return 0; - } - - /* Process hex address followed by ':'. 
*/ - line_ip = strtoull(parsed_line, &tmp, 16); - if (parsed_line != tmp && tmp[0] == ':' && tmp[1] != '\0') { - u64 start = map__rip_2objdump(map, sym->start), - end = map__rip_2objdump(map, sym->end); - - offset = line_ip - start; - if ((u64)line_ip < start || (u64)line_ip >= end) - offset = -1; - else - parsed_line = tmp + 1; - } - - args->offset = offset; - args->line = parsed_line; - args->line_nr = *line_nr; - args->fileloc = *fileloc; - args->ms.sym = sym; - - dl = disasm_line__new(args); - (*line_nr)++; - - if (dl == NULL) - return -1; - - if (!disasm_line__has_local_offset(dl)) { - dl->ops.target.offset = dl->ops.target.addr - - map__rip_2objdump(map, sym->start); - dl->ops.target.offset_avail = true; - } - - /* kcore has no symbols, so add the call target symbol */ - if (dl->ins.ops && ins__is_call(&dl->ins) && !dl->ops.target.sym) { - struct addr_map_symbol target = { - .addr = dl->ops.target.addr, - .ms = { .map = map, }, - }; - - if (!maps__find_ams(args->ms.maps, &target) && - target.ms.sym->start == target.al_addr) - dl->ops.target.sym = target.ms.sym; - } - - annotation_line__add(&dl->al, ¬es->src->source); - return 0; -} - -static __attribute__((constructor)) void symbol__init_regexpr(void) -{ - regcomp(&file_lineno, "^/[^:]+:([0-9]+)", REG_EXTENDED); -} - -static void delete_last_nop(struct symbol *sym) -{ - struct annotation *notes = symbol__annotation(sym); - struct list_head *list = ¬es->src->source; - struct disasm_line *dl; - - while (!list_empty(list)) { - dl = list_entry(list->prev, struct disasm_line, al.node); - - if (dl->ins.ops) { - if (dl->ins.ops != &nop_ops) - return; - } else { - if (!strstr(dl->al.line, " nop ") && - !strstr(dl->al.line, " nopl ") && - !strstr(dl->al.line, " nopw ")) - return; - } - - list_del_init(&dl->al.node); - disasm_line__free(dl); - } -} - -int symbol__strerror_disassemble(struct map_symbol *ms, int errnum, char *buf, size_t buflen) -{ - struct dso *dso = map__dso(ms->map); - - BUG_ON(buflen == 0); - - if 
(errnum >= 0) { - str_error_r(errnum, buf, buflen); - return 0; - } - - switch (errnum) { - case SYMBOL_ANNOTATE_ERRNO__NO_VMLINUX: { - char bf[SBUILD_ID_SIZE + 15] = " with build id "; - char *build_id_msg = NULL; - - if (dso->has_build_id) { - build_id__sprintf(&dso->bid, bf + 15); - build_id_msg = bf; - } - scnprintf(buf, buflen, - "No vmlinux file%s\nwas found in the path.\n\n" - "Note that annotation using /proc/kcore requires CAP_SYS_RAWIO capability.\n\n" - "Please use:\n\n" - " perf buildid-cache -vu vmlinux\n\n" - "or:\n\n" - " --vmlinux vmlinux\n", build_id_msg ?: ""); - } - break; - case SYMBOL_ANNOTATE_ERRNO__NO_LIBOPCODES_FOR_BPF: - scnprintf(buf, buflen, "Please link with binutils's libopcode to enable BPF annotation"); - break; - case SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP: - scnprintf(buf, buflen, "Problems with arch specific instruction name regular expressions."); - break; - case SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING: - scnprintf(buf, buflen, "Problems while parsing the CPUID in the arch specific initialization."); - break; - case SYMBOL_ANNOTATE_ERRNO__BPF_INVALID_FILE: - scnprintf(buf, buflen, "Invalid BPF file: %s.", dso->long_name); - break; - case SYMBOL_ANNOTATE_ERRNO__BPF_MISSING_BTF: - scnprintf(buf, buflen, "The %s BPF file has no BTF section, compile with -g or use pahole -J.", - dso->long_name); - break; - default: - scnprintf(buf, buflen, "Internal error: Invalid %d error code\n", errnum); - break; - } - - return 0; -} - -static int dso__disassemble_filename(struct dso *dso, char *filename, size_t filename_size) -{ - char linkname[PATH_MAX]; - char *build_id_filename; - char *build_id_path = NULL; - char *pos; - int len; - - if (dso->symtab_type == DSO_BINARY_TYPE__KALLSYMS && - !dso__is_kcore(dso)) - return SYMBOL_ANNOTATE_ERRNO__NO_VMLINUX; - - build_id_filename = dso__build_id_filename(dso, NULL, 0, false); - if (build_id_filename) { - __symbol__join_symfs(filename, filename_size, build_id_filename); - 
free(build_id_filename); - } else { - if (dso->has_build_id) - return ENOMEM; - goto fallback; - } - - build_id_path = strdup(filename); - if (!build_id_path) - return ENOMEM; - - /* - * old style build-id cache has name of XX/XXXXXXX.. while - * new style has XX/XXXXXXX../{elf,kallsyms,vdso}. - * extract the build-id part of dirname in the new style only. - */ - pos = strrchr(build_id_path, '/'); - if (pos && strlen(pos) < SBUILD_ID_SIZE - 2) - dirname(build_id_path); - - if (dso__is_kcore(dso)) - goto fallback; - - len = readlink(build_id_path, linkname, sizeof(linkname) - 1); - if (len < 0) - goto fallback; - - linkname[len] = '\0'; - if (strstr(linkname, DSO__NAME_KALLSYMS) || - access(filename, R_OK)) { -fallback: - /* - * If we don't have build-ids or the build-id file isn't in the - * cache, or is just a kallsyms file, well, lets hope that this - * DSO is the same as when 'perf record' ran. - */ - if (dso->kernel && dso->long_name[0] == '/') - snprintf(filename, filename_size, "%s", dso->long_name); - else - __symbol__join_symfs(filename, filename_size, dso->long_name); - - mutex_lock(&dso->lock); - if (access(filename, R_OK) && errno == ENOENT && dso->nsinfo) { - char *new_name = dso__filename_with_chroot(dso, filename); - if (new_name) { - strlcpy(filename, new_name, filename_size); - free(new_name); - } - } - mutex_unlock(&dso->lock); - } - - free(build_id_path); - return 0; -} - -#if defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) -#define PACKAGE "perf" -#include -#include -#include -#include -#include -#include -#include - -static int symbol__disassemble_bpf(struct symbol *sym, - struct annotate_args *args) -{ - struct annotation *notes = symbol__annotation(sym); - struct bpf_prog_linfo *prog_linfo = NULL; - struct bpf_prog_info_node *info_node; - int len = sym->end - sym->start; - disassembler_ftype disassemble; - struct map *map = args->ms.map; - struct perf_bpil *info_linear; - struct disassemble_info info; - struct dso *dso = 
map__dso(map); - int pc = 0, count, sub_id; - struct btf *btf = NULL; - char tpath[PATH_MAX]; - size_t buf_size; - int nr_skip = 0; - char *buf; - bfd *bfdf; - int ret; - FILE *s; - - if (dso->binary_type != DSO_BINARY_TYPE__BPF_PROG_INFO) - return SYMBOL_ANNOTATE_ERRNO__BPF_INVALID_FILE; - - pr_debug("%s: handling sym %s addr %" PRIx64 " len %" PRIx64 "\n", __func__, - sym->name, sym->start, sym->end - sym->start); - - memset(tpath, 0, sizeof(tpath)); - perf_exe(tpath, sizeof(tpath)); - - bfdf = bfd_openr(tpath, NULL); - if (bfdf == NULL) - abort(); - - if (!bfd_check_format(bfdf, bfd_object)) - abort(); - - s = open_memstream(&buf, &buf_size); - if (!s) { - ret = errno; - goto out; - } - init_disassemble_info_compat(&info, s, - (fprintf_ftype) fprintf, - fprintf_styled); - info.arch = bfd_get_arch(bfdf); - info.mach = bfd_get_mach(bfdf); - - info_node = perf_env__find_bpf_prog_info(dso->bpf_prog.env, - dso->bpf_prog.id); - if (!info_node) { - ret = SYMBOL_ANNOTATE_ERRNO__BPF_MISSING_BTF; - goto out; - } - info_linear = info_node->info_linear; - sub_id = dso->bpf_prog.sub_id; - - info.buffer = (void *)(uintptr_t)(info_linear->info.jited_prog_insns); - info.buffer_length = info_linear->info.jited_prog_len; - - if (info_linear->info.nr_line_info) - prog_linfo = bpf_prog_linfo__new(&info_linear->info); - - if (info_linear->info.btf_id) { - struct btf_node *node; - - node = perf_env__find_btf(dso->bpf_prog.env, - info_linear->info.btf_id); - if (node) - btf = btf__new((__u8 *)(node->data), - node->data_size); - } - - disassemble_init_for_target(&info); - -#ifdef DISASM_FOUR_ARGS_SIGNATURE - disassemble = disassembler(info.arch, - bfd_big_endian(bfdf), - info.mach, - bfdf); -#else - disassemble = disassembler(bfdf); -#endif - if (disassemble == NULL) - abort(); - - fflush(s); - do { - const struct bpf_line_info *linfo = NULL; - struct disasm_line *dl; - size_t prev_buf_size; - const char *srcline; - u64 addr; - - addr = pc + ((u64 
*)(uintptr_t)(info_linear->info.jited_ksyms))[sub_id]; - count = disassemble(pc, &info); - - if (prog_linfo) - linfo = bpf_prog_linfo__lfind_addr_func(prog_linfo, - addr, sub_id, - nr_skip); - - if (linfo && btf) { - srcline = btf__name_by_offset(btf, linfo->line_off); - nr_skip++; - } else - srcline = NULL; - - fprintf(s, "\n"); - prev_buf_size = buf_size; - fflush(s); - - if (!annotate_opts.hide_src_code && srcline) { - args->offset = -1; - args->line = strdup(srcline); - args->line_nr = 0; - args->fileloc = NULL; - args->ms.sym = sym; - dl = disasm_line__new(args); - if (dl) { - annotation_line__add(&dl->al, - ¬es->src->source); - } - } - - args->offset = pc; - args->line = buf + prev_buf_size; - args->line_nr = 0; - args->fileloc = NULL; - args->ms.sym = sym; - dl = disasm_line__new(args); - if (dl) - annotation_line__add(&dl->al, ¬es->src->source); - - pc += count; - } while (count > 0 && pc < len); - - ret = 0; -out: - free(prog_linfo); - btf__free(btf); - fclose(s); - bfd_close(bfdf); - return ret; -} -#else // defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) -static int symbol__disassemble_bpf(struct symbol *sym __maybe_unused, - struct annotate_args *args __maybe_unused) -{ - return SYMBOL_ANNOTATE_ERRNO__NO_LIBOPCODES_FOR_BPF; -} -#endif // defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) - -static int -symbol__disassemble_bpf_image(struct symbol *sym, - struct annotate_args *args) -{ - struct annotation *notes = symbol__annotation(sym); - struct disasm_line *dl; - - args->offset = -1; - args->line = strdup("to be implemented"); - args->line_nr = 0; - args->fileloc = NULL; - dl = disasm_line__new(args); - if (dl) - annotation_line__add(&dl->al, ¬es->src->source); - - zfree(&args->line); - return 0; -} - -/* - * Possibly create a new version of line with tabs expanded. Returns the - * existing or new line, storage is updated if a new line is allocated. If - * allocation fails then NULL is returned. 
- */ -static char *expand_tabs(char *line, char **storage, size_t *storage_len) -{ - size_t i, src, dst, len, new_storage_len, num_tabs; - char *new_line; - size_t line_len = strlen(line); - - for (num_tabs = 0, i = 0; i < line_len; i++) - if (line[i] == '\t') - num_tabs++; - - if (num_tabs == 0) - return line; - - /* - * Space for the line and '\0', less the leading and trailing - * spaces. Each tab may introduce 7 additional spaces. - */ - new_storage_len = line_len + 1 + (num_tabs * 7); - - new_line = malloc(new_storage_len); - if (new_line == NULL) { - pr_err("Failure allocating memory for tab expansion\n"); - return NULL; - } - - /* - * Copy regions starting at src and expand tabs. If there are two - * adjacent tabs then 'src == i', the memcpy is of size 0 and the spaces - * are inserted. - */ - for (i = 0, src = 0, dst = 0; i < line_len && num_tabs; i++) { - if (line[i] == '\t') { - len = i - src; - memcpy(&new_line[dst], &line[src], len); - dst += len; - new_line[dst++] = ' '; - while (dst % 8 != 0) - new_line[dst++] = ' '; - src = i + 1; - num_tabs--; - } - } - - /* Expand the last region. */ - len = line_len - src; - memcpy(&new_line[dst], &line[src], len); - dst += len; - new_line[dst] = '\0'; - - free(*storage); - *storage = new_line; - *storage_len = new_storage_len; - return new_line; - -} - -static int symbol__disassemble(struct symbol *sym, struct annotate_args *args) -{ - struct annotation_options *opts = &annotate_opts; - struct map *map = args->ms.map; - struct dso *dso = map__dso(map); - char *command; - FILE *file; - char symfs_filename[PATH_MAX]; - struct kcore_extract kce; - bool delete_extract = false; - bool decomp = false; - int lineno = 0; - char *fileloc = NULL; - int nline; - char *line; - size_t line_len; - const char *objdump_argv[] = { - "/bin/sh", - "-c", - NULL, /* Will be the objdump command to run. */ - "--", - NULL, /* Will be the symfs path. 
*/ - NULL, - }; - struct child_process objdump_process; - int err = dso__disassemble_filename(dso, symfs_filename, sizeof(symfs_filename)); - - if (err) - return err; + if (max_lines && printed >= max_lines) + return 1; - pr_debug("%s: filename=%s, sym=%s, start=%#" PRIx64 ", end=%#" PRIx64 "\n", __func__, - symfs_filename, sym->name, map__unmap_ip(map, sym->start), - map__unmap_ip(map, sym->end)); - - pr_debug("annotating [%p] %30s : [%p] %30s\n", - dso, dso->long_name, sym, sym->name); - - if (dso->binary_type == DSO_BINARY_TYPE__BPF_PROG_INFO) { - return symbol__disassemble_bpf(sym, args); - } else if (dso->binary_type == DSO_BINARY_TYPE__BPF_IMAGE) { - return symbol__disassemble_bpf_image(sym, args); - } else if (dso__is_kcore(dso)) { - kce.kcore_filename = symfs_filename; - kce.addr = map__rip_2objdump(map, sym->start); - kce.offs = sym->start; - kce.len = sym->end - sym->start; - if (!kcore_extract__create(&kce)) { - delete_extract = true; - strlcpy(symfs_filename, kce.extract_filename, - sizeof(symfs_filename)); + if (queue != NULL) { + list_for_each_entry_from(queue, ¬es->src->source, node) { + if (queue == al) + break; + annotation_line__print(queue, sym, start, evsel, len, + 0, 0, 1, NULL, addr_fmt_width, + percent_type); + } } - } else if (dso__needs_decompress(dso)) { - char tmp[KMOD_DECOMP_LEN]; - - if (dso__decompress_kmodule_path(dso, symfs_filename, - tmp, sizeof(tmp)) < 0) - return -1; - decomp = true; - strcpy(symfs_filename, tmp); - } - - err = asprintf(&command, - "%s %s%s --start-address=0x%016" PRIx64 - " --stop-address=0x%016" PRIx64 - " %s -d %s %s %s %c%s%c %s%s -C \"$1\"", - opts->objdump_path ?: "objdump", - opts->disassembler_style ? "-M " : "", - opts->disassembler_style ?: "", - map__rip_2objdump(map, sym->start), - map__rip_2objdump(map, sym->end), - opts->show_linenr ? "-l" : "", - opts->show_asm_raw ? "" : "--no-show-raw-insn", - opts->annotate_src ? "-S" : "", - opts->prefix ? "--prefix " : "", - opts->prefix ? 
'"' : ' ', - opts->prefix ?: "", - opts->prefix ? '"' : ' ', - opts->prefix_strip ? "--prefix-strip=" : "", - opts->prefix_strip ?: ""); - - if (err < 0) { - pr_err("Failure allocating memory for the command to run\n"); - goto out_remove_tmp; - } - - pr_debug("Executing: %s\n", command); - - objdump_argv[2] = command; - objdump_argv[4] = symfs_filename; - - /* Create a pipe to read from for stdout */ - memset(&objdump_process, 0, sizeof(objdump_process)); - objdump_process.argv = objdump_argv; - objdump_process.out = -1; - objdump_process.err = -1; - objdump_process.no_stderr = 1; - if (start_command(&objdump_process)) { - pr_err("Failure starting to run %s\n", command); - err = -1; - goto out_free_command; - } - - file = fdopen(objdump_process.out, "r"); - if (!file) { - pr_err("Failure creating FILE stream for %s\n", command); - /* - * If we were using debug info should retry with - * original binary. - */ - err = -1; - goto out_close_stdout; - } + color = get_percent_color(max_percent); - /* Storage for getline. 
*/ - line = NULL; - line_len = 0; + for (i = 0; i < nr_percent; i++) { + struct annotation_data *data = &al->data[i]; + double percent; - nline = 0; - while (!feof(file)) { - const char *match; - char *expanded_line; + percent = annotation_data__percent(data, percent_type); + color = get_percent_color(percent); - if (getline(&line, &line_len, file) < 0 || !line) - break; + if (symbol_conf.show_total_period) + color_fprintf(stdout, color, " %11" PRIu64, + data->he.period); + else if (symbol_conf.show_nr_samples) + color_fprintf(stdout, color, " %7" PRIu64, + data->he.nr_samples); + else + color_fprintf(stdout, color, " %7.2f", percent); + } - /* Skip lines containing "filename:" */ - match = strstr(line, symfs_filename); - if (match && match[strlen(symfs_filename)] == ':') - continue; + printf(" : "); - expanded_line = strim(line); - expanded_line = expand_tabs(expanded_line, &line, &line_len); - if (!expanded_line) - break; + disasm_line__print(dl, start, addr_fmt_width); /* - * The source code line number (lineno) needs to be kept in - * across calls to symbol__parse_objdump_line(), so that it - * can associate it with the instructions till the next one. - * See disasm_line__new() and struct disasm_line::line_nr. + * Also color the filename and line if needed, with + * the same color than the percentage. Don't print it + * twice for close colored addr with the same filename:line */ - if (symbol__parse_objdump_line(sym, args, expanded_line, - &lineno, &fileloc) < 0) - break; - nline++; - } - free(line); - free(fileloc); - - err = finish_command(&objdump_process); - if (err) - pr_err("Error running %s\n", command); - - if (nline == 0) { - err = -1; - pr_err("No output from %s\n", command); - } - - /* - * kallsyms does not have symbol sizes so there may a nop at the end. - * Remove it. 
- */ - if (dso__is_kcore(dso)) - delete_last_nop(sym); - - fclose(file); + if (al->path) { + if (!prev_line || strcmp(prev_line, al->path)) { + color_fprintf(stdout, color, " // %s", al->path); + prev_line = al->path; + } + } -out_close_stdout: - close(objdump_process.out); + printf("\n"); + } else if (max_lines && printed >= max_lines) + return 1; + else { + int width = symbol_conf.show_total_period ? 12 : 8; -out_free_command: - free(command); + if (queue) + return -1; -out_remove_tmp: - if (decomp) - unlink(symfs_filename); + if (evsel__is_group_event(evsel)) + width *= evsel->core.nr_members; - if (delete_extract) - kcore_extract__delete(&kce); + if (!*al->line) + printf(" %*s:\n", width, " "); + else + printf(" %*s: %-*d %s\n", width, " ", addr_fmt_width, al->line_nr, al->line); + } - return err; + return 0; } static void calc_percent(struct annotation *notes, @@ -2429,8 +872,10 @@ static int evsel__get_arch(struct evsel *evsel, struct arch **parch) struct arch *arch; int err; - if (!arch_name) + if (!arch_name) { + *parch = NULL; return errno; + } *parch = arch = arch__find(arch_name); if (arch == NULL) { @@ -2468,15 +913,22 @@ int symbol__annotate(struct map_symbol *ms, struct evsel *evsel, if (parch) *parch = arch; - if (!list_empty(¬es->src->source)) + if (notes->src && !list_empty(¬es->src->source)) return 0; args.arch = arch; args.ms = *ms; + + if (notes->src == NULL) { + notes->src = annotated_source__new(); + if (notes->src == NULL) + return -1; + } + if (annotate_opts.full_addr) - notes->start = map__objdump_2mem(ms->map, ms->sym->start); + notes->src->start = map__objdump_2mem(ms->map, ms->sym->start); else - notes->start = map__rip_2objdump(ms->map, ms->sym->start); + notes->src->start = map__rip_2objdump(ms->map, ms->sym->start); return symbol__disassemble(sym, &args); } @@ -2658,7 +1110,7 @@ int symbol__annotate_printf(struct map_symbol *ms, struct evsel *evsel) int graph_dotted_len; char buf[512]; - filename = strdup(dso->long_name); + filename = 
strdup(dso__long_name(dso)); if (!filename) return -ENOMEM; @@ -2823,7 +1275,7 @@ int map_symbol__annotation_dump(struct map_symbol *ms, struct evsel *evsel) } fprintf(fp, "%s() %s\nEvent: %s\n\n", - ms->sym->name, map__dso(ms->map)->long_name, ev_name); + ms->sym->name, dso__long_name(map__dso(ms->map)), ev_name); symbol__annotate_fprintf2(ms->sym, fp); fclose(fp); @@ -2845,13 +1297,16 @@ void symbol__annotate_decay_histogram(struct symbol *sym, int evidx) { struct annotation *notes = symbol__annotation(sym); struct sym_hist *h = annotation__histogram(notes, evidx); - int len = symbol__size(sym), offset; + struct annotation_line *al; h->nr_samples = 0; - for (offset = 0; offset < len; ++offset) { + list_for_each_entry(al, ¬es->src->source, node) { struct sym_hist_entry *entry; - entry = annotated_source__hist_entry(notes->src, evidx, offset); + if (al->offset == -1) + continue; + + entry = annotated_source__hist_entry(notes->src, evidx, al->offset); if (entry == NULL) continue; @@ -2908,64 +1363,56 @@ bool disasm_line__is_valid_local_jump(struct disasm_line *dl, struct symbol *sym return true; } -void annotation__mark_jump_targets(struct annotation *notes, struct symbol *sym) +static void +annotation__mark_jump_targets(struct annotation *notes, struct symbol *sym) { - u64 offset, size = symbol__size(sym); + struct annotation_line *al; /* PLT symbols contain external offsets */ if (strstr(sym->name, "@plt")) return; - for (offset = 0; offset < size; ++offset) { - struct annotation_line *al = notes->src->offsets[offset]; + list_for_each_entry(al, ¬es->src->source, node) { struct disasm_line *dl; + struct annotation_line *target; dl = disasm_line(al); if (!disasm_line__is_valid_local_jump(dl, sym)) continue; - al = notes->src->offsets[dl->ops.target.offset]; - + target = annotated_source__get_line(notes->src, + dl->ops.target.offset); /* * FIXME: Oops, no jump target? Buggy disassembler? Or do we * have to adjust to the previous offset? 
*/ - if (al == NULL) + if (target == NULL) continue; - if (++al->jump_sources > notes->max_jump_sources) - notes->max_jump_sources = al->jump_sources; + if (++target->jump_sources > notes->src->max_jump_sources) + notes->src->max_jump_sources = target->jump_sources; } } -void annotation__set_offsets(struct annotation *notes, s64 size) +static void annotation__set_index(struct annotation *notes) { struct annotation_line *al; struct annotated_source *src = notes->src; - src->max_line_len = 0; + src->widths.max_line_len = 0; src->nr_entries = 0; src->nr_asm_entries = 0; list_for_each_entry(al, &src->source, node) { size_t line_len = strlen(al->line); - if (src->max_line_len < line_len) - src->max_line_len = line_len; + if (src->widths.max_line_len < line_len) + src->widths.max_line_len = line_len; al->idx = src->nr_entries++; - if (al->offset != -1) { + if (al->offset != -1) al->idx_asm = src->nr_asm_entries++; - /* - * FIXME: short term bandaid to cope with assembly - * routines that comes with labels in the same column - * as the address in objdump, sigh. - * - * E.g. 
copy_user_generic_unrolled - */ - if (al->offset < size) - notes->src->offsets[al->offset] = al; - } else + else al->idx_asm = -1; } } @@ -2996,28 +1443,29 @@ static int annotation__max_ins_name(struct annotation *notes) return max_name; } -void annotation__init_column_widths(struct annotation *notes, struct symbol *sym) +static void +annotation__init_column_widths(struct annotation *notes, struct symbol *sym) { - notes->widths.addr = notes->widths.target = - notes->widths.min_addr = hex_width(symbol__size(sym)); - notes->widths.max_addr = hex_width(sym->end); - notes->widths.jumps = width_jumps(notes->max_jump_sources); - notes->widths.max_ins_name = annotation__max_ins_name(notes); + notes->src->widths.addr = notes->src->widths.target = + notes->src->widths.min_addr = hex_width(symbol__size(sym)); + notes->src->widths.max_addr = hex_width(sym->end); + notes->src->widths.jumps = width_jumps(notes->src->max_jump_sources); + notes->src->widths.max_ins_name = annotation__max_ins_name(notes); } void annotation__update_column_widths(struct annotation *notes) { if (annotate_opts.use_offset) - notes->widths.target = notes->widths.min_addr; + notes->src->widths.target = notes->src->widths.min_addr; else if (annotate_opts.full_addr) - notes->widths.target = BITS_PER_LONG / 4; + notes->src->widths.target = BITS_PER_LONG / 4; else - notes->widths.target = notes->widths.max_addr; + notes->src->widths.target = notes->src->widths.max_addr; - notes->widths.addr = notes->widths.target; + notes->src->widths.addr = notes->src->widths.target; if (annotate_opts.show_nr_jumps) - notes->widths.addr += notes->widths.jumps + 1; + notes->src->widths.addr += notes->src->widths.jumps + 1; } void annotation__toggle_full_addr(struct annotation *notes, struct map_symbol *ms) @@ -3025,9 +1473,9 @@ void annotation__toggle_full_addr(struct annotation *notes, struct map_symbol *m annotate_opts.full_addr = !annotate_opts.full_addr; if (annotate_opts.full_addr) - notes->start = 
map__objdump_2mem(ms->map, ms->sym->start); + notes->src->start = map__objdump_2mem(ms->map, ms->sym->start); else - notes->start = map__rip_2objdump(ms->map, ms->sym->start); + notes->src->start = map__rip_2objdump(ms->map, ms->sym->start); annotation__update_column_widths(notes); } @@ -3085,7 +1533,7 @@ int symbol__tty_annotate2(struct map_symbol *ms, struct evsel *evsel) if (err) { char msg[BUFSIZ]; - dso->annotate_warned = true; + dso__set_annotate_warned(dso); symbol__strerror_disassemble(ms, err, msg, sizeof(msg)); ui__error("Couldn't annotate %s:\n%s", sym->name, msg); return -1; @@ -3094,13 +1542,12 @@ int symbol__tty_annotate2(struct map_symbol *ms, struct evsel *evsel) if (annotate_opts.print_lines) { srcline_full_filename = annotate_opts.full_path; symbol__calc_lines(ms, &source_line); - print_summary(&source_line, dso->long_name); + print_summary(&source_line, dso__long_name(dso)); } hists__scnprintf_title(hists, buf, sizeof(buf)); fprintf(stdout, "%s, [percent: %s]\n%s() %s\n", - buf, percent_type_str(annotate_opts.percent_type), sym->name, - dso->long_name); + buf, percent_type_str(annotate_opts.percent_type), sym->name, dso__long_name(dso)); symbol__annotate_fprintf2(sym, stdout); annotated_source__purge(symbol__annotation(sym)->src); @@ -3119,7 +1566,7 @@ int symbol__tty_annotate(struct map_symbol *ms, struct evsel *evsel) if (err) { char msg[BUFSIZ]; - dso->annotate_warned = true; + dso__set_annotate_warned(dso); symbol__strerror_disassemble(ms, err, msg, sizeof(msg)); ui__error("Couldn't annotate %s:\n%s", sym->name, msg); return -1; @@ -3130,7 +1577,7 @@ int symbol__tty_annotate(struct map_symbol *ms, struct evsel *evsel) if (annotate_opts.print_lines) { srcline_full_filename = annotate_opts.full_path; symbol__calc_lines(ms, &source_line); - print_summary(&source_line, dso->long_name); + print_summary(&source_line, dso__long_name(dso)); } symbol__annotate_printf(ms, evsel); @@ -3153,7 +1600,7 @@ static double annotation_line__max_percent(struct 
annotation_line *al, double percent_max = 0.0; int i; - for (i = 0; i < notes->nr_events; i++) { + for (i = 0; i < notes->src->nr_events; i++) { double percent; percent = annotation_data__percent(&al->data[i], @@ -3194,7 +1641,8 @@ call_like: obj__printf(obj, " "); } - disasm_line__scnprintf(dl, bf, size, !annotate_opts.use_offset, notes->widths.max_ins_name); + disasm_line__scnprintf(dl, bf, size, !annotate_opts.use_offset, + notes->src->widths.max_ins_name); } static void ipc_coverage_string(char *bf, int size, struct annotation *notes) @@ -3242,7 +1690,7 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati if (al->offset != -1 && percent_max != 0.0) { int i; - for (i = 0; i < notes->nr_events; i++) { + for (i = 0; i < notes->src->nr_events; i++) { double percent; percent = annotation_data__percent(&al->data[i], percent_type); @@ -3322,9 +1770,11 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati obj__printf(obj, "%-*s", width - pcnt_width - cycles_width, " "); else if (al->offset == -1) { if (al->line_nr && annotate_opts.show_linenr) - printed = scnprintf(bf, sizeof(bf), "%-*d ", notes->widths.addr + 1, al->line_nr); + printed = scnprintf(bf, sizeof(bf), "%-*d ", + notes->src->widths.addr + 1, al->line_nr); else - printed = scnprintf(bf, sizeof(bf), "%-*s ", notes->widths.addr, " "); + printed = scnprintf(bf, sizeof(bf), "%-*s ", + notes->src->widths.addr, " "); obj__printf(obj, bf); obj__printf(obj, "%-*s", width - printed - pcnt_width - cycles_width + 1, al->line); } else { @@ -3332,7 +1782,7 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati int color = -1; if (!annotate_opts.use_offset) - addr += notes->start; + addr += notes->src->start; if (!annotate_opts.use_offset) { printed = scnprintf(bf, sizeof(bf), "%" PRIx64 ": ", addr); @@ -3342,7 +1792,7 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati if (annotate_opts.show_nr_jumps) 
{ int prev; printed = scnprintf(bf, sizeof(bf), "%*d ", - notes->widths.jumps, + notes->src->widths.jumps, al->jump_sources); prev = obj__set_jumps_percent_color(obj, al->jump_sources, current_entry); @@ -3351,7 +1801,7 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati } print_addr: printed = scnprintf(bf, sizeof(bf), "%*" PRIx64 ": ", - notes->widths.target, addr); + notes->src->widths.target, addr); } else if (ins__is_call(&disasm_line(al)->ins) && annotate_opts.offset_level >= ANNOTATION__OFFSET_CALL) { goto print_addr; @@ -3359,7 +1809,7 @@ print_addr: goto print_addr; } else { printed = scnprintf(bf, sizeof(bf), "%-*s ", - notes->widths.addr, " "); + notes->src->widths.addr, " "); } } @@ -3395,37 +1845,29 @@ int symbol__annotate2(struct map_symbol *ms, struct evsel *evsel, size_t size = symbol__size(sym); int nr_pcnt = 1, err; - notes->src->offsets = zalloc(size * sizeof(struct annotation_line *)); - if (notes->src->offsets == NULL) - return ENOMEM; - if (evsel__is_group_event(evsel)) nr_pcnt = evsel->core.nr_members; err = symbol__annotate(ms, evsel, parch); if (err) - goto out_free_offsets; + return err; symbol__calc_percent(sym, evsel); - annotation__set_offsets(notes, size); + annotation__set_index(notes); annotation__mark_jump_targets(notes, sym); err = annotation__compute_ipc(notes, size); if (err) - goto out_free_offsets; + return err; annotation__init_column_widths(notes, sym); - notes->nr_events = nr_pcnt; + notes->src->nr_events = nr_pcnt; annotation__update_column_widths(notes); sym->annotate2 = 1; return 0; - -out_free_offsets: - zfree(¬es->src->offsets); - return err; } static int annotation__config(const char *var, const char *value, void *data) @@ -3597,6 +2039,12 @@ static int extract_reg_offset(struct arch *arch, const char *str, * %gs:0x18(%rbx). In that case it should skip the part. 
*/ if (*str == arch->objdump.register_char) { + if (arch__is(arch, "x86")) { + /* FIXME: Handle other segment registers */ + if (!strncmp(str, "%gs:", 4)) + op_loc->segment = INSN_SEG_X86_GS; + } + while (*str && !isdigit(*str) && *str != arch->objdump.memory_ref_char) str++; @@ -3651,7 +2099,7 @@ static int extract_reg_offset(struct arch *arch, const char *str, * mov 0x18, %r8 # src_reg1 = -1, src_mem = 0 * # dst_reg1 = r8, dst_mem = 0 * - * mov %rsi, 8(%rbx,%rcx,4) # src_reg1 = rsi, src_mem = 0, dst_multi_regs = 0 + * mov %rsi, 8(%rbx,%rcx,4) # src_reg1 = rsi, src_mem = 0, src_multi_regs = 0 * # dst_reg1 = rbx, dst_reg2 = rcx, dst_mem = 1 * # dst_multi_regs = 1, dst_offset = 8 */ @@ -3662,7 +2110,7 @@ int annotate_get_insn_location(struct arch *arch, struct disasm_line *dl, struct annotated_op_loc *op_loc; int i; - if (!strcmp(dl->ins.name, "lock")) + if (ins__is_lock(&dl->ins)) ops = dl->ops.locked.ops; else ops = &dl->ops; @@ -3693,40 +2141,40 @@ int annotate_get_insn_location(struct arch *arch, struct disasm_line *dl, op_loc->multi_regs = multi_regs; extract_reg_offset(arch, insn_str, op_loc); } else { - char *s = strdup(insn_str); + char *s, *p = NULL; + + if (arch__is(arch, "x86")) { + /* FIXME: Handle other segment registers */ + if (!strncmp(insn_str, "%gs:", 4)) { + op_loc->segment = INSN_SEG_X86_GS; + op_loc->offset = strtol(insn_str + 4, + &p, 0); + if (p && p != insn_str + 4) + op_loc->imm = true; + continue; + } + } + + s = strdup(insn_str); + if (s == NULL) + return -1; - if (s) { + if (*s == arch->objdump.register_char) op_loc->reg1 = get_dwarf_regnum(s, 0); - free(s); + else if (*s == arch->objdump.imm_char) { + op_loc->offset = strtol(s + 1, &p, 0); + if (p && p != s + 1) + op_loc->imm = true; } + free(s); } } return 0; } -static void symbol__ensure_annotate(struct map_symbol *ms, struct evsel *evsel) -{ - struct disasm_line *dl, *tmp_dl; - struct annotation *notes; - - notes = symbol__annotation(ms->sym); - if (!list_empty(¬es->src->source)) - 
return; - - if (symbol__annotate(ms, evsel, NULL) < 0) - return; - - /* remove non-insn disasm lines for simplicity */ - list_for_each_entry_safe(dl, tmp_dl, ¬es->src->source, al.node) { - if (dl->al.offset == -1) { - list_del(&dl->al.node); - free(dl); - } - } -} - -static struct disasm_line *find_disasm_line(struct symbol *sym, u64 ip) +static struct disasm_line *find_disasm_line(struct symbol *sym, u64 ip, + bool allow_update) { struct disasm_line *dl; struct annotation *notes; @@ -3734,12 +2182,16 @@ static struct disasm_line *find_disasm_line(struct symbol *sym, u64 ip) notes = symbol__annotation(sym); list_for_each_entry(dl, ¬es->src->source, al.node) { + if (dl->al.offset == -1) + continue; + if (sym->start + dl->al.offset == ip) { /* * llvm-objdump places "lock" in a separate line and * in that case, we want to get the next line. */ - if (!strcmp(dl->ins.name, "lock") && *dl->ops.raw == '\0') { + if (ins__is_lock(&dl->ins) && + *dl->ops.raw == '\0' && allow_update) { ip++; continue; } @@ -3785,6 +2237,58 @@ static bool is_stack_operation(struct arch *arch, struct disasm_line *dl) return false; } +static bool is_stack_canary(struct arch *arch, struct annotated_op_loc *loc) +{ + /* On x86_64, %gs:40 is used for stack canary */ + if (arch__is(arch, "x86")) { + if (loc->segment == INSN_SEG_X86_GS && loc->imm && + loc->offset == 40) + return true; + } + + return false; +} + +static struct disasm_line * +annotation__prev_asm_line(struct annotation *notes, struct disasm_line *curr) +{ + struct list_head *sources = ¬es->src->source; + struct disasm_line *prev; + + if (curr == list_first_entry(sources, struct disasm_line, al.node)) + return NULL; + + prev = list_prev_entry(curr, al.node); + while (prev->al.offset == -1 && + prev != list_first_entry(sources, struct disasm_line, al.node)) + prev = list_prev_entry(prev, al.node); + + if (prev->al.offset == -1) + return NULL; + + return prev; +} + +static struct disasm_line * +annotation__next_asm_line(struct annotation 
*notes, struct disasm_line *curr) +{ + struct list_head *sources = ¬es->src->source; + struct disasm_line *next; + + if (curr == list_last_entry(sources, struct disasm_line, al.node)) + return NULL; + + next = list_next_entry(curr, al.node); + while (next->al.offset == -1 && + next != list_last_entry(sources, struct disasm_line, al.node)) + next = list_next_entry(next, al.node); + + if (next->al.offset == -1) + return NULL; + + return next; +} + u64 annotate_calc_pcrel(struct map_symbol *ms, u64 ip, int offset, struct disasm_line *dl) { @@ -3800,12 +2304,12 @@ u64 annotate_calc_pcrel(struct map_symbol *ms, u64 ip, int offset, * disasm_line. If it's the last one, we can use symbol's end * address directly. */ - if (&dl->al.node == notes->src->source.prev) + next = annotation__next_asm_line(notes, dl); + if (next == NULL) addr = ms->sym->end + offset; - else { - next = list_next_entry(dl, al.node); + else addr = ip + (next->al.offset - dl->al.offset) + offset; - } + return map__rip_2objdump(ms->map, addr); } @@ -3828,9 +2332,7 @@ struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he) struct annotated_op_loc *op_loc; struct annotated_data_type *mem_type; struct annotated_item_stat *istat; - u64 ip = he->ip, addr = 0; - const char *var_name = NULL; - int var_offset; + u64 ip = he->ip; int i; ann_data_stat.total++; @@ -3845,19 +2347,17 @@ struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he) return NULL; } - if (evsel__get_arch(evsel, &arch) < 0) { + /* Make sure it has the disasm of the function */ + if (symbol__annotate(ms, evsel, &arch) < 0) { ann_data_stat.no_insn++; return NULL; } - /* Make sure it runs objdump to get disasm of the function */ - symbol__ensure_annotate(ms, evsel); - /* * Get a disasm to extract the location from the insn. * This is too slow... 
*/ - dl = find_disasm_line(ms->sym, ip); + dl = find_disasm_line(ms->sym, ip, /*allow_update=*/true); if (dl == NULL) { ann_data_stat.no_insn++; return NULL; @@ -3883,51 +2383,55 @@ retry: } for_each_insn_op_loc(&loc, i, op_loc) { - if (!op_loc->mem_ref) + struct data_loc_info dloc = { + .arch = arch, + .thread = he->thread, + .ms = ms, + /* Recalculate IP for LOCK prefix or insn fusion */ + .ip = ms->sym->start + dl->al.offset, + .cpumode = he->cpumode, + .op = op_loc, + }; + + if (!op_loc->mem_ref && op_loc->segment == INSN_SEG_NONE) continue; /* Recalculate IP because of LOCK prefix or insn fusion */ ip = ms->sym->start + dl->al.offset; - var_offset = op_loc->offset; - /* PC-relative addressing */ if (op_loc->reg1 == DWARF_REG_PC) { - struct addr_location al; - struct symbol *var; - u64 map_addr; - - addr = annotate_calc_pcrel(ms, ip, op_loc->offset, dl); - /* Kernel symbols might be relocated */ - map_addr = addr + map__reloc(ms->map); - - addr_location__init(&al); - var = thread__find_symbol_fb(he->thread, he->cpumode, - map_addr, &al); - if (var) { - var_name = var->name; - /* Calculate type offset from the start of variable */ - var_offset = map_addr - map__unmap_ip(al.map, var->start); - } - addr_location__exit(&al); + dloc.var_addr = annotate_calc_pcrel(ms, dloc.ip, + op_loc->offset, dl); + } + + /* This CPU access in kernel - pretend PC-relative addressing */ + if (dso__kernel(map__dso(ms->map)) && arch__is(arch, "x86") && + op_loc->segment == INSN_SEG_X86_GS && op_loc->imm) { + dloc.var_addr = op_loc->offset; + op_loc->reg1 = DWARF_REG_PC; + } + + mem_type = find_data_type(&dloc); + + if (mem_type == NULL && is_stack_canary(arch, op_loc)) { + istat->good++; + he->mem_type_off = 0; + return &canary_type; } - mem_type = find_data_type(ms, ip, op_loc, addr, var_name); if (mem_type) istat->good++; else istat->bad++; - if (mem_type && var_name) - op_loc->offset = var_offset; - if (symbol_conf.annotate_data_sample) { 
annotated_data_type__update_samples(mem_type, evsel, - op_loc->offset, + dloc.type_offset, he->stat.nr_events, he->stat.period); } - he->mem_type_off = op_loc->offset; + he->mem_type_off = dloc.type_offset; return mem_type; } @@ -3936,10 +2440,13 @@ retry: * from the previous instruction. */ if (dl->al.offset > 0) { + struct annotation *notes; struct disasm_line *prev_dl; - prev_dl = list_prev_entry(dl, al.node); - if (ins__is_fused(arch, prev_dl->ins.name, dl->ins.name)) { + notes = symbol__annotation(ms->sym); + prev_dl = annotation__prev_asm_line(notes, dl); + + if (prev_dl && ins__is_fused(arch, prev_dl->ins.name, dl->ins.name)) { dl = prev_dl; goto retry; } @@ -3949,3 +2456,227 @@ retry: istat->bad++; return NULL; } + +/* Basic block traversal (BFS) data structure */ +struct basic_block_data { + struct list_head queue; + struct list_head visited; +}; + +/* + * During the traversal, it needs to know the parent block where the current + * block block started from. Note that single basic block can be parent of + * two child basic blocks (in case of condition jump). 
+ */ +struct basic_block_link { + struct list_head node; + struct basic_block_link *parent; + struct annotated_basic_block *bb; +}; + +/* Check any of basic block in the list already has the offset */ +static bool basic_block_has_offset(struct list_head *head, s64 offset) +{ + struct basic_block_link *link; + + list_for_each_entry(link, head, node) { + s64 begin_offset = link->bb->begin->al.offset; + s64 end_offset = link->bb->end->al.offset; + + if (begin_offset <= offset && offset <= end_offset) + return true; + } + return false; +} + +static bool is_new_basic_block(struct basic_block_data *bb_data, + struct disasm_line *dl) +{ + s64 offset = dl->al.offset; + + if (basic_block_has_offset(&bb_data->visited, offset)) + return false; + if (basic_block_has_offset(&bb_data->queue, offset)) + return false; + return true; +} + +/* Add a basic block starting from dl and link it to the parent */ +static int add_basic_block(struct basic_block_data *bb_data, + struct basic_block_link *parent, + struct disasm_line *dl) +{ + struct annotated_basic_block *bb; + struct basic_block_link *link; + + if (dl == NULL) + return -1; + + if (!is_new_basic_block(bb_data, dl)) + return 0; + + bb = zalloc(sizeof(*bb)); + if (bb == NULL) + return -1; + + bb->begin = dl; + bb->end = dl; + INIT_LIST_HEAD(&bb->list); + + link = malloc(sizeof(*link)); + if (link == NULL) { + free(bb); + return -1; + } + + link->bb = bb; + link->parent = parent; + list_add_tail(&link->node, &bb_data->queue); + return 0; +} + +/* Returns true when it finds the target in the current basic block */ +static bool process_basic_block(struct basic_block_data *bb_data, + struct basic_block_link *link, + struct symbol *sym, u64 target) +{ + struct disasm_line *dl, *next_dl, *last_dl; + struct annotation *notes = symbol__annotation(sym); + bool found = false; + + dl = link->bb->begin; + /* Check if it's already visited */ + if (basic_block_has_offset(&bb_data->visited, dl->al.offset)) + return false; + + last_dl = 
list_last_entry(¬es->src->source, + struct disasm_line, al.node); + if (last_dl->al.offset == -1) + last_dl = annotation__prev_asm_line(notes, last_dl); + + if (last_dl == NULL) + return false; + + list_for_each_entry_from(dl, ¬es->src->source, al.node) { + /* Skip comment or debug info line */ + if (dl->al.offset == -1) + continue; + /* Found the target instruction */ + if (sym->start + dl->al.offset == target) { + found = true; + break; + } + /* End of the function, finish the block */ + if (dl == last_dl) + break; + /* 'return' instruction finishes the block */ + if (ins__is_ret(&dl->ins)) + break; + /* normal instructions are part of the basic block */ + if (!ins__is_jump(&dl->ins)) + continue; + /* jump to a different function, tail call or return */ + if (dl->ops.target.outside) + break; + /* jump instruction creates new basic block(s) */ + next_dl = find_disasm_line(sym, sym->start + dl->ops.target.offset, + /*allow_update=*/false); + if (next_dl) + add_basic_block(bb_data, link, next_dl); + + /* + * FIXME: determine conditional jumps properly. + * Conditional jumps create another basic block with the + * next disasm line. + */ + if (!strstr(dl->ins.name, "jmp")) { + next_dl = annotation__next_asm_line(notes, dl); + if (next_dl) + add_basic_block(bb_data, link, next_dl); + } + break; + + } + link->bb->end = dl; + return found; +} + +/* + * It founds a target basic block, build a proper linked list of basic blocks + * by following the link recursively. 
+ */ +static void link_found_basic_blocks(struct basic_block_link *link, + struct list_head *head) +{ + while (link) { + struct basic_block_link *parent = link->parent; + + list_move(&link->bb->list, head); + list_del(&link->node); + free(link); + + link = parent; + } +} + +static void delete_basic_blocks(struct basic_block_data *bb_data) +{ + struct basic_block_link *link, *tmp; + + list_for_each_entry_safe(link, tmp, &bb_data->queue, node) { + list_del(&link->node); + zfree(&link->bb); + free(link); + } + + list_for_each_entry_safe(link, tmp, &bb_data->visited, node) { + list_del(&link->node); + zfree(&link->bb); + free(link); + } +} + +/** + * annotate_get_basic_blocks - Get basic blocks for given address range + * @sym: symbol to annotate + * @src: source address + * @dst: destination address + * @head: list head to save basic blocks + * + * This function traverses disasm_lines from @src to @dst and save them in a + * list of annotated_basic_block to @head. It uses BFS to find the shortest + * path between two. The basic_block_link is to maintain parent links so + * that it can build a list of blocks from the start. 
+ */ +int annotate_get_basic_blocks(struct symbol *sym, s64 src, s64 dst, + struct list_head *head) +{ + struct basic_block_data bb_data = { + .queue = LIST_HEAD_INIT(bb_data.queue), + .visited = LIST_HEAD_INIT(bb_data.visited), + }; + struct basic_block_link *link; + struct disasm_line *dl; + int ret = -1; + + dl = find_disasm_line(sym, src, /*allow_update=*/false); + if (dl == NULL) + return -1; + + if (add_basic_block(&bb_data, /*parent=*/NULL, dl) < 0) + return -1; + + /* Find shortest path from src to dst using BFS */ + while (!list_empty(&bb_data.queue)) { + link = list_first_entry(&bb_data.queue, struct basic_block_link, node); + + if (process_basic_block(&bb_data, link, sym, dst)) { + link_found_basic_blocks(link, head); + ret = 0; + break; + } + list_move(&link->node, &bb_data.visited); + } + delete_basic_blocks(&bb_data); + return ret; +} diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 13cc659e50..d5c821c22f 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -13,10 +13,10 @@ #include "mutex.h" #include "spark.h" #include "hashmap.h" +#include "disasm.h" struct hist_browser_timer; struct hist_entry; -struct ins_ops; struct map; struct map_symbol; struct addr_map_symbol; @@ -26,59 +26,6 @@ struct evsel; struct symbol; struct annotated_data_type; -struct ins { - const char *name; - struct ins_ops *ops; -}; - -struct ins_operands { - char *raw; - struct { - char *raw; - char *name; - struct symbol *sym; - u64 addr; - s64 offset; - bool offset_avail; - bool outside; - bool multi_regs; - } target; - union { - struct { - char *raw; - char *name; - u64 addr; - bool multi_regs; - } source; - struct { - struct ins ins; - struct ins_operands *ops; - } locked; - struct { - char *raw_comment; - char *raw_func_start; - } jump; - }; -}; - -struct arch; - -bool arch__is(struct arch *arch, const char *name); - -struct ins_ops { - void (*free)(struct ins_operands *ops); - int (*parse)(struct arch *arch, struct 
ins_operands *ops, struct map_symbol *ms); - int (*scnprintf)(struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name); -}; - -bool ins__is_jump(const struct ins *ins); -bool ins__is_call(const struct ins *ins); -bool ins__is_ret(const struct ins *ins); -bool ins__is_lock(const struct ins *ins); -int ins__scnprintf(struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name); -bool ins__is_fused(struct arch *arch, const char *ins1, const char *ins2); - #define ANNOTATION__IPC_WIDTH 6 #define ANNOTATION__CYCLES_WIDTH 6 #define ANNOTATION__MINMAX_CYCLES_WIDTH 19 @@ -171,6 +118,8 @@ struct disasm_line { struct annotation_line al; }; +void annotation_line__add(struct annotation_line *al, struct list_head *head); + static inline double annotation_data__percent(struct annotation_data *data, unsigned int which) { @@ -212,7 +161,6 @@ static inline bool disasm_line__has_local_offset(const struct disasm_line *dl) */ bool disasm_line__is_valid_local_jump(struct disasm_line *dl, struct symbol *sym); -void disasm_line__free(struct disasm_line *dl); struct annotation_line * annotation_line__next(struct annotation_line *pos, struct list_head *head); @@ -235,7 +183,6 @@ int __annotation__scnprintf_samples_period(struct annotation *notes, struct evsel *evsel, bool show_freq); -int disasm_line__scnprintf(struct disasm_line *dl, char *bf, size_t size, bool raw, int max_ins_name); size_t disasm__fprintf(struct list_head *head, FILE *fp); void symbol__calc_percent(struct symbol *sym, struct evsel *evsel); @@ -299,12 +246,14 @@ struct cyc_hist { * we have more than a group in a evlist, where we will want * to see each group separately, that is why symbol__annotate2() * sets src->nr_histograms to evsel->nr_members. - * @offsets: Array of annotation_line to be accessed by offset. * @samples: Hash map of sym_hist_entry. Keyed by event index and offset in symbol. + * @nr_events: Number of events in the current output. 
* @nr_entries: Number of annotated_line in the source list. * @nr_asm_entries: Number of annotated_line with actual asm instruction in the * source list. - * @max_line_len: Maximum length of objdump output in an annotated_line. + * @max_jump_sources: Maximum number of jump instructions targeting to the same + * instruction. + * @widths: Precalculated width of each column in the TUI output. * * disasm_lines are allocated, percentages calculated and all sorted by percentage * when the annotation is about to be presented, so the percentages are for @@ -315,14 +264,27 @@ struct cyc_hist { struct annotated_source { struct list_head source; struct sym_hist *histograms; - struct annotation_line **offsets; struct hashmap *samples; int nr_histograms; + int nr_events; int nr_entries; int nr_asm_entries; - u16 max_line_len; + int max_jump_sources; + u64 start; + struct { + u8 addr; + u8 jumps; + u8 target; + u8 min_addr; + u8 max_addr; + u8 max_ins_name; + u16 max_line_len; + } widths; }; +struct annotation_line *annotated_source__get_line(struct annotated_source *src, + s64 offset); + /** * struct annotated_branch - basic block and IPC information for a symbol. * @@ -351,17 +313,6 @@ struct annotated_branch { }; struct LOCKABLE annotation { - u64 start; - int nr_events; - int max_jump_sources; - struct { - u8 addr; - u8 jumps; - u8 target; - u8 min_addr; - u8 max_addr; - u8 max_ins_name; - } widths; struct annotated_source *src; struct annotated_branch *branch; }; @@ -385,7 +336,7 @@ static inline int annotation__cycles_width(struct annotation *notes) static inline int annotation__pcnt_width(struct annotation *notes) { - return (symbol_conf.show_total_period ? 12 : 7) * notes->nr_events; + return (symbol_conf.show_total_period ? 
12 : 7) * notes->src->nr_events; } static inline bool annotation_line__filter(struct annotation_line *al) @@ -393,10 +344,7 @@ static inline bool annotation_line__filter(struct annotation_line *al) return annotate_opts.hide_src_code && al->offset == -1; } -void annotation__set_offsets(struct annotation *notes, s64 size); -void annotation__mark_jump_targets(struct annotation *notes, struct symbol *sym); void annotation__update_column_widths(struct annotation *notes); -void annotation__init_column_widths(struct annotation *notes, struct symbol *sym); void annotation__toggle_full_addr(struct annotation *notes, struct map_symbol *ms); static inline struct sym_hist *annotated_source__histogram(struct annotated_source *src, int idx) @@ -511,15 +459,19 @@ int annotate_check_args(void); * @reg1: First register in the operand * @reg2: Second register in the operand * @offset: Memory access offset in the operand + * @segment: Segment selector register * @mem_ref: Whether the operand accesses memory * @multi_regs: Whether the second register is used + * @imm: Whether the operand is an immediate value (in offset) */ struct annotated_op_loc { int reg1; int reg2; int offset; + u8 segment; bool mem_ref; bool multi_regs; + bool imm; }; enum annotated_insn_ops { @@ -529,6 +481,17 @@ enum annotated_insn_ops { INSN_OP_MAX, }; +enum annotated_x86_segment { + INSN_SEG_NONE = 0, + + INSN_SEG_X86_CS, + INSN_SEG_X86_DS, + INSN_SEG_X86_ES, + INSN_SEG_X86_FS, + INSN_SEG_X86_GS, + INSN_SEG_X86_SS, +}; + /** * struct annotated_insn_loc - Location info of instruction * @ops: Array of location info for source and target operands @@ -561,4 +524,20 @@ extern struct list_head ann_insn_stat; u64 annotate_calc_pcrel(struct map_symbol *ms, u64 ip, int offset, struct disasm_line *dl); +/** + * struct annotated_basic_block - Basic block of instructions + * @list: List node + * @begin: start instruction in the block + * @end: end instruction in the block + */ +struct annotated_basic_block { + struct 
list_head list; + struct disasm_line *begin; + struct disasm_line *end; +}; + +/* Get a list of basic blocks from src to dst addresses */ +int annotate_get_basic_blocks(struct symbol *sym, s64 src, s64 dst, + struct list_head *head); + #endif /* __PERF_ANNOTATE_H */ diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index ef314a5797..e2f317063e 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -174,7 +174,7 @@ void auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp, struct evlist *evlist, struct evsel *evsel, int idx) { - bool per_cpu = !perf_cpu_map__has_any_cpu_or_is_empty(evlist->core.user_requested_cpus); + bool per_cpu = !perf_cpu_map__has_any_cpu(evlist->core.user_requested_cpus); mp->mmap_needed = evsel->needs_auxtrace_mmap; @@ -218,15 +218,20 @@ static struct auxtrace_queue *auxtrace_alloc_queue_array(unsigned int nr_queues) return queue_array; } -int auxtrace_queues__init(struct auxtrace_queues *queues) +int auxtrace_queues__init_nr(struct auxtrace_queues *queues, int nr_queues) { - queues->nr_queues = AUXTRACE_INIT_NR_QUEUES; + queues->nr_queues = nr_queues; queues->queue_array = auxtrace_alloc_queue_array(queues->nr_queues); if (!queues->queue_array) return -ENOMEM; return 0; } +int auxtrace_queues__init(struct auxtrace_queues *queues) +{ + return auxtrace_queues__init_nr(queues, AUXTRACE_INIT_NR_QUEUES); +} + static int auxtrace_queues__grow(struct auxtrace_queues *queues, unsigned int new_nr_queues) { @@ -648,7 +653,7 @@ int auxtrace_parse_snapshot_options(struct auxtrace_record *itr, static int evlist__enable_event_idx(struct evlist *evlist, struct evsel *evsel, int idx) { - bool per_cpu_mmaps = !perf_cpu_map__has_any_cpu_or_is_empty(evlist->core.user_requested_cpus); + bool per_cpu_mmaps = !perf_cpu_map__has_any_cpu(evlist->core.user_requested_cpus); if (per_cpu_mmaps) { struct perf_cpu evlist_cpu = perf_cpu_map__cpu(evlist->core.all_cpus, idx); @@ -2654,7 +2659,7 @@ static int 
addr_filter__entire_dso(struct addr_filter *filt, struct dso *dso) } filt->addr = 0; - filt->size = dso->data.file_size; + filt->size = dso__data(dso)->file_size; return 0; } diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h index 55702215a8..8a6ec95658 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -521,6 +521,7 @@ int auxtrace_mmap__read_snapshot(struct mmap *map, struct perf_tool *tool, process_auxtrace_t fn, size_t snapshot_size); +int auxtrace_queues__init_nr(struct auxtrace_queues *queues, int nr_queues); int auxtrace_queues__init(struct auxtrace_queues *queues); int auxtrace_queues__add_event(struct auxtrace_queues *queues, struct perf_session *session, diff --git a/tools/perf/util/block-info.c b/tools/perf/util/block-info.c index dec9109897..04068d4868 100644 --- a/tools/perf/util/block-info.c +++ b/tools/perf/util/block-info.c @@ -43,26 +43,14 @@ static struct block_header_column { } }; -struct block_info *block_info__get(struct block_info *bi) -{ - if (bi) - refcount_inc(&bi->refcnt); - return bi; -} - -void block_info__put(struct block_info *bi) +struct block_info *block_info__new(void) { - if (bi && refcount_dec_and_test(&bi->refcnt)) - free(bi); + return zalloc(sizeof(struct block_info)); } -struct block_info *block_info__new(void) +void block_info__delete(struct block_info *bi) { - struct block_info *bi = zalloc(sizeof(*bi)); - - if (bi) - refcount_set(&bi->refcnt, 1); - return bi; + free(bi); } int64_t __block_info__cmp(struct hist_entry *left, struct hist_entry *right) @@ -148,7 +136,7 @@ int block_info__process_sym(struct hist_entry *he, struct block_hist *bh, he_block = hists__add_entry_block(&bh->block_hists, &al, bi); if (!he_block) { - block_info__put(bi); + block_info__delete(bi); return -1; } } @@ -319,7 +307,7 @@ static int block_dso_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, if (map && map__dso(map)) { return scnprintf(hpp->buf, hpp->size, "%*s", block_fmt->width, - 
map__dso(map)->short_name); + dso__short_name(map__dso(map))); } return scnprintf(hpp->buf, hpp->size, "%*s", block_fmt->width, diff --git a/tools/perf/util/block-info.h b/tools/perf/util/block-info.h index 96f53e8979..0b9e1aad4c 100644 --- a/tools/perf/util/block-info.h +++ b/tools/perf/util/block-info.h @@ -3,7 +3,6 @@ #define __PERF_BLOCK_H #include -#include #include "hist.h" #include "symbol.h" #include "sort.h" @@ -19,7 +18,6 @@ struct block_info { u64 total_cycles; int num; int num_aggr; - refcount_t refcnt; }; struct block_fmt { @@ -48,19 +46,8 @@ struct block_report { int nr_fmts; }; -struct block_hist; - struct block_info *block_info__new(void); -struct block_info *block_info__get(struct block_info *bi); -void block_info__put(struct block_info *bi); - -static inline void __block_info__zput(struct block_info **bi) -{ - block_info__put(*bi); - *bi = NULL; -} - -#define block_info__zput(bi) __block_info__zput(&bi) +void block_info__delete(struct block_info *bi); int64_t __block_info__cmp(struct hist_entry *left, struct hist_entry *right); diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index 83709146a4..827695cd04 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -59,10 +59,10 @@ static int machine__process_bpf_event_load(struct machine *machine, if (map) { struct dso *dso = map__dso(map); - dso->binary_type = DSO_BINARY_TYPE__BPF_PROG_INFO; - dso->bpf_prog.id = id; - dso->bpf_prog.sub_id = i; - dso->bpf_prog.env = env; + dso__set_binary_type(dso, DSO_BINARY_TYPE__BPF_PROG_INFO); + dso__bpf_prog(dso)->id = id; + dso__bpf_prog(dso)->sub_id = i; + dso__bpf_prog(dso)->env = env; map__put(map); } } diff --git a/tools/perf/util/bpf_counter_cgroup.c b/tools/perf/util/bpf_counter_cgroup.c index 1c82377ed7..ea29c372f3 100644 --- a/tools/perf/util/bpf_counter_cgroup.c +++ b/tools/perf/util/bpf_counter_cgroup.c @@ -136,9 +136,8 @@ static int bperf_load_program(struct evlist *evlist) cgrp = evsel->cgrp; if 
(read_cgroup_id(cgrp) < 0) { - pr_err("Failed to get cgroup id\n"); - err = -1; - goto out; + pr_debug("Failed to get cgroup id for %s\n", cgrp->name); + cgrp->id = 0; } map_fd = bpf_map__fd(skel->maps.cgrp_idx); diff --git a/tools/perf/util/bpf_kwork.c b/tools/perf/util/bpf_kwork.c index 6eb2c78fd7..44f0f708a1 100644 --- a/tools/perf/util/bpf_kwork.c +++ b/tools/perf/util/bpf_kwork.c @@ -147,12 +147,12 @@ static bool valid_kwork_class_type(enum kwork_class_type type) static int setup_filters(struct perf_kwork *kwork) { - u8 val = 1; - int i, nr_cpus, key, fd; - struct perf_cpu_map *map; - if (kwork->cpu_list != NULL) { - fd = bpf_map__fd(skel->maps.perf_kwork_cpu_filter); + int idx, nr_cpus; + struct perf_cpu_map *map; + struct perf_cpu cpu; + int fd = bpf_map__fd(skel->maps.perf_kwork_cpu_filter); + if (fd < 0) { pr_debug("Invalid cpu filter fd\n"); return -1; @@ -165,8 +165,8 @@ static int setup_filters(struct perf_kwork *kwork) } nr_cpus = libbpf_num_possible_cpus(); - for (i = 0; i < perf_cpu_map__nr(map); i++) { - struct perf_cpu cpu = perf_cpu_map__cpu(map, i); + perf_cpu_map__for_each_cpu(cpu, idx, map) { + u8 val = 1; if (cpu.cpu >= nr_cpus) { perf_cpu_map__put(map); @@ -181,6 +181,8 @@ static int setup_filters(struct perf_kwork *kwork) } if (kwork->profile_name != NULL) { + int key, fd; + if (strlen(kwork->profile_name) >= MAX_KWORKNAME) { pr_err("Requested name filter %s too large, limit to %d\n", kwork->profile_name, MAX_KWORKNAME - 1); diff --git a/tools/perf/util/bpf_kwork_top.c b/tools/perf/util/bpf_kwork_top.c index 035e022727..22a3b00a1e 100644 --- a/tools/perf/util/bpf_kwork_top.c +++ b/tools/perf/util/bpf_kwork_top.c @@ -122,11 +122,11 @@ static bool valid_kwork_class_type(enum kwork_class_type type) static int setup_filters(struct perf_kwork *kwork) { - u8 val = 1; - int i, nr_cpus, fd; - struct perf_cpu_map *map; - if (kwork->cpu_list) { + int idx, nr_cpus, fd; + struct perf_cpu_map *map; + struct perf_cpu cpu; + fd = 
bpf_map__fd(skel->maps.kwork_top_cpu_filter); if (fd < 0) { pr_debug("Invalid cpu filter fd\n"); @@ -140,8 +140,8 @@ static int setup_filters(struct perf_kwork *kwork) } nr_cpus = libbpf_num_possible_cpus(); - for (i = 0; i < perf_cpu_map__nr(map); i++) { - struct perf_cpu cpu = perf_cpu_map__cpu(map, i); + perf_cpu_map__for_each_cpu(cpu, idx, map) { + u8 val = 1; if (cpu.cpu >= nr_cpus) { perf_cpu_map__put(map); diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c index 2872f9bc07..0acbd74e8c 100644 --- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c +++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c @@ -341,6 +341,27 @@ failure: return 1; /* Failure: don't filter */ } +SEC("tp/syscalls/sys_enter_nanosleep") +int sys_enter_nanosleep(struct syscall_enter_args *args) +{ + struct augmented_args_payload *augmented_args = augmented_args_payload(); + const void *req_arg = (const void *)args->args[0]; + unsigned int len = sizeof(augmented_args->args); + __u32 size = sizeof(struct timespec64); + + if (augmented_args == NULL) + goto failure; + + if (size > sizeof(augmented_args->__data)) + goto failure; + + bpf_probe_read_user(&augmented_args->__data, size, req_arg); + + return augmented__output(args, augmented_args, len + size); +failure: + return 1; /* Failure: don't filter */ +} + static pid_t getpid(void) { return bpf_get_current_pid_tgid(); diff --git a/tools/perf/util/bpf_skel/bench_uprobe.bpf.c b/tools/perf/util/bpf_skel/bench_uprobe.bpf.c index 2c55896bb3..a01c7f791f 100644 --- a/tools/perf/util/bpf_skel/bench_uprobe.bpf.c +++ b/tools/perf/util/bpf_skel/bench_uprobe.bpf.c @@ -4,6 +4,7 @@ #include unsigned int nr_uprobes; +unsigned int nr_uretprobes; SEC("uprobe") int BPF_UPROBE(empty) @@ -20,4 +21,19 @@ int BPF_UPROBE(trace_printk) return 0; } +SEC("uretprobe") +int BPF_URETPROBE(empty_ret) +{ + return 0; +} + +SEC("uretprobe") +int BPF_URETPROBE(trace_printk_ret) +{ + char fmt[] 
= "perf bench uretprobe %u"; + + bpf_trace_printk(fmt, sizeof(fmt), ++nr_uretprobes); + return 0; +} + char LICENSE[] SEC("license") = "Dual BSD/GPL"; diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index 03c64b8538..83a1581e8c 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -60,7 +60,7 @@ int build_id__mark_dso_hit(struct perf_tool *tool __maybe_unused, addr_location__init(&al); if (thread__find_map(thread, sample->cpumode, sample->ip, &al)) - map__dso(al.map)->hit = 1; + dso__set_hit(map__dso(al.map)); addr_location__exit(&al); thread__put(thread); @@ -272,10 +272,10 @@ char *__dso__build_id_filename(const struct dso *dso, char *bf, size_t size, bool alloc = (bf == NULL); int ret; - if (!dso->has_build_id) + if (!dso__has_build_id(dso)) return NULL; - build_id__sprintf(&dso->bid, sbuild_id); + build_id__sprintf(dso__bid_const(dso), sbuild_id); linkname = build_id_cache__linkname(sbuild_id, NULL, 0); if (!linkname) return NULL; @@ -327,48 +327,56 @@ static int write_buildid(const char *name, size_t name_len, struct build_id *bid return write_padded(fd, name, name_len + 1, len); } -static int machine__write_buildid_table(struct machine *machine, - struct feat_fd *fd) +struct machine__write_buildid_table_cb_args { + struct machine *machine; + struct feat_fd *fd; + u16 kmisc, umisc; +}; + +static int machine__write_buildid_table_cb(struct dso *dso, void *data) { - int err = 0; - struct dso *pos; - u16 kmisc = PERF_RECORD_MISC_KERNEL, - umisc = PERF_RECORD_MISC_USER; + struct machine__write_buildid_table_cb_args *args = data; + const char *name; + size_t name_len; + bool in_kernel = false; - if (!machine__is_host(machine)) { - kmisc = PERF_RECORD_MISC_GUEST_KERNEL; - umisc = PERF_RECORD_MISC_GUEST_USER; - } + if (!dso__has_build_id(dso)) + return 0; - dsos__for_each_with_build_id(pos, &machine->dsos.head) { - const char *name; - size_t name_len; - bool in_kernel = false; + if (!dso__hit(dso) && !dso__is_vdso(dso)) + 
return 0; - if (!pos->hit && !dso__is_vdso(pos)) - continue; + if (dso__is_vdso(dso)) { + name = dso__short_name(dso); + name_len = dso__short_name_len(dso); + } else if (dso__is_kcore(dso)) { + name = args->machine->mmap_name; + name_len = strlen(name); + } else { + name = dso__long_name(dso); + name_len = dso__long_name_len(dso); + } - if (dso__is_vdso(pos)) { - name = pos->short_name; - name_len = pos->short_name_len; - } else if (dso__is_kcore(pos)) { - name = machine->mmap_name; - name_len = strlen(name); - } else { - name = pos->long_name; - name_len = pos->long_name_len; - } + in_kernel = dso__kernel(dso) || is_kernel_module(name, PERF_RECORD_MISC_CPUMODE_UNKNOWN); + return write_buildid(name, name_len, dso__bid(dso), args->machine->pid, + in_kernel ? args->kmisc : args->umisc, args->fd); +} - in_kernel = pos->kernel || - is_kernel_module(name, - PERF_RECORD_MISC_CPUMODE_UNKNOWN); - err = write_buildid(name, name_len, &pos->bid, machine->pid, - in_kernel ? kmisc : umisc, fd); - if (err) - break; +static int machine__write_buildid_table(struct machine *machine, struct feat_fd *fd) +{ + struct machine__write_buildid_table_cb_args args = { + .machine = machine, + .fd = fd, + .kmisc = PERF_RECORD_MISC_KERNEL, + .umisc = PERF_RECORD_MISC_USER, + }; + + if (!machine__is_host(machine)) { + args.kmisc = PERF_RECORD_MISC_GUEST_KERNEL; + args.umisc = PERF_RECORD_MISC_GUEST_USER; } - return err; + return dsos__for_each_dso(&machine->dsos, machine__write_buildid_table_cb, &args); } int perf_session__write_buildid_table(struct perf_session *session, @@ -390,42 +398,6 @@ int perf_session__write_buildid_table(struct perf_session *session, return err; } -static int __dsos__hit_all(struct list_head *head) -{ - struct dso *pos; - - list_for_each_entry(pos, head, node) - pos->hit = true; - - return 0; -} - -static int machine__hit_all_dsos(struct machine *machine) -{ - return __dsos__hit_all(&machine->dsos.head); -} - -int dsos__hit_all(struct perf_session *session) -{ - 
struct rb_node *nd; - int err; - - err = machine__hit_all_dsos(&session->machines.host); - if (err) - return err; - - for (nd = rb_first_cached(&session->machines.guests); nd; - nd = rb_next(nd)) { - struct machine *pos = rb_entry(nd, struct machine, rb_node); - - err = machine__hit_all_dsos(pos); - if (err) - return err; - } - - return 0; -} - void disable_buildid_cache(void) { no_buildid_cache = true; @@ -904,11 +876,11 @@ static bool dso__build_id_mismatch(struct dso *dso, const char *name) struct build_id bid; bool ret = false; - mutex_lock(&dso->lock); - if (filename__read_build_id_ns(name, &bid, dso->nsinfo) >= 0) + mutex_lock(dso__lock(dso)); + if (filename__read_build_id_ns(name, &bid, dso__nsinfo(dso)) >= 0) ret = !dso__build_id_equal(dso, &bid); - mutex_unlock(&dso->lock); + mutex_unlock(dso__lock(dso)); return ret; } @@ -918,13 +890,13 @@ static int dso__cache_build_id(struct dso *dso, struct machine *machine, { bool is_kallsyms = dso__is_kallsyms(dso); bool is_vdso = dso__is_vdso(dso); - const char *name = dso->long_name; + const char *name = dso__long_name(dso); const char *proper_name = NULL; const char *root_dir = NULL; char *allocated_name = NULL; int ret = 0; - if (!dso->has_build_id) + if (!dso__has_build_id(dso)) return 0; if (dso__is_kcore(dso)) { @@ -949,10 +921,10 @@ static int dso__cache_build_id(struct dso *dso, struct machine *machine, if (!is_kallsyms && dso__build_id_mismatch(dso, name)) goto out_free; - mutex_lock(&dso->lock); - ret = build_id_cache__add_b(&dso->bid, name, dso->nsinfo, + mutex_lock(dso__lock(dso)); + ret = build_id_cache__add_b(dso__bid(dso), name, dso__nsinfo(dso), is_kallsyms, is_vdso, proper_name, root_dir); - mutex_unlock(&dso->lock); + mutex_unlock(dso__lock(dso)); out_free: free(allocated_name); return ret; @@ -992,7 +964,7 @@ int perf_session__cache_build_ids(struct perf_session *session) static bool machine__read_build_ids(struct machine *machine, bool with_hits) { - return 
__dsos__read_build_ids(&machine->dsos.head, with_hits); + return dsos__read_build_ids(&machine->dsos, with_hits); } bool perf_session__read_build_ids(struct perf_session *session, bool with_hits) diff --git a/tools/perf/util/build-id.h b/tools/perf/util/build-id.h index 4e3a116937..3fa8bffb07 100644 --- a/tools/perf/util/build-id.h +++ b/tools/perf/util/build-id.h @@ -39,8 +39,6 @@ int build_id__mark_dso_hit(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, struct machine *machine); -int dsos__hit_all(struct perf_session *session); - int perf_event__inject_buildid(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, struct machine *machine); diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 7517d16c02..1730b852a9 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -606,7 +606,7 @@ fill_node(struct callchain_node *node, struct callchain_cursor *cursor) call->brtype_stat = zalloc(sizeof(*call->brtype_stat)); if (!call->brtype_stat) { perror("not enough memory for the code path branch statistics"); - free(call->brtype_stat); + zfree(&call->brtype_stat); return -ENOMEM; } } @@ -1205,7 +1205,7 @@ char *callchain_list__sym_name(struct callchain_list *cl, if (show_dso) scnprintf(bf + printed, bfsize - printed, " %s", cl->ms.map ? 
- map__dso(cl->ms.map)->short_name : + dso__short_name(map__dso(cl->ms.map)) : "unknown"); return bf; diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c index fcb5090584..0f759dd96d 100644 --- a/tools/perf/util/cgroup.c +++ b/tools/perf/util/cgroup.c @@ -465,9 +465,11 @@ int evlist__expand_cgroup(struct evlist *evlist, const char *str, name = cn->name + prefix_len; if (name[0] == '/' && name[1]) name++; + + /* the cgroup can go away in the meantime */ cgrp = cgroup__new(name, open_cgroup); if (cgrp == NULL) - goto out_err; + continue; leader = NULL; evlist__for_each_entry(orig_list, pos) { diff --git a/tools/perf/util/comm.c b/tools/perf/util/comm.c index afb8d4fd26..49b79cf0c5 100644 --- a/tools/perf/util/comm.c +++ b/tools/perf/util/comm.c @@ -1,108 +1,188 @@ // SPDX-License-Identifier: GPL-2.0 #include "comm.h" #include -#include -#include #include +#include #include -#include #include #include "rwsem.h" -struct comm_str { - char *str; - struct rb_node rb_node; +DECLARE_RC_STRUCT(comm_str) { refcount_t refcnt; + char str[]; }; -/* Should perhaps be moved to struct machine */ -static struct rb_root comm_str_root; -static struct rw_semaphore comm_str_lock = {.lock = PTHREAD_RWLOCK_INITIALIZER,}; +static struct comm_strs { + struct rw_semaphore lock; + struct comm_str **strs; + int num_strs; + int capacity; +} _comm_strs; + +static void comm_strs__remove_if_last(struct comm_str *cs); + +static void comm_strs__init(void) +{ + init_rwsem(&_comm_strs.lock); + _comm_strs.capacity = 16; + _comm_strs.num_strs = 0; + _comm_strs.strs = calloc(16, sizeof(*_comm_strs.strs)); +} + +static struct comm_strs *comm_strs__get(void) +{ + static pthread_once_t comm_strs_type_once = PTHREAD_ONCE_INIT; + + pthread_once(&comm_strs_type_once, comm_strs__init); + + return &_comm_strs; +} + +static refcount_t *comm_str__refcnt(struct comm_str *cs) +{ + return &RC_CHK_ACCESS(cs)->refcnt; +} + +static const char *comm_str__str(const struct comm_str *cs) +{ + return 
&RC_CHK_ACCESS(cs)->str[0]; +} static struct comm_str *comm_str__get(struct comm_str *cs) { - if (cs && refcount_inc_not_zero(&cs->refcnt)) - return cs; + struct comm_str *result; - return NULL; + if (RC_CHK_GET(result, cs)) + refcount_inc_not_zero(comm_str__refcnt(cs)); + + return result; } static void comm_str__put(struct comm_str *cs) { - if (cs && refcount_dec_and_test(&cs->refcnt)) { - down_write(&comm_str_lock); - rb_erase(&cs->rb_node, &comm_str_root); - up_write(&comm_str_lock); - zfree(&cs->str); - free(cs); + if (!cs) + return; + + if (refcount_dec_and_test(comm_str__refcnt(cs))) { + RC_CHK_FREE(cs); + } else { + if (refcount_read(comm_str__refcnt(cs)) == 1) + comm_strs__remove_if_last(cs); + + RC_CHK_PUT(cs); } } -static struct comm_str *comm_str__alloc(const char *str) +static struct comm_str *comm_str__new(const char *str) { - struct comm_str *cs; - - cs = zalloc(sizeof(*cs)); - if (!cs) - return NULL; + struct comm_str *result = NULL; + RC_STRUCT(comm_str) *cs; - cs->str = strdup(str); - if (!cs->str) { - free(cs); - return NULL; + cs = malloc(sizeof(*cs) + strlen(str) + 1); + if (ADD_RC_CHK(result, cs)) { + refcount_set(comm_str__refcnt(result), 1); + strcpy(&cs->str[0], str); } + return result; +} - refcount_set(&cs->refcnt, 1); +static int comm_str__search(const void *_key, const void *_member) +{ + const char *key = _key; + const struct comm_str *member = *(const struct comm_str * const *)_member; - return cs; + return strcmp(key, comm_str__str(member)); } -static -struct comm_str *__comm_str__findnew(const char *str, struct rb_root *root) +static void comm_strs__remove_if_last(struct comm_str *cs) { - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct comm_str *iter, *new; - int cmp; - - while (*p != NULL) { - parent = *p; - iter = rb_entry(parent, struct comm_str, rb_node); - - /* - * If we race with comm_str__put, iter->refcnt is 0 - * and it will be removed within comm_str__put call - * shortly, ignore it in this 
search. - */ - cmp = strcmp(str, iter->str); - if (!cmp && comm_str__get(iter)) - return iter; - - if (cmp < 0) - p = &(*p)->rb_left; - else - p = &(*p)->rb_right; + struct comm_strs *comm_strs = comm_strs__get(); + + down_write(&comm_strs->lock); + /* + * Are there only references from the array, if so remove the array + * reference under the write lock so that we don't race with findnew. + */ + if (refcount_read(comm_str__refcnt(cs)) == 1) { + struct comm_str **entry; + + entry = bsearch(comm_str__str(cs), comm_strs->strs, comm_strs->num_strs, + sizeof(struct comm_str *), comm_str__search); + comm_str__put(*entry); + for (int i = entry - comm_strs->strs; i < comm_strs->num_strs - 1; i++) + comm_strs->strs[i] = comm_strs->strs[i + 1]; + comm_strs->num_strs--; } + up_write(&comm_strs->lock); +} - new = comm_str__alloc(str); - if (!new) - return NULL; +static struct comm_str *__comm_strs__find(struct comm_strs *comm_strs, const char *str) +{ + struct comm_str **result; - rb_link_node(&new->rb_node, parent, p); - rb_insert_color(&new->rb_node, root); + result = bsearch(str, comm_strs->strs, comm_strs->num_strs, sizeof(struct comm_str *), + comm_str__search); - return new; + if (!result) + return NULL; + + return comm_str__get(*result); } -static struct comm_str *comm_str__findnew(const char *str, struct rb_root *root) +static struct comm_str *comm_strs__findnew(const char *str) { - struct comm_str *cs; + struct comm_strs *comm_strs = comm_strs__get(); + struct comm_str *result; - down_write(&comm_str_lock); - cs = __comm_str__findnew(str, root); - up_write(&comm_str_lock); + if (!comm_strs) + return NULL; - return cs; + down_read(&comm_strs->lock); + result = __comm_strs__find(comm_strs, str); + up_read(&comm_strs->lock); + if (result) + return result; + + down_write(&comm_strs->lock); + result = __comm_strs__find(comm_strs, str); + if (!result) { + if (comm_strs->num_strs == comm_strs->capacity) { + struct comm_str **tmp; + + tmp = reallocarray(comm_strs->strs, + 
comm_strs->capacity + 16, + sizeof(*comm_strs->strs)); + if (!tmp) { + up_write(&comm_strs->lock); + return NULL; + } + comm_strs->strs = tmp; + comm_strs->capacity += 16; + } + result = comm_str__new(str); + if (result) { + int low = 0, high = comm_strs->num_strs - 1; + int insert = comm_strs->num_strs; /* Default to inserting at the end. */ + + while (low <= high) { + int mid = low + (high - low) / 2; + int cmp = strcmp(comm_str__str(comm_strs->strs[mid]), str); + + if (cmp < 0) { + low = mid + 1; + } else { + high = mid - 1; + insert = mid; + } + } + memmove(&comm_strs->strs[insert + 1], &comm_strs->strs[insert], + (comm_strs->num_strs - insert) * sizeof(struct comm_str *)); + comm_strs->num_strs++; + comm_strs->strs[insert] = result; + } + } + up_write(&comm_strs->lock); + return comm_str__get(result); } struct comm *comm__new(const char *str, u64 timestamp, bool exec) @@ -115,7 +195,7 @@ struct comm *comm__new(const char *str, u64 timestamp, bool exec) comm->start = timestamp; comm->exec = exec; - comm->comm_str = comm_str__findnew(str, &comm_str_root); + comm->comm_str = comm_strs__findnew(str); if (!comm->comm_str) { free(comm); return NULL; @@ -128,7 +208,7 @@ int comm__override(struct comm *comm, const char *str, u64 timestamp, bool exec) { struct comm_str *new, *old = comm->comm_str; - new = comm_str__findnew(str, &comm_str_root); + new = comm_strs__findnew(str); if (!new) return -ENOMEM; @@ -149,5 +229,5 @@ void comm__free(struct comm *comm) const char *comm__str(const struct comm *comm) { - return comm->comm_str->str; + return comm_str__str(comm->comm_str); } diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 356e30c42c..27094211ed 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -180,8 +180,6 @@ struct cpu_aggr_map *cpu_aggr_map__empty_new(int nr) cpus->nr = nr; for (i = 0; i < nr; i++) cpus->map[i] = aggr_cpu_id__empty(); - - refcount_set(&cpus->refcnt, 1); } return cpus; @@ -655,10 +653,10 @@ static char 
hex_char(unsigned char val) size_t cpu_map__snprint_mask(struct perf_cpu_map *map, char *buf, size_t size) { - int i, cpu; + int idx; char *ptr = buf; unsigned char *bitmap; - struct perf_cpu last_cpu = perf_cpu_map__cpu(map, perf_cpu_map__nr(map) - 1); + struct perf_cpu c, last_cpu = perf_cpu_map__max(map); if (buf == NULL) return 0; @@ -669,12 +667,10 @@ size_t cpu_map__snprint_mask(struct perf_cpu_map *map, char *buf, size_t size) return 0; } - for (i = 0; i < perf_cpu_map__nr(map); i++) { - cpu = perf_cpu_map__cpu(map, i).cpu; - bitmap[cpu / 8] |= 1 << (cpu % 8); - } + perf_cpu_map__for_each_cpu(c, idx, map) + bitmap[c.cpu / 8] |= 1 << (c.cpu % 8); - for (cpu = last_cpu.cpu / 4 * 4; cpu >= 0; cpu -= 4) { + for (int cpu = last_cpu.cpu / 4 * 4; cpu >= 0; cpu -= 4) { unsigned char bits = bitmap[cpu / 8]; if (cpu % 8) diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 26cf76c693..ee0f6139b0 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -5,7 +5,6 @@ #include #include #include -#include /** Identify where counts are aggregated, -1 implies not to aggregate. */ struct aggr_cpu_id { @@ -37,7 +36,6 @@ struct aggr_cpu_id { /** A collection of aggr_cpu_id values, the "built" version is sorted and uniqued. */ struct cpu_aggr_map { - refcount_t refcnt; /** Number of valid entries. */ int nr; /** The entries. */ diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index d65d748588..5e9fbcfad7 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -335,8 +335,11 @@ static int cs_etm__process_aux_output_hw_id(struct perf_session *session, trace_chan_id = FIELD_GET(CS_AUX_HW_ID_TRACE_ID_MASK, hw_id); /* check that we can handle this version */ - if (version > CS_AUX_HW_ID_CURR_VERSION) + if (version > CS_AUX_HW_ID_CURR_VERSION) { + pr_err("CS ETM Trace: PERF_RECORD_AUX_OUTPUT_HW_ID version %d not supported. 
Please update Perf.\n", + version); return -EINVAL; + } /* get access to the etm metadata */ etm = container_of(session->auxtrace, struct cs_etm_auxtrace, auxtrace); @@ -1010,7 +1013,7 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id, if (!dso) goto out; - if (dso->data.status == DSO_DATA_STATUS_ERROR && + if (dso__data(dso)->status == DSO_DATA_STATUS_ERROR && dso__data_status_seen(dso, DSO_DATA_STATUS_SEEN_ITRACE)) goto out; @@ -1024,11 +1027,11 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id, if (len <= 0) { ui__warning_once("CS ETM Trace: Missing DSO. Use 'perf archive' or debuginfod to export data from the traced system.\n" " Enable CONFIG_PROC_KCORE or use option '-k /path/to/vmlinux' for kernel symbols.\n"); - if (!dso->auxtrace_warned) { + if (!dso__auxtrace_warned(dso)) { pr_err("CS ETM Trace: Debug data not found for address %#"PRIx64" in %s\n", - address, - dso->long_name ? dso->long_name : "Unknown"); - dso->auxtrace_warned = true; + address, + dso__long_name(dso) ? 
dso__long_name(dso) : "Unknown"); + dso__set_auxtrace_warned(dso); } goto out; } diff --git a/tools/perf/util/data-convert-json.c b/tools/perf/util/data-convert-json.c index 09d57efd2d..3cf64f5b23 100644 --- a/tools/perf/util/data-convert-json.c +++ b/tools/perf/util/data-convert-json.c @@ -134,7 +134,7 @@ static void output_sample_callchain_entry(struct perf_tool *tool, output_json_key_string(out, false, 5, "symbol", al->sym->name); if (dso) { - const char *dso_name = dso->short_name; + const char *dso_name = dso__short_name(dso); if (dso_name && strlen(dso_name) > 0) { fputc(',', out); diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c index 106429155c..50f916374d 100644 --- a/tools/perf/util/db-export.c +++ b/tools/perf/util/db-export.c @@ -146,10 +146,10 @@ int db_export__comm_thread(struct db_export *dbe, struct comm *comm, int db_export__dso(struct db_export *dbe, struct dso *dso, struct machine *machine) { - if (dso->db_id) + if (dso__db_id(dso)) return 0; - dso->db_id = ++dbe->dso_last_db_id; + dso__set_db_id(dso, ++dbe->dso_last_db_id); if (dbe->export_dso) return dbe->export_dso(dbe, dso, machine); @@ -184,7 +184,7 @@ static int db_ids_from_al(struct db_export *dbe, struct addr_location *al, err = db_export__dso(dbe, dso, maps__machine(al->maps)); if (err) return err; - *dso_db_id = dso->db_id; + *dso_db_id = dso__db_id(dso); if (!al->sym) { al->sym = symbol__new(al->addr, 0, 0, 0, "unknown"); diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c index c39ee0fcb8..d633d15329 100644 --- a/tools/perf/util/debug.c +++ b/tools/perf/util/debug.c @@ -41,6 +41,7 @@ static int redirect_to_stderr; int debug_data_convert; static FILE *_debug_file; bool debug_display_time; +int debug_type_profile; FILE *debug_file(void) { @@ -231,6 +232,7 @@ static struct sublevel_option debug_opts[] = { { .name = "data-convert", .value_ptr = &debug_data_convert }, { .name = "perf-event-open", .value_ptr = &debug_peo_args }, { .name = "kmaps", .value_ptr 
= &debug_kmaps }, + { .name = "type-profile", .value_ptr = &debug_type_profile }, { .name = NULL, } }; @@ -270,6 +272,7 @@ int perf_quiet_option(void) redirect_to_stderr = 0; debug_peo_args = 0; debug_kmaps = 0; + debug_type_profile = 0; return 0; } diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h index 35a7a5ae76..a4026d1fd6 100644 --- a/tools/perf/util/debug.h +++ b/tools/perf/util/debug.h @@ -14,6 +14,7 @@ extern int debug_peo_args; extern bool quiet, dump_trace; extern int debug_ordered_events; extern int debug_data_convert; +extern int debug_type_profile; #ifndef pr_fmt #define pr_fmt(fmt) fmt diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c new file mode 100644 index 0000000000..e10558b795 --- /dev/null +++ b/tools/perf/util/disasm.c @@ -0,0 +1,1837 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "annotate.h" +#include "build-id.h" +#include "debug.h" +#include "disasm.h" +#include "dso.h" +#include "env.h" +#include "evsel.h" +#include "map.h" +#include "maps.h" +#include "namespaces.h" +#include "srcline.h" +#include "symbol.h" +#include "util.h" + +static regex_t file_lineno; + +/* These can be referred from the arch-dependent code */ +static struct ins_ops call_ops; +static struct ins_ops dec_ops; +static struct ins_ops jump_ops; +static struct ins_ops mov_ops; +static struct ins_ops nop_ops; +static struct ins_ops lock_ops; +static struct ins_ops ret_ops; + +static int jump__scnprintf(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name); +static int call__scnprintf(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name); + +static void ins__sort(struct arch *arch); +static int disasm_line__parse(char *line, const char **namep, char **rawp); + +static __attribute__((constructor)) void symbol__init_regexpr(void) +{ + regcomp(&file_lineno, 
"^/[^:]+:([0-9]+)", REG_EXTENDED); +} + +static int arch__grow_instructions(struct arch *arch) +{ + struct ins *new_instructions; + size_t new_nr_allocated; + + if (arch->nr_instructions_allocated == 0 && arch->instructions) + goto grow_from_non_allocated_table; + + new_nr_allocated = arch->nr_instructions_allocated + 128; + new_instructions = realloc(arch->instructions, new_nr_allocated * sizeof(struct ins)); + if (new_instructions == NULL) + return -1; + +out_update_instructions: + arch->instructions = new_instructions; + arch->nr_instructions_allocated = new_nr_allocated; + return 0; + +grow_from_non_allocated_table: + new_nr_allocated = arch->nr_instructions + 128; + new_instructions = calloc(new_nr_allocated, sizeof(struct ins)); + if (new_instructions == NULL) + return -1; + + memcpy(new_instructions, arch->instructions, arch->nr_instructions); + goto out_update_instructions; +} + +static int arch__associate_ins_ops(struct arch* arch, const char *name, struct ins_ops *ops) +{ + struct ins *ins; + + if (arch->nr_instructions == arch->nr_instructions_allocated && + arch__grow_instructions(arch)) + return -1; + + ins = &arch->instructions[arch->nr_instructions]; + ins->name = strdup(name); + if (!ins->name) + return -1; + + ins->ops = ops; + arch->nr_instructions++; + + ins__sort(arch); + return 0; +} + +#include "arch/arc/annotate/instructions.c" +#include "arch/arm/annotate/instructions.c" +#include "arch/arm64/annotate/instructions.c" +#include "arch/csky/annotate/instructions.c" +#include "arch/loongarch/annotate/instructions.c" +#include "arch/mips/annotate/instructions.c" +#include "arch/x86/annotate/instructions.c" +#include "arch/powerpc/annotate/instructions.c" +#include "arch/riscv64/annotate/instructions.c" +#include "arch/s390/annotate/instructions.c" +#include "arch/sparc/annotate/instructions.c" + +static struct arch architectures[] = { + { + .name = "arc", + .init = arc__annotate_init, + }, + { + .name = "arm", + .init = arm__annotate_init, + }, + 
{ + .name = "arm64", + .init = arm64__annotate_init, + }, + { + .name = "csky", + .init = csky__annotate_init, + }, + { + .name = "mips", + .init = mips__annotate_init, + .objdump = { + .comment_char = '#', + }, + }, + { + .name = "x86", + .init = x86__annotate_init, + .instructions = x86__instructions, + .nr_instructions = ARRAY_SIZE(x86__instructions), + .insn_suffix = "bwlq", + .objdump = { + .comment_char = '#', + .register_char = '%', + .memory_ref_char = '(', + .imm_char = '$', + }, + }, + { + .name = "powerpc", + .init = powerpc__annotate_init, + }, + { + .name = "riscv64", + .init = riscv64__annotate_init, + }, + { + .name = "s390", + .init = s390__annotate_init, + .objdump = { + .comment_char = '#', + }, + }, + { + .name = "sparc", + .init = sparc__annotate_init, + .objdump = { + .comment_char = '#', + }, + }, + { + .name = "loongarch", + .init = loongarch__annotate_init, + .objdump = { + .comment_char = '#', + }, + }, +}; + +static int arch__key_cmp(const void *name, const void *archp) +{ + const struct arch *arch = archp; + + return strcmp(name, arch->name); +} + +static int arch__cmp(const void *a, const void *b) +{ + const struct arch *aa = a; + const struct arch *ab = b; + + return strcmp(aa->name, ab->name); +} + +static void arch__sort(void) +{ + const int nmemb = ARRAY_SIZE(architectures); + + qsort(architectures, nmemb, sizeof(struct arch), arch__cmp); +} + +struct arch *arch__find(const char *name) +{ + const int nmemb = ARRAY_SIZE(architectures); + static bool sorted; + + if (!sorted) { + arch__sort(); + sorted = true; + } + + return bsearch(name, architectures, nmemb, sizeof(struct arch), arch__key_cmp); +} + +bool arch__is(struct arch *arch, const char *name) +{ + return !strcmp(arch->name, name); +} + +static void ins_ops__delete(struct ins_operands *ops) +{ + if (ops == NULL) + return; + zfree(&ops->source.raw); + zfree(&ops->source.name); + zfree(&ops->target.raw); + zfree(&ops->target.name); +} + +static int ins__raw_scnprintf(struct ins 
*ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name) +{ + return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, ops->raw); +} + +int ins__scnprintf(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name) +{ + if (ins->ops->scnprintf) + return ins->ops->scnprintf(ins, bf, size, ops, max_ins_name); + + return ins__raw_scnprintf(ins, bf, size, ops, max_ins_name); +} + +bool ins__is_fused(struct arch *arch, const char *ins1, const char *ins2) +{ + if (!arch || !arch->ins_is_fused) + return false; + + return arch->ins_is_fused(arch, ins1, ins2); +} + +static int call__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms) +{ + char *endptr, *tok, *name; + struct map *map = ms->map; + struct addr_map_symbol target = { + .ms = { .map = map, }, + }; + + ops->target.addr = strtoull(ops->raw, &endptr, 16); + + name = strchr(endptr, '<'); + if (name == NULL) + goto indirect_call; + + name++; + + if (arch->objdump.skip_functions_char && + strchr(name, arch->objdump.skip_functions_char)) + return -1; + + tok = strchr(name, '>'); + if (tok == NULL) + return -1; + + *tok = '\0'; + ops->target.name = strdup(name); + *tok = '>'; + + if (ops->target.name == NULL) + return -1; +find_target: + target.addr = map__objdump_2mem(map, ops->target.addr); + + if (maps__find_ams(ms->maps, &target) == 0 && + map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr) + ops->target.sym = target.ms.sym; + + return 0; + +indirect_call: + tok = strchr(endptr, '*'); + if (tok != NULL) { + endptr++; + + /* Indirect call can use a non-rip register and offset: callq *0x8(%rbx). + * Do not parse such instruction. 
*/ + if (strstr(endptr, "(%r") == NULL) + ops->target.addr = strtoull(endptr, NULL, 16); + } + goto find_target; +} + +static int call__scnprintf(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name) +{ + if (ops->target.sym) + return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, ops->target.sym->name); + + if (ops->target.addr == 0) + return ins__raw_scnprintf(ins, bf, size, ops, max_ins_name); + + if (ops->target.name) + return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, ops->target.name); + + return scnprintf(bf, size, "%-*s *%" PRIx64, max_ins_name, ins->name, ops->target.addr); +} + +static struct ins_ops call_ops = { + .parse = call__parse, + .scnprintf = call__scnprintf, +}; + +bool ins__is_call(const struct ins *ins) +{ + return ins->ops == &call_ops || ins->ops == &s390_call_ops || ins->ops == &loongarch_call_ops; +} + +/* + * Prevents from matching commas in the comment section, e.g.: + * ffff200008446e70: b.cs ffff2000084470f4 // b.hs, b.nlast + * + * and skip comma as part of function arguments, e.g.: + * 1d8b4ac + */ +static inline const char *validate_comma(const char *c, struct ins_operands *ops) +{ + if (ops->jump.raw_comment && c > ops->jump.raw_comment) + return NULL; + + if (ops->jump.raw_func_start && c > ops->jump.raw_func_start) + return NULL; + + return c; +} + +static int jump__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms) +{ + struct map *map = ms->map; + struct symbol *sym = ms->sym; + struct addr_map_symbol target = { + .ms = { .map = map, }, + }; + const char *c = strchr(ops->raw, ','); + u64 start, end; + + ops->jump.raw_comment = strchr(ops->raw, arch->objdump.comment_char); + ops->jump.raw_func_start = strchr(ops->raw, '<'); + + c = validate_comma(c, ops); + + /* + * Examples of lines to parse for the _cpp_lex_token@@Base + * function: + * + * 1159e6c: jne 115aa32 <_cpp_lex_token@@Base+0xf92> + * 1159e8b: jne c469be + * + * The first is a jump to an 
offset inside the same function, + * the second is to another function, i.e. that 0xa72 is an + * offset in the cpp_named_operator2name@@base function. + */ + /* + * skip over possible up to 2 operands to get to address, e.g.: + * tbnz w0, #26, ffff0000083cd190 + */ + if (c++ != NULL) { + ops->target.addr = strtoull(c, NULL, 16); + if (!ops->target.addr) { + c = strchr(c, ','); + c = validate_comma(c, ops); + if (c++ != NULL) + ops->target.addr = strtoull(c, NULL, 16); + } + } else { + ops->target.addr = strtoull(ops->raw, NULL, 16); + } + + target.addr = map__objdump_2mem(map, ops->target.addr); + start = map__unmap_ip(map, sym->start); + end = map__unmap_ip(map, sym->end); + + ops->target.outside = target.addr < start || target.addr > end; + + /* + * FIXME: things like this in _cpp_lex_token (gcc's cc1 program): + + cpp_named_operator2name@@Base+0xa72 + + * Point to a place that is after the cpp_named_operator2name + * boundaries, i.e. in the ELF symbol table for cc1 + * cpp_named_operator2name is marked as being 32-bytes long, but it in + * fact is much larger than that, so we seem to need a symbols__find() + * routine that looks for >= current->start and < next_symbol->start, + * possibly just for C++ objects? + * + * For now lets just make some progress by marking jumps to outside the + * current function as call like. + * + * Actual navigation will come next, with further understanding of how + * the symbol searching and disassembly should be done. 
+ */ + if (maps__find_ams(ms->maps, &target) == 0 && + map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr) + ops->target.sym = target.ms.sym; + + if (!ops->target.outside) { + ops->target.offset = target.addr - start; + ops->target.offset_avail = true; + } else { + ops->target.offset_avail = false; + } + + return 0; +} + +static int jump__scnprintf(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name) +{ + const char *c; + + if (!ops->target.addr || ops->target.offset < 0) + return ins__raw_scnprintf(ins, bf, size, ops, max_ins_name); + + if (ops->target.outside && ops->target.sym != NULL) + return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, ops->target.sym->name); + + c = strchr(ops->raw, ','); + c = validate_comma(c, ops); + + if (c != NULL) { + const char *c2 = strchr(c + 1, ','); + + c2 = validate_comma(c2, ops); + /* check for 3-op insn */ + if (c2 != NULL) + c = c2; + c++; + + /* mirror arch objdump's space-after-comma style */ + if (*c == ' ') + c++; + } + + return scnprintf(bf, size, "%-*s %.*s%" PRIx64, max_ins_name, + ins->name, c ? c - ops->raw : 0, ops->raw, + ops->target.offset); +} + +static void jump__delete(struct ins_operands *ops __maybe_unused) +{ + /* + * The ops->jump.raw_comment and ops->jump.raw_func_start belong to the + * raw string, don't free them. 
+ */ +} + +static struct ins_ops jump_ops = { + .free = jump__delete, + .parse = jump__parse, + .scnprintf = jump__scnprintf, +}; + +bool ins__is_jump(const struct ins *ins) +{ + return ins->ops == &jump_ops || ins->ops == &loongarch_jump_ops; +} + +static int comment__symbol(char *raw, char *comment, u64 *addrp, char **namep) +{ + char *endptr, *name, *t; + + if (strstr(raw, "(%rip)") == NULL) + return 0; + + *addrp = strtoull(comment, &endptr, 16); + if (endptr == comment) + return 0; + name = strchr(endptr, '<'); + if (name == NULL) + return -1; + + name++; + + t = strchr(name, '>'); + if (t == NULL) + return 0; + + *t = '\0'; + *namep = strdup(name); + *t = '>'; + + return 0; +} + +static int lock__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms) +{ + ops->locked.ops = zalloc(sizeof(*ops->locked.ops)); + if (ops->locked.ops == NULL) + return 0; + + if (disasm_line__parse(ops->raw, &ops->locked.ins.name, &ops->locked.ops->raw) < 0) + goto out_free_ops; + + ops->locked.ins.ops = ins__find(arch, ops->locked.ins.name); + + if (ops->locked.ins.ops == NULL) + goto out_free_ops; + + if (ops->locked.ins.ops->parse && + ops->locked.ins.ops->parse(arch, ops->locked.ops, ms) < 0) + goto out_free_ops; + + return 0; + +out_free_ops: + zfree(&ops->locked.ops); + return 0; +} + +static int lock__scnprintf(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name) +{ + int printed; + + if (ops->locked.ins.ops == NULL) + return ins__raw_scnprintf(ins, bf, size, ops, max_ins_name); + + printed = scnprintf(bf, size, "%-*s ", max_ins_name, ins->name); + return printed + ins__scnprintf(&ops->locked.ins, bf + printed, + size - printed, ops->locked.ops, max_ins_name); +} + +static void lock__delete(struct ins_operands *ops) +{ + struct ins *ins = &ops->locked.ins; + + if (ins->ops && ins->ops->free) + ins->ops->free(ops->locked.ops); + else + ins_ops__delete(ops->locked.ops); + + zfree(&ops->locked.ops); + zfree(&ops->target.raw); + 
zfree(&ops->target.name); +} + +static struct ins_ops lock_ops = { + .free = lock__delete, + .parse = lock__parse, + .scnprintf = lock__scnprintf, +}; + +/* + * Check if the operand has more than one registers like x86 SIB addressing: + * 0x1234(%rax, %rbx, 8) + * + * But it doesn't care segment selectors like %gs:0x5678(%rcx), so just check + * the input string after 'memory_ref_char' if exists. + */ +static bool check_multi_regs(struct arch *arch, const char *op) +{ + int count = 0; + + if (arch->objdump.register_char == 0) + return false; + + if (arch->objdump.memory_ref_char) { + op = strchr(op, arch->objdump.memory_ref_char); + if (op == NULL) + return false; + } + + while ((op = strchr(op, arch->objdump.register_char)) != NULL) { + count++; + op++; + } + + return count > 1; +} + +static int mov__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms __maybe_unused) +{ + char *s = strchr(ops->raw, ','), *target, *comment, prev; + + if (s == NULL) + return -1; + + *s = '\0'; + + /* + * x86 SIB addressing has something like 0x8(%rax, %rcx, 1) + * then it needs to have the closing parenthesis. 
+ */ + if (strchr(ops->raw, '(')) { + *s = ','; + s = strchr(ops->raw, ')'); + if (s == NULL || s[1] != ',') + return -1; + *++s = '\0'; + } + + ops->source.raw = strdup(ops->raw); + *s = ','; + + if (ops->source.raw == NULL) + return -1; + + ops->source.multi_regs = check_multi_regs(arch, ops->source.raw); + + target = skip_spaces(++s); + comment = strchr(s, arch->objdump.comment_char); + + if (comment != NULL) + s = comment - 1; + else + s = strchr(s, '\0') - 1; + + while (s > target && isspace(s[0])) + --s; + s++; + prev = *s; + *s = '\0'; + + ops->target.raw = strdup(target); + *s = prev; + + if (ops->target.raw == NULL) + goto out_free_source; + + ops->target.multi_regs = check_multi_regs(arch, ops->target.raw); + + if (comment == NULL) + return 0; + + comment = skip_spaces(comment); + comment__symbol(ops->source.raw, comment + 1, &ops->source.addr, &ops->source.name); + comment__symbol(ops->target.raw, comment + 1, &ops->target.addr, &ops->target.name); + + return 0; + +out_free_source: + zfree(&ops->source.raw); + return -1; +} + +static int mov__scnprintf(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name) +{ + return scnprintf(bf, size, "%-*s %s,%s", max_ins_name, ins->name, + ops->source.name ?: ops->source.raw, + ops->target.name ?: ops->target.raw); +} + +static struct ins_ops mov_ops = { + .parse = mov__parse, + .scnprintf = mov__scnprintf, +}; + +static int dec__parse(struct arch *arch __maybe_unused, struct ins_operands *ops, struct map_symbol *ms __maybe_unused) +{ + char *target, *comment, *s, prev; + + target = s = ops->raw; + + while (s[0] != '\0' && !isspace(s[0])) + ++s; + prev = *s; + *s = '\0'; + + ops->target.raw = strdup(target); + *s = prev; + + if (ops->target.raw == NULL) + return -1; + + comment = strchr(s, arch->objdump.comment_char); + if (comment == NULL) + return 0; + + comment = skip_spaces(comment); + comment__symbol(ops->target.raw, comment + 1, &ops->target.addr, &ops->target.name); + + return 
0; +} + +static int dec__scnprintf(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name) +{ + return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, + ops->target.name ?: ops->target.raw); +} + +static struct ins_ops dec_ops = { + .parse = dec__parse, + .scnprintf = dec__scnprintf, +}; + +static int nop__scnprintf(struct ins *ins __maybe_unused, char *bf, size_t size, + struct ins_operands *ops __maybe_unused, int max_ins_name) +{ + return scnprintf(bf, size, "%-*s", max_ins_name, "nop"); +} + +static struct ins_ops nop_ops = { + .scnprintf = nop__scnprintf, +}; + +static struct ins_ops ret_ops = { + .scnprintf = ins__raw_scnprintf, +}; + +bool ins__is_nop(const struct ins *ins) +{ + return ins->ops == &nop_ops; +} + +bool ins__is_ret(const struct ins *ins) +{ + return ins->ops == &ret_ops; +} + +bool ins__is_lock(const struct ins *ins) +{ + return ins->ops == &lock_ops; +} + +static int ins__key_cmp(const void *name, const void *insp) +{ + const struct ins *ins = insp; + + return strcmp(name, ins->name); +} + +static int ins__cmp(const void *a, const void *b) +{ + const struct ins *ia = a; + const struct ins *ib = b; + + return strcmp(ia->name, ib->name); +} + +static void ins__sort(struct arch *arch) +{ + const int nmemb = arch->nr_instructions; + + qsort(arch->instructions, nmemb, sizeof(struct ins), ins__cmp); +} + +static struct ins_ops *__ins__find(struct arch *arch, const char *name) +{ + struct ins *ins; + const int nmemb = arch->nr_instructions; + + if (!arch->sorted_instructions) { + ins__sort(arch); + arch->sorted_instructions = true; + } + + ins = bsearch(name, arch->instructions, nmemb, sizeof(struct ins), ins__key_cmp); + if (ins) + return ins->ops; + + if (arch->insn_suffix) { + char tmp[32]; + char suffix; + size_t len = strlen(name); + + if (len == 0 || len >= sizeof(tmp)) + return NULL; + + suffix = name[len - 1]; + if (strchr(arch->insn_suffix, suffix) == NULL) + return NULL; + + strcpy(tmp, name); + 
/*
 * Split a disassembly line into the instruction name and its raw operands.
 *
 * On success *namep receives a newly allocated copy of the first token and
 * *rawp points, inside 'line', at the strim()'ed remainder.  Returns -1 for
 * a blank line or when the name cannot be duplicated.
 */
static int disasm_line__parse(char *line, const char **namep, char **rawp)
{
	char *name = skip_spaces(line);
	char *end;
	char saved;

	if (*name == '\0')
		return -1;

	/* The name has at least one character; find where it stops. */
	for (end = name + 1; *end != '\0' && !isspace(*end); end++)
		;

	*rawp = end;

	/* Cut the line at the end of the name while duplicating it. */
	saved = *end;
	*end = '\0';
	*namep = strdup(name);
	if (!*namep)
		return -1;

	*end = saved;
	*rawp = strim(end);
	return 0;
}
disasm_line' to have an easy access. + */ +struct disasm_line *disasm_line__new(struct annotate_args *args) +{ + struct disasm_line *dl = NULL; + int nr = 1; + + if (evsel__is_group_event(args->evsel)) + nr = args->evsel->core.nr_members; + + dl = zalloc(disasm_line_size(nr)); + if (!dl) + return NULL; + + annotation_line__init(&dl->al, args, nr); + if (dl->al.line == NULL) + goto out_delete; + + if (args->offset != -1) { + if (disasm_line__parse(dl->al.line, &dl->ins.name, &dl->ops.raw) < 0) + goto out_free_line; + + disasm_line__init_ins(dl, args->arch, &args->ms); + } + + return dl; + +out_free_line: + zfree(&dl->al.line); +out_delete: + free(dl); + return NULL; +} + +void disasm_line__free(struct disasm_line *dl) +{ + if (dl->ins.ops && dl->ins.ops->free) + dl->ins.ops->free(&dl->ops); + else + ins_ops__delete(&dl->ops); + zfree(&dl->ins.name); + annotation_line__exit(&dl->al); + free(dl); +} + +int disasm_line__scnprintf(struct disasm_line *dl, char *bf, size_t size, bool raw, int max_ins_name) +{ + if (raw || !dl->ins.ops) + return scnprintf(bf, size, "%-*s %s", max_ins_name, dl->ins.name, dl->ops.raw); + + return ins__scnprintf(&dl->ins, bf, size, &dl->ops, max_ins_name); +} + +/* + * symbol__parse_objdump_line() parses objdump output (with -d --no-show-raw) + * which looks like following + * + * 0000000000415500 <_init>: + * 415500: sub $0x8,%rsp + * 415504: mov 0x2f5ad5(%rip),%rax # 70afe0 <_DYNAMIC+0x2f8> + * 41550b: test %rax,%rax + * 41550e: je 415515 <_init+0x15> + * 415510: callq 416e70 <__gmon_start__@plt> + * 415515: add $0x8,%rsp + * 415519: retq + * + * it will be parsed and saved into struct disasm_line as + * + * + * The offset will be a relative offset from the start of the symbol and -1 + * means that it's not a disassembly line so should be treated differently. + * The ops.raw part will be parsed further according to type of the instruction. 
+ */ +static int symbol__parse_objdump_line(struct symbol *sym, + struct annotate_args *args, + char *parsed_line, int *line_nr, char **fileloc) +{ + struct map *map = args->ms.map; + struct annotation *notes = symbol__annotation(sym); + struct disasm_line *dl; + char *tmp; + s64 line_ip, offset = -1; + regmatch_t match[2]; + + /* /filename:linenr ? Save line number and ignore. */ + if (regexec(&file_lineno, parsed_line, 2, match, 0) == 0) { + *line_nr = atoi(parsed_line + match[1].rm_so); + free(*fileloc); + *fileloc = strdup(parsed_line); + return 0; + } + + /* Process hex address followed by ':'. */ + line_ip = strtoull(parsed_line, &tmp, 16); + if (parsed_line != tmp && tmp[0] == ':' && tmp[1] != '\0') { + u64 start = map__rip_2objdump(map, sym->start), + end = map__rip_2objdump(map, sym->end); + + offset = line_ip - start; + if ((u64)line_ip < start || (u64)line_ip >= end) + offset = -1; + else + parsed_line = tmp + 1; + } + + args->offset = offset; + args->line = parsed_line; + args->line_nr = *line_nr; + args->fileloc = *fileloc; + args->ms.sym = sym; + + dl = disasm_line__new(args); + (*line_nr)++; + + if (dl == NULL) + return -1; + + if (!disasm_line__has_local_offset(dl)) { + dl->ops.target.offset = dl->ops.target.addr - + map__rip_2objdump(map, sym->start); + dl->ops.target.offset_avail = true; + } + + /* kcore has no symbols, so add the call target symbol */ + if (dl->ins.ops && ins__is_call(&dl->ins) && !dl->ops.target.sym) { + struct addr_map_symbol target = { + .addr = dl->ops.target.addr, + .ms = { .map = map, }, + }; + + if (!maps__find_ams(args->ms.maps, &target) && + target.ms.sym->start == target.al_addr) + dl->ops.target.sym = target.ms.sym; + } + + annotation_line__add(&dl->al, ¬es->src->source); + return 0; +} + +static void delete_last_nop(struct symbol *sym) +{ + struct annotation *notes = symbol__annotation(sym); + struct list_head *list = ¬es->src->source; + struct disasm_line *dl; + + while (!list_empty(list)) { + dl = 
/*
 * Format a human readable message for a disassembly error code.
 *
 * @ms:     map/symbol whose DSO is named in some of the messages
 * @errnum: non-negative values are formatted with str_error_r(); negative
 *          values are SYMBOL_ANNOTATE_ERRNO__* codes
 * @buf:    output buffer
 * @buflen: size of @buf, must not be zero
 *
 * Always returns 0; unknown negative codes produce an "Internal error"
 * message instead of failing.
 */
int symbol__strerror_disassemble(struct map_symbol *ms, int errnum, char *buf, size_t buflen)
{
	struct dso *dso = map__dso(ms->map);

	BUG_ON(buflen == 0);

	if (errnum >= 0) {
		str_error_r(errnum, buf, buflen);
		return 0;
	}

	switch (errnum) {
	case SYMBOL_ANNOTATE_ERRNO__NO_VMLINUX: {
		/* 15 is the length of the prefix; the build id is appended after it. */
		char bf[SBUILD_ID_SIZE + 15] = " with build id ";
		char *build_id_msg = NULL;

		if (dso__has_build_id(dso)) {
			build_id__sprintf(dso__bid(dso), bf + 15);
			build_id_msg = bf;
		}
		scnprintf(buf, buflen,
			  "No vmlinux file%s\nwas found in the path.\n\n"
			  "Note that annotation using /proc/kcore requires CAP_SYS_RAWIO capability.\n\n"
			  "Please use:\n\n"
			  " perf buildid-cache -vu vmlinux\n\n"
			  "or:\n\n"
			  " --vmlinux vmlinux\n", build_id_msg ?: "");
	}
		break;
	case SYMBOL_ANNOTATE_ERRNO__NO_LIBOPCODES_FOR_BPF:
		scnprintf(buf, buflen, "Please link with binutils's libopcode to enable BPF annotation");
		break;
	case SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP:
		scnprintf(buf, buflen, "Problems with arch specific instruction name regular expressions.");
		break;
	case SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING:
		scnprintf(buf, buflen, "Problems while parsing the CPUID in the arch specific initialization.");
		break;
	case SYMBOL_ANNOTATE_ERRNO__BPF_INVALID_FILE:
		scnprintf(buf, buflen, "Invalid BPF file: %s.", dso__long_name(dso));
		break;
	case SYMBOL_ANNOTATE_ERRNO__BPF_MISSING_BTF:
		scnprintf(buf, buflen, "The %s BPF file has no BTF section, compile with -g or use pahole -J.",
			  dso__long_name(dso));
		break;
	default:
		scnprintf(buf, buflen, "Internal error: Invalid %d error code\n", errnum);
		break;
	}

	return 0;
}
int dso__disassemble_filename(struct dso *dso, char *filename, size_t filename_size) +{ + char linkname[PATH_MAX]; + char *build_id_filename; + char *build_id_path = NULL; + char *pos; + int len; + + if (dso__symtab_type(dso) == DSO_BINARY_TYPE__KALLSYMS && + !dso__is_kcore(dso)) + return SYMBOL_ANNOTATE_ERRNO__NO_VMLINUX; + + build_id_filename = dso__build_id_filename(dso, NULL, 0, false); + if (build_id_filename) { + __symbol__join_symfs(filename, filename_size, build_id_filename); + free(build_id_filename); + } else { + if (dso__has_build_id(dso)) + return ENOMEM; + goto fallback; + } + + build_id_path = strdup(filename); + if (!build_id_path) + return ENOMEM; + + /* + * old style build-id cache has name of XX/XXXXXXX.. while + * new style has XX/XXXXXXX../{elf,kallsyms,vdso}. + * extract the build-id part of dirname in the new style only. + */ + pos = strrchr(build_id_path, '/'); + if (pos && strlen(pos) < SBUILD_ID_SIZE - 2) + dirname(build_id_path); + + if (dso__is_kcore(dso)) + goto fallback; + + len = readlink(build_id_path, linkname, sizeof(linkname) - 1); + if (len < 0) + goto fallback; + + linkname[len] = '\0'; + if (strstr(linkname, DSO__NAME_KALLSYMS) || + access(filename, R_OK)) { +fallback: + /* + * If we don't have build-ids or the build-id file isn't in the + * cache, or is just a kallsyms file, well, lets hope that this + * DSO is the same as when 'perf record' ran. 
+ */ + if (dso__kernel(dso) && dso__long_name(dso)[0] == '/') + snprintf(filename, filename_size, "%s", dso__long_name(dso)); + else + __symbol__join_symfs(filename, filename_size, dso__long_name(dso)); + + mutex_lock(dso__lock(dso)); + if (access(filename, R_OK) && errno == ENOENT && dso__nsinfo(dso)) { + char *new_name = dso__filename_with_chroot(dso, filename); + if (new_name) { + strlcpy(filename, new_name, filename_size); + free(new_name); + } + } + mutex_unlock(dso__lock(dso)); + } else if (dso__binary_type(dso) == DSO_BINARY_TYPE__NOT_FOUND) { + dso__set_binary_type(dso, DSO_BINARY_TYPE__BUILD_ID_CACHE); + } + + free(build_id_path); + return 0; +} + +#if defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) +#define PACKAGE "perf" +#include +#include +#include +#include +#include +#include +#include + +#include "bpf-event.h" +#include "bpf-utils.h" + +static int symbol__disassemble_bpf(struct symbol *sym, + struct annotate_args *args) +{ + struct annotation *notes = symbol__annotation(sym); + struct bpf_prog_linfo *prog_linfo = NULL; + struct bpf_prog_info_node *info_node; + int len = sym->end - sym->start; + disassembler_ftype disassemble; + struct map *map = args->ms.map; + struct perf_bpil *info_linear; + struct disassemble_info info; + struct dso *dso = map__dso(map); + int pc = 0, count, sub_id; + struct btf *btf = NULL; + char tpath[PATH_MAX]; + size_t buf_size; + int nr_skip = 0; + char *buf; + bfd *bfdf; + int ret; + FILE *s; + + if (dso__binary_type(dso) != DSO_BINARY_TYPE__BPF_PROG_INFO) + return SYMBOL_ANNOTATE_ERRNO__BPF_INVALID_FILE; + + pr_debug("%s: handling sym %s addr %" PRIx64 " len %" PRIx64 "\n", __func__, + sym->name, sym->start, sym->end - sym->start); + + memset(tpath, 0, sizeof(tpath)); + perf_exe(tpath, sizeof(tpath)); + + bfdf = bfd_openr(tpath, NULL); + if (bfdf == NULL) + abort(); + + if (!bfd_check_format(bfdf, bfd_object)) + abort(); + + s = open_memstream(&buf, &buf_size); + if (!s) { + ret = errno; + goto out; + } + 
init_disassemble_info_compat(&info, s, + (fprintf_ftype) fprintf, + fprintf_styled); + info.arch = bfd_get_arch(bfdf); + info.mach = bfd_get_mach(bfdf); + + info_node = perf_env__find_bpf_prog_info(dso__bpf_prog(dso)->env, + dso__bpf_prog(dso)->id); + if (!info_node) { + ret = SYMBOL_ANNOTATE_ERRNO__BPF_MISSING_BTF; + goto out; + } + info_linear = info_node->info_linear; + sub_id = dso__bpf_prog(dso)->sub_id; + + info.buffer = (void *)(uintptr_t)(info_linear->info.jited_prog_insns); + info.buffer_length = info_linear->info.jited_prog_len; + + if (info_linear->info.nr_line_info) + prog_linfo = bpf_prog_linfo__new(&info_linear->info); + + if (info_linear->info.btf_id) { + struct btf_node *node; + + node = perf_env__find_btf(dso__bpf_prog(dso)->env, + info_linear->info.btf_id); + if (node) + btf = btf__new((__u8 *)(node->data), + node->data_size); + } + + disassemble_init_for_target(&info); + +#ifdef DISASM_FOUR_ARGS_SIGNATURE + disassemble = disassembler(info.arch, + bfd_big_endian(bfdf), + info.mach, + bfdf); +#else + disassemble = disassembler(bfdf); +#endif + if (disassemble == NULL) + abort(); + + fflush(s); + do { + const struct bpf_line_info *linfo = NULL; + struct disasm_line *dl; + size_t prev_buf_size; + const char *srcline; + u64 addr; + + addr = pc + ((u64 *)(uintptr_t)(info_linear->info.jited_ksyms))[sub_id]; + count = disassemble(pc, &info); + + if (prog_linfo) + linfo = bpf_prog_linfo__lfind_addr_func(prog_linfo, + addr, sub_id, + nr_skip); + + if (linfo && btf) { + srcline = btf__name_by_offset(btf, linfo->line_off); + nr_skip++; + } else + srcline = NULL; + + fprintf(s, "\n"); + prev_buf_size = buf_size; + fflush(s); + + if (!annotate_opts.hide_src_code && srcline) { + args->offset = -1; + args->line = strdup(srcline); + args->line_nr = 0; + args->fileloc = NULL; + args->ms.sym = sym; + dl = disasm_line__new(args); + if (dl) { + annotation_line__add(&dl->al, + ¬es->src->source); + } + } + + args->offset = pc; + args->line = buf + prev_buf_size; + 
args->line_nr = 0; + args->fileloc = NULL; + args->ms.sym = sym; + dl = disasm_line__new(args); + if (dl) + annotation_line__add(&dl->al, ¬es->src->source); + + pc += count; + } while (count > 0 && pc < len); + + ret = 0; +out: + free(prog_linfo); + btf__free(btf); + fclose(s); + bfd_close(bfdf); + return ret; +} +#else // defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) +static int symbol__disassemble_bpf(struct symbol *sym __maybe_unused, + struct annotate_args *args __maybe_unused) +{ + return SYMBOL_ANNOTATE_ERRNO__NO_LIBOPCODES_FOR_BPF; +} +#endif // defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) + +static int +symbol__disassemble_bpf_image(struct symbol *sym, + struct annotate_args *args) +{ + struct annotation *notes = symbol__annotation(sym); + struct disasm_line *dl; + + args->offset = -1; + args->line = strdup("to be implemented"); + args->line_nr = 0; + args->fileloc = NULL; + dl = disasm_line__new(args); + if (dl) + annotation_line__add(&dl->al, ¬es->src->source); + + zfree(&args->line); + return 0; +} + +#ifdef HAVE_LIBCAPSTONE_SUPPORT +#include + +static int open_capstone_handle(struct annotate_args *args, bool is_64bit, + csh *handle) +{ + struct annotation_options *opt = args->options; + cs_mode mode = is_64bit ? CS_MODE_64 : CS_MODE_32; + + /* TODO: support more architectures */ + if (!arch__is(args->arch, "x86")) + return -1; + + if (cs_open(CS_ARCH_X86, mode, handle) != CS_ERR_OK) + return -1; + + if (!opt->disassembler_style || + !strcmp(opt->disassembler_style, "att")) + cs_option(*handle, CS_OPT_SYNTAX, CS_OPT_SYNTAX_ATT); + + /* + * Resolving address operands to symbols is implemented + * on x86 by investigating instruction details. 
+ */ + cs_option(*handle, CS_OPT_DETAIL, CS_OPT_ON); + + return 0; +} + +struct find_file_offset_data { + u64 ip; + u64 offset; +}; + +/* This will be called for each PHDR in an ELF binary */ +static int find_file_offset(u64 start, u64 len, u64 pgoff, void *arg) +{ + struct find_file_offset_data *data = arg; + + if (start <= data->ip && data->ip < start + len) { + data->offset = pgoff + data->ip - start; + return 1; + } + return 0; +} + +static void print_capstone_detail(cs_insn *insn, char *buf, size_t len, + struct annotate_args *args, u64 addr) +{ + int i; + struct map *map = args->ms.map; + struct symbol *sym; + + /* TODO: support more architectures */ + if (!arch__is(args->arch, "x86")) + return; + + if (insn->detail == NULL) + return; + + for (i = 0; i < insn->detail->x86.op_count; i++) { + cs_x86_op *op = &insn->detail->x86.operands[i]; + u64 orig_addr; + + if (op->type != X86_OP_MEM) + continue; + + /* only print RIP-based global symbols for now */ + if (op->mem.base != X86_REG_RIP) + continue; + + /* get the target address */ + orig_addr = addr + insn->size + op->mem.disp; + addr = map__objdump_2mem(map, orig_addr); + + if (dso__kernel(map__dso(map))) { + /* + * The kernel maps can be splitted into sections, + * let's find the map first and the search the symbol. 
+ */ + map = maps__find(map__kmaps(map), addr); + if (map == NULL) + continue; + } + + /* convert it to map-relative address for search */ + addr = map__map_ip(map, addr); + + sym = map__find_symbol(map, addr); + if (sym == NULL) + continue; + + if (addr == sym->start) { + scnprintf(buf, len, "\t# %"PRIx64" <%s>", + orig_addr, sym->name); + } else { + scnprintf(buf, len, "\t# %"PRIx64" <%s+%#"PRIx64">", + orig_addr, sym->name, addr - sym->start); + } + break; + } +} + +static int symbol__disassemble_capstone(char *filename, struct symbol *sym, + struct annotate_args *args) +{ + struct annotation *notes = symbol__annotation(sym); + struct map *map = args->ms.map; + struct dso *dso = map__dso(map); + struct nscookie nsc; + u64 start = map__rip_2objdump(map, sym->start); + u64 end = map__rip_2objdump(map, sym->end); + u64 len = end - start; + u64 offset; + int i, fd, count; + bool is_64bit = false; + bool needs_cs_close = false; + u8 *buf = NULL; + struct find_file_offset_data data = { + .ip = start, + }; + csh handle; + cs_insn *insn; + char disasm_buf[512]; + struct disasm_line *dl; + + if (args->options->objdump_path) + return -1; + + nsinfo__mountns_enter(dso__nsinfo(dso), &nsc); + fd = open(filename, O_RDONLY); + nsinfo__mountns_exit(&nsc); + if (fd < 0) + return -1; + + if (file__read_maps(fd, /*exe=*/true, find_file_offset, &data, + &is_64bit) == 0) + goto err; + + if (open_capstone_handle(args, is_64bit, &handle) < 0) + goto err; + + needs_cs_close = true; + + buf = malloc(len); + if (buf == NULL) + goto err; + + count = pread(fd, buf, len, data.offset); + close(fd); + fd = -1; + + if ((u64)count != len) + goto err; + + /* add the function address and name */ + scnprintf(disasm_buf, sizeof(disasm_buf), "%#"PRIx64" <%s>:", + start, sym->name); + + args->offset = -1; + args->line = disasm_buf; + args->line_nr = 0; + args->fileloc = NULL; + args->ms.sym = sym; + + dl = disasm_line__new(args); + if (dl == NULL) + goto err; + + annotation_line__add(&dl->al, 
¬es->src->source); + + count = cs_disasm(handle, buf, len, start, len, &insn); + for (i = 0, offset = 0; i < count; i++) { + int printed; + + printed = scnprintf(disasm_buf, sizeof(disasm_buf), + " %-7s %s", + insn[i].mnemonic, insn[i].op_str); + print_capstone_detail(&insn[i], disasm_buf + printed, + sizeof(disasm_buf) - printed, args, + start + offset); + + args->offset = offset; + args->line = disasm_buf; + + dl = disasm_line__new(args); + if (dl == NULL) + goto err; + + annotation_line__add(&dl->al, ¬es->src->source); + + offset += insn[i].size; + } + + /* It failed in the middle: probably due to unknown instructions */ + if (offset != len) { + struct list_head *list = ¬es->src->source; + + /* Discard all lines and fallback to objdump */ + while (!list_empty(list)) { + dl = list_first_entry(list, struct disasm_line, al.node); + + list_del_init(&dl->al.node); + disasm_line__free(dl); + } + count = -1; + } + +out: + if (needs_cs_close) + cs_close(&handle); + free(buf); + return count < 0 ? count : 0; + +err: + if (fd >= 0) + close(fd); + if (needs_cs_close) { + struct disasm_line *tmp; + + /* + * It probably failed in the middle of the above loop. + * Release any resources it might add. + */ + list_for_each_entry_safe(dl, tmp, ¬es->src->source, al.node) { + list_del(&dl->al.node); + free(dl); + } + } + count = -1; + goto out; +} +#endif + +/* + * Possibly create a new version of line with tabs expanded. Returns the + * existing or new line, storage is updated if a new line is allocated. If + * allocation fails then NULL is returned. + */ +static char *expand_tabs(char *line, char **storage, size_t *storage_len) +{ + size_t i, src, dst, len, new_storage_len, num_tabs; + char *new_line; + size_t line_len = strlen(line); + + for (num_tabs = 0, i = 0; i < line_len; i++) + if (line[i] == '\t') + num_tabs++; + + if (num_tabs == 0) + return line; + + /* + * Space for the line and '\0', less the leading and trailing + * spaces. Each tab may introduce 7 additional spaces. 
+ */ + new_storage_len = line_len + 1 + (num_tabs * 7); + + new_line = malloc(new_storage_len); + if (new_line == NULL) { + pr_err("Failure allocating memory for tab expansion\n"); + return NULL; + } + + /* + * Copy regions starting at src and expand tabs. If there are two + * adjacent tabs then 'src == i', the memcpy is of size 0 and the spaces + * are inserted. + */ + for (i = 0, src = 0, dst = 0; i < line_len && num_tabs; i++) { + if (line[i] == '\t') { + len = i - src; + memcpy(&new_line[dst], &line[src], len); + dst += len; + new_line[dst++] = ' '; + while (dst % 8 != 0) + new_line[dst++] = ' '; + src = i + 1; + num_tabs--; + } + } + + /* Expand the last region. */ + len = line_len - src; + memcpy(&new_line[dst], &line[src], len); + dst += len; + new_line[dst] = '\0'; + + free(*storage); + *storage = new_line; + *storage_len = new_storage_len; + return new_line; +} + +int symbol__disassemble(struct symbol *sym, struct annotate_args *args) +{ + struct annotation_options *opts = &annotate_opts; + struct map *map = args->ms.map; + struct dso *dso = map__dso(map); + char *command; + FILE *file; + char symfs_filename[PATH_MAX]; + struct kcore_extract kce; + bool delete_extract = false; + bool decomp = false; + int lineno = 0; + char *fileloc = NULL; + int nline; + char *line; + size_t line_len; + const char *objdump_argv[] = { + "/bin/sh", + "-c", + NULL, /* Will be the objdump command to run. */ + "--", + NULL, /* Will be the symfs path. 
*/ + NULL, + }; + struct child_process objdump_process; + int err = dso__disassemble_filename(dso, symfs_filename, sizeof(symfs_filename)); + + if (err) + return err; + + pr_debug("%s: filename=%s, sym=%s, start=%#" PRIx64 ", end=%#" PRIx64 "\n", __func__, + symfs_filename, sym->name, map__unmap_ip(map, sym->start), + map__unmap_ip(map, sym->end)); + + pr_debug("annotating [%p] %30s : [%p] %30s\n", + dso, dso__long_name(dso), sym, sym->name); + + if (dso__binary_type(dso) == DSO_BINARY_TYPE__BPF_PROG_INFO) { + return symbol__disassemble_bpf(sym, args); + } else if (dso__binary_type(dso) == DSO_BINARY_TYPE__BPF_IMAGE) { + return symbol__disassemble_bpf_image(sym, args); + } else if (dso__binary_type(dso) == DSO_BINARY_TYPE__NOT_FOUND) { + return -1; + } else if (dso__is_kcore(dso)) { + kce.kcore_filename = symfs_filename; + kce.addr = map__rip_2objdump(map, sym->start); + kce.offs = sym->start; + kce.len = sym->end - sym->start; + if (!kcore_extract__create(&kce)) { + delete_extract = true; + strlcpy(symfs_filename, kce.extract_filename, + sizeof(symfs_filename)); + } + } else if (dso__needs_decompress(dso)) { + char tmp[KMOD_DECOMP_LEN]; + + if (dso__decompress_kmodule_path(dso, symfs_filename, + tmp, sizeof(tmp)) < 0) + return -1; + + decomp = true; + strcpy(symfs_filename, tmp); + } + +#ifdef HAVE_LIBCAPSTONE_SUPPORT + err = symbol__disassemble_capstone(symfs_filename, sym, args); + if (err == 0) + goto out_remove_tmp; +#endif + + err = asprintf(&command, + "%s %s%s --start-address=0x%016" PRIx64 + " --stop-address=0x%016" PRIx64 + " %s -d %s %s %s %c%s%c %s%s -C \"$1\"", + opts->objdump_path ?: "objdump", + opts->disassembler_style ? "-M " : "", + opts->disassembler_style ?: "", + map__rip_2objdump(map, sym->start), + map__rip_2objdump(map, sym->end), + opts->show_linenr ? "-l" : "", + opts->show_asm_raw ? "" : "--no-show-raw-insn", + opts->annotate_src ? "-S" : "", + opts->prefix ? "--prefix " : "", + opts->prefix ? 
'"' : ' ', + opts->prefix ?: "", + opts->prefix ? '"' : ' ', + opts->prefix_strip ? "--prefix-strip=" : "", + opts->prefix_strip ?: ""); + + if (err < 0) { + pr_err("Failure allocating memory for the command to run\n"); + goto out_remove_tmp; + } + + pr_debug("Executing: %s\n", command); + + objdump_argv[2] = command; + objdump_argv[4] = symfs_filename; + + /* Create a pipe to read from for stdout */ + memset(&objdump_process, 0, sizeof(objdump_process)); + objdump_process.argv = objdump_argv; + objdump_process.out = -1; + objdump_process.err = -1; + objdump_process.no_stderr = 1; + if (start_command(&objdump_process)) { + pr_err("Failure starting to run %s\n", command); + err = -1; + goto out_free_command; + } + + file = fdopen(objdump_process.out, "r"); + if (!file) { + pr_err("Failure creating FILE stream for %s\n", command); + /* + * If we were using debug info should retry with + * original binary. + */ + err = -1; + goto out_close_stdout; + } + + /* Storage for getline. */ + line = NULL; + line_len = 0; + + nline = 0; + while (!feof(file)) { + const char *match; + char *expanded_line; + + if (getline(&line, &line_len, file) < 0 || !line) + break; + + /* Skip lines containing "filename:" */ + match = strstr(line, symfs_filename); + if (match && match[strlen(symfs_filename)] == ':') + continue; + + expanded_line = strim(line); + expanded_line = expand_tabs(expanded_line, &line, &line_len); + if (!expanded_line) + break; + + /* + * The source code line number (lineno) needs to be kept in + * across calls to symbol__parse_objdump_line(), so that it + * can associate it with the instructions till the next one. + * See disasm_line__new() and struct disasm_line::line_nr. 
+ */ + if (symbol__parse_objdump_line(sym, args, expanded_line, + &lineno, &fileloc) < 0) + break; + nline++; + } + free(line); + free(fileloc); + + err = finish_command(&objdump_process); + if (err) + pr_err("Error running %s\n", command); + + if (nline == 0) { + err = -1; + pr_err("No output from %s\n", command); + } + + /* + * kallsyms does not have symbol sizes so there may a nop at the end. + * Remove it. + */ + if (dso__is_kcore(dso)) + delete_last_nop(sym); + + fclose(file); + +out_close_stdout: + close(objdump_process.out); + +out_free_command: + free(command); + +out_remove_tmp: + if (decomp) + unlink(symfs_filename); + + if (delete_extract) + kcore_extract__delete(&kce); + + return err; +} diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h new file mode 100644 index 0000000000..3d381a0435 --- /dev/null +++ b/tools/perf/util/disasm.h @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef __PERF_UTIL_DISASM_H +#define __PERF_UTIL_DISASM_H + +#include "map_symbol.h" + +struct annotation_options; +struct disasm_line; +struct ins; +struct evsel; +struct symbol; + +struct arch { + const char *name; + struct ins *instructions; + size_t nr_instructions; + size_t nr_instructions_allocated; + struct ins_ops *(*associate_instruction_ops)(struct arch *arch, const char *name); + bool sorted_instructions; + bool initialized; + const char *insn_suffix; + void *priv; + unsigned int model; + unsigned int family; + int (*init)(struct arch *arch, char *cpuid); + bool (*ins_is_fused)(struct arch *arch, const char *ins1, + const char *ins2); + struct { + char comment_char; + char skip_functions_char; + char register_char; + char memory_ref_char; + char imm_char; + } objdump; +}; + +struct ins { + const char *name; + struct ins_ops *ops; +}; + +struct ins_operands { + char *raw; + struct { + char *raw; + char *name; + struct symbol *sym; + u64 addr; + s64 offset; + bool offset_avail; + bool outside; + bool multi_regs; + } target; + union { + struct { + 
char *raw; + char *name; + u64 addr; + bool multi_regs; + } source; + struct { + struct ins ins; + struct ins_operands *ops; + } locked; + struct { + char *raw_comment; + char *raw_func_start; + } jump; + }; +}; + +struct ins_ops { + void (*free)(struct ins_operands *ops); + int (*parse)(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms); + int (*scnprintf)(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name); +}; + +struct annotate_args { + struct arch *arch; + struct map_symbol ms; + struct evsel *evsel; + struct annotation_options *options; + s64 offset; + char *line; + int line_nr; + char *fileloc; +}; + +struct arch *arch__find(const char *name); +bool arch__is(struct arch *arch, const char *name); + +struct ins_ops *ins__find(struct arch *arch, const char *name); +int ins__scnprintf(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name); + +bool ins__is_call(const struct ins *ins); +bool ins__is_jump(const struct ins *ins); +bool ins__is_fused(struct arch *arch, const char *ins1, const char *ins2); +bool ins__is_nop(const struct ins *ins); +bool ins__is_ret(const struct ins *ins); +bool ins__is_lock(const struct ins *ins); + +struct disasm_line *disasm_line__new(struct annotate_args *args); +void disasm_line__free(struct disasm_line *dl); + +int disasm_line__scnprintf(struct disasm_line *dl, char *bf, size_t size, + bool raw, int max_ins_name); + +int symbol__disassemble(struct symbol *sym, struct annotate_args *args); + +#endif /* __PERF_UTIL_DISASM_H */ diff --git a/tools/perf/util/dlfilter.c b/tools/perf/util/dlfilter.c index 908e168137..7d180bdaed 100644 --- a/tools/perf/util/dlfilter.c +++ b/tools/perf/util/dlfilter.c @@ -33,13 +33,13 @@ static void al_to_d_al(struct addr_location *al, struct perf_dlfilter_al *d_al) if (al->map) { struct dso *dso = map__dso(al->map); - if (symbol_conf.show_kernel_path && dso->long_name) - d_al->dso = dso->long_name; + if 
(symbol_conf.show_kernel_path && dso__long_name(dso)) + d_al->dso = dso__long_name(dso); else - d_al->dso = dso->name; - d_al->is_64_bit = dso->is_64_bit; - d_al->buildid_size = dso->bid.size; - d_al->buildid = dso->bid.data; + d_al->dso = dso__name(dso); + d_al->is_64_bit = dso__is_64_bit(dso); + d_al->buildid_size = dso__bid(dso)->size; + d_al->buildid = dso__bid(dso)->data; } else { d_al->dso = NULL; d_al->is_64_bit = 0; diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index 22fd5fa806..67414944f2 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -40,6 +40,12 @@ static const char * const debuglink_paths[] = { "/usr/lib/debug%s/%s" }; +void dso__set_nsinfo(struct dso *dso, struct nsinfo *nsi) +{ + nsinfo__put(RC_CHK_ACCESS(dso)->nsinfo); + RC_CHK_ACCESS(dso)->nsinfo = nsi; +} + char dso__symtab_origin(const struct dso *dso) { static const char origin[] = { @@ -63,14 +69,14 @@ char dso__symtab_origin(const struct dso *dso) [DSO_BINARY_TYPE__GUEST_VMLINUX] = 'V', }; - if (dso == NULL || dso->symtab_type == DSO_BINARY_TYPE__NOT_FOUND) + if (dso == NULL || dso__symtab_type(dso) == DSO_BINARY_TYPE__NOT_FOUND) return '!'; - return origin[dso->symtab_type]; + return origin[dso__symtab_type(dso)]; } bool dso__is_object_file(const struct dso *dso) { - switch (dso->binary_type) { + switch (dso__binary_type(dso)) { case DSO_BINARY_TYPE__KALLSYMS: case DSO_BINARY_TYPE__GUEST_KALLSYMS: case DSO_BINARY_TYPE__JAVA_JIT: @@ -117,7 +123,7 @@ int dso__read_binary_type_filename(const struct dso *dso, char symfile[PATH_MAX]; unsigned int i; - len = __symbol__join_symfs(filename, size, dso->long_name); + len = __symbol__join_symfs(filename, size, dso__long_name(dso)); last_slash = filename + len; while (last_slash != filename && *last_slash != '/') last_slash--; @@ -159,12 +165,12 @@ int dso__read_binary_type_filename(const struct dso *dso, case DSO_BINARY_TYPE__FEDORA_DEBUGINFO: len = __symbol__join_symfs(filename, size, "/usr/lib/debug"); - 
snprintf(filename + len, size - len, "%s.debug", dso->long_name); + snprintf(filename + len, size - len, "%s.debug", dso__long_name(dso)); break; case DSO_BINARY_TYPE__UBUNTU_DEBUGINFO: len = __symbol__join_symfs(filename, size, "/usr/lib/debug"); - snprintf(filename + len, size - len, "%s", dso->long_name); + snprintf(filename + len, size - len, "%s", dso__long_name(dso)); break; case DSO_BINARY_TYPE__MIXEDUP_UBUNTU_DEBUGINFO: @@ -173,13 +179,13 @@ int dso__read_binary_type_filename(const struct dso *dso, * /usr/lib/debug/lib when it is expected to be in * /usr/lib/debug/usr/lib */ - if (strlen(dso->long_name) < 9 || - strncmp(dso->long_name, "/usr/lib/", 9)) { + if (strlen(dso__long_name(dso)) < 9 || + strncmp(dso__long_name(dso), "/usr/lib/", 9)) { ret = -1; break; } len = __symbol__join_symfs(filename, size, "/usr/lib/debug"); - snprintf(filename + len, size - len, "%s", dso->long_name + 4); + snprintf(filename + len, size - len, "%s", dso__long_name(dso) + 4); break; case DSO_BINARY_TYPE__OPENEMBEDDED_DEBUGINFO: @@ -187,29 +193,29 @@ int dso__read_binary_type_filename(const struct dso *dso, const char *last_slash; size_t dir_size; - last_slash = dso->long_name + dso->long_name_len; - while (last_slash != dso->long_name && *last_slash != '/') + last_slash = dso__long_name(dso) + dso__long_name_len(dso); + while (last_slash != dso__long_name(dso) && *last_slash != '/') last_slash--; len = __symbol__join_symfs(filename, size, ""); - dir_size = last_slash - dso->long_name + 2; + dir_size = last_slash - dso__long_name(dso) + 2; if (dir_size > (size - len)) { ret = -1; break; } - len += scnprintf(filename + len, dir_size, "%s", dso->long_name); + len += scnprintf(filename + len, dir_size, "%s", dso__long_name(dso)); len += scnprintf(filename + len , size - len, ".debug%s", last_slash); break; } case DSO_BINARY_TYPE__BUILDID_DEBUGINFO: - if (!dso->has_build_id) { + if (!dso__has_build_id(dso)) { ret = -1; break; } - build_id__sprintf(&dso->bid, build_id_hex); + 
build_id__sprintf(dso__bid_const(dso), build_id_hex); len = __symbol__join_symfs(filename, size, "/usr/lib/debug/.build-id/"); snprintf(filename + len, size - len, "%.2s/%s.debug", build_id_hex, build_id_hex + 2); @@ -218,23 +224,23 @@ int dso__read_binary_type_filename(const struct dso *dso, case DSO_BINARY_TYPE__VMLINUX: case DSO_BINARY_TYPE__GUEST_VMLINUX: case DSO_BINARY_TYPE__SYSTEM_PATH_DSO: - __symbol__join_symfs(filename, size, dso->long_name); + __symbol__join_symfs(filename, size, dso__long_name(dso)); break; case DSO_BINARY_TYPE__GUEST_KMODULE: case DSO_BINARY_TYPE__GUEST_KMODULE_COMP: path__join3(filename, size, symbol_conf.symfs, - root_dir, dso->long_name); + root_dir, dso__long_name(dso)); break; case DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE: case DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE_COMP: - __symbol__join_symfs(filename, size, dso->long_name); + __symbol__join_symfs(filename, size, dso__long_name(dso)); break; case DSO_BINARY_TYPE__KCORE: case DSO_BINARY_TYPE__GUEST_KCORE: - snprintf(filename, size, "%s", dso->long_name); + snprintf(filename, size, "%s", dso__long_name(dso)); break; default: @@ -310,8 +316,8 @@ bool is_kernel_module(const char *pathname, int cpumode) bool dso__needs_decompress(struct dso *dso) { - return dso->symtab_type == DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE_COMP || - dso->symtab_type == DSO_BINARY_TYPE__GUEST_KMODULE_COMP; + return dso__symtab_type(dso) == DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE_COMP || + dso__symtab_type(dso) == DSO_BINARY_TYPE__GUEST_KMODULE_COMP; } int filename__decompress(const char *name, char *pathname, @@ -363,11 +369,10 @@ static int decompress_kmodule(struct dso *dso, const char *name, if (!dso__needs_decompress(dso)) return -1; - if (dso->comp == COMP_ID__NONE) + if (dso__comp(dso) == COMP_ID__NONE) return -1; - return filename__decompress(name, pathname, len, dso->comp, - &dso->load_errno); + return filename__decompress(name, pathname, len, dso__comp(dso), dso__load_errno(dso)); } int 
dso__decompress_kmodule_fd(struct dso *dso, const char *name) @@ -468,17 +473,17 @@ void dso__set_module_info(struct dso *dso, struct kmod_path *m, struct machine *machine) { if (machine__is_host(machine)) - dso->symtab_type = DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE; + dso__set_symtab_type(dso, DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE); else - dso->symtab_type = DSO_BINARY_TYPE__GUEST_KMODULE; + dso__set_symtab_type(dso, DSO_BINARY_TYPE__GUEST_KMODULE); /* _KMODULE_COMP should be next to _KMODULE */ if (m->kmod && m->comp) { - dso->symtab_type++; - dso->comp = m->comp; + dso__set_symtab_type(dso, dso__symtab_type(dso) + 1); + dso__set_comp(dso, m->comp); } - dso->is_kmod = 1; + dso__set_is_kmod(dso); dso__set_short_name(dso, strdup(m->name), true); } @@ -491,13 +496,21 @@ static pthread_mutex_t dso__data_open_lock = PTHREAD_MUTEX_INITIALIZER; static void dso__list_add(struct dso *dso) { - list_add_tail(&dso->data.open_entry, &dso__data_open); + list_add_tail(&dso__data(dso)->open_entry, &dso__data_open); +#ifdef REFCNT_CHECKING + dso__data(dso)->dso = dso__get(dso); +#endif + /* Assume the dso is part of dsos, hence the optional reference count above. 
*/ + assert(dso__dsos(dso)); dso__data_open_cnt++; } static void dso__list_del(struct dso *dso) { - list_del_init(&dso->data.open_entry); + list_del_init(&dso__data(dso)->open_entry); +#ifdef REFCNT_CHECKING + dso__put(dso__data(dso)->dso); +#endif WARN_ONCE(dso__data_open_cnt <= 0, "DSO data fd counter out of bounds."); dso__data_open_cnt--; @@ -528,7 +541,7 @@ static int do_open(char *name) char *dso__filename_with_chroot(const struct dso *dso, const char *filename) { - return filename_with_chroot(nsinfo__pid(dso->nsinfo), filename); + return filename_with_chroot(nsinfo__pid(dso__nsinfo_const(dso)), filename); } static int __open_dso(struct dso *dso, struct machine *machine) @@ -541,18 +554,18 @@ static int __open_dso(struct dso *dso, struct machine *machine) if (!name) return -ENOMEM; - mutex_lock(&dso->lock); + mutex_lock(dso__lock(dso)); if (machine) root_dir = machine->root_dir; - if (dso__read_binary_type_filename(dso, dso->binary_type, + if (dso__read_binary_type_filename(dso, dso__binary_type(dso), root_dir, name, PATH_MAX)) goto out; if (!is_regular_file(name)) { char *new_name; - if (errno != ENOENT || dso->nsinfo == NULL) + if (errno != ENOENT || dso__nsinfo(dso) == NULL) goto out; new_name = dso__filename_with_chroot(dso, name); @@ -568,7 +581,7 @@ static int __open_dso(struct dso *dso, struct machine *machine) size_t len = sizeof(newpath); if (dso__decompress_kmodule_path(dso, name, newpath, len) < 0) { - fd = -dso->load_errno; + fd = -(*dso__load_errno(dso)); goto out; } @@ -582,7 +595,7 @@ static int __open_dso(struct dso *dso, struct machine *machine) unlink(name); out: - mutex_unlock(&dso->lock); + mutex_unlock(dso__lock(dso)); free(name); return fd; } @@ -601,13 +614,13 @@ static int open_dso(struct dso *dso, struct machine *machine) int fd; struct nscookie nsc; - if (dso->binary_type != DSO_BINARY_TYPE__BUILD_ID_CACHE) { - mutex_lock(&dso->lock); - nsinfo__mountns_enter(dso->nsinfo, &nsc); - mutex_unlock(&dso->lock); + if (dso__binary_type(dso) 
!= DSO_BINARY_TYPE__BUILD_ID_CACHE) { + mutex_lock(dso__lock(dso)); + nsinfo__mountns_enter(dso__nsinfo(dso), &nsc); + mutex_unlock(dso__lock(dso)); } fd = __open_dso(dso, machine); - if (dso->binary_type != DSO_BINARY_TYPE__BUILD_ID_CACHE) + if (dso__binary_type(dso) != DSO_BINARY_TYPE__BUILD_ID_CACHE) nsinfo__mountns_exit(&nsc); if (fd >= 0) { @@ -624,10 +637,10 @@ static int open_dso(struct dso *dso, struct machine *machine) static void close_data_fd(struct dso *dso) { - if (dso->data.fd >= 0) { - close(dso->data.fd); - dso->data.fd = -1; - dso->data.file_size = 0; + if (dso__data(dso)->fd >= 0) { + close(dso__data(dso)->fd); + dso__data(dso)->fd = -1; + dso__data(dso)->file_size = 0; dso__list_del(dso); } } @@ -646,9 +659,15 @@ static void close_dso(struct dso *dso) static void close_first_dso(void) { + struct dso_data *dso_data; struct dso *dso; - dso = list_first_entry(&dso__data_open, struct dso, data.open_entry); + dso_data = list_first_entry(&dso__data_open, struct dso_data, open_entry); +#ifdef REFCNT_CHECKING + dso = dso_data->dso; +#else + dso = container_of(dso_data, struct dso, data); +#endif close_dso(dso); } @@ -728,28 +747,29 @@ static void try_to_open_dso(struct dso *dso, struct machine *machine) DSO_BINARY_TYPE__NOT_FOUND, }; int i = 0; + struct dso_data *dso_data = dso__data(dso); - if (dso->data.fd >= 0) + if (dso_data->fd >= 0) return; - if (dso->binary_type != DSO_BINARY_TYPE__NOT_FOUND) { - dso->data.fd = open_dso(dso, machine); + if (dso__binary_type(dso) != DSO_BINARY_TYPE__NOT_FOUND) { + dso_data->fd = open_dso(dso, machine); goto out; } do { - dso->binary_type = binary_type_data[i++]; + dso__set_binary_type(dso, binary_type_data[i++]); - dso->data.fd = open_dso(dso, machine); - if (dso->data.fd >= 0) + dso_data->fd = open_dso(dso, machine); + if (dso_data->fd >= 0) goto out; - } while (dso->binary_type != DSO_BINARY_TYPE__NOT_FOUND); + } while (dso__binary_type(dso) != DSO_BINARY_TYPE__NOT_FOUND); out: - if (dso->data.fd >= 0) - 
dso->data.status = DSO_DATA_STATUS_OK; + if (dso_data->fd >= 0) + dso_data->status = DSO_DATA_STATUS_OK; else - dso->data.status = DSO_DATA_STATUS_ERROR; + dso_data->status = DSO_DATA_STATUS_ERROR; } /** @@ -763,7 +783,7 @@ out: */ int dso__data_get_fd(struct dso *dso, struct machine *machine) { - if (dso->data.status == DSO_DATA_STATUS_ERROR) + if (dso__data(dso)->status == DSO_DATA_STATUS_ERROR) return -1; if (pthread_mutex_lock(&dso__data_open_lock) < 0) @@ -771,10 +791,10 @@ int dso__data_get_fd(struct dso *dso, struct machine *machine) try_to_open_dso(dso, machine); - if (dso->data.fd < 0) + if (dso__data(dso)->fd < 0) pthread_mutex_unlock(&dso__data_open_lock); - return dso->data.fd; + return dso__data(dso)->fd; } void dso__data_put_fd(struct dso *dso __maybe_unused) @@ -786,10 +806,10 @@ bool dso__data_status_seen(struct dso *dso, enum dso_data_status_seen by) { u32 flag = 1 << by; - if (dso->data.status_seen & flag) + if (dso__data(dso)->status_seen & flag) return true; - dso->data.status_seen |= flag; + dso__data(dso)->status_seen |= flag; return false; } @@ -799,12 +819,13 @@ static ssize_t bpf_read(struct dso *dso, u64 offset, char *data) { struct bpf_prog_info_node *node; ssize_t size = DSO__DATA_CACHE_SIZE; + struct dso_bpf_prog *dso_bpf_prog = dso__bpf_prog(dso); u64 len; u8 *buf; - node = perf_env__find_bpf_prog_info(dso->bpf_prog.env, dso->bpf_prog.id); + node = perf_env__find_bpf_prog_info(dso_bpf_prog->env, dso_bpf_prog->id); if (!node || !node->info_linear) { - dso->data.status = DSO_DATA_STATUS_ERROR; + dso__data(dso)->status = DSO_DATA_STATUS_ERROR; return -1; } @@ -822,14 +843,15 @@ static ssize_t bpf_read(struct dso *dso, u64 offset, char *data) static int bpf_size(struct dso *dso) { struct bpf_prog_info_node *node; + struct dso_bpf_prog *dso_bpf_prog = dso__bpf_prog(dso); - node = perf_env__find_bpf_prog_info(dso->bpf_prog.env, dso->bpf_prog.id); + node = perf_env__find_bpf_prog_info(dso_bpf_prog->env, dso_bpf_prog->id); if (!node || 
!node->info_linear) { - dso->data.status = DSO_DATA_STATUS_ERROR; + dso__data(dso)->status = DSO_DATA_STATUS_ERROR; return -1; } - dso->data.file_size = node->info_linear->info.jited_prog_len; + dso__data(dso)->file_size = node->info_linear->info.jited_prog_len; return 0; } #endif // HAVE_LIBBPF_SUPPORT @@ -837,10 +859,10 @@ static int bpf_size(struct dso *dso) static void dso_cache__free(struct dso *dso) { - struct rb_root *root = &dso->data.cache; + struct rb_root *root = &dso__data(dso)->cache; struct rb_node *next = rb_first(root); - mutex_lock(&dso->lock); + mutex_lock(dso__lock(dso)); while (next) { struct dso_cache *cache; @@ -849,12 +871,12 @@ dso_cache__free(struct dso *dso) rb_erase(&cache->rb_node, root); free(cache); } - mutex_unlock(&dso->lock); + mutex_unlock(dso__lock(dso)); } static struct dso_cache *__dso_cache__find(struct dso *dso, u64 offset) { - const struct rb_root *root = &dso->data.cache; + const struct rb_root *root = &dso__data(dso)->cache; struct rb_node * const *p = &root->rb_node; const struct rb_node *parent = NULL; struct dso_cache *cache; @@ -880,13 +902,13 @@ static struct dso_cache *__dso_cache__find(struct dso *dso, u64 offset) static struct dso_cache * dso_cache__insert(struct dso *dso, struct dso_cache *new) { - struct rb_root *root = &dso->data.cache; + struct rb_root *root = &dso__data(dso)->cache; struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; struct dso_cache *cache; u64 offset = new->offset; - mutex_lock(&dso->lock); + mutex_lock(dso__lock(dso)); while (*p != NULL) { u64 end; @@ -907,7 +929,7 @@ dso_cache__insert(struct dso *dso, struct dso_cache *new) cache = NULL; out: - mutex_unlock(&dso->lock); + mutex_unlock(dso__lock(dso)); return cache; } @@ -932,18 +954,18 @@ static ssize_t file_read(struct dso *dso, struct machine *machine, pthread_mutex_lock(&dso__data_open_lock); /* - * dso->data.fd might be closed if other thread opened another + * dso__data(dso)->fd might be closed if other thread opened 
another * file (dso) due to open file limit (RLIMIT_NOFILE). */ try_to_open_dso(dso, machine); - if (dso->data.fd < 0) { - dso->data.status = DSO_DATA_STATUS_ERROR; + if (dso__data(dso)->fd < 0) { + dso__data(dso)->status = DSO_DATA_STATUS_ERROR; ret = -errno; goto out; } - ret = pread(dso->data.fd, data, DSO__DATA_CACHE_SIZE, offset); + ret = pread(dso__data(dso)->fd, data, DSO__DATA_CACHE_SIZE, offset); out: pthread_mutex_unlock(&dso__data_open_lock); return ret; @@ -963,11 +985,11 @@ static struct dso_cache *dso_cache__populate(struct dso *dso, return NULL; } #ifdef HAVE_LIBBPF_SUPPORT - if (dso->binary_type == DSO_BINARY_TYPE__BPF_PROG_INFO) + if (dso__binary_type(dso) == DSO_BINARY_TYPE__BPF_PROG_INFO) *ret = bpf_read(dso, cache_offset, cache->data); else #endif - if (dso->binary_type == DSO_BINARY_TYPE__OOL) + if (dso__binary_type(dso) == DSO_BINARY_TYPE__OOL) *ret = DSO__DATA_CACHE_SIZE; else *ret = file_read(dso, machine, cache_offset, cache->data); @@ -1056,25 +1078,25 @@ static int file_size(struct dso *dso, struct machine *machine) pthread_mutex_lock(&dso__data_open_lock); /* - * dso->data.fd might be closed if other thread opened another + * dso__data(dso)->fd might be closed if other thread opened another * file (dso) due to open file limit (RLIMIT_NOFILE). 
*/ try_to_open_dso(dso, machine); - if (dso->data.fd < 0) { + if (dso__data(dso)->fd < 0) { ret = -errno; - dso->data.status = DSO_DATA_STATUS_ERROR; + dso__data(dso)->status = DSO_DATA_STATUS_ERROR; goto out; } - if (fstat(dso->data.fd, &st) < 0) { + if (fstat(dso__data(dso)->fd, &st) < 0) { ret = -errno; pr_err("dso cache fstat failed: %s\n", str_error_r(errno, sbuf, sizeof(sbuf))); - dso->data.status = DSO_DATA_STATUS_ERROR; + dso__data(dso)->status = DSO_DATA_STATUS_ERROR; goto out; } - dso->data.file_size = st.st_size; + dso__data(dso)->file_size = st.st_size; out: pthread_mutex_unlock(&dso__data_open_lock); @@ -1083,13 +1105,13 @@ out: int dso__data_file_size(struct dso *dso, struct machine *machine) { - if (dso->data.file_size) + if (dso__data(dso)->file_size) return 0; - if (dso->data.status == DSO_DATA_STATUS_ERROR) + if (dso__data(dso)->status == DSO_DATA_STATUS_ERROR) return -1; #ifdef HAVE_LIBBPF_SUPPORT - if (dso->binary_type == DSO_BINARY_TYPE__BPF_PROG_INFO) + if (dso__binary_type(dso) == DSO_BINARY_TYPE__BPF_PROG_INFO) return bpf_size(dso); #endif return file_size(dso, machine); @@ -1108,7 +1130,7 @@ off_t dso__data_size(struct dso *dso, struct machine *machine) return -1; /* For now just estimate dso data size is close to file size */ - return dso->data.file_size; + return dso__data(dso)->file_size; } static ssize_t data_read_write_offset(struct dso *dso, struct machine *machine, @@ -1119,7 +1141,7 @@ static ssize_t data_read_write_offset(struct dso *dso, struct machine *machine, return -1; /* Check the offset sanity. 
*/ - if (offset > dso->data.file_size) + if (offset > dso__data(dso)->file_size) return -1; if (offset + size < offset) @@ -1142,7 +1164,7 @@ static ssize_t data_read_write_offset(struct dso *dso, struct machine *machine, ssize_t dso__data_read_offset(struct dso *dso, struct machine *machine, u64 offset, u8 *data, ssize_t size) { - if (dso->data.status == DSO_DATA_STATUS_ERROR) + if (dso__data(dso)->status == DSO_DATA_STATUS_ERROR) return -1; return data_read_write_offset(dso, machine, offset, data, size, true); @@ -1182,7 +1204,7 @@ ssize_t dso__data_write_cache_offs(struct dso *dso, struct machine *machine, { u8 *data = (u8 *)data_in; /* cast away const to use same fns for r/w */ - if (dso->data.status == DSO_DATA_STATUS_ERROR) + if (dso__data(dso)->status == DSO_DATA_STATUS_ERROR) return -1; return data_read_write_offset(dso, machine, offset, data, size, false); @@ -1235,56 +1257,139 @@ struct dso *machine__findnew_kernel(struct machine *machine, const char *name, */ if (dso != NULL) { dso__set_short_name(dso, short_name, false); - dso->kernel = dso_type; + dso__set_kernel(dso, dso_type); } return dso; } -static void dso__set_long_name_id(struct dso *dso, const char *name, struct dso_id *id, bool name_allocated) +static void dso__set_long_name_id(struct dso *dso, const char *name, bool name_allocated) { - struct rb_root *root = dso->root; + struct dsos *dsos = dso__dsos(dso); if (name == NULL) return; - if (dso->long_name_allocated) - free((char *)dso->long_name); - - if (root) { - rb_erase(&dso->rb_node, root); + if (dsos) { /* - * __dsos__findnew_link_by_longname_id() isn't guaranteed to - * add it back, so a clean removal is required here. + * Need to avoid re-sorting the dsos breaking by non-atomically + * renaming the dso. 
*/ - RB_CLEAR_NODE(&dso->rb_node); - dso->root = NULL; + down_write(&dsos->lock); + } + + if (dso__long_name_allocated(dso)) + free((char *)dso__long_name(dso)); + + RC_CHK_ACCESS(dso)->long_name = name; + RC_CHK_ACCESS(dso)->long_name_len = strlen(name); + dso__set_long_name_allocated(dso, name_allocated); + + if (dsos) { + dsos->sorted = false; + up_write(&dsos->lock); + } +} + +static int __dso_id__cmp(const struct dso_id *a, const struct dso_id *b) +{ + if (a->maj > b->maj) return -1; + if (a->maj < b->maj) return 1; + + if (a->min > b->min) return -1; + if (a->min < b->min) return 1; + + if (a->ino > b->ino) return -1; + if (a->ino < b->ino) return 1; + + /* + * Synthesized MMAP events have zero ino_generation, avoid comparing + * them with MMAP events with actual ino_generation. + * + * I found it harmful because the mismatch resulted in a new + * dso that did not have a build ID whereas the original dso did have a + * build ID. The build ID was essential because the object was not found + * otherwise. - Adrian + */ + if (a->ino_generation && b->ino_generation) { + if (a->ino_generation > b->ino_generation) return -1; + if (a->ino_generation < b->ino_generation) return 1; } - dso->long_name = name; - dso->long_name_len = strlen(name); - dso->long_name_allocated = name_allocated; + return 0; +} + +bool dso_id__empty(const struct dso_id *id) +{ + if (!id) + return true; + + return !id->maj && !id->min && !id->ino && !id->ino_generation; +} + +void __dso__inject_id(struct dso *dso, struct dso_id *id) +{ + struct dsos *dsos = dso__dsos(dso); + struct dso_id *dso_id = dso__id(dso); + + /* dsos write lock held by caller. 
*/ + + dso_id->maj = id->maj; + dso_id->min = id->min; + dso_id->ino = id->ino; + dso_id->ino_generation = id->ino_generation; + + if (dsos) + dsos->sorted = false; +} - if (root) - __dsos__findnew_link_by_longname_id(root, dso, NULL, id); +int dso_id__cmp(const struct dso_id *a, const struct dso_id *b) +{ + /* + * The second is always dso->id, so zeroes if not set, assume passing + * NULL for a means a zeroed id + */ + if (dso_id__empty(a) || dso_id__empty(b)) + return 0; + + return __dso_id__cmp(a, b); +} + +int dso__cmp_id(struct dso *a, struct dso *b) +{ + return __dso_id__cmp(dso__id(a), dso__id(b)); } void dso__set_long_name(struct dso *dso, const char *name, bool name_allocated) { - dso__set_long_name_id(dso, name, NULL, name_allocated); + dso__set_long_name_id(dso, name, name_allocated); } void dso__set_short_name(struct dso *dso, const char *name, bool name_allocated) { + struct dsos *dsos = dso__dsos(dso); + if (name == NULL) return; - if (dso->short_name_allocated) - free((char *)dso->short_name); + if (dsos) { + /* + * Need to avoid re-sorting the dsos breaking by non-atomically + * renaming the dso. 
+ */ + down_write(&dsos->lock); + } + if (dso__short_name_allocated(dso)) + free((char *)dso__short_name(dso)); + + RC_CHK_ACCESS(dso)->short_name = name; + RC_CHK_ACCESS(dso)->short_name_len = strlen(name); + dso__set_short_name_allocated(dso, name_allocated); - dso->short_name = name; - dso->short_name_len = strlen(name); - dso->short_name_allocated = name_allocated; + if (dsos) { + dsos->sorted = false; + up_write(&dsos->lock); + } } int dso__name_len(const struct dso *dso) @@ -1292,43 +1397,48 @@ int dso__name_len(const struct dso *dso) if (!dso) return strlen("[unknown]"); if (verbose > 0) - return dso->long_name_len; + return dso__long_name_len(dso); - return dso->short_name_len; + return dso__short_name_len(dso); } bool dso__loaded(const struct dso *dso) { - return dso->loaded; + return RC_CHK_ACCESS(dso)->loaded; } bool dso__sorted_by_name(const struct dso *dso) { - return dso->sorted_by_name; + return RC_CHK_ACCESS(dso)->sorted_by_name; } void dso__set_sorted_by_name(struct dso *dso) { - dso->sorted_by_name = true; + RC_CHK_ACCESS(dso)->sorted_by_name = true; } struct dso *dso__new_id(const char *name, struct dso_id *id) { - struct dso *dso = calloc(1, sizeof(*dso) + strlen(name) + 1); + RC_STRUCT(dso) *dso = zalloc(sizeof(*dso) + strlen(name) + 1); + struct dso *res; + struct dso_data *data; - if (dso != NULL) { + if (!dso) + return NULL; + + if (ADD_RC_CHK(res, dso)) { strcpy(dso->name, name); if (id) dso->id = *id; - dso__set_long_name_id(dso, dso->name, id, false); - dso__set_short_name(dso, dso->name, false); + dso__set_long_name_id(res, dso->name, false); + dso__set_short_name(res, dso->name, false); dso->symbols = RB_ROOT_CACHED; dso->symbol_names = NULL; dso->symbol_names_len = 0; - dso->data.cache = RB_ROOT; dso->inlined_nodes = RB_ROOT_CACHED; dso->srclines = RB_ROOT_CACHED; dso->data_types = RB_ROOT; + dso->global_vars = RB_ROOT; dso->data.fd = -1; dso->data.status = DSO_DATA_STATUS_UNKNOWN; dso->symtab_type = DSO_BINARY_TYPE__NOT_FOUND; @@ 
-1344,15 +1454,18 @@ struct dso *dso__new_id(const char *name, struct dso_id *id) dso->is_kmod = 0; dso->needs_swap = DSO_SWAP__UNSET; dso->comp = COMP_ID__NONE; - RB_CLEAR_NODE(&dso->rb_node); - dso->root = NULL; - INIT_LIST_HEAD(&dso->node); - INIT_LIST_HEAD(&dso->data.open_entry); mutex_init(&dso->lock); refcount_set(&dso->refcnt, 1); + data = &dso->data; + data->cache = RB_ROOT; + data->fd = -1; + data->status = DSO_DATA_STATUS_UNKNOWN; + INIT_LIST_HEAD(&data->open_entry); +#ifdef REFCNT_CHECKING + data->dso = NULL; /* Set when on the open_entry list. */ +#endif } - - return dso; + return res; } struct dso *dso__new(const char *name) @@ -1362,71 +1475,78 @@ struct dso *dso__new(const char *name) void dso__delete(struct dso *dso) { - if (!RB_EMPTY_NODE(&dso->rb_node)) - pr_err("DSO %s is still in rbtree when being deleted!\n", - dso->long_name); + if (dso__dsos(dso)) + pr_err("DSO %s is still in rbtree when being deleted!\n", dso__long_name(dso)); /* free inlines first, as they reference symbols */ - inlines__tree_delete(&dso->inlined_nodes); - srcline__tree_delete(&dso->srclines); - symbols__delete(&dso->symbols); - dso->symbol_names_len = 0; - zfree(&dso->symbol_names); - annotated_data_type__tree_delete(&dso->data_types); - - if (dso->short_name_allocated) { - zfree((char **)&dso->short_name); - dso->short_name_allocated = false; + inlines__tree_delete(&RC_CHK_ACCESS(dso)->inlined_nodes); + srcline__tree_delete(&RC_CHK_ACCESS(dso)->srclines); + symbols__delete(&RC_CHK_ACCESS(dso)->symbols); + RC_CHK_ACCESS(dso)->symbol_names_len = 0; + zfree(&RC_CHK_ACCESS(dso)->symbol_names); + annotated_data_type__tree_delete(dso__data_types(dso)); + global_var_type__tree_delete(dso__global_vars(dso)); + + if (RC_CHK_ACCESS(dso)->short_name_allocated) { + zfree((char **)&RC_CHK_ACCESS(dso)->short_name); + RC_CHK_ACCESS(dso)->short_name_allocated = false; } - if (dso->long_name_allocated) { - zfree((char **)&dso->long_name); - dso->long_name_allocated = false; + if 
(RC_CHK_ACCESS(dso)->long_name_allocated) { + zfree((char **)&RC_CHK_ACCESS(dso)->long_name); + RC_CHK_ACCESS(dso)->long_name_allocated = false; } dso__data_close(dso); - auxtrace_cache__free(dso->auxtrace_cache); + auxtrace_cache__free(RC_CHK_ACCESS(dso)->auxtrace_cache); dso_cache__free(dso); dso__free_a2l(dso); - zfree(&dso->symsrc_filename); - nsinfo__zput(dso->nsinfo); - mutex_destroy(&dso->lock); - free(dso); + dso__free_symsrc_filename(dso); + nsinfo__zput(RC_CHK_ACCESS(dso)->nsinfo); + mutex_destroy(dso__lock(dso)); + RC_CHK_FREE(dso); } struct dso *dso__get(struct dso *dso) { - if (dso) - refcount_inc(&dso->refcnt); - return dso; + struct dso *result; + + if (RC_CHK_GET(result, dso)) + refcount_inc(&RC_CHK_ACCESS(dso)->refcnt); + + return result; } void dso__put(struct dso *dso) { - if (dso && refcount_dec_and_test(&dso->refcnt)) + if (dso && refcount_dec_and_test(&RC_CHK_ACCESS(dso)->refcnt)) dso__delete(dso); + else + RC_CHK_PUT(dso); } void dso__set_build_id(struct dso *dso, struct build_id *bid) { - dso->bid = *bid; - dso->has_build_id = 1; + RC_CHK_ACCESS(dso)->bid = *bid; + RC_CHK_ACCESS(dso)->has_build_id = 1; } bool dso__build_id_equal(const struct dso *dso, struct build_id *bid) { - if (dso->bid.size > bid->size && dso->bid.size == BUILD_ID_SIZE) { + const struct build_id *dso_bid = dso__bid_const(dso); + + if (dso_bid->size > bid->size && dso_bid->size == BUILD_ID_SIZE) { /* * For the backward compatibility, it allows a build-id has * trailing zeros. 
*/ - return !memcmp(dso->bid.data, bid->data, bid->size) && - !memchr_inv(&dso->bid.data[bid->size], 0, - dso->bid.size - bid->size); + return !memcmp(dso_bid->data, bid->data, bid->size) && + !memchr_inv(&dso_bid->data[bid->size], 0, + dso_bid->size - bid->size); } - return dso->bid.size == bid->size && - memcmp(dso->bid.data, bid->data, dso->bid.size) == 0; + return dso_bid->size == bid->size && + memcmp(dso_bid->data, bid->data, dso_bid->size) == 0; } void dso__read_running_kernel_build_id(struct dso *dso, struct machine *machine) @@ -1436,8 +1556,8 @@ void dso__read_running_kernel_build_id(struct dso *dso, struct machine *machine) if (machine__is_default_guest(machine)) return; sprintf(path, "%s/sys/kernel/notes", machine->root_dir); - if (sysfs__read_build_id(path, &dso->bid) == 0) - dso->has_build_id = true; + if (sysfs__read_build_id(path, dso__bid(dso)) == 0) + dso__set_has_build_id(dso); } int dso__kernel_module_get_build_id(struct dso *dso, @@ -1448,14 +1568,14 @@ int dso__kernel_module_get_build_id(struct dso *dso, * kernel module short names are of the form "[module]" and * we need just "module" here. 
*/ - const char *name = dso->short_name + 1; + const char *name = dso__short_name(dso) + 1; snprintf(filename, sizeof(filename), "%s/sys/module/%.*s/notes/.note.gnu.build-id", root_dir, (int)strlen(name) - 1, name); - if (sysfs__read_build_id(filename, &dso->bid) == 0) - dso->has_build_id = true; + if (sysfs__read_build_id(filename, dso__bid(dso)) == 0) + dso__set_has_build_id(dso); return 0; } @@ -1464,21 +1584,21 @@ static size_t dso__fprintf_buildid(struct dso *dso, FILE *fp) { char sbuild_id[SBUILD_ID_SIZE]; - build_id__sprintf(&dso->bid, sbuild_id); + build_id__sprintf(dso__bid(dso), sbuild_id); return fprintf(fp, "%s", sbuild_id); } size_t dso__fprintf(struct dso *dso, FILE *fp) { struct rb_node *nd; - size_t ret = fprintf(fp, "dso: %s (", dso->short_name); + size_t ret = fprintf(fp, "dso: %s (", dso__short_name(dso)); - if (dso->short_name != dso->long_name) - ret += fprintf(fp, "%s, ", dso->long_name); + if (dso__short_name(dso) != dso__long_name(dso)) + ret += fprintf(fp, "%s, ", dso__long_name(dso)); ret += fprintf(fp, "%sloaded, ", dso__loaded(dso) ? "" : "NOT "); ret += dso__fprintf_buildid(dso, fp); ret += fprintf(fp, ")\n"); - for (nd = rb_first_cached(&dso->symbols); nd; nd = rb_next(nd)) { + for (nd = rb_first_cached(dso__symbols(dso)); nd; nd = rb_next(nd)) { struct symbol *pos = rb_entry(nd, struct symbol, rb_node); ret += symbol__fprintf(pos, fp); } @@ -1502,7 +1622,7 @@ enum dso_type dso__type(struct dso *dso, struct machine *machine) int dso__strerror_load(struct dso *dso, char *buf, size_t buflen) { - int idx, errnum = dso->load_errno; + int idx, errnum = *dso__load_errno(dso); /* * This must have a same ordering as the enum dso_load_errno. 
*/ @@ -1532,3 +1652,15 @@ int dso__strerror_load(struct dso *dso, char *buf, size_t buflen) scnprintf(buf, buflen, "%s", dso_load__error_str[idx]); return 0; } + +bool perf_pid_map_tid(const char *dso_name, int *tid) +{ + return sscanf(dso_name, "/tmp/perf-%d.map", tid) == 1; +} + +bool is_perf_pid_map_name(const char *dso_name) +{ + int tid; + + return perf_pid_map_tid(dso_name, &tid); +} diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index ce9f3849a7..ed0068251c 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -11,6 +11,7 @@ #include #include "build-id.h" #include "mutex.h" +#include struct machine; struct map; @@ -100,26 +101,27 @@ enum dso_load_errno { __DSO_LOAD_ERRNO__END, }; -#define DSO__SWAP(dso, type, val) \ -({ \ - type ____r = val; \ - BUG_ON(dso->needs_swap == DSO_SWAP__UNSET); \ - if (dso->needs_swap == DSO_SWAP__YES) { \ - switch (sizeof(____r)) { \ - case 2: \ - ____r = bswap_16(val); \ - break; \ - case 4: \ - ____r = bswap_32(val); \ - break; \ - case 8: \ - ____r = bswap_64(val); \ - break; \ - default: \ - BUG_ON(1); \ - } \ - } \ - ____r; \ +#define DSO__SWAP(dso, type, val) \ +({ \ + type ____r = val; \ + enum dso_swap_type ___dst = dso__needs_swap(dso); \ + BUG_ON(___dst == DSO_SWAP__UNSET); \ + if (___dst == DSO_SWAP__YES) { \ + switch (sizeof(____r)) { \ + case 2: \ + ____r = bswap_16(val); \ + break; \ + case 4: \ + ____r = bswap_32(val); \ + break; \ + case 8: \ + ____r = bswap_64(val); \ + break; \ + default: \ + BUG_ON(1); \ + } \ + } \ + ____r; \ }) #define DSO__DATA_CACHE_SIZE 4096 @@ -142,33 +144,77 @@ struct dso_cache { char data[]; }; +struct dso_data { + struct rb_root cache; + struct list_head open_entry; +#ifdef REFCNT_CHECKING + struct dso *dso; +#endif + int fd; + int status; + u32 status_seen; + u64 file_size; + u64 elf_base_addr; + u64 debug_frame_offset; + u64 eh_frame_hdr_addr; + u64 eh_frame_hdr_offset; +}; + +struct dso_bpf_prog { + u32 id; + u32 sub_id; + struct perf_env *env; +}; + struct 
auxtrace_cache; -struct dso { +DECLARE_RC_STRUCT(dso) { struct mutex lock; - struct list_head node; - struct rb_node rb_node; /* rbtree node sorted by long name */ - struct rb_root *root; /* root of rbtree that rb_node is in */ + struct dsos *dsos; struct rb_root_cached symbols; struct symbol **symbol_names; size_t symbol_names_len; struct rb_root_cached inlined_nodes; struct rb_root_cached srclines; - struct rb_root data_types; + struct rb_root data_types; + struct rb_root global_vars; struct { u64 addr; struct symbol *symbol; } last_find_result; + struct build_id bid; + u64 text_offset; + u64 text_end; + const char *short_name; + const char *long_name; void *a2l; char *symsrc_filename; +#if defined(__powerpc__) + void *dwfl; /* DWARF debug info */ +#endif + struct nsinfo *nsinfo; + struct auxtrace_cache *auxtrace_cache; + union { /* Tool specific area */ + void *priv; + u64 db_id; + }; + /* bpf prog information */ + struct dso_bpf_prog bpf_prog; + /* dso data file */ + struct dso_data data; + struct dso_id id; unsigned int a2l_fails; - enum dso_space_type kernel; - bool is_kmod; - enum dso_swap_type needs_swap; - enum dso_binary_type symtab_type; - enum dso_binary_type binary_type; + int comp; + refcount_t refcnt; enum dso_load_errno load_errno; + u16 long_name_len; + u16 short_name_len; + enum dso_binary_type symtab_type:8; + enum dso_binary_type binary_type:8; + enum dso_space_type kernel:2; + enum dso_swap_type needs_swap:2; + bool is_kmod:1; u8 adjust_symbols:1; u8 has_build_id:1; u8 header_build_id:1; @@ -182,44 +228,6 @@ struct dso { bool sorted_by_name; bool loaded; u8 rel; - struct build_id bid; - u64 text_offset; - u64 text_end; - const char *short_name; - const char *long_name; - u16 long_name_len; - u16 short_name_len; - void *dwfl; /* DWARF debug info */ - struct auxtrace_cache *auxtrace_cache; - int comp; - - /* dso data file */ - struct { - struct rb_root cache; - int fd; - int status; - u32 status_seen; - u64 file_size; - struct list_head 
open_entry; - u64 elf_base_addr; - u64 debug_frame_offset; - u64 eh_frame_hdr_addr; - u64 eh_frame_hdr_offset; - } data; - /* bpf prog information */ - struct { - u32 id; - u32 sub_id; - struct perf_env *env; - } bpf_prog; - - union { /* Tool specific area */ - void *priv; - u64 db_id; - }; - struct nsinfo *nsinfo; - struct dso_id id; - refcount_t refcnt; char name[]; }; @@ -230,19 +238,408 @@ struct dso { * @n: the 'struct rb_node *' to use as a temporary storage */ #define dso__for_each_symbol(dso, pos, n) \ - symbols__for_each_entry(&(dso)->symbols, pos, n) + symbols__for_each_entry(dso__symbols(dso), pos, n) + +static inline void *dso__a2l(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->a2l; +} + +static inline void dso__set_a2l(struct dso *dso, void *val) +{ + RC_CHK_ACCESS(dso)->a2l = val; +} + +static inline unsigned int dso__a2l_fails(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->a2l_fails; +} + +static inline void dso__set_a2l_fails(struct dso *dso, unsigned int val) +{ + RC_CHK_ACCESS(dso)->a2l_fails = val; +} + +static inline bool dso__adjust_symbols(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->adjust_symbols; +} + +static inline void dso__set_adjust_symbols(struct dso *dso, bool val) +{ + RC_CHK_ACCESS(dso)->adjust_symbols = val; +} + +static inline bool dso__annotate_warned(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->annotate_warned; +} + +static inline void dso__set_annotate_warned(struct dso *dso) +{ + RC_CHK_ACCESS(dso)->annotate_warned = 1; +} + +static inline bool dso__auxtrace_warned(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->auxtrace_warned; +} -#define dsos__for_each_with_build_id(pos, head) \ - list_for_each_entry(pos, head, node) \ - if (!pos->has_build_id) \ - continue; \ - else +static inline void dso__set_auxtrace_warned(struct dso *dso) +{ + RC_CHK_ACCESS(dso)->auxtrace_warned = 1; +} + +static inline struct auxtrace_cache *dso__auxtrace_cache(struct dso *dso) +{ + return 
RC_CHK_ACCESS(dso)->auxtrace_cache; +} + +static inline void dso__set_auxtrace_cache(struct dso *dso, struct auxtrace_cache *cache) +{ + RC_CHK_ACCESS(dso)->auxtrace_cache = cache; +} + +static inline struct build_id *dso__bid(struct dso *dso) +{ + return &RC_CHK_ACCESS(dso)->bid; +} + +static inline const struct build_id *dso__bid_const(const struct dso *dso) +{ + return &RC_CHK_ACCESS(dso)->bid; +} + +static inline struct dso_bpf_prog *dso__bpf_prog(struct dso *dso) +{ + return &RC_CHK_ACCESS(dso)->bpf_prog; +} + +static inline bool dso__has_build_id(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->has_build_id; +} + +static inline void dso__set_has_build_id(struct dso *dso) +{ + RC_CHK_ACCESS(dso)->has_build_id = true; +} + +static inline bool dso__has_srcline(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->has_srcline; +} + +static inline void dso__set_has_srcline(struct dso *dso, bool val) +{ + RC_CHK_ACCESS(dso)->has_srcline = val; +} + +static inline int dso__comp(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->comp; +} + +static inline void dso__set_comp(struct dso *dso, int comp) +{ + RC_CHK_ACCESS(dso)->comp = comp; +} + +static inline struct dso_data *dso__data(struct dso *dso) +{ + return &RC_CHK_ACCESS(dso)->data; +} + +static inline u64 dso__db_id(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->db_id; +} + +static inline void dso__set_db_id(struct dso *dso, u64 db_id) +{ + RC_CHK_ACCESS(dso)->db_id = db_id; +} + +static inline struct dsos *dso__dsos(struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->dsos; +} + +static inline void dso__set_dsos(struct dso *dso, struct dsos *dsos) +{ + RC_CHK_ACCESS(dso)->dsos = dsos; +} + +static inline bool dso__header_build_id(struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->header_build_id; +} + +static inline void dso__set_header_build_id(struct dso *dso, bool val) +{ + RC_CHK_ACCESS(dso)->header_build_id = val; +} + +static inline bool dso__hit(const struct dso *dso) +{ + return 
RC_CHK_ACCESS(dso)->hit; +} + +static inline void dso__set_hit(struct dso *dso) +{ + RC_CHK_ACCESS(dso)->hit = 1; +} + +static inline struct dso_id *dso__id(struct dso *dso) +{ + return &RC_CHK_ACCESS(dso)->id; +} + +static inline const struct dso_id *dso__id_const(const struct dso *dso) +{ + return &RC_CHK_ACCESS(dso)->id; +} + +static inline struct rb_root_cached *dso__inlined_nodes(struct dso *dso) +{ + return &RC_CHK_ACCESS(dso)->inlined_nodes; +} + +static inline bool dso__is_64_bit(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->is_64_bit; +} + +static inline void dso__set_is_64_bit(struct dso *dso, bool is) +{ + RC_CHK_ACCESS(dso)->is_64_bit = is; +} + +static inline bool dso__is_kmod(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->is_kmod; +} + +static inline void dso__set_is_kmod(struct dso *dso) +{ + RC_CHK_ACCESS(dso)->is_kmod = 1; +} + +static inline enum dso_space_type dso__kernel(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->kernel; +} + +static inline void dso__set_kernel(struct dso *dso, enum dso_space_type kernel) +{ + RC_CHK_ACCESS(dso)->kernel = kernel; +} + +static inline u64 dso__last_find_result_addr(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->last_find_result.addr; +} + +static inline void dso__set_last_find_result_addr(struct dso *dso, u64 addr) +{ + RC_CHK_ACCESS(dso)->last_find_result.addr = addr; +} + +static inline struct symbol *dso__last_find_result_symbol(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->last_find_result.symbol; +} + +static inline void dso__set_last_find_result_symbol(struct dso *dso, struct symbol *symbol) +{ + RC_CHK_ACCESS(dso)->last_find_result.symbol = symbol; +} + +static inline enum dso_load_errno *dso__load_errno(struct dso *dso) +{ + return &RC_CHK_ACCESS(dso)->load_errno; +} static inline void dso__set_loaded(struct dso *dso) { - dso->loaded = true; + RC_CHK_ACCESS(dso)->loaded = true; +} + +static inline struct mutex *dso__lock(struct dso *dso) +{ + return 
&RC_CHK_ACCESS(dso)->lock; +} + +static inline const char *dso__long_name(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->long_name; } +static inline bool dso__long_name_allocated(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->long_name_allocated; +} + +static inline void dso__set_long_name_allocated(struct dso *dso, bool allocated) +{ + RC_CHK_ACCESS(dso)->long_name_allocated = allocated; +} + +static inline u16 dso__long_name_len(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->long_name_len; +} + +static inline const char *dso__name(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->name; +} + +static inline enum dso_swap_type dso__needs_swap(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->needs_swap; +} + +static inline void dso__set_needs_swap(struct dso *dso, enum dso_swap_type type) +{ + RC_CHK_ACCESS(dso)->needs_swap = type; +} + +static inline struct nsinfo *dso__nsinfo(struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->nsinfo; +} + +static inline const struct nsinfo *dso__nsinfo_const(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->nsinfo; +} + +static inline struct nsinfo **dso__nsinfo_ptr(struct dso *dso) +{ + return &RC_CHK_ACCESS(dso)->nsinfo; +} + +void dso__set_nsinfo(struct dso *dso, struct nsinfo *nsi); + +static inline u8 dso__rel(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->rel; +} + +static inline void dso__set_rel(struct dso *dso, u8 rel) +{ + RC_CHK_ACCESS(dso)->rel = rel; +} + +static inline const char *dso__short_name(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->short_name; +} + +static inline bool dso__short_name_allocated(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->short_name_allocated; +} + +static inline void dso__set_short_name_allocated(struct dso *dso, bool allocated) +{ + RC_CHK_ACCESS(dso)->short_name_allocated = allocated; +} + +static inline u16 dso__short_name_len(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->short_name_len; +} + +static inline struct 
rb_root_cached *dso__srclines(struct dso *dso) +{ + return &RC_CHK_ACCESS(dso)->srclines; +} + +static inline struct rb_root *dso__data_types(struct dso *dso) +{ + return &RC_CHK_ACCESS(dso)->data_types; +} + +static inline struct rb_root *dso__global_vars(struct dso *dso) +{ + return &RC_CHK_ACCESS(dso)->global_vars; +} + +static inline struct rb_root_cached *dso__symbols(struct dso *dso) +{ + return &RC_CHK_ACCESS(dso)->symbols; +} + +static inline struct symbol **dso__symbol_names(struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->symbol_names; +} + +static inline void dso__set_symbol_names(struct dso *dso, struct symbol **names) +{ + RC_CHK_ACCESS(dso)->symbol_names = names; +} + +static inline size_t dso__symbol_names_len(struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->symbol_names_len; +} + +static inline void dso__set_symbol_names_len(struct dso *dso, size_t len) +{ + RC_CHK_ACCESS(dso)->symbol_names_len = len; +} + +static inline const char *dso__symsrc_filename(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->symsrc_filename; +} + +static inline void dso__set_symsrc_filename(struct dso *dso, char *val) +{ + RC_CHK_ACCESS(dso)->symsrc_filename = val; +} + +static inline void dso__free_symsrc_filename(struct dso *dso) +{ + zfree(&RC_CHK_ACCESS(dso)->symsrc_filename); +} + +static inline enum dso_binary_type dso__symtab_type(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->symtab_type; +} + +static inline void dso__set_symtab_type(struct dso *dso, enum dso_binary_type bt) +{ + RC_CHK_ACCESS(dso)->symtab_type = bt; +} + +static inline u64 dso__text_end(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->text_end; +} + +static inline void dso__set_text_end(struct dso *dso, u64 val) +{ + RC_CHK_ACCESS(dso)->text_end = val; +} + +static inline u64 dso__text_offset(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->text_offset; +} + +static inline void dso__set_text_offset(struct dso *dso, u64 val) +{ + RC_CHK_ACCESS(dso)->text_offset = val; +} + 
+int dso_id__cmp(const struct dso_id *a, const struct dso_id *b); +bool dso_id__empty(const struct dso_id *id); + struct dso *dso__new_id(const char *name, struct dso_id *id); struct dso *dso__new(const char *name); void dso__delete(struct dso *dso); @@ -250,6 +647,7 @@ void dso__delete(struct dso *dso); int dso__cmp_id(struct dso *a, struct dso *b); void dso__set_short_name(struct dso *dso, const char *name, bool name_allocated); void dso__set_long_name(struct dso *dso, const char *name, bool name_allocated); +void __dso__inject_id(struct dso *dso, struct dso_id *id); int dso__name_len(const struct dso *dso); @@ -268,7 +666,7 @@ bool dso__loaded(const struct dso *dso); static inline bool dso__has_symbols(const struct dso *dso) { - return !RB_EMPTY_ROOT(&dso->symbols.rb_root); + return !RB_EMPTY_ROOT(&RC_CHK_ACCESS(dso)->symbols.rb_root); } char *dso__filename_with_chroot(const struct dso *dso, const char *filename); @@ -384,21 +782,33 @@ void dso__reset_find_symbol_cache(struct dso *dso); size_t dso__fprintf_symbols_by_name(struct dso *dso, FILE *fp); size_t dso__fprintf(struct dso *dso, FILE *fp); +static inline enum dso_binary_type dso__binary_type(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->binary_type; +} + +static inline void dso__set_binary_type(struct dso *dso, enum dso_binary_type bt) +{ + RC_CHK_ACCESS(dso)->binary_type = bt; +} + static inline bool dso__is_vmlinux(const struct dso *dso) { - return dso->binary_type == DSO_BINARY_TYPE__VMLINUX || - dso->binary_type == DSO_BINARY_TYPE__GUEST_VMLINUX; + enum dso_binary_type bt = dso__binary_type(dso); + + return bt == DSO_BINARY_TYPE__VMLINUX || bt == DSO_BINARY_TYPE__GUEST_VMLINUX; } static inline bool dso__is_kcore(const struct dso *dso) { - return dso->binary_type == DSO_BINARY_TYPE__KCORE || - dso->binary_type == DSO_BINARY_TYPE__GUEST_KCORE; + enum dso_binary_type bt = dso__binary_type(dso); + + return bt == DSO_BINARY_TYPE__KCORE || bt == DSO_BINARY_TYPE__GUEST_KCORE; } static inline bool 
dso__is_kallsyms(const struct dso *dso) { - return dso->kernel && dso->long_name[0] != '/'; + return RC_CHK_ACCESS(dso)->kernel && RC_CHK_ACCESS(dso)->long_name[0] != '/'; } bool dso__is_object_file(const struct dso *dso); @@ -411,4 +821,11 @@ int dso__strerror_load(struct dso *dso, char *buf, size_t buflen); void reset_fd_limit(void); +u64 dso__find_global_type(struct dso *dso, u64 addr); +u64 dso__findnew_global_type(struct dso *dso, u64 addr, u64 offset); + +/* Check if dso name is of format "/tmp/perf-%d.map" */ +bool perf_pid_map_tid(const char *dso_name, int *tid); +bool is_perf_pid_map_name(const char *dso_name); + #endif /* __PERF_DSO */ diff --git a/tools/perf/util/dsos.c b/tools/perf/util/dsos.c index cf80aa42dd..a69a9c6612 100644 --- a/tools/perf/util/dsos.c +++ b/tools/perf/util/dsos.c @@ -12,115 +12,140 @@ #include // filename__read_build_id #include -static int __dso_id__cmp(struct dso_id *a, struct dso_id *b) +void dsos__init(struct dsos *dsos) { - if (a->maj > b->maj) return -1; - if (a->maj < b->maj) return 1; + init_rwsem(&dsos->lock); - if (a->min > b->min) return -1; - if (a->min < b->min) return 1; + dsos->cnt = 0; + dsos->allocated = 0; + dsos->dsos = NULL; + dsos->sorted = true; +} - if (a->ino > b->ino) return -1; - if (a->ino < b->ino) return 1; +static void dsos__purge(struct dsos *dsos) +{ + down_write(&dsos->lock); - /* - * Synthesized MMAP events have zero ino_generation, avoid comparing - * them with MMAP events with actual ino_generation. - * - * I found it harmful because the mismatch resulted in a new - * dso that did not have a build ID whereas the original dso did have a - * build ID. The build ID was essential because the object was not found - * otherwise. 
- Adrian - */ - if (a->ino_generation && b->ino_generation) { - if (a->ino_generation > b->ino_generation) return -1; - if (a->ino_generation < b->ino_generation) return 1; - } + for (unsigned int i = 0; i < dsos->cnt; i++) { + struct dso *dso = dsos->dsos[i]; - return 0; -} + dso__set_dsos(dso, NULL); + dso__put(dso); + } -static bool dso_id__empty(struct dso_id *id) -{ - if (!id) - return true; + zfree(&dsos->dsos); + dsos->cnt = 0; + dsos->allocated = 0; + dsos->sorted = true; - return !id->maj && !id->min && !id->ino && !id->ino_generation; + up_write(&dsos->lock); } -static void dso__inject_id(struct dso *dso, struct dso_id *id) +void dsos__exit(struct dsos *dsos) { - dso->id.maj = id->maj; - dso->id.min = id->min; - dso->id.ino = id->ino; - dso->id.ino_generation = id->ino_generation; + dsos__purge(dsos); + exit_rwsem(&dsos->lock); } -static int dso_id__cmp(struct dso_id *a, struct dso_id *b) + +static int __dsos__for_each_dso(struct dsos *dsos, + int (*cb)(struct dso *dso, void *data), + void *data) { - /* - * The second is always dso->id, so zeroes if not set, assume passing - * NULL for a means a zeroed id - */ - if (dso_id__empty(a) || dso_id__empty(b)) - return 0; + for (unsigned int i = 0; i < dsos->cnt; i++) { + struct dso *dso = dsos->dsos[i]; + int err; - return __dso_id__cmp(a, b); + err = cb(dso, data); + if (err) + return err; + } + return 0; } -int dso__cmp_id(struct dso *a, struct dso *b) -{ - return __dso_id__cmp(&a->id, &b->id); -} +struct dsos__read_build_ids_cb_args { + bool with_hits; + bool have_build_id; +}; -bool __dsos__read_build_ids(struct list_head *head, bool with_hits) +static int dsos__read_build_ids_cb(struct dso *dso, void *data) { - bool have_build_id = false; - struct dso *pos; + struct dsos__read_build_ids_cb_args *args = data; struct nscookie nsc; - list_for_each_entry(pos, head, node) { - if (with_hits && !pos->hit && !dso__is_vdso(pos)) - continue; - if (pos->has_build_id) { - have_build_id = true; - continue; - } - 
nsinfo__mountns_enter(pos->nsinfo, &nsc); - if (filename__read_build_id(pos->long_name, &pos->bid) > 0) { - have_build_id = true; - pos->has_build_id = true; - } else if (errno == ENOENT && pos->nsinfo) { - char *new_name = dso__filename_with_chroot(pos, pos->long_name); - - if (new_name && filename__read_build_id(new_name, - &pos->bid) > 0) { - have_build_id = true; - pos->has_build_id = true; - } - free(new_name); + if (args->with_hits && !dso__hit(dso) && !dso__is_vdso(dso)) + return 0; + if (dso__has_build_id(dso)) { + args->have_build_id = true; + return 0; + } + nsinfo__mountns_enter(dso__nsinfo(dso), &nsc); + if (filename__read_build_id(dso__long_name(dso), dso__bid(dso)) > 0) { + args->have_build_id = true; + dso__set_has_build_id(dso); + } else if (errno == ENOENT && dso__nsinfo(dso)) { + char *new_name = dso__filename_with_chroot(dso, dso__long_name(dso)); + + if (new_name && filename__read_build_id(new_name, dso__bid(dso)) > 0) { + args->have_build_id = true; + dso__set_has_build_id(dso); } - nsinfo__mountns_exit(&nsc); + free(new_name); } + nsinfo__mountns_exit(&nsc); + return 0; +} - return have_build_id; +bool dsos__read_build_ids(struct dsos *dsos, bool with_hits) +{ + struct dsos__read_build_ids_cb_args args = { + .with_hits = with_hits, + .have_build_id = false, + }; + + dsos__for_each_dso(dsos, dsos__read_build_ids_cb, &args); + return args.have_build_id; } -static int __dso__cmp_long_name(const char *long_name, struct dso_id *id, struct dso *b) +static int __dso__cmp_long_name(const char *long_name, const struct dso_id *id, + const struct dso *b) { - int rc = strcmp(long_name, b->long_name); - return rc ?: dso_id__cmp(id, &b->id); + int rc = strcmp(long_name, dso__long_name(b)); + return rc ?: dso_id__cmp(id, dso__id_const(b)); } -static int __dso__cmp_short_name(const char *short_name, struct dso_id *id, struct dso *b) +static int __dso__cmp_short_name(const char *short_name, const struct dso_id *id, + const struct dso *b) { - int rc = 
strcmp(short_name, b->short_name); - return rc ?: dso_id__cmp(id, &b->id); + int rc = strcmp(short_name, dso__short_name(b)); + return rc ?: dso_id__cmp(id, dso__id_const(b)); } -static int dso__cmp_short_name(struct dso *a, struct dso *b) +static int dsos__cmp_long_name_id_short_name(const void *va, const void *vb) { - return __dso__cmp_short_name(a->short_name, &a->id, b); + const struct dso *a = *((const struct dso **)va); + const struct dso *b = *((const struct dso **)vb); + int rc = strcmp(dso__long_name(a), dso__long_name(b)); + + if (!rc) { + rc = dso_id__cmp(dso__id_const(a), dso__id_const(b)); + if (!rc) + rc = strcmp(dso__short_name(a), dso__short_name(b)); + } + return rc; +} + +struct dsos__key { + const char *long_name; + const struct dso_id *id; +}; + +static int dsos__cmp_key_long_name_id(const void *vkey, const void *vdso) +{ + const struct dsos__key *key = vkey; + const struct dso *dso = *((const struct dso **)vdso); + + return __dso__cmp_long_name(key->long_name, key->id, dso); } /* @@ -128,110 +153,137 @@ static int dso__cmp_short_name(struct dso *a, struct dso *b) * Either one of the dso or name parameter must be non-NULL or the * function will not work. */ -struct dso *__dsos__findnew_link_by_longname_id(struct rb_root *root, struct dso *dso, - const char *name, struct dso_id *id) +static struct dso *__dsos__find_by_longname_id(struct dsos *dsos, + const char *name, + struct dso_id *id, + bool write_locked) { - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - - if (!name) - name = dso->long_name; - /* - * Find node with the matching name - */ - while (*p) { - struct dso *this = rb_entry(*p, struct dso, rb_node); - int rc = __dso__cmp_long_name(name, id, this); - - parent = *p; - if (rc == 0) { - /* - * In case the new DSO is a duplicate of an existing - * one, print a one-time warning & put the new entry - * at the end of the list of duplicates. 
- */ - if (!dso || (dso == this)) - return this; /* Find matching dso */ - /* - * The core kernel DSOs may have duplicated long name. - * In this case, the short name should be different. - * Comparing the short names to differentiate the DSOs. - */ - rc = dso__cmp_short_name(dso, this); - if (rc == 0) { - pr_err("Duplicated dso name: %s\n", name); - return NULL; - } + struct dsos__key key = { + .long_name = name, + .id = id, + }; + struct dso **res; + + if (!dsos->sorted) { + if (!write_locked) { + struct dso *dso; + + up_read(&dsos->lock); + down_write(&dsos->lock); + dso = __dsos__find_by_longname_id(dsos, name, id, + /*write_locked=*/true); + up_write(&dsos->lock); + down_read(&dsos->lock); + return dso; } - if (rc < 0) - p = &parent->rb_left; - else - p = &parent->rb_right; + qsort(dsos->dsos, dsos->cnt, sizeof(struct dso *), + dsos__cmp_long_name_id_short_name); + dsos->sorted = true; } - if (dso) { - /* Add new node and rebalance tree */ - rb_link_node(&dso->rb_node, parent, p); - rb_insert_color(&dso->rb_node, root); - dso->root = root; - } - return NULL; + + res = bsearch(&key, dsos->dsos, dsos->cnt, sizeof(struct dso *), + dsos__cmp_key_long_name_id); + if (!res) + return NULL; + + return dso__get(*res); } -void __dsos__add(struct dsos *dsos, struct dso *dso) +int __dsos__add(struct dsos *dsos, struct dso *dso) { - list_add_tail(&dso->node, &dsos->head); - __dsos__findnew_link_by_longname_id(&dsos->root, dso, NULL, &dso->id); - /* - * It is now in the linked list, grab a reference, then garbage collect - * this when needing memory, by looking at LRU dso instances in the - * list with atomic_read(&dso->refcnt) == 1, i.e. no references - * anywhere besides the one for the list, do, under a lock for the - * list: remove it from the list, then a dso__put(), that probably will - * be the last and will then call dso__delete(), end of life. 
- * - * That, or at the end of the 'struct machine' lifetime, when all - * 'struct dso' instances will be removed from the list, in - * dsos__exit(), if they have no other reference from some other data - * structure. - * - * E.g.: after processing a 'perf.data' file and storing references - * to objects instantiated while processing events, we will have - * references to the 'thread', 'map', 'dso' structs all from 'struct - * hist_entry' instances, but we may not need anything not referenced, - * so we might as well call machines__exit()/machines__delete() and - * garbage collect it. - */ - dso__get(dso); + if (dsos->cnt == dsos->allocated) { + unsigned int to_allocate = 2; + struct dso **temp; + + if (dsos->allocated > 0) + to_allocate = dsos->allocated * 2; + temp = realloc(dsos->dsos, sizeof(struct dso *) * to_allocate); + if (!temp) + return -ENOMEM; + dsos->dsos = temp; + dsos->allocated = to_allocate; + } + if (!dsos->sorted) { + dsos->dsos[dsos->cnt++] = dso__get(dso); + } else { + int low = 0, high = dsos->cnt - 1; + int insert = dsos->cnt; /* Default to inserting at the end. 
*/ + + while (low <= high) { + int mid = low + (high - low) / 2; + int cmp = dsos__cmp_long_name_id_short_name(&dsos->dsos[mid], &dso); + + if (cmp < 0) { + low = mid + 1; + } else { + high = mid - 1; + insert = mid; + } + } + memmove(&dsos->dsos[insert + 1], &dsos->dsos[insert], + (dsos->cnt - insert) * sizeof(struct dso *)); + dsos->cnt++; + dsos->dsos[insert] = dso__get(dso); + } + dso__set_dsos(dso, dsos); + return 0; } -void dsos__add(struct dsos *dsos, struct dso *dso) +int dsos__add(struct dsos *dsos, struct dso *dso) { + int ret; + down_write(&dsos->lock); - __dsos__add(dsos, dso); + ret = __dsos__add(dsos, dso); up_write(&dsos->lock); + return ret; } -static struct dso *__dsos__findnew_by_longname_id(struct rb_root *root, const char *name, struct dso_id *id) +struct dsos__find_id_cb_args { + const char *name; + struct dso_id *id; + struct dso *res; +}; + +static int dsos__find_id_cb(struct dso *dso, void *data) { - return __dsos__findnew_link_by_longname_id(root, NULL, name, id); + struct dsos__find_id_cb_args *args = data; + + if (__dso__cmp_short_name(args->name, args->id, dso) == 0) { + args->res = dso__get(dso); + return 1; + } + return 0; + } -static struct dso *__dsos__find_id(struct dsos *dsos, const char *name, struct dso_id *id, bool cmp_short) +static struct dso *__dsos__find_id(struct dsos *dsos, const char *name, struct dso_id *id, + bool cmp_short, bool write_locked) { - struct dso *pos; + struct dso *res; if (cmp_short) { - list_for_each_entry(pos, &dsos->head, node) - if (__dso__cmp_short_name(name, id, pos) == 0) - return pos; - return NULL; + struct dsos__find_id_cb_args args = { + .name = name, + .id = id, + .res = NULL, + }; + + __dsos__for_each_dso(dsos, dsos__find_id_cb, &args); + return args.res; } - return __dsos__findnew_by_longname_id(&dsos->root, name, id); + res = __dsos__find_by_longname_id(dsos, name, id, write_locked); + return res; } -struct dso *__dsos__find(struct dsos *dsos, const char *name, bool cmp_short) +struct dso 
*dsos__find(struct dsos *dsos, const char *name, bool cmp_short) { - return __dsos__find_id(dsos, name, NULL, cmp_short); + struct dso *res; + + down_read(&dsos->lock); + res = __dsos__find_id(dsos, name, NULL, cmp_short, /*write_locked=*/false); + up_read(&dsos->lock); + return res; } static void dso__set_basename(struct dso *dso) @@ -239,7 +291,7 @@ static void dso__set_basename(struct dso *dso) char *base, *lname; int tid; - if (sscanf(dso->long_name, "/tmp/perf-%d.map", &tid) == 1) { + if (sscanf(dso__long_name(dso), "/tmp/perf-%d.map", &tid) == 1) { if (asprintf(&base, "[JIT] tid %d", tid) < 0) return; } else { @@ -247,7 +299,7 @@ static void dso__set_basename(struct dso *dso) * basename() may modify path buffer, so we must pass * a copy. */ - lname = strdup(dso->long_name); + lname = strdup(dso__long_name(dso)); if (!lname) return; @@ -271,25 +323,23 @@ static struct dso *__dsos__addnew_id(struct dsos *dsos, const char *name, struct struct dso *dso = dso__new_id(name, id); if (dso != NULL) { - __dsos__add(dsos, dso); + /* + * The dsos lock is held on entry, so rename the dso before + * adding it to avoid needing to take the dsos lock again to say + * the array isn't sorted. + */ dso__set_basename(dso); - /* Put dso here because __dsos_add already got it */ - dso__put(dso); + __dsos__add(dsos, dso); } return dso; } -struct dso *__dsos__addnew(struct dsos *dsos, const char *name) -{ - return __dsos__addnew_id(dsos, name, NULL); -} - static struct dso *__dsos__findnew_id(struct dsos *dsos, const char *name, struct dso_id *id) { - struct dso *dso = __dsos__find_id(dsos, name, id, false); + struct dso *dso = __dsos__find_id(dsos, name, id, false, /*write_locked=*/true); - if (dso && dso_id__empty(&dso->id) && !dso_id__empty(id)) - dso__inject_id(dso, id); + if (dso && dso_id__empty(dso__id(dso)) && !dso_id__empty(id)) + __dso__inject_id(dso, id); return dso ? 
dso : __dsos__addnew_id(dsos, name, id); } @@ -298,36 +348,151 @@ struct dso *dsos__findnew_id(struct dsos *dsos, const char *name, struct dso_id { struct dso *dso; down_write(&dsos->lock); - dso = dso__get(__dsos__findnew_id(dsos, name, id)); + dso = __dsos__findnew_id(dsos, name, id); up_write(&dsos->lock); return dso; } -size_t __dsos__fprintf_buildid(struct list_head *head, FILE *fp, - bool (skip)(struct dso *dso, int parm), int parm) +struct dsos__fprintf_buildid_cb_args { + FILE *fp; + bool (*skip)(struct dso *dso, int parm); + int parm; + size_t ret; +}; + +static int dsos__fprintf_buildid_cb(struct dso *dso, void *data) { - struct dso *pos; - size_t ret = 0; + struct dsos__fprintf_buildid_cb_args *args = data; + char sbuild_id[SBUILD_ID_SIZE]; - list_for_each_entry(pos, head, node) { - char sbuild_id[SBUILD_ID_SIZE]; + if (args->skip && args->skip(dso, args->parm)) + return 0; + build_id__sprintf(dso__bid(dso), sbuild_id); + args->ret += fprintf(args->fp, "%-40s %s\n", sbuild_id, dso__long_name(dso)); + return 0; +} - if (skip && skip(pos, parm)) - continue; - build_id__sprintf(&pos->bid, sbuild_id); - ret += fprintf(fp, "%-40s %s\n", sbuild_id, pos->long_name); - } - return ret; +size_t dsos__fprintf_buildid(struct dsos *dsos, FILE *fp, + bool (*skip)(struct dso *dso, int parm), int parm) +{ + struct dsos__fprintf_buildid_cb_args args = { + .fp = fp, + .skip = skip, + .parm = parm, + .ret = 0, + }; + + dsos__for_each_dso(dsos, dsos__fprintf_buildid_cb, &args); + return args.ret; +} + +struct dsos__fprintf_cb_args { + FILE *fp; + size_t ret; +}; + +static int dsos__fprintf_cb(struct dso *dso, void *data) +{ + struct dsos__fprintf_cb_args *args = data; + + args->ret += dso__fprintf(dso, args->fp); + return 0; +} + +size_t dsos__fprintf(struct dsos *dsos, FILE *fp) +{ + struct dsos__fprintf_cb_args args = { + .fp = fp, + .ret = 0, + }; + + dsos__for_each_dso(dsos, dsos__fprintf_cb, &args); + return args.ret; +} + +static int dsos__hit_all_cb(struct dso *dso, 
void *data __maybe_unused) +{ + dso__set_hit(dso); + return 0; } -size_t __dsos__fprintf(struct list_head *head, FILE *fp) +int dsos__hit_all(struct dsos *dsos) { - struct dso *pos; - size_t ret = 0; + return dsos__for_each_dso(dsos, dsos__hit_all_cb, NULL); +} - list_for_each_entry(pos, head, node) { - ret += dso__fprintf(pos, fp); +struct dso *dsos__findnew_module_dso(struct dsos *dsos, + struct machine *machine, + struct kmod_path *m, + const char *filename) +{ + struct dso *dso; + + down_write(&dsos->lock); + + dso = __dsos__find_id(dsos, m->name, NULL, /*cmp_short=*/true, /*write_locked=*/true); + if (dso) { + up_write(&dsos->lock); + return dso; + } + /* + * Failed to find the dso so create it. Change the name before adding it + * to the array, to avoid unnecessary sorts and potential locking + * issues. + */ + dso = dso__new_id(m->name, /*id=*/NULL); + if (!dso) { + up_write(&dsos->lock); + return NULL; } + dso__set_basename(dso); + dso__set_module_info(dso, m, machine); + dso__set_long_name(dso, strdup(filename), true); + dso__set_kernel(dso, DSO_SPACE__KERNEL); + __dsos__add(dsos, dso); - return ret; + up_write(&dsos->lock); + return dso; +} + +static int dsos__find_kernel_dso_cb(struct dso *dso, void *data) +{ + struct dso **res = data; + /* + * The cpumode passed to is_kernel_module is not the cpumode of *this* + * event. If we insist on passing correct cpumode to is_kernel_module, + * we should record the cpumode when we adding this dso to the linked + * list. + * + * However we don't really need passing correct cpumode. We know the + * correct cpumode must be kernel mode (if not, we should not link it + * onto kernel_dsos list). + * + * Therefore, we pass PERF_RECORD_MISC_CPUMODE_UNKNOWN. + * is_kernel_module() treats it as a kernel cpumode. 
+ */ + if (!dso__kernel(dso) || + is_kernel_module(dso__long_name(dso), PERF_RECORD_MISC_CPUMODE_UNKNOWN)) + return 0; + + *res = dso__get(dso); + return 1; +} + +struct dso *dsos__find_kernel_dso(struct dsos *dsos) +{ + struct dso *res = NULL; + + dsos__for_each_dso(dsos, dsos__find_kernel_dso_cb, &res); + return res; +} + +int dsos__for_each_dso(struct dsos *dsos, int (*cb)(struct dso *dso, void *data), void *data) +{ + int err; + + down_read(&dsos->lock); + err = __dsos__for_each_dso(dsos, cb, data); + up_read(&dsos->lock); + return err; } diff --git a/tools/perf/util/dsos.h b/tools/perf/util/dsos.h index 5dbec2bc69..6c13b65648 100644 --- a/tools/perf/util/dsos.h +++ b/tools/perf/util/dsos.h @@ -10,31 +10,43 @@ struct dso; struct dso_id; +struct kmod_path; +struct machine; /* - * DSOs are put into both a list for fast iteration and rbtree for fast - * long name lookup. + * Collection of DSOs as an array for iteration speed, but sorted for O(n) + * lookup. */ struct dsos { - struct list_head head; - struct rb_root root; /* rbtree root sorted by long name */ struct rw_semaphore lock; + struct dso **dsos; + unsigned int cnt; + unsigned int allocated; + bool sorted; }; -void __dsos__add(struct dsos *dsos, struct dso *dso); -void dsos__add(struct dsos *dsos, struct dso *dso); -struct dso *__dsos__addnew(struct dsos *dsos, const char *name); -struct dso *__dsos__find(struct dsos *dsos, const char *name, bool cmp_short); +void dsos__init(struct dsos *dsos); +void dsos__exit(struct dsos *dsos); + +int __dsos__add(struct dsos *dsos, struct dso *dso); +int dsos__add(struct dsos *dsos, struct dso *dso); +struct dso *dsos__find(struct dsos *dsos, const char *name, bool cmp_short); struct dso *dsos__findnew_id(struct dsos *dsos, const char *name, struct dso_id *id); -struct dso *__dsos__findnew_link_by_longname_id(struct rb_root *root, struct dso *dso, - const char *name, struct dso_id *id); - -bool __dsos__read_build_ids(struct list_head *head, bool with_hits); +bool 
dsos__read_build_ids(struct dsos *dsos, bool with_hits); -size_t __dsos__fprintf_buildid(struct list_head *head, FILE *fp, +size_t dsos__fprintf_buildid(struct dsos *dsos, FILE *fp, bool (skip)(struct dso *dso, int parm), int parm); -size_t __dsos__fprintf(struct list_head *head, FILE *fp); +size_t dsos__fprintf(struct dsos *dsos, FILE *fp); + +int dsos__hit_all(struct dsos *dsos); + +struct dso *dsos__findnew_module_dso(struct dsos *dsos, struct machine *machine, + struct kmod_path *m, const char *filename); + +struct dso *dsos__find_kernel_dso(struct dsos *dsos); + +int dsos__for_each_dso(struct dsos *dsos, int (*cb)(struct dso *dso, void *data), void *data); #endif /* __PERF_DSOS */ diff --git a/tools/perf/util/dump-insn.h b/tools/perf/util/dump-insn.h index 6501250615..4a7797dd6d 100644 --- a/tools/perf/util/dump-insn.h +++ b/tools/perf/util/dump-insn.h @@ -11,6 +11,7 @@ struct thread; struct perf_insn { /* Initialized by callers: */ struct thread *thread; + struct machine *machine; u8 cpumode; bool is64bit; int cpu; diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index f93e57e2fc..44ef968a7a 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -9,6 +9,7 @@ #include #include "debug.h" #include "dwarf-aux.h" +#include "dwarf-regs.h" #include "strbuf.h" #include "string2.h" @@ -696,6 +697,49 @@ Dwarf_Die *die_find_inlinefunc(Dwarf_Die *sp_die, Dwarf_Addr addr, return die_mem; } +static int __die_find_func_rettype_cb(Dwarf_Die *die_mem, void *data) +{ + const char *func_name; + + if (dwarf_tag(die_mem) != DW_TAG_subprogram) + return DIE_FIND_CB_SIBLING; + + func_name = dwarf_diename(die_mem); + if (func_name && !strcmp(func_name, data)) + return DIE_FIND_CB_END; + + return DIE_FIND_CB_SIBLING; +} + +/** + * die_find_func_rettype - Search a return type of function + * @cu_die: a CU DIE + * @name: target function name + * @die_mem: a buffer for result DIE + * + * Search a non-inlined function which matches to @name and 
stores the + * return type of the function to @die_mem and returns it if found. + * Returns NULL if failed. Note that it doesn't needs to find a + * definition of the function, so it doesn't match with address. + * Most likely, it can find a declaration at the top level. Thus the + * callback function continues to sibling entries only. + */ +Dwarf_Die *die_find_func_rettype(Dwarf_Die *cu_die, const char *name, + Dwarf_Die *die_mem) +{ + Dwarf_Die tmp_die; + + cu_die = die_find_child(cu_die, __die_find_func_rettype_cb, + (void *)name, &tmp_die); + if (!cu_die) + return NULL; + + if (die_get_real_type(&tmp_die, die_mem) == NULL) + return NULL; + + return die_mem; +} + struct __instance_walk_param { void *addr; int (*callback)(Dwarf_Die *, void *); @@ -1066,8 +1110,10 @@ int die_get_typename_from_type(Dwarf_Die *type_die, struct strbuf *buf) const char *tmp = ""; tag = dwarf_tag(type_die); - if (tag == DW_TAG_array_type || tag == DW_TAG_pointer_type) + if (tag == DW_TAG_pointer_type) tmp = "*"; + else if (tag == DW_TAG_array_type) + tmp = "[]"; else if (tag == DW_TAG_subroutine_type) { /* Function pointer */ return strbuf_add(buf, "(function_type)", 15); @@ -1147,6 +1193,8 @@ static int reg_from_dwarf_op(Dwarf_Op *op) case DW_OP_regx: case DW_OP_bregx: return op->number; + case DW_OP_fbreg: + return DWARF_REG_FB; default: break; } @@ -1160,6 +1208,7 @@ static int offset_from_dwarf_op(Dwarf_Op *op) case DW_OP_regx: return 0; case DW_OP_breg0 ... 
DW_OP_breg31: + case DW_OP_fbreg: return op->number; case DW_OP_bregx: return op->number2; @@ -1353,6 +1402,9 @@ static bool match_var_offset(Dwarf_Die *die_mem, struct find_var_data *data, return true; } + if (addr_offset < addr_type) + return false; + if (die_get_real_type(die_mem, &type_die) == NULL) return false; @@ -1399,7 +1451,6 @@ static int __die_find_var_reg_cb(Dwarf_Die *die_mem, void *arg) /* Local variables accessed using frame base register */ if (data->is_fbreg && ops->atom == DW_OP_fbreg && - data->offset >= (int)ops->number && check_allowed_ops(ops, nops) && match_var_offset(die_mem, data, data->offset, ops->number, /*is_pointer=*/false)) @@ -1490,9 +1541,6 @@ static int __die_find_var_addr_cb(Dwarf_Die *die_mem, void *arg) if (ops->atom != DW_OP_addr) continue; - if (data->addr < ops->number) - continue; - if (check_allowed_ops(ops, nops) && match_var_offset(die_mem, data, data->addr, ops->number, /*is_pointer=*/false)) @@ -1504,7 +1552,6 @@ static int __die_find_var_addr_cb(Dwarf_Die *die_mem, void *arg) /** * die_find_variable_by_addr - Find variable located at given address * @sc_die: a scope DIE - * @pc: the program address to find * @addr: the data address to find * @die_mem: a buffer to save the resulting DIE * @offset: the offset in the resulting type @@ -1512,12 +1559,10 @@ static int __die_find_var_addr_cb(Dwarf_Die *die_mem, void *arg) * Find the variable DIE located at the given address (in PC-relative mode). * This is usually for global variables. 
*/ -Dwarf_Die *die_find_variable_by_addr(Dwarf_Die *sc_die, Dwarf_Addr pc, - Dwarf_Addr addr, Dwarf_Die *die_mem, - int *offset) +Dwarf_Die *die_find_variable_by_addr(Dwarf_Die *sc_die, Dwarf_Addr addr, + Dwarf_Die *die_mem, int *offset) { struct find_var_data data = { - .pc = pc, .addr = addr, }; Dwarf_Die *result; @@ -1587,6 +1632,68 @@ void die_collect_vars(Dwarf_Die *sc_die, struct die_var_type **var_types) die_find_child(sc_die, __die_collect_vars_cb, (void *)var_types, &die_mem); } + +static int __die_collect_global_vars_cb(Dwarf_Die *die_mem, void *arg) +{ + struct die_var_type **var_types = arg; + Dwarf_Die type_die; + int tag = dwarf_tag(die_mem); + Dwarf_Attribute attr; + Dwarf_Addr base, start, end; + Dwarf_Op *ops; + size_t nops; + struct die_var_type *vt; + + if (tag != DW_TAG_variable) + return DIE_FIND_CB_SIBLING; + + if (dwarf_attr(die_mem, DW_AT_location, &attr) == NULL) + return DIE_FIND_CB_SIBLING; + + /* Only collect the location with an absolute address. */ + if (dwarf_getlocations(&attr, 0, &base, &start, &end, &ops, &nops) <= 0) + return DIE_FIND_CB_SIBLING; + + if (ops->atom != DW_OP_addr) + return DIE_FIND_CB_SIBLING; + + if (!check_allowed_ops(ops, nops)) + return DIE_FIND_CB_SIBLING; + + if (die_get_real_type(die_mem, &type_die) == NULL) + return DIE_FIND_CB_SIBLING; + + vt = malloc(sizeof(*vt)); + if (vt == NULL) + return DIE_FIND_CB_END; + + vt->die_off = dwarf_dieoffset(&type_die); + vt->addr = ops->number; + vt->reg = -1; + vt->offset = 0; + vt->next = *var_types; + *var_types = vt; + + return DIE_FIND_CB_SIBLING; +} + +/** + * die_collect_global_vars - Save all global variables + * @cu_die: a CU DIE + * @var_types: a pointer to save the resulting list + * + * Save all global variables in the @cu_die and save them to @var_types. + * The @var_types is a singly-linked list containing type and location info. + * Actual type can be retrieved using dwarf_offdie() with 'die_off' later. + * + * Callers should free @var_types. 
+ */ +void die_collect_global_vars(Dwarf_Die *cu_die, struct die_var_type **var_types) +{ + Dwarf_Die die_mem; + + die_find_child(cu_die, __die_collect_global_vars_cb, (void *)var_types, &die_mem); +} #endif /* HAVE_DWARF_GETLOCATIONS_SUPPORT */ #ifdef HAVE_DWARF_CFI_SUPPORT @@ -1855,3 +1962,116 @@ int die_get_scopes(Dwarf_Die *cu_die, Dwarf_Addr pc, Dwarf_Die **scopes) *scopes = data.scopes; return data.nr; } + +static int __die_find_member_offset_cb(Dwarf_Die *die_mem, void *arg) +{ + Dwarf_Die type_die; + Dwarf_Word size, loc; + Dwarf_Word offset = (long)arg; + int tag = dwarf_tag(die_mem); + + if (tag != DW_TAG_member) + return DIE_FIND_CB_SIBLING; + + /* Unions might not have location */ + if (die_get_data_member_location(die_mem, &loc) < 0) + loc = 0; + + if (offset == loc) + return DIE_FIND_CB_END; + + if (die_get_real_type(die_mem, &type_die) == NULL) { + // TODO: add a pr_debug_dtp() later for this unlikely failure + return DIE_FIND_CB_SIBLING; + } + + if (dwarf_aggregate_size(&type_die, &size) < 0) + size = 0; + + if (loc < offset && offset < (loc + size)) + return DIE_FIND_CB_END; + + return DIE_FIND_CB_SIBLING; +} + +/** + * die_get_member_type - Return type info of struct member + * @type_die: a type DIE + * @offset: offset in the type + * @die_mem: a buffer to save the resulting DIE + * + * This function returns a type of a member in @type_die where it's located at + * @offset if it's a struct. For now, it just returns the first matching + * member in a union. For other types, it'd return the given type directly + * if it's within the size of the type or NULL otherwise. 
+ */ +Dwarf_Die *die_get_member_type(Dwarf_Die *type_die, int offset, + Dwarf_Die *die_mem) +{ + Dwarf_Die *member; + Dwarf_Die mb_type; + int tag; + + tag = dwarf_tag(type_die); + /* If it's not a compound type, return the type directly */ + if (tag != DW_TAG_structure_type && tag != DW_TAG_union_type) { + Dwarf_Word size; + + if (dwarf_aggregate_size(type_die, &size) < 0) + size = 0; + + if ((unsigned)offset >= size) + return NULL; + + *die_mem = *type_die; + return die_mem; + } + + mb_type = *type_die; + /* TODO: Handle union types better? */ + while (tag == DW_TAG_structure_type || tag == DW_TAG_union_type) { + member = die_find_child(&mb_type, __die_find_member_offset_cb, + (void *)(long)offset, die_mem); + if (member == NULL) + return NULL; + + if (die_get_real_type(member, &mb_type) == NULL) + return NULL; + + tag = dwarf_tag(&mb_type); + + if (tag == DW_TAG_structure_type || tag == DW_TAG_union_type) { + Dwarf_Word loc; + + /* Update offset for the start of the member struct */ + if (die_get_data_member_location(member, &loc) == 0) + offset -= loc; + } + } + *die_mem = mb_type; + return die_mem; +} + +/** + * die_deref_ptr_type - Return type info for pointer access + * @ptr_die: a pointer type DIE + * @offset: access offset for the pointer + * @die_mem: a buffer to save the resulting DIE + * + * This function follows the pointer in @ptr_die with given @offset + * and saves the resulting type in @die_mem. If the pointer points + * a struct type, actual member at the offset would be returned. 
+ */ +Dwarf_Die *die_deref_ptr_type(Dwarf_Die *ptr_die, int offset, + Dwarf_Die *die_mem) +{ + Dwarf_Die type_die; + + if (dwarf_tag(ptr_die) != DW_TAG_pointer_type) + return NULL; + + if (die_get_real_type(ptr_die, &type_die) == NULL) + return NULL; + + return die_get_member_type(&type_die, offset, die_mem); +} diff --git a/tools/perf/util/dwarf-aux.h b/tools/perf/util/dwarf-aux.h index efafd3a1f5..24446412b8 100644 --- a/tools/perf/util/dwarf-aux.h +++ b/tools/perf/util/dwarf-aux.h @@ -94,6 +94,10 @@ Dwarf_Die *die_find_top_inlinefunc(Dwarf_Die *sp_die, Dwarf_Addr addr, Dwarf_Die *die_find_inlinefunc(Dwarf_Die *sp_die, Dwarf_Addr addr, Dwarf_Die *die_mem); +/* Search a non-inlined function by name and returns its return type */ +Dwarf_Die *die_find_func_rettype(Dwarf_Die *sp_die, const char *name, + Dwarf_Die *die_mem); + /* Walk on the instances of given DIE */ int die_walk_instances(Dwarf_Die *in_die, int (*callback)(Dwarf_Die *, void *), void *data); @@ -144,6 +148,12 @@ struct die_var_type { int offset; }; +/* Return type info of a member at offset */ +Dwarf_Die *die_get_member_type(Dwarf_Die *type_die, int offset, Dwarf_Die *die_mem); + +/* Return type info where the pointer and offset point to */ +Dwarf_Die *die_deref_ptr_type(Dwarf_Die *ptr_die, int offset, Dwarf_Die *die_mem); + #ifdef HAVE_DWARF_GETLOCATIONS_SUPPORT /* Get byte offset range of given variable DIE */ @@ -155,13 +165,15 @@ Dwarf_Die *die_find_variable_by_reg(Dwarf_Die *sc_die, Dwarf_Addr pc, int reg, Dwarf_Die *die_mem); /* Find a (global) variable located in the 'addr' */ -Dwarf_Die *die_find_variable_by_addr(Dwarf_Die *sc_die, Dwarf_Addr pc, - Dwarf_Addr addr, Dwarf_Die *die_mem, - int *offset); +Dwarf_Die *die_find_variable_by_addr(Dwarf_Die *sc_die, Dwarf_Addr addr, + Dwarf_Die *die_mem, int *offset); /* Save all variables and parameters in this scope */ void die_collect_vars(Dwarf_Die *sc_die, struct die_var_type **var_types); +/* Save all global variables in this CU */ +void 
die_collect_global_vars(Dwarf_Die *cu_die, struct die_var_type **var_types); + #else /* HAVE_DWARF_GETLOCATIONS_SUPPORT */ static inline int die_get_var_range(Dwarf_Die *sp_die __maybe_unused, @@ -182,7 +194,6 @@ static inline Dwarf_Die *die_find_variable_by_reg(Dwarf_Die *sc_die __maybe_unus } static inline Dwarf_Die *die_find_variable_by_addr(Dwarf_Die *sc_die __maybe_unused, - Dwarf_Addr pc __maybe_unused, Dwarf_Addr addr __maybe_unused, Dwarf_Die *die_mem __maybe_unused, int *offset __maybe_unused) @@ -195,6 +206,11 @@ static inline void die_collect_vars(Dwarf_Die *sc_die __maybe_unused, { } +static inline void die_collect_global_vars(Dwarf_Die *cu_die __maybe_unused, + struct die_var_type **var_types __maybe_unused) +{ +} + #endif /* HAVE_DWARF_GETLOCATIONS_SUPPORT */ #ifdef HAVE_DWARF_CFI_SUPPORT diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 198903157f..f32f9abf63 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -726,7 +726,7 @@ int machine__resolve(struct machine *machine, struct addr_location *al, dso = al->map ? map__dso(al->map) : NULL; dump_printf(" ...... dso: %s\n", dso - ? dso->long_name + ? dso__long_name(dso) : (al->level == 'H' ? 
"[hypervisor]" : "")); if (thread__is_filtered(thread)) @@ -750,10 +750,10 @@ int machine__resolve(struct machine *machine, struct addr_location *al, if (al->map) { if (symbol_conf.dso_list && (!dso || !(strlist__has_entry(symbol_conf.dso_list, - dso->short_name) || - (dso->short_name != dso->long_name && + dso__short_name(dso)) || + (dso__short_name(dso) != dso__long_name(dso) && strlist__has_entry(symbol_conf.dso_list, - dso->long_name))))) { + dso__long_name(dso)))))) { al->filtered |= (1 << HIST_FILTER__DSO); } diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 55a300a097..3a719edafc 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -298,7 +298,8 @@ struct evsel *evlist__add_aux_dummy(struct evlist *evlist, bool system_wide) #ifdef HAVE_LIBTRACEEVENT struct evsel *evlist__add_sched_switch(struct evlist *evlist, bool system_wide) { - struct evsel *evsel = evsel__newtp_idx("sched", "sched_switch", 0); + struct evsel *evsel = evsel__newtp_idx("sched", "sched_switch", 0, + /*format=*/true); if (IS_ERR(evsel)) return evsel; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 3536404e94..4f818ab6b6 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -452,7 +452,7 @@ out_err: * Returns pointer with encoded error via interface. 
*/ #ifdef HAVE_LIBTRACEEVENT -struct evsel *evsel__newtp_idx(const char *sys, const char *name, int idx) +struct evsel *evsel__newtp_idx(const char *sys, const char *name, int idx, bool format) { struct evsel *evsel = zalloc(perf_evsel__object.size); int err = -ENOMEM; @@ -469,14 +469,20 @@ struct evsel *evsel__newtp_idx(const char *sys, const char *name, int idx) if (asprintf(&evsel->name, "%s:%s", sys, name) < 0) goto out_free; - evsel->tp_format = trace_event__tp_format(sys, name); - if (IS_ERR(evsel->tp_format)) { - err = PTR_ERR(evsel->tp_format); - goto out_free; + event_attr_init(&attr); + + if (format) { + evsel->tp_format = trace_event__tp_format(sys, name); + if (IS_ERR(evsel->tp_format)) { + err = PTR_ERR(evsel->tp_format); + goto out_free; + } + attr.config = evsel->tp_format->id; + } else { + attr.config = (__u64) -1; } - event_attr_init(&attr); - attr.config = evsel->tp_format->id; + attr.sample_period = 1; evsel__init(evsel, &attr, idx); } diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 517cff431d..375a38e15c 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -234,14 +234,14 @@ void free_config_terms(struct list_head *config_terms); #ifdef HAVE_LIBTRACEEVENT -struct evsel *evsel__newtp_idx(const char *sys, const char *name, int idx); +struct evsel *evsel__newtp_idx(const char *sys, const char *name, int idx, bool format); /* * Returns pointer with encoded error via interface. 
*/ static inline struct evsel *evsel__newtp(const char *sys, const char *name) { - return evsel__newtp_idx(sys, name, 0); + return evsel__newtp_idx(sys, name, 0, true); } #endif diff --git a/tools/perf/util/genelf.h b/tools/perf/util/genelf.h index 5f18d20ea9..4e2e4f40e1 100644 --- a/tools/perf/util/genelf.h +++ b/tools/perf/util/genelf.h @@ -43,6 +43,9 @@ int jit_add_debug_info(Elf *e, uint64_t code_addr, void *debug, int nr_debug_ent #elif defined(__riscv) && __riscv_xlen == 64 #define GEN_ELF_ARCH EM_RISCV #define GEN_ELF_CLASS ELFCLASS64 +#elif defined(__riscv) && __riscv_xlen == 32 +#define GEN_ELF_ARCH EM_RISCV +#define GEN_ELF_CLASS ELFCLASS32 #elif defined(__loongarch__) #define GEN_ELF_ARCH EM_LOONGARCH #define GEN_ELF_CLASS ELFCLASS64 diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 3fe28edc3d..55e9553861 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -2308,7 +2308,7 @@ static int __event_process_build_id(struct perf_record_header_build_id *bev, build_id__init(&bid, bev->data, size); dso__set_build_id(dso, &bid); - dso->header_build_id = 1; + dso__set_header_build_id(dso, true); if (dso_space != DSO_SPACE__USER) { struct kmod_path m = { .name = NULL, }; @@ -2316,13 +2316,13 @@ static int __event_process_build_id(struct perf_record_header_build_id *bev, if (!kmod_path__parse_name(&m, filename) && m.kmod) dso__set_module_info(dso, &m, machine); - dso->kernel = dso_space; + dso__set_kernel(dso, dso_space); free(m.name); } - build_id__sprintf(&dso->bid, sbuild_id); + build_id__sprintf(dso__bid(dso), sbuild_id); pr_debug("build id event received for %s: %s [%zu]\n", - dso->long_name, sbuild_id, size); + dso__long_name(dso), sbuild_id, size); dso__put(dso); } diff --git a/tools/perf/util/help-unknown-cmd.c b/tools/perf/util/help-unknown-cmd.c index eab99ea6ac..a0a46e34f8 100644 --- a/tools/perf/util/help-unknown-cmd.c +++ b/tools/perf/util/help-unknown-cmd.c @@ -52,46 +52,48 @@ static int add_cmd_list(struct 
cmdnames *cmds, struct cmdnames *old) return 0; } -const char *help_unknown_cmd(const char *cmd) +const char *help_unknown_cmd(const char *cmd, struct cmdnames *main_cmds) { unsigned int i, n = 0, best_similarity = 0; - struct cmdnames main_cmds, other_cmds; + struct cmdnames other_cmds; - memset(&main_cmds, 0, sizeof(main_cmds)); - memset(&other_cmds, 0, sizeof(main_cmds)); + memset(&other_cmds, 0, sizeof(other_cmds)); perf_config(perf_unknown_cmd_config, NULL); - load_command_list("perf-", &main_cmds, &other_cmds); + load_command_list("perf-", main_cmds, &other_cmds); - if (add_cmd_list(&main_cmds, &other_cmds) < 0) { + if (add_cmd_list(main_cmds, &other_cmds) < 0) { fprintf(stderr, "ERROR: Failed to allocate command list for unknown command.\n"); goto end; } - qsort(main_cmds.names, main_cmds.cnt, - sizeof(main_cmds.names), cmdname_compare); - uniq(&main_cmds); + qsort(main_cmds->names, main_cmds->cnt, + sizeof(main_cmds->names), cmdname_compare); + uniq(main_cmds); - if (main_cmds.cnt) { + if (main_cmds->cnt) { /* This reuses cmdname->len for similarity index */ - for (i = 0; i < main_cmds.cnt; ++i) - main_cmds.names[i]->len = - levenshtein(cmd, main_cmds.names[i]->name, 0, 2, 1, 4); - - qsort(main_cmds.names, main_cmds.cnt, - sizeof(*main_cmds.names), levenshtein_compare); + for (i = 0; i < main_cmds->cnt; ++i) { + main_cmds->names[i]->len = + levenshtein(cmd, main_cmds->names[i]->name, + /*swap_penalty=*/0, + /*substition_penality=*/2, + /*insertion_penality=*/1, + /*deletion_penalty=*/1); + } + qsort(main_cmds->names, main_cmds->cnt, + sizeof(*main_cmds->names), levenshtein_compare); - best_similarity = main_cmds.names[0]->len; + best_similarity = main_cmds->names[0]->len; n = 1; - while (n < main_cmds.cnt && best_similarity == main_cmds.names[n]->len) + while (n < main_cmds->cnt && best_similarity == main_cmds->names[n]->len) ++n; } if (autocorrect && n == 1) { - const char *assumed = main_cmds.names[0]->name; + const char *assumed = 
main_cmds->names[0]->name; - main_cmds.names[0] = NULL; - clean_cmdnames(&main_cmds); + main_cmds->names[0] = NULL; clean_cmdnames(&other_cmds); fprintf(stderr, "WARNING: You called a perf program named '%s', " "which does not exist.\n" @@ -107,15 +109,14 @@ const char *help_unknown_cmd(const char *cmd) fprintf(stderr, "perf: '%s' is not a perf-command. See 'perf --help'.\n", cmd); - if (main_cmds.cnt && best_similarity < 6) { + if (main_cmds->cnt && best_similarity < 6) { fprintf(stderr, "\nDid you mean %s?\n", n < 2 ? "this": "one of these"); for (i = 0; i < n; i++) - fprintf(stderr, "\t%s\n", main_cmds.names[i]->name); + fprintf(stderr, "\t%s\n", main_cmds->names[i]->name); } end: - clean_cmdnames(&main_cmds); clean_cmdnames(&other_cmds); - exit(1); + return NULL; } diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index fa359180eb..2e9e193179 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -9,6 +9,7 @@ #include "map_symbol.h" #include "branch.h" #include "mem-events.h" +#include "mem-info.h" #include "session.h" #include "namespaces.h" #include "cgroup.h" @@ -153,8 +154,8 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h) } if (h->mem_info) { - if (h->mem_info->daddr.ms.sym) { - symlen = (int)h->mem_info->daddr.ms.sym->namelen + 4 + if (mem_info__daddr(h->mem_info)->ms.sym) { + symlen = (int)mem_info__daddr(h->mem_info)->ms.sym->namelen + 4 + unresolved_col_width + 2; hists__new_col_len(hists, HISTC_MEM_DADDR_SYMBOL, symlen); @@ -168,8 +169,8 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h) symlen); } - if (h->mem_info->iaddr.ms.sym) { - symlen = (int)h->mem_info->iaddr.ms.sym->namelen + 4 + if (mem_info__iaddr(h->mem_info)->ms.sym) { + symlen = (int)mem_info__iaddr(h->mem_info)->ms.sym->namelen + 4 + unresolved_col_width + 2; hists__new_col_len(hists, HISTC_MEM_IADDR_SYMBOL, symlen); @@ -179,8 +180,8 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h) symlen); } - if 
(h->mem_info->daddr.ms.map) { - symlen = dso__name_len(map__dso(h->mem_info->daddr.ms.map)); + if (mem_info__daddr(h->mem_info)->ms.map) { + symlen = dso__name_len(map__dso(mem_info__daddr(h->mem_info)->ms.map)); hists__new_col_len(hists, HISTC_MEM_DADDR_DSO, symlen); } else { @@ -308,6 +309,9 @@ static void he_stat__add_stat(struct he_stat *dest, struct he_stat *src) dest->period_us += src->period_us; dest->period_guest_sys += src->period_guest_sys; dest->period_guest_us += src->period_guest_us; + dest->weight1 += src->weight1; + dest->weight2 += src->weight2; + dest->weight3 += src->weight3; dest->nr_events += src->nr_events; } @@ -315,7 +319,9 @@ static void he_stat__decay(struct he_stat *he_stat) { he_stat->period = (he_stat->period * 7) / 8; he_stat->nr_events = (he_stat->nr_events * 7) / 8; - /* XXX need decay for weight too? */ + he_stat->weight1 = (he_stat->weight1 * 7) / 8; + he_stat->weight2 = (he_stat->weight2 * 7) / 8; + he_stat->weight3 = (he_stat->weight3 * 7) / 8; } static void hists__delete_entry(struct hists *hists, struct hist_entry *he); @@ -470,11 +476,6 @@ static int hist_entry__init(struct hist_entry *he, he->branch_info->to.ms.map = map__get(he->branch_info->to.ms.map); } - if (he->mem_info) { - he->mem_info->iaddr.ms.map = map__get(he->mem_info->iaddr.ms.map); - he->mem_info->daddr.ms.map = map__get(he->mem_info->daddr.ms.map); - } - if (hist_entry__has_callchains(he) && symbol_conf.use_callchain) callchain_init(he->callchain); @@ -520,8 +521,8 @@ err_infos: zfree(&he->branch_info); } if (he->mem_info) { - map_symbol__exit(&he->mem_info->iaddr.ms); - map_symbol__exit(&he->mem_info->daddr.ms); + map_symbol__exit(&mem_info__iaddr(he->mem_info)->ms); + map_symbol__exit(&mem_info__daddr(he->mem_info)->ms); } err: map_symbol__exit(&he->ms); @@ -566,7 +567,6 @@ static struct hist_entry *hist_entry__new(struct hist_entry *template, he = NULL; } } - return he; } @@ -614,7 +614,7 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists, 
cmp = hist_entry__cmp(he, entry); if (!cmp) { if (sample_self) { - he_stat__add_period(&he->stat, period); + he_stat__add_stat(&he->stat, &entry->stat); hist_entry__add_callchain_period(he, period); } if (symbol_conf.cumulate_callchain) @@ -626,7 +626,7 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists, */ mem_info__zput(entry->mem_info); - block_info__zput(entry->block_info); + block_info__delete(entry->block_info); kvm_info__zput(entry->kvm_info); @@ -731,12 +731,15 @@ __hists__add_entry(struct hists *hists, .stat = { .nr_events = 1, .period = sample->period, + .weight1 = sample->weight, + .weight2 = sample->ins_lat, + .weight3 = sample->p_stage_cyc, }, .parent = sym_parent, .filtered = symbol__parent_filter(sym_parent) | al->filtered, .hists = hists, .branch_info = bi, - .mem_info = mi, + .mem_info = mem_info__get(mi), .kvm_info = ki, .block_info = block_info, .transaction = sample->transaction, @@ -825,7 +828,7 @@ iter_prepare_mem_entry(struct hist_entry_iter *iter, struct addr_location *al) if (mi == NULL) return -ENOMEM; - iter->priv = mi; + iter->mi = mi; return 0; } @@ -833,7 +836,7 @@ static int iter_add_single_mem_entry(struct hist_entry_iter *iter, struct addr_location *al) { u64 cost; - struct mem_info *mi = iter->priv; + struct mem_info *mi = iter->mi; struct hists *hists = evsel__hists(iter->evsel); struct perf_sample *sample = iter->sample; struct hist_entry *he; @@ -880,12 +883,7 @@ iter_finish_mem_entry(struct hist_entry_iter *iter, err = hist_entry__append_callchain(he, iter->sample); out: - /* - * We don't need to free iter->priv (mem_info) here since the mem info - * was either already freed in hists__findnew_entry() or passed to a - * new hist entry by hist_entry__new(). 
- */ - iter->priv = NULL; + mem_info__zput(iter->mi); iter->he = NULL; return err; @@ -904,7 +902,7 @@ iter_prepare_branch_entry(struct hist_entry_iter *iter, struct addr_location *al iter->curr = 0; iter->total = sample->branch_stack->nr; - iter->priv = bi; + iter->bi = bi; return 0; } @@ -918,7 +916,7 @@ iter_add_single_branch_entry(struct hist_entry_iter *iter __maybe_unused, static int iter_next_branch_entry(struct hist_entry_iter *iter, struct addr_location *al) { - struct branch_info *bi = iter->priv; + struct branch_info *bi = iter->bi; int i = iter->curr; if (bi == NULL) @@ -947,7 +945,7 @@ iter_add_next_branch_entry(struct hist_entry_iter *iter, struct addr_location *a int i = iter->curr; int err = 0; - bi = iter->priv; + bi = iter->bi; if (iter->hide_unresolved && !(bi[i].from.ms.sym && bi[i].to.ms.sym)) goto out; @@ -976,7 +974,7 @@ static int iter_finish_branch_entry(struct hist_entry_iter *iter, struct addr_location *al __maybe_unused) { - zfree(&iter->priv); + zfree(&iter->bi); iter->he = NULL; return iter->curr >= iter->total ? 
0 : -1; @@ -1044,7 +1042,7 @@ iter_prepare_cumulative_entry(struct hist_entry_iter *iter, if (he_cache == NULL) return -ENOMEM; - iter->priv = he_cache; + iter->he_cache = he_cache; iter->curr = 0; return 0; @@ -1057,7 +1055,7 @@ iter_add_single_cumulative_entry(struct hist_entry_iter *iter, struct evsel *evsel = iter->evsel; struct hists *hists = evsel__hists(evsel); struct perf_sample *sample = iter->sample; - struct hist_entry **he_cache = iter->priv; + struct hist_entry **he_cache = iter->he_cache; struct hist_entry *he; int err = 0; @@ -1115,7 +1113,7 @@ iter_add_next_cumulative_entry(struct hist_entry_iter *iter, { struct evsel *evsel = iter->evsel; struct perf_sample *sample = iter->sample; - struct hist_entry **he_cache = iter->priv; + struct hist_entry **he_cache = iter->he_cache; struct hist_entry *he; struct hist_entry he_tmp = { .hists = evsel__hists(evsel), @@ -1181,7 +1179,9 @@ static int iter_finish_cumulative_entry(struct hist_entry_iter *iter, struct addr_location *al __maybe_unused) { - zfree(&iter->priv); + mem_info__zput(iter->mi); + zfree(&iter->bi); + zfree(&iter->he_cache); iter->he = NULL; return 0; @@ -1327,13 +1327,13 @@ void hist_entry__delete(struct hist_entry *he) } if (he->mem_info) { - map_symbol__exit(&he->mem_info->iaddr.ms); - map_symbol__exit(&he->mem_info->daddr.ms); + map_symbol__exit(&mem_info__iaddr(he->mem_info)->ms); + map_symbol__exit(&mem_info__daddr(he->mem_info)->ms); mem_info__zput(he->mem_info); } if (he->block_info) - block_info__zput(he->block_info); + block_info__delete(he->block_info); if (he->kvm_info) kvm_info__zput(he->kvm_info); @@ -2128,7 +2128,7 @@ static bool hists__filter_entry_by_dso(struct hists *hists, struct hist_entry *he) { if (hists->dso_filter != NULL && - (he->ms.map == NULL || map__dso(he->ms.map) != hists->dso_filter)) { + (he->ms.map == NULL || !RC_CHK_EQUAL(map__dso(he->ms.map), hists->dso_filter))) { he->filtered |= (1 << HIST_FILTER__DSO); return true; } @@ -2808,7 +2808,7 @@ int 
__hists__scnprintf_title(struct hists *hists, char *bf, size_t size, bool sh } if (dso) printed += scnprintf(bf + printed, size - printed, - ", DSO: %s", dso->short_name); + ", DSO: %s", dso__short_name(dso)); if (socket_id > -1) printed += scnprintf(bf + printed, size - printed, ", Processor Socket: %d", socket_id); diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 4a0aea0c9e..8fb3bdd291 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -4,21 +4,22 @@ #include #include -#include "evsel.h" +#include "callchain.h" #include "color.h" #include "events_stats.h" +#include "evsel.h" +#include "map_symbol.h" #include "mutex.h" +#include "sample.h" +#include "spark.h" +#include "stat.h" -struct hist_entry; -struct hist_entry_ops; struct addr_location; -struct map_symbol; struct mem_info; struct kvm_info; struct branch_info; struct branch_stack; struct block_info; -struct symbol; struct ui_progress; enum hist_filter { @@ -131,18 +132,20 @@ struct hist_entry_iter { int total; int curr; - bool hide_unresolved; - struct evsel *evsel; struct perf_sample *sample; struct hist_entry *he; struct symbol *parent; - void *priv; + + struct mem_info *mi; + struct branch_info *bi; + struct hist_entry **he_cache; const struct hist_iter_ops *ops; /* user-defined callback function (optional) */ int (*add_entry_cb)(struct hist_entry_iter *iter, struct addr_location *al, bool single, void *arg); + bool hide_unresolved; }; extern const struct hist_iter_ops hist_iter_normal; @@ -150,6 +153,162 @@ extern const struct hist_iter_ops hist_iter_branch; extern const struct hist_iter_ops hist_iter_mem; extern const struct hist_iter_ops hist_iter_cumulative; +struct res_sample { + u64 time; + int cpu; + int tid; +}; + +struct he_stat { + u64 period; + u64 period_sys; + u64 period_us; + u64 period_guest_sys; + u64 period_guest_us; + u64 weight1; + u64 weight2; + u64 weight3; + u32 nr_events; +}; + +struct namespace_id { + u64 dev; + u64 ino; +}; + +struct 
hist_entry_diff { + bool computed; + union { + /* PERF_HPP__DELTA */ + double period_ratio_delta; + + /* PERF_HPP__RATIO */ + double period_ratio; + + /* HISTC_WEIGHTED_DIFF */ + s64 wdiff; + + /* PERF_HPP_DIFF__CYCLES */ + s64 cycles; + }; + struct stats stats; + unsigned long svals[NUM_SPARKS]; +}; + +struct hist_entry_ops { + void *(*new)(size_t size); + void (*free)(void *ptr); +}; + +/** + * struct hist_entry - histogram entry + * + * @row_offset - offset from the first callchain expanded to appear on screen + * @nr_rows - rows expanded in callchain, recalculated on folding/unfolding + */ +struct hist_entry { + struct rb_node rb_node_in; + struct rb_node rb_node; + union { + struct list_head node; + struct list_head head; + } pairs; + struct he_stat stat; + struct he_stat *stat_acc; + struct map_symbol ms; + struct thread *thread; + struct comm *comm; + struct namespace_id cgroup_id; + u64 cgroup; + u64 ip; + u64 transaction; + s32 socket; + s32 cpu; + u64 code_page_size; + u64 weight; + u64 ins_lat; + u64 p_stage_cyc; + u8 cpumode; + u8 depth; + int mem_type_off; + struct simd_flags simd_flags; + + /* We are added by hists__add_dummy_entry. */ + bool dummy; + bool leaf; + + char level; + u8 filtered; + + u16 callchain_size; + union { + /* + * Since perf diff only supports the stdio output, TUI + * fields are only accessed from perf report (or perf + * top). So make it a union to reduce memory usage. 
+ */ + struct hist_entry_diff diff; + struct /* for TUI */ { + u16 row_offset; + u16 nr_rows; + bool init_have_children; + bool unfolded; + bool has_children; + bool has_no_entry; + }; + }; + char *srcline; + char *srcfile; + struct symbol *parent; + struct branch_info *branch_info; + long time; + struct hists *hists; + struct mem_info *mem_info; + struct block_info *block_info; + struct kvm_info *kvm_info; + void *raw_data; + u32 raw_size; + int num_res; + struct res_sample *res_samples; + void *trace_output; + struct perf_hpp_list *hpp_list; + struct hist_entry *parent_he; + struct hist_entry_ops *ops; + struct annotated_data_type *mem_type; + union { + /* this is for hierarchical entry structure */ + struct { + struct rb_root_cached hroot_in; + struct rb_root_cached hroot_out; + }; /* non-leaf entries */ + struct rb_root sorted_chain; /* leaf entry has callchains */ + }; + struct callchain_root callchain[0]; /* must be last member */ +}; + +static __pure inline bool hist_entry__has_callchains(struct hist_entry *he) +{ + return he->callchain_size != 0; +} + +static inline bool hist_entry__has_pairs(struct hist_entry *he) +{ + return !list_empty(&he->pairs.node); +} + +static inline struct hist_entry *hist_entry__next_pair(struct hist_entry *he) +{ + if (hist_entry__has_pairs(he)) + return list_entry(he->pairs.node.next, struct hist_entry, pairs.node); + return NULL; +} + +static inline void hist_entry__add_pair(struct hist_entry *pair, + struct hist_entry *he) +{ + list_add_tail(&pair->pairs.node, &he->pairs.head); +} + struct hist_entry *hists__add_entry(struct hists *hists, struct addr_location *al, struct symbol *parent, @@ -186,6 +345,8 @@ int hist_entry__sort_snprintf(struct hist_entry *he, char *bf, size_t size, struct hists *hists); int hist_entry__snprintf_alignment(struct hist_entry *he, struct perf_hpp *hpp, struct perf_hpp_fmt *fmt, int printed); +int hist_entry__sym_snprintf(struct hist_entry *he, char *bf, size_t size, + unsigned int width); void 
hist_entry__delete(struct hist_entry *he); typedef int (*hists__resort_cb_t)(struct hist_entry *he, void *arg); @@ -238,6 +399,20 @@ void hists__match(struct hists *leader, struct hists *other); int hists__link(struct hists *leader, struct hists *other); int hists__unlink(struct hists *hists); +static inline float hist_entry__get_percent_limit(struct hist_entry *he) +{ + u64 period = he->stat.period; + u64 total_period = hists__total_period(he->hists); + + if (unlikely(total_period == 0)) + return 0; + + if (symbol_conf.cumulate_callchain) + period = he->stat_acc->period; + + return period * 100.0 / total_period; +} + struct hists_evsel { struct evsel evsel; struct hists hists; @@ -377,6 +552,9 @@ enum { PERF_HPP__OVERHEAD_ACC, PERF_HPP__SAMPLES, PERF_HPP__PERIOD, + PERF_HPP__WEIGHT1, + PERF_HPP__WEIGHT2, + PERF_HPP__WEIGHT3, PERF_HPP__MAX_INDEX }; @@ -423,16 +601,24 @@ void perf_hpp__reset_sort_width(struct perf_hpp_fmt *fmt, struct hists *hists); void perf_hpp__set_user_width(const char *width_list_str); void hists__reset_column_width(struct hists *hists); +enum perf_hpp_fmt_type { + PERF_HPP_FMT_TYPE__RAW, + PERF_HPP_FMT_TYPE__PERCENT, + PERF_HPP_FMT_TYPE__AVERAGE, +}; + typedef u64 (*hpp_field_fn)(struct hist_entry *he); typedef int (*hpp_callback_fn)(struct perf_hpp *hpp, bool front); typedef int (*hpp_snprint_fn)(struct perf_hpp *hpp, const char *fmt, ...); int hpp__fmt(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, struct hist_entry *he, hpp_field_fn get_field, - const char *fmtstr, hpp_snprint_fn print_fn, bool fmt_percent); + const char *fmtstr, hpp_snprint_fn print_fn, + enum perf_hpp_fmt_type fmtype); int hpp__fmt_acc(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, struct hist_entry *he, hpp_field_fn get_field, - const char *fmtstr, hpp_snprint_fn print_fn, bool fmt_percent); + const char *fmtstr, hpp_snprint_fn print_fn, + enum perf_hpp_fmt_type fmtype); static inline void advance_hpp(struct perf_hpp *hpp, int inc) { @@ -460,15 +646,20 @@ struct 
hist_browser_timer { int refresh; }; -struct res_sample; - enum rstype { A_NORMAL, A_ASM, A_SOURCE }; -struct block_hist; +struct block_hist { + struct hists block_hists; + struct perf_hpp_list block_list; + struct perf_hpp_fmt block_fmt; + int block_idx; + bool valid; + struct hist_entry he; +}; #ifdef HAVE_SLANG_SUPPORT #include "../ui/keysyms.h" diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index 4db9a098f5..d6d7b75125 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -598,15 +598,15 @@ static struct auxtrace_cache *intel_pt_cache(struct dso *dso, struct auxtrace_cache *c; unsigned int bits; - if (dso->auxtrace_cache) - return dso->auxtrace_cache; + if (dso__auxtrace_cache(dso)) + return dso__auxtrace_cache(dso); bits = intel_pt_cache_size(dso, machine); /* Ignoring cache creation failure */ c = auxtrace_cache__new(bits, sizeof(struct intel_pt_cache_entry), 200); - dso->auxtrace_cache = c; + dso__set_auxtrace_cache(dso, c); return c; } @@ -650,7 +650,7 @@ intel_pt_cache_lookup(struct dso *dso, struct machine *machine, u64 offset) if (!c) return NULL; - return auxtrace_cache__lookup(dso->auxtrace_cache, offset); + return auxtrace_cache__lookup(dso__auxtrace_cache(dso), offset); } static void intel_pt_cache_invalidate(struct dso *dso, struct machine *machine, @@ -661,7 +661,7 @@ static void intel_pt_cache_invalidate(struct dso *dso, struct machine *machine, if (!c) return; - auxtrace_cache__remove(dso->auxtrace_cache, offset); + auxtrace_cache__remove(dso__auxtrace_cache(dso), offset); } static inline bool intel_pt_guest_kernel_ip(uint64_t ip) @@ -821,8 +821,8 @@ static int intel_pt_walk_next_insn(struct intel_pt_insn *intel_pt_insn, } dso = map__dso(al.map); - if (dso->data.status == DSO_DATA_STATUS_ERROR && - dso__data_status_seen(dso, DSO_DATA_STATUS_SEEN_ITRACE)) { + if (dso__data(dso)->status == DSO_DATA_STATUS_ERROR && + dso__data_status_seen(dso, DSO_DATA_STATUS_SEEN_ITRACE)) { ret = -ENOENT; goto out_ret; } 
@@ -855,7 +855,7 @@ static int intel_pt_walk_next_insn(struct intel_pt_insn *intel_pt_insn, /* Load maps to ensure dso->is_64_bit has been updated */ map__load(al.map); - x86_64 = dso->is_64_bit; + x86_64 = dso__is_64_bit(dso); while (1) { len = dso__data_read_offset(dso, machine, @@ -1010,7 +1010,7 @@ static int __intel_pt_pgd_ip(uint64_t ip, void *data) offset = map__map_ip(al.map, ip); - res = intel_pt_match_pgd_ip(ptq->pt, ip, offset, map__dso(al.map)->long_name); + res = intel_pt_match_pgd_ip(ptq->pt, ip, offset, dso__long_name(map__dso(al.map))); addr_location__exit(&al); return res; } @@ -3418,7 +3418,7 @@ static int intel_pt_text_poke(struct intel_pt *pt, union perf_event *event) } dso = map__dso(al.map); - if (!dso || !dso->auxtrace_cache) + if (!dso || !dso__auxtrace_cache(dso)) continue; offset = map__map_ip(al.map, addr); @@ -3438,7 +3438,7 @@ static int intel_pt_text_poke(struct intel_pt *pt, union perf_event *event) } else { intel_pt_cache_invalidate(dso, machine, offset); intel_pt_log("Invalidated instruction cache for %s at %#"PRIx64"\n", - dso->long_name, addr); + dso__long_name(dso), addr); } } out: diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 07c22f765f..8477edefc2 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -16,6 +16,7 @@ #include "map_symbol.h" #include "branch.h" #include "mem-events.h" +#include "mem-info.h" #include "path.h" #include "srcline.h" #include "symbol.h" @@ -48,13 +49,6 @@ static struct dso *machine__kernel_dso(struct machine *machine) return map__dso(machine->vmlinux_map); } -static void dsos__init(struct dsos *dsos) -{ - INIT_LIST_HEAD(&dsos->head); - dsos->root = RB_ROOT; - init_rwsem(&dsos->lock); -} - static int machine__set_mmap_name(struct machine *machine) { if (machine__is_host(machine)) @@ -165,28 +159,6 @@ struct machine *machine__new_kallsyms(void) return machine; } -static void dsos__purge(struct dsos *dsos) -{ - struct dso *pos, *n; - - 
down_write(&dsos->lock); - - list_for_each_entry_safe(pos, n, &dsos->head, node) { - RB_CLEAR_NODE(&pos->rb_node); - pos->root = NULL; - list_del_init(&pos->node); - dso__put(pos); - } - - up_write(&dsos->lock); -} - -static void dsos__exit(struct dsos *dsos) -{ - dsos__purge(dsos); - exit_rwsem(&dsos->lock); -} - void machine__delete_threads(struct machine *machine) { threads__remove_all_threads(&machine->threads); @@ -675,31 +647,6 @@ int machine__process_lost_samples_event(struct machine *machine __maybe_unused, return 0; } -static struct dso *machine__findnew_module_dso(struct machine *machine, - struct kmod_path *m, - const char *filename) -{ - struct dso *dso; - - down_write(&machine->dsos.lock); - - dso = __dsos__find(&machine->dsos, m->name, true); - if (!dso) { - dso = __dsos__addnew(&machine->dsos, m->name); - if (dso == NULL) - goto out_unlock; - - dso__set_module_info(dso, m, machine); - dso__set_long_name(dso, strdup(filename), true); - dso->kernel = DSO_SPACE__KERNEL; - } - - dso__get(dso); -out_unlock: - up_write(&machine->dsos.lock); - return dso; -} - int machine__process_aux_event(struct machine *machine __maybe_unused, union perf_event *event) { @@ -737,7 +684,7 @@ static int machine__process_ksymbol_register(struct machine *machine, struct perf_sample *sample __maybe_unused) { struct symbol *sym; - struct dso *dso; + struct dso *dso = NULL; struct map *map = maps__find(machine__kernel_maps(machine), event->ksymbol.addr); int err = 0; @@ -748,16 +695,15 @@ static int machine__process_ksymbol_register(struct machine *machine, err = -ENOMEM; goto out; } - dso->kernel = DSO_SPACE__KERNEL; + dso__set_kernel(dso, DSO_SPACE__KERNEL); map = map__new2(0, dso); - dso__put(dso); if (!map) { err = -ENOMEM; goto out; } if (event->ksymbol.ksym_type == PERF_RECORD_KSYMBOL_TYPE_OOL) { - dso->binary_type = DSO_BINARY_TYPE__OOL; - dso->data.file_size = event->ksymbol.len; + dso__set_binary_type(dso, DSO_BINARY_TYPE__OOL); + dso__data(dso)->file_size = 
event->ksymbol.len; dso__set_loaded(dso); } @@ -772,11 +718,11 @@ static int machine__process_ksymbol_register(struct machine *machine, dso__set_loaded(dso); if (is_bpf_image(event->ksymbol.name)) { - dso->binary_type = DSO_BINARY_TYPE__BPF_IMAGE; + dso__set_binary_type(dso, DSO_BINARY_TYPE__BPF_IMAGE); dso__set_long_name(dso, "", false); } } else { - dso = map__dso(map); + dso = dso__get(map__dso(map)); } sym = symbol__new(map__map_ip(map, map__start(map)), @@ -789,6 +735,7 @@ static int machine__process_ksymbol_register(struct machine *machine, dso__insert_symbol(dso, sym); out: map__put(map); + dso__put(dso); return err; } @@ -883,7 +830,7 @@ static struct map *machine__addnew_module_map(struct machine *machine, u64 start if (kmod_path__parse_name(&m, filename)) return NULL; - dso = machine__findnew_module_dso(machine, &m, filename); + dso = dsos__findnew_module_dso(&machine->dsos, machine, &m, filename); if (dso == NULL) goto out; @@ -907,11 +854,11 @@ out: size_t machines__fprintf_dsos(struct machines *machines, FILE *fp) { struct rb_node *nd; - size_t ret = __dsos__fprintf(&machines->host.dsos.head, fp); + size_t ret = dsos__fprintf(&machines->host.dsos, fp); for (nd = rb_first_cached(&machines->guests); nd; nd = rb_next(nd)) { struct machine *pos = rb_entry(nd, struct machine, rb_node); - ret += __dsos__fprintf(&pos->dsos.head, fp); + ret += dsos__fprintf(&pos->dsos, fp); } return ret; @@ -920,7 +867,7 @@ size_t machines__fprintf_dsos(struct machines *machines, FILE *fp) size_t machine__fprintf_dsos_buildid(struct machine *m, FILE *fp, bool (skip)(struct dso *dso, int parm), int parm) { - return __dsos__fprintf_buildid(&m->dsos.head, fp, skip, parm); + return dsos__fprintf_buildid(&m->dsos, fp, skip, parm); } size_t machines__fprintf_dsos_buildid(struct machines *machines, FILE *fp, @@ -942,17 +889,17 @@ size_t machine__fprintf_vmlinux_path(struct machine *machine, FILE *fp) size_t printed = 0; struct dso *kdso = machine__kernel_dso(machine); - if 
(kdso->has_build_id) { + if (dso__has_build_id(kdso)) { char filename[PATH_MAX]; - if (dso__build_id_filename(kdso, filename, sizeof(filename), - false)) + + if (dso__build_id_filename(kdso, filename, sizeof(filename), false)) printed += fprintf(fp, "[0] %s\n", filename); } - for (i = 0; i < vmlinux_path__nr_entries; ++i) - printed += fprintf(fp, "[%d] %s\n", - i + kdso->has_build_id, vmlinux_path[i]); - + for (i = 0; i < vmlinux_path__nr_entries; ++i) { + printed += fprintf(fp, "[%d] %s\n", i + dso__has_build_id(kdso), + vmlinux_path[i]); + } return printed; } @@ -1002,7 +949,7 @@ static struct dso *machine__get_kernel(struct machine *machine) DSO_SPACE__KERNEL_GUEST); } - if (kernel != NULL && (!kernel->has_build_id)) + if (kernel != NULL && (!dso__has_build_id(kernel))) dso__read_running_kernel_build_id(kernel, machine); return kernel; @@ -1367,8 +1314,8 @@ static char *get_kernel_version(const char *root_dir) static bool is_kmod_dso(struct dso *dso) { - return dso->symtab_type == DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE || - dso->symtab_type == DSO_BINARY_TYPE__GUEST_KMODULE; + return dso__symtab_type(dso) == DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE || + dso__symtab_type(dso) == DSO_BINARY_TYPE__GUEST_KMODULE; } static int maps__set_module_path(struct maps *maps, const char *path, struct kmod_path *m) @@ -1395,8 +1342,8 @@ static int maps__set_module_path(struct maps *maps, const char *path, struct kmo * we need to update the symtab_type if needed. */ if (m->comp && is_kmod_dso(dso)) { - dso->symtab_type++; - dso->comp = m->comp; + dso__set_symtab_type(dso, dso__symtab_type(dso)); + dso__set_comp(dso, m->comp); } map__put(map); return 0; @@ -1616,16 +1563,14 @@ out_put: return ret; } -static bool machine__uses_kcore(struct machine *machine) +static int machine__uses_kcore_cb(struct dso *dso, void *data __maybe_unused) { - struct dso *dso; - - list_for_each_entry(dso, &machine->dsos.head, node) { - if (dso__is_kcore(dso)) - return true; - } + return dso__is_kcore(dso) ? 
1 : 0; +} - return false; +static bool machine__uses_kcore(struct machine *machine) +{ + return dsos__for_each_dso(&machine->dsos, machine__uses_kcore_cb, NULL) != 0 ? true : false; } static bool perf_event__is_extra_kernel_mmap(struct machine *machine, @@ -1692,53 +1637,20 @@ static int machine__process_kernel_mmap_event(struct machine *machine, * Should be there already, from the build-id table in * the header. */ - struct dso *kernel = NULL; - struct dso *dso; - - down_read(&machine->dsos.lock); - - list_for_each_entry(dso, &machine->dsos.head, node) { - - /* - * The cpumode passed to is_kernel_module is not the - * cpumode of *this* event. If we insist on passing - * correct cpumode to is_kernel_module, we should - * record the cpumode when we adding this dso to the - * linked list. - * - * However we don't really need passing correct - * cpumode. We know the correct cpumode must be kernel - * mode (if not, we should not link it onto kernel_dsos - * list). - * - * Therefore, we pass PERF_RECORD_MISC_CPUMODE_UNKNOWN. - * is_kernel_module() treats it as a kernel cpumode. 
- */ - - if (!dso->kernel || - is_kernel_module(dso->long_name, - PERF_RECORD_MISC_CPUMODE_UNKNOWN)) - continue; - - - kernel = dso__get(dso); - break; - } - - up_read(&machine->dsos.lock); + struct dso *kernel = dsos__find_kernel_dso(&machine->dsos); if (kernel == NULL) kernel = machine__findnew_dso(machine, machine->mmap_name); if (kernel == NULL) goto out_problem; - kernel->kernel = dso_space; + dso__set_kernel(kernel, dso_space); if (__machine__create_kernel_maps(machine, kernel) < 0) { dso__put(kernel); goto out_problem; } - if (strstr(kernel->long_name, "vmlinux")) + if (strstr(dso__long_name(kernel), "vmlinux")) dso__set_short_name(kernel, "[kernel.vmlinux]", false); if (machine__update_kernel_mmap(machine, xm->start, xm->end) < 0) { @@ -2101,11 +2013,11 @@ struct mem_info *sample__resolve_mem(struct perf_sample *sample, if (!mi) return NULL; - ip__resolve_ams(al->thread, &mi->iaddr, sample->ip); - ip__resolve_data(al->thread, al->cpumode, &mi->daddr, + ip__resolve_ams(al->thread, mem_info__iaddr(mi), sample->ip); + ip__resolve_data(al->thread, al->cpumode, mem_info__daddr(mi), sample->addr, sample->phys_addr, sample->data_page_size); - mi->data_src.val = sample->data_src; + mem_info__data_src(mi)->val = sample->data_src; return mi; } @@ -2120,14 +2032,14 @@ static char *callchain_srcline(struct map_symbol *ms, u64 ip) return srcline; dso = map__dso(map); - srcline = srcline__tree_find(&dso->srclines, ip); + srcline = srcline__tree_find(dso__srclines(dso), ip); if (!srcline) { bool show_sym = false; bool show_addr = callchain_param.key == CCKEY_ADDRESS; srcline = get_srcline(dso, map__rip_2objdump(map, ip), ms->sym, show_sym, show_addr, ip); - srcline__tree_insert(&dso->srclines, ip, srcline); + srcline__tree_insert(dso__srclines(dso), ip, srcline); } return srcline; @@ -2925,12 +2837,12 @@ static int append_inlines(struct callchain_cursor *cursor, struct map_symbol *ms addr = map__rip_2objdump(map, addr); dso = map__dso(map); - inline_node = 
inlines__tree_find(&dso->inlined_nodes, addr); + inline_node = inlines__tree_find(dso__inlined_nodes(dso), addr); if (!inline_node) { inline_node = dso__parse_addr_inlines(dso, addr, sym); if (!inline_node) return ret; - inlines__tree_insert(&dso->inlined_nodes, inline_node); + inlines__tree_insert(dso__inlined_nodes(dso), inline_node); } ilist_ms = (struct map_symbol) { @@ -3219,21 +3131,33 @@ char *machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, ch if (sym == NULL) return NULL; - *modp = __map__is_kmodule(map) ? (char *)map__dso(map)->short_name : NULL; + *modp = __map__is_kmodule(map) ? (char *)dso__short_name(map__dso(map)) : NULL; *addrp = map__unmap_ip(map, sym->start); return sym->name; } +struct machine__for_each_dso_cb_args { + struct machine *machine; + machine__dso_t fn; + void *priv; +}; + +static int machine__for_each_dso_cb(struct dso *dso, void *data) +{ + struct machine__for_each_dso_cb_args *args = data; + + return args->fn(dso, args->machine, args->priv); +} + int machine__for_each_dso(struct machine *machine, machine__dso_t fn, void *priv) { - struct dso *pos; - int err = 0; + struct machine__for_each_dso_cb_args args = { + .machine = machine, + .fn = fn, + .priv = priv, + }; - list_for_each_entry(pos, &machine->dsos.head, node) { - if (fn(pos, machine, priv)) - err = -1; - } - return err; + return dsos__for_each_dso(&machine->dsos, machine__for_each_dso_cb, &args); } int machine__for_each_kernel_map(struct machine *machine, machine__map_t fn, void *priv) @@ -3266,6 +3190,17 @@ bool machine__is_lock_function(struct machine *machine, u64 addr) sym = machine__find_kernel_symbol_by_name(machine, "__lock_text_end", &kmap); machine->lock.text_end = map__unmap_ip(kmap, sym->start); + + sym = machine__find_kernel_symbol_by_name(machine, "__traceiter_contention_begin", &kmap); + if (sym) { + machine->traceiter.text_start = map__unmap_ip(kmap, sym->start); + machine->traceiter.text_end = map__unmap_ip(kmap, sym->end); + } + sym = 
machine__find_kernel_symbol_by_name(machine, "trace_contention_begin", &kmap); + if (sym) { + machine->trace.text_start = map__unmap_ip(kmap, sym->start); + machine->trace.text_end = map__unmap_ip(kmap, sym->end); + } } /* failed to get kernel symbols */ @@ -3280,5 +3215,23 @@ bool machine__is_lock_function(struct machine *machine, u64 addr) if (machine->lock.text_start <= addr && addr < machine->lock.text_end) return true; + /* traceiter functions currently don't have their own section + * but we consider them lock functions + */ + if (machine->traceiter.text_start != 0) { + if (machine->traceiter.text_start <= addr && addr < machine->traceiter.text_end) + return true; + } + + if (machine->trace.text_start != 0) { + if (machine->trace.text_start <= addr && addr < machine->trace.text_end) + return true; + } + return false; } + +int machine__hit_all_dsos(struct machine *machine) +{ + return dsos__hit_all(&machine->dsos); +} diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h index e28c787616..82a47bac80 100644 --- a/tools/perf/util/machine.h +++ b/tools/perf/util/machine.h @@ -49,7 +49,7 @@ struct machine { struct { u64 text_start; u64 text_end; - } sched, lock; + } sched, lock, traceiter, trace; pid_t *current_tid; size_t current_tid_sz; union { /* Tool specific area */ @@ -306,4 +306,6 @@ int machine__map_x86_64_entry_trampolines(struct machine *machine, int machine__resolve(struct machine *machine, struct addr_location *al, struct perf_sample *sample); +int machine__hit_all_dsos(struct machine *machine); + #endif /* __PERF_MACHINE_H */ diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index 14a5ea70d8..e1d14936a6 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -168,7 +168,7 @@ struct map *map__new(struct machine *machine, u64 start, u64 len, if (dso == NULL) goto out_delete; - assert(!dso->kernel); + assert(!dso__kernel(dso)); map__init(result, start, start + len, pgoff, dso); if (anon || no_dso) { @@ -182,10 +182,9 @@ 
struct map *map__new(struct machine *machine, u64 start, u64 len, if (!(prot & PROT_EXEC)) dso__set_loaded(dso); } - mutex_lock(&dso->lock); - nsinfo__put(dso->nsinfo); - dso->nsinfo = nsi; - mutex_unlock(&dso->lock); + mutex_lock(dso__lock(dso)); + dso__set_nsinfo(dso, nsi); + mutex_unlock(dso__lock(dso)); if (build_id__is_defined(bid)) { dso__set_build_id(dso, bid); @@ -196,13 +195,12 @@ struct map *map__new(struct machine *machine, u64 start, u64 len, * reading the header will have the build ID set and all future mmaps will * have it missing. */ - down_read(&machine->dsos.lock); - header_bid_dso = __dsos__find(&machine->dsos, filename, false); - up_read(&machine->dsos.lock); - if (header_bid_dso && header_bid_dso->header_build_id) { - dso__set_build_id(dso, &header_bid_dso->bid); - dso->header_build_id = 1; + header_bid_dso = dsos__find(&machine->dsos, filename, false); + if (header_bid_dso && dso__header_build_id(header_bid_dso)) { + dso__set_build_id(dso, dso__bid(header_bid_dso)); + dso__set_header_build_id(dso, 1); } + dso__put(header_bid_dso); } dso__put(dso); } @@ -223,7 +221,7 @@ struct map *map__new2(u64 start, struct dso *dso) struct map *result; RC_STRUCT(map) *map; - map = calloc(1, sizeof(*map) + (dso->kernel ? sizeof(struct kmap) : 0)); + map = calloc(1, sizeof(*map) + (dso__kernel(dso) ? 
sizeof(struct kmap) : 0)); if (ADD_RC_CHK(result, map)) { /* * ->end will be filled after we load all the symbols @@ -236,7 +234,7 @@ struct map *map__new2(u64 start, struct dso *dso) bool __map__is_kernel(const struct map *map) { - if (!map__dso(map)->kernel) + if (!dso__kernel(map__dso(map))) return false; return machine__kernel_map(maps__machine(map__kmaps((struct map *)map))) == map; } @@ -253,7 +251,7 @@ bool __map__is_bpf_prog(const struct map *map) const char *name; struct dso *dso = map__dso(map); - if (dso->binary_type == DSO_BINARY_TYPE__BPF_PROG_INFO) + if (dso__binary_type(dso) == DSO_BINARY_TYPE__BPF_PROG_INFO) return true; /* @@ -261,7 +259,7 @@ bool __map__is_bpf_prog(const struct map *map) * type of DSO_BINARY_TYPE__BPF_PROG_INFO. In such cases, we can * guess the type based on name. */ - name = dso->short_name; + name = dso__short_name(dso); return name && (strstr(name, "bpf_prog_") == name); } @@ -270,7 +268,7 @@ bool __map__is_bpf_image(const struct map *map) const char *name; struct dso *dso = map__dso(map); - if (dso->binary_type == DSO_BINARY_TYPE__BPF_IMAGE) + if (dso__binary_type(dso) == DSO_BINARY_TYPE__BPF_IMAGE) return true; /* @@ -278,7 +276,7 @@ bool __map__is_bpf_image(const struct map *map) * type of DSO_BINARY_TYPE__BPF_IMAGE. In such cases, we can * guess the type based on name. 
*/ - name = dso->short_name; + name = dso__short_name(dso); return name && is_bpf_image(name); } @@ -286,7 +284,7 @@ bool __map__is_ool(const struct map *map) { const struct dso *dso = map__dso(map); - return dso && dso->binary_type == DSO_BINARY_TYPE__OOL; + return dso && dso__binary_type(dso) == DSO_BINARY_TYPE__OOL; } bool map__has_symbols(const struct map *map) @@ -317,7 +315,7 @@ void map__put(struct map *map) void map__fixup_start(struct map *map) { struct dso *dso = map__dso(map); - struct rb_root_cached *symbols = &dso->symbols; + struct rb_root_cached *symbols = dso__symbols(dso); struct rb_node *nd = rb_first_cached(symbols); if (nd != NULL) { @@ -330,7 +328,7 @@ void map__fixup_start(struct map *map) void map__fixup_end(struct map *map) { struct dso *dso = map__dso(map); - struct rb_root_cached *symbols = &dso->symbols; + struct rb_root_cached *symbols = dso__symbols(dso); struct rb_node *nd = rb_last(&symbols->rb_root); if (nd != NULL) { @@ -344,7 +342,7 @@ void map__fixup_end(struct map *map) int map__load(struct map *map) { struct dso *dso = map__dso(map); - const char *name = dso->long_name; + const char *name = dso__long_name(dso); int nr; if (dso__loaded(dso)) @@ -352,10 +350,10 @@ int map__load(struct map *map) nr = dso__load(dso, map); if (nr < 0) { - if (dso->has_build_id) { + if (dso__has_build_id(dso)) { char sbuild_id[SBUILD_ID_SIZE]; - build_id__sprintf(&dso->bid, sbuild_id); + build_id__sprintf(dso__bid(dso), sbuild_id); pr_debug("%s with build id %s not found", name, sbuild_id); } else pr_debug("Failed to open %s", name); @@ -417,7 +415,7 @@ struct map *map__clone(struct map *from) size_t size = sizeof(RC_STRUCT(map)); struct dso *dso = map__dso(from); - if (dso && dso->kernel) + if (dso && dso__kernel(dso)) size += sizeof(struct kmap); map = memdup(RC_CHK_ACCESS(from), size); @@ -434,14 +432,14 @@ size_t map__fprintf(struct map *map, FILE *fp) const struct dso *dso = map__dso(map); return fprintf(fp, " %" PRIx64 "-%" PRIx64 " %" PRIx64 " 
%s\n", - map__start(map), map__end(map), map__pgoff(map), dso->name); + map__start(map), map__end(map), map__pgoff(map), dso__name(dso)); } static bool prefer_dso_long_name(const struct dso *dso, bool print_off) { - return dso->long_name && + return dso__long_name(dso) && (symbol_conf.show_kernel_path || - (print_off && (dso->name[0] == '[' || dso__is_kcore(dso)))); + (print_off && (dso__name(dso)[0] == '[' || dso__is_kcore(dso)))); } static size_t __map__fprintf_dsoname(struct map *map, bool print_off, FILE *fp) @@ -452,9 +450,9 @@ static size_t __map__fprintf_dsoname(struct map *map, bool print_off, FILE *fp) if (dso) { if (prefer_dso_long_name(dso, print_off)) - dsoname = dso->long_name; + dsoname = dso__long_name(dso); else - dsoname = dso->name; + dsoname = dso__name(dso); } if (symbol_conf.pad_output_len_dso) { @@ -547,14 +545,14 @@ u64 map__rip_2objdump(struct map *map, u64 rip) } } - if (!dso->adjust_symbols) + if (!dso__adjust_symbols(dso)) return rip; - if (dso->rel) + if (dso__rel(dso)) return rip - map__pgoff(map); - if (dso->kernel == DSO_SPACE__USER) - return rip + dso->text_offset; + if (dso__kernel(dso) == DSO_SPACE__USER) + return rip + dso__text_offset(dso); return map__unmap_ip(map, rip) - map__reloc(map); } @@ -575,18 +573,35 @@ u64 map__objdump_2mem(struct map *map, u64 ip) { const struct dso *dso = map__dso(map); - if (!dso->adjust_symbols) + if (!dso__adjust_symbols(dso)) return map__unmap_ip(map, ip); - if (dso->rel) + if (dso__rel(dso)) return map__unmap_ip(map, ip + map__pgoff(map)); - if (dso->kernel == DSO_SPACE__USER) - return map__unmap_ip(map, ip - dso->text_offset); + if (dso__kernel(dso) == DSO_SPACE__USER) + return map__unmap_ip(map, ip - dso__text_offset(dso)); return ip + map__reloc(map); } +/* convert objdump address to relative address. 
(To be removed) */ +u64 map__objdump_2rip(struct map *map, u64 ip) +{ + const struct dso *dso = map__dso(map); + + if (!dso__adjust_symbols(dso)) + return ip; + + if (dso__rel(dso)) + return ip + map__pgoff(map); + + if (dso__kernel(dso) == DSO_SPACE__USER) + return ip - dso__text_offset(dso); + + return map__map_ip(map, ip + map__reloc(map)); +} + bool map__contains_symbol(const struct map *map, const struct symbol *sym) { u64 ip = map__unmap_ip(map, sym->start); @@ -598,7 +613,7 @@ struct kmap *__map__kmap(struct map *map) { const struct dso *dso = map__dso(map); - if (!dso || !dso->kernel) + if (!dso || !dso__kernel(dso)) return NULL; return (struct kmap *)(&RC_CHK_ACCESS(map)[1]); } diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h index 49756716cb..65e2609fa1 100644 --- a/tools/perf/util/map.h +++ b/tools/perf/util/map.h @@ -132,6 +132,9 @@ u64 map__rip_2objdump(struct map *map, u64 rip); /* objdump address -> memory address */ u64 map__objdump_2mem(struct map *map, u64 ip); +/* objdump address -> rip */ +u64 map__objdump_2rip(struct map *map, u64 ip); + struct symbol; struct thread; diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c index ce13145a9f..eaada3e0f5 100644 --- a/tools/perf/util/maps.c +++ b/tools/perf/util/maps.c @@ -76,7 +76,7 @@ static void check_invariants(const struct maps *maps __maybe_unused) /* Expect at least 1 reference count. 
*/ assert(refcount_read(map__refcnt(map)) > 0); - if (map__dso(map) && map__dso(map)->kernel) + if (map__dso(map) && dso__kernel(map__dso(map))) assert(RC_CHK_EQUAL(map__kmap(map)->kmaps, maps)); if (i > 0) { @@ -124,11 +124,6 @@ static void maps__set_maps_by_address(struct maps *maps, struct map **new) } -static struct map ***maps__maps_by_name_addr(struct maps *maps) -{ - return &RC_CHK_ACCESS(maps)->maps_by_name; -} - static void maps__set_nr_maps_allocated(struct maps *maps, unsigned int nr_maps_allocated) { RC_CHK_ACCESS(maps)->nr_maps_allocated = nr_maps_allocated; @@ -211,11 +206,6 @@ void maps__set_unwind_libunwind_ops(struct maps *maps, const struct unwind_libun static struct rw_semaphore *maps__lock(struct maps *maps) { - /* - * When the lock is acquired or released the maps invariants should - * hold. - */ - check_invariants(maps); return &RC_CHK_ACCESS(maps)->lock; } @@ -289,6 +279,9 @@ void maps__put(struct maps *maps) static void __maps__free_maps_by_name(struct maps *maps) { + if (!maps__maps_by_name(maps)) + return; + /* * Free everything to try to do it from the rbtree in the next search */ @@ -296,6 +289,9 @@ static void __maps__free_maps_by_name(struct maps *maps) map__put(maps__maps_by_name(maps)[i]); zfree(&RC_CHK_ACCESS(maps)->maps_by_name); + + /* Consistent with maps__init(). When maps_by_name == NULL, maps_by_name_sorted == false */ + maps__set_maps_by_name_sorted(maps, false); } static int map__start_cmp(const void *a, const void *b) @@ -346,7 +342,7 @@ static int map__strcmp(const void *a, const void *b) const struct map *map_b = *(const struct map * const *)b; const struct dso *dso_a = map__dso(map_a); const struct dso *dso_b = map__dso(map_b); - int ret = strcmp(dso_a->short_name, dso_b->short_name); + int ret = strcmp(dso__short_name(dso_a), dso__short_name(dso_b)); if (ret == 0 && RC_CHK_ACCESS(map_a) != RC_CHK_ACCESS(map_b)) { /* Ensure distinct but name equal maps have an order. 
*/ @@ -358,6 +354,7 @@ static int map__strcmp(const void *a, const void *b) static int maps__sort_by_name(struct maps *maps) { int err = 0; + down_write(maps__lock(maps)); if (!maps__maps_by_name_sorted(maps)) { struct map **maps_by_name = maps__maps_by_name(maps); @@ -384,6 +381,7 @@ static int maps__sort_by_name(struct maps *maps) maps__set_maps_by_name_sorted(maps, true); } } + check_invariants(maps); up_write(maps__lock(maps)); return err; } @@ -485,7 +483,7 @@ static int __maps__insert(struct maps *maps, struct map *new) } if (map__end(new) < map__start(new)) RC_CHK_ACCESS(maps)->ends_broken = true; - if (dso && dso->kernel) { + if (dso && dso__kernel(dso)) { struct kmap *kmap = map__kmap(new); if (kmap) @@ -502,6 +500,7 @@ int maps__insert(struct maps *maps, struct map *map) down_write(maps__lock(maps)); ret = __maps__insert(maps, map); + check_invariants(maps); up_write(maps__lock(maps)); return ret; } @@ -536,6 +535,7 @@ void maps__remove(struct maps *maps, struct map *map) { down_write(maps__lock(maps)); __maps__remove(maps, map); + check_invariants(maps); up_write(maps__lock(maps)); } @@ -602,6 +602,7 @@ void maps__remove_maps(struct maps *maps, bool (*cb)(struct map *map, void *data else i++; } + check_invariants(maps); up_write(maps__lock(maps)); } @@ -740,7 +741,6 @@ static unsigned int first_ending_after(struct maps *maps, const struct map *map) */ static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new) { - struct map **maps_by_address; int err = 0; FILE *fp = debug_file(); @@ -748,12 +748,12 @@ sort_again: if (!maps__maps_by_address_sorted(maps)) __maps__sort_by_address(maps); - maps_by_address = maps__maps_by_address(maps); /* * Iterate through entries where the end of the existing entry is * greater-than the new map's start. 
*/ for (unsigned int i = first_ending_after(maps, new); i < maps__nr_maps(maps); ) { + struct map **maps_by_address = maps__maps_by_address(maps); struct map *pos = maps_by_address[i]; struct map *before = NULL, *after = NULL; @@ -766,7 +766,7 @@ sort_again: if (use_browser) { pr_debug("overlapping maps in %s (disable tui for more info)\n", - map__dso(new)->name); + dso__name(map__dso(new))); } else if (verbose >= 2) { pr_debug("overlapping maps:\n"); map__fprintf(new, fp); @@ -820,8 +820,10 @@ sort_again: /* Maps are still ordered, go to next one. */ i++; if (after) { - __maps__insert(maps, after); + err = __maps__insert(maps, after); map__put(after); + if (err) + goto out_err; if (!maps__maps_by_address_sorted(maps)) { /* * Sorting broken so invariants don't @@ -850,7 +852,7 @@ sort_again: check_invariants(maps); } /* Add the map. */ - __maps__insert(maps, new); + err = __maps__insert(maps, new); out_err: return err; } @@ -942,6 +944,8 @@ int maps__copy_from(struct maps *dest, struct maps *parent) map__put(new); } } + check_invariants(dest); + up_read(maps__lock(parent)); up_write(maps__lock(dest)); return err; @@ -987,7 +991,7 @@ static int map__strcmp_name(const void *name, const void *b) { const struct dso *dso = map__dso(*(const struct map **)b); - return strcmp(name, dso->short_name); + return strcmp(name, dso__short_name(dso)); } struct map *maps__find_by_name(struct maps *maps, const char *name) @@ -1006,7 +1010,7 @@ struct map *maps__find_by_name(struct maps *maps, const char *name) if (i < maps__nr_maps(maps) && maps__maps_by_name(maps)) { struct dso *dso = map__dso(maps__maps_by_name(maps)[i]); - if (dso && strcmp(dso->short_name, name) == 0) { + if (dso && strcmp(dso__short_name(dso), name) == 0) { result = map__get(maps__maps_by_name(maps)[i]); done = true; } @@ -1043,7 +1047,7 @@ struct map *maps__find_by_name(struct maps *maps, const char *name) struct map *pos = maps_by_address[i]; struct dso *dso = map__dso(pos); - if (dso && 
strcmp(dso->short_name, name) == 0) { + if (dso && strcmp(dso__short_name(dso), name) == 0) { result = map__get(pos); break; } @@ -1097,6 +1101,7 @@ void maps__fixup_end(struct maps *maps) map__set_end(maps_by_address[n - 1], ~0ULL); RC_CHK_ACCESS(maps)->ends_broken = false; + check_invariants(maps); up_write(maps__lock(maps)); } @@ -1147,6 +1152,8 @@ int maps__merge_in(struct maps *kmaps, struct map *new_map) map__start(kmaps_maps_by_address[first_after_]) >= map__end(new_map)) { /* No overlap so regular insert suffices. */ int ret = __maps__insert(kmaps, new_map); + + check_invariants(kmaps); up_write(maps__lock(kmaps)); return ret; } @@ -1162,8 +1169,7 @@ int maps__merge_in(struct maps *kmaps, struct map *new_map) } maps__set_maps_by_address(kmaps, merged_maps_by_address); maps__set_maps_by_address_sorted(kmaps, true); - zfree(maps__maps_by_name_addr(kmaps)); - maps__set_maps_by_name_sorted(kmaps, true); + __maps__free_maps_by_name(kmaps); maps__set_nr_maps_allocated(kmaps, merged_nr_maps_allocated); /* Copy entries before the new_map that can't overlap. 
*/ @@ -1184,6 +1190,7 @@ int maps__merge_in(struct maps *kmaps, struct map *new_map) map__zput(kmaps_maps_by_address[i]); free(kmaps_maps_by_address); + check_invariants(kmaps); up_write(maps__lock(kmaps)); return 0; } diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index 637cbd4a7b..6dda47bb77 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -10,7 +10,9 @@ #include #include "map_symbol.h" #include "mem-events.h" +#include "mem-info.h" #include "debug.h" +#include "evsel.h" #include "symbol.h" #include "pmu.h" #include "pmus.h" @@ -281,7 +283,7 @@ static const char * const tlb_access[] = { "Fault", }; -int perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info) +int perf_mem__tlb_scnprintf(char *out, size_t sz, const struct mem_info *mem_info) { size_t l = 0, i; u64 m = PERF_MEM_TLB_NA; @@ -291,7 +293,7 @@ int perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info) out[0] = '\0'; if (mem_info) - m = mem_info->data_src.mem_dtlb; + m = mem_info__const_data_src(mem_info)->mem_dtlb; hit = m & PERF_MEM_TLB_HIT; miss = m & PERF_MEM_TLB_MISS; @@ -359,13 +361,13 @@ static const char * const mem_hops[] = { "board", }; -static int perf_mem__op_scnprintf(char *out, size_t sz, struct mem_info *mem_info) +static int perf_mem__op_scnprintf(char *out, size_t sz, const struct mem_info *mem_info) { u64 op = PERF_MEM_LOCK_NA; int l; if (mem_info) - op = mem_info->data_src.mem_op; + op = mem_info__const_data_src(mem_info)->mem_op; if (op & PERF_MEM_OP_NA) l = scnprintf(out, sz, "N/A"); @@ -383,7 +385,7 @@ static int perf_mem__op_scnprintf(char *out, size_t sz, struct mem_info *mem_inf return l; } -int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info) +int perf_mem__lvl_scnprintf(char *out, size_t sz, const struct mem_info *mem_info) { union perf_mem_data_src data_src; int printed = 0; @@ -398,7 +400,7 @@ int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info 
*mem_info) if (!mem_info) goto na; - data_src = mem_info->data_src; + data_src = *mem_info__const_data_src(mem_info); if (data_src.mem_lvl & PERF_MEM_LVL_HIT) memcpy(hit_miss, "hit", 3); @@ -465,7 +467,7 @@ static const char * const snoopx_access[] = { "Peer", }; -int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info) +int perf_mem__snp_scnprintf(char *out, size_t sz, const struct mem_info *mem_info) { size_t i, l = 0; u64 m = PERF_MEM_SNOOP_NA; @@ -474,7 +476,7 @@ int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info) out[0] = '\0'; if (mem_info) - m = mem_info->data_src.mem_snoop; + m = mem_info__const_data_src(mem_info)->mem_snoop; for (i = 0; m && i < ARRAY_SIZE(snoop_access); i++, m >>= 1) { if (!(m & 0x1)) @@ -488,7 +490,7 @@ int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info) m = 0; if (mem_info) - m = mem_info->data_src.mem_snoopx; + m = mem_info__const_data_src(mem_info)->mem_snoopx; for (i = 0; m && i < ARRAY_SIZE(snoopx_access); i++, m >>= 1) { if (!(m & 0x1)) @@ -507,13 +509,13 @@ int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info) return l; } -int perf_mem__lck_scnprintf(char *out, size_t sz, struct mem_info *mem_info) +int perf_mem__lck_scnprintf(char *out, size_t sz, const struct mem_info *mem_info) { u64 mask = PERF_MEM_LOCK_NA; int l; if (mem_info) - mask = mem_info->data_src.mem_lock; + mask = mem_info__const_data_src(mem_info)->mem_lock; if (mask & PERF_MEM_LOCK_NA) l = scnprintf(out, sz, "N/A"); @@ -525,7 +527,7 @@ int perf_mem__lck_scnprintf(char *out, size_t sz, struct mem_info *mem_info) return l; } -int perf_mem__blk_scnprintf(char *out, size_t sz, struct mem_info *mem_info) +int perf_mem__blk_scnprintf(char *out, size_t sz, const struct mem_info *mem_info) { size_t l = 0; u64 mask = PERF_MEM_BLK_NA; @@ -534,7 +536,7 @@ int perf_mem__blk_scnprintf(char *out, size_t sz, struct mem_info *mem_info) out[0] = '\0'; if (mem_info) - mask = 
mem_info->data_src.mem_blk; + mask = mem_info__const_data_src(mem_info)->mem_blk; if (!mask || (mask & PERF_MEM_BLK_NA)) { l += scnprintf(out + l, sz - l, " N/A"); @@ -548,7 +550,7 @@ int perf_mem__blk_scnprintf(char *out, size_t sz, struct mem_info *mem_info) return l; } -int perf_script__meminfo_scnprintf(char *out, size_t sz, struct mem_info *mem_info) +int perf_script__meminfo_scnprintf(char *out, size_t sz, const struct mem_info *mem_info) { int i = 0; @@ -570,8 +572,8 @@ int perf_script__meminfo_scnprintf(char *out, size_t sz, struct mem_info *mem_in int c2c_decode_stats(struct c2c_stats *stats, struct mem_info *mi) { - union perf_mem_data_src *data_src = &mi->data_src; - u64 daddr = mi->daddr.addr; + union perf_mem_data_src *data_src = mem_info__data_src(mi); + u64 daddr = mem_info__daddr(mi)->addr; u64 op = data_src->mem_op; u64 lvl = data_src->mem_lvl; u64 snoop = data_src->mem_snoop; @@ -698,7 +700,7 @@ do { \ return -1; } - if (!mi->daddr.ms.map || !mi->iaddr.ms.map) { + if (!mem_info__daddr(mi)->ms.map || !mem_info__iaddr(mi)->ms.map) { stats->nomap++; return -1; } diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h index 15d5f0320d..ca31014d79 100644 --- a/tools/perf/util/mem-events.h +++ b/tools/perf/util/mem-events.h @@ -3,13 +3,7 @@ #define __PERF_MEM_EVENTS_H #include -#include -#include #include -#include -#include -#include "stat.h" -#include "evsel.h" struct perf_mem_event { bool record; @@ -21,13 +15,6 @@ struct perf_mem_event { const char *event_name; }; -struct mem_info { - struct addr_map_symbol iaddr; - struct addr_map_symbol daddr; - union perf_mem_data_src data_src; - refcount_t refcnt; -}; - enum { PERF_MEM_EVENTS__LOAD, PERF_MEM_EVENTS__STORE, @@ -35,6 +22,10 @@ enum { PERF_MEM_EVENTS__MAX, }; +struct evsel; +struct mem_info; +struct perf_pmu; + extern unsigned int perf_mem_events__loads_ldlat; extern struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX]; @@ -49,13 +40,13 @@ bool is_mem_loads_aux_event(struct 
evsel *leader); void perf_pmu__mem_events_list(struct perf_pmu *pmu); int perf_mem_events__record_args(const char **rec_argv, int *argv_nr); -int perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info); -int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info); -int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info); -int perf_mem__lck_scnprintf(char *out, size_t sz, struct mem_info *mem_info); -int perf_mem__blk_scnprintf(char *out, size_t sz, struct mem_info *mem_info); +int perf_mem__tlb_scnprintf(char *out, size_t sz, const struct mem_info *mem_info); +int perf_mem__lvl_scnprintf(char *out, size_t sz, const struct mem_info *mem_info); +int perf_mem__snp_scnprintf(char *out, size_t sz, const struct mem_info *mem_info); +int perf_mem__lck_scnprintf(char *out, size_t sz, const struct mem_info *mem_info); +int perf_mem__blk_scnprintf(char *out, size_t sz, const struct mem_info *mem_info); -int perf_script__meminfo_scnprintf(char *bf, size_t size, struct mem_info *mem_info); +int perf_script__meminfo_scnprintf(char *bf, size_t size, const struct mem_info *mem_info); struct c2c_stats { u32 nr_entries; diff --git a/tools/perf/util/mem-info.c b/tools/perf/util/mem-info.c new file mode 100644 index 0000000000..27d67721a6 --- /dev/null +++ b/tools/perf/util/mem-info.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include "mem-info.h" + +struct mem_info *mem_info__get(struct mem_info *mi) +{ + struct mem_info *result; + + if (RC_CHK_GET(result, mi)) + refcount_inc(mem_info__refcnt(mi)); + + return result; +} + +void mem_info__put(struct mem_info *mi) +{ + if (mi && refcount_dec_and_test(mem_info__refcnt(mi))) { + addr_map_symbol__exit(mem_info__iaddr(mi)); + addr_map_symbol__exit(mem_info__daddr(mi)); + RC_CHK_FREE(mi); + } else { + RC_CHK_PUT(mi); + } +} + +struct mem_info *mem_info__new(void) +{ + struct mem_info *result = NULL; + RC_STRUCT(mem_info) *mi = zalloc(sizeof(*mi)); + + if 
(ADD_RC_CHK(result, mi)) + refcount_set(mem_info__refcnt(result), 1); + + return result; +} diff --git a/tools/perf/util/mem-info.h b/tools/perf/util/mem-info.h new file mode 100644 index 0000000000..0f68e29f31 --- /dev/null +++ b/tools/perf/util/mem-info.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PERF_MEM_INFO_H +#define __PERF_MEM_INFO_H + +#include +#include +#include +#include "map_symbol.h" + +DECLARE_RC_STRUCT(mem_info) { + struct addr_map_symbol iaddr; + struct addr_map_symbol daddr; + union perf_mem_data_src data_src; + refcount_t refcnt; +}; + +struct mem_info *mem_info__new(void); +struct mem_info *mem_info__get(struct mem_info *mi); +void mem_info__put(struct mem_info *mi); + +static inline void __mem_info__zput(struct mem_info **mi) +{ + mem_info__put(*mi); + *mi = NULL; +} + +#define mem_info__zput(mi) __mem_info__zput(&mi) + +static inline struct addr_map_symbol *mem_info__iaddr(struct mem_info *mi) +{ + return &RC_CHK_ACCESS(mi)->iaddr; +} + +static inline struct addr_map_symbol *mem_info__daddr(struct mem_info *mi) +{ + return &RC_CHK_ACCESS(mi)->daddr; +} + +static inline union perf_mem_data_src *mem_info__data_src(struct mem_info *mi) +{ + return &RC_CHK_ACCESS(mi)->data_src; +} + +static inline const union perf_mem_data_src *mem_info__const_data_src(const struct mem_info *mi) +{ + return &RC_CHK_ACCESS(mi)->data_src; +} + +static inline refcount_t *mem_info__refcnt(struct mem_info *mi) +{ + return &RC_CHK_ACCESS(mi)->refcnt; +} + +#endif /* __PERF_MEM_INFO_H */ diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 79ef6095ab..69f6a46402 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -455,7 +455,7 @@ static int metricgroup__add_to_mep_groups(const struct pmu_metric *pm, const char *g; char *omg, *mg; - mg = strdup(pm->metric_group ?: "No_group"); + mg = strdup(pm->metric_group ?: pm->metric_name); if (!mg) return -ENOMEM; omg = mg; @@ -466,7 +466,7 @@ static 
int metricgroup__add_to_mep_groups(const struct pmu_metric *pm, if (strlen(g)) me = mep_lookup(groups, g, pm->metric_name); else - me = mep_lookup(groups, "No_group", pm->metric_name); + me = mep_lookup(groups, pm->metric_name, pm->metric_name); if (me) { me->metric_desc = pm->desc; @@ -1502,7 +1502,8 @@ static int parse_ids(bool metric_no_merge, struct perf_pmu *fake_pmu, pr_debug("Parsing metric events '%s'\n", events.buf); parse_events_error__init(&parse_error); ret = __parse_events(parsed_evlist, events.buf, /*pmu_filter=*/NULL, - &parse_error, fake_pmu, /*warn_if_reordered=*/false); + &parse_error, fake_pmu, /*warn_if_reordered=*/false, + /*fake_tp=*/false); if (ret) { parse_events_error__print(&parse_error, events.buf); goto err_out; @@ -1690,12 +1691,15 @@ int metricgroup__parse_groups(struct evlist *perf_evlist, bool metric_no_threshold, const char *user_requested_cpu_list, bool system_wide, + bool hardware_aware_grouping, struct rblist *metric_events) { const struct pmu_metrics_table *table = pmu_metrics_table__find(); if (!table) return -EINVAL; + if (hardware_aware_grouping) + pr_debug("Use hardware aware grouping instead of traditional metric grouping method\n"); return parse_groups(perf_evlist, pmu, str, metric_no_group, metric_no_merge, metric_no_threshold, user_requested_cpu_list, system_wide, diff --git a/tools/perf/util/metricgroup.h b/tools/perf/util/metricgroup.h index d5325c6ec8..779f6ede1b 100644 --- a/tools/perf/util/metricgroup.h +++ b/tools/perf/util/metricgroup.h @@ -77,6 +77,7 @@ int metricgroup__parse_groups(struct evlist *perf_evlist, bool metric_no_threshold, const char *user_requested_cpu_list, bool system_wide, + bool hardware_aware_grouping, struct rblist *metric_events); int metricgroup__parse_groups_test(struct evlist *evlist, const struct pmu_metrics_table *table, diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 6f8b0fa176..6ed0f9c558 100644 --- a/tools/perf/util/parse-events.c +++ 
b/tools/perf/util/parse-events.c @@ -34,11 +34,12 @@ #ifdef PARSER_DEBUG extern int parse_events_debug; #endif -static int get_config_terms(struct parse_events_terms *head_config, struct list_head *head_terms); +static int get_config_terms(const struct parse_events_terms *head_config, + struct list_head *head_terms); static int parse_events_terms__copy(const struct parse_events_terms *src, struct parse_events_terms *dest); -struct event_symbol event_symbols_hw[PERF_COUNT_HW_MAX] = { +const struct event_symbol event_symbols_hw[PERF_COUNT_HW_MAX] = { [PERF_COUNT_HW_CPU_CYCLES] = { .symbol = "cpu-cycles", .alias = "cycles", @@ -81,7 +82,7 @@ struct event_symbol event_symbols_hw[PERF_COUNT_HW_MAX] = { }, }; -struct event_symbol event_symbols_sw[PERF_COUNT_SW_MAX] = { +const struct event_symbol event_symbols_sw[PERF_COUNT_SW_MAX] = { [PERF_COUNT_SW_CPU_CLOCK] = { .symbol = "cpu-clock", .alias = "", @@ -154,7 +155,7 @@ const char *event_type(int type) return "unknown"; } -static char *get_config_str(struct parse_events_terms *head_terms, +static char *get_config_str(const struct parse_events_terms *head_terms, enum parse_events__term_type type_term) { struct parse_events_term *term; @@ -169,12 +170,12 @@ static char *get_config_str(struct parse_events_terms *head_terms, return NULL; } -static char *get_config_metric_id(struct parse_events_terms *head_terms) +static char *get_config_metric_id(const struct parse_events_terms *head_terms) { return get_config_str(head_terms, PARSE_EVENTS__TERM_TYPE_METRIC_ID); } -static char *get_config_name(struct parse_events_terms *head_terms) +static char *get_config_name(const struct parse_events_terms *head_terms) { return get_config_str(head_terms, PARSE_EVENTS__TERM_TYPE_NAME); } @@ -358,7 +359,7 @@ static int config_term_common(struct perf_event_attr *attr, struct parse_events_term *term, struct parse_events_error *err); static int config_attr(struct perf_event_attr *attr, - struct parse_events_terms *head, + const struct 
parse_events_terms *head, struct parse_events_error *err, config_term_func_t config_term); @@ -442,17 +443,21 @@ bool parse_events__filter_pmu(const struct parse_events_state *parse_state, return strcmp(parse_state->pmu_filter, pmu->name) != 0; } +static int parse_events_add_pmu(struct parse_events_state *parse_state, + struct list_head *list, struct perf_pmu *pmu, + const struct parse_events_terms *const_parsed_terms, + bool auto_merge_stats); + int parse_events_add_cache(struct list_head *list, int *idx, const char *name, struct parse_events_state *parse_state, - struct parse_events_terms *head_config) + struct parse_events_terms *parsed_terms) { struct perf_pmu *pmu = NULL; bool found_supported = false; - const char *config_name = get_config_name(head_config); - const char *metric_id = get_config_metric_id(head_config); + const char *config_name = get_config_name(parsed_terms); + const char *metric_id = get_config_metric_id(parsed_terms); - /* Legacy cache events are only supported by core PMUs. */ - while ((pmu = perf_pmus__scan_core(pmu)) != NULL) { + while ((pmu = perf_pmus__scan(pmu)) != NULL) { LIST_HEAD(config_terms); struct perf_event_attr attr; int ret; @@ -460,6 +465,24 @@ int parse_events_add_cache(struct list_head *list, int *idx, const char *name, if (parse_events__filter_pmu(parse_state, pmu)) continue; + if (perf_pmu__have_event(pmu, name)) { + /* + * The PMU has the event so add as not a legacy cache + * event. + */ + ret = parse_events_add_pmu(parse_state, list, pmu, + parsed_terms, + perf_pmu__auto_merge_stats(pmu)); + if (ret) + return ret; + continue; + } + + if (!pmu->is_core) { + /* Legacy cache events are only supported by core PMUs. 
*/ + continue; + } + memset(&attr, 0, sizeof(attr)); attr.type = PERF_TYPE_HW_CACHE; @@ -469,11 +492,12 @@ int parse_events_add_cache(struct list_head *list, int *idx, const char *name, found_supported = true; - if (head_config) { - if (config_attr(&attr, head_config, parse_state->error, config_term_common)) + if (parsed_terms) { + if (config_attr(&attr, parsed_terms, parse_state->error, + config_term_common)) return -EINVAL; - if (get_config_terms(head_config, &config_terms)) + if (get_config_terms(parsed_terms, &config_terms)) return -ENOMEM; } @@ -519,13 +543,15 @@ static void tracepoint_error(struct parse_events_error *e, int err, parse_events_error__handle(e, column, strdup(str), strdup(help)); } -static int add_tracepoint(struct list_head *list, int *idx, +static int add_tracepoint(struct parse_events_state *parse_state, + struct list_head *list, const char *sys_name, const char *evt_name, struct parse_events_error *err, struct parse_events_terms *head_config, void *loc_) { YYLTYPE *loc = loc_; - struct evsel *evsel = evsel__newtp_idx(sys_name, evt_name, (*idx)++); + struct evsel *evsel = evsel__newtp_idx(sys_name, evt_name, parse_state->idx++, + !parse_state->fake_tp); if (IS_ERR(evsel)) { tracepoint_error(err, PTR_ERR(evsel), sys_name, evt_name, loc->first_column); @@ -544,7 +570,8 @@ static int add_tracepoint(struct list_head *list, int *idx, return 0; } -static int add_tracepoint_multi_event(struct list_head *list, int *idx, +static int add_tracepoint_multi_event(struct parse_events_state *parse_state, + struct list_head *list, const char *sys_name, const char *evt_name, struct parse_events_error *err, struct parse_events_terms *head_config, YYLTYPE *loc) @@ -578,7 +605,7 @@ static int add_tracepoint_multi_event(struct list_head *list, int *idx, found++; - ret = add_tracepoint(list, idx, sys_name, evt_ent->d_name, + ret = add_tracepoint(parse_state, list, sys_name, evt_ent->d_name, err, head_config, loc); } @@ -592,19 +619,21 @@ static int 
add_tracepoint_multi_event(struct list_head *list, int *idx, return ret; } -static int add_tracepoint_event(struct list_head *list, int *idx, +static int add_tracepoint_event(struct parse_events_state *parse_state, + struct list_head *list, const char *sys_name, const char *evt_name, struct parse_events_error *err, struct parse_events_terms *head_config, YYLTYPE *loc) { return strpbrk(evt_name, "*?") ? - add_tracepoint_multi_event(list, idx, sys_name, evt_name, + add_tracepoint_multi_event(parse_state, list, sys_name, evt_name, err, head_config, loc) : - add_tracepoint(list, idx, sys_name, evt_name, + add_tracepoint(parse_state, list, sys_name, evt_name, err, head_config, loc); } -static int add_tracepoint_multi_sys(struct list_head *list, int *idx, +static int add_tracepoint_multi_sys(struct parse_events_state *parse_state, + struct list_head *list, const char *sys_name, const char *evt_name, struct parse_events_error *err, struct parse_events_terms *head_config, YYLTYPE *loc) @@ -630,7 +659,7 @@ static int add_tracepoint_multi_sys(struct list_head *list, int *idx, if (!strglobmatch(events_ent->d_name, sys_name)) continue; - ret = add_tracepoint_event(list, idx, events_ent->d_name, + ret = add_tracepoint_event(parse_state, list, events_ent->d_name, evt_name, err, head_config, loc); } @@ -1085,7 +1114,7 @@ static int config_term_tracepoint(struct perf_event_attr *attr, #endif static int config_attr(struct perf_event_attr *attr, - struct parse_events_terms *head, + const struct parse_events_terms *head, struct parse_events_error *err, config_term_func_t config_term) { @@ -1098,7 +1127,8 @@ static int config_attr(struct perf_event_attr *attr, return 0; } -static int get_config_terms(struct parse_events_terms *head_config, struct list_head *head_terms) +static int get_config_terms(const struct parse_events_terms *head_config, + struct list_head *head_terms) { #define ADD_CONFIG_TERM(__type, __weak) \ struct evsel_config_term *__t; \ @@ -1266,7 +1296,8 @@ static int 
get_config_chgs(struct perf_pmu *pmu, struct parse_events_terms *head return 0; } -int parse_events_add_tracepoint(struct list_head *list, int *idx, +int parse_events_add_tracepoint(struct parse_events_state *parse_state, + struct list_head *list, const char *sys, const char *event, struct parse_events_error *err, struct parse_events_terms *head_config, void *loc_) @@ -1282,14 +1313,14 @@ int parse_events_add_tracepoint(struct list_head *list, int *idx, } if (strpbrk(sys, "*?")) - return add_tracepoint_multi_sys(list, idx, sys, event, + return add_tracepoint_multi_sys(parse_state, list, sys, event, err, head_config, loc); else - return add_tracepoint_event(list, idx, sys, event, + return add_tracepoint_event(parse_state, list, sys, event, err, head_config, loc); #else + (void)parse_state; (void)list; - (void)idx; (void)sys; (void)event; (void)head_config; @@ -1302,7 +1333,7 @@ int parse_events_add_tracepoint(struct list_head *list, int *idx, static int __parse_events_add_numeric(struct parse_events_state *parse_state, struct list_head *list, struct perf_pmu *pmu, u32 type, u32 extended_type, - u64 config, struct parse_events_terms *head_config) + u64 config, const struct parse_events_terms *head_config) { struct perf_event_attr attr; LIST_HEAD(config_terms); @@ -1338,7 +1369,7 @@ static int __parse_events_add_numeric(struct parse_events_state *parse_state, int parse_events_add_numeric(struct parse_events_state *parse_state, struct list_head *list, u32 type, u64 config, - struct parse_events_terms *head_config, + const struct parse_events_terms *head_config, bool wildcard) { struct perf_pmu *pmu = NULL; @@ -1385,56 +1416,34 @@ static bool config_term_percore(struct list_head *config_terms) return false; } -int parse_events_add_pmu(struct parse_events_state *parse_state, - struct list_head *list, const char *name, - const struct parse_events_terms *const_parsed_terms, - bool auto_merge_stats, void *loc_) +static int parse_events_add_pmu(struct parse_events_state 
*parse_state, + struct list_head *list, struct perf_pmu *pmu, + const struct parse_events_terms *const_parsed_terms, + bool auto_merge_stats) { struct perf_event_attr attr; struct perf_pmu_info info; - struct perf_pmu *pmu; struct evsel *evsel; struct parse_events_error *err = parse_state->error; - YYLTYPE *loc = loc_; LIST_HEAD(config_terms); struct parse_events_terms parsed_terms; bool alias_rewrote_terms = false; - pmu = parse_state->fake_pmu ?: perf_pmus__find(name); - - if (!pmu) { - char *err_str; - - if (asprintf(&err_str, - "Cannot find PMU `%s'. Missing kernel support?", - name) >= 0) - parse_events_error__handle(err, loc->first_column, err_str, NULL); - return -EINVAL; - } - - parse_events_terms__init(&parsed_terms); - if (const_parsed_terms) { - int ret = parse_events_terms__copy(const_parsed_terms, &parsed_terms); - - if (ret) - return ret; - } - if (verbose > 1) { struct strbuf sb; strbuf_init(&sb, /*hint=*/ 0); - if (pmu->selectable && list_empty(&parsed_terms.terms)) { - strbuf_addf(&sb, "%s//", name); + if (pmu->selectable && const_parsed_terms && + list_empty(&const_parsed_terms->terms)) { + strbuf_addf(&sb, "%s//", pmu->name); } else { - strbuf_addf(&sb, "%s/", name); - parse_events_terms__to_strbuf(&parsed_terms, &sb); + strbuf_addf(&sb, "%s/", pmu->name); + parse_events_terms__to_strbuf(const_parsed_terms, &sb); strbuf_addch(&sb, '/'); } fprintf(stderr, "Attempt to add: %s\n", sb.buf); strbuf_release(&sb); } - fix_raw(&parsed_terms, pmu); memset(&attr, 0, sizeof(attr)); if (pmu->perf_event_attr_init_default) @@ -1442,7 +1451,7 @@ int parse_events_add_pmu(struct parse_events_state *parse_state, attr.type = pmu->type; - if (list_empty(&parsed_terms.terms)) { + if (!const_parsed_terms || list_empty(&const_parsed_terms->terms)) { evsel = __add_event(list, &parse_state->idx, &attr, /*init_attr=*/true, /*name=*/NULL, /*metric_id=*/NULL, pmu, @@ -1451,6 +1460,15 @@ int parse_events_add_pmu(struct parse_events_state *parse_state, return evsel ? 
0 : -ENOMEM; } + parse_events_terms__init(&parsed_terms); + if (const_parsed_terms) { + int ret = parse_events_terms__copy(const_parsed_terms, &parsed_terms); + + if (ret) + return ret; + } + fix_raw(&parsed_terms, pmu); + /* Configure attr/terms with a known PMU, this will set hardcoded terms. */ if (config_attr(&attr, &parsed_terms, parse_state->error, config_term_pmu)) { parse_events_terms__exit(&parsed_terms); @@ -1469,7 +1487,7 @@ int parse_events_add_pmu(struct parse_events_state *parse_state, strbuf_init(&sb, /*hint=*/ 0); parse_events_terms__to_strbuf(&parsed_terms, &sb); - fprintf(stderr, "..after resolving event: %s/%s/\n", name, sb.buf); + fprintf(stderr, "..after resolving event: %s/%s/\n", pmu->name, sb.buf); strbuf_release(&sb); } @@ -1583,8 +1601,8 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state, continue; auto_merge_stats = perf_pmu__auto_merge_stats(pmu); - if (!parse_events_add_pmu(parse_state, list, pmu->name, - &parsed_terms, auto_merge_stats, loc)) { + if (!parse_events_add_pmu(parse_state, list, pmu, + &parsed_terms, auto_merge_stats)) { struct strbuf sb; strbuf_init(&sb, /*hint=*/ 0); @@ -1596,8 +1614,8 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state, } if (parse_state->fake_pmu) { - if (!parse_events_add_pmu(parse_state, list, event_name, &parsed_terms, - /*auto_merge_stats=*/true, loc)) { + if (!parse_events_add_pmu(parse_state, list, parse_state->fake_pmu, &parsed_terms, + /*auto_merge_stats=*/true)) { struct strbuf sb; strbuf_init(&sb, /*hint=*/ 0); @@ -1618,10 +1636,59 @@ out_err: return ok ? 
0 : -1; } -int parse_events__modifier_group(struct list_head *list, - char *event_mod) +int parse_events_multi_pmu_add_or_add_pmu(struct parse_events_state *parse_state, + const char *event_or_pmu, + const struct parse_events_terms *const_parsed_terms, + struct list_head **listp, + void *loc_) { - return parse_events__modifier_event(list, event_mod, true); + YYLTYPE *loc = loc_; + struct perf_pmu *pmu; + int ok = 0; + char *help; + + *listp = malloc(sizeof(**listp)); + if (!*listp) + return -ENOMEM; + + INIT_LIST_HEAD(*listp); + + /* Attempt to add to list assuming event_or_pmu is a PMU name. */ + pmu = parse_state->fake_pmu ?: perf_pmus__find(event_or_pmu); + if (pmu && !parse_events_add_pmu(parse_state, *listp, pmu, const_parsed_terms, + /*auto_merge_stats=*/false)) + return 0; + + pmu = NULL; + /* Failed to add, try wildcard expansion of event_or_pmu as a PMU name. */ + while ((pmu = perf_pmus__scan(pmu)) != NULL) { + if (!parse_events__filter_pmu(parse_state, pmu) && + perf_pmu__match(pmu, event_or_pmu)) { + bool auto_merge_stats = perf_pmu__auto_merge_stats(pmu); + + if (!parse_events_add_pmu(parse_state, *listp, pmu, + const_parsed_terms, + auto_merge_stats)) { + ok++; + parse_state->wild_card_pmus = true; + } + } + } + if (ok) + return 0; + + /* Failure to add, assume event_or_pmu is an event name. 
*/ + zfree(listp); + if (!parse_events_multi_pmu_add(parse_state, event_or_pmu, const_parsed_terms, listp, loc)) + return 0; + + if (asprintf(&help, "Unable to find PMU or event on a PMU of '%s'", event_or_pmu) < 0) + help = NULL; + parse_events_error__handle(parse_state->error, loc->first_column, + strdup("Bad event or PMU"), + help); + zfree(listp); + return -EINVAL; } void parse_events__set_leader(char *name, struct list_head *list) @@ -1635,213 +1702,146 @@ void parse_events__set_leader(char *name, struct list_head *list) leader = list_first_entry(list, struct evsel, core.node); __perf_evlist__set_leader(list, &leader->core); + zfree(&leader->group_name); leader->group_name = name; } -/* list_event is assumed to point to malloc'ed memory */ -void parse_events_update_lists(struct list_head *list_event, - struct list_head *list_all) +static int parse_events__modifier_list(struct parse_events_state *parse_state, + YYLTYPE *loc, + struct list_head *list, + struct parse_events_modifier mod, + bool group) { - /* - * Called for single event definition. Update the - * 'all event' list, and reinit the 'single event' - * list, for next event definition. - */ - list_splice_tail(list_event, list_all); - free(list_event); -} - -struct event_modifier { - int eu; - int ek; - int eh; - int eH; - int eG; - int eI; - int precise; - int precise_max; - int exclude_GH; - int sample_read; - int pinned; - int weak; - int exclusive; - int bpf_counter; -}; + struct evsel *evsel; + + if (!group && mod.weak) { + parse_events_error__handle(parse_state->error, loc->first_column, + strdup("Weak modifier is for use with groups"), NULL); + return -EINVAL; + } -static int get_event_modifier(struct event_modifier *mod, char *str, - struct evsel *evsel) -{ - int eu = evsel ? evsel->core.attr.exclude_user : 0; - int ek = evsel ? evsel->core.attr.exclude_kernel : 0; - int eh = evsel ? evsel->core.attr.exclude_hv : 0; - int eH = evsel ? evsel->core.attr.exclude_host : 0; - int eG = evsel ? 
evsel->core.attr.exclude_guest : 0; - int eI = evsel ? evsel->core.attr.exclude_idle : 0; - int precise = evsel ? evsel->core.attr.precise_ip : 0; - int precise_max = 0; - int sample_read = 0; - int pinned = evsel ? evsel->core.attr.pinned : 0; - int exclusive = evsel ? evsel->core.attr.exclusive : 0; - - int exclude = eu | ek | eh; - int exclude_GH = evsel ? evsel->exclude_GH : 0; - int weak = 0; - int bpf_counter = 0; - - memset(mod, 0, sizeof(*mod)); - - while (*str) { - if (*str == 'u') { + __evlist__for_each_entry(list, evsel) { + /* Translate modifiers into the equivalent evsel excludes. */ + int eu = group ? evsel->core.attr.exclude_user : 0; + int ek = group ? evsel->core.attr.exclude_kernel : 0; + int eh = group ? evsel->core.attr.exclude_hv : 0; + int eH = group ? evsel->core.attr.exclude_host : 0; + int eG = group ? evsel->core.attr.exclude_guest : 0; + int exclude = eu | ek | eh; + int exclude_GH = group ? evsel->exclude_GH : 0; + + if (mod.precise) { + /* use of precise requires exclude_guest */ + eG = 1; + } + if (mod.user) { if (!exclude) exclude = eu = ek = eh = 1; if (!exclude_GH && !perf_guest) eG = 1; eu = 0; - } else if (*str == 'k') { + } + if (mod.kernel) { if (!exclude) exclude = eu = ek = eh = 1; ek = 0; - } else if (*str == 'h') { + } + if (mod.hypervisor) { if (!exclude) exclude = eu = ek = eh = 1; eh = 0; - } else if (*str == 'G') { + } + if (mod.guest) { if (!exclude_GH) exclude_GH = eG = eH = 1; eG = 0; - } else if (*str == 'H') { + } + if (mod.host) { if (!exclude_GH) exclude_GH = eG = eH = 1; eH = 0; - } else if (*str == 'I') { - eI = 1; - } else if (*str == 'p') { - precise++; - /* use of precise requires exclude_guest */ - if (!exclude_GH) - eG = 1; - } else if (*str == 'P') { - precise_max = 1; - } else if (*str == 'S') { - sample_read = 1; - } else if (*str == 'D') { - pinned = 1; - } else if (*str == 'e') { - exclusive = 1; - } else if (*str == 'W') { - weak = 1; - } else if (*str == 'b') { - bpf_counter = 1; - } else - break; - 
- ++str; + } + evsel->core.attr.exclude_user = eu; + evsel->core.attr.exclude_kernel = ek; + evsel->core.attr.exclude_hv = eh; + evsel->core.attr.exclude_host = eH; + evsel->core.attr.exclude_guest = eG; + evsel->exclude_GH = exclude_GH; + + /* Simple modifiers copied to the evsel. */ + if (mod.precise) { + u8 precise = evsel->core.attr.precise_ip + mod.precise; + /* + * precise ip: + * + * 0 - SAMPLE_IP can have arbitrary skid + * 1 - SAMPLE_IP must have constant skid + * 2 - SAMPLE_IP requested to have 0 skid + * 3 - SAMPLE_IP must have 0 skid + * + * See also PERF_RECORD_MISC_EXACT_IP + */ + if (precise > 3) { + char *help; + + if (asprintf(&help, + "Maximum combined precise value is 3, adding precision to \"%s\"", + evsel__name(evsel)) > 0) { + parse_events_error__handle(parse_state->error, + loc->first_column, + help, NULL); + } + return -EINVAL; + } + evsel->core.attr.precise_ip = precise; + } + if (mod.precise_max) + evsel->precise_max = 1; + if (mod.non_idle) + evsel->core.attr.exclude_idle = 1; + if (mod.sample_read) + evsel->sample_read = 1; + if (mod.pinned && evsel__is_group_leader(evsel)) + evsel->core.attr.pinned = 1; + if (mod.exclusive && evsel__is_group_leader(evsel)) + evsel->core.attr.exclusive = 1; + if (mod.weak) + evsel->weak_group = true; + if (mod.bpf) + evsel->bpf_counter = true; } - - /* - * precise ip: - * - * 0 - SAMPLE_IP can have arbitrary skid - * 1 - SAMPLE_IP must have constant skid - * 2 - SAMPLE_IP requested to have 0 skid - * 3 - SAMPLE_IP must have 0 skid - * - * See also PERF_RECORD_MISC_EXACT_IP - */ - if (precise > 3) - return -EINVAL; - - mod->eu = eu; - mod->ek = ek; - mod->eh = eh; - mod->eH = eH; - mod->eG = eG; - mod->eI = eI; - mod->precise = precise; - mod->precise_max = precise_max; - mod->exclude_GH = exclude_GH; - mod->sample_read = sample_read; - mod->pinned = pinned; - mod->weak = weak; - mod->bpf_counter = bpf_counter; - mod->exclusive = exclusive; - return 0; } -/* - * Basic modifier sanity check to validate it 
contains only one - * instance of any modifier (apart from 'p') present. - */ -static int check_modifier(char *str) +int parse_events__modifier_group(struct parse_events_state *parse_state, void *loc, + struct list_head *list, + struct parse_events_modifier mod) { - char *p = str; - - /* The sizeof includes 0 byte as well. */ - if (strlen(str) > (sizeof("ukhGHpppPSDIWeb") - 1)) - return -1; - - while (*p) { - if (*p != 'p' && strchr(p + 1, *p)) - return -1; - p++; - } - - return 0; + return parse_events__modifier_list(parse_state, loc, list, mod, /*group=*/true); } -int parse_events__modifier_event(struct list_head *list, char *str, bool add) +int parse_events__modifier_event(struct parse_events_state *parse_state, void *loc, + struct list_head *list, + struct parse_events_modifier mod) { - struct evsel *evsel; - struct event_modifier mod; - - if (str == NULL) - return 0; - - if (check_modifier(str)) - return -EINVAL; - - if (!add && get_event_modifier(&mod, str, NULL)) - return -EINVAL; - - __evlist__for_each_entry(list, evsel) { - if (add && get_event_modifier(&mod, str, evsel)) - return -EINVAL; - - evsel->core.attr.exclude_user = mod.eu; - evsel->core.attr.exclude_kernel = mod.ek; - evsel->core.attr.exclude_hv = mod.eh; - evsel->core.attr.precise_ip = mod.precise; - evsel->core.attr.exclude_host = mod.eH; - evsel->core.attr.exclude_guest = mod.eG; - evsel->core.attr.exclude_idle = mod.eI; - evsel->exclude_GH = mod.exclude_GH; - evsel->sample_read = mod.sample_read; - evsel->precise_max = mod.precise_max; - evsel->weak_group = mod.weak; - evsel->bpf_counter = mod.bpf_counter; - - if (evsel__is_group_leader(evsel)) { - evsel->core.attr.pinned = mod.pinned; - evsel->core.attr.exclusive = mod.exclusive; - } - } - - return 0; + return parse_events__modifier_list(parse_state, loc, list, mod, /*group=*/false); } -int parse_events_name(struct list_head *list, const char *name) +int parse_events__set_default_name(struct list_head *list, char *name) { struct evsel 
*evsel; + bool used_name = false; __evlist__for_each_entry(list, evsel) { if (!evsel->name) { - evsel->name = strdup(name); + evsel->name = used_name ? strdup(name) : name; + used_name = true; if (!evsel->name) return -ENOMEM; } } - + if (!used_name) + free(name); return 0; } @@ -2121,7 +2121,7 @@ static int parse_events__sort_events_and_fix_groups(struct list_head *list) int __parse_events(struct evlist *evlist, const char *str, const char *pmu_filter, struct parse_events_error *err, struct perf_pmu *fake_pmu, - bool warn_if_reordered) + bool warn_if_reordered, bool fake_tp) { struct parse_events_state parse_state = { .list = LIST_HEAD_INIT(parse_state.list), @@ -2129,6 +2129,7 @@ int __parse_events(struct evlist *evlist, const char *str, const char *pmu_filte .error = err, .stoken = PE_START_EVENTS, .fake_pmu = fake_pmu, + .fake_tp = fake_tp, .pmu_filter = pmu_filter, .match_legacy_cache_terms = true, }; @@ -2338,7 +2339,8 @@ int parse_events_option(const struct option *opt, const char *str, parse_events_error__init(&err); ret = __parse_events(*args->evlistp, str, args->pmu_filter, &err, - /*fake_pmu=*/NULL, /*warn_if_reordered=*/true); + /*fake_pmu=*/NULL, /*warn_if_reordered=*/true, + /*fake_tp=*/false); if (ret) { parse_events_error__print(&err, str); @@ -2576,7 +2578,7 @@ int parse_events_term__term(struct parse_events_term **term, } int parse_events_term__clone(struct parse_events_term **new, - struct parse_events_term *term) + const struct parse_events_term *term) { char *str; struct parse_events_term temp = *term; @@ -2691,15 +2693,6 @@ int parse_events_terms__to_strbuf(const struct parse_events_terms *terms, struct return 0; } -void parse_events_evlist_error(struct parse_events_state *parse_state, - int idx, const char *str) -{ - if (!parse_state->error) - return; - - parse_events_error__handle(parse_state->error, idx, strdup(str), NULL); -} - static void config_terms_list(char *buf, size_t buf_sz) { int i; diff --git a/tools/perf/util/parse-events.h 
b/tools/perf/util/parse-events.h index 809359e854..e13de2c8b7 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -32,14 +32,14 @@ int parse_events_option_new_evlist(const struct option *opt, const char *str, in __attribute__((nonnull(1, 2, 4))) int __parse_events(struct evlist *evlist, const char *str, const char *pmu_filter, struct parse_events_error *error, struct perf_pmu *fake_pmu, - bool warn_if_reordered); + bool warn_if_reordered, bool fake_tp); __attribute__((nonnull(1, 2, 3))) static inline int parse_events(struct evlist *evlist, const char *str, struct parse_events_error *err) { return __parse_events(evlist, str, /*pmu_filter=*/NULL, err, /*fake_pmu=*/NULL, - /*warn_if_reordered=*/true); + /*warn_if_reordered=*/true, /*fake_tp=*/false); } int parse_event(struct evlist *evlist, const char *str); @@ -152,6 +152,8 @@ struct parse_events_state { int stoken; /* Special fake PMU marker for testing. */ struct perf_pmu *fake_pmu; + /* Skip actual tracepoint processing for testing. */ + bool fake_tp; /* If non-null, when wildcard matching only match the given PMU. */ const char *pmu_filter; /* Should PE_LEGACY_NAME tokens be generated for config terms? 
*/ @@ -178,7 +180,7 @@ int parse_events_term__term(struct parse_events_term **term, enum parse_events__term_type term_rhs, void *loc_term, void *loc_val); int parse_events_term__clone(struct parse_events_term **new, - struct parse_events_term *term); + const struct parse_events_term *term); void parse_events_term__delete(struct parse_events_term *term); void parse_events_terms__delete(struct parse_events_terms *terms); @@ -186,33 +188,49 @@ void parse_events_terms__init(struct parse_events_terms *terms); void parse_events_terms__exit(struct parse_events_terms *terms); int parse_events_terms(struct parse_events_terms *terms, const char *str, FILE *input); int parse_events_terms__to_strbuf(const struct parse_events_terms *terms, struct strbuf *sb); -int parse_events__modifier_event(struct list_head *list, char *str, bool add); -int parse_events__modifier_group(struct list_head *list, char *event_mod); -int parse_events_name(struct list_head *list, const char *name); -int parse_events_add_tracepoint(struct list_head *list, int *idx, + +struct parse_events_modifier { + u8 precise; /* Number of repeated 'p' for precision. 
*/ + bool precise_max : 1; /* 'P' */ + bool non_idle : 1; /* 'I' */ + bool sample_read : 1; /* 'S' */ + bool pinned : 1; /* 'D' */ + bool exclusive : 1; /* 'e' */ + bool weak : 1; /* 'W' */ + bool bpf : 1; /* 'b' */ + bool user : 1; /* 'u' */ + bool kernel : 1; /* 'k' */ + bool hypervisor : 1; /* 'h' */ + bool guest : 1; /* 'G' */ + bool host : 1; /* 'H' */ +}; + +int parse_events__modifier_event(struct parse_events_state *parse_state, void *loc, + struct list_head *list, struct parse_events_modifier mod); +int parse_events__modifier_group(struct parse_events_state *parse_state, void *loc, + struct list_head *list, struct parse_events_modifier mod); +int parse_events__set_default_name(struct list_head *list, char *name); +int parse_events_add_tracepoint(struct parse_events_state *parse_state, + struct list_head *list, const char *sys, const char *event, struct parse_events_error *error, struct parse_events_terms *head_config, void *loc); int parse_events_add_numeric(struct parse_events_state *parse_state, struct list_head *list, u32 type, u64 config, - struct parse_events_terms *head_config, + const struct parse_events_terms *head_config, bool wildcard); int parse_events_add_tool(struct parse_events_state *parse_state, struct list_head *list, int tool_event); int parse_events_add_cache(struct list_head *list, int *idx, const char *name, struct parse_events_state *parse_state, - struct parse_events_terms *head_config); + struct parse_events_terms *parsed_terms); int parse_events__decode_legacy_cache(const char *name, int pmu_type, __u64 *config); int parse_events_add_breakpoint(struct parse_events_state *parse_state, struct list_head *list, u64 addr, char *type, u64 len, struct parse_events_terms *head_config); -int parse_events_add_pmu(struct parse_events_state *parse_state, - struct list_head *list, const char *name, - const struct parse_events_terms *const_parsed_terms, - bool auto_merge_stats, void *loc); struct evsel *parse_events__add_event(int idx, struct 
perf_event_attr *attr, const char *name, const char *metric_id, @@ -223,18 +241,20 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state, const struct parse_events_terms *const_parsed_terms, struct list_head **listp, void *loc); +int parse_events_multi_pmu_add_or_add_pmu(struct parse_events_state *parse_state, + const char *event_or_pmu, + const struct parse_events_terms *const_parsed_terms, + struct list_head **listp, + void *loc_); + void parse_events__set_leader(char *name, struct list_head *list); -void parse_events_update_lists(struct list_head *list_event, - struct list_head *list_all); -void parse_events_evlist_error(struct parse_events_state *parse_state, - int idx, const char *str); struct event_symbol { const char *symbol; const char *alias; }; -extern struct event_symbol event_symbols_hw[]; -extern struct event_symbol event_symbols_sw[]; +extern const struct event_symbol event_symbols_hw[]; +extern const struct event_symbol event_symbols_sw[]; char *parse_events_formats_error_string(char *additional_terms); diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index e86c45675e..16045c383a 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -18,26 +18,34 @@ char *parse_events_get_text(yyscan_t yyscanner); YYSTYPE *parse_events_get_lval(yyscan_t yyscanner); +int parse_events_get_column(yyscan_t yyscanner); +int parse_events_get_leng(yyscan_t yyscanner); -static int __value(YYSTYPE *yylval, char *str, int base, int token) +static int get_column(yyscan_t scanner) { - u64 num; - - errno = 0; - num = strtoull(str, NULL, base); - if (errno) - return PE_ERROR; - - yylval->num = num; - return token; + return parse_events_get_column(scanner) - parse_events_get_leng(scanner); } -static int value(yyscan_t scanner, int base) +static int value(struct parse_events_state *parse_state, yyscan_t scanner, int base) { YYSTYPE *yylval = parse_events_get_lval(scanner); char *text = 
parse_events_get_text(scanner); + u64 num; - return __value(yylval, text, base, PE_VALUE); + errno = 0; + num = strtoull(text, NULL, base); + if (errno) { + struct parse_events_error *error = parse_state->error; + char *help = NULL; + + if (asprintf(&help, "Bad base %d number \"%s\"", base, text) > 0) + parse_events_error__handle(error, get_column(scanner), help , NULL); + + return PE_ERROR; + } + + yylval->num = num; + return PE_VALUE; } static int str(yyscan_t scanner, int token) @@ -88,6 +96,11 @@ static int drv_str(yyscan_t scanner, int token) return token; } +/* + * Use yyless to return all the characaters to the input. Update the column for + * location debugging. If __alloc is non-zero set yylval to the text for the + * returned token's value. + */ #define REWIND(__alloc) \ do { \ YYSTYPE *__yylval = parse_events_get_lval(yyscanner); \ @@ -134,6 +147,77 @@ static int hw_term(yyscan_t scanner, int config) return PE_TERM_HW; } +static void modifiers_error(struct parse_events_state *parse_state, yyscan_t scanner, + int pos, char mod_char, const char *mod_name) +{ + struct parse_events_error *error = parse_state->error; + char *help = NULL; + + if (asprintf(&help, "Duplicate modifier '%c' (%s)", mod_char, mod_name) > 0) + parse_events_error__handle(error, get_column(scanner) + pos, help , NULL); +} + +static int modifiers(struct parse_events_state *parse_state, yyscan_t scanner) +{ + YYSTYPE *yylval = parse_events_get_lval(scanner); + char *text = parse_events_get_text(scanner); + struct parse_events_modifier mod = { .precise = 0, }; + + for (size_t i = 0, n = strlen(text); i < n; i++) { +#define CASE(c, field) \ + case c: \ + if (mod.field) { \ + modifiers_error(parse_state, scanner, i, c, #field); \ + return PE_ERROR; \ + } \ + mod.field = true; \ + break + + switch (text[i]) { + CASE('u', user); + CASE('k', kernel); + CASE('h', hypervisor); + CASE('I', non_idle); + CASE('G', guest); + CASE('H', host); + case 'p': + mod.precise++; + /* + * precise ip: + * + * 
0 - SAMPLE_IP can have arbitrary skid + * 1 - SAMPLE_IP must have constant skid + * 2 - SAMPLE_IP requested to have 0 skid + * 3 - SAMPLE_IP must have 0 skid + * + * See also PERF_RECORD_MISC_EXACT_IP + */ + if (mod.precise > 3) { + struct parse_events_error *error = parse_state->error; + char *help = strdup("Maximum precise value is 3"); + + if (help) { + parse_events_error__handle(error, get_column(scanner) + i, + help , NULL); + } + return PE_ERROR; + } + break; + CASE('P', precise_max); + CASE('S', sample_read); + CASE('D', pinned); + CASE('W', weak); + CASE('e', exclusive); + CASE('b', bpf); + default: + return PE_ERROR; + } +#undef CASE + } + yylval->mod = mod; + return PE_MODIFIER_EVENT; +} + #define YY_USER_ACTION \ do { \ yylloc->last_column = yylloc->first_column; \ @@ -158,15 +242,15 @@ event [^,{}/]+ num_dec [0-9]+ num_hex 0x[a-fA-F0-9]{1,16} num_raw_hex [a-fA-F0-9]{1,16} -name [a-zA-Z_*?\[\]][a-zA-Z0-9_*?.\[\]!\-]* -name_tag [\'][a-zA-Z_*?\[\]][a-zA-Z0-9_*?\-,\.\[\]:=]*[\'] +name [a-zA-Z0-9_*?\[\]][a-zA-Z0-9_*?.\[\]!\-]* +name_tag [\'][a-zA-Z0-9_*?\[\]][a-zA-Z0-9_*?\-,\.\[\]:=]*[\'] name_minus [a-zA-Z_*?][a-zA-Z0-9\-_*?.:]* drv_cfg_term [a-zA-Z0-9_\.]+(=[a-zA-Z0-9_*?\.:]+)? /* * If you add a modifier you need to update check_modifier(). * Also, the letters in modifier_event must not be in modifier_bp. 
*/ -modifier_event [ukhpPGHSDIWeb]+ +modifier_event [ukhpPGHSDIWeb]{1,15} modifier_bp [rwx]{1,3} lc_type (L1-dcache|l1-d|l1d|L1-data|L1-icache|l1-i|l1i|L1-instruction|LLC|L2|dTLB|d-tlb|Data-TLB|iTLB|i-tlb|Instruction-TLB|branch|branches|bpu|btb|bpc|node) lc_op_result (load|loads|read|store|stores|write|prefetch|prefetches|speculative-read|speculative-load|refs|Reference|ops|access|misses|miss) @@ -283,8 +367,8 @@ r0x{num_raw_hex} { return str(yyscanner, PE_RAW); } */ "/"/{digit} { return PE_BP_SLASH; } "/"/{non_digit} { BEGIN(config); return '/'; } -{num_dec} { return value(yyscanner, 10); } -{num_hex} { return value(yyscanner, 16); } +{num_dec} { return value(_parse_state, yyscanner, 10); } +{num_hex} { return value(_parse_state, yyscanner, 16); } /* * We need to separate 'mem:' scanner part, in order to get specific * modifier bits parsed out. Otherwise we would need to handle PE_NAME @@ -330,10 +414,10 @@ cgroup-switches { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CG {lc_type}-{lc_op_result}-{lc_op_result} { return str(yyscanner, PE_LEGACY_CACHE); } mem: { BEGIN(mem); return PE_PREFIX_MEM; } r{num_raw_hex} { return str(yyscanner, PE_RAW); } -{num_dec} { return value(yyscanner, 10); } -{num_hex} { return value(yyscanner, 16); } +{num_dec} { return value(_parse_state, yyscanner, 10); } +{num_hex} { return value(_parse_state, yyscanner, 16); } -{modifier_event} { return str(yyscanner, PE_MODIFIER_EVENT); } +{modifier_event} { return modifiers(_parse_state, yyscanner); } {name} { return str(yyscanner, PE_NAME); } {name_tag} { return str(yyscanner, PE_NAME); } "/" { BEGIN(config); return '/'; } diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index d70f5d84af..b3c51f06cb 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -69,12 +69,12 @@ static void free_list_evsel(struct list_head* list_evsel) %type PE_VALUE_SYM_HW %type PE_VALUE_SYM_SW %type PE_VALUE_SYM_TOOL +%type PE_MODIFIER_EVENT %type 
PE_TERM %type value_sym %type PE_RAW %type PE_NAME %type PE_LEGACY_CACHE -%type PE_MODIFIER_EVENT %type PE_MODIFIER_BP %type PE_EVENT_NAME %type PE_DRV_CFG_TERM @@ -111,6 +111,7 @@ static void free_list_evsel(struct list_head* list_evsel) { char *str; u64 num; + struct parse_events_modifier mod; enum parse_events__term_type term_type; struct list_head *list_evsel; struct parse_events_terms *list_terms; @@ -126,6 +127,10 @@ static void free_list_evsel(struct list_head* list_evsel) } %% + /* + * Entry points. We are either parsing events or terminals. Just terminal + * parsing is used for parsing events in sysfs. + */ start: PE_START_EVENTS start_events | @@ -133,31 +138,36 @@ PE_START_TERMS start_terms start_events: groups { + /* Take the parsed events, groups.. and place into parse_state. */ + struct list_head *groups = $1; struct parse_events_state *parse_state = _parse_state; - /* frees $1 */ - parse_events_update_lists($1, &parse_state->list); + list_splice_tail(groups, &parse_state->list); + free(groups); } -groups: +groups: /* A list of groups or events. */ groups ',' group { - struct list_head *list = $1; - struct list_head *group = $3; + /* Merge group into the list of events/groups. */ + struct list_head *groups = $1; + struct list_head *group = $3; - /* frees $3 */ - parse_events_update_lists(group, list); - $$ = list; + list_splice_tail(group, groups); + free(group); + $$ = groups; } | groups ',' event { - struct list_head *list = $1; + /* Merge event into the list of events/groups. */ + struct list_head *groups = $1; struct list_head *event = $3; - /* frees $3 */ - parse_events_update_lists(event, list); - $$ = list; + + list_splice_tail(event, groups); + free(event); + $$ = groups; } | group @@ -167,20 +177,13 @@ event group: group_def ':' PE_MODIFIER_EVENT { + /* Apply the modifier to the events in the group_def. 
*/ struct list_head *list = $1; int err; - err = parse_events__modifier_group(list, $3); - free($3); - if (err) { - struct parse_events_state *parse_state = _parse_state; - struct parse_events_error *error = parse_state->error; - - parse_events_error__handle(error, @3.first_column, - strdup("Bad modifier"), NULL); - free_list_evsel(list); + err = parse_events__modifier_group(_parse_state, &@3, list, $3); + if (err) YYABORT; - } $$ = list; } | @@ -191,7 +194,10 @@ PE_NAME '{' events '}' { struct list_head *list = $3; - /* Takes ownership of $1. */ + /* + * Set the first entry of list to be the leader. Set the group name on + * the leader to $1 taking ownership. + */ parse_events__set_leader($1, list); $$ = list; } @@ -200,6 +206,7 @@ PE_NAME '{' events '}' { struct list_head *list = $2; + /* Set the first entry of list to be the leader clearing the group name. */ parse_events__set_leader(NULL, list); $$ = list; } @@ -207,12 +214,12 @@ PE_NAME '{' events '}' events: events ',' event { + struct list_head *events = $1; struct list_head *event = $3; - struct list_head *list = $1; - /* frees $3 */ - parse_events_update_lists(event, list); - $$ = list; + list_splice_tail(event, events); + free(event); + $$ = events; } | event @@ -230,17 +237,9 @@ event_name PE_MODIFIER_EVENT * (there could be more events added for multiple tracepoint * definitions via '*?'. 
*/ - err = parse_events__modifier_event(list, $2, false); - free($2); - if (err) { - struct parse_events_state *parse_state = _parse_state; - struct parse_events_error *error = parse_state->error; - - parse_events_error__handle(error, @2.first_column, - strdup("Bad modifier"), NULL); - free_list_evsel(list); + err = parse_events__modifier_event(_parse_state, &@2, list, $2); + if (err) YYABORT; - } $$ = list; } | @@ -249,10 +248,14 @@ event_name event_name: PE_EVENT_NAME event_def { - int err; + /* + * When an event is parsed the text is rewound and the entire text of + * the event is set to the str of PE_EVENT_NAME token matched here. If + * no name was on an event via a term, set the name to the entire text + * taking ownership of the allocation. + */ + int err = parse_events__set_default_name($2, $1); - err = parse_events_name($2, $1); - free($1); if (err) { free_list_evsel($2); YYNOMEM; @@ -273,78 +276,15 @@ event_def: event_pmu | event_pmu: PE_NAME opt_pmu_config { - struct parse_events_state *parse_state = _parse_state; /* List of created evsels. */ struct list_head *list = NULL; - char *pattern = NULL; + int err = parse_events_multi_pmu_add_or_add_pmu(_parse_state, $1, $2, &list, &@1); -#define CLEANUP \ - do { \ - parse_events_terms__delete($2); \ - free(list); \ - free($1); \ - free(pattern); \ - } while(0) - - list = alloc_list(); - if (!list) { - CLEANUP; - YYNOMEM; - } - /* Attempt to add to list assuming $1 is a PMU name. */ - if (parse_events_add_pmu(parse_state, list, $1, $2, /*auto_merge_stats=*/false, &@1)) { - struct perf_pmu *pmu = NULL; - int ok = 0; - - /* Failure to add, try wildcard expansion of $1 as a PMU name. 
*/ - if (asprintf(&pattern, "%s*", $1) < 0) { - CLEANUP; - YYNOMEM; - } - - while ((pmu = perf_pmus__scan(pmu)) != NULL) { - const char *name = pmu->name; - - if (parse_events__filter_pmu(parse_state, pmu)) - continue; - - if (!strncmp(name, "uncore_", 7) && - strncmp($1, "uncore_", 7)) - name += 7; - if (!perf_pmu__match(pattern, name, $1) || - !perf_pmu__match(pattern, pmu->alias_name, $1)) { - bool auto_merge_stats = perf_pmu__auto_merge_stats(pmu); - - if (!parse_events_add_pmu(parse_state, list, pmu->name, $2, - auto_merge_stats, &@1)) { - ok++; - parse_state->wild_card_pmus = true; - } - } - } - - if (!ok) { - /* Failure to add, assume $1 is an event name. */ - zfree(&list); - ok = !parse_events_multi_pmu_add(parse_state, $1, $2, &list, &@1); - } - if (!ok) { - struct parse_events_error *error = parse_state->error; - char *help; - - if (asprintf(&help, "Unable to find PMU or event on a PMU of '%s'", $1) < 0) - help = NULL; - parse_events_error__handle(error, @1.first_column, - strdup("Bad event or PMU"), - help); - CLEANUP; - YYABORT; - } - } + parse_events_terms__delete($2); + free($1); + if (err) + PE_ABORT(err); $$ = list; - list = NULL; - CLEANUP; -#undef CLEANUP } | PE_NAME sep_dc @@ -537,7 +477,7 @@ tracepoint_name opt_event_config if (!list) YYNOMEM; - err = parse_events_add_tracepoint(list, &parse_state->idx, $1.sys, $1.event, + err = parse_events_add_tracepoint(parse_state, list, $1.sys, $1.event, error, $2, &@1); parse_events_terms__delete($2); @@ -666,6 +606,11 @@ event_term } name_or_raw: PE_RAW | PE_NAME | PE_LEGACY_CACHE +| +PE_TERM_HW +{ + $$ = $1.str; +} event_term: PE_RAW @@ -707,20 +652,6 @@ name_or_raw '=' PE_VALUE $$ = term; } | -name_or_raw '=' PE_TERM_HW -{ - struct parse_events_term *term; - int err = parse_events_term__str(&term, PARSE_EVENTS__TERM_TYPE_USER, - $1, $3.str, &@1, &@3); - - if (err) { - free($1); - free($3.str); - PE_ABORT(err); - } - $$ = term; -} -| PE_LEGACY_CACHE { struct parse_events_term *term; @@ -773,18 +704,6 @@ 
PE_TERM '=' name_or_raw $$ = term; } | -PE_TERM '=' PE_TERM_HW -{ - struct parse_events_term *term; - int err = parse_events_term__str(&term, $1, /*config=*/NULL, $3.str, &@1, &@3); - - if (err) { - free($3.str); - PE_ABORT(err); - } - $$ = term; -} -| PE_TERM '=' PE_TERM { struct parse_events_term *term; @@ -845,9 +764,15 @@ sep_slash_slash_dc: '/' '/' | ':' | %% -void parse_events_error(YYLTYPE *loc, void *parse_state, +void parse_events_error(YYLTYPE *loc, void *_parse_state, void *scanner __maybe_unused, char const *msg __maybe_unused) { - parse_events_evlist_error(parse_state, loc->last_column, "parser error"); + struct parse_events_state *parse_state = _parse_state; + + if (!parse_state->error || !list_empty(&parse_state->error->list)) + return; + + parse_events_error__handle(parse_state->error, loc->last_column, + strdup("Unrecognized input"), NULL); } diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index cc349d9cb0..888ce99122 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -194,7 +194,7 @@ static void perf_pmu_format__load(const struct perf_pmu *pmu, struct perf_pmu_fo * Parse & process all the sysfs attributes located under * the directory specified in 'dir' parameter. */ -int perf_pmu__format_parse(struct perf_pmu *pmu, int dirfd, bool eager_load) +static int perf_pmu__format_parse(struct perf_pmu *pmu, int dirfd, bool eager_load) { struct dirent *evt_ent; DIR *format_dir; @@ -244,7 +244,7 @@ int perf_pmu__format_parse(struct perf_pmu *pmu, int dirfd, bool eager_load) * located at: * /sys/bus/event_source/devices//format as sysfs group attributes. 
*/ -static int pmu_format(struct perf_pmu *pmu, int dirfd, const char *name) +static int pmu_format(struct perf_pmu *pmu, int dirfd, const char *name, bool eager_load) { int fd; @@ -253,7 +253,7 @@ static int pmu_format(struct perf_pmu *pmu, int dirfd, const char *name) return 0; /* it'll close the fd */ - if (perf_pmu__format_parse(pmu, fd, /*eager_load=*/false)) + if (perf_pmu__format_parse(pmu, fd, eager_load)) return -1; return 0; @@ -551,7 +551,8 @@ static int perf_pmu__new_alias(struct perf_pmu *pmu, const char *name, unit = pe->unit; perpkg = pe->perpkg; deprecated = pe->deprecated; - pmu_name = pe->pmu; + if (pe->pmu && strcmp(pe->pmu, "default_core")) + pmu_name = pe->pmu; } alias = zalloc(sizeof(*alias)); @@ -634,33 +635,18 @@ static inline bool pmu_alias_info_file(const char *name) * Reading the pmu event aliases definition, which should be located at: * /sys/bus/event_source/devices//events as sysfs group attributes. */ -static int pmu_aliases_parse(struct perf_pmu *pmu) +static int __pmu_aliases_parse(struct perf_pmu *pmu, int events_dir_fd) { - char path[PATH_MAX]; struct dirent *evt_ent; DIR *event_dir; - size_t len; - int fd, dir_fd; - - len = perf_pmu__event_source_devices_scnprintf(path, sizeof(path)); - if (!len) - return 0; - scnprintf(path + len, sizeof(path) - len, "%s/events", pmu->name); - - dir_fd = open(path, O_DIRECTORY); - if (dir_fd == -1) { - pmu->sysfs_aliases_loaded = true; - return 0; - } - event_dir = fdopendir(dir_fd); - if (!event_dir){ - close (dir_fd); + event_dir = fdopendir(events_dir_fd); + if (!event_dir) return -EINVAL; - } while ((evt_ent = readdir(event_dir))) { char *name = evt_ent->d_name; + int fd; FILE *file; if (!strcmp(name, ".") || !strcmp(name, "..")) @@ -672,7 +658,7 @@ static int pmu_aliases_parse(struct perf_pmu *pmu) if (pmu_alias_info_file(name)) continue; - fd = openat(dir_fd, name, O_RDONLY); + fd = openat(events_dir_fd, name, O_RDONLY); if (fd == -1) { pr_debug("Cannot open %s\n", name); continue; @@ 
-691,11 +677,50 @@ static int pmu_aliases_parse(struct perf_pmu *pmu) } closedir(event_dir); - close (dir_fd); pmu->sysfs_aliases_loaded = true; return 0; } +static int pmu_aliases_parse(struct perf_pmu *pmu) +{ + char path[PATH_MAX]; + size_t len; + int events_dir_fd, ret; + + if (pmu->sysfs_aliases_loaded) + return 0; + + len = perf_pmu__event_source_devices_scnprintf(path, sizeof(path)); + if (!len) + return 0; + scnprintf(path + len, sizeof(path) - len, "%s/events", pmu->name); + + events_dir_fd = open(path, O_DIRECTORY); + if (events_dir_fd == -1) { + pmu->sysfs_aliases_loaded = true; + return 0; + } + ret = __pmu_aliases_parse(pmu, events_dir_fd); + close(events_dir_fd); + return ret; +} + +static int pmu_aliases_parse_eager(struct perf_pmu *pmu, int sysfs_fd) +{ + char path[FILENAME_MAX + 7]; + int ret, events_dir_fd; + + scnprintf(path, sizeof(path), "%s/events", pmu->name); + events_dir_fd = openat(sysfs_fd, path, O_DIRECTORY, 0); + if (events_dir_fd == -1) { + pmu->sysfs_aliases_loaded = true; + return 0; + } + ret = __pmu_aliases_parse(pmu, events_dir_fd); + close(events_dir_fd); + return ret; +} + static int pmu_alias_terms(struct perf_pmu_alias *alias, int err_loc, struct list_head *terms) { struct parse_events_term *term, *cloned; @@ -1034,7 +1059,8 @@ perf_pmu__arch_init(struct perf_pmu *pmu) pmu->mem_events = perf_mem_events; } -struct perf_pmu *perf_pmu__lookup(struct list_head *pmus, int dirfd, const char *name) +struct perf_pmu *perf_pmu__lookup(struct list_head *pmus, int dirfd, const char *name, + bool eager_load) { struct perf_pmu *pmu; __u32 type; @@ -1063,7 +1089,7 @@ struct perf_pmu *perf_pmu__lookup(struct list_head *pmus, int dirfd, const char * type value and format definitions. Load both right * now. 
*/ - if (pmu_format(pmu, dirfd, name)) + if (pmu_format(pmu, dirfd, name, eager_load)) goto err; pmu->is_core = is_pmu_core(name); @@ -1087,6 +1113,9 @@ struct perf_pmu *perf_pmu__lookup(struct list_head *pmus, int dirfd, const char perf_pmu__arch_init(pmu); + if (eager_load) + pmu_aliases_parse_eager(pmu, dirfd); + return pmu; err: zfree(&pmu->name); @@ -1649,6 +1678,62 @@ bool perf_pmu__has_format(const struct perf_pmu *pmu, const char *name) return false; } +int perf_pmu__for_each_format(struct perf_pmu *pmu, void *state, pmu_format_callback cb) +{ + static const char *const terms[] = { + "config=0..0xffffffffffffffff", + "config1=0..0xffffffffffffffff", + "config2=0..0xffffffffffffffff", + "config3=0..0xffffffffffffffff", + "name=string", + "period=number", + "freq=number", + "branch_type=(u|k|hv|any|...)", + "time", + "call-graph=(fp|dwarf|lbr)", + "stack-size=number", + "max-stack=number", + "nr=number", + "inherit", + "no-inherit", + "overwrite", + "no-overwrite", + "percore", + "aux-output", + "aux-sample-size=number", + }; + struct perf_pmu_format *format; + int ret; + + /* + * max-events and driver-config are missing above as are the internal + * types user, metric-id, raw, legacy cache and hardware. Assert against + * the enum parse_events__term_type so they are kept in sync. 
+ */ + _Static_assert(ARRAY_SIZE(terms) == __PARSE_EVENTS__TERM_TYPE_NR - 6, + "perf_pmu__for_each_format()'s terms must be kept in sync with enum parse_events__term_type"); + list_for_each_entry(format, &pmu->format, list) { + perf_pmu_format__load(pmu, format); + ret = cb(state, format->name, (int)format->value, format->bits); + if (ret) + return ret; + } + if (!pmu->is_core) + return 0; + + for (size_t i = 0; i < ARRAY_SIZE(terms); i++) { + int config = PERF_PMU_FORMAT_VALUE_CONFIG; + + if (i < PERF_PMU_FORMAT_VALUE_CONFIG_END) + config = i; + + ret = cb(state, terms[i], config, /*bits=*/NULL); + if (ret) + return ret; + } + return 0; +} + bool is_pmu_core(const char *name) { return !strcmp(name, "cpu") || !strcmp(name, "cpum_cf") || is_sysfs_pmu_core(name); @@ -1744,8 +1829,12 @@ int perf_pmu__for_each_event(struct perf_pmu *pmu, bool skip_duplicate_pmus, pmu_add_cpu_aliases(pmu); list_for_each_entry(event, &pmu->aliases, list) { size_t buf_used; + int pmu_name_len; info.pmu_name = event->pmu_name ?: pmu->name; + pmu_name_len = skip_duplicate_pmus + ? 
pmu_name_len_no_suffix(info.pmu_name, /*num=*/NULL) + : (int)strlen(info.pmu_name); info.alias = NULL; if (event->desc) { info.name = event->name; @@ -1770,7 +1859,7 @@ int perf_pmu__for_each_event(struct perf_pmu *pmu, bool skip_duplicate_pmus, info.encoding_desc = buf + buf_used; parse_events_terms__to_strbuf(&event->terms, &sb); buf_used += snprintf(buf + buf_used, sizeof(buf) - buf_used, - "%s/%s/", info.pmu_name, sb.buf) + 1; + "%.*s/%s/", pmu_name_len, info.pmu_name, sb.buf) + 1; info.topic = event->topic; info.str = sb.buf; info.deprecated = event->deprecated; @@ -2051,18 +2140,29 @@ void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config, name ?: "N/A", buf, config_name, config); } -int perf_pmu__match(const char *pattern, const char *name, const char *tok) +bool perf_pmu__match(const struct perf_pmu *pmu, const char *tok) { - if (!name) - return -1; + const char *name = pmu->name; + bool need_fnmatch = strchr(tok, '*') != NULL; - if (fnmatch(pattern, name, 0)) - return -1; + if (!strncmp(tok, "uncore_", 7)) + tok += 7; + if (!strncmp(name, "uncore_", 7)) + name += 7; - if (tok && !perf_pmu__match_ignoring_suffix(name, tok)) - return -1; + if (perf_pmu__match_ignoring_suffix(name, tok) || + (need_fnmatch && !fnmatch(tok, name, 0))) + return true; - return 0; + name = pmu->alias_name; + if (!name) + return false; + + if (!strncmp(name, "uncore_", 7)) + name += 7; + + return perf_pmu__match_ignoring_suffix(name, tok) || + (need_fnmatch && !fnmatch(tok, name, 0)); } double __weak perf_pmu__cpu_slots_per_cycle(void) diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 77c59ebc05..b2d3fd291f 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -198,6 +198,8 @@ struct pmu_event_info { }; typedef int (*pmu_event_callback)(void *state, struct pmu_event_info *info); +typedef int (*pmu_format_callback)(void *state, const char *name, int config, + const unsigned long *bits); void pmu_add_sys_aliases(struct perf_pmu *pmu); int 
perf_pmu__config(struct perf_pmu *pmu, struct perf_event_attr *attr, @@ -214,9 +216,9 @@ int perf_pmu__check_alias(struct perf_pmu *pmu, struct parse_events_terms *head_ struct parse_events_error *err); int perf_pmu__find_event(struct perf_pmu *pmu, const char *event, void *state, pmu_event_callback cb); -int perf_pmu__format_parse(struct perf_pmu *pmu, int dirfd, bool eager_load); void perf_pmu_format__set_value(void *format, int config, unsigned long *bits); bool perf_pmu__has_format(const struct perf_pmu *pmu, const char *name); +int perf_pmu__for_each_format(struct perf_pmu *pmu, void *state, pmu_format_callback cb); bool is_pmu_core(const char *name); bool perf_pmu__supports_legacy_cache(const struct perf_pmu *pmu); @@ -262,7 +264,7 @@ void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config, const char *config_name); void perf_pmu__warn_invalid_formats(struct perf_pmu *pmu); -int perf_pmu__match(const char *pattern, const char *name, const char *tok); +bool perf_pmu__match(const struct perf_pmu *pmu, const char *tok); double perf_pmu__cpu_slots_per_cycle(void); int perf_pmu__event_source_devices_scnprintf(char *pathname, size_t size); @@ -271,7 +273,8 @@ int perf_pmu__pathname_scnprintf(char *buf, size_t size, int perf_pmu__event_source_devices_fd(void); int perf_pmu__pathname_fd(int dirfd, const char *pmu_name, const char *filename, int flags); -struct perf_pmu *perf_pmu__lookup(struct list_head *pmus, int dirfd, const char *lookup_name); +struct perf_pmu *perf_pmu__lookup(struct list_head *pmus, int dirfd, const char *lookup_name, + bool eager_load); struct perf_pmu *perf_pmu__create_placeholder_core_pmu(struct list_head *core_pmus); void perf_pmu__delete(struct perf_pmu *pmu); struct perf_pmu *perf_pmus__find_core_pmu(void); diff --git a/tools/perf/util/pmus.c b/tools/perf/util/pmus.c index 16505071d3..6907e3e7fb 100644 --- a/tools/perf/util/pmus.c +++ b/tools/perf/util/pmus.c @@ -16,6 +16,7 @@ #include "pmus.h" #include "pmu.h" #include 
"print-events.h" +#include "strbuf.h" /* * core_pmus: A PMU belongs to core_pmus if it's name is "cpu" or it's sysfs @@ -123,7 +124,8 @@ struct perf_pmu *perf_pmus__find(const char *name) return NULL; dirfd = perf_pmu__event_source_devices_fd(); - pmu = perf_pmu__lookup(core_pmu ? &core_pmus : &other_pmus, dirfd, name); + pmu = perf_pmu__lookup(core_pmu ? &core_pmus : &other_pmus, dirfd, name, + /*eager_load=*/false); close(dirfd); if (!pmu) { @@ -158,7 +160,8 @@ static struct perf_pmu *perf_pmu__find2(int dirfd, const char *name) if (core_pmu && read_sysfs_core_pmus) return NULL; - return perf_pmu__lookup(core_pmu ? &core_pmus : &other_pmus, dirfd, name); + return perf_pmu__lookup(core_pmu ? &core_pmus : &other_pmus, dirfd, name, + /*eager_load=*/false); } static int pmus_cmp(void *priv __maybe_unused, @@ -474,8 +477,8 @@ void perf_pmus__print_pmu_events(const struct print_callbacks *print_cb, void *p qsort(aliases, len, sizeof(struct sevent), cmp_sevent); for (int j = 0; j < len; j++) { /* Skip duplicates */ - if (j > 0 && pmu_alias_is_duplicate(&aliases[j], &aliases[j - 1])) - continue; + if (j < len - 1 && pmu_alias_is_duplicate(&aliases[j], &aliases[j + 1])) + goto free; print_cb->print_event(print_state, aliases[j].pmu_name, @@ -488,6 +491,7 @@ void perf_pmus__print_pmu_events(const struct print_callbacks *print_cb, void *p aliases[j].desc, aliases[j].long_desc, aliases[j].encoding_desc); +free: zfree(&aliases[j].name); zfree(&aliases[j].alias); zfree(&aliases[j].scale_unit); @@ -503,6 +507,99 @@ void perf_pmus__print_pmu_events(const struct print_callbacks *print_cb, void *p zfree(&aliases); } +struct build_format_string_args { + struct strbuf short_string; + struct strbuf long_string; + int num_formats; +}; + +static int build_format_string(void *state, const char *name, int config, + const unsigned long *bits) +{ + struct build_format_string_args *args = state; + unsigned int num_bits; + int ret1, ret2 = 0; + + (void)config; + args->num_formats++; + if 
(args->num_formats > 1) { + strbuf_addch(&args->long_string, ','); + if (args->num_formats < 4) + strbuf_addch(&args->short_string, ','); + } + num_bits = bits ? bitmap_weight(bits, PERF_PMU_FORMAT_BITS) : 0; + if (num_bits <= 1) { + ret1 = strbuf_addf(&args->long_string, "%s", name); + if (args->num_formats < 4) + ret2 = strbuf_addf(&args->short_string, "%s", name); + } else if (num_bits > 8) { + ret1 = strbuf_addf(&args->long_string, "%s=0..0x%llx", name, + ULLONG_MAX >> (64 - num_bits)); + if (args->num_formats < 4) { + ret2 = strbuf_addf(&args->short_string, "%s=0..0x%llx", name, + ULLONG_MAX >> (64 - num_bits)); + } + } else { + ret1 = strbuf_addf(&args->long_string, "%s=0..%llu", name, + ULLONG_MAX >> (64 - num_bits)); + if (args->num_formats < 4) { + ret2 = strbuf_addf(&args->short_string, "%s=0..%llu", name, + ULLONG_MAX >> (64 - num_bits)); + } + } + return ret1 < 0 ? ret1 : (ret2 < 0 ? ret2 : 0); +} + +void perf_pmus__print_raw_pmu_events(const struct print_callbacks *print_cb, void *print_state) +{ + bool skip_duplicate_pmus = print_cb->skip_duplicate_pmus(print_state); + struct perf_pmu *(*scan_fn)(struct perf_pmu *); + struct perf_pmu *pmu = NULL; + + if (skip_duplicate_pmus) + scan_fn = perf_pmus__scan_skip_duplicates; + else + scan_fn = perf_pmus__scan; + + while ((pmu = scan_fn(pmu)) != NULL) { + struct build_format_string_args format_args = { + .short_string = STRBUF_INIT, + .long_string = STRBUF_INIT, + .num_formats = 0, + }; + int len = pmu_name_len_no_suffix(pmu->name, /*num=*/NULL); + const char *desc = "(see 'man perf-list' or 'man perf-record' on how to encode it)"; + + if (!pmu->is_core) + desc = NULL; + + strbuf_addf(&format_args.short_string, "%.*s/", len, pmu->name); + strbuf_addf(&format_args.long_string, "%.*s/", len, pmu->name); + perf_pmu__for_each_format(pmu, &format_args, build_format_string); + + if (format_args.num_formats > 3) + strbuf_addf(&format_args.short_string, ",.../modifier"); + else + 
strbuf_addf(&format_args.short_string, "/modifier"); + + strbuf_addf(&format_args.long_string, "/modifier"); + print_cb->print_event(print_state, + /*topic=*/NULL, + /*pmu_name=*/NULL, + format_args.short_string.buf, + /*event_alias=*/NULL, + /*scale_unit=*/NULL, + /*deprecated=*/false, + "Raw event descriptor", + desc, + /*long_desc=*/NULL, + format_args.long_string.buf); + + strbuf_release(&format_args.short_string); + strbuf_release(&format_args.long_string); + } +} + bool perf_pmus__have_event(const char *pname, const char *name) { struct perf_pmu *pmu = perf_pmus__find(pname); @@ -602,3 +699,13 @@ struct perf_pmu *perf_pmus__find_core_pmu(void) { return perf_pmus__scan_core(NULL); } + +struct perf_pmu *perf_pmus__add_test_pmu(int test_sysfs_dirfd, const char *name) +{ + /* + * Some PMU functions read from the sysfs mount point, so care is + * needed, hence passing the eager_load flag to load things like the + * format files. + */ + return perf_pmu__lookup(&other_pmus, test_sysfs_dirfd, name, /*eager_load=*/true); +} diff --git a/tools/perf/util/pmus.h b/tools/perf/util/pmus.h index 94d2a08d89..9d4ded80b8 100644 --- a/tools/perf/util/pmus.h +++ b/tools/perf/util/pmus.h @@ -18,9 +18,12 @@ struct perf_pmu *perf_pmus__scan_core(struct perf_pmu *pmu); const struct perf_pmu *perf_pmus__pmu_for_pmu_filter(const char *str); void perf_pmus__print_pmu_events(const struct print_callbacks *print_cb, void *print_state); +void perf_pmus__print_raw_pmu_events(const struct print_callbacks *print_cb, void *print_state); bool perf_pmus__have_event(const char *pname, const char *name); int perf_pmus__num_core_pmus(void); bool perf_pmus__supports_extended_type(void); char *perf_pmus__default_pmu_name(void); +struct perf_pmu *perf_pmus__add_test_pmu(int test_sysfs_dirfd, const char *name); + #endif /* __PMUS_H */ diff --git a/tools/perf/util/print-events.c b/tools/perf/util/print-events.c index 7b54e93854..3f38c27f01 100644 --- a/tools/perf/util/print-events.c +++ 
b/tools/perf/util/print-events.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -38,7 +39,7 @@ static const char * const event_type_descriptors[] = { "Software event", "Tracepoint event", "Hardware cache event", - "Raw hardware event descriptor", + "Raw event descriptor", "Hardware breakpoint", }; @@ -92,34 +93,48 @@ void print_tracepoint_events(const struct print_callbacks *print_cb __maybe_unus evt_items = scandirat(events_fd, sys_dirent->d_name, &evt_namelist, NULL, alphasort); for (int j = 0; j < evt_items; j++) { + /* + * Buffer sized at twice the max filename length + 1 + * separator + 1 \0 terminator. + */ + char buf[NAME_MAX * 2 + 2]; + /* 16 possible hex digits and 22 other characters and \0. */ + char encoding[16 + 22]; struct dirent *evt_dirent = evt_namelist[j]; - char evt_path[MAXPATHLEN]; - int evt_fd; + struct io id; + __u64 config; if (evt_dirent->d_type != DT_DIR || !strcmp(evt_dirent->d_name, ".") || !strcmp(evt_dirent->d_name, "..")) goto next_evt; - snprintf(evt_path, sizeof(evt_path), "%s/id", evt_dirent->d_name); - evt_fd = openat(dir_fd, evt_path, O_RDONLY); - if (evt_fd < 0) + snprintf(buf, sizeof(buf), "%s/id", evt_dirent->d_name); + io__init(&id, openat(dir_fd, buf, O_RDONLY), buf, sizeof(buf)); + + if (id.fd < 0) + goto next_evt; + + if (io__get_dec(&id, &config) < 0) { + close(id.fd); goto next_evt; - close(evt_fd); + } + close(id.fd); - snprintf(evt_path, MAXPATHLEN, "%s:%s", + snprintf(buf, sizeof(buf), "%s:%s", sys_dirent->d_name, evt_dirent->d_name); + snprintf(encoding, sizeof(encoding), "tracepoint/config=0x%llx/", config); print_cb->print_event(print_state, /*topic=*/NULL, - /*pmu_name=*/NULL, - evt_path, + /*pmu_name=*/NULL, /* really "tracepoint" */ + /*event_name=*/buf, /*event_alias=*/NULL, /*scale_unit=*/NULL, /*deprecated=*/false, "Tracepoint event", /*desc=*/NULL, /*long_desc=*/NULL, - /*encoding_desc=*/NULL); + encoding); next_evt: free(evt_namelist[j]); } @@ -401,8 +416,6 @@ void 
print_symbol_events(const struct print_callbacks *print_cb, void *print_sta */ void print_events(const struct print_callbacks *print_cb, void *print_state) { - char *tmp; - print_symbol_events(print_cb, print_state, PERF_TYPE_HARDWARE, event_symbols_hw, PERF_COUNT_HW_MAX); print_symbol_events(print_cb, print_state, PERF_TYPE_SOFTWARE, @@ -426,21 +439,7 @@ void print_events(const struct print_callbacks *print_cb, void *print_state) /*long_desc=*/NULL, /*encoding_desc=*/NULL); - if (asprintf(&tmp, "%s/t1=v1[,t2=v2,t3 ...]/modifier", - perf_pmus__scan_core(/*pmu=*/NULL)->name) > 0) { - print_cb->print_event(print_state, - /*topic=*/NULL, - /*pmu_name=*/NULL, - tmp, - /*event_alias=*/NULL, - /*scale_unit=*/NULL, - /*deprecated=*/false, - event_type_descriptors[PERF_TYPE_RAW], - "(see 'man perf-list' on how to encode it)", - /*long_desc=*/NULL, - /*encoding_desc=*/NULL); - free(tmp); - } + perf_pmus__print_raw_pmu_events(print_cb, print_state); print_cb->print_event(print_state, /*topic=*/NULL, diff --git a/tools/perf/util/print_insn.c b/tools/perf/util/print_insn.c index 459e0e93d7..a950e9157d 100644 --- a/tools/perf/util/print_insn.c +++ b/tools/perf/util/print_insn.c @@ -4,6 +4,7 @@ * * Author(s): Changbin Du */ +#include #include #include #include "debug.h" @@ -12,6 +13,9 @@ #include "machine.h" #include "thread.h" #include "print_insn.h" +#include "dump-insn.h" +#include "map.h" +#include "dso.h" size_t sample__fprintf_insn_raw(struct perf_sample *sample, FILE *fp) { @@ -28,12 +32,12 @@ size_t sample__fprintf_insn_raw(struct perf_sample *sample, FILE *fp) #ifdef HAVE_LIBCAPSTONE_SUPPORT #include -static int capstone_init(struct machine *machine, csh *cs_handle) +static int capstone_init(struct machine *machine, csh *cs_handle, bool is64) { cs_arch arch; cs_mode mode; - if (machine__is(machine, "x86_64")) { + if (machine__is(machine, "x86_64") && is64) { arch = CS_ARCH_X86; mode = CS_MODE_64; } else if (machine__normalized_is(machine, "x86")) { @@ -69,8 +73,8 @@ 
static int capstone_init(struct machine *machine, csh *cs_handle) return 0; } -static size_t print_insn_x86(struct perf_sample *sample, struct thread *thread, - cs_insn *insn, FILE *fp) +static size_t print_insn_x86(struct thread *thread, u8 cpumode, cs_insn *insn, + int print_opts, FILE *fp) { struct addr_location al; size_t printed = 0; @@ -80,9 +84,11 @@ static size_t print_insn_x86(struct perf_sample *sample, struct thread *thread, addr_location__init(&al); if (op->type == X86_OP_IMM && - thread__find_symbol(thread, sample->cpumode, op->imm, &al)) { + thread__find_symbol(thread, cpumode, op->imm, &al)) { printed += fprintf(fp, "%s ", insn[0].mnemonic); printed += symbol__fprintf_symname_offs(al.sym, &al, fp); + if (print_opts & PRINT_INSN_IMM_HEX) + printed += fprintf(fp, " [%#" PRIx64 "]", op->imm); addr_location__exit(&al); return printed; } @@ -93,42 +99,71 @@ static size_t print_insn_x86(struct perf_sample *sample, struct thread *thread, return printed; } -size_t sample__fprintf_insn_asm(struct perf_sample *sample, struct thread *thread, - struct machine *machine, FILE *fp) +static bool is64bitip(struct machine *machine, struct addr_location *al) { - csh cs_handle; + const struct dso *dso = al->map ? map__dso(al->map) : NULL; + + if (dso) + return dso__is_64_bit(dso); + + return machine__is(machine, "x86_64") || + machine__normalized_is(machine, "arm64") || + machine__normalized_is(machine, "s390"); +} + +ssize_t fprintf_insn_asm(struct machine *machine, struct thread *thread, u8 cpumode, + bool is64bit, const uint8_t *code, size_t code_size, + uint64_t ip, int *lenp, int print_opts, FILE *fp) +{ + size_t printed; cs_insn *insn; + csh cs_handle; size_t count; - size_t printed = 0; int ret; /* TODO: Try to initiate capstone only once but need a proper place. 
*/ - ret = capstone_init(machine, &cs_handle); - if (ret < 0) { - /* fallback */ - return sample__fprintf_insn_raw(sample, fp); - } + ret = capstone_init(machine, &cs_handle, is64bit); + if (ret < 0) + return ret; - count = cs_disasm(cs_handle, (uint8_t *)sample->insn, sample->insn_len, - sample->ip, 1, &insn); + count = cs_disasm(cs_handle, code, code_size, ip, 1, &insn); if (count > 0) { if (machine__normalized_is(machine, "x86")) - printed += print_insn_x86(sample, thread, &insn[0], fp); + printed = print_insn_x86(thread, cpumode, &insn[0], print_opts, fp); else - printed += fprintf(fp, "%s %s", insn[0].mnemonic, insn[0].op_str); + printed = fprintf(fp, "%s %s", insn[0].mnemonic, insn[0].op_str); + if (lenp) + *lenp = insn->size; cs_free(insn, count); } else { - printed += fprintf(fp, "illegal instruction"); + printed = -1; } cs_close(&cs_handle); return printed; } + +size_t sample__fprintf_insn_asm(struct perf_sample *sample, struct thread *thread, + struct machine *machine, FILE *fp, + struct addr_location *al) +{ + bool is64bit = is64bitip(machine, al); + ssize_t printed; + + printed = fprintf_insn_asm(machine, thread, sample->cpumode, is64bit, + (uint8_t *)sample->insn, sample->insn_len, + sample->ip, NULL, 0, fp); + if (printed < 0) + return sample__fprintf_insn_raw(sample, fp); + + return printed; +} #else size_t sample__fprintf_insn_asm(struct perf_sample *sample __maybe_unused, struct thread *thread __maybe_unused, struct machine *machine __maybe_unused, - FILE *fp __maybe_unused) + FILE *fp __maybe_unused, + struct addr_location *al __maybe_unused) { return 0; } diff --git a/tools/perf/util/print_insn.h b/tools/perf/util/print_insn.h index 465bdcfcc2..07d11af3fc 100644 --- a/tools/perf/util/print_insn.h +++ b/tools/perf/util/print_insn.h @@ -8,9 +8,15 @@ struct perf_sample; struct thread; struct machine; +struct perf_insn; + +#define PRINT_INSN_IMM_HEX (1<<0) size_t sample__fprintf_insn_asm(struct perf_sample *sample, struct thread *thread, - struct 
machine *machine, FILE *fp); + struct machine *machine, FILE *fp, struct addr_location *al); size_t sample__fprintf_insn_raw(struct perf_sample *sample, FILE *fp); +ssize_t fprintf_insn_asm(struct machine *machine, struct thread *thread, u8 cpumode, + bool is64bit, const uint8_t *code, size_t code_size, + uint64_t ip, int *lenp, int print_opts, FILE *fp); #endif /* PERF_PRINT_INSN_H */ diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 5c12459e97..a17c9b8a7a 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -159,8 +159,8 @@ static int kernel_get_module_map_cb(struct map *map, void *data) { struct kernel_get_module_map_cb_args *args = data; struct dso *dso = map__dso(map); - const char *short_name = dso->short_name; /* short_name is "[module]" */ - u16 short_name_len = dso->short_name_len; + const char *short_name = dso__short_name(dso); + u16 short_name_len = dso__short_name_len(dso); if (strncmp(short_name + 1, args->module, short_name_len - 2) == 0 && args->module[short_name_len - 2] == '\0') { @@ -202,10 +202,9 @@ struct map *get_target_map(const char *target, struct nsinfo *nsi, bool user) map = dso__new_map(target); dso = map ? 
map__dso(map) : NULL; if (dso) { - mutex_lock(&dso->lock); - nsinfo__put(dso->nsinfo); - dso->nsinfo = nsinfo__get(nsi); - mutex_unlock(&dso->lock); + mutex_lock(dso__lock(dso)); + dso__set_nsinfo(dso, nsinfo__get(nsi)); + mutex_unlock(dso__lock(dso)); } return map; } else { @@ -236,7 +235,7 @@ static int convert_exec_to_group(const char *exec, char **result) } } - ret = e_snprintf(buf, 64, "%s_%s", PERFPROBE_GROUP, ptr1); + ret = e_snprintf(buf, sizeof(buf), "%s_%s", PERFPROBE_GROUP, ptr1); if (ret < 0) goto out; @@ -368,11 +367,11 @@ static int kernel_get_module_dso(const char *module, struct dso **pdso) map = machine__kernel_map(host_machine); dso = map__dso(map); - if (!dso->has_build_id) + if (!dso__has_build_id(dso)) dso__read_running_kernel_build_id(dso, host_machine); vmlinux_name = symbol_conf.vmlinux_name; - dso->load_errno = 0; + *dso__load_errno(dso) = 0; if (vmlinux_name) ret = dso__load_vmlinux(dso, map, vmlinux_name, false); else @@ -499,7 +498,7 @@ static struct debuginfo *open_from_debuginfod(struct dso *dso, struct nsinfo *ns if (!c) return NULL; - build_id__sprintf(&dso->bid, sbuild_id); + build_id__sprintf(dso__bid(dso), sbuild_id); fd = debuginfod_find_debuginfo(c, (const unsigned char *)sbuild_id, 0, &path); if (fd >= 0) @@ -542,7 +541,7 @@ static struct debuginfo *open_debuginfo(const char *module, struct nsinfo *nsi, if (!module || !strchr(module, '/')) { err = kernel_get_module_dso(module, &dso); if (err < 0) { - if (!dso || dso->load_errno == 0) { + if (!dso || *dso__load_errno(dso) == 0) { if (!str_error_r(-err, reason, STRERR_BUFSIZE)) strcpy(reason, "(unknown)"); } else @@ -559,7 +558,7 @@ static struct debuginfo *open_debuginfo(const char *module, struct nsinfo *nsi, } return NULL; } - path = dso->long_name; + path = dso__long_name(dso); } nsinfo__mountns_enter(nsi, &nsc); ret = debuginfo__new(path); @@ -2758,7 +2757,7 @@ static int get_new_event_name(char *buf, size_t len, const char *base, /* Try no suffix number */ ret = 
e_snprintf(buf, len, "%s%s", nbase, ret_event ? "__return" : ""); if (ret < 0) { - pr_debug("snprintf() failed: %d\n", ret); + pr_warning("snprintf() failed: %d; the event name nbase='%s' is too long\n", ret, nbase); goto out; } if (!strlist__has_entry(namelist, buf)) @@ -2867,7 +2866,7 @@ static int probe_trace_event__set_name(struct probe_trace_event *tev, group = PERFPROBE_GROUP; /* Get an unused new event name */ - ret = get_new_event_name(buf, 64, event, namelist, + ret = get_new_event_name(buf, sizeof(buf), event, namelist, tev->point.retprobe, allow_suffix); if (ret < 0) return ret; @@ -3795,8 +3794,8 @@ int show_available_funcs(const char *target, struct nsinfo *nsi, /* Show all (filtered) symbols */ setup_pager(); - for (size_t i = 0; i < dso->symbol_names_len; i++) { - struct symbol *pos = dso->symbol_names[i]; + for (size_t i = 0; i < dso__symbol_names_len(dso); i++) { + struct symbol *pos = dso__symbol_names(dso)[i]; if (strfilter__compare(_filter, pos->name)) printf("%s\n", pos->name); diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index c8923375e3..630e16c54e 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -186,8 +186,6 @@ static_var: return ret2; } -#define BYTES_TO_BITS(nb) ((nb) * BITS_PER_LONG / sizeof(long)) - static int convert_variable_type(Dwarf_Die *vr_die, struct probe_trace_arg *tvar, const char *cast, bool user_access) @@ -217,7 +215,7 @@ static int convert_variable_type(Dwarf_Die *vr_die, total = dwarf_bytesize(vr_die); if (boffs < 0 || total < 0) return -ENOENT; - ret = snprintf(buf, 16, "b%d@%d/%zd", bsize, boffs, + ret = snprintf(buf, 16, "b%d@%d/%d", bsize, boffs, BYTES_TO_BITS(total)); goto formatted; } diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c index 87e817b3cf..e867de8dda 100644 --- a/tools/perf/util/record.c +++ b/tools/perf/util/record.c @@ -237,7 +237,7 @@ bool evlist__can_select_event(struct evlist *evlist, const char *str) evsel = 
evlist__last(temp_evlist); - if (!evlist || perf_cpu_map__has_any_cpu_or_is_empty(evlist->core.user_requested_cpus)) { + if (!evlist || perf_cpu_map__is_any_cpu_or_is_empty(evlist->core.user_requested_cpus)) { struct perf_cpu_map *cpus = perf_cpu_map__new_online_cpus(); if (cpus) diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c index b072ac5d3b..e16257d5ab 100644 --- a/tools/perf/util/scripting-engines/trace-event-perl.c +++ b/tools/perf/util/scripting-engines/trace-event-perl.c @@ -320,10 +320,10 @@ static SV *perl_process_callchain(struct perf_sample *sample, const char *dsoname = "[unknown]"; if (dso) { - if (symbol_conf.show_kernel_path && dso->long_name) - dsoname = dso->long_name; + if (symbol_conf.show_kernel_path && dso__long_name(dso)) + dsoname = dso__long_name(dso); else - dsoname = dso->name; + dsoname = dso__name(dso); } if (!hv_stores(elem, "dso", newSVpv(dsoname,0))) { hv_undef(elem); diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index b4f0f60e60..fb00f3ad68 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -45,6 +45,7 @@ #include "../thread.h" #include "../comm.h" #include "../machine.h" +#include "../mem-info.h" #include "../db-export.h" #include "../thread-stack.h" #include "../trace-event.h" @@ -393,10 +394,10 @@ static const char *get_dsoname(struct map *map) struct dso *dso = map ? 
map__dso(map) : NULL; if (dso) { - if (symbol_conf.show_kernel_path && dso->long_name) - dsoname = dso->long_name; + if (symbol_conf.show_kernel_path && dso__long_name(dso)) + dsoname = dso__long_name(dso); else - dsoname = dso->name; + dsoname = dso__name(dso); } return dsoname; @@ -720,15 +721,20 @@ static void set_sample_read_in_dict(PyObject *dict_sample, } static void set_sample_datasrc_in_dict(PyObject *dict, - struct perf_sample *sample) + struct perf_sample *sample) { - struct mem_info mi = { .data_src.val = sample->data_src }; + struct mem_info *mi = mem_info__new(); char decode[100]; + if (!mi) + Py_FatalError("couldn't create mem-info"); + pydict_set_item_string_decref(dict, "datasrc", PyLong_FromUnsignedLongLong(sample->data_src)); - perf_script__meminfo_scnprintf(decode, 100, &mi); + mem_info__data_src(mi)->val = sample->data_src; + perf_script__meminfo_scnprintf(decode, 100, mi); + mem_info__put(mi); pydict_set_item_string_decref(dict, "datasrc_decode", _PyUnicode_FromString(decode)); @@ -799,8 +805,9 @@ static void set_sym_in_dict(PyObject *dict, struct addr_location *al, if (al->map) { struct dso *dso = map__dso(al->map); - pydict_set_item_string_decref(dict, dso_field, _PyUnicode_FromString(dso->name)); - build_id__sprintf(&dso->bid, sbuild_id); + pydict_set_item_string_decref(dict, dso_field, + _PyUnicode_FromString(dso__name(dso))); + build_id__sprintf(dso__bid(dso), sbuild_id); pydict_set_item_string_decref(dict, dso_bid_field, _PyUnicode_FromString(sbuild_id)); pydict_set_item_string_decref(dict, dso_map_start, @@ -1246,14 +1253,14 @@ static int python_export_dso(struct db_export *dbe, struct dso *dso, char sbuild_id[SBUILD_ID_SIZE]; PyObject *t; - build_id__sprintf(&dso->bid, sbuild_id); + build_id__sprintf(dso__bid(dso), sbuild_id); t = tuple_new(5); - tuple_set_d64(t, 0, dso->db_id); + tuple_set_d64(t, 0, dso__db_id(dso)); tuple_set_d64(t, 1, machine->db_id); - tuple_set_string(t, 2, dso->short_name); - tuple_set_string(t, 3, 
dso->long_name); + tuple_set_string(t, 2, dso__short_name(dso)); + tuple_set_string(t, 3, dso__long_name(dso)); tuple_set_string(t, 4, sbuild_id); call_object(tables->dso_handler, t, "dso_table"); @@ -1273,7 +1280,7 @@ static int python_export_symbol(struct db_export *dbe, struct symbol *sym, t = tuple_new(6); tuple_set_d64(t, 0, *sym_db_id); - tuple_set_d64(t, 1, dso->db_id); + tuple_set_d64(t, 1, dso__db_id(dso)); tuple_set_d64(t, 2, sym->start); tuple_set_d64(t, 3, sym->end); tuple_set_s32(t, 4, sym->binding); @@ -1699,13 +1706,15 @@ static void python_process_stat(struct perf_stat_config *config, { struct perf_thread_map *threads = counter->core.threads; struct perf_cpu_map *cpus = counter->core.cpus; - int cpu, thread; - for (thread = 0; thread < perf_thread_map__nr(threads); thread++) { - for (cpu = 0; cpu < perf_cpu_map__nr(cpus); cpu++) { - process_stat(counter, perf_cpu_map__cpu(cpus, cpu), + for (int thread = 0; thread < perf_thread_map__nr(threads); thread++) { + int idx; + struct perf_cpu cpu; + + perf_cpu_map__for_each_cpu(cpu, idx, cpus) { + process_stat(counter, cpu, perf_thread_map__pid(threads, thread), tstamp, - perf_counts(counter->counts, cpu, thread)); + perf_counts(counter->counts, idx, thread)); } } } diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 06d0bd7fb4..a10343b9dc 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -2749,6 +2749,7 @@ int perf_session__cpu_bitmap(struct perf_session *session, int i, err = -1; struct perf_cpu_map *map; int nr_cpus = min(session->header.env.nr_cpus_avail, MAX_NR_CPUS); + struct perf_cpu cpu; for (i = 0; i < PERF_TYPE_MAX; ++i) { struct evsel *evsel; @@ -2770,9 +2771,7 @@ int perf_session__cpu_bitmap(struct perf_session *session, return -1; } - for (i = 0; i < perf_cpu_map__nr(map); i++) { - struct perf_cpu cpu = perf_cpu_map__cpu(map, i); - + perf_cpu_map__for_each_cpu(cpu, i, map) { if (cpu.cpu >= nr_cpus) { pr_err("Requested CPU %d too large. 
" "Consider raising MAX_NR_CPUS\n", cpu.cpu); @@ -2917,3 +2916,24 @@ int perf_event__process_id_index(struct perf_session *session, } return 0; } + +int perf_session__dsos_hit_all(struct perf_session *session) +{ + struct rb_node *nd; + int err; + + err = machine__hit_all_dsos(&session->machines.host); + if (err) + return err; + + for (nd = rb_first_cached(&session->machines.guests); nd; + nd = rb_next(nd)) { + struct machine *pos = rb_entry(nd, struct machine, rb_node); + + err = machine__hit_all_dsos(pos); + if (err) + return err; + } + + return 0; +} diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 5064c6ec11..3b0256e977 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -156,6 +156,8 @@ int perf_session__deliver_synth_event(struct perf_session *session, union perf_event *event, struct perf_sample *sample); +int perf_session__dsos_hit_all(struct perf_session *session); + int perf_event__process_id_index(struct perf_session *session, union perf_event *event); diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 92a1bd695e..ab7c7ff35f 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -23,6 +23,7 @@ #include "strlist.h" #include "strbuf.h" #include "mem-events.h" +#include "mem-info.h" #include "annotate.h" #include "annotate-data.h" #include "event.h" @@ -239,11 +240,11 @@ static int64_t _sort__dso_cmp(struct map *map_l, struct map *map_r) return cmp_null(dso_r, dso_l); if (verbose > 0) { - dso_name_l = dso_l->long_name; - dso_name_r = dso_r->long_name; + dso_name_l = dso__long_name(dso_l); + dso_name_r = dso__long_name(dso_r); } else { - dso_name_l = dso_l->short_name; - dso_name_r = dso_r->short_name; + dso_name_l = dso__short_name(dso_l); + dso_name_r = dso__short_name(dso_r); } return strcmp(dso_name_l, dso_name_r); @@ -262,7 +263,7 @@ static int _hist_entry__dso_snprintf(struct map *map, char *bf, const char *dso_name = "[unknown]"; if (dso) - dso_name = verbose > 0 ? 
dso->long_name : dso->short_name; + dso_name = verbose > 0 ? dso__long_name(dso) : dso__short_name(dso); return repsep_snprintf(bf, size, "%-*.*s", width, width, dso_name); } @@ -333,7 +334,7 @@ sort__sym_cmp(struct hist_entry *left, struct hist_entry *right) * comparing symbol address alone is not enough since it's a * relative address within a dso. */ - if (!hists__has(left->hists, dso) || hists__has(right->hists, dso)) { + if (!hists__has(left->hists, dso)) { ret = sort__dso_cmp(left, right); if (ret != 0) return ret; @@ -364,7 +365,7 @@ static int _hist_entry__sym_snprintf(struct map_symbol *ms, char o = dso ? dso__symtab_origin(dso) : '!'; u64 rip = ip; - if (dso && dso->kernel && dso->adjust_symbols) + if (dso && dso__kernel(dso) && dso__adjust_symbols(dso)) rip = map__unmap_ip(map, ip); ret += repsep_snprintf(bf, size, "%-#*llx %c ", @@ -1364,9 +1365,9 @@ sort__daddr_cmp(struct hist_entry *left, struct hist_entry *right) uint64_t l = 0, r = 0; if (left->mem_info) - l = left->mem_info->daddr.addr; + l = mem_info__daddr(left->mem_info)->addr; if (right->mem_info) - r = right->mem_info->daddr.addr; + r = mem_info__daddr(right->mem_info)->addr; return (int64_t)(r - l); } @@ -1378,8 +1379,8 @@ static int hist_entry__daddr_snprintf(struct hist_entry *he, char *bf, struct map_symbol *ms = NULL; if (he->mem_info) { - addr = he->mem_info->daddr.addr; - ms = &he->mem_info->daddr.ms; + addr = mem_info__daddr(he->mem_info)->addr; + ms = &mem_info__daddr(he->mem_info)->ms; } return _hist_entry__sym_snprintf(ms, addr, he->level, bf, size, width); } @@ -1390,9 +1391,9 @@ sort__iaddr_cmp(struct hist_entry *left, struct hist_entry *right) uint64_t l = 0, r = 0; if (left->mem_info) - l = left->mem_info->iaddr.addr; + l = mem_info__iaddr(left->mem_info)->addr; if (right->mem_info) - r = right->mem_info->iaddr.addr; + r = mem_info__iaddr(right->mem_info)->addr; return (int64_t)(r - l); } @@ -1404,8 +1405,8 @@ static int hist_entry__iaddr_snprintf(struct hist_entry *he, char 
*bf, struct map_symbol *ms = NULL; if (he->mem_info) { - addr = he->mem_info->iaddr.addr; - ms = &he->mem_info->iaddr.ms; + addr = mem_info__iaddr(he->mem_info)->addr; + ms = &mem_info__iaddr(he->mem_info)->ms; } return _hist_entry__sym_snprintf(ms, addr, he->level, bf, size, width); } @@ -1417,9 +1418,9 @@ sort__dso_daddr_cmp(struct hist_entry *left, struct hist_entry *right) struct map *map_r = NULL; if (left->mem_info) - map_l = left->mem_info->daddr.ms.map; + map_l = mem_info__daddr(left->mem_info)->ms.map; if (right->mem_info) - map_r = right->mem_info->daddr.ms.map; + map_r = mem_info__daddr(right->mem_info)->ms.map; return _sort__dso_cmp(map_l, map_r); } @@ -1430,7 +1431,7 @@ static int hist_entry__dso_daddr_snprintf(struct hist_entry *he, char *bf, struct map *map = NULL; if (he->mem_info) - map = he->mem_info->daddr.ms.map; + map = mem_info__daddr(he->mem_info)->ms.map; return _hist_entry__dso_snprintf(map, bf, size, width); } @@ -1442,12 +1443,12 @@ sort__locked_cmp(struct hist_entry *left, struct hist_entry *right) union perf_mem_data_src data_src_r; if (left->mem_info) - data_src_l = left->mem_info->data_src; + data_src_l = *mem_info__data_src(left->mem_info); else data_src_l.mem_lock = PERF_MEM_LOCK_NA; if (right->mem_info) - data_src_r = right->mem_info->data_src; + data_src_r = *mem_info__data_src(right->mem_info); else data_src_r.mem_lock = PERF_MEM_LOCK_NA; @@ -1470,12 +1471,12 @@ sort__tlb_cmp(struct hist_entry *left, struct hist_entry *right) union perf_mem_data_src data_src_r; if (left->mem_info) - data_src_l = left->mem_info->data_src; + data_src_l = *mem_info__data_src(left->mem_info); else data_src_l.mem_dtlb = PERF_MEM_TLB_NA; if (right->mem_info) - data_src_r = right->mem_info->data_src; + data_src_r = *mem_info__data_src(right->mem_info); else data_src_r.mem_dtlb = PERF_MEM_TLB_NA; @@ -1498,12 +1499,12 @@ sort__lvl_cmp(struct hist_entry *left, struct hist_entry *right) union perf_mem_data_src data_src_r; if (left->mem_info) - data_src_l = 
left->mem_info->data_src; + data_src_l = *mem_info__data_src(left->mem_info); else data_src_l.mem_lvl = PERF_MEM_LVL_NA; if (right->mem_info) - data_src_r = right->mem_info->data_src; + data_src_r = *mem_info__data_src(right->mem_info); else data_src_r.mem_lvl = PERF_MEM_LVL_NA; @@ -1526,12 +1527,12 @@ sort__snoop_cmp(struct hist_entry *left, struct hist_entry *right) union perf_mem_data_src data_src_r; if (left->mem_info) - data_src_l = left->mem_info->data_src; + data_src_l = *mem_info__data_src(left->mem_info); else data_src_l.mem_snoop = PERF_MEM_SNOOP_NA; if (right->mem_info) - data_src_r = right->mem_info->data_src; + data_src_r = *mem_info__data_src(right->mem_info); else data_src_r.mem_snoop = PERF_MEM_SNOOP_NA; @@ -1562,8 +1563,8 @@ sort__dcacheline_cmp(struct hist_entry *left, struct hist_entry *right) if (left->cpumode > right->cpumode) return -1; if (left->cpumode < right->cpumode) return 1; - l_map = left->mem_info->daddr.ms.map; - r_map = right->mem_info->daddr.ms.map; + l_map = mem_info__daddr(left->mem_info)->ms.map; + r_map = mem_info__daddr(right->mem_info)->ms.map; /* if both are NULL, jump to sort on al_addr instead */ if (!l_map && !r_map) @@ -1586,8 +1587,8 @@ sort__dcacheline_cmp(struct hist_entry *left, struct hist_entry *right) */ if ((left->cpumode != PERF_RECORD_MISC_KERNEL) && - (!(map__flags(l_map) & MAP_SHARED)) && !l_dso->id.maj && !l_dso->id.min && - !l_dso->id.ino && !l_dso->id.ino_generation) { + (!(map__flags(l_map) & MAP_SHARED)) && !dso__id(l_dso)->maj && !dso__id(l_dso)->min && + !dso__id(l_dso)->ino && !dso__id(l_dso)->ino_generation) { /* userspace anonymous */ if (thread__pid(left->thread) > thread__pid(right->thread)) @@ -1598,8 +1599,8 @@ sort__dcacheline_cmp(struct hist_entry *left, struct hist_entry *right) addr: /* al_addr does all the right addr - start + offset calculations */ - l = cl_address(left->mem_info->daddr.al_addr, chk_double_cl); - r = cl_address(right->mem_info->daddr.al_addr, chk_double_cl); + l = 
cl_address(mem_info__daddr(left->mem_info)->al_addr, chk_double_cl); + r = cl_address(mem_info__daddr(right->mem_info)->al_addr, chk_double_cl); if (l > r) return -1; if (l < r) return 1; @@ -1616,17 +1617,18 @@ static int hist_entry__dcacheline_snprintf(struct hist_entry *he, char *bf, char level = he->level; if (he->mem_info) { - struct map *map = he->mem_info->daddr.ms.map; + struct map *map = mem_info__daddr(he->mem_info)->ms.map; struct dso *dso = map ? map__dso(map) : NULL; - addr = cl_address(he->mem_info->daddr.al_addr, chk_double_cl); - ms = &he->mem_info->daddr.ms; + addr = cl_address(mem_info__daddr(he->mem_info)->al_addr, chk_double_cl); + ms = &mem_info__daddr(he->mem_info)->ms; /* print [s] for shared data mmaps */ if ((he->cpumode != PERF_RECORD_MISC_KERNEL) && map && !(map__prot(map) & PROT_EXEC) && (map__flags(map) & MAP_SHARED) && - (dso->id.maj || dso->id.min || dso->id.ino || dso->id.ino_generation)) + (dso__id(dso)->maj || dso__id(dso)->min || dso__id(dso)->ino || + dso__id(dso)->ino_generation)) level = 's'; else if (!map) level = 'X'; @@ -1804,12 +1806,12 @@ sort__blocked_cmp(struct hist_entry *left, struct hist_entry *right) union perf_mem_data_src data_src_r; if (left->mem_info) - data_src_l = left->mem_info->data_src; + data_src_l = *mem_info__data_src(left->mem_info); else data_src_l.mem_blk = PERF_MEM_BLK_NA; if (right->mem_info) - data_src_r = right->mem_info->data_src; + data_src_r = *mem_info__data_src(right->mem_info); else data_src_r.mem_blk = PERF_MEM_BLK_NA; @@ -1838,9 +1840,9 @@ sort__phys_daddr_cmp(struct hist_entry *left, struct hist_entry *right) uint64_t l = 0, r = 0; if (left->mem_info) - l = left->mem_info->daddr.phys_addr; + l = mem_info__daddr(left->mem_info)->phys_addr; if (right->mem_info) - r = right->mem_info->daddr.phys_addr; + r = mem_info__daddr(right->mem_info)->phys_addr; return (int64_t)(r - l); } @@ -1852,7 +1854,7 @@ static int hist_entry__phys_daddr_snprintf(struct hist_entry *he, char *bf, size_t ret = 0; 
size_t len = BITS_PER_LONG / 4; - addr = he->mem_info->daddr.phys_addr; + addr = mem_info__daddr(he->mem_info)->phys_addr; ret += repsep_snprintf(bf + ret, size - ret, "[%c] ", he->level); @@ -1879,9 +1881,9 @@ sort__data_page_size_cmp(struct hist_entry *left, struct hist_entry *right) uint64_t l = 0, r = 0; if (left->mem_info) - l = left->mem_info->daddr.data_page_size; + l = mem_info__daddr(left->mem_info)->data_page_size; if (right->mem_info) - r = right->mem_info->daddr.data_page_size; + r = mem_info__daddr(right->mem_info)->data_page_size; return (int64_t)(r - l); } @@ -1892,7 +1894,7 @@ static int hist_entry__data_page_size_snprintf(struct hist_entry *he, char *bf, char str[PAGE_SIZE_NAME_LEN]; return repsep_snprintf(bf, size, "%-*s", width, - get_page_size_name(he->mem_info->daddr.data_page_size, str)); + get_page_size_name(mem_info__daddr(he->mem_info)->data_page_size, str)); } struct sort_entry sort_mem_data_page_size = { @@ -2441,6 +2443,13 @@ static struct hpp_dimension hpp_sort_dimensions[] = { DIM(PERF_HPP__OVERHEAD_ACC, "overhead_children"), DIM(PERF_HPP__SAMPLES, "sample"), DIM(PERF_HPP__PERIOD, "period"), + DIM(PERF_HPP__WEIGHT1, "weight1"), + DIM(PERF_HPP__WEIGHT2, "weight2"), + DIM(PERF_HPP__WEIGHT3, "weight3"), + /* aliases for weight_struct */ + DIM(PERF_HPP__WEIGHT2, "ins_lat"), + DIM(PERF_HPP__WEIGHT3, "retire_lat"), + DIM(PERF_HPP__WEIGHT3, "p_stage_cyc"), }; #undef DIM @@ -3743,26 +3752,29 @@ void sort__setup_elide(FILE *output) } } -int output_field_add(struct perf_hpp_list *list, char *tok) +int output_field_add(struct perf_hpp_list *list, const char *tok) { unsigned int i; - for (i = 0; i < ARRAY_SIZE(common_sort_dimensions); i++) { - struct sort_dimension *sd = &common_sort_dimensions[i]; + for (i = 0; i < ARRAY_SIZE(hpp_sort_dimensions); i++) { + struct hpp_dimension *hd = &hpp_sort_dimensions[i]; - if (!sd->name || strncasecmp(tok, sd->name, strlen(tok))) + if (strncasecmp(tok, hd->name, strlen(tok))) continue; - return 
__sort_dimension__add_output(list, sd); + if (!strcasecmp(tok, "weight")) + ui__warning("--fields weight shows the average value unlike in the --sort key.\n"); + + return __hpp_dimension__add_output(list, hd); } - for (i = 0; i < ARRAY_SIZE(hpp_sort_dimensions); i++) { - struct hpp_dimension *hd = &hpp_sort_dimensions[i]; + for (i = 0; i < ARRAY_SIZE(common_sort_dimensions); i++) { + struct sort_dimension *sd = &common_sort_dimensions[i]; - if (strncasecmp(tok, hd->name, strlen(tok))) + if (!sd->name || strncasecmp(tok, sd->name, strlen(tok))) continue; - return __hpp_dimension__add_output(list, hd); + return __sort_dimension__add_output(list, sd); } for (i = 0; i < ARRAY_SIZE(bstack_sort_dimensions); i++) { diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 6f6b4189a3..0bd0ee3ae7 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -3,19 +3,9 @@ #define __PERF_SORT_H #include #include -#include -#include -#include "map_symbol.h" -#include "symbol_conf.h" -#include "callchain.h" -#include "values.h" #include "hist.h" -#include "stat.h" -#include "spark.h" struct option; -struct thread; -struct annotated_data_type; extern regex_t parent_regex; extern const char *sort_order; @@ -39,175 +29,6 @@ extern struct sort_entry sort_type; extern const char default_mem_sort_order[]; extern bool chk_double_cl; -struct res_sample { - u64 time; - int cpu; - int tid; -}; - -struct he_stat { - u64 period; - u64 period_sys; - u64 period_us; - u64 period_guest_sys; - u64 period_guest_us; - u32 nr_events; -}; - -struct namespace_id { - u64 dev; - u64 ino; -}; - -struct hist_entry_diff { - bool computed; - union { - /* PERF_HPP__DELTA */ - double period_ratio_delta; - - /* PERF_HPP__RATIO */ - double period_ratio; - - /* HISTC_WEIGHTED_DIFF */ - s64 wdiff; - - /* PERF_HPP_DIFF__CYCLES */ - s64 cycles; - }; - struct stats stats; - unsigned long svals[NUM_SPARKS]; -}; - -struct hist_entry_ops { - void *(*new)(size_t size); - void (*free)(void *ptr); -}; - 
-/** - * struct hist_entry - histogram entry - * - * @row_offset - offset from the first callchain expanded to appear on screen - * @nr_rows - rows expanded in callchain, recalculated on folding/unfolding - */ -struct hist_entry { - struct rb_node rb_node_in; - struct rb_node rb_node; - union { - struct list_head node; - struct list_head head; - } pairs; - struct he_stat stat; - struct he_stat *stat_acc; - struct map_symbol ms; - struct thread *thread; - struct comm *comm; - struct namespace_id cgroup_id; - u64 cgroup; - u64 ip; - u64 transaction; - s32 socket; - s32 cpu; - u64 code_page_size; - u64 weight; - u64 ins_lat; - u64 p_stage_cyc; - u8 cpumode; - u8 depth; - int mem_type_off; - struct simd_flags simd_flags; - - /* We are added by hists__add_dummy_entry. */ - bool dummy; - bool leaf; - - char level; - u8 filtered; - - u16 callchain_size; - union { - /* - * Since perf diff only supports the stdio output, TUI - * fields are only accessed from perf report (or perf - * top). So make it a union to reduce memory usage. 
- */ - struct hist_entry_diff diff; - struct /* for TUI */ { - u16 row_offset; - u16 nr_rows; - bool init_have_children; - bool unfolded; - bool has_children; - bool has_no_entry; - }; - }; - char *srcline; - char *srcfile; - struct symbol *parent; - struct branch_info *branch_info; - long time; - struct hists *hists; - struct mem_info *mem_info; - struct block_info *block_info; - struct kvm_info *kvm_info; - void *raw_data; - u32 raw_size; - int num_res; - struct res_sample *res_samples; - void *trace_output; - struct perf_hpp_list *hpp_list; - struct hist_entry *parent_he; - struct hist_entry_ops *ops; - struct annotated_data_type *mem_type; - union { - /* this is for hierarchical entry structure */ - struct { - struct rb_root_cached hroot_in; - struct rb_root_cached hroot_out; - }; /* non-leaf entries */ - struct rb_root sorted_chain; /* leaf entry has callchains */ - }; - struct callchain_root callchain[0]; /* must be last member */ -}; - -static __pure inline bool hist_entry__has_callchains(struct hist_entry *he) -{ - return he->callchain_size != 0; -} - -int hist_entry__sym_snprintf(struct hist_entry *he, char *bf, size_t size, unsigned int width); - -static inline bool hist_entry__has_pairs(struct hist_entry *he) -{ - return !list_empty(&he->pairs.node); -} - -static inline struct hist_entry *hist_entry__next_pair(struct hist_entry *he) -{ - if (hist_entry__has_pairs(he)) - return list_entry(he->pairs.node.next, struct hist_entry, pairs.node); - return NULL; -} - -static inline void hist_entry__add_pair(struct hist_entry *pair, - struct hist_entry *he) -{ - list_add_tail(&pair->pairs.node, &he->pairs.head); -} - -static inline float hist_entry__get_percent_limit(struct hist_entry *he) -{ - u64 period = he->stat.period; - u64 total_period = hists__total_period(he->hists); - - if (unlikely(total_period == 0)) - return 0; - - if (symbol_conf.cumulate_callchain) - period = he->stat_acc->period; - - return period * 100.0 / total_period; -} - enum sort_mode { 
SORT_MODE__NORMAL, SORT_MODE__BRANCH, @@ -299,15 +120,6 @@ struct sort_entry { u8 se_width_idx; }; -struct block_hist { - struct hists block_hists; - struct perf_hpp_list block_list; - struct perf_hpp_fmt block_fmt; - int block_idx; - bool valid; - struct hist_entry he; -}; - extern struct sort_entry sort_thread; struct evlist; @@ -329,7 +141,7 @@ void reset_dimensions(void); int sort_dimension__add(struct perf_hpp_list *list, const char *tok, struct evlist *evlist, int level); -int output_field_add(struct perf_hpp_list *list, char *tok); +int output_field_add(struct perf_hpp_list *list, const char *tok); int64_t sort__iaddr_cmp(struct hist_entry *left, struct hist_entry *right); int64_t diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c index 7addc34afc..4d67c1e095 100644 --- a/tools/perf/util/srcline.c +++ b/tools/perf/util/srcline.c @@ -27,14 +27,14 @@ bool srcline_full_filename; char *srcline__unknown = (char *)"??:0"; -static const char *dso__name(struct dso *dso) +static const char *srcline_dso_name(struct dso *dso) { const char *dso_name; - if (dso->symsrc_filename) - dso_name = dso->symsrc_filename; + if (dso__symsrc_filename(dso)) + dso_name = dso__symsrc_filename(dso); else - dso_name = dso->long_name; + dso_name = dso__long_name(dso); if (dso_name[0] == '[') return NULL; @@ -288,7 +288,7 @@ static int inline_list__append_dso_a2l(struct dso *dso, struct inline_node *node, struct symbol *sym) { - struct a2l_data *a2l = dso->a2l; + struct a2l_data *a2l = dso__a2l(dso); struct symbol *inline_sym = new_inline_sym(dso, sym, a2l->funcname); char *srcline = NULL; @@ -304,11 +304,11 @@ static int addr2line(const char *dso_name, u64 addr, struct symbol *sym) { int ret = 0; - struct a2l_data *a2l = dso->a2l; + struct a2l_data *a2l = dso__a2l(dso); if (!a2l) { - dso->a2l = addr2line_init(dso_name); - a2l = dso->a2l; + a2l = addr2line_init(dso_name); + dso__set_a2l(dso, a2l); } if (a2l == NULL) { @@ -360,14 +360,14 @@ static int addr2line(const char 
*dso_name, u64 addr, void dso__free_a2l(struct dso *dso) { - struct a2l_data *a2l = dso->a2l; + struct a2l_data *a2l = dso__a2l(dso); if (!a2l) return; addr2line_cleanup(a2l); - dso->a2l = NULL; + dso__set_a2l(dso, NULL); } #else /* HAVE_LIBBFD_SUPPORT */ @@ -638,7 +638,7 @@ static int addr2line(const char *dso_name, u64 addr, struct inline_node *node, struct symbol *sym __maybe_unused) { - struct child_process *a2l = dso->a2l; + struct child_process *a2l = dso__a2l(dso); char *record_function = NULL; char *record_filename = NULL; unsigned int record_line_nr = 0; @@ -655,8 +655,9 @@ static int addr2line(const char *dso_name, u64 addr, if (!filename__has_section(dso_name, ".debug_line")) goto out; - dso->a2l = addr2line_subprocess_init(symbol_conf.addr2line_path, dso_name); - a2l = dso->a2l; + dso__set_a2l(dso, + addr2line_subprocess_init(symbol_conf.addr2line_path, dso_name)); + a2l = dso__a2l(dso); } if (a2l == NULL) { @@ -770,7 +771,7 @@ out: free(record_function); free(record_filename); if (io.eof) { - dso->a2l = NULL; + dso__set_a2l(dso, NULL); addr2line_subprocess_cleanup(a2l); } return ret; @@ -778,14 +779,14 @@ out: void dso__free_a2l(struct dso *dso) { - struct child_process *a2l = dso->a2l; + struct child_process *a2l = dso__a2l(dso); if (!a2l) return; addr2line_subprocess_cleanup(a2l); - dso->a2l = NULL; + dso__set_a2l(dso, NULL); } #endif /* HAVE_LIBBFD_SUPPORT */ @@ -823,33 +824,34 @@ char *__get_srcline(struct dso *dso, u64 addr, struct symbol *sym, char *srcline; const char *dso_name; - if (!dso->has_srcline) + if (!dso__has_srcline(dso)) goto out; - dso_name = dso__name(dso); + dso_name = srcline_dso_name(dso); if (dso_name == NULL) - goto out; + goto out_err; if (!addr2line(dso_name, addr, &file, &line, dso, unwind_inlines, NULL, sym)) - goto out; + goto out_err; srcline = srcline_from_fileline(file, line); free(file); if (!srcline) - goto out; + goto out_err; - dso->a2l_fails = 0; + dso__set_a2l_fails(dso, 0); return srcline; -out: - if 
(dso->a2l_fails && ++dso->a2l_fails > A2L_FAIL_LIMIT) { - dso->has_srcline = 0; +out_err: + dso__set_a2l_fails(dso, dso__a2l_fails(dso) + 1); + if (dso__a2l_fails(dso) > A2L_FAIL_LIMIT) { + dso__set_has_srcline(dso, false); dso__free_a2l(dso); } - +out: if (!show_addr) return (show_sym && sym) ? strndup(sym->name, sym->namelen) : SRCLINE_UNKNOWN; @@ -858,7 +860,7 @@ out: if (asprintf(&srcline, "%s+%" PRIu64, show_sym ? sym->name : "", ip - sym->start) < 0) return SRCLINE_UNKNOWN; - } else if (asprintf(&srcline, "%s[%" PRIx64 "]", dso->short_name, addr) < 0) + } else if (asprintf(&srcline, "%s[%" PRIx64 "]", dso__short_name(dso), addr) < 0) return SRCLINE_UNKNOWN; return srcline; } @@ -869,22 +871,23 @@ char *get_srcline_split(struct dso *dso, u64 addr, unsigned *line) char *file = NULL; const char *dso_name; - if (!dso->has_srcline) - goto out; + if (!dso__has_srcline(dso)) + return NULL; - dso_name = dso__name(dso); + dso_name = srcline_dso_name(dso); if (dso_name == NULL) - goto out; + goto out_err; if (!addr2line(dso_name, addr, &file, line, dso, true, NULL, NULL)) - goto out; + goto out_err; - dso->a2l_fails = 0; + dso__set_a2l_fails(dso, 0); return file; -out: - if (dso->a2l_fails && ++dso->a2l_fails > A2L_FAIL_LIMIT) { - dso->has_srcline = 0; +out_err: + dso__set_a2l_fails(dso, dso__a2l_fails(dso) + 1); + if (dso__a2l_fails(dso) > A2L_FAIL_LIMIT) { + dso__set_has_srcline(dso, false); dso__free_a2l(dso); } @@ -982,7 +985,7 @@ struct inline_node *dso__parse_addr_inlines(struct dso *dso, u64 addr, { const char *dso_name; - dso_name = dso__name(dso); + dso_name = srcline_dso_name(dso); if (dso_name == NULL) return NULL; diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index 91d2f7f65d..186305fd2d 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -38,6 +38,7 @@ static int aggr_header_lens[] = { [AGGR_CORE] = 18, [AGGR_CACHE] = 22, + [AGGR_CLUSTER] = 20, [AGGR_DIE] = 12, [AGGR_SOCKET] = 6, [AGGR_NODE] = 
6, @@ -49,6 +50,7 @@ static int aggr_header_lens[] = { static const char *aggr_header_csv[] = { [AGGR_CORE] = "core,cpus,", [AGGR_CACHE] = "cache,cpus,", + [AGGR_CLUSTER] = "cluster,cpus,", [AGGR_DIE] = "die,cpus,", [AGGR_SOCKET] = "socket,cpus,", [AGGR_NONE] = "cpu,", @@ -60,6 +62,7 @@ static const char *aggr_header_csv[] = { static const char *aggr_header_std[] = { [AGGR_CORE] = "core", [AGGR_CACHE] = "cache", + [AGGR_CLUSTER] = "cluster", [AGGR_DIE] = "die", [AGGR_SOCKET] = "socket", [AGGR_NONE] = "cpu", diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index 3466aa9524..6bb975e46d 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -176,6 +176,13 @@ static double find_stat(const struct evsel *evsel, int aggr_idx, enum stat_type if (type != evsel__stat_type(cur)) continue; + /* + * Except the SW CLOCK events, + * ignore if not the PMU we're looking for. + */ + if ((type != STAT_NSECS) && (evsel->pmu != cur->pmu)) + continue; + aggr = &cur->stats->aggr[aggr_idx]; if (type == STAT_NSECS) return aggr->counts.val; diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index b0bcf92f0f..0bd5467389 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -315,7 +315,7 @@ static int check_per_pkg(struct evsel *counter, struct perf_counts_values *vals, if (!counter->per_pkg) return 0; - if (perf_cpu_map__has_any_cpu_or_is_empty(cpus)) + if (perf_cpu_map__is_any_cpu_or_is_empty(cpus)) return 0; if (!mask) { diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index d6e5c8787b..fd7a187551 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -87,6 +87,7 @@ struct perf_stat_config { bool metric_no_group; bool metric_no_merge; bool metric_no_threshold; + bool hardware_aware_grouping; bool stop_read_counter; bool iostat_run; char *user_requested_cpu_list; diff --git a/tools/perf/util/svghelper.c b/tools/perf/util/svghelper.c index 1892e9b6aa..2b04f47f4d 100644 --- 
a/tools/perf/util/svghelper.c +++ b/tools/perf/util/svghelper.c @@ -725,26 +725,24 @@ static void scan_core_topology(int *map, struct topology *t, int nr_cpus) static int str_to_bitmap(char *s, cpumask_t *b, int nr_cpus) { - int i; - int ret = 0; - struct perf_cpu_map *m; - struct perf_cpu c; + int idx, ret = 0; + struct perf_cpu_map *map; + struct perf_cpu cpu; - m = perf_cpu_map__new(s); - if (!m) + map = perf_cpu_map__new(s); + if (!map) return -1; - for (i = 0; i < perf_cpu_map__nr(m); i++) { - c = perf_cpu_map__cpu(m, i); - if (c.cpu >= nr_cpus) { + perf_cpu_map__for_each_cpu(cpu, idx, map) { + if (cpu.cpu >= nr_cpus) { ret = -1; break; } - __set_bit(c.cpu, cpumask_bits(b)); + __set_bit(cpu.cpu, cpumask_bits(b)); } - perf_cpu_map__put(m); + perf_cpu_map__put(map); return ret; } diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 0b91f813c4..e398abfd13 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -174,7 +174,7 @@ static inline bool elf_sec__is_data(const GElf_Shdr *shdr, static bool elf_sec__filter(GElf_Shdr *shdr, Elf_Data *secstrs) { - return elf_sec__is_text(shdr, secstrs) || + return elf_sec__is_text(shdr, secstrs) || elf_sec__is_data(shdr, secstrs); } @@ -312,8 +312,8 @@ static char *demangle_sym(struct dso *dso, int kmodule, const char *elf_name) * DWARF DW_compile_unit has this, but we don't always have access * to it... 
*/ - if (!want_demangle(dso->kernel || kmodule)) - return demangled; + if (!want_demangle(dso__kernel(dso) || kmodule)) + return demangled; demangled = cxx_demangle_sym(elf_name, verbose > 0, verbose > 0); if (demangled == NULL) { @@ -470,7 +470,7 @@ static bool get_plt_sizes(struct dso *dso, GElf_Ehdr *ehdr, GElf_Shdr *shdr_plt, } if (*plt_entry_size) return true; - pr_debug("Missing PLT entry size for %s\n", dso->long_name); + pr_debug("Missing PLT entry size for %s\n", dso__long_name(dso)); return false; } @@ -654,7 +654,7 @@ static int dso__synthesize_plt_got_symbols(struct dso *dso, Elf *elf, sym = symbol__new(shdr.sh_offset + i, shdr.sh_entsize, STB_GLOBAL, STT_FUNC, buf); if (!sym) goto out; - symbols__insert(&dso->symbols, sym); + symbols__insert(dso__symbols(dso), sym); } err = 0; out: @@ -708,7 +708,7 @@ int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss) plt_sym = symbol__new(shdr_plt.sh_offset, plt_header_size, STB_GLOBAL, STT_FUNC, ".plt"); if (!plt_sym) goto out_elf_end; - symbols__insert(&dso->symbols, plt_sym); + symbols__insert(dso__symbols(dso), plt_sym); /* Only x86 has .plt.got */ if (machine_is_x86(ehdr.e_machine) && @@ -830,7 +830,7 @@ int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss) goto out_elf_end; plt_offset += plt_entry_size; - symbols__insert(&dso->symbols, f); + symbols__insert(dso__symbols(dso), f); ++nr; } @@ -840,7 +840,7 @@ out_elf_end: if (err == 0) return nr; pr_debug("%s: problems reading %s PLT info.\n", - __func__, dso->long_name); + __func__, dso__long_name(dso)); return 0; } @@ -1175,19 +1175,19 @@ static int dso__swap_init(struct dso *dso, unsigned char eidata) { static unsigned int const endian = 1; - dso->needs_swap = DSO_SWAP__NO; + dso__set_needs_swap(dso, DSO_SWAP__NO); switch (eidata) { case ELFDATA2LSB: /* We are big endian, DSO is little endian. 
*/ if (*(unsigned char const *)&endian != 1) - dso->needs_swap = DSO_SWAP__YES; + dso__set_needs_swap(dso, DSO_SWAP__YES); break; case ELFDATA2MSB: /* We are little endian, DSO is big endian. */ if (*(unsigned char const *)&endian != 0) - dso->needs_swap = DSO_SWAP__YES; + dso__set_needs_swap(dso, DSO_SWAP__YES); break; default: @@ -1238,11 +1238,11 @@ int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name, if (fd < 0) return -1; - type = dso->symtab_type; + type = dso__symtab_type(dso); } else { fd = open(name, O_RDONLY); if (fd < 0) { - dso->load_errno = errno; + *dso__load_errno(dso) = errno; return -1; } } @@ -1250,37 +1250,37 @@ int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name, elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL); if (elf == NULL) { pr_debug("%s: cannot read %s ELF file.\n", __func__, name); - dso->load_errno = DSO_LOAD_ERRNO__INVALID_ELF; + *dso__load_errno(dso) = DSO_LOAD_ERRNO__INVALID_ELF; goto out_close; } if (gelf_getehdr(elf, &ehdr) == NULL) { - dso->load_errno = DSO_LOAD_ERRNO__INVALID_ELF; + *dso__load_errno(dso) = DSO_LOAD_ERRNO__INVALID_ELF; pr_debug("%s: cannot get elf header.\n", __func__); goto out_elf_end; } if (dso__swap_init(dso, ehdr.e_ident[EI_DATA])) { - dso->load_errno = DSO_LOAD_ERRNO__INTERNAL_ERROR; + *dso__load_errno(dso) = DSO_LOAD_ERRNO__INTERNAL_ERROR; goto out_elf_end; } /* Always reject images with a mismatched build-id: */ - if (dso->has_build_id && !symbol_conf.ignore_vmlinux_buildid) { + if (dso__has_build_id(dso) && !symbol_conf.ignore_vmlinux_buildid) { u8 build_id[BUILD_ID_SIZE]; struct build_id bid; int size; size = elf_read_build_id(elf, build_id, BUILD_ID_SIZE); if (size <= 0) { - dso->load_errno = DSO_LOAD_ERRNO__CANNOT_READ_BUILDID; + *dso__load_errno(dso) = DSO_LOAD_ERRNO__CANNOT_READ_BUILDID; goto out_elf_end; } build_id__init(&bid, build_id, size); if (!dso__build_id_equal(dso, &bid)) { pr_debug("%s: build id mismatch for %s.\n", __func__, name); - dso->load_errno = 
DSO_LOAD_ERRNO__MISMATCHING_BUILDID; + *dso__load_errno(dso) = DSO_LOAD_ERRNO__MISMATCHING_BUILDID; goto out_elf_end; } } @@ -1305,14 +1305,14 @@ int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name, if (ss->opdshdr.sh_type != SHT_PROGBITS) ss->opdsec = NULL; - if (dso->kernel == DSO_SPACE__USER) + if (dso__kernel(dso) == DSO_SPACE__USER) ss->adjust_symbols = true; else ss->adjust_symbols = elf__needs_adjust_symbols(ehdr); ss->name = strdup(name); if (!ss->name) { - dso->load_errno = errno; + *dso__load_errno(dso) = errno; goto out_elf_end; } @@ -1419,7 +1419,7 @@ void __weak arch__sym_update(struct symbol *s __maybe_unused, static int dso__process_kernel_symbol(struct dso *dso, struct map *map, GElf_Sym *sym, GElf_Shdr *shdr, struct maps *kmaps, struct kmap *kmap, - struct dso **curr_dsop, struct map **curr_mapp, + struct dso **curr_dsop, const char *section_name, bool adjust_kernel_syms, bool kmodule, bool *remap_kernel, u64 max_text_sh_offset) @@ -1432,7 +1432,7 @@ static int dso__process_kernel_symbol(struct dso *dso, struct map *map, if (adjust_kernel_syms) sym->st_value -= shdr->sh_addr - shdr->sh_offset; - if (strcmp(section_name, (curr_dso->short_name + dso->short_name_len)) == 0) + if (strcmp(section_name, (dso__short_name(curr_dso) + dso__short_name_len(dso))) == 0) return 0; if (strcmp(section_name, ".text") == 0) { @@ -1441,7 +1441,7 @@ static int dso__process_kernel_symbol(struct dso *dso, struct map *map, * kallsyms and identity maps. Overwrite it to * map to the kernel dso. 
*/ - if (*remap_kernel && dso->kernel && !kmodule) { + if (*remap_kernel && dso__kernel(dso) && !kmodule) { *remap_kernel = false; map__set_start(map, shdr->sh_addr + ref_reloc(kmap)); map__set_end(map, map__start(map) + shdr->sh_size); @@ -1470,8 +1470,8 @@ static int dso__process_kernel_symbol(struct dso *dso, struct map *map, map__set_pgoff(map, shdr->sh_offset); } - *curr_mapp = map; - *curr_dsop = dso; + dso__put(*curr_dsop); + *curr_dsop = dso__get(dso); return 0; } @@ -1484,12 +1484,12 @@ static int dso__process_kernel_symbol(struct dso *dso, struct map *map, */ if (kmodule && adjust_kernel_syms && is_exe_text(shdr->sh_flags) && shdr->sh_offset <= max_text_sh_offset) { - *curr_mapp = map; - *curr_dsop = dso; + dso__put(*curr_dsop); + *curr_dsop = dso__get(dso); return 0; } - snprintf(dso_name, sizeof(dso_name), "%s%s", dso->short_name, section_name); + snprintf(dso_name, sizeof(dso_name), "%s%s", dso__short_name(dso), section_name); curr_map = maps__find_by_name(kmaps, dso_name); if (curr_map == NULL) { @@ -1501,17 +1501,17 @@ static int dso__process_kernel_symbol(struct dso *dso, struct map *map, curr_dso = dso__new(dso_name); if (curr_dso == NULL) return -1; - curr_dso->kernel = dso->kernel; - curr_dso->long_name = dso->long_name; - curr_dso->long_name_len = dso->long_name_len; - curr_dso->binary_type = dso->binary_type; - curr_dso->adjust_symbols = dso->adjust_symbols; + dso__set_kernel(curr_dso, dso__kernel(dso)); + RC_CHK_ACCESS(curr_dso)->long_name = dso__long_name(dso); + RC_CHK_ACCESS(curr_dso)->long_name_len = dso__long_name_len(dso); + dso__set_binary_type(curr_dso, dso__binary_type(dso)); + dso__set_adjust_symbols(curr_dso, dso__adjust_symbols(dso)); curr_map = map__new2(start, curr_dso); - dso__put(curr_dso); - if (curr_map == NULL) + if (curr_map == NULL) { + dso__put(curr_dso); return -1; - - if (curr_dso->kernel) + } + if (dso__kernel(curr_dso)) map__kmap(curr_map)->kmaps = kmaps; if (adjust_kernel_syms) { @@ -1521,24 +1521,18 @@ static int 
dso__process_kernel_symbol(struct dso *dso, struct map *map, } else { map__set_mapping_type(curr_map, MAPPING_TYPE__IDENTITY); } - curr_dso->symtab_type = dso->symtab_type; + dso__set_symtab_type(curr_dso, dso__symtab_type(dso)); if (maps__insert(kmaps, curr_map)) return -1; - /* - * Add it before we drop the reference to curr_map, i.e. while - * we still are sure to have a reference to this DSO via - * *curr_map->dso. - */ dsos__add(&maps__machine(kmaps)->dsos, curr_dso); - /* kmaps already got it */ - map__put(curr_map); dso__set_loaded(curr_dso); - *curr_mapp = curr_map; + dso__put(*curr_dsop); *curr_dsop = curr_dso; } else { - *curr_dsop = map__dso(curr_map); - map__put(curr_map); + dso__put(*curr_dsop); + *curr_dsop = dso__get(map__dso(curr_map)); } + map__put(curr_map); return 0; } @@ -1547,13 +1541,11 @@ static int dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss, struct symsrc *runtime_ss, int kmodule, int dynsym) { - struct kmap *kmap = dso->kernel ? map__kmap(map) : NULL; + struct kmap *kmap = dso__kernel(dso) ? map__kmap(map) : NULL; struct maps *kmaps = kmap ? map__kmaps(map) : NULL; - struct map *curr_map = map; - struct dso *curr_dso = dso; + struct dso *curr_dso = NULL; Elf_Data *symstrs, *secstrs, *secstrs_run, *secstrs_sym; uint32_t nr_syms; - int err = -1; uint32_t idx; GElf_Ehdr ehdr; GElf_Shdr shdr; @@ -1581,8 +1573,8 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss, if (elf_section_by_name(runtime_ss->elf, &runtime_ss->ehdr, &tshdr, ".text", NULL)) { - dso->text_offset = tshdr.sh_addr - tshdr.sh_offset; - dso->text_end = tshdr.sh_offset + tshdr.sh_size; + dso__set_text_offset(dso, tshdr.sh_addr - tshdr.sh_offset); + dso__set_text_end(dso, tshdr.sh_offset + tshdr.sh_size); } if (runtime_ss->opdsec) @@ -1641,21 +1633,22 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss, * attempted to prelink vdso to its virtual address. 
*/ if (dso__is_vdso(dso)) - map__set_reloc(map, map__start(map) - dso->text_offset); + map__set_reloc(map, map__start(map) - dso__text_offset(dso)); - dso->adjust_symbols = runtime_ss->adjust_symbols || ref_reloc(kmap); + dso__set_adjust_symbols(dso, runtime_ss->adjust_symbols || ref_reloc(kmap)); /* * Initial kernel and module mappings do not map to the dso. * Flag the fixups. */ - if (dso->kernel) { + if (dso__kernel(dso)) { remap_kernel = true; - adjust_kernel_syms = dso->adjust_symbols; + adjust_kernel_syms = dso__adjust_symbols(dso); } if (kmodule && adjust_kernel_syms) max_text_sh_offset = max_text_section(runtime_ss->elf, &runtime_ss->ehdr); + curr_dso = dso__get(dso); elf_symtab__for_each_symbol(syms, nr_syms, idx, sym) { struct symbol *f; const char *elf_name = elf_sym__name(&sym, symstrs); @@ -1743,10 +1736,14 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss, (sym.st_value & 1)) --sym.st_value; - if (dso->kernel) { - if (dso__process_kernel_symbol(dso, map, &sym, &shdr, kmaps, kmap, &curr_dso, &curr_map, - section_name, adjust_kernel_syms, kmodule, - &remap_kernel, max_text_sh_offset)) + if (dso__kernel(dso)) { + if (dso__process_kernel_symbol(dso, map, &sym, &shdr, + kmaps, kmap, &curr_dso, + section_name, + adjust_kernel_syms, + kmodule, + &remap_kernel, + max_text_sh_offset)) goto out_elf_end; } else if ((used_opd && runtime_ss->adjust_symbols) || (!used_opd && syms_ss->adjust_symbols)) { @@ -1792,16 +1789,17 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss, arch__sym_update(f, &sym); - __symbols__insert(&curr_dso->symbols, f, dso->kernel); + __symbols__insert(dso__symbols(curr_dso), f, dso__kernel(dso)); nr++; } + dso__put(curr_dso); /* * For misannotated, zeroed, ASM function sizes. 
*/ if (nr > 0) { - symbols__fixup_end(&dso->symbols, false); - symbols__fixup_duplicate(&dso->symbols); + symbols__fixup_end(dso__symbols(dso), false); + symbols__fixup_duplicate(dso__symbols(dso)); if (kmap) { /* * We need to fixup this here too because we create new @@ -1810,9 +1808,10 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss, maps__fixup_end(kmaps); } } - err = nr; + return nr; out_elf_end: - return err; + dso__put(curr_dso); + return -1; } int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss, @@ -1821,16 +1820,16 @@ int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss, int nr = 0; int err = -1; - dso->symtab_type = syms_ss->type; - dso->is_64_bit = syms_ss->is_64_bit; - dso->rel = syms_ss->ehdr.e_type == ET_REL; + dso__set_symtab_type(dso, syms_ss->type); + dso__set_is_64_bit(dso, syms_ss->is_64_bit); + dso__set_rel(dso, syms_ss->ehdr.e_type == ET_REL); /* * Modules may already have symbols from kallsyms, but those symbols * have the wrong values for the dso maps, so remove them. */ if (kmodule && syms_ss->symtab) - symbols__delete(&dso->symbols); + symbols__delete(dso__symbols(dso)); if (!syms_ss->symtab) { /* @@ -1838,7 +1837,7 @@ int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss, * to using kallsyms. The vmlinux runtime symbols aren't * of much use. 
*/ - if (dso->kernel) + if (dso__kernel(dso)) return err; } else { err = dso__load_sym_internal(dso, map, syms_ss, runtime_ss, diff --git a/tools/perf/util/symbol-minimal.c b/tools/perf/util/symbol-minimal.c index 1da8b71350..c6f369b5d8 100644 --- a/tools/perf/util/symbol-minimal.c +++ b/tools/perf/util/symbol-minimal.c @@ -273,7 +273,7 @@ int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name, out_close: close(fd); out_errno: - dso->load_errno = errno; + RC_CHK_ACCESS(dso)->load_errno = errno; return -1; } @@ -348,7 +348,7 @@ int dso__load_sym(struct dso *dso, struct map *map __maybe_unused, ret = fd__is_64_bit(ss->fd); if (ret >= 0) - dso->is_64_bit = ret; + RC_CHK_ACCESS(dso)->is_64_bit = ret; if (filename__read_build_id(ss->name, &bid) > 0) dso__set_build_id(dso, &bid); diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 68dbeae8d2..22646f0cca 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -27,6 +27,7 @@ #include "symbol.h" #include "map_symbol.h" #include "mem-events.h" +#include "mem-info.h" #include "symsrc.h" #include "strlist.h" #include "intlist.h" @@ -532,52 +533,52 @@ static struct symbol *symbols__find_by_name(struct symbol *symbols[], void dso__reset_find_symbol_cache(struct dso *dso) { - dso->last_find_result.addr = 0; - dso->last_find_result.symbol = NULL; + dso__set_last_find_result_addr(dso, 0); + dso__set_last_find_result_symbol(dso, NULL); } void dso__insert_symbol(struct dso *dso, struct symbol *sym) { - __symbols__insert(&dso->symbols, sym, dso->kernel); + __symbols__insert(dso__symbols(dso), sym, dso__kernel(dso)); /* update the symbol cache if necessary */ - if (dso->last_find_result.addr >= sym->start && - (dso->last_find_result.addr < sym->end || + if (dso__last_find_result_addr(dso) >= sym->start && + (dso__last_find_result_addr(dso) < sym->end || sym->start == sym->end)) { - dso->last_find_result.symbol = sym; + dso__set_last_find_result_symbol(dso, sym); } } void 
dso__delete_symbol(struct dso *dso, struct symbol *sym) { - rb_erase_cached(&sym->rb_node, &dso->symbols); + rb_erase_cached(&sym->rb_node, dso__symbols(dso)); symbol__delete(sym); dso__reset_find_symbol_cache(dso); } struct symbol *dso__find_symbol(struct dso *dso, u64 addr) { - if (dso->last_find_result.addr != addr || dso->last_find_result.symbol == NULL) { - dso->last_find_result.addr = addr; - dso->last_find_result.symbol = symbols__find(&dso->symbols, addr); + if (dso__last_find_result_addr(dso) != addr || dso__last_find_result_symbol(dso) == NULL) { + dso__set_last_find_result_addr(dso, addr); + dso__set_last_find_result_symbol(dso, symbols__find(dso__symbols(dso), addr)); } - return dso->last_find_result.symbol; + return dso__last_find_result_symbol(dso); } struct symbol *dso__find_symbol_nocache(struct dso *dso, u64 addr) { - return symbols__find(&dso->symbols, addr); + return symbols__find(dso__symbols(dso), addr); } struct symbol *dso__first_symbol(struct dso *dso) { - return symbols__first(&dso->symbols); + return symbols__first(dso__symbols(dso)); } struct symbol *dso__last_symbol(struct dso *dso) { - return symbols__last(&dso->symbols); + return symbols__last(dso__symbols(dso)); } struct symbol *dso__next_symbol(struct symbol *sym) @@ -587,11 +588,11 @@ struct symbol *dso__next_symbol(struct symbol *sym) struct symbol *dso__next_symbol_by_name(struct dso *dso, size_t *idx) { - if (*idx + 1 >= dso->symbol_names_len) + if (*idx + 1 >= dso__symbol_names_len(dso)) return NULL; ++*idx; - return dso->symbol_names[*idx]; + return dso__symbol_names(dso)[*idx]; } /* @@ -599,27 +600,29 @@ struct symbol *dso__next_symbol_by_name(struct dso *dso, size_t *idx) */ struct symbol *dso__find_symbol_by_name(struct dso *dso, const char *name, size_t *idx) { - struct symbol *s = symbols__find_by_name(dso->symbol_names, dso->symbol_names_len, - name, SYMBOL_TAG_INCLUDE__NONE, idx); - if (!s) - s = symbols__find_by_name(dso->symbol_names, dso->symbol_names_len, - name, 
SYMBOL_TAG_INCLUDE__DEFAULT_ONLY, idx); + struct symbol *s = symbols__find_by_name(dso__symbol_names(dso), + dso__symbol_names_len(dso), + name, SYMBOL_TAG_INCLUDE__NONE, idx); + if (!s) { + s = symbols__find_by_name(dso__symbol_names(dso), dso__symbol_names_len(dso), + name, SYMBOL_TAG_INCLUDE__DEFAULT_ONLY, idx); + } return s; } void dso__sort_by_name(struct dso *dso) { - mutex_lock(&dso->lock); + mutex_lock(dso__lock(dso)); if (!dso__sorted_by_name(dso)) { size_t len; - dso->symbol_names = symbols__sort_by_name(&dso->symbols, &len); - if (dso->symbol_names) { - dso->symbol_names_len = len; + dso__set_symbol_names(dso, symbols__sort_by_name(dso__symbols(dso), &len)); + if (dso__symbol_names(dso)) { + dso__set_symbol_names_len(dso, len); dso__set_sorted_by_name(dso); } } - mutex_unlock(&dso->lock); + mutex_unlock(dso__lock(dso)); } /* @@ -746,7 +749,7 @@ static int map__process_kallsym_symbol(void *arg, const char *name, { struct symbol *sym; struct dso *dso = arg; - struct rb_root_cached *root = &dso->symbols; + struct rb_root_cached *root = dso__symbols(dso); if (!symbol_type__filter(type)) return 0; @@ -786,8 +789,8 @@ static int maps__split_kallsyms_for_kcore(struct maps *kmaps, struct dso *dso) { struct symbol *pos; int count = 0; - struct rb_root_cached old_root = dso->symbols; - struct rb_root_cached *root = &dso->symbols; + struct rb_root_cached *root = dso__symbols(dso); + struct rb_root_cached old_root = *root; struct rb_node *next = rb_first_cached(root); if (!kmaps) @@ -821,13 +824,13 @@ static int maps__split_kallsyms_for_kcore(struct maps *kmaps, struct dso *dso) pos->end = map__end(curr_map); if (pos->end) pos->end -= map__start(curr_map) - map__pgoff(curr_map); - symbols__insert(&curr_map_dso->symbols, pos); + symbols__insert(dso__symbols(curr_map_dso), pos); ++count; map__put(curr_map); } /* Symbols have been adjusted */ - dso->adjust_symbols = 1; + dso__set_adjust_symbols(dso, true); return count; } @@ -844,7 +847,7 @@ static int 
maps__split_kallsyms(struct maps *kmaps, struct dso *dso, u64 delta, struct map *curr_map = map__get(initial_map); struct symbol *pos; int count = 0, moved = 0; - struct rb_root_cached *root = &dso->symbols; + struct rb_root_cached *root = dso__symbols(dso); struct rb_node *next = rb_first_cached(root); int kernel_range = 0; bool x86_64; @@ -871,9 +874,9 @@ static int maps__split_kallsyms(struct maps *kmaps, struct dso *dso, u64 delta, *module++ = '\0'; curr_map_dso = map__dso(curr_map); - if (strcmp(curr_map_dso->short_name, module)) { + if (strcmp(dso__short_name(curr_map_dso), module)) { if (!RC_CHK_EQUAL(curr_map, initial_map) && - dso->kernel == DSO_SPACE__KERNEL_GUEST && + dso__kernel(dso) == DSO_SPACE__KERNEL_GUEST && machine__is_default_guest(machine)) { /* * We assume all symbols of a module are @@ -896,7 +899,7 @@ static int maps__split_kallsyms(struct maps *kmaps, struct dso *dso, u64 delta, goto discard_symbol; } curr_map_dso = map__dso(curr_map); - if (curr_map_dso->loaded && + if (dso__loaded(curr_map_dso) && !machine__is_default_guest(machine)) goto discard_symbol; } @@ -932,7 +935,7 @@ static int maps__split_kallsyms(struct maps *kmaps, struct dso *dso, u64 delta, goto add_symbol; } - if (dso->kernel == DSO_SPACE__KERNEL_GUEST) + if (dso__kernel(dso) == DSO_SPACE__KERNEL_GUEST) snprintf(dso_name, sizeof(dso_name), "[guest.kernel].%d", kernel_range++); @@ -946,7 +949,7 @@ static int maps__split_kallsyms(struct maps *kmaps, struct dso *dso, u64 delta, if (ndso == NULL) return -1; - ndso->kernel = dso->kernel; + dso__set_kernel(ndso, dso__kernel(dso)); curr_map = map__new2(pos->start, ndso); if (curr_map == NULL) { @@ -971,7 +974,7 @@ add_symbol: struct dso *curr_map_dso = map__dso(curr_map); rb_erase_cached(&pos->rb_node, root); - symbols__insert(&curr_map_dso->symbols, pos); + symbols__insert(dso__symbols(curr_map_dso), pos); ++moved; } else ++count; @@ -983,7 +986,7 @@ discard_symbol: } if (!RC_CHK_EQUAL(curr_map, initial_map) && - dso->kernel == 
DSO_SPACE__KERNEL_GUEST && + dso__kernel(dso) == DSO_SPACE__KERNEL_GUEST && machine__is_default_guest(maps__machine(kmaps))) { dso__set_loaded(map__dso(curr_map)); } @@ -1157,7 +1160,7 @@ static int do_validate_kcore_modules_cb(struct map *old_map, void *data) dso = map__dso(old_map); /* Module must be in memory at the same address */ - mi = find_module(dso->short_name, modules); + mi = find_module(dso__short_name(dso), modules); if (!mi || mi->start != map__start(old_map)) return -EINVAL; @@ -1326,7 +1329,7 @@ static int dso__load_kcore(struct dso *dso, struct map *map, &is_64_bit); if (err) goto out_err; - dso->is_64_bit = is_64_bit; + dso__set_is_64_bit(dso, is_64_bit); if (list_empty(&md.maps)) { err = -EINVAL; @@ -1422,10 +1425,10 @@ static int dso__load_kcore(struct dso *dso, struct map *map, * Set the data type and long name so that kcore can be read via * dso__data_read_addr(). */ - if (dso->kernel == DSO_SPACE__KERNEL_GUEST) - dso->binary_type = DSO_BINARY_TYPE__GUEST_KCORE; + if (dso__kernel(dso) == DSO_SPACE__KERNEL_GUEST) + dso__set_binary_type(dso, DSO_BINARY_TYPE__GUEST_KCORE); else - dso->binary_type = DSO_BINARY_TYPE__KCORE; + dso__set_binary_type(dso, DSO_BINARY_TYPE__KCORE); dso__set_long_name(dso, strdup(kcore_filename), true); close(fd); @@ -1486,13 +1489,13 @@ int __dso__load_kallsyms(struct dso *dso, const char *filename, if (kallsyms__delta(kmap, filename, &delta)) return -1; - symbols__fixup_end(&dso->symbols, true); - symbols__fixup_duplicate(&dso->symbols); + symbols__fixup_end(dso__symbols(dso), true); + symbols__fixup_duplicate(dso__symbols(dso)); - if (dso->kernel == DSO_SPACE__KERNEL_GUEST) - dso->symtab_type = DSO_BINARY_TYPE__GUEST_KALLSYMS; + if (dso__kernel(dso) == DSO_SPACE__KERNEL_GUEST) + dso__set_symtab_type(dso, DSO_BINARY_TYPE__GUEST_KALLSYMS); else - dso->symtab_type = DSO_BINARY_TYPE__KALLSYMS; + dso__set_symtab_type(dso, DSO_BINARY_TYPE__KALLSYMS); if (!no_kcore && !dso__load_kcore(dso, map, filename)) return 
maps__split_kallsyms_for_kcore(kmap->kmaps, dso); @@ -1548,7 +1551,7 @@ static int dso__load_perf_map(const char *map_path, struct dso *dso) if (sym == NULL) goto out_delete_line; - symbols__insert(&dso->symbols, sym); + symbols__insert(dso__symbols(dso), sym); nr_syms++; } @@ -1604,7 +1607,7 @@ int dso__load_bfd_symbols(struct dso *dso, const char *debugfile) if (!bfd_check_format(abfd, bfd_object)) { pr_debug2("%s: cannot read %s bfd file.\n", __func__, - dso->long_name); + dso__long_name(dso)); goto out_close; } @@ -1637,12 +1640,13 @@ int dso__load_bfd_symbols(struct dso *dso, const char *debugfile) } if (i < symbols_count) { /* PE symbols can only have 4 bytes, so use .text high bits */ - dso->text_offset = section->vma - (u32)section->vma; - dso->text_offset += (u32)bfd_asymbol_value(symbols[i]); - dso->text_end = (section->vma - dso->text_offset) + section->size; + u64 text_offset = (section->vma - (u32)section->vma) + + (u32)bfd_asymbol_value(symbols[i]); + dso__set_text_offset(dso, text_offset); + dso__set_text_end(dso, (section->vma - text_offset) + section->size); } else { - dso->text_offset = section->vma - section->filepos; - dso->text_end = section->filepos + section->size; + dso__set_text_offset(dso, section->vma - section->filepos); + dso__set_text_end(dso, section->filepos + section->size); } } @@ -1668,21 +1672,21 @@ int dso__load_bfd_symbols(struct dso *dso, const char *debugfile) else len = section->size - sym->value; - start = bfd_asymbol_value(sym) - dso->text_offset; + start = bfd_asymbol_value(sym) - dso__text_offset(dso); symbol = symbol__new(start, len, bfd2elf_binding(sym), STT_FUNC, bfd_asymbol_name(sym)); if (!symbol) goto out_free; - symbols__insert(&dso->symbols, symbol); + symbols__insert(dso__symbols(dso), symbol); } #ifdef bfd_get_section #undef bfd_asymbol_section #endif - symbols__fixup_end(&dso->symbols, false); - symbols__fixup_duplicate(&dso->symbols); - dso->adjust_symbols = 1; + symbols__fixup_end(dso__symbols(dso), false); 
+ symbols__fixup_duplicate(dso__symbols(dso)); + dso__set_adjust_symbols(dso, true); err = 0; out_free: @@ -1705,17 +1709,17 @@ static bool dso__is_compatible_symtab_type(struct dso *dso, bool kmod, case DSO_BINARY_TYPE__MIXEDUP_UBUNTU_DEBUGINFO: case DSO_BINARY_TYPE__BUILDID_DEBUGINFO: case DSO_BINARY_TYPE__OPENEMBEDDED_DEBUGINFO: - return !kmod && dso->kernel == DSO_SPACE__USER; + return !kmod && dso__kernel(dso) == DSO_SPACE__USER; case DSO_BINARY_TYPE__KALLSYMS: case DSO_BINARY_TYPE__VMLINUX: case DSO_BINARY_TYPE__KCORE: - return dso->kernel == DSO_SPACE__KERNEL; + return dso__kernel(dso) == DSO_SPACE__KERNEL; case DSO_BINARY_TYPE__GUEST_KALLSYMS: case DSO_BINARY_TYPE__GUEST_VMLINUX: case DSO_BINARY_TYPE__GUEST_KCORE: - return dso->kernel == DSO_SPACE__KERNEL_GUEST; + return dso__kernel(dso) == DSO_SPACE__KERNEL_GUEST; case DSO_BINARY_TYPE__GUEST_KMODULE: case DSO_BINARY_TYPE__GUEST_KMODULE_COMP: @@ -1725,7 +1729,7 @@ static bool dso__is_compatible_symtab_type(struct dso *dso, bool kmod, * kernel modules know their symtab type - it's set when * creating a module dso in machine__addnew_module_map(). 
*/ - return kmod && dso->symtab_type == type; + return kmod && dso__symtab_type(dso) == type; case DSO_BINARY_TYPE__BUILD_ID_CACHE: case DSO_BINARY_TYPE__BUILD_ID_CACHE_DEBUGINFO: @@ -1793,18 +1797,20 @@ int dso__load(struct dso *dso, struct map *map) struct build_id bid; struct nscookie nsc; char newmapname[PATH_MAX]; - const char *map_path = dso->long_name; + const char *map_path = dso__long_name(dso); + + mutex_lock(dso__lock(dso)); + perfmap = is_perf_pid_map_name(map_path); - mutex_lock(&dso->lock); - perfmap = strncmp(dso->name, "/tmp/perf-", 10) == 0; if (perfmap) { - if (dso->nsinfo && (dso__find_perf_map(newmapname, - sizeof(newmapname), &dso->nsinfo) == 0)) { + if (dso__nsinfo(dso) && + (dso__find_perf_map(newmapname, sizeof(newmapname), + dso__nsinfo_ptr(dso)) == 0)) { map_path = newmapname; } } - nsinfo__mountns_enter(dso->nsinfo, &nsc); + nsinfo__mountns_enter(dso__nsinfo(dso), &nsc); /* check again under the dso->lock */ if (dso__loaded(dso)) { @@ -1812,15 +1818,15 @@ int dso__load(struct dso *dso, struct map *map) goto out; } - kmod = dso->symtab_type == DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE || - dso->symtab_type == DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE_COMP || - dso->symtab_type == DSO_BINARY_TYPE__GUEST_KMODULE || - dso->symtab_type == DSO_BINARY_TYPE__GUEST_KMODULE_COMP; + kmod = dso__symtab_type(dso) == DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE || + dso__symtab_type(dso) == DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE_COMP || + dso__symtab_type(dso) == DSO_BINARY_TYPE__GUEST_KMODULE || + dso__symtab_type(dso) == DSO_BINARY_TYPE__GUEST_KMODULE_COMP; - if (dso->kernel && !kmod) { - if (dso->kernel == DSO_SPACE__KERNEL) + if (dso__kernel(dso) && !kmod) { + if (dso__kernel(dso) == DSO_SPACE__KERNEL) ret = dso__load_kernel_sym(dso, map); - else if (dso->kernel == DSO_SPACE__KERNEL_GUEST) + else if (dso__kernel(dso) == DSO_SPACE__KERNEL_GUEST) ret = dso__load_guest_kernel_sym(dso, map); machine = maps__machine(map__kmaps(map)); @@ -1829,12 +1835,13 @@ int 
dso__load(struct dso *dso, struct map *map) goto out; } - dso->adjust_symbols = 0; + dso__set_adjust_symbols(dso, false); if (perfmap) { ret = dso__load_perf_map(map_path, dso); - dso->symtab_type = ret > 0 ? DSO_BINARY_TYPE__JAVA_JIT : - DSO_BINARY_TYPE__NOT_FOUND; + dso__set_symtab_type(dso, ret > 0 + ? DSO_BINARY_TYPE__JAVA_JIT + : DSO_BINARY_TYPE__NOT_FOUND); goto out; } @@ -1849,9 +1856,9 @@ int dso__load(struct dso *dso, struct map *map) * Read the build id if possible. This is required for * DSO_BINARY_TYPE__BUILDID_DEBUGINFO to work */ - if (!dso->has_build_id && - is_regular_file(dso->long_name)) { - __symbol__join_symfs(name, PATH_MAX, dso->long_name); + if (!dso__has_build_id(dso) && + is_regular_file(dso__long_name(dso))) { + __symbol__join_symfs(name, PATH_MAX, dso__long_name(dso)); if (filename__read_build_id(name, &bid) > 0) dso__set_build_id(dso, &bid); } @@ -1885,7 +1892,7 @@ int dso__load(struct dso *dso, struct map *map) nsinfo__mountns_exit(&nsc); is_reg = is_regular_file(name); - if (!is_reg && errno == ENOENT && dso->nsinfo) { + if (!is_reg && errno == ENOENT && dso__nsinfo(dso)) { char *new_name = dso__filename_with_chroot(dso, name); if (new_name) { is_reg = is_regular_file(new_name); @@ -1902,7 +1909,7 @@ int dso__load(struct dso *dso, struct map *map) sirc = symsrc__init(ss, dso, name, symtab_type); if (nsexit) - nsinfo__mountns_enter(dso->nsinfo, &nsc); + nsinfo__mountns_enter(dso__nsinfo(dso), &nsc); if (bfdrc == 0) { ret = 0; @@ -1915,8 +1922,8 @@ int dso__load(struct dso *dso, struct map *map) if (!syms_ss && symsrc__has_symtab(ss)) { syms_ss = ss; next_slot = true; - if (!dso->symsrc_filename) - dso->symsrc_filename = strdup(name); + if (!dso__symsrc_filename(dso)) + dso__set_symsrc_filename(dso, strdup(name)); } if (!runtime_ss && symsrc__possibly_runtime(ss)) { @@ -1963,11 +1970,11 @@ int dso__load(struct dso *dso, struct map *map) symsrc__destroy(&ss_[ss_pos - 1]); out_free: free(name); - if (ret < 0 && strstr(dso->name, " 
(deleted)") != NULL) + if (ret < 0 && strstr(dso__name(dso), " (deleted)") != NULL) ret = 0; out: dso__set_loaded(dso); - mutex_unlock(&dso->lock); + mutex_unlock(dso__lock(dso)); nsinfo__mountns_exit(&nsc); return ret; @@ -1990,7 +1997,7 @@ int dso__load_vmlinux(struct dso *dso, struct map *map, else symbol__join_symfs(symfs_vmlinux, vmlinux); - if (dso->kernel == DSO_SPACE__KERNEL_GUEST) + if (dso__kernel(dso) == DSO_SPACE__KERNEL_GUEST) symtab_type = DSO_BINARY_TYPE__GUEST_VMLINUX; else symtab_type = DSO_BINARY_TYPE__VMLINUX; @@ -2006,10 +2013,10 @@ int dso__load_vmlinux(struct dso *dso, struct map *map, * an incorrect long name unless we set it here first. */ dso__set_long_name(dso, vmlinux, vmlinux_allocated); - if (dso->kernel == DSO_SPACE__KERNEL_GUEST) - dso->binary_type = DSO_BINARY_TYPE__GUEST_VMLINUX; + if (dso__kernel(dso) == DSO_SPACE__KERNEL_GUEST) + dso__set_binary_type(dso, DSO_BINARY_TYPE__GUEST_VMLINUX); else - dso->binary_type = DSO_BINARY_TYPE__VMLINUX; + dso__set_binary_type(dso, DSO_BINARY_TYPE__VMLINUX); err = dso__load_sym(dso, map, &ss, &ss, 0); symsrc__destroy(&ss); @@ -2101,7 +2108,7 @@ static char *dso__find_kallsyms(struct dso *dso, struct map *map) bool is_host = false; char path[PATH_MAX]; - if (!dso->has_build_id) { + if (!dso__has_build_id(dso)) { /* * Last resort, if we don't have a build-id and couldn't find * any vmlinux file, try the running kernel kallsyms table. 
@@ -2126,7 +2133,7 @@ static char *dso__find_kallsyms(struct dso *dso, struct map *map) goto proc_kallsyms; } - build_id__sprintf(&dso->bid, sbuild_id); + build_id__sprintf(dso__bid(dso), sbuild_id); /* Find kallsyms in build-id cache with kcore */ scnprintf(path, sizeof(path), "%s/%s/%s", @@ -2218,7 +2225,7 @@ do_kallsyms: free(kallsyms_allocated_filename); if (err > 0 && !dso__is_kcore(dso)) { - dso->binary_type = DSO_BINARY_TYPE__KALLSYMS; + dso__set_binary_type(dso, DSO_BINARY_TYPE__KALLSYMS); dso__set_long_name(dso, DSO__NAME_KALLSYMS, false); map__fixup_start(map); map__fixup_end(map); @@ -2261,7 +2268,7 @@ static int dso__load_guest_kernel_sym(struct dso *dso, struct map *map) if (err > 0) pr_debug("Using %s for symbols\n", kallsyms_filename); if (err > 0 && !dso__is_kcore(dso)) { - dso->binary_type = DSO_BINARY_TYPE__GUEST_KALLSYMS; + dso__set_binary_type(dso, DSO_BINARY_TYPE__GUEST_KALLSYMS); dso__set_long_name(dso, machine->mmap_name, false); map__fixup_start(map); map__fixup_end(map); @@ -2575,31 +2582,6 @@ int symbol__config_symfs(const struct option *opt __maybe_unused, return 0; } -struct mem_info *mem_info__get(struct mem_info *mi) -{ - if (mi) - refcount_inc(&mi->refcnt); - return mi; -} - -void mem_info__put(struct mem_info *mi) -{ - if (mi && refcount_dec_and_test(&mi->refcnt)) { - addr_map_symbol__exit(&mi->iaddr); - addr_map_symbol__exit(&mi->daddr); - free(mi); - } -} - -struct mem_info *mem_info__new(void) -{ - struct mem_info *mi = zalloc(sizeof(*mi)); - - if (mi) - refcount_set(&mi->refcnt, 1); - return mi; -} - /* * Checks that user supplied symbol kernel files are accessible because * the default mechanism for accessing elf files fails silently. i.e. 
if diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index 071837ddce..3fb5d146d9 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -268,18 +268,6 @@ enum { SDT_NOTE_IDX_REFCTR, }; -struct mem_info *mem_info__new(void); -struct mem_info *mem_info__get(struct mem_info *mi); -void mem_info__put(struct mem_info *mi); - -static inline void __mem_info__zput(struct mem_info **mi) -{ - mem_info__put(*mi); - *mi = NULL; -} - -#define mem_info__zput(mi) __mem_info__zput(&mi) - int symbol__validate_sym_arguments(void); #endif /* __PERF_SYMBOL */ diff --git a/tools/perf/util/symbol_fprintf.c b/tools/perf/util/symbol_fprintf.c index 088f4abf23..53e1af4ed9 100644 --- a/tools/perf/util/symbol_fprintf.c +++ b/tools/perf/util/symbol_fprintf.c @@ -64,8 +64,8 @@ size_t dso__fprintf_symbols_by_name(struct dso *dso, { size_t ret = 0; - for (size_t i = 0; i < dso->symbol_names_len; i++) { - struct symbol *pos = dso->symbol_names[i]; + for (size_t i = 0; i < dso__symbol_names_len(dso); i++) { + struct symbol *pos = dso__symbol_names(dso)[i]; ret += fprintf(fp, "%s\n", pos->name); } diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index 2a0289c149..5498048f56 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -385,8 +385,8 @@ static void perf_record_mmap2__read_build_id(struct perf_record_mmap2 *event, id.ino_generation = event->ino_generation; dso = dsos__findnew_id(&machine->dsos, event->filename, &id); - if (dso && dso->has_build_id) { - bid = dso->bid; + if (dso && dso__has_build_id(dso)) { + bid = *dso__bid(dso); rc = 0; goto out; } @@ -407,7 +407,7 @@ out: event->__reserved_1 = 0; event->__reserved_2 = 0; - if (dso && !dso->has_build_id) + if (dso && !dso__has_build_id(dso)) dso__set_build_id(dso, &bid); } else { if (event->filename[0] == '/') { @@ -684,7 +684,7 @@ static int perf_event__synthesize_modules_maps_cb(struct map *map, void *data) dso = map__dso(map); if 
(symbol_conf.buildid_mmap2) { - size = PERF_ALIGN(dso->long_name_len + 1, sizeof(u64)); + size = PERF_ALIGN(dso__long_name_len(dso) + 1, sizeof(u64)); event->mmap2.header.type = PERF_RECORD_MMAP2; event->mmap2.header.size = (sizeof(event->mmap2) - (sizeof(event->mmap2.filename) - size)); @@ -694,11 +694,11 @@ static int perf_event__synthesize_modules_maps_cb(struct map *map, void *data) event->mmap2.len = map__size(map); event->mmap2.pid = args->machine->pid; - memcpy(event->mmap2.filename, dso->long_name, dso->long_name_len + 1); + memcpy(event->mmap2.filename, dso__long_name(dso), dso__long_name_len(dso) + 1); perf_record_mmap2__read_build_id(&event->mmap2, args->machine, false); } else { - size = PERF_ALIGN(dso->long_name_len + 1, sizeof(u64)); + size = PERF_ALIGN(dso__long_name_len(dso) + 1, sizeof(u64)); event->mmap.header.type = PERF_RECORD_MMAP; event->mmap.header.size = (sizeof(event->mmap) - (sizeof(event->mmap.filename) - size)); @@ -708,7 +708,7 @@ static int perf_event__synthesize_modules_maps_cb(struct map *map, void *data) event->mmap.len = map__size(map); event->mmap.pid = args->machine->pid; - memcpy(event->mmap.filename, dso->long_name, dso->long_name_len + 1); + memcpy(event->mmap.filename, dso__long_name(dso), dso__long_name_len(dso) + 1); } if (perf_tool__process_synth_event(args->tool, event, args->machine, args->process) != 0) @@ -2231,20 +2231,20 @@ int perf_event__synthesize_build_id(struct perf_tool *tool, struct dso *pos, u16 union perf_event ev; size_t len; - if (!pos->hit) + if (!dso__hit(pos)) return 0; memset(&ev, 0, sizeof(ev)); - len = pos->long_name_len + 1; + len = dso__long_name_len(pos) + 1; len = PERF_ALIGN(len, NAME_ALIGN); - ev.build_id.size = min(pos->bid.size, sizeof(pos->bid.data)); - memcpy(&ev.build_id.build_id, pos->bid.data, ev.build_id.size); + ev.build_id.size = min(dso__bid(pos)->size, sizeof(dso__bid(pos)->data)); + memcpy(&ev.build_id.build_id, dso__bid(pos)->data, ev.build_id.size); ev.build_id.header.type = 
PERF_RECORD_HEADER_BUILD_ID; ev.build_id.header.misc = misc | PERF_RECORD_MISC_BUILD_ID_SIZE; ev.build_id.pid = machine->pid; ev.build_id.header.size = sizeof(ev.build_id) + len; - memcpy(&ev.build_id.filename, pos->long_name, pos->long_name_len); + memcpy(&ev.build_id.filename, dso__long_name(pos), dso__long_name_len(pos)); return process(tool, &ev, NULL, machine); } diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index 515726489e..87c59aa9fe 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -453,14 +453,14 @@ int thread__memcpy(struct thread *thread, struct machine *machine, dso = map__dso(al.map); - if (!dso || dso->data.status == DSO_DATA_STATUS_ERROR || map__load(al.map) < 0) { + if (!dso || dso__data(dso)->status == DSO_DATA_STATUS_ERROR || map__load(al.map) < 0) { addr_location__exit(&al); return -1; } offset = map__map_ip(al.map, ip); if (is64bit) - *is64bit = dso->is_64_bit; + *is64bit = dso__is_64_bit(dso); addr_location__exit(&al); diff --git a/tools/perf/util/tracepoint.c b/tools/perf/util/tracepoint.c index 92dd8b455b..95377ed5d8 100644 --- a/tools/perf/util/tracepoint.c +++ b/tools/perf/util/tracepoint.c @@ -4,10 +4,12 @@ #include #include #include +#include #include #include #include +#include "fncache.h" int tp_event_has_id(const char *dir_path, struct dirent *evt_dir) { @@ -26,39 +28,25 @@ int tp_event_has_id(const char *dir_path, struct dirent *evt_dir) /* * Check whether event is in /tracing/events */ -int is_valid_tracepoint(const char *event_string) +bool is_valid_tracepoint(const char *event_string) { - DIR *sys_dir, *evt_dir; - struct dirent *sys_dirent, *evt_dirent; - char evt_path[MAXPATHLEN]; - char *dir_path; - - sys_dir = tracing_events__opendir(); - if (!sys_dir) - return 0; - - for_each_subsystem(sys_dir, sys_dirent) { - dir_path = get_events_file(sys_dirent->d_name); - if (!dir_path) - continue; - evt_dir = opendir(dir_path); - if (!evt_dir) - goto next; - - for_each_event(dir_path, evt_dir, 
evt_dirent) { - snprintf(evt_path, MAXPATHLEN, "%s:%s", - sys_dirent->d_name, evt_dirent->d_name); - if (!strcmp(evt_path, event_string)) { - closedir(evt_dir); - put_events_file(dir_path); - closedir(sys_dir); - return 1; - } - } - closedir(evt_dir); -next: - put_events_file(dir_path); - } - closedir(sys_dir); - return 0; + char *dst, *path = malloc(strlen(event_string) + 4); /* Space for "/id\0". */ + bool have_file = false; /* Conservatively return false if memory allocation failed. */ + const char *src; + + if (!path) + return false; + + /* Copy event_string replacing the ':' with '/'. */ + for (src = event_string, dst = path; *src; src++, dst++) + *dst = (*src == ':') ? '/' : *src; + /* Add "/id\0". */ + memcpy(dst, "/id", 4); + + dst = get_events_file(path); + if (dst) + have_file = file_available(dst); + free(dst); + free(path); + return have_file; } diff --git a/tools/perf/util/tracepoint.h b/tools/perf/util/tracepoint.h index c4a110fe87..65ccb01fc3 100644 --- a/tools/perf/util/tracepoint.h +++ b/tools/perf/util/tracepoint.h @@ -4,6 +4,7 @@ #include #include +#include int tp_event_has_id(const char *dir_path, struct dirent *evt_dir); @@ -20,6 +21,6 @@ int tp_event_has_id(const char *dir_path, struct dirent *evt_dir); (strcmp(sys_dirent->d_name, ".")) && \ (strcmp(sys_dirent->d_name, ".."))) -int is_valid_tracepoint(const char *event_string); +bool is_valid_tracepoint(const char *event_string); #endif /* __PERF_TRACEPOINT_H */ diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c index b38d322734..bde216e630 100644 --- a/tools/perf/util/unwind-libdw.c +++ b/tools/perf/util/unwind-libdw.c @@ -29,8 +29,8 @@ static int __find_debuginfo(Dwfl_Module *mod __maybe_unused, void **userdata, const struct dso *dso = *userdata; assert(dso); - if (dso->symsrc_filename && strcmp (file_name, dso->symsrc_filename)) - *debuginfo_file_name = strdup(dso->symsrc_filename); + if (dso__symsrc_filename(dso) && strcmp(file_name, dso__symsrc_filename(dso))) + 
*debuginfo_file_name = strdup(dso__symsrc_filename(dso)); return -1; } @@ -66,7 +66,7 @@ static int __report_module(struct addr_location *al, u64 ip, * a different code in another DSO. So just use the map->start * directly to pick the correct one. */ - if (!strncmp(dso->long_name, "/tmp/jitted-", 12)) + if (!strncmp(dso__long_name(dso), "/tmp/jitted-", 12)) base = map__start(al->map); else base = map__start(al->map) - map__pgoff(al->map); @@ -83,15 +83,15 @@ static int __report_module(struct addr_location *al, u64 ip, if (!mod) { char filename[PATH_MAX]; - __symbol__join_symfs(filename, sizeof(filename), dso->long_name); - mod = dwfl_report_elf(ui->dwfl, dso->short_name, filename, -1, + __symbol__join_symfs(filename, sizeof(filename), dso__long_name(dso)); + mod = dwfl_report_elf(ui->dwfl, dso__short_name(dso), filename, -1, base, false); } if (!mod) { char filename[PATH_MAX]; if (dso__build_id_filename(dso, filename, sizeof(filename), false)) - mod = dwfl_report_elf(ui->dwfl, dso->short_name, filename, -1, + mod = dwfl_report_elf(ui->dwfl, dso__short_name(dso), filename, -1, base, false); } diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c index 6a5ac0faa6..7460bb96bd 100644 --- a/tools/perf/util/unwind-libunwind-local.c +++ b/tools/perf/util/unwind-libunwind-local.c @@ -329,27 +329,27 @@ static int read_unwind_spec_eh_frame(struct dso *dso, struct unwind_info *ui, }; int ret, fd; - if (dso->data.eh_frame_hdr_offset == 0) { + if (dso__data(dso)->eh_frame_hdr_offset == 0) { fd = dso__data_get_fd(dso, ui->machine); if (fd < 0) return -EINVAL; /* Check the .eh_frame section for unwinding info */ ret = elf_section_address_and_offset(fd, ".eh_frame_hdr", - &dso->data.eh_frame_hdr_addr, - &dso->data.eh_frame_hdr_offset); - dso->data.elf_base_addr = elf_base_address(fd); + &dso__data(dso)->eh_frame_hdr_addr, + &dso__data(dso)->eh_frame_hdr_offset); + dso__data(dso)->elf_base_addr = elf_base_address(fd); 
dso__data_put_fd(dso); - if (ret || dso->data.eh_frame_hdr_offset == 0) + if (ret || dso__data(dso)->eh_frame_hdr_offset == 0) return -EINVAL; } maps__for_each_map(thread__maps(ui->thread), read_unwind_spec_eh_frame_maps_cb, &args); - args.base_addr -= dso->data.elf_base_addr; + args.base_addr -= dso__data(dso)->elf_base_addr; /* Address of .eh_frame_hdr */ - *segbase = args.base_addr + dso->data.eh_frame_hdr_addr; - ret = unwind_spec_ehframe(dso, ui->machine, dso->data.eh_frame_hdr_offset, + *segbase = args.base_addr + dso__data(dso)->eh_frame_hdr_addr; + ret = unwind_spec_ehframe(dso, ui->machine, dso__data(dso)->eh_frame_hdr_offset, table_data, fde_count); if (ret) return ret; @@ -363,7 +363,7 @@ static int read_unwind_spec_debug_frame(struct dso *dso, struct machine *machine, u64 *offset) { int fd; - u64 ofs = dso->data.debug_frame_offset; + u64 ofs = dso__data(dso)->debug_frame_offset; /* debug_frame can reside in: * - dso @@ -379,7 +379,7 @@ static int read_unwind_spec_debug_frame(struct dso *dso, } if (ofs <= 0) { - fd = open(dso->symsrc_filename, O_RDONLY); + fd = open(dso__symsrc_filename(dso), O_RDONLY); if (fd >= 0) { ofs = elf_section_offset(fd, ".debug_frame"); close(fd); @@ -402,21 +402,21 @@ static int read_unwind_spec_debug_frame(struct dso *dso, } } if (ofs > 0) { - if (dso->symsrc_filename != NULL) { + if (dso__symsrc_filename(dso) != NULL) { pr_warning( "%s: overwrite symsrc(%s,%s)\n", __func__, - dso->symsrc_filename, + dso__symsrc_filename(dso), debuglink); - zfree(&dso->symsrc_filename); + dso__free_symsrc_filename(dso); } - dso->symsrc_filename = debuglink; + dso__set_symsrc_filename(dso, debuglink); } else { free(debuglink); } } - dso->data.debug_frame_offset = ofs; + dso__data(dso)->debug_frame_offset = ofs; } *offset = ofs; @@ -460,7 +460,7 @@ find_proc_info(unw_addr_space_t as, unw_word_t ip, unw_proc_info_t *pi, return -EINVAL; } - pr_debug("unwind: find_proc_info dso %s\n", dso->name); + pr_debug("unwind: find_proc_info dso %s\n", 
dso__name(dso)); /* Check the .eh_frame section for unwinding info */ if (!read_unwind_spec_eh_frame(dso, ui, &table_data, &segbase, &fde_count)) { @@ -481,7 +481,7 @@ find_proc_info(unw_addr_space_t as, unw_word_t ip, unw_proc_info_t *pi, if (ret < 0 && !read_unwind_spec_debug_frame(dso, ui->machine, &segbase)) { int fd = dso__data_get_fd(dso, ui->machine); - int is_exec = elf_is_exec(fd, dso->name); + int is_exec = elf_is_exec(fd, dso__name(dso)); u64 start = map__start(map); unw_word_t base = is_exec ? 0 : start; const char *symfile; @@ -489,7 +489,7 @@ find_proc_info(unw_addr_space_t as, unw_word_t ip, unw_proc_info_t *pi, if (fd >= 0) dso__data_put_fd(dso); - symfile = dso->symsrc_filename ?: dso->name; + symfile = dso__symsrc_filename(dso) ?: dso__name(dso); memset(&di, 0, sizeof(di)); if (dwarf_find_debug_frame(0, &di, ip, base, symfile, start, map__end(map))) diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c index 2728eb4f13..cb8be6acfb 100644 --- a/tools/perf/util/unwind-libunwind.c +++ b/tools/perf/util/unwind-libunwind.c @@ -25,7 +25,7 @@ int unwind__prepare_access(struct maps *maps, struct map *map, bool *initialized return 0; if (maps__addr_space(maps)) { - pr_debug("unwind: thread map already set, dso=%s\n", dso->name); + pr_debug("unwind: thread map already set, dso=%s\n", dso__name(dso)); if (initialized) *initialized = true; return 0; diff --git a/tools/perf/util/values.h b/tools/perf/util/values.h index 8c41f22f42..791c1ad606 100644 --- a/tools/perf/util/values.h +++ b/tools/perf/util/values.h @@ -2,6 +2,7 @@ #ifndef __PERF_VALUES_H #define __PERF_VALUES_H +#include #include struct perf_read_values { diff --git a/tools/perf/util/vdso.c b/tools/perf/util/vdso.c index df89637961..1b6f8f6db7 100644 --- a/tools/perf/util/vdso.c +++ b/tools/perf/util/vdso.c @@ -133,8 +133,6 @@ static struct dso *__machine__addnew_vdso(struct machine *machine, const char *s if (dso != NULL) { __dsos__add(&machine->dsos, dso); 
dso__set_long_name(dso, long_name, false); - /* Put dso here because __dsos_add already got it */ - dso__put(dso); } return dso; @@ -150,7 +148,7 @@ static int machine__thread_dso_type_maps_cb(struct map *map, void *data) struct machine__thread_dso_type_maps_cb_args *args = data; struct dso *dso = map__dso(map); - if (!dso || dso->long_name[0] != '/') + if (!dso || dso__long_name(dso)[0] != '/') return 0; args->dso_type = dso__type(dso, args->machine); @@ -252,17 +250,15 @@ static struct dso *__machine__findnew_compat(struct machine *machine, const char *file_name; struct dso *dso; - dso = __dsos__find(&machine->dsos, vdso_file->dso_name, true); + dso = dsos__find(&machine->dsos, vdso_file->dso_name, true); if (dso) - goto out; + return dso; file_name = vdso__get_compat_file(vdso_file); if (!file_name) - goto out; + return NULL; - dso = __machine__addnew_vdso(machine, vdso_file->dso_name, file_name); -out: - return dso; + return __machine__addnew_vdso(machine, vdso_file->dso_name, file_name); } static int __machine__findnew_vdso_compat(struct machine *machine, @@ -308,21 +304,21 @@ static struct dso *machine__find_vdso(struct machine *machine, dso_type = machine__thread_dso_type(machine, thread); switch (dso_type) { case DSO__TYPE_32BIT: - dso = __dsos__find(&machine->dsos, DSO__NAME_VDSO32, true); + dso = dsos__find(&machine->dsos, DSO__NAME_VDSO32, true); if (!dso) { - dso = __dsos__find(&machine->dsos, DSO__NAME_VDSO, - true); + dso = dsos__find(&machine->dsos, DSO__NAME_VDSO, + true); if (dso && dso_type != dso__type(dso, machine)) dso = NULL; } break; case DSO__TYPE_X32BIT: - dso = __dsos__find(&machine->dsos, DSO__NAME_VDSOX32, true); + dso = dsos__find(&machine->dsos, DSO__NAME_VDSOX32, true); break; case DSO__TYPE_64BIT: case DSO__TYPE_UNKNOWN: default: - dso = __dsos__find(&machine->dsos, DSO__NAME_VDSO, true); + dso = dsos__find(&machine->dsos, DSO__NAME_VDSO, true); break; } @@ -334,42 +330,38 @@ struct dso *machine__findnew_vdso(struct machine *machine, 
{ struct vdso_info *vdso_info; struct dso *dso = NULL; + char *file; - down_write(&machine->dsos.lock); if (!machine->vdso_info) machine->vdso_info = vdso_info__new(); vdso_info = machine->vdso_info; if (!vdso_info) - goto out_unlock; + return NULL; dso = machine__find_vdso(machine, thread); if (dso) - goto out_unlock; + return dso; #if BITS_PER_LONG == 64 if (__machine__findnew_vdso_compat(machine, thread, vdso_info, &dso)) - goto out_unlock; + return dso; #endif - dso = __dsos__find(&machine->dsos, DSO__NAME_VDSO, true); - if (!dso) { - char *file; + dso = dsos__find(&machine->dsos, DSO__NAME_VDSO, true); + if (dso) + return dso; - file = get_file(&vdso_info->vdso); - if (file) - dso = __machine__addnew_vdso(machine, DSO__NAME_VDSO, file); - } + file = get_file(&vdso_info->vdso); + if (!file) + return NULL; -out_unlock: - dso__get(dso); - up_write(&machine->dsos.lock); - return dso; + return __machine__addnew_vdso(machine, DSO__NAME_VDSO, file); } bool dso__is_vdso(struct dso *dso) { - return !strcmp(dso->short_name, DSO__NAME_VDSO) || - !strcmp(dso->short_name, DSO__NAME_VDSO32) || - !strcmp(dso->short_name, DSO__NAME_VDSOX32); + return !strcmp(dso__short_name(dso), DSO__NAME_VDSO) || + !strcmp(dso__short_name(dso), DSO__NAME_VDSO32) || + !strcmp(dso__short_name(dso), DSO__NAME_VDSOX32); } diff --git a/tools/power/acpi/tools/pfrut/pfrut.c b/tools/power/acpi/tools/pfrut/pfrut.c index 388c9e3ad0..44a9ecbd91 100644 --- a/tools/power/acpi/tools/pfrut/pfrut.c +++ b/tools/power/acpi/tools/pfrut/pfrut.c @@ -174,6 +174,8 @@ void print_cap(struct pfru_update_cap_info *cap) exit(1); } + printf("update capability:%d\n", cap->update_cap); + uuid_unparse(cap->code_type, uuid); printf("code injection image type:%s\n", uuid); printf("fw_version:%d\n", cap->fw_version); diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index d865dc1f89..5899c27c2e 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ 
b/tools/power/x86/intel-speed-select/isst-config.c @@ -16,9 +16,9 @@ struct process_cmd_struct { int arg; }; -static const char *version_str = "v1.18"; +static const char *version_str = "v1.19"; -static const int supported_api_ver = 2; +static const int supported_api_ver = 3; static struct isst_if_platform_info isst_platform_info; static char *progname; static int debug_flag; @@ -46,6 +46,8 @@ static int force_online_offline; static int auto_mode; static int fact_enable_fail; static int cgroupv2; +static int max_die_id; +static int max_punit_id; /* clos related */ static int current_clos = -1; @@ -562,6 +564,18 @@ void for_each_online_power_domain_in_set(void (*callback)(struct isst_id *, void } for (i = 0; i < MAX_PACKAGE_COUNT; i++) { + if (max_die_id == max_punit_id) { + for (k = 0; k < MAX_PUNIT_PER_DIE && k < MAX_DIE_PER_PACKAGE; k++) { + id.cpu = cpus[i][k][k]; + id.pkg = i; + id.die = k; + id.punit = k; + if (isst_is_punit_valid(&id)) + callback(&id, arg1, arg2, arg3, arg4); + } + continue; + } + for (j = 0; j < MAX_DIE_PER_PACKAGE; j++) { /* * Fix me: @@ -795,6 +809,12 @@ static void create_cpu_map(void) cpu_cnt[pkg_id][die_id][punit_id]++; + if (max_die_id < die_id) + max_die_id = die_id; + + if (max_punit_id < cpu_map[i].punit_id) + max_punit_id = cpu_map[i].punit_id; + debug_printf( "map logical_cpu:%d core: %d die:%d pkg:%d punit:%d punit_cpu:%d punit_core:%d\n", i, cpu_map[i].core_id, cpu_map[i].die_id, @@ -2054,6 +2074,7 @@ static void dump_fact_config_for_cpu(struct isst_id *id, void *arg1, void *arg2, struct isst_fact_info fact_info; int ret; + memset(&fact_info, 0, sizeof(fact_info)); ret = isst_get_fact_info(id, tdp_level, fact_bucket, &fact_info); if (ret) { isst_display_error_info_message(1, "Failed to get turbo-freq info at this level", 1, tdp_level); diff --git a/tools/power/x86/intel-speed-select/isst-core-mbox.c b/tools/power/x86/intel-speed-select/isst-core-mbox.c index 24bea57f4f..c81ecd602b 100644 --- 
a/tools/power/x86/intel-speed-select/isst-core-mbox.c +++ b/tools/power/x86/intel-speed-select/isst-core-mbox.c @@ -746,6 +746,7 @@ static int mbox_set_pbf_fact_status(struct isst_id *id, int pbf, int enable) static int _get_fact_bucket_info(struct isst_id *id, int level, struct isst_fact_bucket_info *bucket_info) { + int trl_max_levels = isst_get_trl_max_levels(); unsigned int resp; int i, k, ret; @@ -769,7 +770,7 @@ static int _get_fact_bucket_info(struct isst_id *id, int level, } } - for (k = 0; k < 3; ++k) { + for (k = 0; k < trl_max_levels; ++k) { for (i = 0; i < 2; ++i) { int j; diff --git a/tools/power/x86/intel-speed-select/isst-core-tpmi.c b/tools/power/x86/intel-speed-select/isst-core-tpmi.c index 3458768562..32ea70c7db 100644 --- a/tools/power/x86/intel-speed-select/isst-core-tpmi.c +++ b/tools/power/x86/intel-speed-select/isst-core-tpmi.c @@ -194,8 +194,14 @@ static int tpmi_get_ctdp_control(struct isst_id *id, int config_index, if (!(info.level_mask & level_mask)) return -1; - ctdp_level->fact_support = info.sst_tf_support; - ctdp_level->pbf_support = info.sst_bf_support; + if (api_version() > 2) { + ctdp_level->fact_support = info.sst_tf_support & BIT(config_index); + ctdp_level->pbf_support = info.sst_bf_support & BIT(config_index); + } else { + ctdp_level->fact_support = info.sst_tf_support; + ctdp_level->pbf_support = info.sst_bf_support; + } + ctdp_level->fact_enabled = !!(info.feature_state & BIT(1)); ctdp_level->pbf_enabled = !!(info.feature_state & BIT(0)); diff --git a/tools/power/x86/intel-speed-select/isst-core.c b/tools/power/x86/intel-speed-select/isst-core.c index f55fef4c13..05efffbca3 100644 --- a/tools/power/x86/intel-speed-select/isst-core.c +++ b/tools/power/x86/intel-speed-select/isst-core.c @@ -23,6 +23,7 @@ int isst_set_platform_ops(int api_version) isst_ops = mbox_get_platform_ops(); break; case 2: + case 3: isst_ops = tpmi_get_platform_ops(); break; default: diff --git a/tools/power/x86/intel-speed-select/isst-display.c 
b/tools/power/x86/intel-speed-select/isst-display.c index 14c9b03785..07ebd08f32 100644 --- a/tools/power/x86/intel-speed-select/isst-display.c +++ b/tools/power/x86/intel-speed-select/isst-display.c @@ -172,12 +172,19 @@ static int print_package_info(struct isst_id *id, FILE *outf) int level = 1; if (out_format_is_json()) { - if (api_version() > 1) - snprintf(header, sizeof(header), "package-%d:die-%d:powerdomain-%d:cpu-%d", - id->pkg, id->die, id->punit, id->cpu); - else + if (api_version() > 1) { + if (id->cpu < 0) + snprintf(header, sizeof(header), + "package-%d:die-%d:powerdomain-%d:cpu-None", + id->pkg, id->die, id->punit); + else + snprintf(header, sizeof(header), + "package-%d:die-%d:powerdomain-%d:cpu-%d", + id->pkg, id->die, id->punit, id->cpu); + } else { snprintf(header, sizeof(header), "package-%d:die-%d:cpu-%d", id->pkg, id->die, id->cpu); + } format_and_print(outf, level, header, NULL); return 1; } @@ -189,7 +196,12 @@ static int print_package_info(struct isst_id *id, FILE *outf) snprintf(header, sizeof(header), "powerdomain-%d", id->punit); format_and_print(outf, level++, header, NULL); } - snprintf(header, sizeof(header), "cpu-%d", id->cpu); + + if (id->cpu < 0) + snprintf(header, sizeof(header), "cpu-None"); + else + snprintf(header, sizeof(header), "cpu-%d", id->cpu); + format_and_print(outf, level, header, NULL); return level; @@ -199,8 +211,8 @@ static void _isst_pbf_display_information(struct isst_id *id, FILE *outf, int le struct isst_pbf_info *pbf_info, int disp_level) { - char header[256]; - char value[512]; + static char header[256]; + static char value[1024]; snprintf(header, sizeof(header), "speed-select-base-freq-properties"); format_and_print(outf, disp_level, header, NULL); @@ -338,8 +350,8 @@ void isst_ctdp_display_core_info(struct isst_id *id, FILE *outf, char *prefix, void isst_ctdp_display_information(struct isst_id *id, FILE *outf, int tdp_level, struct isst_pkg_ctdp *pkg_dev) { - char header[256]; - char value[512]; + static 
char header[256]; + static char value[1024]; static int level; int trl_max_levels = isst_get_trl_max_levels(); int i; diff --git a/tools/power/x86/intel-speed-select/isst.h b/tools/power/x86/intel-speed-select/isst.h index 4bddd3c66b..39ee75677c 100644 --- a/tools/power/x86/intel-speed-select/isst.h +++ b/tools/power/x86/intel-speed-select/isst.h @@ -80,7 +80,7 @@ #define DISP_FREQ_MULTIPLIER 100 #define MAX_PACKAGE_COUNT 32 -#define MAX_DIE_PER_PACKAGE 2 +#define MAX_DIE_PER_PACKAGE 16 #define MAX_PUNIT_PER_DIE 8 /* Unified structure to specific a CPU or a Power Domain */ diff --git a/tools/power/x86/turbostat/Makefile b/tools/power/x86/turbostat/Makefile index 92e139b9c7..b1e6817f1e 100644 --- a/tools/power/x86/turbostat/Makefile +++ b/tools/power/x86/turbostat/Makefile @@ -3,6 +3,8 @@ CC = $(CROSS_COMPILE)gcc BUILD_OUTPUT := $(CURDIR) PREFIX ?= /usr DESTDIR ?= +DAY := $(shell date +%Y.%m.%d) +SNAPSHOT = turbostat-$(DAY) ifeq ("$(origin O)", "command line") BUILD_OUTPUT := $(O) @@ -12,6 +14,7 @@ turbostat : turbostat.c override CFLAGS += -O2 -Wall -Wextra -I../../../include override CFLAGS += -DMSRHEADER='"../../../../arch/x86/include/asm/msr-index.h"' override CFLAGS += -DINTEL_FAMILY_HEADER='"../../../../arch/x86/include/asm/intel-family.h"' +override CFLAGS += -DBUILD_BUG_HEADER='"../../../../include/linux/build_bug.h"' override CFLAGS += -D_FILE_OFFSET_BITS=64 override CFLAGS += -D_FORTIFY_SOURCE=2 @@ -22,9 +25,33 @@ override CFLAGS += -D_FORTIFY_SOURCE=2 .PHONY : clean clean : @rm -f $(BUILD_OUTPUT)/turbostat + @rm -f $(SNAPSHOT).tar.gz install : turbostat - install -d $(DESTDIR)$(PREFIX)/bin + install -d $(DESTDIR)$(PREFIX)/bin install $(BUILD_OUTPUT)/turbostat $(DESTDIR)$(PREFIX)/bin/turbostat - install -d $(DESTDIR)$(PREFIX)/share/man/man8 + install -d $(DESTDIR)$(PREFIX)/share/man/man8 install -m 644 turbostat.8 $(DESTDIR)$(PREFIX)/share/man/man8 + +snapshot: turbostat + @rm -rf $(SNAPSHOT) + @mkdir $(SNAPSHOT) + @cp turbostat Makefile turbostat.c 
turbostat.8 ../../../../arch/x86/include/asm/intel-family.h $(SNAPSHOT) + + @sed -e 's/^#include /#include "bits.h"/' ../../../../arch/x86/include/asm/msr-index.h > $(SNAPSHOT)/msr-index.h + @echo '#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))' >> $(SNAPSHOT)/msr-index.h + @echo "#define BIT(x) (1 << (x))" > $(SNAPSHOT)/bits.h + @echo "#define BIT_ULL(nr) (1ULL << (nr))" >> $(SNAPSHOT)/bits.h + @echo "#define GENMASK(h, l) (((~0UL) << (l)) & (~0UL >> (sizeof(long) * 8 - 1 - (h))))" >> $(SNAPSHOT)/bits.h + @echo "#define GENMASK_ULL(h, l) (((~0ULL) << (l)) & (~0ULL >> (sizeof(long long) * 8 - 1 - (h))))" >> $(SNAPSHOT)/bits.h + + @echo '#define BUILD_BUG_ON(cond) do { enum { compile_time_check ## __COUNTER__ = 1/(!(cond)) }; } while (0)' > $(SNAPSHOT)/build_bug.h + + @echo PWD=. > $(SNAPSHOT)/Makefile + @echo "CFLAGS += -DMSRHEADER='\"msr-index.h\"'" >> $(SNAPSHOT)/Makefile + @echo "CFLAGS += -DINTEL_FAMILY_HEADER='\"intel-family.h\"'" >> $(SNAPSHOT)/Makefile + @echo "CFLAGS += -DBUILD_BUG_HEADER='\"build_bug.h\"'" >> $(SNAPSHOT)/Makefile + @sed -e's/.*MSRHEADER.*//' -e's/.*INTEL_FAMILY_HEADER.*//' -e's/.*BUILD_BUG_HEADER.*//' Makefile >> $(SNAPSHOT)/Makefile + + @rm -f $(SNAPSHOT).tar.gz + tar cvzf $(SNAPSHOT).tar.gz $(SNAPSHOT) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 0d3672e5d9..8d37acd392 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -155,7 +155,9 @@ The system configuration dump (if --quiet is not used) is followed by statistics .PP \fBRAM_%\fP percent of the interval that RAPL throttling was active on DRAM. .PP -\fBUncMHz\fP uncore MHz, instantaneous sample. +\fBUncMHz\fP per-package uncore MHz, instantaneous sample. +.PP +\fBUMHz1.0\fP per-package uncore MHz for domain=1 and fabric_cluster=0, instantaneous sample. System summary is the average of all packages. 
.SH TOO MUCH INFORMATION EXAMPLE By default, turbostat dumps all possible information -- a system configuration header, followed by columns for all counters. This is ideal for remote debugging, use the "--out" option to save everything to a text file, and get that file to the expert helping you debug. diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 5d80d193e5..9f5d053d4b 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -10,6 +10,7 @@ #define _GNU_SOURCE #include MSRHEADER #include INTEL_FAMILY_HEADER +#include BUILD_BUG_HEADER #include #include #include @@ -58,15 +59,22 @@ #define MAX_NOFILE 0x8000 enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; -enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC }; -enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT }; +enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC, COUNTER_K2M }; +enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT, FORMAT_AVERAGE }; enum amperf_source { AMPERF_SOURCE_PERF, AMPERF_SOURCE_MSR }; enum rapl_source { RAPL_SOURCE_NONE, RAPL_SOURCE_PERF, RAPL_SOURCE_MSR }; +enum cstate_source { CSTATE_SOURCE_NONE, CSTATE_SOURCE_PERF, CSTATE_SOURCE_MSR }; + +struct sysfs_path { + char path[PATH_BYTES]; + int id; + struct sysfs_path *next; +}; struct msr_counter { unsigned int msr_num; char name[NAME_BYTES]; - char path[PATH_BYTES]; + struct sysfs_path *sp; unsigned int width; enum counter_type type; enum counter_format format; @@ -78,64 +86,64 @@ struct msr_counter { }; struct msr_counter bic[] = { - { 0x0, "usec", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Time_Of_Day_Seconds", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Package", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Node", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Avg_MHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Busy%", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Bzy_MHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "TSC_MHz", 
"", 0, 0, 0, NULL, 0 }, - { 0x0, "IRQ", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SMI", "", 32, 0, FORMAT_DELTA, NULL, 0 }, - { 0x0, "sysfs", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%c1", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%c3", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%c6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%c7", "", 0, 0, 0, NULL, 0 }, - { 0x0, "ThreadC", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CoreTmp", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CoreCnt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "PkgTmp", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFX%rc6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFXMHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc2", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc3", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc7", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc8", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc9", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pk%pc10", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%LPI", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SYS%LPI", "", 0, 0, 0, NULL, 0 }, - { 0x0, "PkgWatt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CorWatt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFXWatt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "PkgCnt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "RAMWatt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "PKG_%", "", 0, 0, 0, NULL, 0 }, - { 0x0, "RAM_%", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg_J", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Cor_J", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFX_J", "", 0, 0, 0, NULL, 0 }, - { 0x0, "RAM_J", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Mod%c6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Totl%C0", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Any%C0", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFX%C0", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPUGFX%", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Core", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU", "", 0, 0, 0, NULL, 0 }, - { 0x0, "APIC", "", 0, 0, 0, NULL, 0 }, - { 0x0, "X2APIC", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Die", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFXAMHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "IPC", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CoreThr", "", 0, 0, 0, 
NULL, 0 }, - { 0x0, "UncMHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SAM%mc6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SAMMHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SAMAMHz", "", 0, 0, 0, NULL, 0 }, + { 0x0, "usec", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Time_Of_Day_Seconds", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Package", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Node", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Avg_MHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Busy%", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Bzy_MHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "TSC_MHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "IRQ", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SMI", NULL, 32, 0, FORMAT_DELTA, NULL, 0 }, + { 0x0, "sysfs", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c1", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c3", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c7", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "ThreadC", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CoreTmp", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CoreCnt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "PkgTmp", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFX%rc6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFXMHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc2", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc3", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc7", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc8", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc9", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pk%pc10", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%LPI", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SYS%LPI", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "PkgWatt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CorWatt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFXWatt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "PkgCnt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "RAMWatt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "PKG_%", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "RAM_%", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg_J", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Cor_J", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFX_J", NULL, 0, 0, 
0, NULL, 0 }, + { 0x0, "RAM_J", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Mod%c6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Totl%C0", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Any%C0", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFX%C0", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPUGFX%", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Core", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "APIC", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "X2APIC", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Die", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFXAMHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "IPC", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CoreThr", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "UncMHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SAM%mc6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SAMMHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SAMAMHz", NULL, 0, 0, 0, NULL, 0 }, }; #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) @@ -216,6 +224,28 @@ unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC #define BIC_NOT_PRESENT(COUNTER_BIT) (bic_present &= ~COUNTER_BIT) #define BIC_IS_ENABLED(COUNTER_BIT) (bic_enabled & COUNTER_BIT) +/* + * MSR_PKG_CST_CONFIG_CONTROL decoding for pkg_cstate_limit: + * If you change the values, note they are used both in comparisons + * (>= PCL__7) and to index pkg_cstate_limit_strings[]. 
+ */ +#define PCLUKN 0 /* Unknown */ +#define PCLRSV 1 /* Reserved */ +#define PCL__0 2 /* PC0 */ +#define PCL__1 3 /* PC1 */ +#define PCL__2 4 /* PC2 */ +#define PCL__3 5 /* PC3 */ +#define PCL__4 6 /* PC4 */ +#define PCL__6 7 /* PC6 */ +#define PCL_6N 8 /* PC6 No Retention */ +#define PCL_6R 9 /* PC6 Retention */ +#define PCL__7 10 /* PC7 */ +#define PCL_7S 11 /* PC7 Shrink */ +#define PCL__8 12 /* PC8 */ +#define PCL__9 13 /* PC9 */ +#define PCL_10 14 /* PC10 */ +#define PCLUNL 15 /* Unlimited */ + struct amperf_group_fd; char *proc_stat = "/proc/stat"; @@ -299,6 +329,9 @@ struct gfx_sysfs_info { static struct gfx_sysfs_info gfx_info[GFX_MAX]; int get_msr(int cpu, off_t offset, unsigned long long *msr); +int add_counter(unsigned int msr_num, char *path, char *name, + unsigned int width, enum counter_scope scope, + enum counter_type type, enum counter_format format, int flags, int package_num); /* Model specific support Start */ @@ -663,6 +696,23 @@ static const struct platform_features adl_features = { .enable_tsc_tweak = 1, }; +static const struct platform_features arl_features = { + .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, + .has_config_tdp = 1, + .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC10, + .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, + .has_msr_core_c1_res = 1, + .has_ext_cst_msrs = 1, + .trl_msrs = TRL_BASE, + .tcc_offset_bits = 6, + .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, + .enable_tsc_tweak = 1, +}; + static const struct platform_features skx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, @@ -905,8 +955,10 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_RAPTORLAKE_S, &adl_features }, { INTEL_FAM6_METEORLAKE, &cnl_features }, { INTEL_FAM6_METEORLAKE_L, &cnl_features }, - { INTEL_FAM6_ARROWLAKE, &cnl_features }, - { INTEL_FAM6_LUNARLAKE_M, &cnl_features }, + { 
INTEL_FAM6_ARROWLAKE_H, &arl_features }, + { INTEL_FAM6_ARROWLAKE_U, &arl_features }, + { INTEL_FAM6_ARROWLAKE, &arl_features }, + { INTEL_FAM6_LUNARLAKE_M, &arl_features }, { INTEL_FAM6_ATOM_SILVERMONT, &slv_features }, { INTEL_FAM6_ATOM_SILVERMONT_D, &slvd_features }, { INTEL_FAM6_ATOM_AIRMONT, &amt_features }, @@ -979,8 +1031,9 @@ char *progname; #define CPU_SUBSET_MAXCPUS 1024 /* need to use before probe... */ cpu_set_t *cpu_present_set, *cpu_effective_set, *cpu_allowed_set, *cpu_affinity_set, *cpu_subset; size_t cpu_present_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affinity_setsize, cpu_subset_size; -#define MAX_ADDED_COUNTERS 8 #define MAX_ADDED_THREAD_COUNTERS 24 +#define MAX_ADDED_CORE_COUNTERS 8 +#define MAX_ADDED_PACKAGE_COUNTERS 16 #define BITMASK_SIZE 32 /* Indexes used to map data read from perf and MSRs into global variables */ @@ -1153,6 +1206,161 @@ struct rapl_counter { double scale; }; +/* Indexes used to map data read from perf and MSRs into global variables */ +enum ccstate_rci_index { + CCSTATE_RCI_INDEX_C1_RESIDENCY = 0, + CCSTATE_RCI_INDEX_C3_RESIDENCY = 1, + CCSTATE_RCI_INDEX_C6_RESIDENCY = 2, + CCSTATE_RCI_INDEX_C7_RESIDENCY = 3, + PCSTATE_RCI_INDEX_C2_RESIDENCY = 4, + PCSTATE_RCI_INDEX_C3_RESIDENCY = 5, + PCSTATE_RCI_INDEX_C6_RESIDENCY = 6, + PCSTATE_RCI_INDEX_C7_RESIDENCY = 7, + PCSTATE_RCI_INDEX_C8_RESIDENCY = 8, + PCSTATE_RCI_INDEX_C9_RESIDENCY = 9, + PCSTATE_RCI_INDEX_C10_RESIDENCY = 10, + NUM_CSTATE_COUNTERS, +}; + +struct cstate_counter_info_t { + unsigned long long data[NUM_CSTATE_COUNTERS]; + enum cstate_source source[NUM_CSTATE_COUNTERS]; + unsigned long long msr[NUM_CSTATE_COUNTERS]; + int fd_perf_core; + int fd_perf_pkg; +}; + +struct cstate_counter_info_t *ccstate_counter_info; +unsigned int ccstate_counter_info_size; + +#define CSTATE_COUNTER_FLAG_COLLECT_PER_CORE (1u << 0) +#define CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD ((1u << 1) | CSTATE_COUNTER_FLAG_COLLECT_PER_CORE) +#define 
CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY (1u << 2) + +struct cstate_counter_arch_info { + int feature_mask; /* Mask for testing if the counter is supported on host */ + const char *perf_subsys; + const char *perf_name; + unsigned long long msr; + unsigned int rci_index; /* Maps data from perf counters to global variables */ + unsigned long long bic; + unsigned long long flags; + int pkg_cstate_limit; +}; + +static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { + { + .feature_mask = CC1, + .perf_subsys = "cstate_core", + .perf_name = "c1-residency", + .msr = MSR_CORE_C1_RES, + .rci_index = CCSTATE_RCI_INDEX_C1_RESIDENCY, + .bic = BIC_CPU_c1, + .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD, + .pkg_cstate_limit = 0, + }, + { + .feature_mask = CC3, + .perf_subsys = "cstate_core", + .perf_name = "c3-residency", + .msr = MSR_CORE_C3_RESIDENCY, + .rci_index = CCSTATE_RCI_INDEX_C3_RESIDENCY, + .bic = BIC_CPU_c3, + .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + .pkg_cstate_limit = 0, + }, + { + .feature_mask = CC6, + .perf_subsys = "cstate_core", + .perf_name = "c6-residency", + .msr = MSR_CORE_C6_RESIDENCY, + .rci_index = CCSTATE_RCI_INDEX_C6_RESIDENCY, + .bic = BIC_CPU_c6, + .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + .pkg_cstate_limit = 0, + }, + { + .feature_mask = CC7, + .perf_subsys = "cstate_core", + .perf_name = "c7-residency", + .msr = MSR_CORE_C7_RESIDENCY, + .rci_index = CCSTATE_RCI_INDEX_C7_RESIDENCY, + .bic = BIC_CPU_c7, + .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + .pkg_cstate_limit = 0, + }, + { + .feature_mask = PC2, + .perf_subsys = "cstate_pkg", + .perf_name = "c2-residency", + .msr = MSR_PKG_C2_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C2_RESIDENCY, + .bic = BIC_Pkgpc2, + .flags = 0, + .pkg_cstate_limit = PCL__2, + }, + { + .feature_mask = PC3, + .perf_subsys = "cstate_pkg", + .perf_name = 
"c3-residency", + .msr = MSR_PKG_C3_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C3_RESIDENCY, + .bic = BIC_Pkgpc3, + .flags = 0, + .pkg_cstate_limit = PCL__3, + }, + { + .feature_mask = PC6, + .perf_subsys = "cstate_pkg", + .perf_name = "c6-residency", + .msr = MSR_PKG_C6_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C6_RESIDENCY, + .bic = BIC_Pkgpc6, + .flags = 0, + .pkg_cstate_limit = PCL__6, + }, + { + .feature_mask = PC7, + .perf_subsys = "cstate_pkg", + .perf_name = "c7-residency", + .msr = MSR_PKG_C7_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C7_RESIDENCY, + .bic = BIC_Pkgpc7, + .flags = 0, + .pkg_cstate_limit = PCL__7, + }, + { + .feature_mask = PC8, + .perf_subsys = "cstate_pkg", + .perf_name = "c8-residency", + .msr = MSR_PKG_C8_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C8_RESIDENCY, + .bic = BIC_Pkgpc8, + .flags = 0, + .pkg_cstate_limit = PCL__8, + }, + { + .feature_mask = PC9, + .perf_subsys = "cstate_pkg", + .perf_name = "c9-residency", + .msr = MSR_PKG_C9_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C9_RESIDENCY, + .bic = BIC_Pkgpc9, + .flags = 0, + .pkg_cstate_limit = PCL__9, + }, + { + .feature_mask = PC10, + .perf_subsys = "cstate_pkg", + .perf_name = "c10-residency", + .msr = MSR_PKG_C10_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C10_RESIDENCY, + .bic = BIC_Pkgpc10, + .flags = 0, + .pkg_cstate_limit = PCL_10, + }, +}; + struct thread_data { struct timeval tv_begin; struct timeval tv_end; @@ -1182,7 +1390,7 @@ struct core_data { struct rapl_counter core_energy; /* MSR_CORE_ENERGY_STAT */ unsigned int core_id; unsigned long long core_throt_cnt; - unsigned long long counter[MAX_ADDED_COUNTERS]; + unsigned long long counter[MAX_ADDED_CORE_COUNTERS]; } *core_even, *core_odd; struct pkg_data { @@ -1215,7 +1423,7 @@ struct pkg_data { struct rapl_counter rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */ unsigned int pkg_temp_c; unsigned int uncore_mhz; - unsigned long long counter[MAX_ADDED_COUNTERS]; + unsigned long long 
counter[MAX_ADDED_PACKAGE_COUNTERS]; } *package_even, *package_odd; #define ODD_COUNTERS thread_odd, core_odd, package_odd @@ -1358,36 +1566,42 @@ struct sys_counters { struct msr_counter *pp; } sys; -void free_sys_counters(void) +static size_t free_msr_counters_(struct msr_counter **pp) { - struct msr_counter *p = sys.tp, *pnext = NULL; + struct msr_counter *p = NULL; + size_t num_freed = 0; - while (p) { - pnext = p->next; - free(p); - p = pnext; - } + while (*pp) { + p = *pp; - p = sys.cp, pnext = NULL; - while (p) { - pnext = p->next; - free(p); - p = pnext; - } + if (p->msr_num != 0) { + *pp = p->next; + + free(p); + ++num_freed; - p = sys.pp, pnext = NULL; - while (p) { - pnext = p->next; - free(p); - p = pnext; + continue; + } + + pp = &p->next; } - sys.added_thread_counters = 0; - sys.added_core_counters = 0; - sys.added_package_counters = 0; - sys.tp = NULL; - sys.cp = NULL; - sys.pp = NULL; + return num_freed; +} + +/* + * Free all added counters accessed via msr. + */ +static void free_sys_msr_counters(void) +{ + /* Thread counters */ + sys.added_thread_counters -= free_msr_counters_(&sys.tp); + + /* Core counters */ + sys.added_core_counters -= free_msr_counters_(&sys.cp); + + /* Package counters */ + sys.added_package_counters -= free_msr_counters_(&sys.pp); } struct system_summary { @@ -1533,23 +1747,12 @@ int get_msr_fd(int cpu) static void bic_disable_msr_access(void) { - const unsigned long bic_msrs = - BIC_SMI | - BIC_CPU_c1 | - BIC_CPU_c3 | - BIC_CPU_c6 | - BIC_CPU_c7 | - BIC_Mod_c6 | - BIC_CoreTmp | - BIC_Totl_c0 | - BIC_Any_c0 | - BIC_GFX_c0 | - BIC_CPUGFX | - BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_PkgTmp; + const unsigned long bic_msrs = BIC_SMI | BIC_Mod_c6 | BIC_CoreTmp | + BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_PkgTmp; bic_enabled &= ~bic_msrs; - free_sys_counters(); + free_sys_msr_counters(); } static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, 
int cpu, int group_fd, unsigned long flags) @@ -1932,13 +2135,15 @@ void print_header(char *delim) if (mp->format == FORMAT_RAW) { if (mp->width == 64) outp += sprintf(outp, "%s%18.18s", delim, mp->name); - else + else if (mp->width == 32) outp += sprintf(outp, "%s%10.10s", delim, mp->name); + else + outp += sprintf(outp, "%s%7.7s", delim, mp->name); } else { if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns) outp += sprintf(outp, "%s%8s", delim, mp->name); else - outp += sprintf(outp, "%s%s", delim, mp->name); + outp += sprintf(outp, "%s%7.7s", delim, mp->name); } } @@ -1970,7 +2175,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { outp += sprintf(outp, "tADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, - t->counter[i], mp->path); + t->counter[i], mp->sp->path); } } @@ -1991,7 +2196,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { outp += sprintf(outp, "cADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, - c->counter[i], mp->path); + c->counter[i], mp->sp->path); } outp += sprintf(outp, "mc6_us: %016llX\n", c->mc6_us); } @@ -2027,7 +2232,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { outp += sprintf(outp, "pADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, - p->counter[i], mp->path); + p->counter[i], mp->sp->path); } } @@ -2392,7 +2597,8 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), p->counter[i]); } else if (mp->format == FORMAT_PERCENT) { outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->counter[i] / tsc); - } + } else if (mp->type == COUNTER_K2M) + outp += sprintf(outp, "%s%d", (printed++ ? 
delim : ""), (unsigned int)p->counter[i] / 1000); } done: @@ -2502,6 +2708,8 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) old->counter[i] = new->counter[i]; + else if (mp->format == FORMAT_AVERAGE) + old->counter[i] = new->counter[i]; else old->counter[i] = new->counter[i] - old->counter[i]; } @@ -2974,7 +3182,7 @@ unsigned long long snapshot_sysfs_counter(char *path) return counter; } -int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp) +int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp, char *counter_path) { if (mp->msr_num != 0) { assert(!no_msr); @@ -2984,25 +3192,40 @@ int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp) char path[128 + PATH_BYTES]; if (mp->flags & SYSFS_PERCPU) { - sprintf(path, "/sys/devices/system/cpu/cpu%d/%s", cpu, mp->path); + sprintf(path, "/sys/devices/system/cpu/cpu%d/%s", cpu, mp->sp->path); *counterp = snapshot_sysfs_counter(path); } else { - *counterp = snapshot_sysfs_counter(mp->path); + *counterp = snapshot_sysfs_counter(counter_path); } } return 0; } -unsigned long long get_uncore_mhz(int package, int die) +unsigned long long get_legacy_uncore_mhz(int package) { char path[128]; + int die; + static int warn_once; + + /* + * for this package, use the first die_id that exists + */ + for (die = 0; die <= topo.max_die_id; ++die) { - sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/current_freq_khz", package, - die); + sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/current_freq_khz", + package, die); - return (snapshot_sysfs_counter(path) / 1000); + if (access(path, R_OK) == 0) + return (snapshot_sysfs_counter(path) / 1000); + } + if (!warn_once) { + warnx("BUG: %s: No %s", __func__, path); + warn_once = 1; + } + + return 0; } int get_epb(int cpu) @@ -3365,6 +3588,17 @@ size_t 
rapl_counter_info_count_perf(const struct rapl_counter_info_t *rci) return ret; } +static size_t cstate_counter_info_count_perf(const struct cstate_counter_info_t *cci) +{ + size_t ret = 0; + + for (int i = 0; i < NUM_CSTATE_COUNTERS; ++i) + if (cci->source[i] == CSTATE_SOURCE_PERF) + ++ret; + + return ret; +} + void write_rapl_counter(struct rapl_counter *rc, struct rapl_counter_info_t *rci, unsigned int idx) { rc->raw_value = rci->data[idx]; @@ -3439,7 +3673,7 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct } } - _Static_assert(NUM_RAPL_COUNTERS == 7); + BUILD_BUG_ON(NUM_RAPL_COUNTERS != 7); write_rapl_counter(&p->energy_pkg, rci, RAPL_RCI_INDEX_ENERGY_PKG); write_rapl_counter(&p->energy_cores, rci, RAPL_RCI_INDEX_ENERGY_CORES); write_rapl_counter(&p->energy_dram, rci, RAPL_RCI_INDEX_DRAM); @@ -3451,6 +3685,154 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct return 0; } +char *find_sysfs_path_by_id(struct sysfs_path *sp, int id) +{ + while (sp) { + if (sp->id == id) + return (sp->path); + sp = sp->next; + } + if (debug) + warnx("%s: id%d not found", __func__, id); + return NULL; +} + +int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + /* + * Overcommit memory a little bit here, + * but skip calculating exact sizes for the buffers. 
+ */ + unsigned long long perf_data[NUM_CSTATE_COUNTERS]; + unsigned long long perf_data_core[NUM_CSTATE_COUNTERS + 1]; + unsigned long long perf_data_pkg[NUM_CSTATE_COUNTERS + 1]; + + struct cstate_counter_info_t *cci; + + if (debug) + fprintf(stderr, "%s: cpu%d\n", __func__, cpu); + + assert(ccstate_counter_info); + assert(cpu <= ccstate_counter_info_size); + + memset(perf_data, 0, sizeof(perf_data)); + memset(perf_data_core, 0, sizeof(perf_data_core)); + memset(perf_data_pkg, 0, sizeof(perf_data_pkg)); + + cci = &ccstate_counter_info[cpu]; + + /* + * If we have any perf counters to read, read them all now, in bulk + */ + const size_t num_perf_counters = cstate_counter_info_count_perf(cci); + ssize_t expected_read_size = num_perf_counters * sizeof(unsigned long long); + ssize_t actual_read_size_core = 0, actual_read_size_pkg = 0; + + if (cci->fd_perf_core != -1) { + /* Each descriptor read begins with number of counters read. */ + expected_read_size += sizeof(unsigned long long); + + actual_read_size_core = read(cci->fd_perf_core, &perf_data_core[0], sizeof(perf_data_core)); + + if (actual_read_size_core <= 0) + err(-1, "%s: read perf %s: %ld", __func__, "core", actual_read_size_core); + } + + if (cci->fd_perf_pkg != -1) { + /* Each descriptor read begins with number of counters read. */ + expected_read_size += sizeof(unsigned long long); + + actual_read_size_pkg = read(cci->fd_perf_pkg, &perf_data_pkg[0], sizeof(perf_data_pkg)); + + if (actual_read_size_pkg <= 0) + err(-1, "%s: read perf %s: %ld", __func__, "pkg", actual_read_size_pkg); + } + + const ssize_t actual_read_size_total = actual_read_size_core + actual_read_size_pkg; + + if (actual_read_size_total != expected_read_size) + err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size, actual_read_size_total); + + /* + * Copy ccstate and pcstate data into unified buffer. + * + * Skip first element from core and pkg buffers. + * Kernel puts there how many counters were read. 
+ */ + const size_t num_core_counters = perf_data_core[0]; + const size_t num_pkg_counters = perf_data_pkg[0]; + + assert(num_perf_counters == num_core_counters + num_pkg_counters); + + /* Copy ccstate perf data */ + memcpy(&perf_data[0], &perf_data_core[1], num_core_counters * sizeof(unsigned long long)); + + /* Copy pcstate perf data */ + memcpy(&perf_data[num_core_counters], &perf_data_pkg[1], num_pkg_counters * sizeof(unsigned long long)); + + for (unsigned int i = 0, pi = 0; i < NUM_CSTATE_COUNTERS; ++i) { + switch (cci->source[i]) { + case CSTATE_SOURCE_NONE: + break; + + case CSTATE_SOURCE_PERF: + assert(pi < ARRAY_SIZE(perf_data)); + assert(cci->fd_perf_core != -1 || cci->fd_perf_pkg != -1); + + if (debug) { + fprintf(stderr, "cstate via %s %u: %llu\n", "perf", i, perf_data[pi]); + } + + cci->data[i] = perf_data[pi]; + + ++pi; + break; + + case CSTATE_SOURCE_MSR: + assert(!no_msr); + if (get_msr(cpu, cci->msr[i], &cci->data[i])) + return -13 - i; + + if (debug) { + fprintf(stderr, "cstate via %s0x%llx %u: %llu\n", "msr", cci->msr[i], i, cci->data[i]); + } + + break; + } + } + + /* + * Helper to write the data only if the source of + * the counter for the current cpu is not none. + * + * Otherwise we would overwrite core data with 0 (default value), + * when invoked for the thread sibling. 
+ */ +#define PERF_COUNTER_WRITE_DATA(out_counter, index) do { \ + if (cci->source[index] != CSTATE_SOURCE_NONE) \ + out_counter = cci->data[index]; \ +} while (0) + + BUILD_BUG_ON(NUM_CSTATE_COUNTERS != 11); + + PERF_COUNTER_WRITE_DATA(t->c1, CCSTATE_RCI_INDEX_C1_RESIDENCY); + PERF_COUNTER_WRITE_DATA(c->c3, CCSTATE_RCI_INDEX_C3_RESIDENCY); + PERF_COUNTER_WRITE_DATA(c->c6, CCSTATE_RCI_INDEX_C6_RESIDENCY); + PERF_COUNTER_WRITE_DATA(c->c7, CCSTATE_RCI_INDEX_C7_RESIDENCY); + + PERF_COUNTER_WRITE_DATA(p->pc2, PCSTATE_RCI_INDEX_C2_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc3, PCSTATE_RCI_INDEX_C3_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc6, PCSTATE_RCI_INDEX_C6_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc7, PCSTATE_RCI_INDEX_C7_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc8, PCSTATE_RCI_INDEX_C8_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc9, PCSTATE_RCI_INDEX_C9_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc10, PCSTATE_RCI_INDEX_C10_RESIDENCY); + +#undef PERF_COUNTER_WRITE_DATA + + return 0; +} + /* * get_counters(...) 
* migrate to cpu @@ -3506,13 +3888,11 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -5; t->smi_count = msr & 0xFFFFFFFF; } - if (DO_BIC(BIC_CPU_c1) && platform->has_msr_core_c1_res) { - if (get_msr(cpu, MSR_CORE_C1_RES, &t->c1)) - return -6; - } + + get_cstate_counters(cpu, t, c, p); for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { - if (get_mp(cpu, mp, &t->counter[i])) + if (get_mp(cpu, mp, &t->counter[i], mp->sp->path)) return -10; } @@ -3526,31 +3906,14 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return status; } - if (DO_BIC(BIC_CPU_c3) || soft_c1_residency_display(BIC_CPU_c3)) { - if (get_msr(cpu, MSR_CORE_C3_RESIDENCY, &c->c3)) - return -6; - } - - if ((DO_BIC(BIC_CPU_c6) || soft_c1_residency_display(BIC_CPU_c6)) && !platform->has_msr_knl_core_c6_residency) { - if (get_msr(cpu, MSR_CORE_C6_RESIDENCY, &c->c6)) - return -7; - } else if (platform->has_msr_knl_core_c6_residency && soft_c1_residency_display(BIC_CPU_c6)) { - if (get_msr(cpu, MSR_KNL_CORE_C6_RESIDENCY, &c->c6)) - return -7; - } - - if (DO_BIC(BIC_CPU_c7) || soft_c1_residency_display(BIC_CPU_c7)) { - if (get_msr(cpu, MSR_CORE_C7_RESIDENCY, &c->c7)) - return -8; - else if (t->is_atom) { - /* - * For Atom CPUs that has core cstate deeper than c6, - * MSR_CORE_C6_RESIDENCY returns residency of cc6 and deeper. - * Minus CC7 (and deeper cstates) residency to get - * accturate cc6 residency. - */ - c->c6 -= c->c7; - } + if (DO_BIC(BIC_CPU_c7) && t->is_atom) { + /* + * For Atom CPUs that has core cstate deeper than c6, + * MSR_CORE_C6_RESIDENCY returns residency of cc6 and deeper. + * Minus CC7 (and deeper cstates) residency to get + * accturate cc6 residency. 
+ */ + c->c6 -= c->c7; } if (DO_BIC(BIC_Mod_c6)) @@ -3567,7 +3930,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) get_core_throt_cnt(cpu, &c->core_throt_cnt); for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { - if (get_mp(cpu, mp, &c->counter[i])) + if (get_mp(cpu, mp, &c->counter[i], mp->sp->path)) return -10; } @@ -3591,34 +3954,6 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_PKG_BOTH_CORE_GFXE_C0_RES, &p->pkg_both_core_gfxe_c0)) return -13; } - if (DO_BIC(BIC_Pkgpc3)) - if (get_msr(cpu, MSR_PKG_C3_RESIDENCY, &p->pc3)) - return -9; - if (DO_BIC(BIC_Pkgpc6)) { - if (platform->has_msr_atom_pkg_c6_residency) { - if (get_msr(cpu, MSR_ATOM_PKG_C6_RESIDENCY, &p->pc6)) - return -10; - } else { - if (get_msr(cpu, MSR_PKG_C6_RESIDENCY, &p->pc6)) - return -10; - } - } - - if (DO_BIC(BIC_Pkgpc2)) - if (get_msr(cpu, MSR_PKG_C2_RESIDENCY, &p->pc2)) - return -11; - if (DO_BIC(BIC_Pkgpc7)) - if (get_msr(cpu, MSR_PKG_C7_RESIDENCY, &p->pc7)) - return -12; - if (DO_BIC(BIC_Pkgpc8)) - if (get_msr(cpu, MSR_PKG_C8_RESIDENCY, &p->pc8)) - return -13; - if (DO_BIC(BIC_Pkgpc9)) - if (get_msr(cpu, MSR_PKG_C9_RESIDENCY, &p->pc9)) - return -13; - if (DO_BIC(BIC_Pkgpc10)) - if (get_msr(cpu, MSR_PKG_C10_RESIDENCY, &p->pc10)) - return -13; if (DO_BIC(BIC_CPU_LPI)) p->cpu_lpi = cpuidle_cur_cpu_lpi_us; @@ -3637,9 +3972,8 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) p->pkg_temp_c = tj_max - ((msr >> 16) & 0x7F); } - /* n.b. 
assume die0 uncore frequency applies to whole package */ if (DO_BIC(BIC_UNCORE_MHZ)) - p->uncore_mhz = get_uncore_mhz(p->package_id, 0); + p->uncore_mhz = get_legacy_uncore_mhz(p->package_id); if (DO_BIC(BIC_GFX_rc6)) p->gfx_rc6_ms = gfx_info[GFX_rc6].val_ull; @@ -3660,7 +3994,16 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) p->sam_act_mhz = gfx_info[SAM_ACTMHz].val; for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { - if (get_mp(cpu, mp, &p->counter[i])) + char *path = NULL; + + if (mp->msr_num == 0) { + path = find_sysfs_path_by_id(mp->sp, p->package_id); + if (path == NULL) { + warnx("%s: package_id %d not found", __func__, p->package_id); + return -10; + } + } + if (get_mp(cpu, mp, &p->counter[i], path)) return -10; } done: @@ -3669,31 +4012,8 @@ done: return 0; } -/* - * MSR_PKG_CST_CONFIG_CONTROL decoding for pkg_cstate_limit: - * If you change the values, note they are used both in comparisons - * (>= PCL__7) and to index pkg_cstate_limit_strings[]. 
- */ - -#define PCLUKN 0 /* Unknown */ -#define PCLRSV 1 /* Reserved */ -#define PCL__0 2 /* PC0 */ -#define PCL__1 3 /* PC1 */ -#define PCL__2 4 /* PC2 */ -#define PCL__3 5 /* PC3 */ -#define PCL__4 6 /* PC4 */ -#define PCL__6 7 /* PC6 */ -#define PCL_6N 8 /* PC6 No Retention */ -#define PCL_6R 9 /* PC6 Retention */ -#define PCL__7 10 /* PC7 */ -#define PCL_7S 11 /* PC7 Shrink */ -#define PCL__8 12 /* PC8 */ -#define PCL__9 13 /* PC9 */ -#define PCL_10 14 /* PC10 */ -#define PCLUNL 15 /* Unlimited */ - int pkg_cstate_limit = PCLUKN; -char *pkg_cstate_limit_strings[] = { "reserved", "unknown", "pc0", "pc1", "pc2", +char *pkg_cstate_limit_strings[] = { "unknown", "reserved", "pc0", "pc1", "pc2", "pc3", "pc4", "pc6", "pc6n", "pc6r", "pc7", "pc7s", "pc8", "pc9", "pc10", "unlimited" }; @@ -4182,6 +4502,26 @@ void free_fd_instr_count_percpu(void) fd_instr_count_percpu = NULL; } +void free_fd_cstate(void) +{ + if (!ccstate_counter_info) + return; + + const int counter_info_num = ccstate_counter_info_size; + + for (int counter_id = 0; counter_id < counter_info_num; ++counter_id) { + if (ccstate_counter_info[counter_id].fd_perf_core != -1) + close(ccstate_counter_info[counter_id].fd_perf_core); + + if (ccstate_counter_info[counter_id].fd_perf_pkg != -1) + close(ccstate_counter_info[counter_id].fd_perf_pkg); + } + + free(ccstate_counter_info); + ccstate_counter_info = NULL; + ccstate_counter_info_size = 0; +} + void free_fd_rapl_percpu(void) { if (!rapl_counter_info_perdomain) @@ -4243,6 +4583,7 @@ void free_all_buffers(void) free_fd_instr_count_percpu(); free_fd_amperf_percpu(); free_fd_rapl_percpu(); + free_fd_cstate(); free(irq_column_2_cpu); free(irqs_per_cpu); @@ -4578,6 +4919,7 @@ static void update_effective_set(bool startup) void linux_perf_init(void); void rapl_perf_init(void); +void cstate_perf_init(void); void re_initialize(void) { @@ -4585,6 +4927,7 @@ void re_initialize(void) setup_all_buffers(false); linux_perf_init(); rapl_perf_init(); + cstate_perf_init(); 
fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); } @@ -5303,30 +5646,27 @@ static void dump_sysfs_file(char *path) fprintf(outf, "%s: %s", strrchr(path, '/') + 1, cpuidle_buf); } -static void probe_intel_uncore_frequency(void) +static void probe_intel_uncore_frequency_legacy(void) { int i, j; char path[256]; - if (!genuine_intel) - return; - - if (access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz", R_OK)) - goto probe_cluster; - - BIC_PRESENT(BIC_UNCORE_MHZ); - - if (quiet) - return; - for (i = 0; i < topo.num_packages; ++i) { - for (j = 0; j < topo.num_die; ++j) { + for (j = 0; j <= topo.max_die_id; ++j) { int k, l; char path_base[128]; sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d", i, j); + if (access(path_base, R_OK)) + continue; + + BIC_PRESENT(BIC_UNCORE_MHZ); + + if (quiet) + return; + sprintf(path, "%s/min_freq_khz", path_base); k = read_sysfs_int(path); sprintf(path, "%s/max_freq_khz", path_base); @@ -5344,24 +5684,36 @@ static void probe_intel_uncore_frequency(void) fprintf(outf, " %d MHz\n", k / 1000); } } - return; +} + +static void probe_intel_uncore_frequency_cluster(void) +{ + int i, uncore_max_id; + char path[256]; + char path_base[128]; -probe_cluster: if (access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00/current_freq_khz", R_OK)) return; - if (quiet) - return; + for (uncore_max_id = 0;; ++uncore_max_id) { + + sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", uncore_max_id); - for (i = 0;; ++i) { + /* uncore## start at 00 and skips no numbers, so stop upon first missing */ + if (access(path_base, R_OK)) { + uncore_max_id -= 1; + break; + } + } + for (i = uncore_max_id; i >= 0; --i) { int k, l; - char path_base[128]; int package_id, domain_id, cluster_id; + char name_buf[16]; sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", i); if 
(access(path_base, R_OK)) - break; + err(1, "%s: %s\n", __func__, path_base); sprintf(path, "%s/package_id", path_base); package_id = read_sysfs_int(path); @@ -5372,6 +5724,14 @@ probe_cluster: sprintf(path, "%s/fabric_cluster_id", path_base); cluster_id = read_sysfs_int(path); + sprintf(path, "%s/current_freq_khz", path_base); + sprintf(name_buf, "UMHz%d.%d", domain_id, cluster_id); + + add_counter(0, path, name_buf, 0, SCOPE_PACKAGE, COUNTER_K2M, FORMAT_AVERAGE, 0, package_id); + + if (quiet) + continue; + sprintf(path, "%s/min_freq_khz", path_base); k = read_sysfs_int(path); sprintf(path, "%s/max_freq_khz", path_base); @@ -5391,6 +5751,17 @@ probe_cluster: } } +static void probe_intel_uncore_frequency(void) +{ + if (!genuine_intel) + return; + + if (access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00", R_OK) == 0) + probe_intel_uncore_frequency_cluster(); + else + probe_intel_uncore_frequency_legacy(); +} + static void probe_graphics(void) { /* Xe graphics sysfs knobs */ @@ -5475,7 +5846,6 @@ next: else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) gfx_info[GFX_MHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz"; - if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK)) gfx_info[GFX_ACTMHz].path = "/sys/class/drm/card0/gt_act_freq_mhz"; else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) @@ -6414,7 +6784,8 @@ bool is_aperf_access_required(void) return BIC_IS_ENABLED(BIC_Avg_MHz) || BIC_IS_ENABLED(BIC_Busy) || BIC_IS_ENABLED(BIC_Bzy_MHz) - || BIC_IS_ENABLED(BIC_IPC); + || BIC_IS_ENABLED(BIC_IPC) + || BIC_IS_ENABLED(BIC_CPU_c1); } int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai, @@ -6646,42 +7017,160 @@ static int has_amperf_access(void) return 0; } -void probe_cstates(void) +int *get_cstate_perf_group_fd(struct cstate_counter_info_t *cci, const char *group_name) { - probe_cst_limit(); + if 
(strcmp(group_name, "cstate_core") == 0) + return &cci->fd_perf_core; - if (platform->supported_cstates & CC1) - BIC_PRESENT(BIC_CPU_c1); + if (strcmp(group_name, "cstate_pkg") == 0) + return &cci->fd_perf_pkg; - if (platform->supported_cstates & CC3) - BIC_PRESENT(BIC_CPU_c3); + return NULL; +} - if (platform->supported_cstates & CC6) - BIC_PRESENT(BIC_CPU_c6); +int add_cstate_perf_counter_(int cpu, struct cstate_counter_info_t *cci, const struct cstate_counter_arch_info *cai) +{ + if (no_perf) + return -1; - if (platform->supported_cstates & CC7) - BIC_PRESENT(BIC_CPU_c7); + int *pfd_group = get_cstate_perf_group_fd(cci, cai->perf_subsys); - if (platform->supported_cstates & PC2 && (pkg_cstate_limit >= PCL__2)) - BIC_PRESENT(BIC_Pkgpc2); + if (pfd_group == NULL) + return -1; - if (platform->supported_cstates & PC3 && (pkg_cstate_limit >= PCL__3)) - BIC_PRESENT(BIC_Pkgpc3); + const unsigned int type = read_perf_type(cai->perf_subsys); + const unsigned int config = read_rapl_config(cai->perf_subsys, cai->perf_name); - if (platform->supported_cstates & PC6 && (pkg_cstate_limit >= PCL__6)) - BIC_PRESENT(BIC_Pkgpc6); + const int fd_counter = open_perf_counter(cpu, type, config, *pfd_group, PERF_FORMAT_GROUP); - if (platform->supported_cstates & PC7 && (pkg_cstate_limit >= PCL__7)) - BIC_PRESENT(BIC_Pkgpc7); + if (fd_counter == -1) + return -1; - if (platform->supported_cstates & PC8 && (pkg_cstate_limit >= PCL__8)) - BIC_PRESENT(BIC_Pkgpc8); + /* If it's the first counter opened, make it a group descriptor */ + if (*pfd_group == -1) + *pfd_group = fd_counter; - if (platform->supported_cstates & PC9 && (pkg_cstate_limit >= PCL__9)) - BIC_PRESENT(BIC_Pkgpc9); + return fd_counter; +} - if (platform->supported_cstates & PC10 && (pkg_cstate_limit >= PCL_10)) - BIC_PRESENT(BIC_Pkgpc10); +int add_cstate_perf_counter(int cpu, struct cstate_counter_info_t *cci, const struct cstate_counter_arch_info *cai) +{ + int ret = add_cstate_perf_counter_(cpu, cci, cai); + + if (debug) + 
fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu); + + return ret; +} + +void cstate_perf_init_(bool soft_c1) +{ + bool has_counter; + bool *cores_visited = NULL, *pkg_visited = NULL; + const int cores_visited_elems = topo.max_core_id + 1; + const int pkg_visited_elems = topo.max_package_id + 1; + const int cci_num = topo.max_cpu_num + 1; + + ccstate_counter_info = calloc(cci_num, sizeof(*ccstate_counter_info)); + if (!ccstate_counter_info) + err(1, "calloc ccstate_counter_arch_info"); + ccstate_counter_info_size = cci_num; + + cores_visited = calloc(cores_visited_elems, sizeof(*cores_visited)); + if (!cores_visited) + err(1, "calloc cores_visited"); + + pkg_visited = calloc(pkg_visited_elems, sizeof(*pkg_visited)); + if (!pkg_visited) + err(1, "calloc pkg_visited"); + + /* Initialize cstate_counter_info_percpu */ + for (int cpu = 0; cpu < cci_num; ++cpu) { + ccstate_counter_info[cpu].fd_perf_core = -1; + ccstate_counter_info[cpu].fd_perf_pkg = -1; + } + + for (int cidx = 0; cidx < NUM_CSTATE_COUNTERS; ++cidx) { + has_counter = false; + memset(cores_visited, 0, cores_visited_elems * sizeof(*cores_visited)); + memset(pkg_visited, 0, pkg_visited_elems * sizeof(*pkg_visited)); + + const struct cstate_counter_arch_info *cai = &ccstate_counter_arch_infos[cidx]; + + for (int cpu = 0; cpu < cci_num; ++cpu) { + + struct cstate_counter_info_t *const cci = &ccstate_counter_info[cpu]; + + if (cpu_is_not_allowed(cpu)) + continue; + + const int core_id = cpus[cpu].physical_core_id; + const int pkg_id = cpus[cpu].physical_package_id; + + assert(core_id < cores_visited_elems); + assert(pkg_id < pkg_visited_elems); + + const bool per_thread = cai->flags & CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD; + const bool per_core = cai->flags & CSTATE_COUNTER_FLAG_COLLECT_PER_CORE; + + if (!per_thread && cores_visited[core_id]) + continue; + + if (!per_core && pkg_visited[pkg_id]) + continue; + + const bool counter_needed = BIC_IS_ENABLED(cai->bic) || + (soft_c1 && (cai->flags & 
CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY)); + const bool counter_supported = (platform->supported_cstates & cai->feature_mask); + + if (counter_needed && counter_supported) { + /* Use perf API for this counter */ + if (!no_perf && cai->perf_name && add_cstate_perf_counter(cpu, cci, cai) != -1) { + + cci->source[cai->rci_index] = CSTATE_SOURCE_PERF; + + /* User MSR for this counter */ + } else if (!no_msr && cai->msr && pkg_cstate_limit >= cai->pkg_cstate_limit + && probe_msr(cpu, cai->msr) == 0) { + cci->source[cai->rci_index] = CSTATE_SOURCE_MSR; + cci->msr[cai->rci_index] = cai->msr; + } + } + + if (cci->source[cai->rci_index] != CSTATE_SOURCE_NONE) { + has_counter = true; + cores_visited[core_id] = true; + pkg_visited[pkg_id] = true; + } + } + + /* If any CPU has access to the counter, make it present */ + if (has_counter) + BIC_PRESENT(cai->bic); + } + + free(cores_visited); + free(pkg_visited); +} + +void cstate_perf_init(void) +{ + /* + * If we don't have a C1 residency MSR, we calculate it "in software", + * but we need APERF, MPERF too. 
+ */ + const bool soft_c1 = !platform->has_msr_core_c1_res && has_amperf_access() + && platform->supported_cstates & CC1; + + if (soft_c1) + BIC_PRESENT(BIC_CPU_c1); + + cstate_perf_init_(soft_c1); +} + +void probe_cstates(void) +{ + probe_cst_limit(); if (platform->has_msr_module_c6_res_ms) BIC_PRESENT(BIC_Mod_c6); @@ -6939,6 +7428,22 @@ void process_cpuid() BIC_PRESENT(BIC_TSC_MHz); } +static void counter_info_init(void) +{ + for (int i = 0; i < NUM_CSTATE_COUNTERS; ++i) { + struct cstate_counter_arch_info *const cai = &ccstate_counter_arch_infos[i]; + + if (platform->has_msr_knl_core_c6_residency && cai->msr == MSR_CORE_C6_RESIDENCY) + cai->msr = MSR_KNL_CORE_C6_RESIDENCY; + + if (!platform->has_msr_core_c1_res && cai->msr == MSR_CORE_C1_RES) + cai->msr = 0; + + if (platform->has_msr_atom_pkg_c6_residency && cai->msr == MSR_PKG_C6_RESIDENCY) + cai->msr = MSR_ATOM_PKG_C6_RESIDENCY; + } +} + void probe_pm_features(void) { probe_pstates(); @@ -7416,10 +7921,12 @@ void turbostat_init() check_msr_access(); check_perf_access(); process_cpuid(); + counter_info_init(); probe_pm_features(); set_amperf_source(); linux_perf_init(); rapl_perf_init(); + cstate_perf_init(); for_all_cpus(get_cpu_type, ODD_COUNTERS); for_all_cpus(get_cpu_type, EVEN_COUNTERS); @@ -7510,7 +8017,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 2024.04.08 - Len Brown \n"); + fprintf(outf, "turbostat version 2024.05.10 - Len Brown \n"); } #define COMMAND_LINE_SIZE 2048 @@ -7536,61 +8043,114 @@ void print_bootcmd(void) fclose(fp); } +struct msr_counter *find_msrp_by_name(struct msr_counter *head, char *name) +{ + struct msr_counter *mp; + + for (mp = head; mp; mp = mp->next) { + if (debug) + printf("%s: %s %s\n", __func__, name, mp->name); + if (!strncmp(name, mp->name, strlen(mp->name))) + return mp; + } + return NULL; +} + int add_counter(unsigned int msr_num, char *path, char *name, unsigned int width, enum counter_scope scope, - enum counter_type 
type, enum counter_format format, int flags) + enum counter_type type, enum counter_format format, int flags, int id) { struct msr_counter *msrp; if (no_msr && msr_num) errx(1, "Requested MSR counter 0x%x, but in --no-msr mode", msr_num); - msrp = calloc(1, sizeof(struct msr_counter)); - if (msrp == NULL) { - perror("calloc"); - exit(1); - } - - msrp->msr_num = msr_num; - strncpy(msrp->name, name, NAME_BYTES - 1); - if (path) - strncpy(msrp->path, path, PATH_BYTES - 1); - msrp->width = width; - msrp->type = type; - msrp->format = format; - msrp->flags = flags; + if (debug) + printf("%s(msr%d, %s, %s, width%d, scope%d, type%d, format%d, flags%x, id%d)\n", __func__, msr_num, + path, name, width, scope, type, format, flags, id); switch (scope) { case SCOPE_CPU: - msrp->next = sys.tp; - sys.tp = msrp; - sys.added_thread_counters++; - if (sys.added_thread_counters > MAX_ADDED_THREAD_COUNTERS) { - fprintf(stderr, "exceeded max %d added thread counters\n", MAX_ADDED_COUNTERS); - exit(-1); + msrp = find_msrp_by_name(sys.tp, name); + if (msrp) { + if (debug) + printf("%s: %s FOUND\n", __func__, name); + break; + } + if (sys.added_thread_counters++ >= MAX_ADDED_THREAD_COUNTERS) { + warnx("ignoring thread counter %s", name); + return -1; } break; - case SCOPE_CORE: - msrp->next = sys.cp; - sys.cp = msrp; - sys.added_core_counters++; - if (sys.added_core_counters > MAX_ADDED_COUNTERS) { - fprintf(stderr, "exceeded max %d added core counters\n", MAX_ADDED_COUNTERS); - exit(-1); + msrp = find_msrp_by_name(sys.cp, name); + if (msrp) { + if (debug) + printf("%s: %s FOUND\n", __func__, name); + break; + } + if (sys.added_core_counters++ >= MAX_ADDED_CORE_COUNTERS) { + warnx("ignoring core counter %s", name); + return -1; } break; - case SCOPE_PACKAGE: - msrp->next = sys.pp; - sys.pp = msrp; - sys.added_package_counters++; - if (sys.added_package_counters > MAX_ADDED_COUNTERS) { - fprintf(stderr, "exceeded max %d added package counters\n", MAX_ADDED_COUNTERS); - exit(-1); + msrp = 
find_msrp_by_name(sys.pp, name); + if (msrp) { + if (debug) + printf("%s: %s FOUND\n", __func__, name); + break; + } + if (sys.added_package_counters++ >= MAX_ADDED_PACKAGE_COUNTERS) { + warnx("ignoring package counter %s", name); + return -1; } break; + default: + warnx("ignoring counter %s with unknown scope", name); + return -1; + } + + if (msrp == NULL) { + msrp = calloc(1, sizeof(struct msr_counter)); + if (msrp == NULL) + err(-1, "calloc msr_counter"); + msrp->msr_num = msr_num; + strncpy(msrp->name, name, NAME_BYTES - 1); + msrp->width = width; + msrp->type = type; + msrp->format = format; + msrp->flags = flags; + + switch (scope) { + case SCOPE_CPU: + msrp->next = sys.tp; + sys.tp = msrp; + break; + case SCOPE_CORE: + msrp->next = sys.cp; + sys.cp = msrp; + break; + case SCOPE_PACKAGE: + msrp->next = sys.pp; + sys.pp = msrp; + break; + } + } + + if (path) { + struct sysfs_path *sp; + + sp = calloc(1, sizeof(struct sysfs_path)); + if (sp == NULL) { + perror("calloc"); + exit(1); + } + strncpy(sp->path, path, PATH_BYTES - 1); + sp->id = id; + sp->next = msrp->sp; + msrp->sp = sp; } return 0; @@ -7692,7 +8252,7 @@ next: sprintf(name_buffer, "M0X%x%s", msr_num, format == FORMAT_PERCENT ? 
"%" : ""); } - if (add_counter(msr_num, path, name_buffer, width, scope, type, format, 0)) + if (add_counter(msr_num, path, name_buffer, width, scope, type, format, 0, 0)) fail++; if (fail) { @@ -7757,7 +8317,7 @@ void probe_sysfs(void) if (is_deferred_skip(name_buf)) continue; - add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_USEC, FORMAT_PERCENT, SYSFS_PERCPU); + add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_USEC, FORMAT_PERCENT, SYSFS_PERCPU, 0); } for (state = 10; state >= 0; --state) { @@ -7785,7 +8345,7 @@ void probe_sysfs(void) if (is_deferred_skip(name_buf)) continue; - add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_ITEMS, FORMAT_DELTA, SYSFS_PERCPU); + add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_ITEMS, FORMAT_DELTA, SYSFS_PERCPU, 0); } } diff --git a/tools/sound/dapm-graph b/tools/sound/dapm-graph new file mode 100755 index 0000000000..57d78f6df0 --- /dev/null +++ b/tools/sound/dapm-graph @@ -0,0 +1,303 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# Generate a graph of the current DAPM state for an audio card +# +# Copyright 2024 Bootlin +# Author: Luca Ceresoli + +set -eu + +STYLE_NODE_ON="shape=box,style=bold,color=green4" +STYLE_NODE_OFF="shape=box,style=filled,color=gray30,fillcolor=gray95" + +# Print usage and exit +# +# $1 = exit return value +# $2 = error string (required if $1 != 0) +usage() +{ + if [ "${1}" -ne 0 ]; then + echo "${2}" >&2 + fi + + echo " +Generate a graph of the current DAPM state for an audio card. + +The DAPM state can be obtained via debugfs for a card on the local host or +a remote target, or from a local copy of the debugfs tree for the card. 
+ +Usage: + $(basename $0) [options] -c CARD - Local sound card + $(basename $0) [options] -c CARD -r REMOTE_TARGET - Card on remote system + $(basename $0) [options] -d STATE_DIR - Local directory + +Options: + -c CARD Sound card to get DAPM state of + -r REMOTE_TARGET Get DAPM state from REMOTE_TARGET via SSH and SCP + instead of using a local sound card + -d STATE_DIR Get DAPM state from a local copy of a debugfs tree + -o OUT_FILE Output file (default: dapm.dot) + -D Show verbose debugging info + -h Print this help and exit + +The output format is implied by the extension of OUT_FILE: + + * Use the .dot extension to generate a text graph representation in + graphviz dot syntax. + * Any other extension is assumed to be a format supported by graphviz for + rendering, e.g. 'png', 'svg', and will produce both the .dot file and a + picture from it. This requires the 'dot' program from the graphviz + package. +" + + exit ${1} +} + +# Connect to a remote target via SSH, collect all DAPM files from debufs +# into a tarball and get the tarball via SCP into $3/dapm.tar +# +# $1 = target as used by ssh and scp, e.g. "root@192.168.1.1" +# $2 = sound card name +# $3 = temp dir path (present on the host, created on the target) +# $4 = local directory to extract the tarball into +# +# Requires an ssh+scp server, find and tar+gz on the target +# +# Note: the tarball is needed because plain 'scp -r' from debugfs would +# copy only empty files +grab_remote_files() +{ + echo "Collecting DAPM state from ${1}" + dbg_echo "Collected DAPM state in ${3}" + + ssh "${1}" " +set -eu && +cd \"/sys/kernel/debug/asoc/${2}\" && +find * -type d -exec mkdir -p ${3}/dapm-tree/{} \; && +find * -type f -exec cp \"{}\" \"${3}/dapm-tree/{}\" \; && +cd ${3}/dapm-tree && +tar cf ${3}/dapm.tar ." 
+ scp -q "${1}:${3}/dapm.tar" "${3}" + + mkdir -p "${4}" + tar xf "${tmp_dir}/dapm.tar" -C "${4}" +} + +# Parse a widget file and generate graph description in graphviz dot format +# +# Skips any file named "bias_level". +# +# $1 = temporary work dir +# $2 = component name +# $3 = widget filename +process_dapm_widget() +{ + local tmp_dir="${1}" + local c_name="${2}" + local w_file="${3}" + local dot_file="${tmp_dir}/main.dot" + local links_file="${tmp_dir}/links.dot" + + local w_name="$(basename "${w_file}")" + local w_tag="${c_name}_${w_name}" + + if [ "${w_name}" = "bias_level" ]; then + return 0 + fi + + dbg_echo " + Widget: ${w_name}" + + cat "${w_file}" | ( + read line + + if echo "${line}" | grep -q ': On ' + then local node_style="${STYLE_NODE_ON}" + else local node_style="${STYLE_NODE_OFF}" + fi + + local w_type="" + while read line; do + # Collect widget type if present + if echo "${line}" | grep -q '^widget-type '; then + local w_type_raw="$(echo "$line" | cut -d ' ' -f 2)" + dbg_echo " - Widget type: ${w_type_raw}" + + # Note: escaping '\n' is tricky to get working with both + # bash and busybox ash, so use a '%' here and replace it + # later + local w_type="%n[${w_type_raw}]" + fi + + # Collect any links. 
We could use "in" links or "out" links, + # let's use "in" links + if echo "${line}" | grep -q '^in '; then + local w_src=$(echo "$line" | + awk -F\" '{print $6 "_" $4}' | + sed 's/^(null)_/ROOT_/') + dbg_echo " - Input route from: ${w_src}" + echo " \"${w_src}\" -> \"$w_tag\"" >> "${links_file}" + fi + done + + echo " \"${w_tag}\" [label=\"${w_name}${w_type}\",${node_style}]" | + tr '%' '\\' >> "${dot_file}" + ) +} + +# Parse the DAPM tree for a sound card component and generate graph +# description in graphviz dot format +# +# $1 = temporary work dir +# $2 = component directory +# $3 = forced component name (extracted for path if empty) +process_dapm_component() +{ + local tmp_dir="${1}" + local c_dir="${2}" + local c_name="${3}" + local dot_file="${tmp_dir}/main.dot" + local links_file="${tmp_dir}/links.dot" + + if [ -z "${c_name}" ]; then + # Extract directory name into component name: + # "./cs42l51.0-004a/dapm" -> "cs42l51.0-004a" + c_name="$(basename $(dirname "${c_dir}"))" + fi + + dbg_echo " * Component: ${c_name}" + + echo "" >> "${dot_file}" + echo " subgraph \"${c_name}\" {" >> "${dot_file}" + echo " cluster = true" >> "${dot_file}" + echo " label = \"${c_name}\"" >> "${dot_file}" + echo " color=dodgerblue" >> "${dot_file}" + + # Create empty file to ensure it will exist in all cases + >"${links_file}" + + # Iterate over widgets in the component dir + for w_file in ${c_dir}/*; do + process_dapm_widget "${tmp_dir}" "${c_name}" "${w_file}" + done + + echo " }" >> "${dot_file}" + + cat "${links_file}" >> "${dot_file}" +} + +# Parse the DAPM tree for a sound card and generate graph description in +# graphviz dot format +# +# $1 = temporary work dir +# $2 = directory tree with DAPM state (either in debugfs or a mirror) +process_dapm_tree() +{ + local tmp_dir="${1}" + local dapm_dir="${2}" + local dot_file="${tmp_dir}/main.dot" + + echo "digraph G {" > "${dot_file}" + echo " fontname=\"sans-serif\"" >> "${dot_file}" + echo " node [fontname=\"sans-serif\"]" >> 
"${dot_file}" + + + # Process root directory (no component) + process_dapm_component "${tmp_dir}" "${dapm_dir}/dapm" "ROOT" + + # Iterate over components + for c_dir in "${dapm_dir}"/*/dapm + do + process_dapm_component "${tmp_dir}" "${c_dir}" "" + done + + echo "}" >> "${dot_file}" +} + +main() +{ + # Parse command line + local out_file="dapm.dot" + local card_name="" + local remote_target="" + local dapm_tree="" + local dbg_on="" + while getopts "c:r:d:o:Dh" arg; do + case $arg in + c) card_name="${OPTARG}" ;; + r) remote_target="${OPTARG}" ;; + d) dapm_tree="${OPTARG}" ;; + o) out_file="${OPTARG}" ;; + D) dbg_on="1" ;; + h) usage 0 ;; + *) usage 1 ;; + esac + done + shift $(($OPTIND - 1)) + + if [ -n "${dapm_tree}" ]; then + if [ -n "${card_name}${remote_target}" ]; then + usage 1 "Cannot use -c and -r with -d" + fi + echo "Using local tree: ${dapm_tree}" + elif [ -n "${remote_target}" ]; then + if [ -z "${card_name}" ]; then + usage 1 "-r requires -c" + fi + echo "Using card ${card_name} from remote target ${remote_target}" + elif [ -n "${card_name}" ]; then + echo "Using local card: ${card_name}" + else + usage 1 "Please choose mode using -c, -r or -d" + fi + + # Define logging function + if [ "${dbg_on}" ]; then + dbg_echo() { + echo "$*" >&2 + } + else + dbg_echo() { + : + } + fi + + # Filename must have a dot in order the infer the format from the + # extension + if ! 
echo "${out_file}" | grep -qE '\.'; then + echo "Missing extension in output filename ${out_file}" >&2 + usage + exit 1 + fi + + local out_fmt="${out_file##*.}" + local dot_file="${out_file%.*}.dot" + + dbg_echo "dot file: $dot_file" + dbg_echo "Output file: $out_file" + dbg_echo "Output format: $out_fmt" + + tmp_dir="$(mktemp -d /tmp/$(basename $0).XXXXXX)" + trap "{ rm -fr ${tmp_dir}; }" INT TERM EXIT + + if [ -z "${dapm_tree}" ] + then + dapm_tree="/sys/kernel/debug/asoc/${card_name}" + fi + if [ -n "${remote_target}" ]; then + dapm_tree="${tmp_dir}/dapm-tree" + grab_remote_files "${remote_target}" "${card_name}" "${tmp_dir}" "${dapm_tree}" + fi + # In all cases now ${dapm_tree} contains the DAPM state + + process_dapm_tree "${tmp_dir}" "${dapm_tree}" + cp "${tmp_dir}/main.dot" "${dot_file}" + + if [ "${out_file}" != "${dot_file}" ]; then + dot -T"${out_fmt}" "${dot_file}" -o "${out_file}" + fi + + echo "Generated file ${out_file}" +} + +main "${@}" diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index 0d0ca63de8..eaf091a3d3 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -128,7 +128,7 @@ static struct { #define CXL_TEST_EVENT_CNT_MAX 15 /* Set a number of events to return at a time for simulation. */ -#define CXL_TEST_EVENT_CNT 3 +#define CXL_TEST_EVENT_RET_MAX 4 struct mock_event_log { u16 clear_idx; @@ -223,6 +223,12 @@ static void mes_add_event(struct mock_event_store *mes, log->nr_events++; } +/* + * Vary the number of events returned to simulate events occuring while the + * logs are being read. 
+ */ +static int ret_limit = 0; + static int mock_get_event(struct device *dev, struct cxl_mbox_cmd *cmd) { struct cxl_get_event_payload *pl; @@ -234,14 +240,18 @@ static int mock_get_event(struct device *dev, struct cxl_mbox_cmd *cmd) if (cmd->size_in != sizeof(log_type)) return -EINVAL; - if (cmd->size_out < struct_size(pl, records, CXL_TEST_EVENT_CNT)) + ret_limit = (ret_limit + 1) % CXL_TEST_EVENT_RET_MAX; + if (!ret_limit) + ret_limit = 1; + + if (cmd->size_out < struct_size(pl, records, ret_limit)) return -EINVAL; log_type = *((u8 *)cmd->payload_in); if (log_type >= CXL_EVENT_TYPE_MAX) return -EINVAL; - memset(cmd->payload_out, 0, cmd->size_out); + memset(cmd->payload_out, 0, struct_size(pl, records, 0)); log = event_find_log(dev, log_type); if (!log || event_log_empty(log)) @@ -249,7 +259,7 @@ static int mock_get_event(struct device *dev, struct cxl_mbox_cmd *cmd) pl = cmd->payload_out; - for (i = 0; i < CXL_TEST_EVENT_CNT && !event_log_empty(log); i++) { + for (i = 0; i < ret_limit && !event_log_empty(log); i++) { memcpy(&pl->records[i], event_get_current(log), sizeof(pl->records[i])); pl->records[i].event.generic.hdr.handle = @@ -257,6 +267,7 @@ static int mock_get_event(struct device *dev, struct cxl_mbox_cmd *cmd) log->cur_idx++; } + cmd->size_out = struct_size(pl, records, i); pl->record_count = cpu_to_le16(i); if (!event_log_empty(log)) pl->flags |= CXL_GET_EVENT_FLAG_MORE_RECORDS; diff --git a/tools/testing/kunit/qemu_configs/riscv.py b/tools/testing/kunit/qemu_configs/riscv.py index 12a1d52597..c87758030f 100644 --- a/tools/testing/kunit/qemu_configs/riscv.py +++ b/tools/testing/kunit/qemu_configs/riscv.py @@ -13,7 +13,7 @@ if not os.path.isfile(OPENSBI_PATH): QEMU_ARCH = QemuArchParams(linux_arch='riscv', kconfig=''' -CONFIG_SOC_VIRT=y +CONFIG_ARCH_VIRT=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_OF_PLATFORM=y diff --git a/tools/testing/nvdimm/test/ndtest.c b/tools/testing/nvdimm/test/ndtest.c index b8419f4603..b438f3d053 
100644 --- a/tools/testing/nvdimm/test/ndtest.c +++ b/tools/testing/nvdimm/test/ndtest.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include "../watermark.h" #include "nfit_test.h" @@ -830,12 +832,11 @@ static int ndtest_bus_register(struct ndtest_priv *p) return 0; } -static int ndtest_remove(struct platform_device *pdev) +static void ndtest_remove(struct platform_device *pdev) { struct ndtest_priv *p = to_ndtest_priv(&pdev->dev); nvdimm_bus_unregister(p->bus); - return 0; } static int ndtest_probe(struct platform_device *pdev) @@ -882,7 +883,7 @@ static const struct platform_device_id ndtest_id[] = { static struct platform_driver ndtest_driver = { .probe = ndtest_probe, - .remove = ndtest_remove, + .remove_new = ndtest_remove, .driver = { .name = KBUILD_MODNAME, }, diff --git a/tools/testing/nvdimm/test/ndtest.h b/tools/testing/nvdimm/test/ndtest.h index 2c54c9cbb9..8f27ad6f73 100644 --- a/tools/testing/nvdimm/test/ndtest.h +++ b/tools/testing/nvdimm/test/ndtest.h @@ -5,37 +5,6 @@ #include #include -/* SCM device is unable to persist memory contents */ -#define PAPR_PMEM_UNARMED (1ULL << (63 - 0)) -/* SCM device failed to persist memory contents */ -#define PAPR_PMEM_SHUTDOWN_DIRTY (1ULL << (63 - 1)) -/* SCM device contents are not persisted from previous IPL */ -#define PAPR_PMEM_EMPTY (1ULL << (63 - 3)) -#define PAPR_PMEM_HEALTH_CRITICAL (1ULL << (63 - 4)) -/* SCM device will be garded off next IPL due to failure */ -#define PAPR_PMEM_HEALTH_FATAL (1ULL << (63 - 5)) -/* SCM contents cannot persist due to current platform health status */ -#define PAPR_PMEM_HEALTH_UNHEALTHY (1ULL << (63 - 6)) - -/* Bits status indicators for health bitmap indicating unarmed dimm */ -#define PAPR_PMEM_UNARMED_MASK (PAPR_PMEM_UNARMED | \ - PAPR_PMEM_HEALTH_UNHEALTHY) - -#define PAPR_PMEM_SAVE_FAILED (1ULL << (63 - 10)) - -/* Bits status indicators for health bitmap indicating unflushed dimm */ -#define PAPR_PMEM_BAD_SHUTDOWN_MASK (PAPR_PMEM_SHUTDOWN_DIRTY) - 
-/* Bits status indicators for health bitmap indicating unrestored dimm */ -#define PAPR_PMEM_BAD_RESTORE_MASK (PAPR_PMEM_EMPTY) - -/* Bit status indicators for smart event notification */ -#define PAPR_PMEM_SMART_EVENT_MASK (PAPR_PMEM_HEALTH_CRITICAL | \ - PAPR_PMEM_HEALTH_FATAL | \ - PAPR_PMEM_HEALTH_UNHEALTHY) - -#define PAPR_PMEM_SAVE_MASK (PAPR_PMEM_SAVE_FAILED) - struct ndtest_config; struct ndtest_priv { diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index e150483365..9039f3709a 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -17,8 +17,10 @@ TARGETS += devices TARGETS += dmabuf-heaps TARGETS += drivers/dma-buf TARGETS += drivers/s390x/uvdevice +TARGETS += drivers/net TARGETS += drivers/net/bonding TARGETS += drivers/net/team +TARGETS += drivers/net/virtio_net TARGETS += dt TARGETS += efivarfs TARGETS += exec @@ -63,7 +65,7 @@ TARGETS += net/hsr TARGETS += net/mptcp TARGETS += net/openvswitch TARGETS += net/tcp_ao -TARGETS += netfilter +TARGETS += net/netfilter TARGETS += nsfs TARGETS += perf_events TARGETS += pidfd @@ -116,6 +118,13 @@ TARGETS += zram TARGETS_HOTPLUG = cpu-hotplug TARGETS_HOTPLUG += memory-hotplug +# Networking tests want the net/lib target, include it automatically +ifneq ($(filter net drivers/net drivers/net/hw,$(TARGETS)),) +ifeq ($(filter net/lib,$(TARGETS)),) + INSTALL_DEP_TARGETS := net/lib +endif +endif + # User can optionally provide a TARGETS skiplist. By default we skip # BPF since it has cutting edge build time dependencies which require # more effort to install. 
@@ -245,7 +254,7 @@ ifdef INSTALL_PATH install -m 744 run_kselftest.sh $(INSTALL_PATH)/ rm -f $(TEST_LIST) @ret=1; \ - for TARGET in $(TARGETS); do \ + for TARGET in $(TARGETS) $(INSTALL_DEP_TARGETS); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install \ INSTALL_PATH=$(INSTALL_PATH)/$$TARGET \ diff --git a/tools/testing/selftests/alsa/conf.c b/tools/testing/selftests/alsa/conf.c index 89e3656a04..e2b3a5810f 100644 --- a/tools/testing/selftests/alsa/conf.c +++ b/tools/testing/selftests/alsa/conf.c @@ -105,7 +105,7 @@ static struct card_cfg_data *conf_data_by_card(int card, bool msg) return NULL; } -static int dump_config_tree(snd_config_t *top) +static void dump_config_tree(snd_config_t *top) { snd_output_t *out; int err; diff --git a/tools/testing/selftests/arm64/abi/tpidr2.c b/tools/testing/selftests/arm64/abi/tpidr2.c index 02ee3a91b7..285c47dd42 100644 --- a/tools/testing/selftests/arm64/abi/tpidr2.c +++ b/tools/testing/selftests/arm64/abi/tpidr2.c @@ -262,7 +262,7 @@ static int write_clone_read(void) int main(int argc, char **argv) { - int ret, i; + int ret; putstr("TAP version 13\n"); putstr("1.."); diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index f1aebabfb0..5025401323 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -17,7 +17,6 @@ test_dev_cgroup test_verifier_log feature test_sock -test_sock_addr urandom_read test_sockmap test_lirc_mode2_user diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64 index d8ade15e27..0445ac38bc 100644 --- a/tools/testing/selftests/bpf/DENYLIST.aarch64 +++ b/tools/testing/selftests/bpf/DENYLIST.aarch64 @@ -10,5 +10,3 @@ fill_link_info/kprobe_multi_link_info # bpf_program__attach_kprobe_mu fill_link_info/kretprobe_multi_link_info # bpf_program__attach_kprobe_multi_opts unexpected error: -95 fill_link_info/kprobe_multi_invalid_ubuff # 
bpf_program__attach_kprobe_multi_opts unexpected error: -95 missed/kprobe_recursion # missed_kprobe_recursion__attach unexpected error: -95 (errno 95) -verifier_arena # JIT does not support arena -arena_htab # JIT does not support arena diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index f4a2f66a68..c34adf39ee 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -6,3 +6,4 @@ stacktrace_build_id # compare_map_keys stackid_hmap vs. sta verifier_iterating_callbacks verifier_arena # JIT does not support arena arena_htab # JIT does not support arena +arena_atomics diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 3b9eb40d63..dd49c1d23a 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -53,6 +53,7 @@ progs/syscall.c-CFLAGS := -fno-strict-aliasing progs/test_pkt_md_access.c-CFLAGS := -fno-strict-aliasing progs/test_sk_lookup.c-CFLAGS := -fno-strict-aliasing progs/timer_crash.c-CFLAGS := -fno-strict-aliasing +progs/test_global_func9.c-CFLAGS := -fno-strict-aliasing ifneq ($(LLVM),) # Silence some warnings when compiled with clang @@ -81,11 +82,24 @@ TEST_INST_SUBDIRS += bpf_gcc # The following tests contain C code that, although technically legal, # triggers GCC warnings that cannot be disabled: declaration of # anonymous struct types in function parameter lists. 
-progs/btf_dump_test_case_bitfields.c-CFLAGS := -Wno-error -progs/btf_dump_test_case_namespacing.c-CFLAGS := -Wno-error -progs/btf_dump_test_case_packing.c-CFLAGS := -Wno-error -progs/btf_dump_test_case_padding.c-CFLAGS := -Wno-error -progs/btf_dump_test_case_syntax.c-CFLAGS := -Wno-error +progs/btf_dump_test_case_bitfields.c-bpf_gcc-CFLAGS := -Wno-error +progs/btf_dump_test_case_namespacing.c-bpf_gcc-CFLAGS := -Wno-error +progs/btf_dump_test_case_packing.c-bpf_gcc-CFLAGS := -Wno-error +progs/btf_dump_test_case_padding.c-bpf_gcc-CFLAGS := -Wno-error +progs/btf_dump_test_case_syntax.c-bpf_gcc-CFLAGS := -Wno-error + +# The following tests do type-punning, via the __imm_insn macro, from +# `struct bpf_insn' to long and then uses the value. This triggers an +# "is used uninitialized" warning in GCC due to strict-aliasing +# rules. +progs/verifier_ref_tracking.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +progs/verifier_unpriv.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +progs/verifier_cgroup_storage.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +progs/verifier_ld_ind.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +progs/verifier_map_ret_val.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +progs/verifier_spill_fill.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +progs/verifier_subprog_precision.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +progs/verifier_uninit.c-bpf_gcc-CFLAGS := -fno-strict-aliasing endif ifneq ($(CLANG_CPUV4),) @@ -102,8 +116,6 @@ TEST_PROGS := test_kmod.sh \ test_xdp_redirect_multi.sh \ test_xdp_meta.sh \ test_xdp_veth.sh \ - test_offload.py \ - test_sock_addr.sh \ test_tunnel.sh \ test_lwt_seg6local.sh \ test_lirc_mode2.sh \ @@ -128,7 +140,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \ test_xdp_vlan.sh test_bpftool.py # Compile but not part of 'make run_tests' -TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ +TEST_GEN_PROGS_EXTENDED = test_skb_cgroup_id_user \ flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ test_lirc_mode2_user xdping test_cpp 
runqslower bench bpf_testmod.ko \ xskxceiver xdp_redirect_multi xdp_synproxy veristat xdp_hw_metadata \ @@ -136,18 +148,7 @@ TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ TEST_GEN_FILES += liburandom_read.so urandom_read sign-file uprobe_multi -# Emit succinct information message describing current building step -# $1 - generic step name (e.g., CC, LINK, etc); -# $2 - optional "flavor" specifier; if provided, will be emitted as [flavor]; -# $3 - target (assumed to be file); only file name will be emitted; -# $4 - optional extra arg, emitted as-is, if provided. -ifeq ($(V),1) -Q = -msg = -else -Q = @ -msg = @printf ' %-8s%s %s%s\n' "$(1)" "$(if $(2), [$(2)])" "$(notdir $(3))" "$(if $(4), $(4))"; -MAKEFLAGS += --no-print-directory +ifneq ($(V),1) submake_extras := feature_display=0 endif @@ -274,7 +275,7 @@ $(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL) $(RUNQSLOWER_OUTPUT) $(Q)$(MAKE) $(submake_extras) -C $(TOOLSDIR)/bpf/runqslower \ OUTPUT=$(RUNQSLOWER_OUTPUT) VMLINUX_BTF=$(VMLINUX_BTF) \ BPFTOOL_OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \ - BPFOBJ_OUTPUT=$(BUILD_DIR)/libbpf \ + BPFOBJ_OUTPUT=$(BUILD_DIR)/libbpf/ \ BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR) \ EXTRA_CFLAGS='-g $(OPT_FLAGS) $(SAN_CFLAGS)' \ EXTRA_LDFLAGS='$(SAN_LDFLAGS)' && \ @@ -290,11 +291,11 @@ UNPRIV_HELPERS := $(OUTPUT)/unpriv_helpers.o TRACE_HELPERS := $(OUTPUT)/trace_helpers.o JSON_WRITER := $(OUTPUT)/json_writer.o CAP_HELPERS := $(OUTPUT)/cap_helpers.o +NETWORK_HELPERS := $(OUTPUT)/network_helpers.o $(OUTPUT)/test_dev_cgroup: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_skb_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sock: $(CGROUP_HELPERS) $(TESTING_HELPERS) -$(OUTPUT)/test_sock_addr: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sockmap: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_tcpnotify_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(TRACE_HELPERS) $(OUTPUT)/get_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) @@ -308,6 
+309,7 @@ $(OUTPUT)/flow_dissector_load: $(TESTING_HELPERS) $(OUTPUT)/test_maps: $(TESTING_HELPERS) $(OUTPUT)/test_verifier: $(TESTING_HELPERS) $(CAP_HELPERS) $(UNPRIV_HELPERS) $(OUTPUT)/xsk.o: $(BPFOBJ) +$(OUTPUT)/test_tcp_check_syncookie_user: $(NETWORK_HELPERS) BPFTOOL ?= $(DEFAULT_BPFTOOL) $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ @@ -442,7 +444,7 @@ endef # Build BPF object using GCC define GCC_BPF_BUILD_RULE $(call msg,GCC-BPF,$(TRUNNER_BINARY),$2) - $(Q)$(BPF_GCC) $3 -O2 -c $1 -o $2 + $(Q)$(BPF_GCC) $3 -DBPF_NO_PRESERVE_ACCESS_INDEX -Wno-attributes -O2 -c $1 -o $2 endef SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c @@ -455,7 +457,7 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ LSKELS := fentry_test.c fexit_test.c fexit_sleep.c atomics.c \ trace_printk.c trace_vprintk.c map_ptr_kern.c \ core_kern.c core_kern_overflow.c test_ringbuf.c \ - test_ringbuf_map_key.c + test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c # Generate both light skeleton and libbpf skeleton for these LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test.c \ @@ -481,7 +483,7 @@ LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(foreach skel,$(LINKED_SKELS),$($(ske # $eval()) and pass control to DEFINE_TEST_RUNNER_RULES. 
# Parameters: # $1 - test runner base binary name (e.g., test_progs) -# $2 - test runner extra "flavor" (e.g., no_alu32, cpuv4, gcc-bpf, etc) +# $2 - test runner extra "flavor" (e.g., no_alu32, cpuv4, bpf_gcc, etc) define DEFINE_TEST_RUNNER TRUNNER_OUTPUT := $(OUTPUT)$(if $2,/)$2 @@ -509,7 +511,7 @@ endef # Using TRUNNER_XXX variables, provided by callers of DEFINE_TEST_RUNNER and # set up by DEFINE_TEST_RUNNER itself, create test runner build rules with: # $1 - test runner base binary name (e.g., test_progs) -# $2 - test runner extra "flavor" (e.g., no_alu32, cpuv4, gcc-bpf, etc) +# $2 - test runner extra "flavor" (e.g., no_alu32, cpuv4, bpf_gcc, etc) define DEFINE_TEST_RUNNER_RULES ifeq ($($(TRUNNER_OUTPUT)-dir),) @@ -532,7 +534,8 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.bpf.o: \ | $(TRUNNER_OUTPUT) $$(BPFOBJ) $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ $(TRUNNER_BPF_CFLAGS) \ - $$($$<-CFLAGS)) + $$($$<-CFLAGS) \ + $$($$<-$2-CFLAGS)) $(TRUNNER_BPF_SKELS): %.skel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT) $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@) @@ -658,7 +661,7 @@ $(eval $(call DEFINE_TEST_RUNNER,test_progs,no_alu32)) # Define test_progs-cpuv4 test runner. ifneq ($(CLANG_CPUV4),) TRUNNER_BPF_BUILD_RULE := CLANG_CPUV4_BPF_BUILD_RULE -TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) +TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) -DENABLE_ATOMICS_TESTS $(eval $(call DEFINE_TEST_RUNNER,test_progs,cpuv4)) endif @@ -695,7 +698,7 @@ $(OUTPUT)/test_verifier: test_verifier.c verifier/tests.h $(BPFOBJ) | $(OUTPUT) # Include find_bit.c to compile xskxceiver. 
EXTRA_SRC := $(TOOLSDIR)/lib/find_bit.c -$(OUTPUT)/xskxceiver: $(EXTRA_SRC) xskxceiver.c xskxceiver.h $(OUTPUT)/xsk.o $(OUTPUT)/xsk_xdp_progs.skel.h $(BPFOBJ) | $(OUTPUT) +$(OUTPUT)/xskxceiver: $(EXTRA_SRC) xskxceiver.c xskxceiver.h $(OUTPUT)/network_helpers.o $(OUTPUT)/xsk.o $(OUTPUT)/xsk_xdp_progs.skel.h $(BPFOBJ) | $(OUTPUT) $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@ @@ -729,6 +732,7 @@ $(OUTPUT)/bench_local_storage_rcu_tasks_trace.o: $(OUTPUT)/local_storage_rcu_tas $(OUTPUT)/bench_local_storage_create.o: $(OUTPUT)/bench_local_storage_create.skel.h $(OUTPUT)/bench_bpf_hashmap_lookup.o: $(OUTPUT)/bpf_hashmap_lookup.skel.h $(OUTPUT)/bench_htab_mem.o: $(OUTPUT)/htab_mem_bench.skel.h +$(OUTPUT)/bench_bpf_crypto.o: $(OUTPUT)/crypto_bench.skel.h $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ) $(OUTPUT)/bench: LDLIBS += -lm $(OUTPUT)/bench: $(OUTPUT)/bench.o \ @@ -748,6 +752,7 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \ $(OUTPUT)/bench_bpf_hashmap_lookup.o \ $(OUTPUT)/bench_local_storage_create.o \ $(OUTPUT)/bench_htab_mem.o \ + $(OUTPUT)/bench_bpf_crypto.o \ # $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ @@ -759,7 +764,7 @@ $(OUTPUT)/veristat: $(OUTPUT)/veristat.o $(OUTPUT)/uprobe_multi: uprobe_multi.c $(call msg,BINARY,,$@) - $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $^ $(LDLIBS) -o $@ + $(Q)$(CC) $(CFLAGS) -O0 $(LDFLAGS) $^ $(LDLIBS) -o $@ EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index b2b4c391eb..627b74ae04 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -280,6 +280,8 @@ extern struct argp bench_strncmp_argp; extern struct argp bench_hashmap_lookup_argp; extern struct argp bench_local_storage_create_argp; extern struct argp bench_htab_mem_argp; +extern struct argp bench_trigger_batch_argp; 
+extern struct argp bench_crypto_argp; static const struct argp_child bench_parsers[] = { { &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 }, @@ -292,6 +294,8 @@ static const struct argp_child bench_parsers[] = { { &bench_hashmap_lookup_argp, 0, "Hashmap lookup benchmark", 0 }, { &bench_local_storage_create_argp, 0, "local-storage-create benchmark", 0 }, { &bench_htab_mem_argp, 0, "hash map memory benchmark", 0 }, + { &bench_trigger_batch_argp, 0, "BPF triggering benchmark", 0 }, + { &bench_crypto_argp, 0, "bpf crypto benchmark", 0 }, {}, }; @@ -491,24 +495,31 @@ extern const struct bench bench_rename_kretprobe; extern const struct bench bench_rename_rawtp; extern const struct bench bench_rename_fentry; extern const struct bench bench_rename_fexit; -extern const struct bench bench_trig_base; -extern const struct bench bench_trig_tp; -extern const struct bench bench_trig_rawtp; + +/* pure counting benchmarks to establish theoretical lmits */ +extern const struct bench bench_trig_usermode_count; +extern const struct bench bench_trig_syscall_count; +extern const struct bench bench_trig_kernel_count; + +/* batched, staying mostly in-kernel benchmarks */ extern const struct bench bench_trig_kprobe; extern const struct bench bench_trig_kretprobe; extern const struct bench bench_trig_kprobe_multi; extern const struct bench bench_trig_kretprobe_multi; extern const struct bench bench_trig_fentry; extern const struct bench bench_trig_fexit; -extern const struct bench bench_trig_fentry_sleep; extern const struct bench bench_trig_fmodret; -extern const struct bench bench_trig_uprobe_base; +extern const struct bench bench_trig_tp; +extern const struct bench bench_trig_rawtp; + +/* uprobe/uretprobe benchmarks */ extern const struct bench bench_trig_uprobe_nop; extern const struct bench bench_trig_uretprobe_nop; extern const struct bench bench_trig_uprobe_push; extern const struct bench bench_trig_uretprobe_push; extern const struct bench bench_trig_uprobe_ret; extern const 
struct bench bench_trig_uretprobe_ret; + extern const struct bench bench_rb_libbpf; extern const struct bench bench_rb_custom; extern const struct bench bench_pb_libbpf; @@ -529,6 +540,8 @@ extern const struct bench bench_local_storage_tasks_trace; extern const struct bench bench_bpf_hashmap_lookup; extern const struct bench bench_local_storage_create; extern const struct bench bench_htab_mem; +extern const struct bench bench_crypto_encrypt; +extern const struct bench bench_crypto_decrypt; static const struct bench *benchs[] = { &bench_count_global, @@ -539,24 +552,28 @@ static const struct bench *benchs[] = { &bench_rename_rawtp, &bench_rename_fentry, &bench_rename_fexit, - &bench_trig_base, - &bench_trig_tp, - &bench_trig_rawtp, + /* pure counting benchmarks for establishing theoretical limits */ + &bench_trig_usermode_count, + &bench_trig_kernel_count, + &bench_trig_syscall_count, + /* batched, staying mostly in-kernel triggers */ &bench_trig_kprobe, &bench_trig_kretprobe, &bench_trig_kprobe_multi, &bench_trig_kretprobe_multi, &bench_trig_fentry, &bench_trig_fexit, - &bench_trig_fentry_sleep, &bench_trig_fmodret, - &bench_trig_uprobe_base, + &bench_trig_tp, + &bench_trig_rawtp, + /* uprobes */ &bench_trig_uprobe_nop, &bench_trig_uretprobe_nop, &bench_trig_uprobe_push, &bench_trig_uretprobe_push, &bench_trig_uprobe_ret, &bench_trig_uretprobe_ret, + /* ringbuf/perfbuf benchmarks */ &bench_rb_libbpf, &bench_rb_custom, &bench_pb_libbpf, @@ -577,6 +594,8 @@ static const struct bench *benchs[] = { &bench_bpf_hashmap_lookup, &bench_local_storage_create, &bench_htab_mem, + &bench_crypto_encrypt, + &bench_crypto_decrypt, }; static void find_benchmark(void) diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_crypto.c b/tools/testing/selftests/bpf/benchs/bench_bpf_crypto.c new file mode 100644 index 0000000000..2845edaba8 --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_bpf_crypto.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 
2024 Meta Platforms, Inc. and affiliates. */ + +#include +#include "bench.h" +#include "crypto_bench.skel.h" + +#define MAX_CIPHER_LEN 32 +static char *input; +static struct crypto_ctx { + struct crypto_bench *skel; + int pfd; +} ctx; + +static struct crypto_args { + u32 crypto_len; + char *crypto_cipher; +} args = { + .crypto_len = 16, + .crypto_cipher = "ecb(aes)", +}; + +enum { + ARG_CRYPTO_LEN = 5000, + ARG_CRYPTO_CIPHER = 5001, +}; + +static const struct argp_option opts[] = { + { "crypto-len", ARG_CRYPTO_LEN, "CRYPTO_LEN", 0, + "Set the length of crypto buffer" }, + { "crypto-cipher", ARG_CRYPTO_CIPHER, "CRYPTO_CIPHER", 0, + "Set the cipher to use (default:ecb(aes))" }, + {}, +}; + +static error_t crypto_parse_arg(int key, char *arg, struct argp_state *state) +{ + switch (key) { + case ARG_CRYPTO_LEN: + args.crypto_len = strtoul(arg, NULL, 10); + if (!args.crypto_len || + args.crypto_len > sizeof(ctx.skel->bss->dst)) { + fprintf(stderr, "Invalid crypto buffer len (limit %zu)\n", + sizeof(ctx.skel->bss->dst)); + argp_usage(state); + } + break; + case ARG_CRYPTO_CIPHER: + args.crypto_cipher = strdup(arg); + if (!strlen(args.crypto_cipher) || + strlen(args.crypto_cipher) > MAX_CIPHER_LEN) { + fprintf(stderr, "Invalid crypto cipher len (limit %d)\n", + MAX_CIPHER_LEN); + argp_usage(state); + } + break; + default: + return ARGP_ERR_UNKNOWN; + } + + return 0; +} + +const struct argp bench_crypto_argp = { + .options = opts, + .parser = crypto_parse_arg, +}; + +static void crypto_validate(void) +{ + if (env.consumer_cnt != 0) { + fprintf(stderr, "bpf crypto benchmark doesn't support consumer!\n"); + exit(1); + } +} + +static void crypto_setup(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + + int err, pfd; + size_t i, sz; + + sz = args.crypto_len; + if (!sz || sz > sizeof(ctx.skel->bss->dst)) { + fprintf(stderr, "invalid encrypt buffer size (source %zu, target %zu)\n", + sz, sizeof(ctx.skel->bss->dst)); + exit(1); + } + + setup_libbpf(); + + ctx.skel = 
crypto_bench__open(); + if (!ctx.skel) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } + + snprintf(ctx.skel->bss->cipher, 128, "%s", args.crypto_cipher); + memcpy(ctx.skel->bss->key, "12345678testtest", 16); + ctx.skel->bss->key_len = 16; + ctx.skel->bss->authsize = 0; + + srandom(time(NULL)); + input = malloc(sz); + for (i = 0; i < sz - 1; i++) + input[i] = '1' + random() % 9; + input[sz - 1] = '\0'; + + ctx.skel->rodata->len = args.crypto_len; + + err = crypto_bench__load(ctx.skel); + if (err) { + fprintf(stderr, "failed to load skeleton\n"); + crypto_bench__destroy(ctx.skel); + exit(1); + } + + pfd = bpf_program__fd(ctx.skel->progs.crypto_setup); + if (pfd < 0) { + fprintf(stderr, "failed to get fd for setup prog\n"); + crypto_bench__destroy(ctx.skel); + exit(1); + } + + err = bpf_prog_test_run_opts(pfd, &opts); + if (err || ctx.skel->bss->status) { + fprintf(stderr, "failed to run setup prog: err %d, status %d\n", + err, ctx.skel->bss->status); + crypto_bench__destroy(ctx.skel); + exit(1); + } +} + +static void crypto_encrypt_setup(void) +{ + crypto_setup(); + ctx.pfd = bpf_program__fd(ctx.skel->progs.crypto_encrypt); +} + +static void crypto_decrypt_setup(void) +{ + crypto_setup(); + ctx.pfd = bpf_program__fd(ctx.skel->progs.crypto_decrypt); +} + +static void crypto_measure(struct bench_res *res) +{ + res->hits = atomic_swap(&ctx.skel->bss->hits, 0); +} + +static void *crypto_producer(void *unused) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts, + .repeat = 64, + .data_in = input, + .data_size_in = args.crypto_len, + ); + + while (true) + (void)bpf_prog_test_run_opts(ctx.pfd, &opts); + return NULL; +} + +const struct bench bench_crypto_encrypt = { + .name = "crypto-encrypt", + .argp = &bench_crypto_argp, + .validate = crypto_validate, + .setup = crypto_encrypt_setup, + .producer_thread = crypto_producer, + .measure = crypto_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench 
bench_crypto_decrypt = { + .name = "crypto-decrypt", + .argp = &bench_crypto_argp, + .validate = crypto_validate, + .setup = crypto_decrypt_setup, + .producer_thread = crypto_producer, + .measure = crypto_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; diff --git a/tools/testing/selftests/bpf/benchs/bench_local_storage_create.c b/tools/testing/selftests/bpf/benchs/bench_local_storage_create.c index b36de42ee4..e2ff8ea1cb 100644 --- a/tools/testing/selftests/bpf/benchs/bench_local_storage_create.c +++ b/tools/testing/selftests/bpf/benchs/bench_local_storage_create.c @@ -186,7 +186,7 @@ static void *task_producer(void *input) for (i = 0; i < batch_sz; i++) { if (!pthd_results[i]) - pthread_join(pthds[i], NULL);; + pthread_join(pthds[i], NULL); } } diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index ace0d1011a..4b05539f16 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -1,15 +1,95 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ +#define _GNU_SOURCE +#include +#include +#include #include "bench.h" #include "trigger_bench.skel.h" #include "trace_helpers.h" +#define MAX_TRIG_BATCH_ITERS 1000 + +static struct { + __u32 batch_iters; +} args = { + .batch_iters = 100, +}; + +enum { + ARG_TRIG_BATCH_ITERS = 7000, +}; + +static const struct argp_option opts[] = { + { "trig-batch-iters", ARG_TRIG_BATCH_ITERS, "BATCH_ITER_CNT", 0, + "Number of in-kernel iterations per one driver test run"}, + {}, +}; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + long ret; + + switch (key) { + case ARG_TRIG_BATCH_ITERS: + ret = strtol(arg, NULL, 10); + if (ret < 1 || ret > MAX_TRIG_BATCH_ITERS) { + fprintf(stderr, "invalid --trig-batch-iters value (should be between %d and %d)\n", + 1, MAX_TRIG_BATCH_ITERS); + argp_usage(state); + } + 
args.batch_iters = ret; + break; + default: + return ARGP_ERR_UNKNOWN; + } + + return 0; +} + +const struct argp bench_trigger_batch_argp = { + .options = opts, + .parser = parse_arg, +}; + +/* adjust slot shift in inc_hits() if changing */ +#define MAX_BUCKETS 256 + +#pragma GCC diagnostic ignored "-Wattributes" + /* BPF triggering benchmarks */ static struct trigger_ctx { struct trigger_bench *skel; + bool usermode_counters; + int driver_prog_fd; } ctx; -static struct counter base_hits; +static struct counter base_hits[MAX_BUCKETS]; + +static __always_inline void inc_counter(struct counter *counters) +{ + static __thread int tid = 0; + unsigned slot; + + if (unlikely(tid == 0)) + tid = syscall(SYS_gettid); + + /* multiplicative hashing, it's fast */ + slot = 2654435769U * tid; + slot >>= 24; + + atomic_inc(&base_hits[slot].value); /* use highest byte as an index */ +} + +static long sum_and_reset_counters(struct counter *counters) +{ + int i; + long sum = 0; + + for (i = 0; i < MAX_BUCKETS; i++) + sum += atomic_swap(&counters[i].value, 0); + return sum; +} static void trigger_validate(void) { @@ -19,41 +99,63 @@ static void trigger_validate(void) } } -static void *trigger_base_producer(void *input) +static void *trigger_producer(void *input) { - while (true) { - (void)syscall(__NR_getpgid); - atomic_inc(&base_hits.value); + if (ctx.usermode_counters) { + while (true) { + (void)syscall(__NR_getpgid); + inc_counter(base_hits); + } + } else { + while (true) + (void)syscall(__NR_getpgid); } return NULL; } -static void trigger_base_measure(struct bench_res *res) +static void *trigger_producer_batch(void *input) { - res->hits = atomic_swap(&base_hits.value, 0); -} + int fd = ctx.driver_prog_fd ?: bpf_program__fd(ctx.skel->progs.trigger_driver); -static void *trigger_producer(void *input) -{ while (true) - (void)syscall(__NR_getpgid); + bpf_prog_test_run_opts(fd, NULL); + return NULL; } static void trigger_measure(struct bench_res *res) { - res->hits = 
atomic_swap(&ctx.skel->bss->hits, 0); + if (ctx.usermode_counters) + res->hits = sum_and_reset_counters(base_hits); + else + res->hits = sum_and_reset_counters(ctx.skel->bss->hits); } static void setup_ctx(void) { setup_libbpf(); - ctx.skel = trigger_bench__open_and_load(); + ctx.skel = trigger_bench__open(); if (!ctx.skel) { fprintf(stderr, "failed to open skeleton\n"); exit(1); } + + /* default "driver" BPF program */ + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, true); + + ctx.skel->rodata->batch_iters = args.batch_iters; +} + +static void load_ctx(void) +{ + int err; + + err = trigger_bench__load(ctx.skel); + if (err) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } } static void attach_bpf(struct bpf_program *prog) @@ -67,64 +169,104 @@ static void attach_bpf(struct bpf_program *prog) } } -static void trigger_tp_setup(void) +static void trigger_syscall_count_setup(void) { - setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_tp); + ctx.usermode_counters = true; } -static void trigger_rawtp_setup(void) +/* Batched, staying mostly in-kernel triggering setups */ +static void trigger_kernel_count_setup(void) { setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_raw_tp); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false); + bpf_program__set_autoload(ctx.skel->progs.trigger_count, true); + load_ctx(); + /* override driver program */ + ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_count); } static void trigger_kprobe_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kprobe, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_kprobe); } static void trigger_kretprobe_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kretprobe, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_kretprobe); } static void trigger_kprobe_multi_setup(void) { setup_ctx(); + 
bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kprobe_multi, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_kprobe_multi); } static void trigger_kretprobe_multi_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kretprobe_multi, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_kretprobe_multi); } static void trigger_fentry_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_fentry, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_fentry); } static void trigger_fexit_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_fexit, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_fexit); } -static void trigger_fentry_sleep_setup(void) +static void trigger_fmodret_setup(void) { setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_fentry_sleep); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver_kfunc, true); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_fmodret, true); + load_ctx(); + /* override driver program */ + ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_driver_kfunc); + attach_bpf(ctx.skel->progs.bench_trigger_fmodret); } -static void trigger_fmodret_setup(void) +static void trigger_tp_setup(void) { setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_fmodret); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver_kfunc, true); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_tp, true); + load_ctx(); + /* override driver program */ + ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_driver_kfunc); + attach_bpf(ctx.skel->progs.bench_trigger_tp); +} + +static void trigger_rawtp_setup(void) +{ + setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false); + 
bpf_program__set_autoload(ctx.skel->progs.trigger_driver_kfunc, true); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_rawtp, true); + load_ctx(); + /* override driver program */ + ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_driver_kfunc); + attach_bpf(ctx.skel->progs.bench_trigger_rawtp); } /* make sure call is not inlined and not avoided by compiler, so __weak and @@ -137,7 +279,7 @@ static void trigger_fmodret_setup(void) * GCC doesn't generate stack setup preample for these functions due to them * having no input arguments and doing nothing in the body. */ -__weak void uprobe_target_nop(void) +__nocf_check __weak void uprobe_target_nop(void) { asm volatile ("nop"); } @@ -146,7 +288,7 @@ __weak void opaque_noop_func(void) { } -__weak int uprobe_target_push(void) +__nocf_check __weak int uprobe_target_push(void) { /* overhead of function call is negligible compared to uprobe * triggering, so this shouldn't affect benchmark results much @@ -155,16 +297,16 @@ __weak int uprobe_target_push(void) return 1; } -__weak void uprobe_target_ret(void) +__nocf_check __weak void uprobe_target_ret(void) { asm volatile (""); } -static void *uprobe_base_producer(void *input) +static void *uprobe_producer_count(void *input) { while (true) { uprobe_target_nop(); - atomic_inc(&base_hits.value); + inc_counter(base_hits); } return NULL; } @@ -194,15 +336,24 @@ static void usetup(bool use_retprobe, void *target_addr) { size_t uprobe_offset; struct bpf_link *link; + int err; setup_libbpf(); - ctx.skel = trigger_bench__open_and_load(); + ctx.skel = trigger_bench__open(); if (!ctx.skel) { fprintf(stderr, "failed to open skeleton\n"); exit(1); } + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_uprobe, true); + + err = trigger_bench__load(ctx.skel); + if (err) { + fprintf(stderr, "failed to load skeleton\n"); + exit(1); + } + uprobe_offset = get_uprobe_offset(target_addr); link = bpf_program__attach_uprobe(ctx.skel->progs.bench_trigger_uprobe, 
use_retprobe, @@ -216,204 +367,90 @@ static void usetup(bool use_retprobe, void *target_addr) ctx.skel->links.bench_trigger_uprobe = link; } -static void uprobe_setup_nop(void) +static void usermode_count_setup(void) +{ + ctx.usermode_counters = true; +} + +static void uprobe_nop_setup(void) { usetup(false, &uprobe_target_nop); } -static void uretprobe_setup_nop(void) +static void uretprobe_nop_setup(void) { usetup(true, &uprobe_target_nop); } -static void uprobe_setup_push(void) +static void uprobe_push_setup(void) { usetup(false, &uprobe_target_push); } -static void uretprobe_setup_push(void) +static void uretprobe_push_setup(void) { usetup(true, &uprobe_target_push); } -static void uprobe_setup_ret(void) +static void uprobe_ret_setup(void) { usetup(false, &uprobe_target_ret); } -static void uretprobe_setup_ret(void) +static void uretprobe_ret_setup(void) { usetup(true, &uprobe_target_ret); } -const struct bench bench_trig_base = { - .name = "trig-base", +const struct bench bench_trig_syscall_count = { + .name = "trig-syscall-count", .validate = trigger_validate, - .producer_thread = trigger_base_producer, - .measure = trigger_base_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_tp = { - .name = "trig-tp", - .validate = trigger_validate, - .setup = trigger_tp_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_rawtp = { - .name = "trig-rawtp", - .validate = trigger_validate, - .setup = trigger_rawtp_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_kprobe = { - .name = "trig-kprobe", - .validate = trigger_validate, - .setup = trigger_kprobe_setup, + .setup = 
trigger_syscall_count_setup, .producer_thread = trigger_producer, .measure = trigger_measure, .report_progress = hits_drops_report_progress, .report_final = hits_drops_report_final, }; -const struct bench bench_trig_kretprobe = { - .name = "trig-kretprobe", - .validate = trigger_validate, - .setup = trigger_kretprobe_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_kprobe_multi = { - .name = "trig-kprobe-multi", - .validate = trigger_validate, - .setup = trigger_kprobe_multi_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_kretprobe_multi = { - .name = "trig-kretprobe-multi", - .validate = trigger_validate, - .setup = trigger_kretprobe_multi_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_fentry = { - .name = "trig-fentry", - .validate = trigger_validate, - .setup = trigger_fentry_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_fexit = { - .name = "trig-fexit", - .validate = trigger_validate, - .setup = trigger_fexit_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_fentry_sleep = { - .name = "trig-fentry-sleep", - .validate = trigger_validate, - .setup = trigger_fentry_sleep_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, 
- .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_fmodret = { - .name = "trig-fmodret", - .validate = trigger_validate, - .setup = trigger_fmodret_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uprobe_base = { - .name = "trig-uprobe-base", - .setup = NULL, /* no uprobe/uretprobe is attached */ - .producer_thread = uprobe_base_producer, - .measure = trigger_base_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uprobe_nop = { - .name = "trig-uprobe-nop", - .setup = uprobe_setup_nop, - .producer_thread = uprobe_producer_nop, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uretprobe_nop = { - .name = "trig-uretprobe-nop", - .setup = uretprobe_setup_nop, - .producer_thread = uprobe_producer_nop, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uprobe_push = { - .name = "trig-uprobe-push", - .setup = uprobe_setup_push, - .producer_thread = uprobe_producer_push, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uretprobe_push = { - .name = "trig-uretprobe-push", - .setup = uretprobe_setup_push, - .producer_thread = uprobe_producer_push, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uprobe_ret = { - .name = "trig-uprobe-ret", - .setup = uprobe_setup_ret, - .producer_thread = uprobe_producer_ret, - .measure = trigger_measure, - .report_progress = 
hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uretprobe_ret = { - .name = "trig-uretprobe-ret", - .setup = uretprobe_setup_ret, - .producer_thread = uprobe_producer_ret, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; +/* batched (staying mostly in kernel) kprobe/fentry benchmarks */ +#define BENCH_TRIG_KERNEL(KIND, NAME) \ +const struct bench bench_trig_##KIND = { \ + .name = "trig-" NAME, \ + .setup = trigger_##KIND##_setup, \ + .producer_thread = trigger_producer_batch, \ + .measure = trigger_measure, \ + .report_progress = hits_drops_report_progress, \ + .report_final = hits_drops_report_final, \ + .argp = &bench_trigger_batch_argp, \ +} + +BENCH_TRIG_KERNEL(kernel_count, "kernel-count"); +BENCH_TRIG_KERNEL(kprobe, "kprobe"); +BENCH_TRIG_KERNEL(kretprobe, "kretprobe"); +BENCH_TRIG_KERNEL(kprobe_multi, "kprobe-multi"); +BENCH_TRIG_KERNEL(kretprobe_multi, "kretprobe-multi"); +BENCH_TRIG_KERNEL(fentry, "fentry"); +BENCH_TRIG_KERNEL(fexit, "fexit"); +BENCH_TRIG_KERNEL(fmodret, "fmodret"); +BENCH_TRIG_KERNEL(tp, "tp"); +BENCH_TRIG_KERNEL(rawtp, "rawtp"); + +/* uprobe benchmarks */ +#define BENCH_TRIG_USERMODE(KIND, PRODUCER, NAME) \ +const struct bench bench_trig_##KIND = { \ + .name = "trig-" NAME, \ + .validate = trigger_validate, \ + .setup = KIND##_setup, \ + .producer_thread = uprobe_producer_##PRODUCER, \ + .measure = trigger_measure, \ + .report_progress = hits_drops_report_progress, \ + .report_final = hits_drops_report_final, \ +} + +BENCH_TRIG_USERMODE(usermode_count, count, "usermode-count"); +BENCH_TRIG_USERMODE(uprobe_nop, nop, "uprobe-nop"); +BENCH_TRIG_USERMODE(uprobe_push, push, "uprobe-push"); +BENCH_TRIG_USERMODE(uprobe_ret, ret, "uprobe-ret"); +BENCH_TRIG_USERMODE(uretprobe_nop, nop, "uretprobe-nop"); +BENCH_TRIG_USERMODE(uretprobe_push, push, "uretprobe-push"); +BENCH_TRIG_USERMODE(uretprobe_ret, ret, 
"uretprobe-ret"); diff --git a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh index 78e83f2432..a690f5a68b 100755 --- a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh +++ b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh @@ -2,8 +2,22 @@ set -eufo pipefail -for i in base tp rawtp kprobe fentry fmodret -do - summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) - printf "%-10s: %s\n" $i "$summary" +def_tests=( \ + usermode-count kernel-count syscall-count \ + fentry fexit fmodret \ + rawtp tp \ + kprobe kprobe-multi \ + kretprobe kretprobe-multi \ +) + +tests=("$@") +if [ ${#tests[@]} -eq 0 ]; then + tests=("${def_tests[@]}") +fi + +p=${PROD_CNT:-1} + +for t in "${tests[@]}"; do + summary=$(sudo ./bench -w2 -d5 -a -p$p trig-$t | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) + printf "%-15s: %s\n" $t "$summary" done diff --git a/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh b/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh index 9bdcc74e03..af169f831f 100755 --- a/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh +++ b/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh @@ -2,7 +2,7 @@ set -eufo pipefail -for i in base {uprobe,uretprobe}-{nop,push,ret} +for i in usermode-count syscall-count {uprobe,uretprobe}-{nop,push,ret} do summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) printf "%-15s: %s\n" $i "$summary" diff --git a/tools/testing/selftests/bpf/bpf_arena_list.h b/tools/testing/selftests/bpf/bpf_arena_list.h index b99b9f408e..85dbc3ea4d 100644 --- a/tools/testing/selftests/bpf/bpf_arena_list.h +++ b/tools/testing/selftests/bpf/bpf_arena_list.h @@ -29,6 +29,7 @@ static inline void *bpf_iter_num_new(struct bpf_iter_num *it, int i, int j) { re static inline void bpf_iter_num_destroy(struct bpf_iter_num *it) {} static inline bool bpf_iter_num_next(struct bpf_iter_num *it) { return true; } 
#define cond_break ({}) +#define can_loop true #endif /* Safely walk link list elements. Deletion of elements is allowed. */ @@ -36,8 +37,7 @@ static inline bool bpf_iter_num_next(struct bpf_iter_num *it) { return true; } for (void * ___tmp = (pos = list_entry_safe((head)->first, \ typeof(*(pos)), member), \ (void *)0); \ - pos && ({ ___tmp = (void *)pos->member.next; 1; }); \ - cond_break, \ + pos && ({ ___tmp = (void *)pos->member.next; 1; }) && can_loop; \ pos = list_entry_safe((void __arena *)___tmp, typeof(*(pos)), member)) static inline void list_add_head(arena_list_node_t *n, arena_list_head_t *h) diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index a5b9df38c1..3d9e4b8c6b 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -326,9 +326,48 @@ l_true: \ }) #endif +/* + * Note that cond_break can only be portably used in the body of a breakable + * construct, whereas can_loop can be used anywhere. 
+ */ +#ifdef __BPF_FEATURE_MAY_GOTO +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define cond_break \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: break; \ + l_continue:; \ + }) +#else +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + #define cond_break \ ({ __label__ l_break, l_continue; \ - asm volatile goto("1:.byte 0xe5; \ + asm volatile goto("1:.byte 0xe5; \ .byte 0; \ .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ .short 0" \ @@ -337,6 +376,7 @@ l_true: \ l_break: break; \ l_continue:; \ }) +#endif #ifndef bpf_nop_mov #define bpf_nop_mov(var) \ @@ -386,6 +426,28 @@ l_true: \ , [as]"i"((dst_as << 16) | src_as)); #endif +void bpf_preempt_disable(void) __weak __ksym; +void bpf_preempt_enable(void) __weak __ksym; + +typedef struct { +} __bpf_preempt_t; + +static inline __bpf_preempt_t __bpf_preempt_constructor(void) +{ + __bpf_preempt_t ret = {}; + + bpf_preempt_disable(); + return ret; +} +static inline void __bpf_preempt_destructor(__bpf_preempt_t *t) +{ + bpf_preempt_enable(); +} +#define bpf_guard_preempt() \ + __bpf_preempt_t ___bpf_apply(preempt, __COUNTER__) \ + __attribute__((__unused__, __cleanup__(__bpf_preempt_destructor))) = \ + __bpf_preempt_constructor() + /* Description * Assert that a conditional expression is true. 
* Returns @@ -459,4 +521,11 @@ extern int bpf_iter_css_new(struct bpf_iter_css *it, extern struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it) __weak __ksym; extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym; +extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym; +extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym; +extern int bpf_wq_set_callback_impl(struct bpf_wq *wq, + int (callback_fn)(void *map, int *key, struct bpf_wq *wq), + unsigned int flags__k, void *aux__ign) __ksym; +#define bpf_wq_set_callback(timer, cb, flags) \ + bpf_wq_set_callback_impl(timer, cb, flags, NULL) #endif diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h index 14ebe7d9e1..3b6675ab40 100644 --- a/tools/testing/selftests/bpf/bpf_kfuncs.h +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h @@ -75,4 +75,7 @@ extern void bpf_key_put(struct bpf_key *key) __ksym; extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_ptr, struct bpf_dynptr *sig_ptr, struct bpf_key *trusted_keyring) __ksym; + +extern bool bpf_session_is_return(void) __ksym __weak; +extern __u64 *bpf_session_cookie(void) __ksym __weak; #endif diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h deleted file mode 100644 index 82a7c9de95..0000000000 --- a/tools/testing/selftests/bpf/bpf_tcp_helpers.h +++ /dev/null @@ -1,241 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __BPF_TCP_HELPERS_H -#define __BPF_TCP_HELPERS_H - -#include -#include -#include -#include -#include - -#define BPF_STRUCT_OPS(name, args...) 
\ -SEC("struct_ops/"#name) \ -BPF_PROG(name, args) - -#ifndef SOL_TCP -#define SOL_TCP 6 -#endif - -#ifndef TCP_CA_NAME_MAX -#define TCP_CA_NAME_MAX 16 -#endif - -#define tcp_jiffies32 ((__u32)bpf_jiffies64()) - -struct sock_common { - unsigned char skc_state; - __u16 skc_num; -} __attribute__((preserve_access_index)); - -enum sk_pacing { - SK_PACING_NONE = 0, - SK_PACING_NEEDED = 1, - SK_PACING_FQ = 2, -}; - -struct sock { - struct sock_common __sk_common; -#define sk_state __sk_common.skc_state - unsigned long sk_pacing_rate; - __u32 sk_pacing_status; /* see enum sk_pacing */ -} __attribute__((preserve_access_index)); - -struct inet_sock { - struct sock sk; -} __attribute__((preserve_access_index)); - -struct inet_connection_sock { - struct inet_sock icsk_inet; - __u8 icsk_ca_state:6, - icsk_ca_setsockopt:1, - icsk_ca_dst_locked:1; - struct { - __u8 pending; - } icsk_ack; - __u64 icsk_ca_priv[104 / sizeof(__u64)]; -} __attribute__((preserve_access_index)); - -struct request_sock { - struct sock_common __req_common; -} __attribute__((preserve_access_index)); - -struct tcp_sock { - struct inet_connection_sock inet_conn; - - __u32 rcv_nxt; - __u32 snd_nxt; - __u32 snd_una; - __u32 window_clamp; - __u8 ecn_flags; - __u32 delivered; - __u32 delivered_ce; - __u32 snd_cwnd; - __u32 snd_cwnd_cnt; - __u32 snd_cwnd_clamp; - __u32 snd_ssthresh; - __u8 syn_data:1, /* SYN includes data */ - syn_fastopen:1, /* SYN includes Fast Open option */ - syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ - syn_fastopen_ch:1, /* Active TFO re-enabling probe */ - syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ - save_syn:1, /* Save headers of SYN packet */ - is_cwnd_limited:1,/* forward progress limited by snd_cwnd? 
*/ - syn_smc:1; /* SYN includes SMC */ - __u32 max_packets_out; - __u32 lsndtime; - __u32 prior_cwnd; - __u64 tcp_mstamp; /* most recent packet received/sent */ - bool is_mptcp; -} __attribute__((preserve_access_index)); - -static __always_inline struct inet_connection_sock *inet_csk(const struct sock *sk) -{ - return (struct inet_connection_sock *)sk; -} - -static __always_inline void *inet_csk_ca(const struct sock *sk) -{ - return (void *)inet_csk(sk)->icsk_ca_priv; -} - -static __always_inline struct tcp_sock *tcp_sk(const struct sock *sk) -{ - return (struct tcp_sock *)sk; -} - -static __always_inline bool before(__u32 seq1, __u32 seq2) -{ - return (__s32)(seq1-seq2) < 0; -} -#define after(seq2, seq1) before(seq1, seq2) - -#define TCP_ECN_OK 1 -#define TCP_ECN_QUEUE_CWR 2 -#define TCP_ECN_DEMAND_CWR 4 -#define TCP_ECN_SEEN 8 - -enum inet_csk_ack_state_t { - ICSK_ACK_SCHED = 1, - ICSK_ACK_TIMER = 2, - ICSK_ACK_PUSHED = 4, - ICSK_ACK_PUSHED2 = 8, - ICSK_ACK_NOW = 16 /* Send the next ACK immediately (once) */ -}; - -enum tcp_ca_event { - CA_EVENT_TX_START = 0, - CA_EVENT_CWND_RESTART = 1, - CA_EVENT_COMPLETE_CWR = 2, - CA_EVENT_LOSS = 3, - CA_EVENT_ECN_NO_CE = 4, - CA_EVENT_ECN_IS_CE = 5, -}; - -struct ack_sample { - __u32 pkts_acked; - __s32 rtt_us; - __u32 in_flight; -} __attribute__((preserve_access_index)); - -struct rate_sample { - __u64 prior_mstamp; /* starting timestamp for interval */ - __u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ - __s32 delivered; /* number of packets delivered over interval */ - long interval_us; /* time for tp->delivered to incr "delivered" */ - __u32 snd_interval_us; /* snd interval for delivered packets */ - __u32 rcv_interval_us; /* rcv interval for delivered packets */ - long rtt_us; /* RTT of last (S)ACKed packet (or -1) */ - int losses; /* number of packets marked lost upon ACK */ - __u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */ - __u32 prior_in_flight; /* in flight before this ACK */ - bool 
is_app_limited; /* is sample from packet with bubble in pipe? */ - bool is_retrans; /* is sample from retransmission? */ - bool is_ack_delayed; /* is this (likely) a delayed ACK? */ -} __attribute__((preserve_access_index)); - -#define TCP_CA_NAME_MAX 16 -#define TCP_CONG_NEEDS_ECN 0x2 - -struct tcp_congestion_ops { - char name[TCP_CA_NAME_MAX]; - __u32 flags; - - /* initialize private data (optional) */ - void (*init)(struct sock *sk); - /* cleanup private data (optional) */ - void (*release)(struct sock *sk); - - /* return slow start threshold (required) */ - __u32 (*ssthresh)(struct sock *sk); - /* do new cwnd calculation (required) */ - void (*cong_avoid)(struct sock *sk, __u32 ack, __u32 acked); - /* call before changing ca_state (optional) */ - void (*set_state)(struct sock *sk, __u8 new_state); - /* call when cwnd event occurs (optional) */ - void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); - /* call when ack arrives (optional) */ - void (*in_ack_event)(struct sock *sk, __u32 flags); - /* new value of cwnd after loss (required) */ - __u32 (*undo_cwnd)(struct sock *sk); - /* hook for packet ack accounting (optional) */ - void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); - /* override sysctl_tcp_min_tso_segs */ - __u32 (*min_tso_segs)(struct sock *sk); - /* returns the multiplier used in tcp_sndbuf_expand (optional) */ - __u32 (*sndbuf_expand)(struct sock *sk); - /* call when packets are delivered to update cwnd and pacing rate, - * after all the ca_state processing. (optional) - */ - void (*cong_control)(struct sock *sk, const struct rate_sample *rs); - void *owner; -}; - -#define min(a, b) ((a) < (b) ? (a) : (b)) -#define max(a, b) ((a) > (b) ? (a) : (b)) -#define min_not_zero(x, y) ({ \ - typeof(x) __x = (x); \ - typeof(y) __y = (y); \ - __x == 0 ? __y : ((__y == 0) ? 
__x : min(__x, __y)); }) - -static __always_inline bool tcp_in_slow_start(const struct tcp_sock *tp) -{ - return tp->snd_cwnd < tp->snd_ssthresh; -} - -static __always_inline bool tcp_is_cwnd_limited(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - - /* If in slow start, ensure cwnd grows to twice what was ACKed. */ - if (tcp_in_slow_start(tp)) - return tp->snd_cwnd < 2 * tp->max_packets_out; - - return !!BPF_CORE_READ_BITFIELD(tp, is_cwnd_limited); -} - -static __always_inline bool tcp_cc_eq(const char *a, const char *b) -{ - int i; - - for (i = 0; i < TCP_CA_NAME_MAX; i++) { - if (a[i] != b[i]) - return false; - if (!a[i]) - break; - } - - return true; -} - -extern __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) __ksym; -extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym; - -struct mptcp_sock { - struct inet_connection_sock sk; - - __u32 token; - struct sock *first; - char ca_name[TCP_CA_NAME_MAX]; -} __attribute__((preserve_access_index)); - -#endif diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index edcd261065..2a18bd320e 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -10,18 +10,30 @@ #include #include #include +#include +#include +#include +#include +#include +#include +#include +#include #include "bpf_testmod.h" #include "bpf_testmod_kfunc.h" #define CREATE_TRACE_POINTS #include "bpf_testmod-events.h" +#define CONNECT_TIMEOUT_SEC 1 + typedef int (*func_proto_typedef)(long); typedef int (*func_proto_typedef_nested1)(func_proto_typedef); typedef int (*func_proto_typedef_nested2)(func_proto_typedef_nested1); DEFINE_PER_CPU(int, bpf_testmod_ksym_percpu) = 123; long bpf_testmod_test_struct_arg_result; +static DEFINE_MUTEX(sock_lock); +static struct socket *sock; struct bpf_testmod_struct_arg_1 { int a; @@ -497,6 +509,241 @@ __bpf_kfunc static u32 
bpf_kfunc_call_test_static_unused_arg(u32 arg, u32 unused return arg; } +__bpf_kfunc void bpf_kfunc_call_test_sleepable(void) +{ +} + +__bpf_kfunc int bpf_kfunc_init_sock(struct init_sock_args *args) +{ + int proto; + int err; + + mutex_lock(&sock_lock); + + if (sock) { + pr_err("%s called without releasing old sock", __func__); + err = -EPERM; + goto out; + } + + switch (args->af) { + case AF_INET: + case AF_INET6: + proto = args->type == SOCK_STREAM ? IPPROTO_TCP : IPPROTO_UDP; + break; + case AF_UNIX: + proto = PF_UNIX; + break; + default: + pr_err("invalid address family %d\n", args->af); + err = -EINVAL; + goto out; + } + + err = sock_create_kern(current->nsproxy->net_ns, args->af, args->type, + proto, &sock); + + if (!err) + /* Set timeout for call to kernel_connect() to prevent it from hanging, + * and consider the connection attempt failed if it returns + * -EINPROGRESS. + */ + sock->sk->sk_sndtimeo = CONNECT_TIMEOUT_SEC * HZ; +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc void bpf_kfunc_close_sock(void) +{ + mutex_lock(&sock_lock); + + if (sock) { + sock_release(sock); + sock = NULL; + } + + mutex_unlock(&sock_lock); +} + +__bpf_kfunc int bpf_kfunc_call_kernel_connect(struct addr_args *args) +{ + int err; + + if (args->addrlen > sizeof(args->addr)) + return -EINVAL; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_connect(sock, (struct sockaddr *)&args->addr, + args->addrlen, 0); +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_kernel_bind(struct addr_args *args) +{ + int err; + + if (args->addrlen > sizeof(args->addr)) + return -EINVAL; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_bind(sock, (struct sockaddr *)&args->addr, args->addrlen); +out: + mutex_unlock(&sock_lock); + + return err; 
+} + +__bpf_kfunc int bpf_kfunc_call_kernel_listen(void) +{ + int err; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_listen(sock, 128); +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_kernel_sendmsg(struct sendmsg_args *args) +{ + struct msghdr msg = { + .msg_name = &args->addr.addr, + .msg_namelen = args->addr.addrlen, + }; + struct kvec iov; + int err; + + if (args->addr.addrlen > sizeof(args->addr.addr) || + args->msglen > sizeof(args->msg)) + return -EINVAL; + + iov.iov_base = args->msg; + iov.iov_len = args->msglen; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_sendmsg(sock, &msg, &iov, 1, args->msglen); + args->addr.addrlen = msg.msg_namelen; +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_sock_sendmsg(struct sendmsg_args *args) +{ + struct msghdr msg = { + .msg_name = &args->addr.addr, + .msg_namelen = args->addr.addrlen, + }; + struct kvec iov; + int err; + + if (args->addr.addrlen > sizeof(args->addr.addr) || + args->msglen > sizeof(args->msg)) + return -EINVAL; + + iov.iov_base = args->msg; + iov.iov_len = args->msglen; + + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iov, 1, args->msglen); + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = sock_sendmsg(sock, &msg); + args->addr.addrlen = msg.msg_namelen; +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_kernel_getsockname(struct addr_args *args) +{ + int err; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_getsockname(sock, (struct sockaddr *)&args->addr); + if (err < 0) + goto out; + + 
args->addrlen = err; + err = 0; +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_kernel_getpeername(struct addr_args *args) +{ + int err; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_getpeername(sock, (struct sockaddr *)&args->addr); + if (err < 0) + goto out; + + args->addrlen = err; + err = 0; +out: + mutex_unlock(&sock_lock); + + return err; +} + BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc) BTF_ID_FLAGS(func, bpf_kfunc_call_test1) @@ -523,6 +770,16 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU) BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE) BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg) BTF_ID_FLAGS(func, bpf_kfunc_call_test_offset) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_sleepable, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_init_sock, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_close_sock, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_connect, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_bind, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_listen, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_sendmsg, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_sock_sendmsg, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getsockname, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getpeername, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) @@ -653,6 +910,8 @@ static int bpf_testmod_init(void) return ret; if (bpf_fentry_test1(0) < 0) return -EINVAL; + sock = NULL; + mutex_init(&sock_lock); return sysfs_create_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); } @@ -666,6 +925,7 @@ static void bpf_testmod_exit(void) while (refcount_read(&prog_test_struct.cnt) > 1) msleep(20); + bpf_kfunc_close_sock(); 
sysfs_remove_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); } diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h index 7c664dd610..b0d586a675 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h @@ -64,6 +64,22 @@ struct prog_test_fail3 { char arr2[]; }; +struct init_sock_args { + int af; + int type; +}; + +struct addr_args { + char addr[sizeof(struct __kernel_sockaddr_storage)]; + int addrlen; +}; + +struct sendmsg_args { + struct addr_args addr; + char msg[10]; + int msglen; +}; + struct prog_test_ref_kfunc * bpf_kfunc_call_test_acquire(unsigned long *scalar_ptr) __ksym; void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) __ksym; @@ -96,6 +112,7 @@ void bpf_kfunc_call_test_pass2(struct prog_test_pass2 *p) __ksym; void bpf_kfunc_call_test_mem_len_fail2(__u64 *mem, int len) __ksym; void bpf_kfunc_call_test_destructive(void) __ksym; +void bpf_kfunc_call_test_sleepable(void) __ksym; void bpf_kfunc_call_test_offset(struct prog_test_ref_kfunc *p); struct prog_test_member *bpf_kfunc_call_memb_acquire(void); @@ -106,4 +123,15 @@ void bpf_kfunc_call_test_fail3(struct prog_test_fail3 *p); void bpf_kfunc_call_test_mem_len_fail1(void *mem, int len); void bpf_kfunc_common_test(void) __ksym; + +int bpf_kfunc_init_sock(struct init_sock_args *args) __ksym; +void bpf_kfunc_close_sock(void) __ksym; +int bpf_kfunc_call_kernel_connect(struct addr_args *args) __ksym; +int bpf_kfunc_call_kernel_bind(struct addr_args *args) __ksym; +int bpf_kfunc_call_kernel_listen(void) __ksym; +int bpf_kfunc_call_kernel_sendmsg(struct sendmsg_args *args) __ksym; +int bpf_kfunc_call_sock_sendmsg(struct sendmsg_args *args) __ksym; +int bpf_kfunc_call_kernel_getsockname(struct addr_args *args) __ksym; +int bpf_kfunc_call_kernel_getpeername(struct addr_args *args) __ksym; + #endif /* _BPF_TESTMOD_KFUNC_H */ diff --git 
a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c index e812876d79..23bb9a9e6a 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.c +++ b/tools/testing/selftests/bpf/cgroup_helpers.c @@ -429,7 +429,7 @@ int create_and_get_cgroup(const char *relative_path) * which is an invalid cgroup id. * If there is a failure, it prints the error to stderr. */ -unsigned long long get_cgroup_id_from_path(const char *cgroup_workdir) +static unsigned long long get_cgroup_id_from_path(const char *cgroup_workdir) { int dirfd, err, flags, mount_id, fhsize; union { diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index dec9fd7ebb..98b6b6a886 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -13,7 +13,12 @@ CONFIG_BPF_SYSCALL=y CONFIG_CGROUP_BPF=y CONFIG_CRYPTO_HMAC=y CONFIG_CRYPTO_SHA256=y +CONFIG_CRYPTO_USER_API=y CONFIG_CRYPTO_USER_API_HASH=y +CONFIG_CRYPTO_USER_API_SKCIPHER=y +CONFIG_CRYPTO_SKCIPHER=y +CONFIG_CRYPTO_ECB=y +CONFIG_CRYPTO_AES=y CONFIG_DEBUG_INFO=y CONFIG_DEBUG_INFO_BTF=y CONFIG_DEBUG_INFO_DWARF4=y @@ -91,3 +96,5 @@ CONFIG_VSOCKETS=y CONFIG_VXLAN=y CONFIG_XDP_SOCKETS=y CONFIG_XFRM_INTERFACE=y +CONFIG_TCP_CONG_DCTCP=y +CONFIG_TCP_CONG_BBR=y diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index be96bf0223..35250e6cde 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -52,6 +52,8 @@ struct ipv6_packet pkt_v6 = { .tcp.doff = 5, }; +static const struct network_helper_opts default_opts; + int settimeo(int fd, int timeout_ms) { struct timeval timeout = { .tv_sec = 3 }; @@ -78,24 +80,22 @@ int settimeo(int fd, int timeout_ms) #define save_errno_close(fd) ({ int __save = errno; close(fd); errno = __save; }) -static int __start_server(int type, int protocol, const struct sockaddr *addr, - socklen_t addrlen, int timeout_ms, bool 
reuseport) +static int __start_server(int type, const struct sockaddr *addr, socklen_t addrlen, + const struct network_helper_opts *opts) { - int on = 1; int fd; - fd = socket(addr->sa_family, type, protocol); + fd = socket(addr->sa_family, type, opts->proto); if (fd < 0) { log_err("Failed to create server socket"); return -1; } - if (settimeo(fd, timeout_ms)) + if (settimeo(fd, opts->timeout_ms)) goto error_close; - if (reuseport && - setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on))) { - log_err("Failed to set SO_REUSEPORT"); + if (opts->post_socket_cb && opts->post_socket_cb(fd, NULL)) { + log_err("Failed to call post_socket_cb"); goto error_close; } @@ -118,35 +118,35 @@ error_close: return -1; } -static int start_server_proto(int family, int type, int protocol, - const char *addr_str, __u16 port, int timeout_ms) +int start_server(int family, int type, const char *addr_str, __u16 port, + int timeout_ms) { + struct network_helper_opts opts = { + .timeout_ms = timeout_ms, + }; struct sockaddr_storage addr; socklen_t addrlen; if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) return -1; - return __start_server(type, protocol, (struct sockaddr *)&addr, - addrlen, timeout_ms, false); + return __start_server(type, (struct sockaddr *)&addr, addrlen, &opts); } -int start_server(int family, int type, const char *addr_str, __u16 port, - int timeout_ms) +static int reuseport_cb(int fd, const struct post_socket_opts *opts) { - return start_server_proto(family, type, 0, addr_str, port, timeout_ms); -} + int on = 1; -int start_mptcp_server(int family, const char *addr_str, __u16 port, - int timeout_ms) -{ - return start_server_proto(family, SOCK_STREAM, IPPROTO_MPTCP, addr_str, - port, timeout_ms); + return setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on)); } int *start_reuseport_server(int family, int type, const char *addr_str, __u16 port, int timeout_ms, unsigned int nr_listens) { + struct network_helper_opts opts = { + .timeout_ms = timeout_ms, + 
.post_socket_cb = reuseport_cb, + }; struct sockaddr_storage addr; unsigned int nr_fds = 0; socklen_t addrlen; @@ -162,8 +162,7 @@ int *start_reuseport_server(int family, int type, const char *addr_str, if (!fds) return NULL; - fds[0] = __start_server(type, 0, (struct sockaddr *)&addr, addrlen, - timeout_ms, true); + fds[0] = __start_server(type, (struct sockaddr *)&addr, addrlen, &opts); if (fds[0] == -1) goto close_fds; nr_fds = 1; @@ -172,8 +171,7 @@ int *start_reuseport_server(int family, int type, const char *addr_str, goto close_fds; for (; nr_fds < nr_listens; nr_fds++) { - fds[nr_fds] = __start_server(type, 0, (struct sockaddr *)&addr, - addrlen, timeout_ms, true); + fds[nr_fds] = __start_server(type, (struct sockaddr *)&addr, addrlen, &opts); if (fds[nr_fds] == -1) goto close_fds; } @@ -185,6 +183,15 @@ close_fds: return NULL; } +int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t len, + const struct network_helper_opts *opts) +{ + if (!opts) + opts = &default_opts; + + return __start_server(type, (struct sockaddr *)addr, len, opts); +} + void free_fds(int *fds, unsigned int nr_close_fds) { if (fds) { @@ -258,17 +265,24 @@ static int connect_fd_to_addr(int fd, return 0; } -int connect_to_addr(const struct sockaddr_storage *addr, socklen_t addrlen, int type) +int connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t addrlen, + const struct network_helper_opts *opts) { int fd; - fd = socket(addr->ss_family, type, 0); + if (!opts) + opts = &default_opts; + + fd = socket(addr->ss_family, type, opts->proto); if (fd < 0) { log_err("Failed to create client socket"); return -1; } - if (connect_fd_to_addr(fd, addr, addrlen, false)) + if (settimeo(fd, opts->timeout_ms)) + goto error_close; + + if (connect_fd_to_addr(fd, addr, addrlen, opts->must_fail)) goto error_close; return fd; @@ -278,8 +292,6 @@ error_close: return -1; } -static const struct network_helper_opts default_opts; - int connect_to_fd_opts(int server_fd, 
const struct network_helper_opts *opts) { struct sockaddr_storage addr; @@ -442,22 +454,30 @@ struct nstoken *open_netns(const char *name) struct nstoken *token; token = calloc(1, sizeof(struct nstoken)); - if (!ASSERT_OK_PTR(token, "malloc token")) + if (!token) { + log_err("Failed to malloc token"); return NULL; + } token->orig_netns_fd = open("/proc/self/ns/net", O_RDONLY); - if (!ASSERT_GE(token->orig_netns_fd, 0, "open /proc/self/ns/net")) + if (token->orig_netns_fd == -1) { + log_err("Failed to open(/proc/self/ns/net)"); goto fail; + } snprintf(nspath, sizeof(nspath), "%s/%s", "/var/run/netns", name); nsfd = open(nspath, O_RDONLY | O_CLOEXEC); - if (!ASSERT_GE(nsfd, 0, "open netns fd")) + if (nsfd == -1) { + log_err("Failed to open(%s)", nspath); goto fail; + } err = setns(nsfd, CLONE_NEWNET); close(nsfd); - if (!ASSERT_OK(err, "setns")) + if (err) { + log_err("Failed to setns(nsfd)"); goto fail; + } return token; fail: @@ -472,7 +492,8 @@ void close_netns(struct nstoken *token) if (!token) return; - ASSERT_OK(setns(token->orig_netns_fd, CLONE_NEWNET), "setns"); + if (setns(token->orig_netns_fd, CLONE_NEWNET)) + log_err("Failed to setns(orig_netns_fd)"); close(token->orig_netns_fd); free(token); } @@ -499,3 +520,153 @@ int get_socket_local_port(int sock_fd) return -1; } + +int get_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param) +{ + struct ifreq ifr = {0}; + int sockfd, err; + + sockfd = socket(AF_INET, SOCK_DGRAM, 0); + if (sockfd < 0) + return -errno; + + memcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name)); + + ring_param->cmd = ETHTOOL_GRINGPARAM; + ifr.ifr_data = (char *)ring_param; + + if (ioctl(sockfd, SIOCETHTOOL, &ifr) < 0) { + err = errno; + close(sockfd); + return -err; + } + + close(sockfd); + return 0; +} + +int set_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param) +{ + struct ifreq ifr = {0}; + int sockfd, err; + + sockfd = socket(AF_INET, SOCK_DGRAM, 0); + if (sockfd < 0) + return -errno; + + memcpy(ifr.ifr_name, 
ifname, sizeof(ifr.ifr_name)); + + ring_param->cmd = ETHTOOL_SRINGPARAM; + ifr.ifr_data = (char *)ring_param; + + if (ioctl(sockfd, SIOCETHTOOL, &ifr) < 0) { + err = errno; + close(sockfd); + return -err; + } + + close(sockfd); + return 0; +} + +struct send_recv_arg { + int fd; + uint32_t bytes; + int stop; +}; + +static void *send_recv_server(void *arg) +{ + struct send_recv_arg *a = (struct send_recv_arg *)arg; + ssize_t nr_sent = 0, bytes = 0; + char batch[1500]; + int err = 0, fd; + + fd = accept(a->fd, NULL, NULL); + while (fd == -1) { + if (errno == EINTR) + continue; + err = -errno; + goto done; + } + + if (settimeo(fd, 0)) { + err = -errno; + goto done; + } + + while (bytes < a->bytes && !READ_ONCE(a->stop)) { + nr_sent = send(fd, &batch, + MIN(a->bytes - bytes, sizeof(batch)), 0); + if (nr_sent == -1 && errno == EINTR) + continue; + if (nr_sent == -1) { + err = -errno; + break; + } + bytes += nr_sent; + } + + if (bytes != a->bytes) { + log_err("send %zd expected %u", bytes, a->bytes); + if (!err) + err = bytes > a->bytes ? 
-E2BIG : -EINTR; + } + +done: + if (fd >= 0) + close(fd); + if (err) { + WRITE_ONCE(a->stop, 1); + return ERR_PTR(err); + } + return NULL; +} + +int send_recv_data(int lfd, int fd, uint32_t total_bytes) +{ + ssize_t nr_recv = 0, bytes = 0; + struct send_recv_arg arg = { + .fd = lfd, + .bytes = total_bytes, + .stop = 0, + }; + pthread_t srv_thread; + void *thread_ret; + char batch[1500]; + int err = 0; + + err = pthread_create(&srv_thread, NULL, send_recv_server, (void *)&arg); + if (err) { + log_err("Failed to pthread_create"); + return err; + } + + /* recv total_bytes */ + while (bytes < total_bytes && !READ_ONCE(arg.stop)) { + nr_recv = recv(fd, &batch, + MIN(total_bytes - bytes, sizeof(batch)), 0); + if (nr_recv == -1 && errno == EINTR) + continue; + if (nr_recv == -1) { + err = -errno; + break; + } + bytes += nr_recv; + } + + if (bytes != total_bytes) { + log_err("recv %zd expected %u", bytes, total_bytes); + if (!err) + err = bytes > total_bytes ? -E2BIG : -EINTR; + } + + WRITE_ONCE(arg.stop, 1); + pthread_join(srv_thread, &thread_ret); + if (IS_ERR(thread_ret)) { + log_err("Failed in thread_ret %ld", PTR_ERR(thread_ret)); + err = err ? 
: PTR_ERR(thread_ret); + } + + return err; +} diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 94b9be24e3..883c7ea9d8 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -9,14 +9,20 @@ typedef __u16 __sum16; #include #include #include +#include +#include +#include #include #include +#include #define MAGIC_VAL 0x1234 #define NUM_ITER 100000 #define VIP_NUM 5 #define MAGIC_BYTES 123 +struct post_socket_opts {}; + struct network_helper_opts { const char *cc; int timeout_ms; @@ -24,6 +30,7 @@ struct network_helper_opts { bool noconnect; int type; int proto; + int (*post_socket_cb)(int fd, const struct post_socket_opts *opts); }; /* ipv4 test vector */ @@ -45,13 +52,14 @@ extern struct ipv6_packet pkt_v6; int settimeo(int fd, int timeout_ms); int start_server(int family, int type, const char *addr, __u16 port, int timeout_ms); -int start_mptcp_server(int family, const char *addr, __u16 port, - int timeout_ms); int *start_reuseport_server(int family, int type, const char *addr_str, __u16 port, int timeout_ms, unsigned int nr_listens); +int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t len, + const struct network_helper_opts *opts); void free_fds(int *fds, unsigned int nr_close_fds); -int connect_to_addr(const struct sockaddr_storage *addr, socklen_t len, int type); +int connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t len, + const struct network_helper_opts *opts); int connect_to_fd(int server_fd, int timeout_ms); int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts); int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms); @@ -61,6 +69,8 @@ int make_sockaddr(int family, const char *addr_str, __u16 port, struct sockaddr_storage *addr, socklen_t *len); char *ping_command(int family); int get_socket_local_port(int sock_fd); +int get_hw_ring_size(char *ifname, 
struct ethtool_ringparam *ring_param); +int set_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param); struct nstoken; /** @@ -71,6 +81,7 @@ struct nstoken; */ struct nstoken *open_netns(const char *name); void close_netns(struct nstoken *token); +int send_recv_data(int lfd, int fd, uint32_t total_bytes); static __u16 csum_fold(__u32 csum) { diff --git a/tools/testing/selftests/bpf/prog_tests/arena_atomics.c b/tools/testing/selftests/bpf/prog_tests/arena_atomics.c new file mode 100644 index 0000000000..0807a48a58 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/arena_atomics.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include +#include "arena_atomics.skel.h" + +static void test_add(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.add); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->add64_value, 3, "add64_value"); + ASSERT_EQ(skel->arena->add64_result, 1, "add64_result"); + + ASSERT_EQ(skel->arena->add32_value, 3, "add32_value"); + ASSERT_EQ(skel->arena->add32_result, 1, "add32_result"); + + ASSERT_EQ(skel->arena->add_stack_value_copy, 3, "add_stack_value"); + ASSERT_EQ(skel->arena->add_stack_result, 1, "add_stack_result"); + + ASSERT_EQ(skel->arena->add_noreturn_value, 3, "add_noreturn_value"); +} + +static void test_sub(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.sub); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + 
ASSERT_EQ(skel->arena->sub64_value, -1, "sub64_value"); + ASSERT_EQ(skel->arena->sub64_result, 1, "sub64_result"); + + ASSERT_EQ(skel->arena->sub32_value, -1, "sub32_value"); + ASSERT_EQ(skel->arena->sub32_result, 1, "sub32_result"); + + ASSERT_EQ(skel->arena->sub_stack_value_copy, -1, "sub_stack_value"); + ASSERT_EQ(skel->arena->sub_stack_result, 1, "sub_stack_result"); + + ASSERT_EQ(skel->arena->sub_noreturn_value, -1, "sub_noreturn_value"); +} + +static void test_and(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.and); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->and64_value, 0x010ull << 32, "and64_value"); + ASSERT_EQ(skel->arena->and32_value, 0x010, "and32_value"); +} + +static void test_or(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.or); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->or64_value, 0x111ull << 32, "or64_value"); + ASSERT_EQ(skel->arena->or32_value, 0x111, "or32_value"); +} + +static void test_xor(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.xor); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->xor64_value, 0x101ull << 32, "xor64_value"); + ASSERT_EQ(skel->arena->xor32_value, 0x101, 
"xor32_value"); +} + +static void test_cmpxchg(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.cmpxchg); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->cmpxchg64_value, 2, "cmpxchg64_value"); + ASSERT_EQ(skel->arena->cmpxchg64_result_fail, 1, "cmpxchg_result_fail"); + ASSERT_EQ(skel->arena->cmpxchg64_result_succeed, 1, "cmpxchg_result_succeed"); + + ASSERT_EQ(skel->arena->cmpxchg32_value, 2, "lcmpxchg32_value"); + ASSERT_EQ(skel->arena->cmpxchg32_result_fail, 1, "cmpxchg_result_fail"); + ASSERT_EQ(skel->arena->cmpxchg32_result_succeed, 1, "cmpxchg_result_succeed"); +} + +static void test_xchg(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.xchg); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->xchg64_value, 2, "xchg64_value"); + ASSERT_EQ(skel->arena->xchg64_result, 1, "xchg64_result"); + + ASSERT_EQ(skel->arena->xchg32_value, 2, "xchg32_value"); + ASSERT_EQ(skel->arena->xchg32_result, 1, "xchg32_result"); +} + +void test_arena_atomics(void) +{ + struct arena_atomics *skel; + int err; + + skel = arena_atomics__open(); + if (!ASSERT_OK_PTR(skel, "arena atomics skeleton open")) + return; + + if (skel->data->skip_tests) { + printf("%s:SKIP:no ENABLE_ATOMICS_TESTS or no addr_space_cast support in clang", + __func__); + test__skip(); + goto cleanup; + } + err = arena_atomics__load(skel); + if (!ASSERT_OK(err, "arena atomics skeleton load")) + return; + skel->bss->pid = getpid(); + + if 
(test__start_subtest("add")) + test_add(skel); + if (test__start_subtest("sub")) + test_sub(skel); + if (test__start_subtest("and")) + test_and(skel); + if (test__start_subtest("or")) + test_or(skel); + if (test__start_subtest("xor")) + test_xor(skel); + if (test__start_subtest("cmpxchg")) + test_cmpxchg(skel); + if (test__start_subtest("xchg")) + test_xchg(skel); + +cleanup: + arena_atomics__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c index 1454cebc26..4407ea428e 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c @@ -573,6 +573,115 @@ cleanup: close(lsm_fd); } +static void tp_btf_subtest(struct test_bpf_cookie *skel) +{ + __u64 cookie; + int prog_fd, link_fd = -1; + struct bpf_link *link = NULL; + LIBBPF_OPTS(bpf_link_create_opts, link_opts); + LIBBPF_OPTS(bpf_raw_tp_opts, raw_tp_opts); + LIBBPF_OPTS(bpf_trace_opts, trace_opts); + + /* There are three different ways to attach tp_btf (BTF-aware raw + * tracepoint) programs. Let's test all of them. 
+ */ + prog_fd = bpf_program__fd(skel->progs.handle_tp_btf); + + /* low-level BPF_RAW_TRACEPOINT_OPEN command wrapper */ + skel->bss->tp_btf_res = 0; + + raw_tp_opts.cookie = cookie = 0x11000000000000L; + link_fd = bpf_raw_tracepoint_open_opts(prog_fd, &raw_tp_opts); + if (!ASSERT_GE(link_fd, 0, "bpf_raw_tracepoint_open_opts")) + goto cleanup; + + usleep(1); /* trigger */ + close(link_fd); /* detach */ + link_fd = -1; + + ASSERT_EQ(skel->bss->tp_btf_res, cookie, "raw_tp_open_res"); + + /* low-level generic bpf_link_create() API */ + skel->bss->tp_btf_res = 0; + + link_opts.tracing.cookie = cookie = 0x22000000000000L; + link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_RAW_TP, &link_opts); + if (!ASSERT_GE(link_fd, 0, "bpf_link_create")) + goto cleanup; + + usleep(1); /* trigger */ + close(link_fd); /* detach */ + link_fd = -1; + + ASSERT_EQ(skel->bss->tp_btf_res, cookie, "link_create_res"); + + /* high-level bpf_link-based bpf_program__attach_trace_opts() API */ + skel->bss->tp_btf_res = 0; + + trace_opts.cookie = cookie = 0x33000000000000L; + link = bpf_program__attach_trace_opts(skel->progs.handle_tp_btf, &trace_opts); + if (!ASSERT_OK_PTR(link, "attach_trace_opts")) + goto cleanup; + + usleep(1); /* trigger */ + bpf_link__destroy(link); /* detach */ + link = NULL; + + ASSERT_EQ(skel->bss->tp_btf_res, cookie, "attach_trace_opts_res"); + +cleanup: + if (link_fd >= 0) + close(link_fd); + bpf_link__destroy(link); +} + +static void raw_tp_subtest(struct test_bpf_cookie *skel) +{ + __u64 cookie; + int prog_fd, link_fd = -1; + struct bpf_link *link = NULL; + LIBBPF_OPTS(bpf_raw_tp_opts, raw_tp_opts); + LIBBPF_OPTS(bpf_raw_tracepoint_opts, opts); + + /* There are two different ways to attach raw_tp programs */ + prog_fd = bpf_program__fd(skel->progs.handle_raw_tp); + + /* low-level BPF_RAW_TRACEPOINT_OPEN command wrapper */ + skel->bss->raw_tp_res = 0; + + raw_tp_opts.tp_name = "sys_enter"; + raw_tp_opts.cookie = cookie = 0x55000000000000L; + link_fd = 
bpf_raw_tracepoint_open_opts(prog_fd, &raw_tp_opts); + if (!ASSERT_GE(link_fd, 0, "bpf_raw_tracepoint_open_opts")) + goto cleanup; + + usleep(1); /* trigger */ + close(link_fd); /* detach */ + link_fd = -1; + + ASSERT_EQ(skel->bss->raw_tp_res, cookie, "raw_tp_open_res"); + + /* high-level bpf_link-based bpf_program__attach_raw_tracepoint_opts() API */ + skel->bss->raw_tp_res = 0; + + opts.cookie = cookie = 0x66000000000000L; + link = bpf_program__attach_raw_tracepoint_opts(skel->progs.handle_raw_tp, + "sys_enter", &opts); + if (!ASSERT_OK_PTR(link, "attach_raw_tp_opts")) + goto cleanup; + + usleep(1); /* trigger */ + bpf_link__destroy(link); /* detach */ + link = NULL; + + ASSERT_EQ(skel->bss->raw_tp_res, cookie, "attach_raw_tp_opts_res"); + +cleanup: + if (link_fd >= 0) + close(link_fd); + bpf_link__destroy(link); +} + void test_bpf_cookie(void) { struct test_bpf_cookie *skel; @@ -601,6 +710,9 @@ void test_bpf_cookie(void) tracing_subtest(skel); if (test__start_subtest("lsm")) lsm_subtest(skel); - + if (test__start_subtest("tp_btf")) + tp_btf_subtest(skel); + if (test__start_subtest("raw_tp")) + raw_tp_subtest(skel); test_bpf_cookie__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index a88e6e07e4..3f0daf6607 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -13,6 +13,8 @@ #include "tcp_ca_write_sk_pacing.skel.h" #include "tcp_ca_incompl_cong_ops.skel.h" #include "tcp_ca_unsupp_cong_op.skel.h" +#include "tcp_ca_kfunc.skel.h" +#include "bpf_cc_cubic.skel.h" #ifndef ENOTSUPP #define ENOTSUPP 524 @@ -20,7 +22,6 @@ static const unsigned int total_bytes = 10 * 1024 * 1024; static int expected_stg = 0xeB9F; -static int stop; static int settcpca(int fd, const char *tcp_ca) { @@ -33,63 +34,12 @@ static int settcpca(int fd, const char *tcp_ca) return 0; } -static void *server(void *arg) -{ - int lfd = 
(int)(long)arg, err = 0, fd; - ssize_t nr_sent = 0, bytes = 0; - char batch[1500]; - - fd = accept(lfd, NULL, NULL); - while (fd == -1) { - if (errno == EINTR) - continue; - err = -errno; - goto done; - } - - if (settimeo(fd, 0)) { - err = -errno; - goto done; - } - - while (bytes < total_bytes && !READ_ONCE(stop)) { - nr_sent = send(fd, &batch, - MIN(total_bytes - bytes, sizeof(batch)), 0); - if (nr_sent == -1 && errno == EINTR) - continue; - if (nr_sent == -1) { - err = -errno; - break; - } - bytes += nr_sent; - } - - ASSERT_EQ(bytes, total_bytes, "send"); - -done: - if (fd >= 0) - close(fd); - if (err) { - WRITE_ONCE(stop, 1); - return ERR_PTR(err); - } - return NULL; -} - static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) { - struct sockaddr_in6 sa6 = {}; - ssize_t nr_recv = 0, bytes = 0; int lfd = -1, fd = -1; - pthread_t srv_thread; - socklen_t addrlen = sizeof(sa6); - void *thread_ret; - char batch[1500]; int err; - WRITE_ONCE(stop, 0); - - lfd = socket(AF_INET6, SOCK_STREAM, 0); + lfd = start_server(AF_INET6, SOCK_STREAM, NULL, 0, 0); if (!ASSERT_NEQ(lfd, -1, "socket")) return; @@ -99,23 +49,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) return; } - if (settcpca(lfd, tcp_ca) || settcpca(fd, tcp_ca) || - settimeo(lfd, 0) || settimeo(fd, 0)) - goto done; - - /* bind, listen and start server thread to accept */ - sa6.sin6_family = AF_INET6; - sa6.sin6_addr = in6addr_loopback; - err = bind(lfd, (struct sockaddr *)&sa6, addrlen); - if (!ASSERT_NEQ(err, -1, "bind")) - goto done; - - err = getsockname(lfd, (struct sockaddr *)&sa6, &addrlen); - if (!ASSERT_NEQ(err, -1, "getsockname")) - goto done; - - err = listen(lfd, 1); - if (!ASSERT_NEQ(err, -1, "listen")) + if (settcpca(lfd, tcp_ca) || settcpca(fd, tcp_ca)) goto done; if (sk_stg_map) { @@ -126,7 +60,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) } /* connect to server */ - err = connect(fd, (struct sockaddr *)&sa6, addrlen); 
+ err = connect_fd_to_fd(fd, lfd, 0); if (!ASSERT_NEQ(err, -1, "connect")) goto done; @@ -140,26 +74,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) goto done; } - err = pthread_create(&srv_thread, NULL, server, (void *)(long)lfd); - if (!ASSERT_OK(err, "pthread_create")) - goto done; - - /* recv total_bytes */ - while (bytes < total_bytes && !READ_ONCE(stop)) { - nr_recv = recv(fd, &batch, - MIN(total_bytes - bytes, sizeof(batch)), 0); - if (nr_recv == -1 && errno == EINTR) - continue; - if (nr_recv == -1) - break; - bytes += nr_recv; - } - - ASSERT_EQ(bytes, total_bytes, "recv"); - - WRITE_ONCE(stop, 1); - pthread_join(srv_thread, &thread_ret); - ASSERT_OK(IS_ERR(thread_ret), "thread_ret"); + ASSERT_OK(send_recv_data(lfd, fd, total_bytes), "send_recv_data"); done: close(lfd); @@ -315,7 +230,7 @@ static void test_rel_setsockopt(void) struct bpf_dctcp_release *rel_skel; libbpf_print_fn_t old_print_fn; - err_str = "unknown func bpf_setsockopt"; + err_str = "program of this type cannot use helper bpf_setsockopt"; found = false; old_print_fn = libbpf_set_print(libbpf_debug_print); @@ -392,7 +307,8 @@ static void test_update_ca(void) return; link = bpf_map__attach_struct_ops(skel->maps.ca_update_1); - ASSERT_OK_PTR(link, "attach_struct_ops"); + if (!ASSERT_OK_PTR(link, "attach_struct_ops")) + goto out; do_test("tcp_ca_update", NULL); saved_ca1_cnt = skel->bss->ca1_cnt; @@ -406,6 +322,7 @@ static void test_update_ca(void) ASSERT_GT(skel->bss->ca2_cnt, 0, "ca2_ca2_cnt"); bpf_link__destroy(link); +out: tcp_ca_update__destroy(skel); } @@ -421,7 +338,8 @@ static void test_update_wrong(void) return; link = bpf_map__attach_struct_ops(skel->maps.ca_update_1); - ASSERT_OK_PTR(link, "attach_struct_ops"); + if (!ASSERT_OK_PTR(link, "attach_struct_ops")) + goto out; do_test("tcp_ca_update", NULL); saved_ca1_cnt = skel->bss->ca1_cnt; @@ -434,6 +352,7 @@ static void test_update_wrong(void) ASSERT_GT(skel->bss->ca1_cnt, saved_ca1_cnt, "ca2_ca1_cnt"); 
bpf_link__destroy(link); +out: tcp_ca_update__destroy(skel); } @@ -448,7 +367,8 @@ static void test_mixed_links(void) return; link_nl = bpf_map__attach_struct_ops(skel->maps.ca_no_link); - ASSERT_OK_PTR(link_nl, "attach_struct_ops_nl"); + if (!ASSERT_OK_PTR(link_nl, "attach_struct_ops_nl")) + goto out; link = bpf_map__attach_struct_ops(skel->maps.ca_update_1); ASSERT_OK_PTR(link, "attach_struct_ops"); @@ -461,6 +381,7 @@ static void test_mixed_links(void) bpf_link__destroy(link); bpf_link__destroy(link_nl); +out: tcp_ca_update__destroy(skel); } @@ -503,7 +424,8 @@ static void test_link_replace(void) bpf_link__destroy(link); link = bpf_map__attach_struct_ops(skel->maps.ca_update_2); - ASSERT_OK_PTR(link, "attach_struct_ops_2nd"); + if (!ASSERT_OK_PTR(link, "attach_struct_ops_2nd")) + goto out; /* BPF_F_REPLACE with a wrong old map Fd. It should fail! * @@ -526,9 +448,40 @@ static void test_link_replace(void) bpf_link__destroy(link); +out: tcp_ca_update__destroy(skel); } +static void test_tcp_ca_kfunc(void) +{ + struct tcp_ca_kfunc *skel; + + skel = tcp_ca_kfunc__open_and_load(); + ASSERT_OK_PTR(skel, "tcp_ca_kfunc__open_and_load"); + tcp_ca_kfunc__destroy(skel); +} + +static void test_cc_cubic(void) +{ + struct bpf_cc_cubic *cc_cubic_skel; + struct bpf_link *link; + + cc_cubic_skel = bpf_cc_cubic__open_and_load(); + if (!ASSERT_OK_PTR(cc_cubic_skel, "bpf_cc_cubic__open_and_load")) + return; + + link = bpf_map__attach_struct_ops(cc_cubic_skel->maps.cc_cubic); + if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops")) { + bpf_cc_cubic__destroy(cc_cubic_skel); + return; + } + + do_test("bpf_cc_cubic", NULL); + + bpf_link__destroy(link); + bpf_cc_cubic__destroy(cc_cubic_skel); +} + void test_bpf_tcp_ca(void) { if (test__start_subtest("dctcp")) @@ -557,4 +510,8 @@ void test_bpf_tcp_ca(void) test_multi_links(); if (test__start_subtest("link_replace")) test_link_replace(); + if (test__start_subtest("tcp_ca_kfunc")) + test_tcp_ca_kfunc(); + if 
(test__start_subtest("cc_cubic")) + test_cc_cubic(); } diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index e9ea38aa82..09a8e6f9b3 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -653,7 +653,7 @@ static void test_btf_dump_struct_data(struct btf *btf, struct btf_dump *d, cmpstr = "(struct file_operations){\n" " .owner = (struct module *)0xffffffffffffffff,\n" -" .llseek = (loff_t (*)(struct file *, loff_t, int))0xffffffffffffffff,"; +" .fop_flags = (fop_flags_t)4294967295,"; ASSERT_STRNEQ(str, cmpstr, strlen(cmpstr), "file_operations"); } diff --git a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c index 2a55f717fc..34b59f6bac 100644 --- a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c @@ -10,6 +10,7 @@ #include #include +#include "network_helpers.h" #include "progs/test_cls_redirect.h" #include "test_cls_redirect.skel.h" @@ -35,39 +36,6 @@ struct tuple { struct addr_port dst; }; -static int start_server(const struct sockaddr *addr, socklen_t len, int type) -{ - int fd = socket(addr->sa_family, type, 0); - if (CHECK_FAIL(fd == -1)) - return -1; - if (CHECK_FAIL(bind(fd, addr, len) == -1)) - goto err; - if (type == SOCK_STREAM && CHECK_FAIL(listen(fd, 128) == -1)) - goto err; - - return fd; - -err: - close(fd); - return -1; -} - -static int connect_to_server(const struct sockaddr *addr, socklen_t len, - int type) -{ - int fd = socket(addr->sa_family, type, 0); - if (CHECK_FAIL(fd == -1)) - return -1; - if (CHECK_FAIL(connect(fd, addr, len))) - goto err; - - return fd; - -err: - close(fd); - return -1; -} - static bool fill_addr_port(const struct sockaddr *sa, struct addr_port *ap) { const struct sockaddr_in6 *in6; @@ -98,14 +66,14 @@ static bool set_up_conn(const struct sockaddr *addr, socklen_t len, 
int type, socklen_t slen = sizeof(ss); struct sockaddr *sa = (struct sockaddr *)&ss; - *server = start_server(addr, len, type); + *server = start_server_addr(type, (struct sockaddr_storage *)addr, len, NULL); if (*server < 0) return false; if (CHECK_FAIL(getsockname(*server, sa, &slen))) goto close_server; - *conn = connect_to_server(sa, slen, type); + *conn = connect_to_addr(type, (struct sockaddr_storage *)sa, slen, NULL); if (*conn < 0) goto close_server; diff --git a/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c b/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c new file mode 100644 index 0000000000..b1a3a49a82 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include +#include +#include + +#include "test_progs.h" +#include "network_helpers.h" +#include "crypto_sanity.skel.h" +#include "crypto_basic.skel.h" + +#define NS_TEST "crypto_sanity_ns" +#define IPV6_IFACE_ADDR "face::1" +static const unsigned char crypto_key[] = "testtest12345678"; +static const char plain_text[] = "stringtoencrypt0"; +static int opfd = -1, tfmfd = -1; +static const char algo[] = "ecb(aes)"; +static int init_afalg(void) +{ + struct sockaddr_alg sa = { + .salg_family = AF_ALG, + .salg_type = "skcipher", + .salg_name = "ecb(aes)" + }; + + tfmfd = socket(AF_ALG, SOCK_SEQPACKET, 0); + if (tfmfd == -1) + return errno; + if (bind(tfmfd, (struct sockaddr *)&sa, sizeof(sa)) == -1) + return errno; + if (setsockopt(tfmfd, SOL_ALG, ALG_SET_KEY, crypto_key, 16) == -1) + return errno; + opfd = accept(tfmfd, NULL, 0); + if (opfd == -1) + return errno; + return 0; +} + +static void deinit_afalg(void) +{ + if (tfmfd != -1) + close(tfmfd); + if (opfd != -1) + close(opfd); +} + +static void do_crypt_afalg(const void *src, void *dst, int size, bool encrypt) +{ + struct msghdr msg = {}; + struct cmsghdr *cmsg; + char 
cbuf[CMSG_SPACE(4)] = {0}; + struct iovec iov; + + msg.msg_control = cbuf; + msg.msg_controllen = sizeof(cbuf); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_ALG; + cmsg->cmsg_type = ALG_SET_OP; + cmsg->cmsg_len = CMSG_LEN(4); + *(__u32 *)CMSG_DATA(cmsg) = encrypt ? ALG_OP_ENCRYPT : ALG_OP_DECRYPT; + + iov.iov_base = (char *)src; + iov.iov_len = size; + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + sendmsg(opfd, &msg, 0); + read(opfd, dst, size); +} + +void test_crypto_basic(void) +{ + RUN_TESTS(crypto_basic); +} + +void test_crypto_sanity(void) +{ + LIBBPF_OPTS(bpf_tc_hook, qdisc_hook, .attach_point = BPF_TC_EGRESS); + LIBBPF_OPTS(bpf_tc_opts, tc_attach_enc); + LIBBPF_OPTS(bpf_tc_opts, tc_attach_dec); + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct nstoken *nstoken = NULL; + struct crypto_sanity *skel; + char afalg_plain[16] = {0}; + char afalg_dst[16] = {0}; + struct sockaddr_in6 addr; + int sockfd, err, pfd; + socklen_t addrlen; + u16 udp_test_port; + + skel = crypto_sanity__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel open")) + return; + + SYS(fail, "ip netns add %s", NS_TEST); + SYS(fail, "ip -net %s -6 addr add %s/128 dev lo nodad", NS_TEST, IPV6_IFACE_ADDR); + SYS(fail, "ip -net %s link set dev lo up", NS_TEST); + + nstoken = open_netns(NS_TEST); + if (!ASSERT_OK_PTR(nstoken, "open_netns")) + goto fail; + + err = init_afalg(); + if (!ASSERT_OK(err, "AF_ALG init fail")) + goto fail; + + qdisc_hook.ifindex = if_nametoindex("lo"); + if (!ASSERT_GT(qdisc_hook.ifindex, 0, "if_nametoindex lo")) + goto fail; + + skel->bss->key_len = 16; + skel->bss->authsize = 0; + udp_test_port = skel->data->udp_test_port; + memcpy(skel->bss->key, crypto_key, sizeof(crypto_key)); + snprintf(skel->bss->algo, 128, "%s", algo); + pfd = bpf_program__fd(skel->progs.skb_crypto_setup); + if (!ASSERT_GT(pfd, 0, "skb_crypto_setup fd")) + goto fail; + + err = bpf_prog_test_run_opts(pfd, &opts); + if (!ASSERT_OK(err, "skb_crypto_setup") || + !ASSERT_OK(opts.retval, 
"skb_crypto_setup retval")) + goto fail; + + if (!ASSERT_OK(skel->bss->status, "skb_crypto_setup status")) + goto fail; + + err = bpf_tc_hook_create(&qdisc_hook); + if (!ASSERT_OK(err, "create qdisc hook")) + goto fail; + + addrlen = sizeof(addr); + err = make_sockaddr(AF_INET6, IPV6_IFACE_ADDR, udp_test_port, + (void *)&addr, &addrlen); + if (!ASSERT_OK(err, "make_sockaddr")) + goto fail; + + tc_attach_enc.prog_fd = bpf_program__fd(skel->progs.encrypt_sanity); + err = bpf_tc_attach(&qdisc_hook, &tc_attach_enc); + if (!ASSERT_OK(err, "attach encrypt filter")) + goto fail; + + sockfd = socket(AF_INET6, SOCK_DGRAM, 0); + if (!ASSERT_NEQ(sockfd, -1, "encrypt socket")) + goto fail; + err = sendto(sockfd, plain_text, sizeof(plain_text), 0, (void *)&addr, addrlen); + close(sockfd); + if (!ASSERT_EQ(err, sizeof(plain_text), "encrypt send")) + goto fail; + + do_crypt_afalg(plain_text, afalg_dst, sizeof(afalg_dst), true); + + if (!ASSERT_OK(skel->bss->status, "encrypt status")) + goto fail; + if (!ASSERT_STRNEQ(skel->bss->dst, afalg_dst, sizeof(afalg_dst), "encrypt AF_ALG")) + goto fail; + + tc_attach_enc.flags = tc_attach_enc.prog_fd = tc_attach_enc.prog_id = 0; + err = bpf_tc_detach(&qdisc_hook, &tc_attach_enc); + if (!ASSERT_OK(err, "bpf_tc_detach encrypt")) + goto fail; + + tc_attach_dec.prog_fd = bpf_program__fd(skel->progs.decrypt_sanity); + err = bpf_tc_attach(&qdisc_hook, &tc_attach_dec); + if (!ASSERT_OK(err, "attach decrypt filter")) + goto fail; + + sockfd = socket(AF_INET6, SOCK_DGRAM, 0); + if (!ASSERT_NEQ(sockfd, -1, "decrypt socket")) + goto fail; + err = sendto(sockfd, afalg_dst, sizeof(afalg_dst), 0, (void *)&addr, addrlen); + close(sockfd); + if (!ASSERT_EQ(err, sizeof(afalg_dst), "decrypt send")) + goto fail; + + do_crypt_afalg(afalg_dst, afalg_plain, sizeof(afalg_plain), false); + + if (!ASSERT_OK(skel->bss->status, "decrypt status")) + goto fail; + if (!ASSERT_STRNEQ(skel->bss->dst, afalg_plain, sizeof(afalg_plain), "decrypt AF_ALG")) + goto fail; + + 
tc_attach_dec.flags = tc_attach_dec.prog_fd = tc_attach_dec.prog_id = 0; + err = bpf_tc_detach(&qdisc_hook, &tc_attach_dec); + ASSERT_OK(err, "bpf_tc_detach decrypt"); + +fail: + close_netns(nstoken); + deinit_afalg(); + SYS_NOFAIL("ip netns del " NS_TEST " &> /dev/null"); + crypto_sanity__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/empty_skb.c b/tools/testing/selftests/bpf/prog_tests/empty_skb.c index 261228eb68..438583e1f2 100644 --- a/tools/testing/selftests/bpf/prog_tests/empty_skb.c +++ b/tools/testing/selftests/bpf/prog_tests/empty_skb.c @@ -94,6 +94,8 @@ void test_empty_skb(void) SYS(out, "ip netns add empty_skb"); tok = open_netns("empty_skb"); + if (!ASSERT_OK_PTR(tok, "setns")) + goto out; SYS(out, "ip link add veth0 type veth peer veth1"); SYS(out, "ip link set dev veth0 up"); SYS(out, "ip link set dev veth1 up"); diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c b/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c index f949647dbb..552a0875ca 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c @@ -21,13 +21,13 @@ static int do_sleep(void *skel) } #define STACK_SIZE (1024 * 1024) -static char child_stack[STACK_SIZE]; void test_fexit_sleep(void) { struct fexit_sleep_lskel *fexit_skel = NULL; int wstatus, duration = 0; pid_t cpid; + char *child_stack = NULL; int err, fexit_cnt; fexit_skel = fexit_sleep_lskel__open_and_load(); @@ -38,6 +38,11 @@ void test_fexit_sleep(void) if (CHECK(err, "fexit_attach", "fexit attach failed: %d\n", err)) goto cleanup; + child_stack = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | + MAP_ANONYMOUS | MAP_STACK, -1, 0); + if (!ASSERT_NEQ(child_stack, MAP_FAILED, "mmap")) + goto cleanup; + cpid = clone(do_sleep, child_stack + STACK_SIZE, CLONE_FILES | SIGCHLD, fexit_skel); if (CHECK(cpid == -1, "clone", "%s\n", strerror(errno))) goto cleanup; @@ -78,5 +83,6 @@ void test_fexit_sleep(void) goto 
cleanup; cleanup: + munmap(child_stack, STACK_SIZE); fexit_sleep_lskel__destroy(fexit_skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/fib_lookup.c b/tools/testing/selftests/bpf/prog_tests/fib_lookup.c index 3379df2d4c..bd76589580 100644 --- a/tools/testing/selftests/bpf/prog_tests/fib_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/fib_lookup.c @@ -26,6 +26,17 @@ #define IPV6_TBID_ADDR "fd00::FFFF" #define IPV6_TBID_NET "fd00::" #define IPV6_TBID_DST "fd00::2" +#define MARK_NO_POLICY 33 +#define MARK 42 +#define MARK_TABLE "200" +#define IPV4_REMOTE_DST "1.2.3.4" +#define IPV4_LOCAL "10.4.0.3" +#define IPV4_GW1 "10.4.0.1" +#define IPV4_GW2 "10.4.0.2" +#define IPV6_REMOTE_DST "be:ef::b0:10" +#define IPV6_LOCAL "fd01::3" +#define IPV6_GW1 "fd01::1" +#define IPV6_GW2 "fd01::2" #define DMAC "11:11:11:11:11:11" #define DMAC_INIT { 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, } #define DMAC2 "01:01:01:01:01:01" @@ -36,9 +47,11 @@ struct fib_lookup_test { const char *daddr; int expected_ret; const char *expected_src; + const char *expected_dst; int lookup_flags; __u32 tbid; __u8 dmac[6]; + __u32 mark; }; static const struct fib_lookup_test tests[] = { @@ -90,10 +103,47 @@ static const struct fib_lookup_test tests[] = { .daddr = IPV6_ADDR_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, .expected_src = IPV6_IFACE_ADDR_SEC, .lookup_flags = BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_SKIP_NEIGH, }, + /* policy routing */ + { .desc = "IPv4 policy routing, default", + .daddr = IPV4_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV4_GW1, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, }, + { .desc = "IPv4 policy routing, mark doesn't point to a policy", + .daddr = IPV4_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV4_GW1, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK_NO_POLICY, }, + { .desc = "IPv4 policy routing, mark points to a policy", + .daddr = IPV4_REMOTE_DST, 
.expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV4_GW2, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK, }, + { .desc = "IPv4 policy routing, mark points to a policy, but no flag", + .daddr = IPV4_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV4_GW1, + .lookup_flags = BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK, }, + { .desc = "IPv6 policy routing, default", + .daddr = IPV6_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV6_GW1, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, }, + { .desc = "IPv6 policy routing, mark doesn't point to a policy", + .daddr = IPV6_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV6_GW1, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK_NO_POLICY, }, + { .desc = "IPv6 policy routing, mark points to a policy", + .daddr = IPV6_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV6_GW2, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK, }, + { .desc = "IPv6 policy routing, mark points to a policy, but no flag", + .daddr = IPV6_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV6_GW1, + .lookup_flags = BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK, }, }; -static int ifindex; - static int setup_netns(void) { int err; @@ -144,12 +194,24 @@ static int setup_netns(void) if (!ASSERT_OK(err, "write_sysctl(net.ipv6.conf.veth1.forwarding)")) goto fail; + /* Setup for policy routing tests */ + SYS(fail, "ip addr add %s/24 dev veth1", IPV4_LOCAL); + SYS(fail, "ip addr add %s/64 dev veth1 nodad", IPV6_LOCAL); + SYS(fail, "ip route add %s/32 via %s", IPV4_REMOTE_DST, IPV4_GW1); + SYS(fail, "ip route add %s/32 via %s table %s", IPV4_REMOTE_DST, IPV4_GW2, MARK_TABLE); + SYS(fail, "ip -6 route add %s/128 via %s", IPV6_REMOTE_DST, IPV6_GW1); + SYS(fail, "ip -6 route add %s/128 via %s table %s", IPV6_REMOTE_DST, 
IPV6_GW2, MARK_TABLE); + SYS(fail, "ip rule add prio 2 fwmark %d lookup %s", MARK, MARK_TABLE); + SYS(fail, "ip -6 rule add prio 2 fwmark %d lookup %s", MARK, MARK_TABLE); + return 0; fail: return -1; } -static int set_lookup_params(struct bpf_fib_lookup *params, const struct fib_lookup_test *test) +static int set_lookup_params(struct bpf_fib_lookup *params, + const struct fib_lookup_test *test, + int ifindex) { int ret; @@ -158,6 +220,7 @@ static int set_lookup_params(struct bpf_fib_lookup *params, const struct fib_loo params->l4_protocol = IPPROTO_TCP; params->ifindex = ifindex; params->tbid = test->tbid; + params->mark = test->mark; if (inet_pton(AF_INET6, test->daddr, params->ipv6_dst) == 1) { params->family = AF_INET6; @@ -190,40 +253,45 @@ static void mac_str(char *b, const __u8 *mac) mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); } -static void assert_src_ip(struct bpf_fib_lookup *fib_params, const char *expected_src) +static void assert_ip_address(int family, void *addr, const char *expected_str) { + char str[INET6_ADDRSTRLEN]; + u8 expected_addr[16]; + int addr_len = 0; int ret; - __u32 src6[4]; - __be32 src4; - switch (fib_params->family) { + switch (family) { case AF_INET6: - ret = inet_pton(AF_INET6, expected_src, src6); - ASSERT_EQ(ret, 1, "inet_pton(expected_src)"); - - ret = memcmp(src6, fib_params->ipv6_src, sizeof(fib_params->ipv6_src)); - if (!ASSERT_EQ(ret, 0, "fib_lookup ipv6 src")) { - char str_src6[64]; - - inet_ntop(AF_INET6, fib_params->ipv6_src, str_src6, - sizeof(str_src6)); - printf("ipv6 expected %s actual %s ", expected_src, - str_src6); - } - + ret = inet_pton(AF_INET6, expected_str, expected_addr); + ASSERT_EQ(ret, 1, "inet_pton(AF_INET6, expected_str)"); + addr_len = 16; break; case AF_INET: - ret = inet_pton(AF_INET, expected_src, &src4); - ASSERT_EQ(ret, 1, "inet_pton(expected_src)"); - - ASSERT_EQ(fib_params->ipv4_src, src4, "fib_lookup ipv4 src"); - + ret = inet_pton(AF_INET, expected_str, expected_addr); + ASSERT_EQ(ret, 1, 
"inet_pton(AF_INET, expected_str)"); + addr_len = 4; break; default: - PRINT_FAIL("invalid addr family: %d", fib_params->family); + PRINT_FAIL("invalid address family: %d", family); + break; + } + + if (memcmp(addr, expected_addr, addr_len)) { + inet_ntop(family, addr, str, sizeof(str)); + PRINT_FAIL("expected %s actual %s ", expected_str, str); } } +static void assert_src_ip(struct bpf_fib_lookup *params, const char *expected) +{ + assert_ip_address(params->family, params->ipv6_src, expected); +} + +static void assert_dst_ip(struct bpf_fib_lookup *params, const char *expected) +{ + assert_ip_address(params->family, params->ipv6_dst, expected); +} + void test_fib_lookup(void) { struct bpf_fib_lookup *fib_params; @@ -256,15 +324,18 @@ void test_fib_lookup(void) if (setup_netns()) goto fail; - ifindex = if_nametoindex("veth1"); - skb.ifindex = ifindex; + skb.ifindex = if_nametoindex("veth1"); + if (!ASSERT_NEQ(skb.ifindex, 0, "if_nametoindex(veth1)")) + goto fail; + fib_params = &skel->bss->fib_params; for (i = 0; i < ARRAY_SIZE(tests); i++) { printf("Testing %s ", tests[i].desc); - if (set_lookup_params(fib_params, &tests[i])) + if (set_lookup_params(fib_params, &tests[i], skb.ifindex)) continue; + skel->bss->fib_lookup_ret = -1; skel->bss->lookup_flags = tests[i].lookup_flags; @@ -278,6 +349,9 @@ void test_fib_lookup(void) if (tests[i].expected_src) assert_src_ip(fib_params, tests[i].expected_src); + if (tests[i].expected_dst) + assert_dst_ip(fib_params, tests[i].expected_dst); + ret = memcmp(tests[i].dmac, fib_params->dmac, sizeof(tests[i].dmac)); if (!ASSERT_EQ(ret, 0, "dmac not match")) { char expected[18], actual[18]; diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index c4773173a4..9e5f387391 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -2,7 +2,6 @@ #include #include #include -#include #include 
#include diff --git a/tools/testing/selftests/bpf/prog_tests/for_each.c b/tools/testing/selftests/bpf/prog_tests/for_each.c index 8963f8a549..09f6487f58 100644 --- a/tools/testing/selftests/bpf/prog_tests/for_each.c +++ b/tools/testing/selftests/bpf/prog_tests/for_each.c @@ -5,6 +5,7 @@ #include "for_each_hash_map_elem.skel.h" #include "for_each_array_map_elem.skel.h" #include "for_each_map_elem_write_key.skel.h" +#include "for_each_multi_maps.skel.h" static unsigned int duration; @@ -143,6 +144,65 @@ static void test_write_map_key(void) for_each_map_elem_write_key__destroy(skel); } +static void test_multi_maps(void) +{ + struct for_each_multi_maps *skel; + __u64 val, array_total, hash_total; + __u32 key, max_entries; + int i, err; + + LIBBPF_OPTS(bpf_test_run_opts, topts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .repeat = 1, + ); + + skel = for_each_multi_maps__open_and_load(); + if (!ASSERT_OK_PTR(skel, "for_each_multi_maps__open_and_load")) + return; + + array_total = 0; + max_entries = bpf_map__max_entries(skel->maps.arraymap); + for (i = 0; i < max_entries; i++) { + key = i; + val = i + 1; + array_total += val; + err = bpf_map__update_elem(skel->maps.arraymap, &key, sizeof(key), + &val, sizeof(val), BPF_ANY); + if (!ASSERT_OK(err, "array_map_update")) + goto out; + } + + hash_total = 0; + max_entries = bpf_map__max_entries(skel->maps.hashmap); + for (i = 0; i < max_entries; i++) { + key = i + 100; + val = i + 1; + hash_total += val; + err = bpf_map__update_elem(skel->maps.hashmap, &key, sizeof(key), + &val, sizeof(val), BPF_ANY); + if (!ASSERT_OK(err, "hash_map_update")) + goto out; + } + + skel->bss->data_output = 0; + skel->bss->use_array = 1; + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_pkt_access), &topts); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + ASSERT_OK(topts.retval, "retval"); + ASSERT_EQ(skel->bss->data_output, array_total, "array output"); + + skel->bss->data_output = 0; + skel->bss->use_array = 0; + err = 
bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_pkt_access), &topts); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + ASSERT_OK(topts.retval, "retval"); + ASSERT_EQ(skel->bss->data_output, hash_total, "hash output"); + +out: + for_each_multi_maps__destroy(skel); +} + void test_for_each(void) { if (test__start_subtest("hash_map")) @@ -151,4 +211,6 @@ void test_for_each(void) test_array_map(); if (test__start_subtest("write_map_key")) test_write_map_key(); + if (test__start_subtest("multi_maps")) + test_multi_maps(); } diff --git a/tools/testing/selftests/bpf/prog_tests/ip_check_defrag.c b/tools/testing/selftests/bpf/prog_tests/ip_check_defrag.c index 8dd2af9081..284764e717 100644 --- a/tools/testing/selftests/bpf/prog_tests/ip_check_defrag.c +++ b/tools/testing/selftests/bpf/prog_tests/ip_check_defrag.c @@ -88,6 +88,8 @@ static int attach(struct ip_check_defrag *skel, bool ipv6) int err = -1; nstoken = open_netns(NS1); + if (!ASSERT_OK_PTR(nstoken, "setns")) + goto out; skel->links.defrag = bpf_program__attach_netfilter(skel->progs.defrag, &opts); if (!ASSERT_OK_PTR(skel->links.defrag, "program attach")) diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index 05000810e2..960c9323d1 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -4,6 +4,8 @@ #include "trace_helpers.h" #include "kprobe_multi_empty.skel.h" #include "kprobe_multi_override.skel.h" +#include "kprobe_multi_session.skel.h" +#include "kprobe_multi_session_cookie.skel.h" #include "bpf/libbpf_internal.h" #include "bpf/hashmap.h" @@ -326,6 +328,74 @@ cleanup: kprobe_multi__destroy(skel); } +static void test_session_skel_api(void) +{ + struct kprobe_multi_session *skel = NULL; + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct bpf_link *link = NULL; + int i, err, prog_fd; + + skel = 
kprobe_multi_session__open_and_load(); + if (!ASSERT_OK_PTR(skel, "kprobe_multi_session__open_and_load")) + return; + + skel->bss->pid = getpid(); + + err = kprobe_multi_session__attach(skel); + if (!ASSERT_OK(err, " kprobe_multi_session__attach")) + goto cleanup; + + prog_fd = bpf_program__fd(skel->progs.trigger); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(topts.retval, 0, "test_run"); + + /* bpf_fentry_test1-4 trigger return probe, result is 2 */ + for (i = 0; i < 4; i++) + ASSERT_EQ(skel->bss->kprobe_session_result[i], 2, "kprobe_session_result"); + + /* bpf_fentry_test5-8 trigger only entry probe, result is 1 */ + for (i = 4; i < 8; i++) + ASSERT_EQ(skel->bss->kprobe_session_result[i], 1, "kprobe_session_result"); + +cleanup: + bpf_link__destroy(link); + kprobe_multi_session__destroy(skel); +} + +static void test_session_cookie_skel_api(void) +{ + struct kprobe_multi_session_cookie *skel = NULL; + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct bpf_link *link = NULL; + int err, prog_fd; + + skel = kprobe_multi_session_cookie__open_and_load(); + if (!ASSERT_OK_PTR(skel, "fentry_raw_skel_load")) + return; + + skel->bss->pid = getpid(); + + err = kprobe_multi_session_cookie__attach(skel); + if (!ASSERT_OK(err, " kprobe_multi_wrapper__attach")) + goto cleanup; + + prog_fd = bpf_program__fd(skel->progs.trigger); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(topts.retval, 0, "test_run"); + + ASSERT_EQ(skel->bss->test_kprobe_1_result, 1, "test_kprobe_1_result"); + ASSERT_EQ(skel->bss->test_kprobe_2_result, 2, "test_kprobe_2_result"); + ASSERT_EQ(skel->bss->test_kprobe_3_result, 3, "test_kprobe_3_result"); + +cleanup: + bpf_link__destroy(link); + kprobe_multi_session_cookie__destroy(skel); +} + static size_t symbol_hash(long key, void *ctx __maybe_unused) { return str_hash((const char *) key); @@ -336,15 +406,80 @@ static bool 
symbol_equal(long key1, long key2, void *ctx __maybe_unused) return strcmp((const char *) key1, (const char *) key2) == 0; } +static bool is_invalid_entry(char *buf, bool kernel) +{ + if (kernel && strchr(buf, '[')) + return true; + if (!kernel && !strchr(buf, '[')) + return true; + return false; +} + +static bool skip_entry(char *name) +{ + /* + * We attach to almost all kernel functions and some of them + * will cause 'suspicious RCU usage' when fprobe is attached + * to them. Filter out the current culprits - arch_cpu_idle + * default_idle and rcu_* functions. + */ + if (!strcmp(name, "arch_cpu_idle")) + return true; + if (!strcmp(name, "default_idle")) + return true; + if (!strncmp(name, "rcu_", 4)) + return true; + if (!strcmp(name, "bpf_dispatcher_xdp_func")) + return true; + if (!strncmp(name, "__ftrace_invalid_address__", + sizeof("__ftrace_invalid_address__") - 1)) + return true; + return false; +} + +/* Do comparison by ignoring '.llvm.' suffixes. */ +static int compare_name(const char *name1, const char *name2) +{ + const char *res1, *res2; + int len1, len2; + + res1 = strstr(name1, ".llvm."); + res2 = strstr(name2, ".llvm."); + len1 = res1 ? res1 - name1 : strlen(name1); + len2 = res2 ? res2 - name2 : strlen(name2); + + if (len1 == len2) + return strncmp(name1, name2, len1); + if (len1 < len2) + return strncmp(name1, name2, len1) <= 0 ? -1 : 1; + return strncmp(name1, name2, len2) >= 0 ?
1 : -1; +} + +static int load_kallsyms_compare(const void *p1, const void *p2) +{ + return compare_name(((const struct ksym *)p1)->name, ((const struct ksym *)p2)->name); +} + +static int search_kallsyms_compare(const void *p1, const struct ksym *p2) +{ + return compare_name(p1, p2->name); +} + static int get_syms(char ***symsp, size_t *cntp, bool kernel) { - size_t cap = 0, cnt = 0, i; - char *name = NULL, **syms = NULL; + size_t cap = 0, cnt = 0; + char *name = NULL, *ksym_name, **syms = NULL; struct hashmap *map; + struct ksyms *ksyms; + struct ksym *ks; char buf[256]; FILE *f; int err = 0; + ksyms = load_kallsyms_custom_local(load_kallsyms_compare); + if (!ASSERT_OK_PTR(ksyms, "load_kallsyms_custom_local")) + return -EINVAL; + /* * The available_filter_functions contains many duplicates, * but other than that all symbols are usable in kprobe multi @@ -368,33 +503,23 @@ static int get_syms(char ***symsp, size_t *cntp, bool kernel) } while (fgets(buf, sizeof(buf), f)) { - if (kernel && strchr(buf, '[')) - continue; - if (!kernel && !strchr(buf, '[')) + if (is_invalid_entry(buf, kernel)) continue; free(name); if (sscanf(buf, "%ms$*[^\n]\n", &name) != 1) continue; - /* - * We attach to almost all kernel functions and some of them - * will cause 'suspicious RCU usage' when fprobe is attached - * to them. Filter out the current culprits - arch_cpu_idle - * default_idle and rcu_* functions. 
- */ - if (!strcmp(name, "arch_cpu_idle")) - continue; - if (!strcmp(name, "default_idle")) - continue; - if (!strncmp(name, "rcu_", 4)) - continue; - if (!strcmp(name, "bpf_dispatcher_xdp_func")) - continue; - if (!strncmp(name, "__ftrace_invalid_address__", - sizeof("__ftrace_invalid_address__") - 1)) + if (skip_entry(name)) continue; - err = hashmap__add(map, name, 0); + ks = search_kallsyms_custom_local(ksyms, name, search_kallsyms_compare); + if (!ks) { + err = -EINVAL; + goto error; + } + + ksym_name = ks->name; + err = hashmap__add(map, ksym_name, 0); if (err == -EEXIST) { err = 0; continue; @@ -407,8 +532,7 @@ static int get_syms(char ***symsp, size_t *cntp, bool kernel) if (err) goto error; - syms[cnt++] = name; - name = NULL; + syms[cnt++] = ksym_name; } *symsp = syms; @@ -418,42 +542,88 @@ error: free(name); fclose(f); hashmap__free(map); - if (err) { - for (i = 0; i < cnt; i++) - free(syms[i]); + if (err) free(syms); + return err; +} + +static int get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel) +{ + unsigned long *addr, *addrs, *tmp_addrs; + int err = 0, max_cnt, inc_cnt; + char *name = NULL; + size_t cnt = 0; + char buf[256]; + FILE *f; + + if (access("/sys/kernel/tracing/trace", F_OK) == 0) + f = fopen("/sys/kernel/tracing/available_filter_functions_addrs", "r"); + else + f = fopen("/sys/kernel/debug/tracing/available_filter_functions_addrs", "r"); + + if (!f) + return -ENOENT; + + /* In my local setup, the number of entries is 50k+ so Let us initially + * allocate space to hold 64k entries. If 64k is not enough, incrementally + * increase 1k each time. 
+ */ + max_cnt = 65536; + inc_cnt = 1024; + addrs = malloc(max_cnt * sizeof(long)); + if (addrs == NULL) { + err = -ENOMEM; + goto error; + } + + while (fgets(buf, sizeof(buf), f)) { + if (is_invalid_entry(buf, kernel)) + continue; + + free(name); + if (sscanf(buf, "%p %ms$*[^\n]\n", &addr, &name) != 2) + continue; + if (skip_entry(name)) + continue; + + if (cnt == max_cnt) { + max_cnt += inc_cnt; + tmp_addrs = realloc(addrs, max_cnt * sizeof(long)); + if (!tmp_addrs) { + err = -ENOMEM; + goto error; + } + addrs = tmp_addrs; + } + + addrs[cnt++] = (unsigned long)addr; } + + *addrsp = addrs; + *cntp = cnt; + +error: + free(name); + fclose(f); + if (err) + free(addrs); return err; } -static void test_kprobe_multi_bench_attach(bool kernel) +static void do_bench_test(struct kprobe_multi_empty *skel, struct bpf_kprobe_multi_opts *opts) { - LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); - struct kprobe_multi_empty *skel = NULL; long attach_start_ns, attach_end_ns; long detach_start_ns, detach_end_ns; double attach_delta, detach_delta; struct bpf_link *link = NULL; - char **syms = NULL; - size_t cnt = 0, i; - - if (!ASSERT_OK(get_syms(&syms, &cnt, kernel), "get_syms")) - return; - - skel = kprobe_multi_empty__open_and_load(); - if (!ASSERT_OK_PTR(skel, "kprobe_multi_empty__open_and_load")) - goto cleanup; - - opts.syms = (const char **) syms; - opts.cnt = cnt; attach_start_ns = get_time_ns(); link = bpf_program__attach_kprobe_multi_opts(skel->progs.test_kprobe_empty, - NULL, &opts); + NULL, opts); attach_end_ns = get_time_ns(); if (!ASSERT_OK_PTR(link, "bpf_program__attach_kprobe_multi_opts")) - goto cleanup; + return; detach_start_ns = get_time_ns(); bpf_link__destroy(link); @@ -462,17 +632,65 @@ static void test_kprobe_multi_bench_attach(bool kernel) attach_delta = (attach_end_ns - attach_start_ns) / 1000000000.0; detach_delta = (detach_end_ns - detach_start_ns) / 1000000000.0; - printf("%s: found %lu functions\n", __func__, cnt); + printf("%s: found %lu functions\n", __func__, opts->cnt);
printf("%s: attached in %7.3lfs\n", __func__, attach_delta); printf("%s: detached in %7.3lfs\n", __func__, detach_delta); +} + +static void test_kprobe_multi_bench_attach(bool kernel) +{ + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); + struct kprobe_multi_empty *skel = NULL; + char **syms = NULL; + size_t cnt = 0; + + if (!ASSERT_OK(get_syms(&syms, &cnt, kernel), "get_syms")) + return; + + skel = kprobe_multi_empty__open_and_load(); + if (!ASSERT_OK_PTR(skel, "kprobe_multi_empty__open_and_load")) + goto cleanup; + + opts.syms = (const char **) syms; + opts.cnt = cnt; + + do_bench_test(skel, &opts); cleanup: kprobe_multi_empty__destroy(skel); - if (syms) { - for (i = 0; i < cnt; i++) - free(syms[i]); + if (syms) free(syms); +} + +static void test_kprobe_multi_bench_attach_addr(bool kernel) +{ + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); + struct kprobe_multi_empty *skel = NULL; + unsigned long *addrs = NULL; + size_t cnt = 0; + int err; + + err = get_addrs(&addrs, &cnt, kernel); + if (err == -ENOENT) { + test__skip(); + return; } + + if (!ASSERT_OK(err, "get_addrs")) + return; + + skel = kprobe_multi_empty__open_and_load(); + if (!ASSERT_OK_PTR(skel, "kprobe_multi_empty__open_and_load")) + goto cleanup; + + opts.addrs = addrs; + opts.cnt = cnt; + + do_bench_test(skel, &opts); + +cleanup: + kprobe_multi_empty__destroy(skel); + free(addrs); } static void test_attach_override(void) @@ -515,6 +733,10 @@ void serial_test_kprobe_multi_bench_attach(void) test_kprobe_multi_bench_attach(true); if (test__start_subtest("modules")) test_kprobe_multi_bench_attach(false); + if (test__start_subtest("kernel")) + test_kprobe_multi_bench_attach_addr(true); + if (test__start_subtest("modules")) + test_kprobe_multi_bench_attach_addr(false); } void test_kprobe_multi_test(void) @@ -538,4 +760,8 @@ void test_kprobe_multi_test(void) test_attach_api_fails(); if (test__start_subtest("attach_override")) test_attach_override(); + if (test__start_subtest("session")) + test_session_skel_api(); + if 
(test__start_subtest("session_cookie")) + test_session_cookie_skel_api(); } diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms.c b/tools/testing/selftests/bpf/prog_tests/ksyms.c index b295969b26..dc7aab532f 100644 --- a/tools/testing/selftests/bpf/prog_tests/ksyms.c +++ b/tools/testing/selftests/bpf/prog_tests/ksyms.c @@ -5,8 +5,6 @@ #include "test_ksyms.skel.h" #include -static int duration; - void test_ksyms(void) { const char *btf_path = "/sys/kernel/btf/vmlinux"; @@ -18,43 +16,37 @@ void test_ksyms(void) int err; err = kallsyms_find("bpf_link_fops", &link_fops_addr); - if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno)) + if (!ASSERT_NEQ(err, -EINVAL, "bpf_link_fops: kallsyms_fopen")) return; - if (CHECK(err == -ENOENT, "ksym_find", "symbol 'bpf_link_fops' not found\n")) + if (!ASSERT_NEQ(err, -ENOENT, "bpf_link_fops: ksym_find")) return; err = kallsyms_find("__per_cpu_start", &per_cpu_start_addr); - if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno)) + if (!ASSERT_NEQ(err, -EINVAL, "__per_cpu_start: kallsyms_fopen")) return; - if (CHECK(err == -ENOENT, "ksym_find", "symbol 'per_cpu_start' not found\n")) + if (!ASSERT_NEQ(err, -ENOENT, "__per_cpu_start: ksym_find")) return; - if (CHECK(stat(btf_path, &st), "stat_btf", "err %d\n", errno)) + if (!ASSERT_OK(stat(btf_path, &st), "stat_btf")) return; btf_size = st.st_size; skel = test_ksyms__open_and_load(); - if (CHECK(!skel, "skel_open", "failed to open and load skeleton\n")) + if (!ASSERT_OK_PTR(skel, "test_ksyms__open_and_load")) return; err = test_ksyms__attach(skel); - if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err)) + if (!ASSERT_OK(err, "test_ksyms__attach")) goto cleanup; /* trigger tracepoint */ usleep(1); data = skel->data; - CHECK(data->out__bpf_link_fops != link_fops_addr, "bpf_link_fops", - "got 0x%llx, exp 0x%llx\n", - data->out__bpf_link_fops, link_fops_addr); - CHECK(data->out__bpf_link_fops1 != 0, "bpf_link_fops1", - "got 
%llu, exp %llu\n", data->out__bpf_link_fops1, (__u64)0); - CHECK(data->out__btf_size != btf_size, "btf_size", - "got %llu, exp %llu\n", data->out__btf_size, btf_size); - CHECK(data->out__per_cpu_start != per_cpu_start_addr, "__per_cpu_start", - "got %llu, exp %llu\n", data->out__per_cpu_start, - per_cpu_start_addr); + ASSERT_EQ(data->out__bpf_link_fops, link_fops_addr, "bpf_link_fops"); + ASSERT_EQ(data->out__bpf_link_fops1, 0, "bpf_link_fops1"); + ASSERT_EQ(data->out__btf_size, btf_size, "btf_size"); + ASSERT_EQ(data->out__per_cpu_start, per_cpu_start_addr, "__per_cpu_start"); cleanup: test_ksyms__destroy(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/module_attach.c b/tools/testing/selftests/bpf/prog_tests/module_attach.c index f53d658ed0..6d391d95f9 100644 --- a/tools/testing/selftests/bpf/prog_tests/module_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/module_attach.c @@ -51,6 +51,10 @@ void test_module_attach(void) 0, "bpf_testmod_test_read"); ASSERT_OK(err, "set_attach_target"); + err = bpf_program__set_attach_target(skel->progs.handle_fentry_explicit_manual, + 0, "bpf_testmod:bpf_testmod_test_read"); + ASSERT_OK(err, "set_attach_target_explicit"); + err = test_module_attach__load(skel); if (CHECK(err, "skel_load", "failed to load skeleton\n")) return; @@ -70,6 +74,8 @@ void test_module_attach(void) ASSERT_EQ(bss->tp_btf_read_sz, READ_SZ, "tp_btf"); ASSERT_EQ(bss->fentry_read_sz, READ_SZ, "fentry"); ASSERT_EQ(bss->fentry_manual_read_sz, READ_SZ, "fentry_manual"); + ASSERT_EQ(bss->fentry_explicit_read_sz, READ_SZ, "fentry_explicit"); + ASSERT_EQ(bss->fentry_explicit_manual_read_sz, READ_SZ, "fentry_explicit_manual"); ASSERT_EQ(bss->fexit_read_sz, READ_SZ, "fexit"); ASSERT_EQ(bss->fexit_ret, -EIO, "fexit_tet"); ASSERT_EQ(bss->fmod_ret_read_sz, READ_SZ, "fmod_ret"); diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c index 8f8d792307..274d2e033e 100644 --- 
a/tools/testing/selftests/bpf/prog_tests/mptcp.c +++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c @@ -82,6 +82,22 @@ static void cleanup_netns(struct nstoken *nstoken) SYS_NOFAIL("ip netns del %s", NS_TEST); } +static int start_mptcp_server(int family, const char *addr_str, __u16 port, + int timeout_ms) +{ + struct network_helper_opts opts = { + .timeout_ms = timeout_ms, + .proto = IPPROTO_MPTCP, + }; + struct sockaddr_storage addr; + socklen_t addrlen; + + if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) + return -1; + + return start_server_addr(SOCK_STREAM, &addr, addrlen, &opts); +} + static int verify_tsk(int map_fd, int client_fd) { int err, cfd = client_fd; @@ -273,6 +289,8 @@ static int run_mptcpify(int cgroup_fd) if (!ASSERT_OK_PTR(mptcpify_skel, "skel_open_load")) return libbpf_get_error(mptcpify_skel); + mptcpify_skel->bss->pid = getpid(); + err = mptcpify__attach(mptcpify_skel); if (!ASSERT_OK(err, "skel_attach")) goto out; diff --git a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c index 24d493482f..e72d75d6ba 100644 --- a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c +++ b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c @@ -12,77 +12,229 @@ #include #include #include +#include "network_helpers.h" #define STACK_SIZE (1024 * 1024) static char child_stack[STACK_SIZE]; -static int test_current_pid_tgid(void *args) +static int get_pid_tgid(pid_t *pid, pid_t *tgid, + struct test_ns_current_pid_tgid__bss *bss) { - struct test_ns_current_pid_tgid__bss *bss; - struct test_ns_current_pid_tgid *skel; - int err = -1, duration = 0; - pid_t tgid, pid; struct stat st; + int err; - skel = test_ns_current_pid_tgid__open_and_load(); - if (CHECK(!skel, "skel_open_load", "failed to load skeleton\n")) - goto cleanup; - - pid = syscall(SYS_gettid); - tgid = getpid(); + *pid = syscall(SYS_gettid); + *tgid = getpid(); err = stat("/proc/self/ns/pid", &st); - 
if (CHECK(err, "stat", "failed /proc/self/ns/pid: %d\n", err)) - goto cleanup; + if (!ASSERT_OK(err, "stat /proc/self/ns/pid")) + return err; - bss = skel->bss; bss->dev = st.st_dev; bss->ino = st.st_ino; bss->user_pid = 0; bss->user_tgid = 0; + return 0; +} + +static int test_current_pid_tgid_tp(void *args) +{ + struct test_ns_current_pid_tgid__bss *bss; + struct test_ns_current_pid_tgid *skel; + int ret = -1, err; + pid_t tgid, pid; + + skel = test_ns_current_pid_tgid__open(); + if (!ASSERT_OK_PTR(skel, "test_ns_current_pid_tgid__open")) + return ret; + + bpf_program__set_autoload(skel->progs.tp_handler, true); + + err = test_ns_current_pid_tgid__load(skel); + if (!ASSERT_OK(err, "test_ns_current_pid_tgid__load")) + goto cleanup; + + bss = skel->bss; + if (get_pid_tgid(&pid, &tgid, bss)) + goto cleanup; err = test_ns_current_pid_tgid__attach(skel); - if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err)) + if (!ASSERT_OK(err, "test_ns_current_pid_tgid__attach")) goto cleanup; /* trigger tracepoint */ usleep(1); - ASSERT_EQ(bss->user_pid, pid, "pid"); - ASSERT_EQ(bss->user_tgid, tgid, "tgid"); - err = 0; + if (!ASSERT_EQ(bss->user_pid, pid, "pid")) + goto cleanup; + if (!ASSERT_EQ(bss->user_tgid, tgid, "tgid")) + goto cleanup; + ret = 0; + +cleanup: + test_ns_current_pid_tgid__destroy(skel); + return ret; +} + +static int test_current_pid_tgid_cgrp(void *args) +{ + struct test_ns_current_pid_tgid__bss *bss; + struct test_ns_current_pid_tgid *skel; + int server_fd = -1, ret = -1, err; + int cgroup_fd = *(int *)args; + pid_t tgid, pid; + + skel = test_ns_current_pid_tgid__open(); + if (!ASSERT_OK_PTR(skel, "test_ns_current_pid_tgid__open")) + return ret; + + bpf_program__set_autoload(skel->progs.cgroup_bind4, true); + + err = test_ns_current_pid_tgid__load(skel); + if (!ASSERT_OK(err, "test_ns_current_pid_tgid__load")) + goto cleanup; + + bss = skel->bss; + if (get_pid_tgid(&pid, &tgid, bss)) + goto cleanup; + + skel->links.cgroup_bind4 = 
bpf_program__attach_cgroup( + skel->progs.cgroup_bind4, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links.cgroup_bind4, "bpf_program__attach_cgroup")) + goto cleanup; + + server_fd = start_server(AF_INET, SOCK_STREAM, NULL, 0, 0); + if (!ASSERT_GE(server_fd, 0, "start_server")) + goto cleanup; + + if (!ASSERT_EQ(bss->user_pid, pid, "pid")) + goto cleanup; + if (!ASSERT_EQ(bss->user_tgid, tgid, "tgid")) + goto cleanup; + ret = 0; cleanup: - test_ns_current_pid_tgid__destroy(skel); + if (server_fd >= 0) + close(server_fd); + test_ns_current_pid_tgid__destroy(skel); + return ret; +} + +static int test_current_pid_tgid_sk_msg(void *args) +{ + int verdict, map, server_fd = -1, client_fd = -1; + struct test_ns_current_pid_tgid__bss *bss; + static const char send_msg[] = "message"; + struct test_ns_current_pid_tgid *skel; + int ret = -1, err, key = 0; + pid_t tgid, pid; + + skel = test_ns_current_pid_tgid__open(); + if (!ASSERT_OK_PTR(skel, "test_ns_current_pid_tgid__open")) + return ret; + + bpf_program__set_autoload(skel->progs.sk_msg, true); + + err = test_ns_current_pid_tgid__load(skel); + if (!ASSERT_OK(err, "test_ns_current_pid_tgid__load")) + goto cleanup; + + bss = skel->bss; + if (get_pid_tgid(&pid, &tgid, skel->bss)) + goto cleanup; + + verdict = bpf_program__fd(skel->progs.sk_msg); + map = bpf_map__fd(skel->maps.sock_map); + err = bpf_prog_attach(verdict, map, BPF_SK_MSG_VERDICT, 0); + if (!ASSERT_OK(err, "prog_attach")) + goto cleanup; + + server_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0); + if (!ASSERT_GE(server_fd, 0, "start_server")) + goto cleanup; - return err; + client_fd = connect_to_fd(server_fd, 0); + if (!ASSERT_GE(client_fd, 0, "connect_to_fd")) + goto cleanup; + + err = bpf_map_update_elem(map, &key, &client_fd, BPF_ANY); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto cleanup; + + err = send(client_fd, send_msg, sizeof(send_msg), 0); + if (!ASSERT_EQ(err, sizeof(send_msg), "send(msg)")) + goto cleanup; + + if (!ASSERT_EQ(bss->user_pid, 
pid, "pid")) + goto cleanup; + if (!ASSERT_EQ(bss->user_tgid, tgid, "tgid")) + goto cleanup; + ret = 0; + +cleanup: + if (server_fd >= 0) + close(server_fd); + if (client_fd >= 0) + close(client_fd); + test_ns_current_pid_tgid__destroy(skel); + return ret; } -static void test_ns_current_pid_tgid_new_ns(void) +static void test_ns_current_pid_tgid_new_ns(int (*fn)(void *), void *arg) { - int wstatus, duration = 0; + int wstatus; pid_t cpid; /* Create a process in a new namespace, this process * will be the init process of this new namespace hence will be pid 1. */ - cpid = clone(test_current_pid_tgid, child_stack + STACK_SIZE, - CLONE_NEWPID | SIGCHLD, NULL); + cpid = clone(fn, child_stack + STACK_SIZE, + CLONE_NEWPID | SIGCHLD, arg); - if (CHECK(cpid == -1, "clone", "%s\n", strerror(errno))) + if (!ASSERT_NEQ(cpid, -1, "clone")) return; - if (CHECK(waitpid(cpid, &wstatus, 0) == -1, "waitpid", "%s\n", strerror(errno))) + if (!ASSERT_NEQ(waitpid(cpid, &wstatus, 0), -1, "waitpid")) return; - if (CHECK(WEXITSTATUS(wstatus) != 0, "newns_pidtgid", "failed")) + if (!ASSERT_OK(WEXITSTATUS(wstatus), "newns_pidtgid")) return; } +static void test_in_netns(int (*fn)(void *), void *arg) +{ + struct nstoken *nstoken = NULL; + + SYS(cleanup, "ip netns add ns_current_pid_tgid"); + SYS(cleanup, "ip -net ns_current_pid_tgid link set dev lo up"); + + nstoken = open_netns("ns_current_pid_tgid"); + if (!ASSERT_OK_PTR(nstoken, "open_netns")) + goto cleanup; + + test_ns_current_pid_tgid_new_ns(fn, arg); + +cleanup: + if (nstoken) + close_netns(nstoken); + SYS_NOFAIL("ip netns del ns_current_pid_tgid"); +} + /* TODO: use a different tracepoint */ void serial_test_ns_current_pid_tgid(void) { - if (test__start_subtest("ns_current_pid_tgid_root_ns")) - test_current_pid_tgid(NULL); - if (test__start_subtest("ns_current_pid_tgid_new_ns")) - test_ns_current_pid_tgid_new_ns(); + if (test__start_subtest("root_ns_tp")) + test_current_pid_tgid_tp(NULL); + if (test__start_subtest("new_ns_tp")) + 
test_ns_current_pid_tgid_new_ns(test_current_pid_tgid_tp, NULL); + if (test__start_subtest("new_ns_cgrp")) { + int cgroup_fd = -1; + + cgroup_fd = test__join_cgroup("/sock_addr"); + if (ASSERT_GE(cgroup_fd, 0, "join_cgroup")) { + test_in_netns(test_current_pid_tgid_cgrp, &cgroup_fd); + close(cgroup_fd); + } + } + if (test__start_subtest("new_ns_sk_msg")) + test_in_netns(test_current_pid_tgid_sk_msg, NULL); } diff --git a/tools/testing/selftests/bpf/prog_tests/perf_skip.c b/tools/testing/selftests/bpf/prog_tests/perf_skip.c new file mode 100644 index 0000000000..37d8618800 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/perf_skip.c @@ -0,0 +1,137 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE + +#include +#include "test_perf_skip.skel.h" +#include +#include +#include + +#ifndef TRAP_PERF +#define TRAP_PERF 6 +#endif + +int sigio_count, sigtrap_count; + +static void handle_sigio(int sig __always_unused) +{ + ++sigio_count; +} + +static void handle_sigtrap(int signum __always_unused, + siginfo_t *info, + void *ucontext __always_unused) +{ + ASSERT_EQ(info->si_code, TRAP_PERF, "si_code"); + ++sigtrap_count; +} + +static noinline int test_function(void) +{ + asm volatile (""); + return 0; +} + +void serial_test_perf_skip(void) +{ + struct sigaction action = {}; + struct sigaction previous_sigtrap; + sighandler_t previous_sigio = SIG_ERR; + struct test_perf_skip *skel = NULL; + struct perf_event_attr attr = {}; + int perf_fd = -1; + int err; + struct f_owner_ex owner; + struct bpf_link *prog_link = NULL; + + action.sa_flags = SA_SIGINFO | SA_NODEFER; + action.sa_sigaction = handle_sigtrap; + sigemptyset(&action.sa_mask); + if (!ASSERT_OK(sigaction(SIGTRAP, &action, &previous_sigtrap), "sigaction")) + return; + + previous_sigio = signal(SIGIO, handle_sigio); + if (!ASSERT_NEQ(previous_sigio, SIG_ERR, "signal")) + goto cleanup; + + skel = test_perf_skip__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_load")) + goto cleanup; + + attr.type = 
PERF_TYPE_BREAKPOINT; + attr.size = sizeof(attr); + attr.bp_type = HW_BREAKPOINT_X; + attr.bp_addr = (uintptr_t)test_function; + attr.bp_len = sizeof(long); + attr.sample_period = 1; + attr.sample_type = PERF_SAMPLE_IP; + attr.pinned = 1; + attr.exclude_kernel = 1; + attr.exclude_hv = 1; + attr.precise_ip = 3; + attr.sigtrap = 1; + attr.remove_on_exec = 1; + + perf_fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0); + if (perf_fd < 0 && (errno == ENOENT || errno == EOPNOTSUPP)) { + printf("SKIP:no PERF_TYPE_BREAKPOINT/HW_BREAKPOINT_X\n"); + test__skip(); + goto cleanup; + } + if (!ASSERT_OK(perf_fd < 0, "perf_event_open")) + goto cleanup; + + /* Configure the perf event to signal on sample. */ + err = fcntl(perf_fd, F_SETFL, O_ASYNC); + if (!ASSERT_OK(err, "fcntl(F_SETFL, O_ASYNC)")) + goto cleanup; + + owner.type = F_OWNER_TID; + owner.pid = syscall(__NR_gettid); + err = fcntl(perf_fd, F_SETOWN_EX, &owner); + if (!ASSERT_OK(err, "fcntl(F_SETOWN_EX)")) + goto cleanup; + + /* Allow at most one sample. A sample rejected by bpf should + * not count against this. + */ + err = ioctl(perf_fd, PERF_EVENT_IOC_REFRESH, 1); + if (!ASSERT_OK(err, "ioctl(PERF_EVENT_IOC_REFRESH)")) + goto cleanup; + + prog_link = bpf_program__attach_perf_event(skel->progs.handler, perf_fd); + if (!ASSERT_OK_PTR(prog_link, "bpf_program__attach_perf_event")) + goto cleanup; + + /* Configure the bpf program to suppress the sample. */ + skel->bss->ip = (uintptr_t)test_function; + test_function(); + + ASSERT_EQ(sigio_count, 0, "sigio_count"); + ASSERT_EQ(sigtrap_count, 0, "sigtrap_count"); + + /* Configure the bpf program to allow the sample. 
*/ + skel->bss->ip = 0; + test_function(); + + ASSERT_EQ(sigio_count, 1, "sigio_count"); + ASSERT_EQ(sigtrap_count, 1, "sigtrap_count"); + + /* Test that the sample above is the only one allowed (by perf, not + * by bpf) + */ + test_function(); + + ASSERT_EQ(sigio_count, 1, "sigio_count"); + ASSERT_EQ(sigtrap_count, 1, "sigtrap_count"); + +cleanup: + bpf_link__destroy(prog_link); + if (perf_fd >= 0) + close(perf_fd); + test_perf_skip__destroy(skel); + + if (previous_sigio != SIG_ERR) + signal(SIGIO, previous_sigio); + sigaction(SIGTRAP, &previous_sigtrap, NULL); +} diff --git a/tools/testing/selftests/bpf/prog_tests/preempt_lock.c b/tools/testing/selftests/bpf/prog_tests/preempt_lock.c new file mode 100644 index 0000000000..02917c6724 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/preempt_lock.c @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +void test_preempt_lock(void) +{ + RUN_TESTS(preempt_lock); +} diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c index 48c5695b7a..da430df45a 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -12,8 +12,11 @@ #include #include #include + #include "test_ringbuf.lskel.h" +#include "test_ringbuf_n.lskel.h" #include "test_ringbuf_map_key.lskel.h" +#include "test_ringbuf_write.lskel.h" #define EDONE 7777 @@ -83,6 +86,58 @@ static void *poll_thread(void *input) return (void *)(long)ring_buffer__poll(ringbuf, timeout); } +static void ringbuf_write_subtest(void) +{ + struct test_ringbuf_write_lskel *skel; + int page_size = getpagesize(); + size_t *mmap_ptr; + int err, rb_fd; + + skel = test_ringbuf_write_lskel__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + skel->maps.ringbuf.max_entries = 0x4000; + + err = test_ringbuf_write_lskel__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + rb_fd = skel->maps.ringbuf.map_fd; + + 
mmap_ptr = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, rb_fd, 0); + if (!ASSERT_OK_PTR(mmap_ptr, "rw_cons_pos")) + goto cleanup; + *mmap_ptr = 0x3000; + ASSERT_OK(munmap(mmap_ptr, page_size), "unmap_rw"); + + skel->bss->pid = getpid(); + + ringbuf = ring_buffer__new(rb_fd, process_sample, NULL, NULL); + if (!ASSERT_OK_PTR(ringbuf, "ringbuf_new")) + goto cleanup; + + err = test_ringbuf_write_lskel__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup_ringbuf; + + skel->bss->discarded = 0; + skel->bss->passed = 0; + + /* trigger exactly two samples */ + syscall(__NR_getpgid); + syscall(__NR_getpgid); + + ASSERT_EQ(skel->bss->discarded, 2, "discarded"); + ASSERT_EQ(skel->bss->passed, 0, "passed"); + + test_ringbuf_write_lskel__detach(skel); +cleanup_ringbuf: + ring_buffer__free(ringbuf); +cleanup: + test_ringbuf_write_lskel__destroy(skel); +} + static void ringbuf_subtest(void) { const size_t rec_sz = BPF_RINGBUF_HDR_SZ + sizeof(struct sample); @@ -326,6 +381,68 @@ cleanup: test_ringbuf_lskel__destroy(skel); } +/* + * Test ring_buffer__consume_n() by producing N_TOT_SAMPLES samples in the ring + * buffer, via getpid(), and consuming them in chunks of N_SAMPLES. 
+ */ +#define N_TOT_SAMPLES 32 +#define N_SAMPLES 4 + +/* Sample value to verify the callback validity */ +#define SAMPLE_VALUE 42L + +static int process_n_sample(void *ctx, void *data, size_t len) +{ + struct sample *s = data; + + ASSERT_EQ(s->value, SAMPLE_VALUE, "sample_value"); + + return 0; +} + +static void ringbuf_n_subtest(void) +{ + struct test_ringbuf_n_lskel *skel_n; + int err, i; + + skel_n = test_ringbuf_n_lskel__open(); + if (!ASSERT_OK_PTR(skel_n, "test_ringbuf_n_lskel__open")) + return; + + skel_n->maps.ringbuf.max_entries = getpagesize(); + skel_n->bss->pid = getpid(); + + err = test_ringbuf_n_lskel__load(skel_n); + if (!ASSERT_OK(err, "test_ringbuf_n_lskel__load")) + goto cleanup; + + ringbuf = ring_buffer__new(skel_n->maps.ringbuf.map_fd, + process_n_sample, NULL, NULL); + if (!ASSERT_OK_PTR(ringbuf, "ring_buffer__new")) + goto cleanup; + + err = test_ringbuf_n_lskel__attach(skel_n); + if (!ASSERT_OK(err, "test_ringbuf_n_lskel__attach")) + goto cleanup_ringbuf; + + /* Produce N_TOT_SAMPLES samples in the ring buffer by calling getpid() */ + skel_n->bss->value = SAMPLE_VALUE; + for (i = 0; i < N_TOT_SAMPLES; i++) + syscall(__NR_getpgid); + + /* Consume all samples from the ring buffer in batches of N_SAMPLES */ + for (i = 0; i < N_TOT_SAMPLES; i += err) { + err = ring_buffer__consume_n(ringbuf, N_SAMPLES); + if (!ASSERT_EQ(err, N_SAMPLES, "rb_consume")) + goto cleanup_ringbuf; + } + +cleanup_ringbuf: + ring_buffer__free(ringbuf); +cleanup: + test_ringbuf_n_lskel__destroy(skel_n); +} + static int process_map_key_sample(void *ctx, void *data, size_t len) { struct sample *s; @@ -384,6 +501,10 @@ void test_ringbuf(void) { if (test__start_subtest("ringbuf")) ringbuf_subtest(); + if (test__start_subtest("ringbuf_n")) + ringbuf_n_subtest(); if (test__start_subtest("ringbuf_map_key")) ringbuf_map_key_subtest(); + if (test__start_subtest("ringbuf_write")) + ringbuf_write_subtest(); } diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c 
b/tools/testing/selftests/bpf/prog_tests/send_signal.c index b15b343ebb..920aee41bd 100644 --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c @@ -179,7 +179,7 @@ static void test_send_signal_nmi(bool signal_thread) pmu_fd = syscall(__NR_perf_event_open, &attr, 0 /* pid */, -1 /* cpu */, -1 /* group_fd */, 0 /* flags */); if (pmu_fd == -1) { - if (errno == ENOENT) { + if (errno == ENOENT || errno == EOPNOTSUPP) { printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", __func__); test__skip(); diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c index 1374b626a9..0b9bd1d6f7 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c @@ -15,6 +15,7 @@ #include #include "test_progs.h" +#include "network_helpers.h" #define BIND_PORT 1234 #define CONNECT_PORT 4321 @@ -22,8 +23,6 @@ #define NS_SELF "/proc/self/ns/net" #define SERVER_MAP_PATH "/sys/fs/bpf/tc/globals/server_map" -static const struct timeval timeo_sec = { .tv_sec = 3 }; -static const size_t timeo_optlen = sizeof(timeo_sec); static int stop, duration; static bool @@ -73,52 +72,6 @@ configure_stack(void) return true; } -static int -start_server(const struct sockaddr *addr, socklen_t len, int type) -{ - int fd; - - fd = socket(addr->sa_family, type, 0); - if (CHECK_FAIL(fd == -1)) - goto out; - if (CHECK_FAIL(setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec, - timeo_optlen))) - goto close_out; - if (CHECK_FAIL(bind(fd, addr, len) == -1)) - goto close_out; - if (type == SOCK_STREAM && CHECK_FAIL(listen(fd, 128) == -1)) - goto close_out; - - goto out; -close_out: - close(fd); - fd = -1; -out: - return fd; -} - -static int -connect_to_server(const struct sockaddr *addr, socklen_t len, int type) -{ - int fd = -1; - - fd = socket(addr->sa_family, type, 0); - if (CHECK_FAIL(fd == -1)) - goto out; - if (CHECK_FAIL(setsockopt(fd, 
SOL_SOCKET, SO_SNDTIMEO, &timeo_sec, - timeo_optlen))) - goto close_out; - if (CHECK_FAIL(connect(fd, addr, len))) - goto close_out; - - goto out; -close_out: - close(fd); - fd = -1; -out: - return fd; -} - static in_port_t get_port(int fd) { @@ -161,7 +114,7 @@ run_test(int server_fd, const struct sockaddr *addr, socklen_t len, int type) in_port_t port; int ret = 1; - client = connect_to_server(addr, len, type); + client = connect_to_addr(type, (struct sockaddr_storage *)addr, len, NULL); if (client == -1) { perror("Cannot connect to server"); goto out; @@ -310,7 +263,9 @@ void test_sk_assign(void) continue; prepare_addr(test->addr, test->family, BIND_PORT, false); addr = (const struct sockaddr *)test->addr; - server = start_server(addr, test->len, test->type); + server = start_server_addr(test->type, + (const struct sockaddr_storage *)addr, + test->len, NULL); if (server == -1) goto close; diff --git a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c index 597d0467a9..de2466547e 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c @@ -994,7 +994,7 @@ static void drop_on_reuseport(const struct test *t) err = update_lookup_map(t->sock_map, SERVER_A, server1); if (err) - goto detach; + goto close_srv1; /* second server on destination address we should never reach */ server2 = make_server(t->sotype, t->connect_to.ip, t->connect_to.port, diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index 5fd6177189..b880c564a2 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -3,16 +3,56 @@ #include "test_progs.h" +#include "sock_addr_kern.skel.h" +#include "bind4_prog.skel.h" +#include "bind6_prog.skel.h" #include "connect_unix_prog.skel.h" +#include "connect4_prog.skel.h" +#include "connect6_prog.skel.h" +#include 
"sendmsg4_prog.skel.h" +#include "sendmsg6_prog.skel.h" +#include "recvmsg4_prog.skel.h" +#include "recvmsg6_prog.skel.h" #include "sendmsg_unix_prog.skel.h" #include "recvmsg_unix_prog.skel.h" +#include "getsockname4_prog.skel.h" +#include "getsockname6_prog.skel.h" #include "getsockname_unix_prog.skel.h" +#include "getpeername4_prog.skel.h" +#include "getpeername6_prog.skel.h" #include "getpeername_unix_prog.skel.h" #include "network_helpers.h" +#ifndef ENOTSUPP +# define ENOTSUPP 524 +#endif + +#define TEST_NS "sock_addr" +#define TEST_IF_PREFIX "test_sock_addr" +#define TEST_IPV4 "127.0.0.4" +#define TEST_IPV6 "::6" + +#define SERV4_IP "192.168.1.254" +#define SERV4_REWRITE_IP "127.0.0.1" +#define SRC4_IP "172.16.0.1" +#define SRC4_REWRITE_IP TEST_IPV4 +#define SERV4_PORT 4040 +#define SERV4_REWRITE_PORT 4444 + +#define SERV6_IP "face:b00c:1234:5678::abcd" +#define SERV6_REWRITE_IP "::1" +#define SERV6_V4MAPPED_IP "::ffff:192.168.0.4" +#define SRC6_IP "::1" +#define SRC6_REWRITE_IP TEST_IPV6 +#define WILDCARD6_IP "::" +#define SERV6_PORT 6060 +#define SERV6_REWRITE_PORT 6666 + #define SERVUN_ADDRESS "bpf_cgroup_unix_test" #define SERVUN_REWRITE_ADDRESS "bpf_cgroup_unix_test_rewrite" -#define SRCUN_ADDRESS "bpf_cgroup_unix_test_src" +#define SRCUN_ADDRESS "bpf_cgroup_unix_test_src" + +#define save_errno_do(op) ({ int __save = errno; op; errno = __save; }) enum sock_addr_test_type { SOCK_ADDR_TEST_BIND, @@ -23,165 +63,1603 @@ enum sock_addr_test_type { SOCK_ADDR_TEST_GETPEERNAME, }; -typedef void *(*load_fn)(int cgroup_fd); +typedef void *(*load_fn)(int cgroup_fd, + enum bpf_attach_type attach_type, + bool expect_reject); typedef void (*destroy_fn)(void *skel); -struct sock_addr_test { - enum sock_addr_test_type type; - const char *name; - /* BPF prog properties */ - load_fn loadfn; - destroy_fn destroyfn; - /* Socket properties */ - int socket_family; - int socket_type; - /* IP:port pairs for BPF prog to override */ - const char *requested_addr; - unsigned short 
requested_port; - const char *expected_addr; - unsigned short expected_port; - const char *expected_src_addr; +static int cmp_addr(const struct sockaddr_storage *addr1, socklen_t addr1_len, + const struct sockaddr_storage *addr2, socklen_t addr2_len, + bool cmp_port); + +struct init_sock_args { + int af; + int type; }; -static void *connect_unix_prog_load(int cgroup_fd) -{ - struct connect_unix_prog *skel; +struct addr_args { + char addr[sizeof(struct sockaddr_storage)]; + int addrlen; +}; - skel = connect_unix_prog__open_and_load(); - if (!ASSERT_OK_PTR(skel, "skel_open")) - goto cleanup; +struct sendmsg_args { + struct addr_args addr; + char msg[10]; + int msglen; +}; - skel->links.connect_unix_prog = bpf_program__attach_cgroup( - skel->progs.connect_unix_prog, cgroup_fd); - if (!ASSERT_OK_PTR(skel->links.connect_unix_prog, "prog_attach")) - goto cleanup; +static struct sock_addr_kern *skel; - return skel; -cleanup: - connect_unix_prog__destroy(skel); - return NULL; +static int run_bpf_prog(const char *prog_name, void *ctx, int ctx_size) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct bpf_program *prog; + int prog_fd, err; + + topts.ctx_in = ctx; + topts.ctx_size_in = ctx_size; + + prog = bpf_object__find_program_by_name(skel->obj, prog_name); + if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) + goto err; + + prog_fd = bpf_program__fd(prog); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, prog_name)) + goto err; + + err = topts.retval; + errno = -topts.retval; + goto out; +err: + err = -1; +out: + return err; } -static void connect_unix_prog_destroy(void *skel) +static int kernel_init_sock(int af, int type, int protocol) { - connect_unix_prog__destroy(skel); + struct init_sock_args args = { + .af = af, + .type = type, + }; + + return run_bpf_prog("init_sock", &args, sizeof(args)); } -static void *sendmsg_unix_prog_load(int cgroup_fd) +static int kernel_close_sock(int fd) { - struct sendmsg_unix_prog *skel; + return 
run_bpf_prog("close_sock", NULL, 0); +} - skel = sendmsg_unix_prog__open_and_load(); - if (!ASSERT_OK_PTR(skel, "skel_open")) - goto cleanup; +static int sock_addr_op(const char *name, struct sockaddr *addr, + socklen_t *addrlen, bool expect_change) +{ + struct addr_args args; + int err; - skel->links.sendmsg_unix_prog = bpf_program__attach_cgroup( - skel->progs.sendmsg_unix_prog, cgroup_fd); - if (!ASSERT_OK_PTR(skel->links.sendmsg_unix_prog, "prog_attach")) - goto cleanup; + if (addrlen) + args.addrlen = *addrlen; - return skel; -cleanup: - sendmsg_unix_prog__destroy(skel); - return NULL; + if (addr) + memcpy(&args.addr, addr, *addrlen); + + err = run_bpf_prog(name, &args, sizeof(args)); + + if (!expect_change && addr) + if (!ASSERT_EQ(cmp_addr((struct sockaddr_storage *)addr, + *addrlen, + (struct sockaddr_storage *)&args.addr, + args.addrlen, 1), + 0, "address_param_modified")) + return -1; + + if (addrlen) + *addrlen = args.addrlen; + + if (addr) + memcpy(addr, &args.addr, *addrlen); + + return err; } -static void sendmsg_unix_prog_destroy(void *skel) +static int send_msg_op(const char *name, struct sockaddr *addr, + socklen_t addrlen, const char *msg, int msglen) { - sendmsg_unix_prog__destroy(skel); + struct sendmsg_args args; + int err; + + memset(&args, 0, sizeof(args)); + memcpy(&args.addr.addr, addr, addrlen); + args.addr.addrlen = addrlen; + memcpy(args.msg, msg, msglen); + args.msglen = msglen; + + err = run_bpf_prog(name, &args, sizeof(args)); + + if (!ASSERT_EQ(cmp_addr((struct sockaddr_storage *)addr, + addrlen, + (struct sockaddr_storage *)&args.addr.addr, + args.addr.addrlen, 1), + 0, "address_param_modified")) + return -1; + + return err; } -static void *recvmsg_unix_prog_load(int cgroup_fd) +static int kernel_connect(struct sockaddr *addr, socklen_t addrlen) { - struct recvmsg_unix_prog *skel; - - skel = recvmsg_unix_prog__open_and_load(); - if (!ASSERT_OK_PTR(skel, "skel_open")) - goto cleanup; + return sock_addr_op("kernel_connect", addr, 
&addrlen, false); +} - skel->links.recvmsg_unix_prog = bpf_program__attach_cgroup( - skel->progs.recvmsg_unix_prog, cgroup_fd); - if (!ASSERT_OK_PTR(skel->links.recvmsg_unix_prog, "prog_attach")) - goto cleanup; +static int kernel_bind(int fd, struct sockaddr *addr, socklen_t addrlen) +{ + return sock_addr_op("kernel_bind", addr, &addrlen, false); +} - return skel; -cleanup: - recvmsg_unix_prog__destroy(skel); - return NULL; +static int kernel_listen(void) +{ + return sock_addr_op("kernel_listen", NULL, NULL, false); } -static void recvmsg_unix_prog_destroy(void *skel) +static int kernel_sendmsg(int fd, struct sockaddr *addr, socklen_t addrlen, + char *msg, int msglen) { - recvmsg_unix_prog__destroy(skel); + return send_msg_op("kernel_sendmsg", addr, addrlen, msg, msglen); } -static void *getsockname_unix_prog_load(int cgroup_fd) +static int sock_sendmsg(int fd, struct sockaddr *addr, socklen_t addrlen, + char *msg, int msglen) { - struct getsockname_unix_prog *skel; + return send_msg_op("sock_sendmsg", addr, addrlen, msg, msglen); +} - skel = getsockname_unix_prog__open_and_load(); - if (!ASSERT_OK_PTR(skel, "skel_open")) - goto cleanup; +static int kernel_getsockname(int fd, struct sockaddr *addr, socklen_t *addrlen) +{ + return sock_addr_op("kernel_getsockname", addr, addrlen, true); +} - skel->links.getsockname_unix_prog = bpf_program__attach_cgroup( - skel->progs.getsockname_unix_prog, cgroup_fd); - if (!ASSERT_OK_PTR(skel->links.getsockname_unix_prog, "prog_attach")) - goto cleanup; +static int kernel_getpeername(int fd, struct sockaddr *addr, socklen_t *addrlen) +{ + return sock_addr_op("kernel_getpeername", addr, addrlen, true); +} - return skel; -cleanup: - getsockname_unix_prog__destroy(skel); - return NULL; +int kernel_connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t addrlen, + const struct network_helper_opts *opts) +{ + int err; + + if (!ASSERT_OK(kernel_init_sock(addr->ss_family, type, 0), + "kernel_init_sock")) + goto err; + + 
if (kernel_connect((struct sockaddr *)addr, addrlen) < 0) + goto err; + + /* Test code expects a "file descriptor" on success. */ + err = 1; + goto out; +err: + err = -1; + save_errno_do(ASSERT_OK(kernel_close_sock(0), "kernel_close_sock")); +out: + return err; } -static void getsockname_unix_prog_destroy(void *skel) +int kernel_start_server(int family, int type, const char *addr_str, __u16 port, + int timeout_ms) { - getsockname_unix_prog__destroy(skel); + struct sockaddr_storage addr; + socklen_t addrlen; + int err; + + if (!ASSERT_OK(kernel_init_sock(family, type, 0), "kernel_init_sock")) + goto err; + + if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) + goto err; + + if (kernel_bind(0, (struct sockaddr *)&addr, addrlen) < 0) + goto err; + + if (type == SOCK_STREAM) { + if (!ASSERT_OK(kernel_listen(), "kernel_listen")) + goto err; + } + + /* Test code expects a "file descriptor" on success. */ + err = 1; + goto out; +err: + err = -1; + save_errno_do(ASSERT_OK(kernel_close_sock(0), "kernel_close_sock")); +out: + return err; } -static void *getpeername_unix_prog_load(int cgroup_fd) +struct sock_ops { + int (*connect_to_addr)(int type, const struct sockaddr_storage *addr, + socklen_t addrlen, + const struct network_helper_opts *opts); + int (*start_server)(int family, int type, const char *addr_str, + __u16 port, int timeout_ms); + int (*socket)(int famil, int type, int protocol); + int (*bind)(int fd, struct sockaddr *addr, socklen_t addrlen); + int (*getsockname)(int fd, struct sockaddr *addr, socklen_t *addrlen); + int (*getpeername)(int fd, struct sockaddr *addr, socklen_t *addrlen); + int (*sendmsg)(int fd, struct sockaddr *addr, socklen_t addrlen, + char *msg, int msglen); + int (*close)(int fd); +}; + +static int user_sendmsg(int fd, struct sockaddr *addr, socklen_t addrlen, + char *msg, int msglen) { - struct getpeername_unix_prog *skel; + struct msghdr hdr; + struct iovec iov; - skel = getpeername_unix_prog__open_and_load(); - if 
(!ASSERT_OK_PTR(skel, "skel_open")) - goto cleanup; + memset(&iov, 0, sizeof(iov)); + iov.iov_base = msg; + iov.iov_len = msglen; - skel->links.getpeername_unix_prog = bpf_program__attach_cgroup( - skel->progs.getpeername_unix_prog, cgroup_fd); - if (!ASSERT_OK_PTR(skel->links.getpeername_unix_prog, "prog_attach")) - goto cleanup; + memset(&hdr, 0, sizeof(hdr)); + hdr.msg_name = (void *)addr; + hdr.msg_namelen = addrlen; + hdr.msg_iov = &iov; + hdr.msg_iovlen = 1; - return skel; -cleanup: - getpeername_unix_prog__destroy(skel); - return NULL; + return sendmsg(fd, &hdr, 0); } -static void getpeername_unix_prog_destroy(void *skel) +static int user_bind(int fd, struct sockaddr *addr, socklen_t addrlen) { - getpeername_unix_prog__destroy(skel); + return bind(fd, (const struct sockaddr *)addr, addrlen); } +struct sock_ops user_ops = { + .connect_to_addr = connect_to_addr, + .start_server = start_server, + .socket = socket, + .bind = user_bind, + .getsockname = getsockname, + .getpeername = getpeername, + .sendmsg = user_sendmsg, + .close = close, +}; + +struct sock_ops kern_ops_sock_sendmsg = { + .connect_to_addr = kernel_connect_to_addr, + .start_server = kernel_start_server, + .socket = kernel_init_sock, + .bind = kernel_bind, + .getsockname = kernel_getsockname, + .getpeername = kernel_getpeername, + .sendmsg = sock_sendmsg, + .close = kernel_close_sock, +}; + +struct sock_ops kern_ops_kernel_sendmsg = { + .connect_to_addr = kernel_connect_to_addr, + .start_server = kernel_start_server, + .socket = kernel_init_sock, + .bind = kernel_bind, + .getsockname = kernel_getsockname, + .getpeername = kernel_getpeername, + .sendmsg = kernel_sendmsg, + .close = kernel_close_sock, +}; + +struct sock_addr_test { + enum sock_addr_test_type type; + const char *name; + /* BPF prog properties */ + load_fn loadfn; + destroy_fn destroyfn; + enum bpf_attach_type attach_type; + /* Socket operations */ + struct sock_ops *ops; + /* Socket properties */ + int socket_family; + int 
socket_type; + /* IP:port pairs for BPF prog to override */ + const char *requested_addr; + unsigned short requested_port; + const char *expected_addr; + unsigned short expected_port; + const char *expected_src_addr; + /* Expected test result */ + enum { + LOAD_REJECT, + ATTACH_REJECT, + SYSCALL_EPERM, + SYSCALL_ENOTSUPP, + SUCCESS, + } expected_result; +}; + +#define BPF_SKEL_FUNCS_RAW(skel_name, prog_name) \ +static void *prog_name##_load_raw(int cgroup_fd, \ + enum bpf_attach_type attach_type, \ + bool expect_reject) \ +{ \ + struct skel_name *skel = skel_name##__open(); \ + int prog_fd = -1; \ + if (!ASSERT_OK_PTR(skel, "skel_open")) \ + goto cleanup; \ + if (!ASSERT_OK(skel_name##__load(skel), "load")) \ + goto cleanup; \ + prog_fd = bpf_program__fd(skel->progs.prog_name); \ + if (!ASSERT_GT(prog_fd, 0, "prog_fd")) \ + goto cleanup; \ + if (bpf_prog_attach(prog_fd, cgroup_fd, attach_type, \ + BPF_F_ALLOW_OVERRIDE), "bpf_prog_attach") { \ + ASSERT_TRUE(expect_reject, "unexpected rejection"); \ + goto cleanup; \ + } \ + if (!ASSERT_FALSE(expect_reject, "expected rejection")) \ + goto cleanup; \ +cleanup: \ + if (prog_fd > 0) \ + bpf_prog_detach(cgroup_fd, attach_type); \ + skel_name##__destroy(skel); \ + return NULL; \ +} \ +static void prog_name##_destroy_raw(void *progfd) \ +{ \ + /* No-op. *_load_raw does all cleanup. 
*/ \ +} \ + +#define BPF_SKEL_FUNCS(skel_name, prog_name) \ +static void *prog_name##_load(int cgroup_fd, \ + enum bpf_attach_type attach_type, \ + bool expect_reject) \ +{ \ + struct skel_name *skel = skel_name##__open(); \ + if (!ASSERT_OK_PTR(skel, "skel_open")) \ + goto cleanup; \ + if (!ASSERT_OK(bpf_program__set_expected_attach_type(skel->progs.prog_name, \ + attach_type), \ + "set_expected_attach_type")) \ + goto cleanup; \ + if (skel_name##__load(skel)) { \ + ASSERT_TRUE(expect_reject, "unexpected rejection"); \ + goto cleanup; \ + } \ + if (!ASSERT_FALSE(expect_reject, "expected rejection")) \ + goto cleanup; \ + skel->links.prog_name = bpf_program__attach_cgroup( \ + skel->progs.prog_name, cgroup_fd); \ + if (!ASSERT_OK_PTR(skel->links.prog_name, "prog_attach")) \ + goto cleanup; \ + return skel; \ +cleanup: \ + skel_name##__destroy(skel); \ + return NULL; \ +} \ +static void prog_name##_destroy(void *skel) \ +{ \ + skel_name##__destroy(skel); \ +} + +BPF_SKEL_FUNCS(bind4_prog, bind_v4_prog); +BPF_SKEL_FUNCS_RAW(bind4_prog, bind_v4_prog); +BPF_SKEL_FUNCS(bind4_prog, bind_v4_deny_prog); +BPF_SKEL_FUNCS(bind6_prog, bind_v6_prog); +BPF_SKEL_FUNCS_RAW(bind6_prog, bind_v6_prog); +BPF_SKEL_FUNCS(bind6_prog, bind_v6_deny_prog); +BPF_SKEL_FUNCS(connect4_prog, connect_v4_prog); +BPF_SKEL_FUNCS_RAW(connect4_prog, connect_v4_prog); +BPF_SKEL_FUNCS(connect4_prog, connect_v4_deny_prog); +BPF_SKEL_FUNCS(connect6_prog, connect_v6_prog); +BPF_SKEL_FUNCS_RAW(connect6_prog, connect_v6_prog); +BPF_SKEL_FUNCS(connect6_prog, connect_v6_deny_prog); +BPF_SKEL_FUNCS(connect_unix_prog, connect_unix_prog); +BPF_SKEL_FUNCS_RAW(connect_unix_prog, connect_unix_prog); +BPF_SKEL_FUNCS(connect_unix_prog, connect_unix_deny_prog); +BPF_SKEL_FUNCS(sendmsg4_prog, sendmsg_v4_prog); +BPF_SKEL_FUNCS_RAW(sendmsg4_prog, sendmsg_v4_prog); +BPF_SKEL_FUNCS(sendmsg4_prog, sendmsg_v4_deny_prog); +BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_prog); +BPF_SKEL_FUNCS_RAW(sendmsg6_prog, sendmsg_v6_prog); 
+BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_deny_prog); +BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_preserve_dst_prog); +BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_v4mapped_prog); +BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_wildcard_prog); +BPF_SKEL_FUNCS(sendmsg_unix_prog, sendmsg_unix_prog); +BPF_SKEL_FUNCS_RAW(sendmsg_unix_prog, sendmsg_unix_prog); +BPF_SKEL_FUNCS(sendmsg_unix_prog, sendmsg_unix_deny_prog); +BPF_SKEL_FUNCS(recvmsg4_prog, recvmsg4_prog); +BPF_SKEL_FUNCS_RAW(recvmsg4_prog, recvmsg4_prog); +BPF_SKEL_FUNCS(recvmsg6_prog, recvmsg6_prog); +BPF_SKEL_FUNCS_RAW(recvmsg6_prog, recvmsg6_prog); +BPF_SKEL_FUNCS(recvmsg_unix_prog, recvmsg_unix_prog); +BPF_SKEL_FUNCS_RAW(recvmsg_unix_prog, recvmsg_unix_prog); +BPF_SKEL_FUNCS(getsockname_unix_prog, getsockname_unix_prog); +BPF_SKEL_FUNCS_RAW(getsockname_unix_prog, getsockname_unix_prog); +BPF_SKEL_FUNCS(getsockname4_prog, getsockname_v4_prog); +BPF_SKEL_FUNCS_RAW(getsockname4_prog, getsockname_v4_prog); +BPF_SKEL_FUNCS(getsockname6_prog, getsockname_v6_prog); +BPF_SKEL_FUNCS_RAW(getsockname6_prog, getsockname_v6_prog); +BPF_SKEL_FUNCS(getpeername_unix_prog, getpeername_unix_prog); +BPF_SKEL_FUNCS_RAW(getpeername_unix_prog, getpeername_unix_prog); +BPF_SKEL_FUNCS(getpeername4_prog, getpeername_v4_prog); +BPF_SKEL_FUNCS_RAW(getpeername4_prog, getpeername_v4_prog); +BPF_SKEL_FUNCS(getpeername6_prog, getpeername_v6_prog); +BPF_SKEL_FUNCS_RAW(getpeername6_prog, getpeername_v6_prog); + static struct sock_addr_test tests[] = { + /* bind - system calls */ + { + SOCK_ADDR_TEST_BIND, + "bind4: bind (stream)", + bind_v4_prog_load, + bind_v4_prog_destroy, + BPF_CGROUP_INET4_BIND, + &user_ops, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_BIND, + "bind4: bind deny (stream)", + bind_v4_deny_prog_load, + bind_v4_deny_prog_destroy, + BPF_CGROUP_INET4_BIND, + &user_ops, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + 
SERV4_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_BIND, + "bind4: bind (dgram)", + bind_v4_prog_load, + bind_v4_prog_destroy, + BPF_CGROUP_INET4_BIND, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_BIND, + "bind4: bind deny (dgram)", + bind_v4_deny_prog_load, + bind_v4_deny_prog_destroy, + BPF_CGROUP_INET4_BIND, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_BIND, + "bind4: load prog with wrong expected attach type", + bind_v4_prog_load, + bind_v4_prog_destroy, + BPF_CGROUP_INET6_BIND, + &user_ops, + AF_INET, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + LOAD_REJECT, + }, + { + SOCK_ADDR_TEST_BIND, + "bind4: attach prog with wrong attach type", + bind_v4_prog_load_raw, + bind_v4_prog_destroy_raw, + BPF_CGROUP_INET6_BIND, + &user_ops, + AF_INET, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + ATTACH_REJECT, + }, + { + SOCK_ADDR_TEST_BIND, + "bind6: bind (stream)", + bind_v6_prog_load, + bind_v6_prog_destroy, + BPF_CGROUP_INET6_BIND, + &user_ops, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_BIND, + "bind6: bind deny (stream)", + bind_v6_deny_prog_load, + bind_v6_deny_prog_destroy, + BPF_CGROUP_INET6_BIND, + &user_ops, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_BIND, + "bind6: bind (dgram)", + bind_v6_prog_load, + bind_v6_prog_destroy, + BPF_CGROUP_INET6_BIND, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_BIND, + "bind6: bind deny (dgram)", + bind_v6_deny_prog_load, + bind_v6_deny_prog_destroy, + BPF_CGROUP_INET6_BIND, + &user_ops, 
+ AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_BIND, + "bind6: load prog with wrong expected attach type", + bind_v6_prog_load, + bind_v6_prog_destroy, + BPF_CGROUP_INET4_BIND, + &user_ops, + AF_INET6, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + LOAD_REJECT, + }, + { + SOCK_ADDR_TEST_BIND, + "bind6: attach prog with wrong attach type", + bind_v6_prog_load_raw, + bind_v6_prog_destroy_raw, + BPF_CGROUP_INET4_BIND, + &user_ops, + AF_INET, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + ATTACH_REJECT, + }, + + /* bind - kernel calls */ + { + SOCK_ADDR_TEST_BIND, + "bind4: kernel_bind (stream)", + bind_v4_prog_load, + bind_v4_prog_destroy, + BPF_CGROUP_INET4_BIND, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_BIND, + "bind4: kernel_bind deny (stream)", + bind_v4_deny_prog_load, + bind_v4_deny_prog_destroy, + BPF_CGROUP_INET4_BIND, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_BIND, + "bind4: kernel_bind (dgram)", + bind_v4_prog_load, + bind_v4_prog_destroy, + BPF_CGROUP_INET4_BIND, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_BIND, + "bind4: kernel_bind deny (dgram)", + bind_v4_deny_prog_load, + bind_v4_deny_prog_destroy, + BPF_CGROUP_INET4_BIND, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_BIND, + "bind6: kernel_bind (stream)", + bind_v6_prog_load, + bind_v6_prog_destroy, + BPF_CGROUP_INET6_BIND, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + 
SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_BIND, + "bind6: kernel_bind deny (stream)", + bind_v6_deny_prog_load, + bind_v6_deny_prog_destroy, + BPF_CGROUP_INET6_BIND, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_BIND, + "bind6: kernel_bind (dgram)", + bind_v6_prog_load, + bind_v6_prog_destroy, + BPF_CGROUP_INET6_BIND, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_BIND, + "bind6: kernel_bind deny (dgram)", + bind_v6_deny_prog_load, + bind_v6_deny_prog_destroy, + BPF_CGROUP_INET6_BIND, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, + + /* connect - system calls */ + { + SOCK_ADDR_TEST_CONNECT, + "connect4: connect (stream)", + connect_v4_prog_load, + connect_v4_prog_destroy, + BPF_CGROUP_INET4_CONNECT, + &user_ops, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: connect deny (stream)", + connect_v4_deny_prog_load, + connect_v4_deny_prog_destroy, + BPF_CGROUP_INET4_CONNECT, + &user_ops, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: connect (dgram)", + connect_v4_prog_load, + connect_v4_prog_destroy, + BPF_CGROUP_INET4_CONNECT, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: connect deny (dgram)", + connect_v4_deny_prog_load, + connect_v4_deny_prog_destroy, + BPF_CGROUP_INET4_CONNECT, + &user_ops, + 
AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: load prog with wrong expected attach type", + connect_v4_prog_load, + connect_v4_prog_destroy, + BPF_CGROUP_INET6_CONNECT, + &user_ops, + AF_INET, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + LOAD_REJECT, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: attach prog with wrong attach type", + connect_v4_prog_load_raw, + connect_v4_prog_destroy_raw, + BPF_CGROUP_INET6_CONNECT, + &user_ops, + AF_INET, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + ATTACH_REJECT, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: connect (stream)", + connect_v6_prog_load, + connect_v6_prog_destroy, + BPF_CGROUP_INET6_CONNECT, + &user_ops, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: connect deny (stream)", + connect_v6_deny_prog_load, + connect_v6_deny_prog_destroy, + BPF_CGROUP_INET6_CONNECT, + &user_ops, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: connect (dgram)", + connect_v6_prog_load, + connect_v6_prog_destroy, + BPF_CGROUP_INET6_CONNECT, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: connect deny (dgram)", + connect_v6_deny_prog_load, + connect_v6_deny_prog_destroy, + BPF_CGROUP_INET6_CONNECT, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: load prog with wrong expected attach type", + connect_v6_prog_load, + connect_v6_prog_destroy, + BPF_CGROUP_INET4_CONNECT, + 
&user_ops, + AF_INET6, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + LOAD_REJECT, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: attach prog with wrong attach type", + connect_v6_prog_load_raw, + connect_v6_prog_destroy_raw, + BPF_CGROUP_INET4_CONNECT, + &user_ops, + AF_INET, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + ATTACH_REJECT, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect_unix: connect (stream)", + connect_unix_prog_load, + connect_unix_prog_destroy, + BPF_CGROUP_UNIX_CONNECT, + &user_ops, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect_unix: connect deny (stream)", + connect_unix_deny_prog_load, + connect_unix_deny_prog_destroy, + BPF_CGROUP_UNIX_CONNECT, + &user_ops, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect_unix: attach prog with wrong attach type", + connect_unix_prog_load_raw, + connect_unix_prog_destroy_raw, + BPF_CGROUP_INET4_CONNECT, + &user_ops, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + ATTACH_REJECT, + }, + + /* connect - kernel calls */ + { + SOCK_ADDR_TEST_CONNECT, + "connect4: kernel_connect (stream)", + connect_v4_prog_load, + connect_v4_prog_destroy, + BPF_CGROUP_INET4_CONNECT, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: kernel_connect deny (stream)", + connect_v4_deny_prog_load, + connect_v4_deny_prog_destroy, + BPF_CGROUP_INET4_CONNECT, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: kernel_connect (dgram)", + connect_v4_prog_load, + connect_v4_prog_destroy, + 
BPF_CGROUP_INET4_CONNECT, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: kernel_connect deny (dgram)", + connect_v4_deny_prog_load, + connect_v4_deny_prog_destroy, + BPF_CGROUP_INET4_CONNECT, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: kernel_connect (stream)", + connect_v6_prog_load, + connect_v6_prog_destroy, + BPF_CGROUP_INET6_CONNECT, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: kernel_connect deny (stream)", + connect_v6_deny_prog_load, + connect_v6_deny_prog_destroy, + BPF_CGROUP_INET6_CONNECT, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: kernel_connect (dgram)", + connect_v6_prog_load, + connect_v6_prog_destroy, + BPF_CGROUP_INET6_CONNECT, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: kernel_connect deny (dgram)", + connect_v6_deny_prog_load, + connect_v6_deny_prog_destroy, + BPF_CGROUP_INET6_CONNECT, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect_unix: kernel_connect (dgram)", + connect_unix_prog_load, + connect_unix_prog_destroy, + BPF_CGROUP_UNIX_CONNECT, + &kern_ops_sock_sendmsg, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + 
SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect_unix: kernel_connect deny (dgram)", + connect_unix_deny_prog_load, + connect_unix_deny_prog_destroy, + BPF_CGROUP_UNIX_CONNECT, + &kern_ops_sock_sendmsg, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SYSCALL_EPERM, + }, + + /* sendmsg - system calls */ + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: sendmsg (dgram)", + sendmsg_v4_prog_load, + sendmsg_v4_prog_destroy, + BPF_CGROUP_UDP4_SENDMSG, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: sendmsg deny (dgram)", + sendmsg_v4_deny_prog_load, + sendmsg_v4_deny_prog_destroy, + BPF_CGROUP_UDP4_SENDMSG, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: load prog with wrong expected attach type", + sendmsg_v4_prog_load, + sendmsg_v4_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &user_ops, + AF_INET, + SOCK_DGRAM, + NULL, + 0, + NULL, + 0, + NULL, + LOAD_REJECT, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: attach prog with wrong attach type", + sendmsg_v4_prog_load_raw, + sendmsg_v4_prog_destroy_raw, + BPF_CGROUP_UDP6_SENDMSG, + &user_ops, + AF_INET, + SOCK_DGRAM, + NULL, + 0, + NULL, + 0, + NULL, + ATTACH_REJECT, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sendmsg (dgram)", + sendmsg_v6_prog_load, + sendmsg_v6_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sendmsg [::] (BSD'ism) (dgram)", + sendmsg_v6_preserve_dst_prog_load, + sendmsg_v6_preserve_dst_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + 
WILDCARD6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_PORT, + SRC6_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sendmsg deny (dgram)", + sendmsg_v6_deny_prog_load, + sendmsg_v6_deny_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sendmsg IPv4-mapped IPv6 (dgram)", + sendmsg_v6_v4mapped_prog_load, + sendmsg_v6_v4mapped_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_ENOTSUPP, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sendmsg dst IP = [::] (BSD'ism) (dgram)", + sendmsg_v6_wildcard_prog_load, + sendmsg_v6_wildcard_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: load prog with wrong expected attach type", + sendmsg_v6_prog_load, + sendmsg_v6_prog_destroy, + BPF_CGROUP_UDP4_SENDMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + NULL, + 0, + NULL, + 0, + NULL, + LOAD_REJECT, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: attach prog with wrong attach type", + sendmsg_v6_prog_load_raw, + sendmsg_v6_prog_destroy_raw, + BPF_CGROUP_UDP4_SENDMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + NULL, + 0, + NULL, + 0, + NULL, + ATTACH_REJECT, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg_unix: sendmsg (dgram)", + sendmsg_unix_prog_load, + sendmsg_unix_prog_destroy, + BPF_CGROUP_UNIX_SENDMSG, + &user_ops, + AF_UNIX, + SOCK_DGRAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg_unix: sendmsg deny (dgram)", + sendmsg_unix_deny_prog_load, + sendmsg_unix_deny_prog_destroy, + BPF_CGROUP_UNIX_SENDMSG, + &user_ops, + 
AF_UNIX, + SOCK_DGRAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg_unix: attach prog with wrong attach type", + sendmsg_unix_prog_load_raw, + sendmsg_unix_prog_destroy_raw, + BPF_CGROUP_UDP4_SENDMSG, + &user_ops, + AF_UNIX, + SOCK_DGRAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + ATTACH_REJECT, + }, + + /* sendmsg - kernel calls (sock_sendmsg) */ + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: sock_sendmsg (dgram)", + sendmsg_v4_prog_load, + sendmsg_v4_prog_destroy, + BPF_CGROUP_UDP4_SENDMSG, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: sock_sendmsg deny (dgram)", + sendmsg_v4_deny_prog_load, + sendmsg_v4_deny_prog_destroy, + BPF_CGROUP_UDP4_SENDMSG, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sock_sendmsg (dgram)", + sendmsg_v6_prog_load, + sendmsg_v6_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sock_sendmsg [::] (BSD'ism) (dgram)", + sendmsg_v6_preserve_dst_prog_load, + sendmsg_v6_preserve_dst_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_DGRAM, + WILDCARD6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_PORT, + SRC6_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sock_sendmsg deny (dgram)", + sendmsg_v6_deny_prog_load, + sendmsg_v6_deny_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + 
SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg_unix: sock_sendmsg (dgram)", + sendmsg_unix_prog_load, + sendmsg_unix_prog_destroy, + BPF_CGROUP_UNIX_SENDMSG, + &kern_ops_sock_sendmsg, + AF_UNIX, + SOCK_DGRAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SUCCESS, + }, { - SOCK_ADDR_TEST_CONNECT, - "connect_unix", - connect_unix_prog_load, - connect_unix_prog_destroy, + SOCK_ADDR_TEST_SENDMSG, + "sendmsg_unix: sock_sendmsg deny (dgram)", + sendmsg_unix_deny_prog_load, + sendmsg_unix_deny_prog_destroy, + BPF_CGROUP_UNIX_SENDMSG, + &kern_ops_sock_sendmsg, AF_UNIX, - SOCK_STREAM, + SOCK_DGRAM, SERVUN_ADDRESS, 0, SERVUN_REWRITE_ADDRESS, 0, NULL, + SYSCALL_EPERM, + }, + + /* sendmsg - kernel calls (kernel_sendmsg) */ + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: kernel_sendmsg (dgram)", + sendmsg_v4_prog_load, + sendmsg_v4_prog_destroy, + BPF_CGROUP_UDP4_SENDMSG, + &kern_ops_kernel_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: kernel_sendmsg deny (dgram)", + sendmsg_v4_deny_prog_load, + sendmsg_v4_deny_prog_destroy, + BPF_CGROUP_UDP4_SENDMSG, + &kern_ops_kernel_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: kernel_sendmsg (dgram)", + sendmsg_v6_prog_load, + sendmsg_v6_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &kern_ops_kernel_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: kernel_sendmsg [::] (BSD'ism) (dgram)", + sendmsg_v6_preserve_dst_prog_load, + sendmsg_v6_preserve_dst_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &kern_ops_kernel_sendmsg, + AF_INET6, + SOCK_DGRAM, + WILDCARD6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_PORT, + SRC6_IP, + 
SUCCESS, }, { SOCK_ADDR_TEST_SENDMSG, - "sendmsg_unix", + "sendmsg6: kernel_sendmsg deny (dgram)", + sendmsg_v6_deny_prog_load, + sendmsg_v6_deny_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &kern_ops_kernel_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg_unix: sock_sendmsg (dgram)", sendmsg_unix_prog_load, sendmsg_unix_prog_destroy, + BPF_CGROUP_UNIX_SENDMSG, + &kern_ops_kernel_sendmsg, + AF_UNIX, + SOCK_DGRAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg_unix: kernel_sendmsg deny (dgram)", + sendmsg_unix_deny_prog_load, + sendmsg_unix_deny_prog_destroy, + BPF_CGROUP_UNIX_SENDMSG, + &kern_ops_kernel_sendmsg, AF_UNIX, SOCK_DGRAM, SERVUN_ADDRESS, @@ -189,12 +1667,81 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, NULL, + SYSCALL_EPERM, + }, + + /* recvmsg - system calls */ + { + SOCK_ADDR_TEST_RECVMSG, + "recvmsg4: recvfrom (dgram)", + recvmsg4_prog_load, + recvmsg4_prog_destroy, + BPF_CGROUP_UDP4_RECVMSG, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_RECVMSG, + "recvmsg4: attach prog with wrong attach type", + recvmsg4_prog_load_raw, + recvmsg4_prog_destroy_raw, + BPF_CGROUP_UDP6_RECVMSG, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + ATTACH_REJECT, + }, + { + SOCK_ADDR_TEST_RECVMSG, + "recvmsg6: recvfrom (dgram)", + recvmsg6_prog_load, + recvmsg6_prog_destroy, + BPF_CGROUP_UDP6_RECVMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_RECVMSG, + "recvmsg6: attach prog with wrong attach type", + 
recvmsg6_prog_load_raw, + recvmsg6_prog_destroy_raw, + BPF_CGROUP_UDP4_RECVMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + ATTACH_REJECT, }, { SOCK_ADDR_TEST_RECVMSG, - "recvmsg_unix-dgram", + "recvmsg_unix: recvfrom (dgram)", recvmsg_unix_prog_load, recvmsg_unix_prog_destroy, + BPF_CGROUP_UNIX_RECVMSG, + &user_ops, AF_UNIX, SOCK_DGRAM, SERVUN_REWRITE_ADDRESS, @@ -202,12 +1749,15 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, SERVUN_ADDRESS, + SUCCESS, }, { SOCK_ADDR_TEST_RECVMSG, - "recvmsg_unix-stream", + "recvmsg_unix: recvfrom (stream)", recvmsg_unix_prog_load, recvmsg_unix_prog_destroy, + BPF_CGROUP_UNIX_RECVMSG, + &user_ops, AF_UNIX, SOCK_STREAM, SERVUN_REWRITE_ADDRESS, @@ -215,12 +1765,357 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, SERVUN_ADDRESS, + SUCCESS, + }, + { + SOCK_ADDR_TEST_RECVMSG, + "recvmsg_unix: attach prog with wrong attach type", + recvmsg_unix_prog_load_raw, + recvmsg_unix_prog_destroy_raw, + BPF_CGROUP_UDP4_RECVMSG, + &user_ops, + AF_INET6, + SOCK_STREAM, + SERVUN_REWRITE_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + SERVUN_ADDRESS, + ATTACH_REJECT, + }, + + /* getsockname - system calls */ + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname4: getsockname (stream)", + getsockname_v4_prog_load, + getsockname_v4_prog_destroy, + BPF_CGROUP_INET4_GETSOCKNAME, + &user_ops, + AF_INET, + SOCK_STREAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname4: getsockname (dgram)", + getsockname_v4_prog_load, + getsockname_v4_prog_destroy, + BPF_CGROUP_INET4_GETSOCKNAME, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname4: attach prog with wrong attach type", + getsockname_v4_prog_load_raw, + 
getsockname_v4_prog_destroy_raw, + BPF_CGROUP_INET6_GETSOCKNAME, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + ATTACH_REJECT, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname6: getsockname (stream)", + getsockname_v6_prog_load, + getsockname_v6_prog_destroy, + BPF_CGROUP_INET6_GETSOCKNAME, + &user_ops, + AF_INET6, + SOCK_STREAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname6: getsockname (dgram)", + getsockname_v6_prog_load, + getsockname_v6_prog_destroy, + BPF_CGROUP_INET6_GETSOCKNAME, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname6: attach prog with wrong attach type", + getsockname_v6_prog_load_raw, + getsockname_v6_prog_destroy_raw, + BPF_CGROUP_INET4_GETSOCKNAME, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + ATTACH_REJECT, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname_unix: getsockname", + getsockname_unix_prog_load, + getsockname_unix_prog_destroy, + BPF_CGROUP_UNIX_GETSOCKNAME, + &user_ops, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname_unix: attach prog with wrong attach type", + getsockname_unix_prog_load_raw, + getsockname_unix_prog_destroy_raw, + BPF_CGROUP_INET4_GETSOCKNAME, + &user_ops, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + ATTACH_REJECT, + }, + + /* getsockname - kernel calls */ + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname4: kernel_getsockname (stream)", + getsockname_v4_prog_load, + getsockname_v4_prog_destroy, + BPF_CGROUP_INET4_GETSOCKNAME, + &kern_ops_kernel_sendmsg, + AF_INET, + SOCK_STREAM, + SERV4_REWRITE_IP, + 
SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname4: kernel_getsockname (dgram)", + getsockname_v4_prog_load, + getsockname_v4_prog_destroy, + BPF_CGROUP_INET4_GETSOCKNAME, + &kern_ops_kernel_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname6: kernel_getsockname (stream)", + getsockname_v6_prog_load, + getsockname_v6_prog_destroy, + BPF_CGROUP_INET6_GETSOCKNAME, + &kern_ops_kernel_sendmsg, + AF_INET6, + SOCK_STREAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname6: kernel_getsockname (dgram)", + getsockname_v6_prog_load, + getsockname_v6_prog_destroy, + BPF_CGROUP_INET6_GETSOCKNAME, + &kern_ops_kernel_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, }, { SOCK_ADDR_TEST_GETSOCKNAME, - "getsockname_unix", + "getsockname_unix: kernel_getsockname", getsockname_unix_prog_load, getsockname_unix_prog_destroy, + BPF_CGROUP_UNIX_GETSOCKNAME, + &kern_ops_kernel_sendmsg, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SUCCESS, + }, + + /* getpeername - system calls */ + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername4: getpeername (stream)", + getpeername_v4_prog_load, + getpeername_v4_prog_destroy, + BPF_CGROUP_INET4_GETPEERNAME, + &user_ops, + AF_INET, + SOCK_STREAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername4: getpeername (dgram)", + getpeername_v4_prog_load, + getpeername_v4_prog_destroy, + BPF_CGROUP_INET4_GETPEERNAME, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + 
"getpeername4: attach prog with wrong attach type", + getpeername_v4_prog_load_raw, + getpeername_v4_prog_destroy_raw, + BPF_CGROUP_INET6_GETSOCKNAME, + &user_ops, + AF_UNIX, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + ATTACH_REJECT, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername6: getpeername (stream)", + getpeername_v6_prog_load, + getpeername_v6_prog_destroy, + BPF_CGROUP_INET6_GETPEERNAME, + &user_ops, + AF_INET6, + SOCK_STREAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername6: getpeername (dgram)", + getpeername_v6_prog_load, + getpeername_v6_prog_destroy, + BPF_CGROUP_INET6_GETPEERNAME, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername6: attach prog with wrong attach type", + getpeername_v6_prog_load_raw, + getpeername_v6_prog_destroy_raw, + BPF_CGROUP_INET4_GETSOCKNAME, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + ATTACH_REJECT, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername_unix: getpeername", + getpeername_unix_prog_load, + getpeername_unix_prog_destroy, + BPF_CGROUP_UNIX_GETPEERNAME, + &user_ops, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername_unix: attach prog with wrong attach type", + getpeername_unix_prog_load_raw, + getpeername_unix_prog_destroy_raw, + BPF_CGROUP_INET4_GETSOCKNAME, + &user_ops, AF_UNIX, SOCK_STREAM, SERVUN_ADDRESS, @@ -228,12 +2123,81 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, NULL, + ATTACH_REJECT, + }, + + /* getpeername - kernel calls */ + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername4: kernel_getpeername (stream)", + getpeername_v4_prog_load, + 
getpeername_v4_prog_destroy, + BPF_CGROUP_INET4_GETPEERNAME, + &kern_ops_kernel_sendmsg, + AF_INET, + SOCK_STREAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername4: kernel_getpeername (dgram)", + getpeername_v4_prog_load, + getpeername_v4_prog_destroy, + BPF_CGROUP_INET4_GETPEERNAME, + &kern_ops_kernel_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername6: kernel_getpeername (stream)", + getpeername_v6_prog_load, + getpeername_v6_prog_destroy, + BPF_CGROUP_INET6_GETPEERNAME, + &kern_ops_kernel_sendmsg, + AF_INET6, + SOCK_STREAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, }, { SOCK_ADDR_TEST_GETPEERNAME, - "getpeername_unix", + "getpeername6: kernel_getpeername (dgram)", + getpeername_v6_prog_load, + getpeername_v6_prog_destroy, + BPF_CGROUP_INET6_GETPEERNAME, + &kern_ops_kernel_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername_unix: kernel_getpeername", getpeername_unix_prog_load, getpeername_unix_prog_destroy, + BPF_CGROUP_UNIX_GETPEERNAME, + &kern_ops_kernel_sendmsg, AF_UNIX, SOCK_STREAM, SERVUN_ADDRESS, @@ -241,6 +2205,7 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, NULL, + SUCCESS, }, }; @@ -294,28 +2259,40 @@ static int cmp_sock_addr(info_fn fn, int sock1, return cmp_addr(&addr1, len1, addr2, addr2_len, cmp_port); } -static int cmp_local_addr(int sock1, const struct sockaddr_storage *addr2, - socklen_t addr2_len, bool cmp_port) +static int load_sock_addr_kern(void) { - return cmp_sock_addr(getsockname, sock1, addr2, addr2_len, cmp_port); + int err; + + skel = sock_addr_kern__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel")) + goto err; + + err = 0; + goto 
out; +err: + err = -1; +out: + return err; } -static int cmp_peer_addr(int sock1, const struct sockaddr_storage *addr2, - socklen_t addr2_len, bool cmp_port) +static void unload_sock_addr_kern(void) { - return cmp_sock_addr(getpeername, sock1, addr2, addr2_len, cmp_port); + sock_addr_kern__destroy(skel); } -static void test_bind(struct sock_addr_test *test) +static int test_bind(struct sock_addr_test *test) { struct sockaddr_storage expected_addr; socklen_t expected_addr_len = sizeof(struct sockaddr_storage); int serv = -1, client = -1, err; - serv = start_server(test->socket_family, test->socket_type, - test->requested_addr, test->requested_port, 0); - if (!ASSERT_GE(serv, 0, "start_server")) - goto cleanup; + serv = test->ops->start_server(test->socket_family, test->socket_type, + test->requested_addr, + test->requested_port, 0); + if (serv < 0) { + err = errno; + goto err; + } err = make_sockaddr(test->socket_family, test->expected_addr, test->expected_port, @@ -323,23 +2300,28 @@ static void test_bind(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - err = cmp_local_addr(serv, &expected_addr, expected_addr_len, true); + err = cmp_sock_addr(test->ops->getsockname, serv, &expected_addr, + expected_addr_len, true); if (!ASSERT_EQ(err, 0, "cmp_local_addr")) goto cleanup; /* Try to connect to server just in case */ - client = connect_to_addr(&expected_addr, expected_addr_len, test->socket_type); + client = connect_to_addr(test->socket_type, &expected_addr, expected_addr_len, NULL); if (!ASSERT_GE(client, 0, "connect_to_addr")) goto cleanup; cleanup: + err = 0; +err: if (client != -1) close(client); if (serv != -1) - close(serv); + test->ops->close(serv); + + return err; } -static void test_connect(struct sock_addr_test *test) +static int test_connect(struct sock_addr_test *test) { struct sockaddr_storage addr, expected_addr, expected_src_addr; socklen_t addr_len = sizeof(struct sockaddr_storage), @@ -357,9 +2339,12 @@ static void 
test_connect(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - client = connect_to_addr(&addr, addr_len, test->socket_type); - if (!ASSERT_GE(client, 0, "connect_to_addr")) - goto cleanup; + client = test->ops->connect_to_addr(test->socket_type, &addr, addr_len, + NULL); + if (client < 0) { + err = errno; + goto err; + } err = make_sockaddr(test->socket_family, test->expected_addr, test->expected_port, &expected_addr, &expected_addr_len); @@ -373,29 +2358,34 @@ static void test_connect(struct sock_addr_test *test) goto cleanup; } - err = cmp_peer_addr(client, &expected_addr, expected_addr_len, true); + err = cmp_sock_addr(test->ops->getpeername, client, &expected_addr, + expected_addr_len, true); if (!ASSERT_EQ(err, 0, "cmp_peer_addr")) goto cleanup; if (test->expected_src_addr) { - err = cmp_local_addr(client, &expected_src_addr, expected_src_addr_len, false); + err = cmp_sock_addr(test->ops->getsockname, client, + &expected_src_addr, expected_src_addr_len, + false); if (!ASSERT_EQ(err, 0, "cmp_local_addr")) goto cleanup; } cleanup: + err = 0; +err: if (client != -1) - close(client); + test->ops->close(client); if (serv != -1) close(serv); + + return err; } -static void test_xmsg(struct sock_addr_test *test) +static int test_xmsg(struct sock_addr_test *test) { struct sockaddr_storage addr, src_addr; socklen_t addr_len = sizeof(struct sockaddr_storage), src_addr_len = sizeof(struct sockaddr_storage); - struct msghdr hdr; - struct iovec iov; char data = 'a'; int serv = -1, client = -1, err; @@ -408,7 +2398,7 @@ static void test_xmsg(struct sock_addr_test *test) if (!ASSERT_GE(serv, 0, "start_server")) goto cleanup; - client = socket(test->socket_family, test->socket_type, 0); + client = test->ops->socket(test->socket_family, test->socket_type, 0); if (!ASSERT_GE(client, 0, "socket")) goto cleanup; @@ -418,7 +2408,8 @@ static void test_xmsg(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - err = 
bind(client, (const struct sockaddr *) &src_addr, src_addr_len); + err = test->ops->bind(client, (struct sockaddr *)&src_addr, + src_addr_len); if (!ASSERT_OK(err, "bind")) goto cleanup; } @@ -429,17 +2420,13 @@ static void test_xmsg(struct sock_addr_test *test) goto cleanup; if (test->socket_type == SOCK_DGRAM) { - memset(&iov, 0, sizeof(iov)); - iov.iov_base = &data; - iov.iov_len = sizeof(data); - - memset(&hdr, 0, sizeof(hdr)); - hdr.msg_name = (void *)&addr; - hdr.msg_namelen = addr_len; - hdr.msg_iov = &iov; - hdr.msg_iovlen = 1; + err = test->ops->sendmsg(client, (struct sockaddr *)&addr, + addr_len, &data, sizeof(data)); + if (err < 0) { + err = errno; + goto err; + } - err = sendmsg(client, &hdr, 0); if (!ASSERT_EQ(err, sizeof(data), "sendmsg")) goto cleanup; } else { @@ -489,19 +2476,23 @@ static void test_xmsg(struct sock_addr_test *test) } cleanup: + err = 0; +err: if (client != -1) - close(client); + test->ops->close(client); if (serv != -1) close(serv); + + return err; } -static void test_getsockname(struct sock_addr_test *test) +static int test_getsockname(struct sock_addr_test *test) { struct sockaddr_storage expected_addr; socklen_t expected_addr_len = sizeof(struct sockaddr_storage); int serv = -1, err; - serv = start_server(test->socket_family, test->socket_type, + serv = test->ops->start_server(test->socket_family, test->socket_type, test->requested_addr, test->requested_port, 0); if (!ASSERT_GE(serv, 0, "start_server")) goto cleanup; @@ -512,16 +2503,18 @@ static void test_getsockname(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - err = cmp_local_addr(serv, &expected_addr, expected_addr_len, true); + err = cmp_sock_addr(test->ops->getsockname, serv, &expected_addr, expected_addr_len, true); if (!ASSERT_EQ(err, 0, "cmp_local_addr")) goto cleanup; cleanup: if (serv != -1) - close(serv); + test->ops->close(serv); + + return 0; } -static void test_getpeername(struct sock_addr_test *test) +static int 
test_getpeername(struct sock_addr_test *test) { struct sockaddr_storage addr, expected_addr; socklen_t addr_len = sizeof(struct sockaddr_storage), @@ -538,7 +2531,8 @@ static void test_getpeername(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - client = connect_to_addr(&addr, addr_len, test->socket_type); + client = test->ops->connect_to_addr(test->socket_type, &addr, addr_len, + NULL); if (!ASSERT_GE(client, 0, "connect_to_addr")) goto cleanup; @@ -547,19 +2541,58 @@ static void test_getpeername(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - err = cmp_peer_addr(client, &expected_addr, expected_addr_len, true); + err = cmp_sock_addr(test->ops->getpeername, client, &expected_addr, + expected_addr_len, true); if (!ASSERT_EQ(err, 0, "cmp_peer_addr")) goto cleanup; cleanup: if (client != -1) - close(client); + test->ops->close(client); if (serv != -1) close(serv); + + return 0; +} + +static int setup_test_env(struct nstoken **tok) +{ + int err; + + SYS_NOFAIL("ip netns delete %s", TEST_NS); + SYS(fail, "ip netns add %s", TEST_NS); + *tok = open_netns(TEST_NS); + if (!ASSERT_OK_PTR(*tok, "netns token")) + goto fail; + + SYS(fail, "ip link add dev %s1 type veth peer name %s2", TEST_IF_PREFIX, + TEST_IF_PREFIX); + SYS(fail, "ip link set lo up"); + SYS(fail, "ip link set %s1 up", TEST_IF_PREFIX); + SYS(fail, "ip link set %s2 up", TEST_IF_PREFIX); + SYS(fail, "ip -4 addr add %s/8 dev %s1", TEST_IPV4, TEST_IF_PREFIX); + SYS(fail, "ip -6 addr add %s/128 nodad dev %s1", TEST_IPV6, TEST_IF_PREFIX); + + err = 0; + goto out; +fail: + err = -1; + close_netns(*tok); + *tok = NULL; + SYS_NOFAIL("ip netns delete %s", TEST_NS); +out: + return err; +} + +static void cleanup_test_env(struct nstoken *tok) +{ + close_netns(tok); + SYS_NOFAIL("ip netns delete %s", TEST_NS); } void test_sock_addr(void) { + struct nstoken *tok = NULL; int cgroup_fd = -1; void *skel; @@ -567,13 +2600,22 @@ void test_sock_addr(void) 
if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup")) goto cleanup; + if (!ASSERT_OK(setup_test_env(&tok), "setup_test_env")) + goto cleanup; + + if (!ASSERT_OK(load_sock_addr_kern(), "load_sock_addr_kern")) + goto cleanup; + for (size_t i = 0; i < ARRAY_SIZE(tests); ++i) { struct sock_addr_test *test = &tests[i]; + int err; if (!test__start_subtest(test->name)) continue; - skel = test->loadfn(cgroup_fd); + skel = test->loadfn(cgroup_fd, test->attach_type, + test->expected_result == LOAD_REJECT || + test->expected_result == ATTACH_REJECT); if (!skel) continue; @@ -583,30 +2625,39 @@ void test_sock_addr(void) * the future. */ case SOCK_ADDR_TEST_BIND: - test_bind(test); + err = test_bind(test); break; case SOCK_ADDR_TEST_CONNECT: - test_connect(test); + err = test_connect(test); break; case SOCK_ADDR_TEST_SENDMSG: case SOCK_ADDR_TEST_RECVMSG: - test_xmsg(test); + err = test_xmsg(test); break; case SOCK_ADDR_TEST_GETSOCKNAME: - test_getsockname(test); + err = test_getsockname(test); break; case SOCK_ADDR_TEST_GETPEERNAME: - test_getpeername(test); + err = test_getpeername(test); break; default: ASSERT_TRUE(false, "Unknown sock addr test type"); break; } + if (test->expected_result == SYSCALL_EPERM) + ASSERT_EQ(err, EPERM, "socket operation returns EPERM"); + else if (test->expected_result == SYSCALL_ENOTSUPP) + ASSERT_EQ(err, ENOTSUPP, "socket operation returns ENOTSUPP"); + else if (test->expected_result == SUCCESS) + ASSERT_OK(err, "socket operation succeeds"); + test->destroyfn(skel); } cleanup: + unload_sock_addr_kern(); + cleanup_test_env(tok); if (cgroup_fd >= 0) close(cgroup_fd); } diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 77e26ecffa..1337153eb0 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -131,6 +131,65 @@ out: test_skmsg_load_helpers__destroy(skel); } +static void 
test_skmsg_helpers_with_link(enum bpf_map_type map_type) +{ + struct bpf_program *prog, *prog_clone, *prog_clone2; + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, opts); + struct test_skmsg_load_helpers *skel; + struct bpf_link *link, *link2; + int err, map; + + skel = test_skmsg_load_helpers__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_skmsg_load_helpers__open_and_load")) + return; + + prog = skel->progs.prog_msg_verdict; + prog_clone = skel->progs.prog_msg_verdict_clone; + prog_clone2 = skel->progs.prog_msg_verdict_clone2; + map = bpf_map__fd(skel->maps.sock_map); + + link = bpf_program__attach_sockmap(prog, map); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_sockmap")) + goto out; + + /* Fail since bpf_link for the same prog has been created. */ + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_MSG_VERDICT, 0); + if (!ASSERT_ERR(err, "bpf_prog_attach")) + goto out; + + /* Fail since bpf_link for the same prog type has been created. */ + link2 = bpf_program__attach_sockmap(prog_clone, map); + if (!ASSERT_ERR_PTR(link2, "bpf_program__attach_sockmap")) { + bpf_link__detach(link2); + goto out; + } + + err = bpf_link__update_program(link, prog_clone); + if (!ASSERT_OK(err, "bpf_link__update_program")) + goto out; + + /* Fail since a prog with different type attempts to do update. */ + err = bpf_link__update_program(link, skel->progs.prog_skb_verdict); + if (!ASSERT_ERR(err, "bpf_link__update_program")) + goto out; + + /* Fail since the old prog does not match the one in the kernel. 
*/ + opts.old_prog_fd = bpf_program__fd(prog_clone2); + opts.flags = BPF_F_REPLACE; + err = bpf_link_update(bpf_link__fd(link), bpf_program__fd(prog), &opts); + if (!ASSERT_ERR(err, "bpf_link_update")) + goto out; + + opts.old_prog_fd = bpf_program__fd(prog_clone); + opts.flags = BPF_F_REPLACE; + err = bpf_link_update(bpf_link__fd(link), bpf_program__fd(prog), &opts); + if (!ASSERT_OK(err, "bpf_link_update")) + goto out; +out: + bpf_link__detach(link); + test_skmsg_load_helpers__destroy(skel); +} + static void test_sockmap_update(enum bpf_map_type map_type) { int err, prog, src; @@ -298,6 +357,40 @@ out: test_sockmap_skb_verdict_attach__destroy(skel); } +static void test_sockmap_skb_verdict_attach_with_link(void) +{ + struct test_sockmap_skb_verdict_attach *skel; + struct bpf_program *prog; + struct bpf_link *link; + int err, map; + + skel = test_sockmap_skb_verdict_attach__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + prog = skel->progs.prog_skb_verdict; + map = bpf_map__fd(skel->maps.sock_map); + link = bpf_program__attach_sockmap(prog, map); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_sockmap")) + goto out; + + bpf_link__detach(link); + + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach")) + goto out; + + /* Fail since attaching with the same prog/map has been done. 
*/ + link = bpf_program__attach_sockmap(prog, map); + if (!ASSERT_ERR_PTR(link, "bpf_program__attach_sockmap")) + bpf_link__detach(link); + + err = bpf_prog_detach2(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT); + if (!ASSERT_OK(err, "bpf_prog_detach2")) + goto out; +out: + test_sockmap_skb_verdict_attach__destroy(skel); +} + static __u32 query_prog_id(int prog_fd) { struct bpf_prog_info info = {}; @@ -475,30 +568,19 @@ out: test_sockmap_drop_prog__destroy(drop); } -static void test_sockmap_skb_verdict_peek(void) +static void test_sockmap_skb_verdict_peek_helper(int map) { - int err, map, verdict, s, c1, p1, zero = 0, sent, recvd, avail; - struct test_sockmap_pass_prog *pass; + int err, s, c1, p1, zero = 0, sent, recvd, avail; char snd[256] = "0123456789"; char rcv[256] = "0"; - pass = test_sockmap_pass_prog__open_and_load(); - if (!ASSERT_OK_PTR(pass, "open_and_load")) - return; - verdict = bpf_program__fd(pass->progs.prog_skb_verdict); - map = bpf_map__fd(pass->maps.sock_map_rx); - - err = bpf_prog_attach(verdict, map, BPF_SK_SKB_STREAM_VERDICT, 0); - if (!ASSERT_OK(err, "bpf_prog_attach")) - goto out; - s = socket_loopback(AF_INET, SOCK_STREAM); if (!ASSERT_GT(s, -1, "socket_loopback(s)")) - goto out; + return; err = create_pair(s, AF_INET, SOCK_STREAM, &c1, &p1); if (!ASSERT_OK(err, "create_pairs(s)")) - goto out; + return; err = bpf_map_update_elem(map, &zero, &c1, BPF_NOEXIST); if (!ASSERT_OK(err, "bpf_map_update_elem(c1)")) @@ -520,7 +602,58 @@ static void test_sockmap_skb_verdict_peek(void) out_close: close(c1); close(p1); +} + +static void test_sockmap_skb_verdict_peek(void) +{ + struct test_sockmap_pass_prog *pass; + int err, map, verdict; + + pass = test_sockmap_pass_prog__open_and_load(); + if (!ASSERT_OK_PTR(pass, "open_and_load")) + return; + verdict = bpf_program__fd(pass->progs.prog_skb_verdict); + map = bpf_map__fd(pass->maps.sock_map_rx); + + err = bpf_prog_attach(verdict, map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, 
"bpf_prog_attach")) + goto out; + + test_sockmap_skb_verdict_peek_helper(map); + +out: + test_sockmap_pass_prog__destroy(pass); +} + +static void test_sockmap_skb_verdict_peek_with_link(void) +{ + struct test_sockmap_pass_prog *pass; + struct bpf_program *prog; + struct bpf_link *link; + int err, map; + + pass = test_sockmap_pass_prog__open_and_load(); + if (!ASSERT_OK_PTR(pass, "open_and_load")) + return; + prog = pass->progs.prog_skb_verdict; + map = bpf_map__fd(pass->maps.sock_map_rx); + link = bpf_program__attach_sockmap(prog, map); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_sockmap")) + goto out; + + err = bpf_link__update_program(link, pass->progs.prog_skb_verdict_clone); + if (!ASSERT_OK(err, "bpf_link__update_program")) + goto out; + + /* Fail since a prog with different attach type attempts to do update. */ + err = bpf_link__update_program(link, pass->progs.prog_skb_parser); + if (!ASSERT_ERR(err, "bpf_link__update_program")) + goto out; + + test_sockmap_skb_verdict_peek_helper(map); + ASSERT_EQ(pass->bss->clone_called, 1, "clone_called"); out: + bpf_link__detach(link); test_sockmap_pass_prog__destroy(pass); } @@ -788,6 +921,8 @@ void test_sockmap_basic(void) test_sockmap_skb_verdict_attach(BPF_SK_SKB_STREAM_VERDICT, BPF_SK_SKB_VERDICT); } + if (test__start_subtest("sockmap skb_verdict attach_with_link")) + test_sockmap_skb_verdict_attach_with_link(); if (test__start_subtest("sockmap msg_verdict progs query")) test_sockmap_progs_query(BPF_SK_MSG_VERDICT); if (test__start_subtest("sockmap stream_parser progs query")) @@ -804,6 +939,8 @@ void test_sockmap_basic(void) test_sockmap_skb_verdict_fionread(false); if (test__start_subtest("sockmap skb_verdict msg_f_peek")) test_sockmap_skb_verdict_peek(); + if (test__start_subtest("sockmap skb_verdict msg_f_peek with link")) + test_sockmap_skb_verdict_peek_with_link(); if (test__start_subtest("sockmap unconnected af_unix")) test_sockmap_unconnected_unix(); if (test__start_subtest("sockmap one socket to many 
map entries")) @@ -812,4 +949,8 @@ void test_sockmap_basic(void) test_sockmap_many_maps(); if (test__start_subtest("sockmap same socket replace")) test_sockmap_same_sock(); + if (test__start_subtest("sockmap sk_msg attach sockmap helpers with link")) + test_skmsg_helpers_with_link(BPF_MAP_TYPE_SOCKMAP); + if (test__start_subtest("sockhash sk_msg attach sockhash helpers with link")) + test_skmsg_helpers_with_link(BPF_MAP_TYPE_SOCKHASH); } diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index a92807bfcd..e91b593660 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -767,6 +767,24 @@ static void test_msg_redir_to_connected(struct test_sockmap_listen *skel, xbpf_prog_detach2(verdict, sock_map, BPF_SK_MSG_VERDICT); } +static void test_msg_redir_to_connected_with_link(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int family, + int sotype) +{ + int prog_msg_verdict = bpf_program__fd(skel->progs.prog_msg_verdict); + int verdict_map = bpf_map__fd(skel->maps.verdict_map); + int sock_map = bpf_map__fd(inner_map); + int link_fd; + + link_fd = bpf_link_create(prog_msg_verdict, sock_map, BPF_SK_MSG_VERDICT, NULL); + if (!ASSERT_GE(link_fd, 0, "bpf_link_create")) + return; + + redir_to_connected(family, sotype, sock_map, verdict_map, REDIR_EGRESS); + + close(link_fd); +} + static void redir_to_listening(int family, int sotype, int sock_mapfd, int verd_mapfd, enum redir_mode mode) { @@ -869,6 +887,24 @@ static void test_msg_redir_to_listening(struct test_sockmap_listen *skel, xbpf_prog_detach2(verdict, sock_map, BPF_SK_MSG_VERDICT); } +static void test_msg_redir_to_listening_with_link(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int family, + int sotype) +{ + struct bpf_program *verdict = skel->progs.prog_msg_verdict; + int verdict_map = bpf_map__fd(skel->maps.verdict_map); + int 
sock_map = bpf_map__fd(inner_map); + struct bpf_link *link; + + link = bpf_program__attach_sockmap(verdict, sock_map); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_sockmap")) + return; + + redir_to_listening(family, sotype, sock_map, verdict_map, REDIR_EGRESS); + + bpf_link__detach(link); +} + static void redir_partial(int family, int sotype, int sock_map, int parser_map) { int s, c0 = -1, c1 = -1, p0 = -1, p1 = -1; @@ -1316,7 +1352,9 @@ static void test_redir(struct test_sockmap_listen *skel, struct bpf_map *map, TEST(test_skb_redir_to_listening), TEST(test_skb_redir_partial), TEST(test_msg_redir_to_connected), + TEST(test_msg_redir_to_connected_with_link), TEST(test_msg_redir_to_listening), + TEST(test_msg_redir_to_listening_with_link), }; const char *family_name, *map_name; const struct redir_test *t; diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt.c b/tools/testing/selftests/bpf/prog_tests/sockopt.c index 5a4491d4ed..eaac83a7f3 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt.c @@ -24,6 +24,7 @@ enum sockopt_test_error { static struct sockopt_test { const char *descr; const struct bpf_insn insns[64]; + enum bpf_prog_type prog_type; enum bpf_attach_type attach_type; enum bpf_attach_type expected_attach_type; @@ -928,9 +929,40 @@ static struct sockopt_test { .error = EPERM_SETSOCKOPT, }, + + /* ==================== prog_type ==================== */ + + { + .descr = "can attach only BPF_CGROUP_SETSOCKOP", + .insns = { + /* return 1 */ + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .attach_type = BPF_CGROUP_SETSOCKOPT, + .expected_attach_type = 0, + .error = DENY_ATTACH, + }, + + { + .descr = "can attach only BPF_CGROUP_GETSOCKOP", + .insns = { + /* return 1 */ + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .attach_type = BPF_CGROUP_GETSOCKOPT, + .expected_attach_type = 0, + .error 
= DENY_ATTACH, + }, }; static int load_prog(const struct bpf_insn *insns, + enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type) { LIBBPF_OPTS(bpf_prog_load_opts, opts, @@ -947,7 +979,7 @@ static int load_prog(const struct bpf_insn *insns, } insns_cnt++; - fd = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCKOPT, NULL, "GPL", insns, insns_cnt, &opts); + fd = bpf_prog_load(prog_type, NULL, "GPL", insns, insns_cnt, &opts); if (verbose && fd < 0) fprintf(stderr, "%s\n", bpf_log_buf); @@ -1036,13 +1068,18 @@ static int call_getsockopt(bool use_io_uring, int fd, int level, int optname, return getsockopt(fd, level, optname, optval, optlen); } -static int run_test(int cgroup_fd, struct sockopt_test *test, bool use_io_uring) +static int run_test(int cgroup_fd, struct sockopt_test *test, bool use_io_uring, + bool use_link) { - int sock_fd, err, prog_fd; + int prog_type = BPF_PROG_TYPE_CGROUP_SOCKOPT; + int sock_fd, err, prog_fd, link_fd = -1; void *optval = NULL; int ret = 0; - prog_fd = load_prog(test->insns, test->expected_attach_type); + if (test->prog_type) + prog_type = test->prog_type; + + prog_fd = load_prog(test->insns, prog_type, test->expected_attach_type); if (prog_fd < 0) { if (test->error == DENY_LOAD) return 0; @@ -1051,7 +1088,12 @@ static int run_test(int cgroup_fd, struct sockopt_test *test, bool use_io_uring) return -1; } - err = bpf_prog_attach(prog_fd, cgroup_fd, test->attach_type, 0); + if (use_link) { + err = bpf_link_create(prog_fd, cgroup_fd, test->attach_type, NULL); + link_fd = err; + } else { + err = bpf_prog_attach(prog_fd, cgroup_fd, test->attach_type, 0); + } if (err < 0) { if (test->error == DENY_ATTACH) goto close_prog_fd; @@ -1142,7 +1184,12 @@ free_optval: close_sock_fd: close(sock_fd); detach_prog: - bpf_prog_detach2(prog_fd, cgroup_fd, test->attach_type); + if (use_link) { + if (link_fd >= 0) + close(link_fd); + } else { + bpf_prog_detach2(prog_fd, cgroup_fd, test->attach_type); + } close_prog_fd: close(prog_fd); return ret; @@ 
-1160,10 +1207,12 @@ void test_sockopt(void) if (!test__start_subtest(tests[i].descr)) continue; - ASSERT_OK(run_test(cgroup_fd, &tests[i], false), + ASSERT_OK(run_test(cgroup_fd, &tests[i], false, false), + tests[i].descr); + ASSERT_OK(run_test(cgroup_fd, &tests[i], false, true), tests[i].descr); if (tests[i].io_uring_support) - ASSERT_OK(run_test(cgroup_fd, &tests[i], true), + ASSERT_OK(run_test(cgroup_fd, &tests[i], true, false), tests[i].descr); } diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c index 917f486db8..1d3a20f01b 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include "cgroup_helpers.h" +#include "network_helpers.h" #include "sockopt_inherit.skel.h" @@ -9,35 +10,6 @@ #define CUSTOM_INHERIT2 1 #define CUSTOM_LISTENER 2 -static int connect_to_server(int server_fd) -{ - struct sockaddr_storage addr; - socklen_t len = sizeof(addr); - int fd; - - fd = socket(AF_INET, SOCK_STREAM, 0); - if (fd < 0) { - log_err("Failed to create client socket"); - return -1; - } - - if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) { - log_err("Failed to get server addr"); - goto out; - } - - if (connect(fd, (const struct sockaddr *)&addr, len) < 0) { - log_err("Fail to connect to server"); - goto out; - } - - return fd; - -out: - close(fd); - return -1; -} - static int verify_sockopt(int fd, int optname, const char *msg, char expected) { socklen_t optlen = 1; @@ -98,47 +70,36 @@ static void *server_thread(void *arg) return (void *)(long)err; } -static int start_server(void) +static int custom_cb(int fd, const struct post_socket_opts *opts) { - struct sockaddr_in addr = { - .sin_family = AF_INET, - .sin_addr.s_addr = htonl(INADDR_LOOPBACK), - }; char buf; int err; - int fd; int i; - fd = socket(AF_INET, SOCK_STREAM, 0); - if (fd < 0) { - 
log_err("Failed to create server socket"); - return -1; - } - for (i = CUSTOM_INHERIT1; i <= CUSTOM_LISTENER; i++) { buf = 0x01; err = setsockopt(fd, SOL_CUSTOM, i, &buf, 1); if (err) { log_err("Failed to call setsockopt(%d)", i); - close(fd); return -1; } } - if (bind(fd, (const struct sockaddr *)&addr, sizeof(addr)) < 0) { - log_err("Failed to bind socket"); - close(fd); - return -1; - } - - return fd; + return 0; } static void run_test(int cgroup_fd) { struct bpf_link *link_getsockopt = NULL; struct bpf_link *link_setsockopt = NULL; + struct network_helper_opts opts = { + .post_socket_cb = custom_cb, + }; int server_fd = -1, client_fd; + struct sockaddr_in addr = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_LOOPBACK), + }; struct sockopt_inherit *obj; void *server_err; pthread_t tid; @@ -160,7 +121,8 @@ static void run_test(int cgroup_fd) if (!ASSERT_OK_PTR(link_setsockopt, "cg-attach-setsockopt")) goto close_bpf_object; - server_fd = start_server(); + server_fd = start_server_addr(SOCK_STREAM, (struct sockaddr_storage *)&addr, + sizeof(addr), &opts); if (!ASSERT_GE(server_fd, 0, "start_server")) goto close_bpf_object; @@ -173,7 +135,7 @@ static void run_test(int cgroup_fd) pthread_cond_wait(&server_started, &server_started_mtx); pthread_mutex_unlock(&server_started_mtx); - client_fd = connect_to_server(server_fd); + client_fd = connect_to_fd(server_fd, 0); if (!ASSERT_GE(client_fd, 0, "connect_to_server")) goto close_server_fd; diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c index 5db9eec24b..0832fd7874 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c @@ -35,7 +35,7 @@ retry: pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, 0 /* cpu 0 */, -1 /* group id */, 0 /* flags */); - if (pmu_fd < 0 && errno == ENOENT) { + if (pmu_fd < 0 && (errno == 
ENOENT || errno == EOPNOTSUPP)) { printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", __func__); test__skip(); goto cleanup; diff --git a/tools/testing/selftests/bpf/prog_tests/tc_netkit.c b/tools/testing/selftests/bpf/prog_tests/tc_netkit.c index 15ee7b2fc4..b913572002 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_netkit.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_netkit.c @@ -73,6 +73,16 @@ static int create_netkit(int mode, int policy, int peer_policy, int *ifindex, "up primary"); ASSERT_OK(system("ip addr add dev " netkit_name " 10.0.0.1/24"), "addr primary"); + + if (mode == NETKIT_L3) { + ASSERT_EQ(system("ip link set dev " netkit_name + " addr ee:ff:bb:cc:aa:dd 2> /dev/null"), 512, + "set hwaddress"); + } else { + ASSERT_OK(system("ip link set dev " netkit_name + " addr ee:ff:bb:cc:aa:dd"), + "set hwaddress"); + } if (same_netns) { ASSERT_OK(system("ip link set dev " netkit_peer " up"), "up peer"); @@ -89,6 +99,16 @@ static int create_netkit(int mode, int policy, int peer_policy, int *ifindex, return err; } +static void move_netkit(void) +{ + ASSERT_OK(system("ip link set " netkit_peer " netns foo"), + "move peer"); + ASSERT_OK(system("ip netns exec foo ip link set dev " + netkit_peer " up"), "up peer"); + ASSERT_OK(system("ip netns exec foo ip addr add dev " + netkit_peer " 10.0.0.2/24"), "addr peer"); +} + static void destroy_netkit(void) { ASSERT_OK(system("ip link del dev " netkit_name), "del primary"); @@ -685,3 +705,77 @@ void serial_test_tc_netkit_neigh_links(void) serial_test_tc_netkit_neigh_links_target(NETKIT_L2, BPF_NETKIT_PRIMARY); serial_test_tc_netkit_neigh_links_target(NETKIT_L3, BPF_NETKIT_PRIMARY); } + +static void serial_test_tc_netkit_pkt_type_mode(int mode) +{ + LIBBPF_OPTS(bpf_netkit_opts, optl_nk); + LIBBPF_OPTS(bpf_tcx_opts, optl_tcx); + int err, ifindex, ifindex2; + struct test_tc_link *skel; + struct bpf_link *link; + + err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS, + &ifindex, true); + if (err) + return; + + ifindex2 
= if_nametoindex(netkit_peer); + ASSERT_NEQ(ifindex, ifindex2, "ifindex_1_2"); + + skel = test_tc_link__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + goto cleanup; + + ASSERT_EQ(bpf_program__set_expected_attach_type(skel->progs.tc1, + BPF_NETKIT_PRIMARY), 0, "tc1_attach_type"); + ASSERT_EQ(bpf_program__set_expected_attach_type(skel->progs.tc7, + BPF_TCX_INGRESS), 0, "tc7_attach_type"); + + err = test_tc_link__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PRIMARY, 0); + assert_mprog_count_ifindex(ifindex2, BPF_TCX_INGRESS, 0); + + link = bpf_program__attach_netkit(skel->progs.tc1, ifindex, &optl_nk); + if (!ASSERT_OK_PTR(link, "link_attach")) + goto cleanup; + + skel->links.tc1 = link; + + assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PRIMARY, 1); + assert_mprog_count_ifindex(ifindex2, BPF_TCX_INGRESS, 0); + + link = bpf_program__attach_tcx(skel->progs.tc7, ifindex2, &optl_tcx); + if (!ASSERT_OK_PTR(link, "link_attach")) + goto cleanup; + + skel->links.tc7 = link; + + assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PRIMARY, 1); + assert_mprog_count_ifindex(ifindex2, BPF_TCX_INGRESS, 1); + + move_netkit(); + + tc_skel_reset_all_seen(skel); + skel->bss->set_type = true; + ASSERT_EQ(send_icmp(), 0, "icmp_pkt"); + + ASSERT_EQ(skel->bss->seen_tc1, true, "seen_tc1"); + ASSERT_EQ(skel->bss->seen_tc7, true, "seen_tc7"); + + ASSERT_EQ(skel->bss->seen_host, true, "seen_host"); + ASSERT_EQ(skel->bss->seen_mcast, true, "seen_mcast"); +cleanup: + test_tc_link__destroy(skel); + + assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PRIMARY, 0); + destroy_netkit(); +} + +void serial_test_tc_netkit_pkt_type(void) +{ + serial_test_tc_netkit_pkt_type_mode(NETKIT_L2); + serial_test_tc_netkit_pkt_type_mode(NETKIT_L3); +} diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index dbe06aeaa2..b1073d36d7 100644 --- 
a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -530,7 +530,7 @@ static int wait_netstamp_needed_key(void) __u64 tstamp = 0; nstoken = open_netns(NS_DST); - if (!nstoken) + if (!ASSERT_OK_PTR(nstoken, "setns dst")) return -1; srv_fd = start_server(AF_INET6, SOCK_DGRAM, "::1", 0, 0); diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c index 8fe84da1b9..f2b99d95d9 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c @@ -10,6 +10,9 @@ struct tcp_rtt_storage { __u32 delivered; __u32 delivered_ce; __u32 icsk_retransmits; + + __u32 mrtt_us; /* args[0] */ + __u32 srtt; /* args[1] */ }; static void send_byte(int fd) @@ -83,6 +86,17 @@ static int verify_sk(int map_fd, int client_fd, const char *msg, __u32 invoked, err++; } + /* Precise values of mrtt and srtt are unavailable, just make sure they are nonzero */ + if (val.mrtt_us == 0) { + log_err("%s: unexpected bpf_tcp_sock.args[0] (mrtt_us) %u == 0", msg, val.mrtt_us); + err++; + } + + if (val.srtt == 0) { + log_err("%s: unexpected bpf_tcp_sock.args[1] (srtt) %u == 0", msg, val.srtt); + err++; + } + return err; } diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c index ee5372c7f2..29e183a80f 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c @@ -4,6 +4,8 @@ #include #include "struct_ops_module.skel.h" +#include "struct_ops_nulled_out_cb.skel.h" +#include "struct_ops_forgotten_cb.skel.h" static void check_map_info(struct bpf_map_info *info) { @@ -66,6 +68,7 @@ static void test_struct_ops_load(void) * auto-loading, or it will fail to load. 
*/ bpf_program__set_autoload(skel->progs.test_2, false); + bpf_map__set_autocreate(skel->maps.testmod_zeroed, false); err = struct_ops_module__load(skel); if (!ASSERT_OK(err, "struct_ops_module_load")) @@ -93,9 +96,163 @@ cleanup: struct_ops_module__destroy(skel); } +static void test_struct_ops_not_zeroed(void) +{ + struct struct_ops_module *skel; + int err; + + /* zeroed is 0, and zeroed_op is null */ + skel = struct_ops_module__open(); + if (!ASSERT_OK_PTR(skel, "struct_ops_module_open")) + return; + + skel->struct_ops.testmod_zeroed->zeroed = 0; + /* zeroed_op prog should be not loaded automatically now */ + skel->struct_ops.testmod_zeroed->zeroed_op = NULL; + + err = struct_ops_module__load(skel); + ASSERT_OK(err, "struct_ops_module_load"); + + struct_ops_module__destroy(skel); + + /* zeroed is not 0 */ + skel = struct_ops_module__open(); + if (!ASSERT_OK_PTR(skel, "struct_ops_module_open_not_zeroed")) + return; + + /* libbpf should reject the testmod_zeroed since struct + * bpf_testmod_ops in the kernel has no "zeroed" field and the + * value of "zeroed" is non-zero. + */ + skel->struct_ops.testmod_zeroed->zeroed = 0xdeadbeef; + skel->struct_ops.testmod_zeroed->zeroed_op = NULL; + err = struct_ops_module__load(skel); + ASSERT_ERR(err, "struct_ops_module_load_not_zeroed"); + + struct_ops_module__destroy(skel); + + /* zeroed_op is not null */ + skel = struct_ops_module__open(); + if (!ASSERT_OK_PTR(skel, "struct_ops_module_open_not_zeroed_op")) + return; + + /* libbpf should reject the testmod_zeroed since the value of its + * "zeroed_op" is not null. + */ + skel->struct_ops.testmod_zeroed->zeroed_op = skel->progs.test_3; + err = struct_ops_module__load(skel); + ASSERT_ERR(err, "struct_ops_module_load_not_zeroed_op"); + + struct_ops_module__destroy(skel); +} + +/* The signature of an implementation might not match the signature of the + * function pointer prototype defined in the BPF program. 
This mismatch + * should be allowed as long as the behavior of the operator program + * adheres to the signature in the kernel. Libbpf should not enforce the + * signature; rather, let the kernel verifier handle the enforcement. + */ +static void test_struct_ops_incompatible(void) +{ + struct struct_ops_module *skel; + struct bpf_link *link; + int err; + + skel = struct_ops_module__open(); + if (!ASSERT_OK_PTR(skel, "struct_ops_module_open")) + return; + + bpf_map__set_autocreate(skel->maps.testmod_zeroed, false); + + err = struct_ops_module__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + link = bpf_map__attach_struct_ops(skel->maps.testmod_incompatible); + if (ASSERT_OK_PTR(link, "attach_struct_ops")) + bpf_link__destroy(link); + +cleanup: + struct_ops_module__destroy(skel); +} + +/* validate that it's ok to "turn off" callback that kernel supports */ +static void test_struct_ops_nulled_out_cb(void) +{ + struct struct_ops_nulled_out_cb *skel; + int err; + + skel = struct_ops_nulled_out_cb__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + /* kernel knows about test_1, but we still null it out */ + skel->struct_ops.ops->test_1 = NULL; + + err = struct_ops_nulled_out_cb__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + ASSERT_FALSE(bpf_program__autoload(skel->progs.test_1_turn_off), "prog_autoload"); + ASSERT_LT(bpf_program__fd(skel->progs.test_1_turn_off), 0, "prog_fd"); + +cleanup: + struct_ops_nulled_out_cb__destroy(skel); +} + +/* validate that libbpf generates reasonable error message if struct_ops is + * not referenced in any struct_ops map + */ +static void test_struct_ops_forgotten_cb(void) +{ + struct struct_ops_forgotten_cb *skel; + char *log; + int err; + + skel = struct_ops_forgotten_cb__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + start_libbpf_log_capture(); + + err = struct_ops_forgotten_cb__load(skel); + if (!ASSERT_ERR(err, "skel_load")) + goto cleanup; + + log = 
stop_libbpf_log_capture(); + ASSERT_HAS_SUBSTR(log, + "prog 'test_1_forgotten': SEC(\"struct_ops\") program isn't referenced anywhere, did you forget to use it?", + "libbpf_log"); + free(log); + + struct_ops_forgotten_cb__destroy(skel); + + /* now let's programmatically use it, we should be fine now */ + skel = struct_ops_forgotten_cb__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + skel->struct_ops.ops->test_1 = skel->progs.test_1_forgotten; /* not anymore */ + + err = struct_ops_forgotten_cb__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + +cleanup: + struct_ops_forgotten_cb__destroy(skel); +} + void serial_test_struct_ops_module(void) { - if (test__start_subtest("test_struct_ops_load")) + if (test__start_subtest("struct_ops_load")) test_struct_ops_load(); + if (test__start_subtest("struct_ops_not_zeroed")) + test_struct_ops_not_zeroed(); + if (test__start_subtest("struct_ops_incompatible")) + test_struct_ops_incompatible(); + if (test__start_subtest("struct_ops_null_out_cb")) + test_struct_ops_nulled_out_cb(); + if (test__start_subtest("struct_ops_forgotten_cb")) + test_struct_ops_forgotten_cb(); } diff --git a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c index 5f1fb0a2ea..cec746e77c 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c +++ b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c @@ -612,6 +612,8 @@ static void test_ipip_tunnel(enum ipip_encap encap) /* ping from at_ns0 namespace test */ nstoken = open_netns("at_ns0"); + if (!ASSERT_OK_PTR(nstoken, "setns")) + goto done; err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV1); if (!ASSERT_OK(err, "test_ping")) goto done; @@ -666,6 +668,8 @@ static void test_xfrm_tunnel(void) /* ping from at_ns0 namespace test */ nstoken = open_netns("at_ns0"); + if (!ASSERT_OK_PTR(nstoken, "setns")) + goto done; err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV1); close_netns(nstoken); if (!ASSERT_OK(err, "test_ping")) diff 
--git a/tools/testing/selftests/bpf/prog_tests/timer_lockup.c b/tools/testing/selftests/bpf/prog_tests/timer_lockup.c new file mode 100644 index 0000000000..871d16cb95 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/timer_lockup.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include +#include +#include +#include + +#include "timer_lockup.skel.h" + +static long cpu; +static int *timer1_err; +static int *timer2_err; +static bool skip; + +volatile int k = 0; + +static void *timer_lockup_thread(void *arg) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .repeat = 1000, + ); + int i, prog_fd = *(int *)arg; + cpu_set_t cpuset; + + CPU_ZERO(&cpuset); + CPU_SET(__sync_fetch_and_add(&cpu, 1), &cpuset); + ASSERT_OK(pthread_setaffinity_np(pthread_self(), sizeof(cpuset), + &cpuset), + "cpu affinity"); + + for (i = 0; !READ_ONCE(*timer1_err) && !READ_ONCE(*timer2_err); i++) { + bpf_prog_test_run_opts(prog_fd, &opts); + /* Skip the test if we can't reproduce the race in a reasonable + * amount of time. 
+ */ + if (i > 50) { + WRITE_ONCE(skip, true); + break; + } + } + + return NULL; +} + +void test_timer_lockup(void) +{ + int timer1_prog, timer2_prog; + struct timer_lockup *skel; + pthread_t thrds[2]; + void *ret; + + skel = timer_lockup__open_and_load(); + if (!ASSERT_OK_PTR(skel, "timer_lockup__open_and_load")) + return; + + timer1_prog = bpf_program__fd(skel->progs.timer1_prog); + timer2_prog = bpf_program__fd(skel->progs.timer2_prog); + + timer1_err = &skel->bss->timer1_err; + timer2_err = &skel->bss->timer2_err; + + if (!ASSERT_OK(pthread_create(&thrds[0], NULL, timer_lockup_thread, + &timer1_prog), + "pthread_create thread1")) + goto out; + if (!ASSERT_OK(pthread_create(&thrds[1], NULL, timer_lockup_thread, + &timer2_prog), + "pthread_create thread2")) { + pthread_exit(&thrds[0]); + goto out; + } + + pthread_join(thrds[1], &ret); + pthread_join(thrds[0], &ret); + + if (skip) { + test__skip(); + goto out; + } + + if (*timer1_err != -EDEADLK && *timer1_err != 0) + ASSERT_FAIL("timer1_err bad value"); + if (*timer2_err != -EDEADLK && *timer2_err != 0) + ASSERT_FAIL("timer2_err bad value"); +out: + timer_lockup__destroy(skel); + return; +} diff --git a/tools/testing/selftests/bpf/prog_tests/trace_printk.c b/tools/testing/selftests/bpf/prog_tests/trace_printk.c index 7b9124d506..e56e88596d 100644 --- a/tools/testing/selftests/bpf/prog_tests/trace_printk.c +++ b/tools/testing/selftests/bpf/prog_tests/trace_printk.c @@ -5,18 +5,19 @@ #include "trace_printk.lskel.h" -#define TRACEFS_PIPE "/sys/kernel/tracing/trace_pipe" -#define DEBUGFS_PIPE "/sys/kernel/debug/tracing/trace_pipe" #define SEARCHMSG "testing,testing" +static void trace_pipe_cb(const char *str, void *data) +{ + if (strstr(str, SEARCHMSG) != NULL) + (*(int *)data)++; +} + void serial_test_trace_printk(void) { struct trace_printk_lskel__bss *bss; - int err = 0, iter = 0, found = 0; struct trace_printk_lskel *skel; - char *buf = NULL; - FILE *fp = NULL; - size_t buflen; + int err = 0, found = 0; skel = 
trace_printk_lskel__open(); if (!ASSERT_OK_PTR(skel, "trace_printk__open")) @@ -35,16 +36,6 @@ void serial_test_trace_printk(void) if (!ASSERT_OK(err, "trace_printk__attach")) goto cleanup; - if (access(TRACEFS_PIPE, F_OK) == 0) - fp = fopen(TRACEFS_PIPE, "r"); - else - fp = fopen(DEBUGFS_PIPE, "r"); - if (!ASSERT_OK_PTR(fp, "fopen(TRACE_PIPE)")) - goto cleanup; - - /* We do not want to wait forever if this test fails... */ - fcntl(fileno(fp), F_SETFL, O_NONBLOCK); - /* wait for tracepoint to trigger */ usleep(1); trace_printk_lskel__detach(skel); @@ -56,21 +47,12 @@ void serial_test_trace_printk(void) goto cleanup; /* verify our search string is in the trace buffer */ - while (getline(&buf, &buflen, fp) >= 0 || errno == EAGAIN) { - if (strstr(buf, SEARCHMSG) != NULL) - found++; - if (found == bss->trace_printk_ran) - break; - if (++iter > 1000) - break; - } + ASSERT_OK(read_trace_pipe_iter(trace_pipe_cb, &found, 1000), + "read_trace_pipe_iter"); if (!ASSERT_EQ(found, bss->trace_printk_ran, "found")) goto cleanup; cleanup: trace_printk_lskel__destroy(skel); - free(buf); - if (fp) - fclose(fp); } diff --git a/tools/testing/selftests/bpf/prog_tests/trace_vprintk.c b/tools/testing/selftests/bpf/prog_tests/trace_vprintk.c index 44ea2fd88f..2af6a6f209 100644 --- a/tools/testing/selftests/bpf/prog_tests/trace_vprintk.c +++ b/tools/testing/selftests/bpf/prog_tests/trace_vprintk.c @@ -5,18 +5,19 @@ #include "trace_vprintk.lskel.h" -#define TRACEFS_PIPE "/sys/kernel/tracing/trace_pipe" -#define DEBUGFS_PIPE "/sys/kernel/debug/tracing/trace_pipe" #define SEARCHMSG "1,2,3,4,5,6,7,8,9,10" +static void trace_pipe_cb(const char *str, void *data) +{ + if (strstr(str, SEARCHMSG) != NULL) + (*(int *)data)++; +} + void serial_test_trace_vprintk(void) { struct trace_vprintk_lskel__bss *bss; - int err = 0, iter = 0, found = 0; struct trace_vprintk_lskel *skel; - char *buf = NULL; - FILE *fp = NULL; - size_t buflen; + int err = 0, found = 0; skel = trace_vprintk_lskel__open_and_load(); 
if (!ASSERT_OK_PTR(skel, "trace_vprintk__open_and_load")) @@ -28,16 +29,6 @@ void serial_test_trace_vprintk(void) if (!ASSERT_OK(err, "trace_vprintk__attach")) goto cleanup; - if (access(TRACEFS_PIPE, F_OK) == 0) - fp = fopen(TRACEFS_PIPE, "r"); - else - fp = fopen(DEBUGFS_PIPE, "r"); - if (!ASSERT_OK_PTR(fp, "fopen(TRACE_PIPE)")) - goto cleanup; - - /* We do not want to wait forever if this test fails... */ - fcntl(fileno(fp), F_SETFL, O_NONBLOCK); - /* wait for tracepoint to trigger */ usleep(1); trace_vprintk_lskel__detach(skel); @@ -49,14 +40,8 @@ void serial_test_trace_vprintk(void) goto cleanup; /* verify our search string is in the trace buffer */ - while (getline(&buf, &buflen, fp) >= 0 || errno == EAGAIN) { - if (strstr(buf, SEARCHMSG) != NULL) - found++; - if (found == bss->trace_vprintk_ran) - break; - if (++iter > 1000) - break; - } + ASSERT_OK(read_trace_pipe_iter(trace_pipe_cb, &found, 1000), + "read_trace_pipe_iter"); if (!ASSERT_EQ(found, bss->trace_vprintk_ran, "found")) goto cleanup; @@ -66,7 +51,4 @@ void serial_test_trace_vprintk(void) cleanup: trace_vprintk_lskel__destroy(skel); - free(buf); - if (fp) - fclose(fp); } diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index 38fda42fd7..bf6ca8e3eb 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -1,12 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include "uprobe_multi.skel.h" #include "uprobe_multi_bench.skel.h" #include "uprobe_multi_usdt.skel.h" #include "bpf/libbpf_internal.h" #include "testing_helpers.h" +#include "../sdt.h" static char test_data[] = "test_data"; @@ -25,9 +27,17 @@ noinline void uprobe_multi_func_3(void) asm volatile (""); } +noinline void usdt_trigger(void) +{ + STAP_PROBE(test, pid_filter_usdt); +} + struct child { int go[2]; + int c2p[2]; /* child -> parent channel */ int 
pid; + int tid; + pthread_t thread; }; static void release_child(struct child *child) @@ -38,6 +48,10 @@ static void release_child(struct child *child) return; close(child->go[1]); close(child->go[0]); + if (child->thread) + pthread_join(child->thread, NULL); + close(child->c2p[0]); + close(child->c2p[1]); if (child->pid > 0) waitpid(child->pid, &child_status, 0); } @@ -63,7 +77,7 @@ static struct child *spawn_child(void) if (pipe(child.go)) return NULL; - child.pid = fork(); + child.pid = child.tid = fork(); if (child.pid < 0) { release_child(&child); errno = EINVAL; @@ -82,6 +96,7 @@ static struct child *spawn_child(void) uprobe_multi_func_1(); uprobe_multi_func_2(); uprobe_multi_func_3(); + usdt_trigger(); exit(errno); } @@ -89,6 +104,67 @@ static struct child *spawn_child(void) return &child; } +static void *child_thread(void *ctx) +{ + struct child *child = ctx; + int c = 0, err; + + child->tid = syscall(SYS_gettid); + + /* let parent know we are ready */ + err = write(child->c2p[1], &c, 1); + if (err != 1) + pthread_exit(&err); + + /* wait for parent's kick */ + err = read(child->go[0], &c, 1); + if (err != 1) + pthread_exit(&err); + + uprobe_multi_func_1(); + uprobe_multi_func_2(); + uprobe_multi_func_3(); + usdt_trigger(); + + err = 0; + pthread_exit(&err); +} + +static struct child *spawn_thread(void) +{ + static struct child child; + int c, err; + + /* pipe to notify child to execute the trigger functions */ + if (pipe(child.go)) + return NULL; + /* pipe to notify parent that child thread is ready */ + if (pipe(child.c2p)) { + close(child.go[0]); + close(child.go[1]); + return NULL; + } + + child.pid = getpid(); + + err = pthread_create(&child.thread, NULL, child_thread, &child); + if (err) { + err = -errno; + close(child.go[0]); + close(child.go[1]); + close(child.c2p[0]); + close(child.c2p[1]); + errno = -err; + return NULL; + } + + err = read(child.c2p[0], &c, 1); + if (!ASSERT_EQ(err, 1, "child_thread_ready")) + return NULL; + + return &child; +} + 
static void uprobe_multi_test_run(struct uprobe_multi *skel, struct child *child) { skel->bss->uprobe_multi_func_1_addr = (__u64) uprobe_multi_func_1; @@ -103,15 +179,23 @@ static void uprobe_multi_test_run(struct uprobe_multi *skel, struct child *child * passed at the probe attach. */ skel->bss->pid = child ? 0 : getpid(); + skel->bss->expect_pid = child ? child->pid : 0; + + /* trigger all probes, if we are testing child *process*, just to make + * sure that PID filtering doesn't let through activations from wrong + * PIDs; when we test child *thread*, we don't want to do this to + * avoid double counting number of triggering events + */ + if (!child || !child->thread) { + uprobe_multi_func_1(); + uprobe_multi_func_2(); + uprobe_multi_func_3(); + usdt_trigger(); + } if (child) kick_child(child); - /* trigger all probes */ - uprobe_multi_func_1(); - uprobe_multi_func_2(); - uprobe_multi_func_3(); - /* * There are 2 entry and 2 exit probe called for each uprobe_multi_func_[123] * function and each slepable probe (6) increments uprobe_multi_sleep_result. 
@@ -126,8 +210,12 @@ static void uprobe_multi_test_run(struct uprobe_multi *skel, struct child *child ASSERT_EQ(skel->bss->uprobe_multi_sleep_result, 6, "uprobe_multi_sleep_result"); - if (child) + ASSERT_FALSE(skel->bss->bad_pid_seen, "bad_pid_seen"); + + if (child) { ASSERT_EQ(skel->bss->child_pid, child->pid, "uprobe_multi_child_pid"); + ASSERT_EQ(skel->bss->child_tid, child->tid, "uprobe_multi_child_tid"); + } } static void test_skel_api(void) @@ -190,8 +278,24 @@ __test_attach_api(const char *binary, const char *pattern, struct bpf_uprobe_mul if (!ASSERT_OK_PTR(skel->links.uprobe_extra, "bpf_program__attach_uprobe_multi")) goto cleanup; + /* Attach (uprobe-backed) USDTs */ + skel->links.usdt_pid = bpf_program__attach_usdt(skel->progs.usdt_pid, pid, binary, + "test", "pid_filter_usdt", NULL); + if (!ASSERT_OK_PTR(skel->links.usdt_pid, "attach_usdt_pid")) + goto cleanup; + + skel->links.usdt_extra = bpf_program__attach_usdt(skel->progs.usdt_extra, -1, binary, + "test", "pid_filter_usdt", NULL); + if (!ASSERT_OK_PTR(skel->links.usdt_extra, "attach_usdt_extra")) + goto cleanup; + uprobe_multi_test_run(skel, child); + ASSERT_FALSE(skel->bss->bad_pid_seen_usdt, "bad_pid_seen_usdt"); + if (child) { + ASSERT_EQ(skel->bss->child_pid_usdt, child->pid, "usdt_multi_child_pid"); + ASSERT_EQ(skel->bss->child_tid_usdt, child->tid, "usdt_multi_child_tid"); + } cleanup: uprobe_multi__destroy(skel); } @@ -210,6 +314,13 @@ test_attach_api(const char *binary, const char *pattern, struct bpf_uprobe_multi return; __test_attach_api(binary, pattern, opts, child); + + /* pid filter (thread) */ + child = spawn_thread(); + if (!ASSERT_OK_PTR(child, "spawn_thread")) + return; + + __test_attach_api(binary, pattern, opts, child); } static void test_attach_api_pattern(void) @@ -495,6 +606,13 @@ static void test_link_api(void) return; __test_link_api(child); + + /* pid filter (thread) */ + child = spawn_thread(); + if (!ASSERT_OK_PTR(child, "spawn_thread")) + return; + + 
__test_link_api(child); } static void test_bench_attach_uprobe(void) diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index c4f9f30664..98ef39efa7 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -53,6 +53,7 @@ #include "verifier_movsx.skel.h" #include "verifier_netfilter_ctx.skel.h" #include "verifier_netfilter_retcode.skel.h" +#include "verifier_or_jmp32_k.skel.h" #include "verifier_precision.skel.h" #include "verifier_prevent_map_lookup.skel.h" #include "verifier_raw_stack.skel.h" @@ -66,6 +67,8 @@ #include "verifier_sdiv.skel.h" #include "verifier_search_pruning.skel.h" #include "verifier_sock.skel.h" +#include "verifier_sock_addr.skel.h" +#include "verifier_sockmap_mutate.skel.h" #include "verifier_spill_fill.skel.h" #include "verifier_spin_lock.skel.h" #include "verifier_stack_ptr.skel.h" @@ -168,6 +171,7 @@ void test_verifier_meta_access(void) { RUN(verifier_meta_access); } void test_verifier_movsx(void) { RUN(verifier_movsx); } void test_verifier_netfilter_ctx(void) { RUN(verifier_netfilter_ctx); } void test_verifier_netfilter_retcode(void) { RUN(verifier_netfilter_retcode); } +void test_verifier_or_jmp32_k(void) { RUN(verifier_or_jmp32_k); } void test_verifier_precision(void) { RUN(verifier_precision); } void test_verifier_prevent_map_lookup(void) { RUN(verifier_prevent_map_lookup); } void test_verifier_raw_stack(void) { RUN(verifier_raw_stack); } @@ -181,6 +185,8 @@ void test_verifier_scalar_ids(void) { RUN(verifier_scalar_ids); } void test_verifier_sdiv(void) { RUN(verifier_sdiv); } void test_verifier_search_pruning(void) { RUN(verifier_search_pruning); } void test_verifier_sock(void) { RUN(verifier_sock); } +void test_verifier_sock_addr(void) { RUN(verifier_sock_addr); } +void test_verifier_sockmap_mutate(void) { RUN(verifier_sockmap_mutate); } void test_verifier_spill_fill(void) { RUN(verifier_spill_fill); } void 
test_verifier_spin_lock(void) { RUN(verifier_spin_lock); } void test_verifier_stack_ptr(void) { RUN(verifier_stack_ptr); } diff --git a/tools/testing/selftests/bpf/prog_tests/verifier_kfunc_prog_types.c b/tools/testing/selftests/bpf/prog_tests/verifier_kfunc_prog_types.c new file mode 100644 index 0000000000..3918ecc2ee --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/verifier_kfunc_prog_types.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include + +#include "verifier_kfunc_prog_types.skel.h" + +void test_verifier_kfunc_prog_types(void) +{ + RUN_TESTS(verifier_kfunc_prog_types); +} diff --git a/tools/testing/selftests/bpf/prog_tests/wq.c b/tools/testing/selftests/bpf/prog_tests/wq.c new file mode 100644 index 0000000000..99e438fe12 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/wq.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Benjamin Tissoires */ +#include +#include "wq.skel.h" +#include "wq_failures.skel.h" + +void serial_test_wq(void) +{ + struct wq *wq_skel = NULL; + int err, prog_fd; + + LIBBPF_OPTS(bpf_test_run_opts, topts); + + RUN_TESTS(wq); + + /* re-run the success test to check if the timer was actually executed */ + + wq_skel = wq__open_and_load(); + if (!ASSERT_OK_PTR(wq_skel, "wq_skel_load")) + return; + + err = wq__attach(wq_skel); + if (!ASSERT_OK(err, "wq_attach")) + return; + + prog_fd = bpf_program__fd(wq_skel->progs.test_syscall_array_sleepable); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(topts.retval, 0, "test_run"); + + usleep(50); /* 10 usecs should be enough, but give it extra */ + + ASSERT_EQ(wq_skel->bss->ok_sleepable, (1 << 1), "ok_sleepable"); + wq__destroy(wq_skel); +} + +void serial_test_failures_wq(void) +{ + RUN_TESTS(wq_failures); +} diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c 
b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c index f09505f8b0..53d6ad8c22 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c @@ -222,7 +222,7 @@ static void test_xdp_adjust_frags_tail_grow(void) prog = bpf_object__next_program(obj, NULL); if (bpf_object__load(obj)) - return; + goto out; prog_fd = bpf_program__fd(prog); diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c index 05edcf32f5..f76b5d67a3 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c @@ -384,6 +384,8 @@ void test_xdp_metadata(void) SYS(out, "ip netns add " RX_NETNS_NAME); tok = open_netns(TX_NETNS_NAME); + if (!ASSERT_OK_PTR(tok, "setns")) + goto out; SYS(out, "ip link add numtxqueues 1 numrxqueues 1 " TX_NAME " type veth peer " RX_NAME " numtxqueues 1 numrxqueues 1"); SYS(out, "ip link set " RX_NAME " netns " RX_NETNS_NAME); @@ -400,6 +402,8 @@ void test_xdp_metadata(void) SYS(out, "ip -4 neigh add " RX_ADDR " lladdr " RX_MAC " dev " TX_NAME_VLAN); switch_ns_to_rx(&tok); + if (!ASSERT_OK_PTR(tok, "setns rx")) + goto out; SYS(out, "ip link set dev " RX_NAME " address " RX_MAC); SYS(out, "ip link set dev " RX_NAME " up"); @@ -449,6 +453,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_tx(&tok); + if (!ASSERT_OK_PTR(tok, "setns tx")) + goto out; /* Setup separate AF_XDP for TX interface nad send packet to the RX socket. */ tx_ifindex = if_nametoindex(TX_NAME); @@ -461,6 +467,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_rx(&tok); + if (!ASSERT_OK_PTR(tok, "setns rx")) + goto out; /* Verify packet sent from AF_XDP has proper metadata. 
*/ if (!ASSERT_GE(verify_xsk_metadata(&rx_xsk, true), 0, @@ -468,6 +476,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_tx(&tok); + if (!ASSERT_OK_PTR(tok, "setns tx")) + goto out; complete_tx(&tx_xsk); /* Now check metadata of packet, generated with network stack */ @@ -475,6 +485,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_rx(&tok); + if (!ASSERT_OK_PTR(tok, "setns rx")) + goto out; if (!ASSERT_GE(verify_xsk_metadata(&rx_xsk, false), 0, "verify_xsk_metadata")) @@ -498,6 +510,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_tx(&tok); + if (!ASSERT_OK_PTR(tok, "setns tx")) + goto out; /* Send packet to trigger . */ if (!ASSERT_GE(generate_packet(&tx_xsk, AF_XDP_CONSUMER_PORT), 0, @@ -505,6 +519,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_rx(&tok); + if (!ASSERT_OK_PTR(tok, "setns rx")) + goto out; while (!retries--) { if (bpf_obj2->bss->called) diff --git a/tools/testing/selftests/bpf/progs/arena_atomics.c b/tools/testing/selftests/bpf/progs/arena_atomics.c new file mode 100644 index 0000000000..55f1056320 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/arena_atomics.c @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ +#include +#include +#include +#include +#include "bpf_arena_common.h" + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, 10); /* number of pages */ +#ifdef __TARGET_ARCH_arm64 + __ulong(map_extra, 0x1ull << 32); /* start of mmap() region */ +#else + __ulong(map_extra, 0x1ull << 44); /* start of mmap() region */ +#endif +} arena SEC(".maps"); + +#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST) +bool skip_tests __attribute((__section__(".data"))) = false; +#else +bool skip_tests = true; +#endif + +__u32 pid = 0; + +#undef __arena +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) +#define __arena __attribute__((address_space(1))) +#else +#define __arena SEC(".addr_space.1") +#endif + +__u64 __arena add64_value = 1; +__u64 __arena add64_result = 0; +__u32 __arena add32_value = 1; +__u32 __arena add32_result = 0; +__u64 __arena add_stack_value_copy = 0; +__u64 __arena add_stack_result = 0; +__u64 __arena add_noreturn_value = 1; + +SEC("raw_tp/sys_enter") +int add(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + __u64 add_stack_value = 1; + + add64_result = __sync_fetch_and_add(&add64_value, 2); + add32_result = __sync_fetch_and_add(&add32_value, 2); + add_stack_result = __sync_fetch_and_add(&add_stack_value, 2); + add_stack_value_copy = add_stack_value; + __sync_fetch_and_add(&add_noreturn_value, 2); +#endif + + return 0; +} + +__s64 __arena sub64_value = 1; +__s64 __arena sub64_result = 0; +__s32 __arena sub32_value = 1; +__s32 __arena sub32_result = 0; +__s64 __arena sub_stack_value_copy = 0; +__s64 __arena sub_stack_result = 0; +__s64 __arena sub_noreturn_value = 1; + +SEC("raw_tp/sys_enter") +int sub(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + __u64 sub_stack_value = 1; + + sub64_result = __sync_fetch_and_sub(&sub64_value, 2); + sub32_result = 
__sync_fetch_and_sub(&sub32_value, 2); + sub_stack_result = __sync_fetch_and_sub(&sub_stack_value, 2); + sub_stack_value_copy = sub_stack_value; + __sync_fetch_and_sub(&sub_noreturn_value, 2); +#endif + + return 0; +} + +__u64 __arena and64_value = (0x110ull << 32); +__u32 __arena and32_value = 0x110; + +SEC("raw_tp/sys_enter") +int and(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + + __sync_fetch_and_and(&and64_value, 0x011ull << 32); + __sync_fetch_and_and(&and32_value, 0x011); +#endif + + return 0; +} + +__u32 __arena or32_value = 0x110; +__u64 __arena or64_value = (0x110ull << 32); + +SEC("raw_tp/sys_enter") +int or(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + __sync_fetch_and_or(&or64_value, 0x011ull << 32); + __sync_fetch_and_or(&or32_value, 0x011); +#endif + + return 0; +} + +__u64 __arena xor64_value = (0x110ull << 32); +__u32 __arena xor32_value = 0x110; + +SEC("raw_tp/sys_enter") +int xor(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + __sync_fetch_and_xor(&xor64_value, 0x011ull << 32); + __sync_fetch_and_xor(&xor32_value, 0x011); +#endif + + return 0; +} + +__u32 __arena cmpxchg32_value = 1; +__u32 __arena cmpxchg32_result_fail = 0; +__u32 __arena cmpxchg32_result_succeed = 0; +__u64 __arena cmpxchg64_value = 1; +__u64 __arena cmpxchg64_result_fail = 0; +__u64 __arena cmpxchg64_result_succeed = 0; + +SEC("raw_tp/sys_enter") +int cmpxchg(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + cmpxchg64_result_fail = __sync_val_compare_and_swap(&cmpxchg64_value, 0, 3); + cmpxchg64_result_succeed = __sync_val_compare_and_swap(&cmpxchg64_value, 1, 2); + + cmpxchg32_result_fail = __sync_val_compare_and_swap(&cmpxchg32_value, 0, 3); + cmpxchg32_result_succeed = __sync_val_compare_and_swap(&cmpxchg32_value, 1, 2); 
+#endif + + return 0; +} + +__u64 __arena xchg64_value = 1; +__u64 __arena xchg64_result = 0; +__u32 __arena xchg32_value = 1; +__u32 __arena xchg32_result = 0; + +SEC("raw_tp/sys_enter") +int xchg(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + __u64 val64 = 2; + __u32 val32 = 2; + + xchg64_result = __sync_lock_test_and_set(&xchg64_value, val64); + xchg32_result = __sync_lock_test_and_set(&xchg32_value, val32); +#endif + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/arena_list.c b/tools/testing/selftests/bpf/progs/arena_list.c index c0422c58ce..93bd0600eb 100644 --- a/tools/testing/selftests/bpf/progs/arena_list.c +++ b/tools/testing/selftests/bpf/progs/arena_list.c @@ -49,7 +49,7 @@ int arena_list_add(void *ctx) list_head = &global_head; - for (i = zero; i < cnt; cond_break, i++) { + for (i = zero; i < cnt && can_loop; i++) { struct elem __arena *n = bpf_alloc(sizeof(*n)); test_val++; diff --git a/tools/testing/selftests/bpf/progs/bind4_prog.c b/tools/testing/selftests/bpf/progs/bind4_prog.c index a487f60b73..b7ddf8ec4e 100644 --- a/tools/testing/selftests/bpf/progs/bind4_prog.c +++ b/tools/testing/selftests/bpf/progs/bind4_prog.c @@ -12,6 +12,8 @@ #include #include +#include "bind_prog.h" + #define SERV4_IP 0xc0a801feU /* 192.168.1.254 */ #define SERV4_PORT 4040 #define SERV4_REWRITE_IP 0x7f000001U /* 127.0.0.1 */ @@ -118,23 +120,23 @@ int bind_v4_prog(struct bpf_sock_addr *ctx) // u8 narrow loads: user_ip4 = 0; - user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[0] << 0; - user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[1] << 8; - user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[2] << 16; - user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[3] << 24; + user_ip4 |= load_byte(ctx->user_ip4, 0, sizeof(user_ip4)); + user_ip4 |= load_byte(ctx->user_ip4, 1, sizeof(user_ip4)); + user_ip4 |= load_byte(ctx->user_ip4, 2, sizeof(user_ip4)); + user_ip4 |= load_byte(ctx->user_ip4, 3, sizeof(user_ip4)); if 
(ctx->user_ip4 != user_ip4) return 0; user_port = 0; - user_port |= ((volatile __u8 *)&ctx->user_port)[0] << 0; - user_port |= ((volatile __u8 *)&ctx->user_port)[1] << 8; + user_port |= load_byte(ctx->user_port, 0, sizeof(user_port)); + user_port |= load_byte(ctx->user_port, 1, sizeof(user_port)); if (ctx->user_port != user_port) return 0; // u16 narrow loads: user_ip4 = 0; - user_ip4 |= ((volatile __u16 *)&ctx->user_ip4)[0] << 0; - user_ip4 |= ((volatile __u16 *)&ctx->user_ip4)[1] << 16; + user_ip4 |= load_word(ctx->user_ip4, 0, sizeof(user_ip4)); + user_ip4 |= load_word(ctx->user_ip4, 1, sizeof(user_ip4)); if (ctx->user_ip4 != user_ip4) return 0; @@ -156,4 +158,10 @@ int bind_v4_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/bind4") +int bind_v4_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bind6_prog.c b/tools/testing/selftests/bpf/progs/bind6_prog.c index d62cd9e9cf..501c3fc11d 100644 --- a/tools/testing/selftests/bpf/progs/bind6_prog.c +++ b/tools/testing/selftests/bpf/progs/bind6_prog.c @@ -12,6 +12,8 @@ #include #include +#include "bind_prog.h" + #define SERV6_IP_0 0xfaceb00c /* face:b00c:1234:5678::abcd */ #define SERV6_IP_1 0x12345678 #define SERV6_IP_2 0x00000000 @@ -129,25 +131,25 @@ int bind_v6_prog(struct bpf_sock_addr *ctx) // u8 narrow loads: for (i = 0; i < 4; i++) { user_ip6 = 0; - user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[0] << 0; - user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[1] << 8; - user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[2] << 16; - user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[3] << 24; + user_ip6 |= load_byte(ctx->user_ip6[i], 0, sizeof(user_ip6)); + user_ip6 |= load_byte(ctx->user_ip6[i], 1, sizeof(user_ip6)); + user_ip6 |= load_byte(ctx->user_ip6[i], 2, sizeof(user_ip6)); + user_ip6 |= load_byte(ctx->user_ip6[i], 3, sizeof(user_ip6)); if (ctx->user_ip6[i] != user_ip6) return 0; } user_port = 0; - user_port |= 
((volatile __u8 *)&ctx->user_port)[0] << 0; - user_port |= ((volatile __u8 *)&ctx->user_port)[1] << 8; + user_port |= load_byte(ctx->user_port, 0, sizeof(user_port)); + user_port |= load_byte(ctx->user_port, 1, sizeof(user_port)); if (ctx->user_port != user_port) return 0; // u16 narrow loads: for (i = 0; i < 4; i++) { user_ip6 = 0; - user_ip6 |= ((volatile __u16 *)&ctx->user_ip6[i])[0] << 0; - user_ip6 |= ((volatile __u16 *)&ctx->user_ip6[i])[1] << 16; + user_ip6 |= load_word(ctx->user_ip6[i], 0, sizeof(user_ip6)); + user_ip6 |= load_word(ctx->user_ip6[i], 1, sizeof(user_ip6)); if (ctx->user_ip6[i] != user_ip6) return 0; } @@ -173,4 +175,10 @@ int bind_v6_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/bind6") +int bind_v6_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bind_prog.h b/tools/testing/selftests/bpf/progs/bind_prog.h new file mode 100644 index 0000000000..e830caa940 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bind_prog.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __BIND_PROG_H__ +#define __BIND_PROG_H__ + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define load_byte(src, b, s) \ + (((volatile __u8 *)&(src))[b] << 8 * b) +#define load_word(src, w, s) \ + (((volatile __u16 *)&(src))[w] << 16 * w) +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define load_byte(src, b, s) \ + (((volatile __u8 *)&(src))[(b) + (sizeof(src) - (s))] << 8 * ((s) - (b) - 1)) +#define load_word(src, w, s) \ + (((volatile __u16 *)&(src))[w] << 16 * (((s) / 2) - (w) - 1)) +#else +# error "Fix your compiler's __BYTE_ORDER__?!" 
+#endif + +#endif diff --git a/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c new file mode 100644 index 0000000000..1654a530aa --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c @@ -0,0 +1,189 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* Highlights: + * 1. The major difference between this bpf program and tcp_cubic.c + * is that this bpf program relies on `cong_control` rather than + * `cong_avoid` in the struct tcp_congestion_ops. + * 2. Logic such as tcp_cwnd_reduction, tcp_cong_avoid, and + * tcp_update_pacing_rate is bypassed when `cong_control` is + * defined, so moving these logic to `cong_control`. + * 3. WARNING: This bpf program is NOT the same as tcp_cubic.c. + * The main purpose is to show use cases of the arguments in + * `cong_control`. For simplicity's sake, it reuses tcp cubic's + * kernel functions. + */ + +#include "bpf_tracing_net.h" +#include +#include + +#define USEC_PER_SEC 1000000UL +#define TCP_PACING_SS_RATIO (200) +#define TCP_PACING_CA_RATIO (120) +#define TCP_REORDERING (12) + +#define min(a, b) ((a) < (b) ? (a) : (b)) +#define max(a, b) ((a) > (b) ? 
(a) : (b)) +#define after(seq2, seq1) before(seq1, seq2) + +extern void cubictcp_init(struct sock *sk) __ksym; +extern void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym; +extern __u32 cubictcp_recalc_ssthresh(struct sock *sk) __ksym; +extern void cubictcp_state(struct sock *sk, __u8 new_state) __ksym; +extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym; +extern void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) __ksym; +extern void cubictcp_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym; + +static bool before(__u32 seq1, __u32 seq2) +{ + return (__s32)(seq1-seq2) < 0; +} + +static __u64 div64_u64(__u64 dividend, __u64 divisor) +{ + return dividend / divisor; +} + +static void tcp_update_pacing_rate(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + __u64 rate; + + /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ + rate = (__u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3); + + /* current rate is (cwnd * mss) / srtt + * In Slow Start [1], set sk_pacing_rate to 200 % the current rate. + * In Congestion Avoidance phase, set it to 120 % the current rate. + * + * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh) + * If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching + * end of slow start and should slow down. 
+ */ + if (tp->snd_cwnd < tp->snd_ssthresh / 2) + rate *= TCP_PACING_SS_RATIO; + else + rate *= TCP_PACING_CA_RATIO; + + rate *= max(tp->snd_cwnd, tp->packets_out); + + if (tp->srtt_us) + rate = div64_u64(rate, (__u64)tp->srtt_us); + + sk->sk_pacing_rate = min(rate, sk->sk_max_pacing_rate); +} + +static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, + int newly_lost, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + int sndcnt = 0; + __u32 pkts_in_flight = tp->packets_out - (tp->sacked_out + tp->lost_out) + tp->retrans_out; + int delta = tp->snd_ssthresh - pkts_in_flight; + + if (newly_acked_sacked <= 0 || !tp->prior_cwnd) + return; + + __u32 prr_delivered = tp->prr_delivered + newly_acked_sacked; + + if (delta < 0) { + __u64 dividend = + (__u64)tp->snd_ssthresh * prr_delivered + tp->prior_cwnd - 1; + sndcnt = (__u32)div64_u64(dividend, (__u64)tp->prior_cwnd) - tp->prr_out; + } else { + sndcnt = max(prr_delivered - tp->prr_out, newly_acked_sacked); + if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost) + sndcnt++; + sndcnt = min(delta, sndcnt); + } + /* Force a fast retransmit upon entering fast recovery */ + sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1)); + tp->snd_cwnd = pkts_in_flight + sndcnt; +} + +/* Decide wheather to run the increase function of congestion control. 
*/ +static bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) +{ + if (tcp_sk(sk)->reordering > TCP_REORDERING) + return flag & FLAG_FORWARD_PROGRESS; + + return flag & FLAG_DATA_ACKED; +} + +SEC("struct_ops") +void BPF_PROG(bpf_cubic_init, struct sock *sk) +{ + cubictcp_init(sk); +} + +SEC("struct_ops") +void BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event) +{ + cubictcp_cwnd_event(sk, event); +} + +SEC("struct_ops") +void BPF_PROG(bpf_cubic_cong_control, struct sock *sk, __u32 ack, int flag, + const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (((1<icsk_ca_state)) { + /* Reduce cwnd if state mandates */ + tcp_cwnd_reduction(sk, rs->acked_sacked, rs->losses, flag); + + if (!before(tp->snd_una, tp->high_seq)) { + /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ + if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH && + inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) { + tp->snd_cwnd = tp->snd_ssthresh; + tp->snd_cwnd_stamp = tcp_jiffies32; + } + } + } else if (tcp_may_raise_cwnd(sk, flag)) { + /* Advance cwnd if state allows */ + cubictcp_cong_avoid(sk, ack, rs->acked_sacked); + tp->snd_cwnd_stamp = tcp_jiffies32; + } + + tcp_update_pacing_rate(sk); +} + +SEC("struct_ops") +__u32 BPF_PROG(bpf_cubic_recalc_ssthresh, struct sock *sk) +{ + return cubictcp_recalc_ssthresh(sk); +} + +SEC("struct_ops") +void BPF_PROG(bpf_cubic_state, struct sock *sk, __u8 new_state) +{ + cubictcp_state(sk, new_state); +} + +SEC("struct_ops") +void BPF_PROG(bpf_cubic_acked, struct sock *sk, const struct ack_sample *sample) +{ + cubictcp_acked(sk, sample); +} + +SEC("struct_ops") +__u32 BPF_PROG(bpf_cubic_undo_cwnd, struct sock *sk) +{ + return tcp_reno_undo_cwnd(sk); +} + +SEC(".struct_ops") +struct tcp_congestion_ops cc_cubic = { + .init = (void *)bpf_cubic_init, + .ssthresh = (void *)bpf_cubic_recalc_ssthresh, + .cong_control = (void *)bpf_cubic_cong_control, + .set_state = (void *)bpf_cubic_state, + .undo_cwnd = (void 
*)bpf_cubic_undo_cwnd, + .cwnd_event = (void *)bpf_cubic_cwnd_event, + .pkts_acked = (void *)bpf_cubic_acked, + .name = "bpf_cc_cubic", +}; + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cubic.c index c997e3e3d3..d665b8a15c 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cubic.c @@ -14,14 +14,22 @@ * "ca->ack_cnt / delta" operation. */ -#include -#include -#include -#include "bpf_tcp_helpers.h" +#include "bpf_tracing_net.h" +#include char _license[] SEC("license") = "GPL"; #define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi) +#define min(a, b) ((a) < (b) ? (a) : (b)) +#define max(a, b) ((a) > (b) ? (a) : (b)) +static bool before(__u32 seq1, __u32 seq2) +{ + return (__s32)(seq1-seq2) < 0; +} +#define after(seq2, seq1) before(seq1, seq2) + +extern __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) __ksym; +extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym; #define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation * max_cwnd = snd_cwnd * beta @@ -70,7 +78,7 @@ static const __u64 cube_factor = (__u64)(1ull << (10+3*BICTCP_HZ)) / (bic_scale * 10); /* BIC TCP Parameters */ -struct bictcp { +struct bpf_bictcp { __u32 cnt; /* increase cwnd by 1 after ACKs */ __u32 last_max_cwnd; /* last maximum snd_cwnd */ __u32 last_cwnd; /* the last snd_cwnd */ @@ -91,7 +99,7 @@ struct bictcp { __u32 curr_rtt; /* the minimum rtt of current round */ }; -static inline void bictcp_reset(struct bictcp *ca) +static void bictcp_reset(struct bpf_bictcp *ca) { ca->cnt = 0; ca->last_max_cwnd = 0; @@ -112,7 +120,7 @@ extern unsigned long CONFIG_HZ __kconfig; #define USEC_PER_SEC 1000000UL #define USEC_PER_JIFFY (USEC_PER_SEC / HZ) -static __always_inline __u64 div64_u64(__u64 dividend, __u64 divisor) +static __u64 div64_u64(__u64 dividend, __u64 divisor) { return dividend / divisor; } @@ -120,7 +128,7 
@@ static __always_inline __u64 div64_u64(__u64 dividend, __u64 divisor) #define div64_ul div64_u64 #define BITS_PER_U64 (sizeof(__u64) * 8) -static __always_inline int fls64(__u64 x) +static int fls64(__u64 x) { int num = BITS_PER_U64 - 1; @@ -153,15 +161,15 @@ static __always_inline int fls64(__u64 x) return num + 1; } -static __always_inline __u32 bictcp_clock_us(const struct sock *sk) +static __u32 bictcp_clock_us(const struct sock *sk) { return tcp_sk(sk)->tcp_mstamp; } -static __always_inline void bictcp_hystart_reset(struct sock *sk) +static void bictcp_hystart_reset(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct bictcp *ca = inet_csk_ca(sk); + struct bpf_bictcp *ca = inet_csk_ca(sk); ca->round_start = ca->last_ack = bictcp_clock_us(sk); ca->end_seq = tp->snd_nxt; @@ -169,11 +177,10 @@ static __always_inline void bictcp_hystart_reset(struct sock *sk) ca->sample_cnt = 0; } -/* "struct_ops/" prefix is a requirement */ -SEC("struct_ops/bpf_cubic_init") +SEC("struct_ops") void BPF_PROG(bpf_cubic_init, struct sock *sk) { - struct bictcp *ca = inet_csk_ca(sk); + struct bpf_bictcp *ca = inet_csk_ca(sk); bictcp_reset(ca); @@ -184,12 +191,11 @@ void BPF_PROG(bpf_cubic_init, struct sock *sk) tcp_sk(sk)->snd_ssthresh = initial_ssthresh; } -/* "struct_ops" prefix is a requirement */ -SEC("struct_ops/bpf_cubic_cwnd_event") +SEC("struct_ops") void BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_TX_START) { - struct bictcp *ca = inet_csk_ca(sk); + struct bpf_bictcp *ca = inet_csk_ca(sk); __u32 now = tcp_jiffies32; __s32 delta; @@ -230,7 +236,7 @@ static const __u8 v[] = { * Newton-Raphson iteration. * Avg err ~= 0.195% */ -static __always_inline __u32 cubic_root(__u64 a) +static __u32 cubic_root(__u64 a) { __u32 x, b, shift; @@ -263,8 +269,7 @@ static __always_inline __u32 cubic_root(__u64 a) /* * Compute congestion window to use. 
*/ -static __always_inline void bictcp_update(struct bictcp *ca, __u32 cwnd, - __u32 acked) +static void bictcp_update(struct bpf_bictcp *ca, __u32 cwnd, __u32 acked) { __u32 delta, bic_target, max_cnt; __u64 offs, t; @@ -377,11 +382,11 @@ tcp_friendliness: ca->cnt = max(ca->cnt, 2U); } -/* Or simply use the BPF_STRUCT_OPS to avoid the SEC boiler plate. */ -void BPF_STRUCT_OPS(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) +SEC("struct_ops") +void BPF_PROG(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) { struct tcp_sock *tp = tcp_sk(sk); - struct bictcp *ca = inet_csk_ca(sk); + struct bpf_bictcp *ca = inet_csk_ca(sk); if (!tcp_is_cwnd_limited(sk)) return; @@ -397,10 +402,11 @@ void BPF_STRUCT_OPS(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acke tcp_cong_avoid_ai(tp, ca->cnt, acked); } -__u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk) +SEC("struct_ops") +__u32 BPF_PROG(bpf_cubic_recalc_ssthresh, struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - struct bictcp *ca = inet_csk_ca(sk); + struct bpf_bictcp *ca = inet_csk_ca(sk); ca->epoch_start = 0; /* end of epoch */ @@ -414,7 +420,8 @@ __u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk) return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); } -void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state) +SEC("struct_ops") +void BPF_PROG(bpf_cubic_state, struct sock *sk, __u8 new_state) { if (new_state == TCP_CA_Loss) { bictcp_reset(inet_csk_ca(sk)); @@ -433,7 +440,7 @@ void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state) * We apply another 100% factor because @rate is doubled at this point. * We cap the cushion to 1ms. 
*/ -static __always_inline __u32 hystart_ack_delay(struct sock *sk) +static __u32 hystart_ack_delay(struct sock *sk) { unsigned long rate; @@ -444,10 +451,10 @@ static __always_inline __u32 hystart_ack_delay(struct sock *sk) div64_ul((__u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate)); } -static __always_inline void hystart_update(struct sock *sk, __u32 delay) +static void hystart_update(struct sock *sk, __u32 delay) { struct tcp_sock *tp = tcp_sk(sk); - struct bictcp *ca = inet_csk_ca(sk); + struct bpf_bictcp *ca = inet_csk_ca(sk); __u32 threshold; if (hystart_detect & HYSTART_ACK_TRAIN) { @@ -492,11 +499,11 @@ static __always_inline void hystart_update(struct sock *sk, __u32 delay) int bpf_cubic_acked_called = 0; -void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk, - const struct ack_sample *sample) +SEC("struct_ops") +void BPF_PROG(bpf_cubic_acked, struct sock *sk, const struct ack_sample *sample) { const struct tcp_sock *tp = tcp_sk(sk); - struct bictcp *ca = inet_csk_ca(sk); + struct bpf_bictcp *ca = inet_csk_ca(sk); __u32 delay; bpf_cubic_acked_called = 1; @@ -524,7 +531,8 @@ void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk, extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym; -__u32 BPF_STRUCT_OPS(bpf_cubic_undo_cwnd, struct sock *sk) +SEC("struct_ops") +__u32 BPF_PROG(bpf_cubic_undo_cwnd, struct sock *sk) { return tcp_reno_undo_cwnd(sk); } diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c index 460682759a..3c9ffe3403 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c @@ -6,15 +6,23 @@ * the kernel BPF logic. */ -#include -#include -#include -#include -#include -#include +#include "bpf_tracing_net.h" #include #include -#include "bpf_tcp_helpers.h" + +#ifndef EBUSY +#define EBUSY 16 +#endif +#define min(a, b) ((a) < (b) ? (a) : (b)) +#define max(a, b) ((a) > (b) ? 
(a) : (b)) +#define min_not_zero(x, y) ({ \ + typeof(x) __x = (x); \ + typeof(y) __y = (y); \ + __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) +static bool before(__u32 seq1, __u32 seq2) +{ + return (__s32)(seq1-seq2) < 0; +} char _license[] SEC("license") = "GPL"; @@ -35,7 +43,7 @@ struct { #define DCTCP_MAX_ALPHA 1024U -struct dctcp { +struct bpf_dctcp { __u32 old_delivered; __u32 old_delivered_ce; __u32 prior_rcv_nxt; @@ -48,8 +56,7 @@ struct dctcp { static unsigned int dctcp_shift_g = 4; /* g = 1/2^4 */ static unsigned int dctcp_alpha_on_init = DCTCP_MAX_ALPHA; -static __always_inline void dctcp_reset(const struct tcp_sock *tp, - struct dctcp *ca) +static void dctcp_reset(const struct tcp_sock *tp, struct bpf_dctcp *ca) { ca->next_seq = tp->snd_nxt; @@ -57,11 +64,11 @@ static __always_inline void dctcp_reset(const struct tcp_sock *tp, ca->old_delivered_ce = tp->delivered_ce; } -SEC("struct_ops/dctcp_init") +SEC("struct_ops") void BPF_PROG(dctcp_init, struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - struct dctcp *ca = inet_csk_ca(sk); + struct bpf_dctcp *ca = inet_csk_ca(sk); int *stg; if (!(tp->ecn_flags & TCP_ECN_OK) && fallback[0]) { @@ -104,21 +111,21 @@ void BPF_PROG(dctcp_init, struct sock *sk) dctcp_reset(tp, ca); } -SEC("struct_ops/dctcp_ssthresh") +SEC("struct_ops") __u32 BPF_PROG(dctcp_ssthresh, struct sock *sk) { - struct dctcp *ca = inet_csk_ca(sk); + struct bpf_dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); ca->loss_cwnd = tp->snd_cwnd; return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U); } -SEC("struct_ops/dctcp_update_alpha") +SEC("struct_ops") void BPF_PROG(dctcp_update_alpha, struct sock *sk, __u32 flags) { const struct tcp_sock *tp = tcp_sk(sk); - struct dctcp *ca = inet_csk_ca(sk); + struct bpf_dctcp *ca = inet_csk_ca(sk); /* Expired RTT */ if (!before(tp->snd_una, ca->next_seq)) { @@ -144,16 +151,16 @@ void BPF_PROG(dctcp_update_alpha, struct sock *sk, __u32 flags) } } -static 
__always_inline void dctcp_react_to_loss(struct sock *sk) +static void dctcp_react_to_loss(struct sock *sk) { - struct dctcp *ca = inet_csk_ca(sk); + struct bpf_dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); ca->loss_cwnd = tp->snd_cwnd; tp->snd_ssthresh = max(tp->snd_cwnd >> 1U, 2U); } -SEC("struct_ops/dctcp_state") +SEC("struct_ops") void BPF_PROG(dctcp_state, struct sock *sk, __u8 new_state) { if (new_state == TCP_CA_Recovery && @@ -164,7 +171,7 @@ void BPF_PROG(dctcp_state, struct sock *sk, __u8 new_state) */ } -static __always_inline void dctcp_ece_ack_cwr(struct sock *sk, __u32 ce_state) +static void dctcp_ece_ack_cwr(struct sock *sk, __u32 ce_state) { struct tcp_sock *tp = tcp_sk(sk); @@ -179,9 +186,8 @@ static __always_inline void dctcp_ece_ack_cwr(struct sock *sk, __u32 ce_state) * S: 0 <- last pkt was non-CE * 1 <- last pkt was CE */ -static __always_inline -void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt, - __u32 *prior_rcv_nxt, __u32 *ce_state) +static void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt, + __u32 *prior_rcv_nxt, __u32 *ce_state) { __u32 new_ce_state = (evt == CA_EVENT_ECN_IS_CE) ? 
1 : 0; @@ -201,10 +207,10 @@ void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt, dctcp_ece_ack_cwr(sk, new_ce_state); } -SEC("struct_ops/dctcp_cwnd_event") +SEC("struct_ops") void BPF_PROG(dctcp_cwnd_event, struct sock *sk, enum tcp_ca_event ev) { - struct dctcp *ca = inet_csk_ca(sk); + struct bpf_dctcp *ca = inet_csk_ca(sk); switch (ev) { case CA_EVENT_ECN_IS_CE: @@ -220,17 +226,17 @@ void BPF_PROG(dctcp_cwnd_event, struct sock *sk, enum tcp_ca_event ev) } } -SEC("struct_ops/dctcp_cwnd_undo") +SEC("struct_ops") __u32 BPF_PROG(dctcp_cwnd_undo, struct sock *sk) { - const struct dctcp *ca = inet_csk_ca(sk); + const struct bpf_dctcp *ca = inet_csk_ca(sk); return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); } extern void tcp_reno_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym; -SEC("struct_ops/dctcp_reno_cong_avoid") +SEC("struct_ops") void BPF_PROG(dctcp_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) { tcp_reno_cong_avoid(sk, ack, acked); diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c b/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c index d836f7c372..c91763f248 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c @@ -1,19 +1,15 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021 Facebook */ -#include -#include -#include -#include -#include +#include "bpf_tracing_net.h" #include #include -#include "bpf_tcp_helpers.h" char _license[] SEC("license") = "GPL"; const char cubic[] = "cubic"; -void BPF_STRUCT_OPS(dctcp_nouse_release, struct sock *sk) +SEC("struct_ops") +void BPF_PROG(dctcp_nouse_release, struct sock *sk) { bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, (void *)cubic, sizeof(cubic)); diff --git a/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c b/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c index 2ecd833dcd..8a7a4c1b54 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c +++ 
b/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c @@ -1,14 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include -#include +#include "bpf_tracing_net.h" #include -#include "bpf_tcp_helpers.h" char _license[] SEC("license") = "X"; -void BPF_STRUCT_OPS(nogpltcp_init, struct sock *sk) +SEC("struct_ops") +void BPF_PROG(nogpltcp_init, struct sock *sk) { } diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h index 7001965d1c..59843b430f 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h @@ -2,6 +2,9 @@ #ifndef __BPF_TRACING_NET_H__ #define __BPF_TRACING_NET_H__ +#include +#include + #define AF_INET 2 #define AF_INET6 10 @@ -22,6 +25,7 @@ #define IP_TOS 1 +#define SOL_IPV6 41 #define IPV6_TCLASS 67 #define IPV6_AUTOFLOWLABEL 70 @@ -46,6 +50,13 @@ #define TCP_CA_NAME_MAX 16 #define TCP_NAGLE_OFF 1 +#define TCP_ECN_OK 1 +#define TCP_ECN_QUEUE_CWR 2 +#define TCP_ECN_DEMAND_CWR 4 +#define TCP_ECN_SEEN 8 + +#define TCP_CONG_NEEDS_ECN 0x2 + #define ICSK_TIME_RETRANS 1 #define ICSK_TIME_PROBE0 3 #define ICSK_TIME_LOSS_PROBE 5 @@ -80,6 +91,14 @@ #define TCP_INFINITE_SSTHRESH 0x7fffffff #define TCP_PINGPONG_THRESH 3 +#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ +#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ +#define FLAG_DATA_SACKED 0x20 /* New SACK. 
*/ +#define FLAG_SND_UNA_ADVANCED \ + 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ +#define FLAG_ACKED (FLAG_DATA_ACKED | FLAG_SYN_ACKED) +#define FLAG_FORWARD_PROGRESS (FLAG_ACKED | FLAG_DATA_SACKED) + #define fib_nh_dev nh_common.nhc_dev #define fib_nh_gw_family nh_common.nhc_gw_family #define fib_nh_gw6 nh_common.nhc_gw.ipv6 @@ -119,4 +138,37 @@ #define tw_v6_daddr __tw_common.skc_v6_daddr #define tw_v6_rcv_saddr __tw_common.skc_v6_rcv_saddr +#define tcp_jiffies32 ((__u32)bpf_jiffies64()) + +static inline struct inet_connection_sock *inet_csk(const struct sock *sk) +{ + return (struct inet_connection_sock *)sk; +} + +static inline void *inet_csk_ca(const struct sock *sk) +{ + return (void *)inet_csk(sk)->icsk_ca_priv; +} + +static inline struct tcp_sock *tcp_sk(const struct sock *sk) +{ + return (struct tcp_sock *)sk; +} + +static inline bool tcp_in_slow_start(const struct tcp_sock *tp) +{ + return tp->snd_cwnd < tp->snd_ssthresh; +} + +static inline bool tcp_is_cwnd_limited(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + /* If in slow start, ensure cwnd grows to twice what was ACKed. 
*/ + if (tcp_in_slow_start(tp)) + return tp->snd_cwnd < 2 * tp->max_packets_out; + + return !!BPF_CORE_READ_BITFIELD(tp, is_cwnd_limited); +} + #endif diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_multidim.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_multidim.c index ba97165bdb..a657651eba 100644 --- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_multidim.c +++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_multidim.c @@ -14,9 +14,9 @@ typedef int *ptr_arr_t[6]; typedef int *ptr_multiarr_t[7][8][9][10]; -typedef int * (*fn_ptr_arr_t[11])(); +typedef int * (*fn_ptr_arr_t[11])(void); -typedef int * (*fn_ptr_multiarr_t[12][13])(); +typedef int * (*fn_ptr_multiarr_t[12][13])(void); struct root_struct { arr_t _1; diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c index ad21ee8c7e..29d01fff32 100644 --- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c +++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c @@ -100,7 +100,7 @@ typedef void (*printf_fn_t)(const char *, ...); * `int -> char *` function and returns pointer to a char. 
Equivalent: * typedef char * (*fn_input_t)(int); * typedef char * (*fn_output_outer_t)(fn_input_t); - * typedef const fn_output_outer_t (* fn_output_inner_t)(); + * typedef const fn_output_outer_t (* fn_output_inner_t)(void); * typedef const fn_output_inner_t fn_ptr_arr2_t[5]; */ /* ----- START-EXPECTED-OUTPUT ----- */ @@ -127,7 +127,7 @@ typedef void (* (*signal_t)(int, void (*)(int)))(int); typedef char * (*fn_ptr_arr1_t[10])(int **); -typedef char * (* (* const fn_ptr_arr2_t[5])())(char * (*)(int)); +typedef char * (* (* const fn_ptr_arr2_t[5])(void))(char * (*)(int)); struct struct_w_typedefs { int_t a; diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h b/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h index 22914a70db..73ba32e9a6 100644 --- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h +++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h @@ -13,7 +13,7 @@ struct __cgrps_kfunc_map_value { struct cgroup __kptr * cgrp; }; -struct hash_map { +struct { __uint(type, BPF_MAP_TYPE_HASH); __type(key, int); __type(value, struct __cgrps_kfunc_map_value); diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index 7ef49ec048..9e9ebf27b8 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -14,8 +14,6 @@ #include #include -#include "bpf_tcp_helpers.h" - #define SRC_REWRITE_IP4 0x7f000004U #define DST_REWRITE_IP4 0x7f000001U #define DST_REWRITE_PORT4 4444 @@ -32,6 +30,10 @@ #define IFNAMSIZ 16 #endif +#ifndef SOL_TCP +#define SOL_TCP 6 +#endif + __attribute__ ((noinline)) __weak int do_bind(struct bpf_sock_addr *ctx) { @@ -197,4 +199,10 @@ int connect_v4_prog(struct bpf_sock_addr *ctx) return do_bind(ctx) ? 
1 : 0; } +SEC("cgroup/connect4") +int connect_v4_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/connect6_prog.c b/tools/testing/selftests/bpf/progs/connect6_prog.c index 40266d2c73..e98573b00d 100644 --- a/tools/testing/selftests/bpf/progs/connect6_prog.c +++ b/tools/testing/selftests/bpf/progs/connect6_prog.c @@ -90,4 +90,10 @@ int connect_v6_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/connect6") +int connect_v6_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/connect_unix_prog.c b/tools/testing/selftests/bpf/progs/connect_unix_prog.c index 2ef0e0c46d..ba60adadb3 100644 --- a/tools/testing/selftests/bpf/progs/connect_unix_prog.c +++ b/tools/testing/selftests/bpf/progs/connect_unix_prog.c @@ -36,4 +36,10 @@ int connect_unix_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/connect_unix") +int connect_unix_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/cpumask_common.h b/tools/testing/selftests/bpf/progs/cpumask_common.h index c705d8112a..b979e91f55 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_common.h +++ b/tools/testing/selftests/bpf/progs/cpumask_common.h @@ -9,7 +9,7 @@ int err; -#define private(name) SEC(".bss." #name) __hidden __attribute__((aligned(8))) +#define private(name) SEC(".bss." 
#name) __attribute__((aligned(8))) private(MASK) static struct bpf_cpumask __kptr * global_mask; struct __cpumask_map_value { diff --git a/tools/testing/selftests/bpf/progs/cpumask_failure.c b/tools/testing/selftests/bpf/progs/cpumask_failure.c index a9bf6ea336..a988d2823b 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_failure.c +++ b/tools/testing/selftests/bpf/progs/cpumask_failure.c @@ -61,11 +61,8 @@ SEC("tp_btf/task_newtask") __failure __msg("bpf_cpumask_set_cpu args#1 expected pointer to STRUCT bpf_cpumask") int BPF_PROG(test_mutate_cpumask, struct task_struct *task, u64 clone_flags) { - struct bpf_cpumask *cpumask; - /* Can't set the CPU of a non-struct bpf_cpumask. */ bpf_cpumask_set_cpu(0, (struct bpf_cpumask *)task->cpus_ptr); - __sink(cpumask); return 0; } diff --git a/tools/testing/selftests/bpf/progs/crypto_basic.c b/tools/testing/selftests/bpf/progs/crypto_basic.c new file mode 100644 index 0000000000..8cf7168b42 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/crypto_basic.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
*/ + +#include "vmlinux.h" +#include +#include +#include "bpf_misc.h" +#include "bpf_kfuncs.h" +#include "crypto_common.h" + +int status; +SEC("syscall") +int crypto_release(void *ctx) +{ + struct bpf_crypto_params params = { + .type = "skcipher", + .algo = "ecb(aes)", + .key_len = 16, + }; + + struct bpf_crypto_ctx *cctx; + int err = 0; + + status = 0; + + cctx = bpf_crypto_ctx_create(¶ms, sizeof(params), &err); + + if (!cctx) { + status = err; + return 0; + } + + bpf_crypto_ctx_release(cctx); + + return 0; +} + +SEC("syscall") +__failure __msg("Unreleased reference") +int crypto_acquire(void *ctx) +{ + struct bpf_crypto_params params = { + .type = "skcipher", + .algo = "ecb(aes)", + .key_len = 16, + }; + struct bpf_crypto_ctx *cctx; + int err = 0; + + status = 0; + + cctx = bpf_crypto_ctx_create(¶ms, sizeof(params), &err); + + if (!cctx) { + status = err; + return 0; + } + + cctx = bpf_crypto_ctx_acquire(cctx); + if (!cctx) + return -EINVAL; + + bpf_crypto_ctx_release(cctx); + + return 0; +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/crypto_bench.c b/tools/testing/selftests/bpf/progs/crypto_bench.c new file mode 100644 index 0000000000..e61fe08822 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/crypto_bench.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ + +#include "vmlinux.h" +#include "bpf_tracing_net.h" +#include +#include +#include +#include "bpf_misc.h" +#include "bpf_kfuncs.h" +#include "crypto_common.h" + +const volatile unsigned int len = 16; +char cipher[128] = {}; +u32 key_len, authsize; +char dst[256] = {}; +u8 key[256] = {}; +long hits = 0; +int status; + +SEC("syscall") +int crypto_setup(void *args) +{ + struct bpf_crypto_ctx *cctx; + struct bpf_crypto_params params = { + .type = "skcipher", + .key_len = key_len, + .authsize = authsize, + }; + int err = 0; + + status = 0; + + if (!cipher[0] || !key_len || key_len > 256) { + status = -EINVAL; + return 0; + } + + __builtin_memcpy(¶ms.algo, cipher, sizeof(cipher)); + __builtin_memcpy(¶ms.key, key, sizeof(key)); + cctx = bpf_crypto_ctx_create(¶ms, sizeof(params), &err); + + if (!cctx) { + status = err; + return 0; + } + + err = crypto_ctx_insert(cctx); + if (err && err != -EEXIST) + status = err; + + return 0; +} + +SEC("tc") +int crypto_encrypt(struct __sk_buff *skb) +{ + struct __crypto_ctx_value *v; + struct bpf_crypto_ctx *ctx; + struct bpf_dynptr psrc, pdst, iv; + + v = crypto_ctx_value_lookup(); + if (!v) { + status = -ENOENT; + return 0; + } + + ctx = v->ctx; + if (!ctx) { + status = -ENOENT; + return 0; + } + + bpf_dynptr_from_skb(skb, 0, &psrc); + bpf_dynptr_from_mem(dst, len, 0, &pdst); + bpf_dynptr_from_mem(dst, 0, 0, &iv); + + status = bpf_crypto_encrypt(ctx, &psrc, &pdst, &iv); + __sync_add_and_fetch(&hits, 1); + + return 0; +} + +SEC("tc") +int crypto_decrypt(struct __sk_buff *skb) +{ + struct bpf_dynptr psrc, pdst, iv; + struct __crypto_ctx_value *v; + struct bpf_crypto_ctx *ctx; + + v = crypto_ctx_value_lookup(); + if (!v) + return -ENOENT; + + ctx = v->ctx; + if (!ctx) + return -ENOENT; + + bpf_dynptr_from_skb(skb, 0, &psrc); + bpf_dynptr_from_mem(dst, len, 0, &pdst); + bpf_dynptr_from_mem(dst, 0, 0, &iv); + + status = bpf_crypto_decrypt(ctx, &psrc, &pdst, &iv); + __sync_add_and_fetch(&hits, 1); + + return 0; +} + +char __license[] 
SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/crypto_common.h b/tools/testing/selftests/bpf/progs/crypto_common.h new file mode 100644 index 0000000000..57dd7a68a8 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/crypto_common.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#ifndef _CRYPTO_COMMON_H +#define _CRYPTO_COMMON_H + +#include "errno.h" +#include + +struct bpf_crypto_ctx *bpf_crypto_ctx_create(const struct bpf_crypto_params *params, + u32 params__sz, int *err) __ksym; +struct bpf_crypto_ctx *bpf_crypto_ctx_acquire(struct bpf_crypto_ctx *ctx) __ksym; +void bpf_crypto_ctx_release(struct bpf_crypto_ctx *ctx) __ksym; +int bpf_crypto_encrypt(struct bpf_crypto_ctx *ctx, const struct bpf_dynptr *src, + const struct bpf_dynptr *dst, const struct bpf_dynptr *iv) __ksym; +int bpf_crypto_decrypt(struct bpf_crypto_ctx *ctx, const struct bpf_dynptr *src, + const struct bpf_dynptr *dst, const struct bpf_dynptr *iv) __ksym; + +struct __crypto_ctx_value { + struct bpf_crypto_ctx __kptr * ctx; +}; + +struct array_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, struct __crypto_ctx_value); + __uint(max_entries, 1); +} __crypto_ctx_map SEC(".maps"); + +static inline struct __crypto_ctx_value *crypto_ctx_value_lookup(void) +{ + u32 key = 0; + + return bpf_map_lookup_elem(&__crypto_ctx_map, &key); +} + +static inline int crypto_ctx_insert(struct bpf_crypto_ctx *ctx) +{ + struct __crypto_ctx_value local, *v; + struct bpf_crypto_ctx *old; + u32 key = 0; + int err; + + local.ctx = NULL; + err = bpf_map_update_elem(&__crypto_ctx_map, &key, &local, 0); + if (err) { + bpf_crypto_ctx_release(ctx); + return err; + } + + v = bpf_map_lookup_elem(&__crypto_ctx_map, &key); + if (!v) { + bpf_crypto_ctx_release(ctx); + return -ENOENT; + } + + old = bpf_kptr_xchg(&v->ctx, ctx); + if (old) { + bpf_crypto_ctx_release(old); + return -EEXIST; + } + + return 0; 
+} + +#endif /* _CRYPTO_COMMON_H */ diff --git a/tools/testing/selftests/bpf/progs/crypto_sanity.c b/tools/testing/selftests/bpf/progs/crypto_sanity.c new file mode 100644 index 0000000000..1be0a3fa5e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/crypto_sanity.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include "vmlinux.h" +#include "bpf_tracing_net.h" +#include +#include +#include +#include "bpf_misc.h" +#include "bpf_kfuncs.h" +#include "crypto_common.h" + +unsigned char key[256] = {}; +u16 udp_test_port = 7777; +u32 authsize, key_len; +char algo[128] = {}; +char dst[16] = {}; +int status; + +static int skb_dynptr_validate(struct __sk_buff *skb, struct bpf_dynptr *psrc) +{ + struct ipv6hdr ip6h; + struct udphdr udph; + u32 offset; + + if (skb->protocol != __bpf_constant_htons(ETH_P_IPV6)) + return -1; + + if (bpf_skb_load_bytes(skb, ETH_HLEN, &ip6h, sizeof(ip6h))) + return -1; + + if (ip6h.nexthdr != IPPROTO_UDP) + return -1; + + if (bpf_skb_load_bytes(skb, ETH_HLEN + sizeof(ip6h), &udph, sizeof(udph))) + return -1; + + if (udph.dest != __bpf_htons(udp_test_port)) + return -1; + + offset = ETH_HLEN + sizeof(ip6h) + sizeof(udph); + if (skb->len < offset + 16) + return -1; + + /* let's make sure that 16 bytes of payload are in the linear part of skb */ + bpf_skb_pull_data(skb, offset + 16); + bpf_dynptr_from_skb(skb, 0, psrc); + bpf_dynptr_adjust(psrc, offset, offset + 16); + + return 0; +} + +SEC("syscall") +int skb_crypto_setup(void *ctx) +{ + struct bpf_crypto_params params = { + .type = "skcipher", + .key_len = key_len, + .authsize = authsize, + }; + struct bpf_crypto_ctx *cctx; + int err = 0; + + status = 0; + + if (key_len > 256) { + status = -EINVAL; + return 0; + } + + __builtin_memcpy(¶ms.algo, algo, sizeof(algo)); + __builtin_memcpy(¶ms.key, key, sizeof(key)); + cctx = bpf_crypto_ctx_create(¶ms, sizeof(params), &err); + + if (!cctx) { + status = err; + return 0; + } + 
+ err = crypto_ctx_insert(cctx); + if (err && err != -EEXIST) + status = err; + + return 0; +} + +SEC("tc") +int decrypt_sanity(struct __sk_buff *skb) +{ + struct __crypto_ctx_value *v; + struct bpf_crypto_ctx *ctx; + struct bpf_dynptr psrc, pdst, iv; + int err; + + err = skb_dynptr_validate(skb, &psrc); + if (err < 0) { + status = err; + return TC_ACT_SHOT; + } + + v = crypto_ctx_value_lookup(); + if (!v) { + status = -ENOENT; + return TC_ACT_SHOT; + } + + ctx = v->ctx; + if (!ctx) { + status = -ENOENT; + return TC_ACT_SHOT; + } + + /* dst is a global variable to make testing part easier to check. In real + * production code, a percpu map should be used to store the result. + */ + bpf_dynptr_from_mem(dst, sizeof(dst), 0, &pdst); + /* iv dynptr has to be initialized with 0 size, but proper memory region + * has to be provided anyway + */ + bpf_dynptr_from_mem(dst, 0, 0, &iv); + + status = bpf_crypto_decrypt(ctx, &psrc, &pdst, &iv); + + return TC_ACT_SHOT; +} + +SEC("tc") +int encrypt_sanity(struct __sk_buff *skb) +{ + struct __crypto_ctx_value *v; + struct bpf_crypto_ctx *ctx; + struct bpf_dynptr psrc, pdst, iv; + int err; + + status = 0; + + err = skb_dynptr_validate(skb, &psrc); + if (err < 0) { + status = err; + return TC_ACT_SHOT; + } + + v = crypto_ctx_value_lookup(); + if (!v) { + status = -ENOENT; + return TC_ACT_SHOT; + } + + ctx = v->ctx; + if (!ctx) { + status = -ENOENT; + return TC_ACT_SHOT; + } + + /* dst is a global variable to make testing part easier to check. In real + * production code, a percpu map should be used to store the result. 
+ */ + bpf_dynptr_from_mem(dst, sizeof(dst), 0, &pdst); + /* iv dynptr has to be initialized with 0 size, but proper memory region + * has to be provided anyway + */ + bpf_dynptr_from_mem(dst, 0, 0, &iv); + + status = bpf_crypto_encrypt(ctx, &psrc, &pdst, &iv); + + return TC_ACT_SHOT; +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index 7ce7e827d5..66a60bfb58 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -80,7 +80,7 @@ SEC("?raw_tp") __failure __msg("Unreleased reference id=2") int ringbuf_missing_release1(void *ctx) { - struct bpf_dynptr ptr; + struct bpf_dynptr ptr = {}; bpf_ringbuf_reserve_dynptr(&ringbuf, val, 0, &ptr); @@ -1385,7 +1385,7 @@ SEC("?raw_tp") __failure __msg("Expected an initialized dynptr as arg #1") int dynptr_adjust_invalid(void *ctx) { - struct bpf_dynptr ptr; + struct bpf_dynptr ptr = {}; /* this should fail */ bpf_dynptr_adjust(&ptr, 1, 2); @@ -1398,7 +1398,7 @@ SEC("?raw_tp") __failure __msg("Expected an initialized dynptr as arg #1") int dynptr_is_null_invalid(void *ctx) { - struct bpf_dynptr ptr; + struct bpf_dynptr ptr = {}; /* this should fail */ bpf_dynptr_is_null(&ptr); @@ -1411,7 +1411,7 @@ SEC("?raw_tp") __failure __msg("Expected an initialized dynptr as arg #1") int dynptr_is_rdonly_invalid(void *ctx) { - struct bpf_dynptr ptr; + struct bpf_dynptr ptr = {}; /* this should fail */ bpf_dynptr_is_rdonly(&ptr); @@ -1424,7 +1424,7 @@ SEC("?raw_tp") __failure __msg("Expected an initialized dynptr as arg #1") int dynptr_size_invalid(void *ctx) { - struct bpf_dynptr ptr; + struct bpf_dynptr ptr = {}; /* this should fail */ bpf_dynptr_size(&ptr); @@ -1437,7 +1437,7 @@ SEC("?raw_tp") __failure __msg("Expected an initialized dynptr as arg #1") int clone_invalid1(void *ctx) { - struct bpf_dynptr ptr1; + struct bpf_dynptr ptr1 = {}; struct bpf_dynptr ptr2; /* this 
should fail */ diff --git a/tools/testing/selftests/bpf/progs/fib_lookup.c b/tools/testing/selftests/bpf/progs/fib_lookup.c index c4514dd58c..7b5dd2214f 100644 --- a/tools/testing/selftests/bpf/progs/fib_lookup.c +++ b/tools/testing/selftests/bpf/progs/fib_lookup.c @@ -3,8 +3,8 @@ #include #include +#include #include -#include "bpf_tracing_net.h" struct bpf_fib_lookup fib_params = {}; int fib_lookup_ret = 0; diff --git a/tools/testing/selftests/bpf/progs/for_each_multi_maps.c b/tools/testing/selftests/bpf/progs/for_each_multi_maps.c new file mode 100644 index 0000000000..ff0bed7d44 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/for_each_multi_maps.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 3); + __type(key, __u32); + __type(value, __u64); +} arraymap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 5); + __type(key, __u32); + __type(value, __u64); +} hashmap SEC(".maps"); + +struct callback_ctx { + int output; +}; + +u32 data_output = 0; +int use_array = 0; + +static __u64 +check_map_elem(struct bpf_map *map, __u32 *key, __u64 *val, + struct callback_ctx *data) +{ + data->output += *val; + return 0; +} + +SEC("tc") +int test_pkt_access(struct __sk_buff *skb) +{ + struct callback_ctx data; + + data.output = 0; + if (use_array) + bpf_for_each_map_elem(&arraymap, check_map_elem, &data, 0); + else + bpf_for_each_map_elem(&hashmap, check_map_elem, &data, 0); + data_output = data.output; + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/getpeername4_prog.c b/tools/testing/selftests/bpf/progs/getpeername4_prog.c new file mode 100644 index 0000000000..4c97208cd2 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/getpeername4_prog.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC */ + +#include "vmlinux.h" + +#include 
+#include +#include +#include +#include "bpf_kfuncs.h" + +#define REWRITE_ADDRESS_IP4 0xc0a801fe // 192.168.1.254 +#define REWRITE_ADDRESS_PORT4 4040 + +SEC("cgroup/getpeername4") +int getpeername_v4_prog(struct bpf_sock_addr *ctx) +{ + ctx->user_ip4 = bpf_htonl(REWRITE_ADDRESS_IP4); + ctx->user_port = bpf_htons(REWRITE_ADDRESS_PORT4); + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/getpeername6_prog.c b/tools/testing/selftests/bpf/progs/getpeername6_prog.c new file mode 100644 index 0000000000..070e4d7f63 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/getpeername6_prog.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC */ + +#include "vmlinux.h" + +#include +#include +#include +#include +#include "bpf_kfuncs.h" + +#define REWRITE_ADDRESS_IP6_0 0xfaceb00c +#define REWRITE_ADDRESS_IP6_1 0x12345678 +#define REWRITE_ADDRESS_IP6_2 0x00000000 +#define REWRITE_ADDRESS_IP6_3 0x0000abcd + +#define REWRITE_ADDRESS_PORT6 6060 + +SEC("cgroup/getpeername6") +int getpeername_v6_prog(struct bpf_sock_addr *ctx) +{ + ctx->user_ip6[0] = bpf_htonl(REWRITE_ADDRESS_IP6_0); + ctx->user_ip6[1] = bpf_htonl(REWRITE_ADDRESS_IP6_1); + ctx->user_ip6[2] = bpf_htonl(REWRITE_ADDRESS_IP6_2); + ctx->user_ip6[3] = bpf_htonl(REWRITE_ADDRESS_IP6_3); + ctx->user_port = bpf_htons(REWRITE_ADDRESS_PORT6); + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/getsockname4_prog.c b/tools/testing/selftests/bpf/progs/getsockname4_prog.c new file mode 100644 index 0000000000..e298487c63 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/getsockname4_prog.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC */ + +#include "vmlinux.h" + +#include +#include +#include +#include +#include "bpf_kfuncs.h" + +#define REWRITE_ADDRESS_IP4 0xc0a801fe // 192.168.1.254 +#define REWRITE_ADDRESS_PORT4 4040 + 
+SEC("cgroup/getsockname4") +int getsockname_v4_prog(struct bpf_sock_addr *ctx) +{ + ctx->user_ip4 = bpf_htonl(REWRITE_ADDRESS_IP4); + ctx->user_port = bpf_htons(REWRITE_ADDRESS_PORT4); + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/getsockname6_prog.c b/tools/testing/selftests/bpf/progs/getsockname6_prog.c new file mode 100644 index 0000000000..811d10cd55 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/getsockname6_prog.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC */ + +#include "vmlinux.h" + +#include +#include +#include +#include +#include "bpf_kfuncs.h" + +#define REWRITE_ADDRESS_IP6_0 0xfaceb00c +#define REWRITE_ADDRESS_IP6_1 0x12345678 +#define REWRITE_ADDRESS_IP6_2 0x00000000 +#define REWRITE_ADDRESS_IP6_3 0x0000abcd + +#define REWRITE_ADDRESS_PORT6 6060 + +SEC("cgroup/getsockname6") +int getsockname_v6_prog(struct bpf_sock_addr *ctx) +{ + ctx->user_ip6[0] = bpf_htonl(REWRITE_ADDRESS_IP6_0); + ctx->user_ip6[1] = bpf_htonl(REWRITE_ADDRESS_IP6_1); + ctx->user_ip6[2] = bpf_htonl(REWRITE_ADDRESS_IP6_2); + ctx->user_ip6[3] = bpf_htonl(REWRITE_ADDRESS_IP6_3); + ctx->user_port = bpf_htons(REWRITE_ADDRESS_PORT6); + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c index 3db416606f..fe65e0952a 100644 --- a/tools/testing/selftests/bpf/progs/iters.c +++ b/tools/testing/selftests/bpf/progs/iters.c @@ -673,7 +673,7 @@ static __noinline void fill(struct bpf_iter_num *it, int *arr, __u32 n, int mul) static __noinline int sum(struct bpf_iter_num *it, int *arr, __u32 n) { - int *t, i, sum = 0;; + int *t, i, sum = 0; while ((t = bpf_iter_num_next(it))) { i = *t; diff --git a/tools/testing/selftests/bpf/progs/jeq_infer_not_null_fail.c b/tools/testing/selftests/bpf/progs/jeq_infer_not_null_fail.c index f46965053a..4d619bea9c 100644 --- 
a/tools/testing/selftests/bpf/progs/jeq_infer_not_null_fail.c +++ b/tools/testing/selftests/bpf/progs/jeq_infer_not_null_fail.c @@ -4,6 +4,10 @@ #include #include "bpf_misc.h" +#ifndef __clang__ +#pragma GCC diagnostic ignored "-Warray-bounds" +#endif + char _license[] SEC("license") = "GPL"; struct { diff --git a/tools/testing/selftests/bpf/progs/kprobe_multi_session.c b/tools/testing/selftests/bpf/progs/kprobe_multi_session.c new file mode 100644 index 0000000000..bbba9eb465 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/kprobe_multi_session.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include "bpf_kfuncs.h" + +#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof((x)[0])) + +char _license[] SEC("license") = "GPL"; + +extern const void bpf_fentry_test1 __ksym; +extern const void bpf_fentry_test2 __ksym; +extern const void bpf_fentry_test3 __ksym; +extern const void bpf_fentry_test4 __ksym; +extern const void bpf_fentry_test5 __ksym; +extern const void bpf_fentry_test6 __ksym; +extern const void bpf_fentry_test7 __ksym; +extern const void bpf_fentry_test8 __ksym; + +int pid = 0; + +__u64 kprobe_session_result[8]; + +static int session_check(void *ctx) +{ + unsigned int i; + __u64 addr; + const void *kfuncs[] = { + &bpf_fentry_test1, + &bpf_fentry_test2, + &bpf_fentry_test3, + &bpf_fentry_test4, + &bpf_fentry_test5, + &bpf_fentry_test6, + &bpf_fentry_test7, + &bpf_fentry_test8, + }; + + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 1; + + addr = bpf_get_func_ip(ctx); + + for (i = 0; i < ARRAY_SIZE(kfuncs); i++) { + if (kfuncs[i] == (void *) addr) { + kprobe_session_result[i]++; + break; + } + } + + /* + * Force probes for function bpf_fentry_test[5-8] not to + * install and execute the return probe + */ + if (((const void *) addr == &bpf_fentry_test5) || + ((const void *) addr == &bpf_fentry_test6) || + ((const void *) addr == &bpf_fentry_test7) || + ((const void *) addr == &bpf_fentry_test8)) + return 1; 
+ + return 0; +} + +/* + * No tests in here, just to trigger 'bpf_fentry_test*' + * through tracing test_run + */ +SEC("fentry/bpf_modify_return_test") +int BPF_PROG(trigger) +{ + return 0; +} + +SEC("kprobe.session/bpf_fentry_test*") +int test_kprobe(struct pt_regs *ctx) +{ + return session_check(ctx); +} diff --git a/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c b/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c new file mode 100644 index 0000000000..0835b5edf6 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include "bpf_kfuncs.h" + +char _license[] SEC("license") = "GPL"; + +int pid = 0; + +__u64 test_kprobe_1_result = 0; +__u64 test_kprobe_2_result = 0; +__u64 test_kprobe_3_result = 0; + +/* + * No tests in here, just to trigger 'bpf_fentry_test*' + * through tracing test_run + */ +SEC("fentry/bpf_modify_return_test") +int BPF_PROG(trigger) +{ + return 0; +} + +static int check_cookie(__u64 val, __u64 *result) +{ + __u64 *cookie; + + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 1; + + cookie = bpf_session_cookie(); + + if (bpf_session_is_return()) + *result = *cookie == val ? 
val : 0; + else + *cookie = val; + return 0; +} + +SEC("kprobe.session/bpf_fentry_test1") +int test_kprobe_1(struct pt_regs *ctx) +{ + return check_cookie(1, &test_kprobe_1_result); +} + +SEC("kprobe.session/bpf_fentry_test1") +int test_kprobe_2(struct pt_regs *ctx) +{ + return check_cookie(2, &test_kprobe_2_result); +} + +SEC("kprobe.session/bpf_fentry_test1") +int test_kprobe_3(struct pt_regs *ctx) +{ + return check_cookie(3, &test_kprobe_3_result); +} diff --git a/tools/testing/selftests/bpf/progs/map_kptr.c b/tools/testing/selftests/bpf/progs/map_kptr.c index da30f0d593..ab0ce1d01a 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr.c +++ b/tools/testing/selftests/bpf/progs/map_kptr.c @@ -110,10 +110,14 @@ DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_map, array_of_array_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, hash_map, array_of_hash_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, hash_malloc_map, array_of_hash_malloc_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, lru_hash_map, array_of_lru_hash_maps); +DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, pcpu_array_map, array_of_pcpu_array_maps); +DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, pcpu_hash_map, array_of_pcpu_hash_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, array_map, hash_of_array_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, hash_map, hash_of_hash_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, hash_malloc_map, hash_of_hash_malloc_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, lru_hash_map, hash_of_lru_hash_maps); +DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, pcpu_array_map, hash_of_pcpu_array_maps); +DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, pcpu_hash_map, hash_of_pcpu_hash_maps); #define WRITE_ONCE(x, val) ((*(volatile typeof(x) *) &(x)) = (val)) @@ -204,6 +208,8 @@ int test_map_kptr(struct __sk_buff *ctx) TEST(hash_map); TEST(hash_malloc_map); TEST(lru_hash_map); + TEST(pcpu_array_map); + TEST(pcpu_hash_map); #undef TEST return 0; @@ -281,10 
+287,14 @@ int test_map_in_map_kptr(struct __sk_buff *ctx) TEST(array_of_hash_maps); TEST(array_of_hash_malloc_maps); TEST(array_of_lru_hash_maps); + TEST(array_of_pcpu_array_maps); + TEST(array_of_pcpu_hash_maps); TEST(hash_of_array_maps); TEST(hash_of_hash_maps); TEST(hash_of_hash_malloc_maps); TEST(hash_of_lru_hash_maps); + TEST(hash_of_pcpu_array_maps); + TEST(hash_of_pcpu_hash_maps); #undef TEST return 0; diff --git a/tools/testing/selftests/bpf/progs/mptcp_sock.c b/tools/testing/selftests/bpf/progs/mptcp_sock.c index 91a0d7eff2..f3acb90588 100644 --- a/tools/testing/selftests/bpf/progs/mptcp_sock.c +++ b/tools/testing/selftests/bpf/progs/mptcp_sock.c @@ -2,9 +2,9 @@ /* Copyright (c) 2020, Tessares SA. */ /* Copyright (c) 2022, SUSE. */ -#include +#include "bpf_tracing_net.h" #include -#include "bpf_tcp_helpers.h" +#include char _license[] SEC("license") = "GPL"; __u32 token = 0; diff --git a/tools/testing/selftests/bpf/progs/mptcpify.c b/tools/testing/selftests/bpf/progs/mptcpify.c index 53301ae8a8..cbdc730c3a 100644 --- a/tools/testing/selftests/bpf/progs/mptcpify.c +++ b/tools/testing/selftests/bpf/progs/mptcpify.c @@ -6,10 +6,14 @@ #include "bpf_tracing_net.h" char _license[] SEC("license") = "GPL"; +int pid; SEC("fmod_ret/update_socket_protocol") int BPF_PROG(mptcpify, int family, int type, int protocol) { + if (bpf_get_current_pid_tgid() >> 32 != pid) + return protocol; + if ((family == AF_INET || family == AF_INET6) && type == SOCK_STREAM && (!protocol || protocol == IPPROTO_TCP)) { diff --git a/tools/testing/selftests/bpf/progs/preempt_lock.c b/tools/testing/selftests/bpf/progs/preempt_lock.c new file mode 100644 index 0000000000..672fc368d9 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/preempt_lock.c @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include "bpf_misc.h" +#include "bpf_experimental.h" + +SEC("?tc") +__failure __msg("1 bpf_preempt_enable is missing") +int preempt_lock_missing_1(struct 
__sk_buff *ctx) +{ + bpf_preempt_disable(); + return 0; +} + +SEC("?tc") +__failure __msg("2 bpf_preempt_enable(s) are missing") +int preempt_lock_missing_2(struct __sk_buff *ctx) +{ + bpf_preempt_disable(); + bpf_preempt_disable(); + return 0; +} + +SEC("?tc") +__failure __msg("3 bpf_preempt_enable(s) are missing") +int preempt_lock_missing_3(struct __sk_buff *ctx) +{ + bpf_preempt_disable(); + bpf_preempt_disable(); + bpf_preempt_disable(); + return 0; +} + +SEC("?tc") +__failure __msg("1 bpf_preempt_enable is missing") +int preempt_lock_missing_3_minus_2(struct __sk_buff *ctx) +{ + bpf_preempt_disable(); + bpf_preempt_disable(); + bpf_preempt_disable(); + bpf_preempt_enable(); + bpf_preempt_enable(); + return 0; +} + +static __noinline void preempt_disable(void) +{ + bpf_preempt_disable(); +} + +static __noinline void preempt_enable(void) +{ + bpf_preempt_enable(); +} + +SEC("?tc") +__failure __msg("1 bpf_preempt_enable is missing") +int preempt_lock_missing_1_subprog(struct __sk_buff *ctx) +{ + preempt_disable(); + return 0; +} + +SEC("?tc") +__failure __msg("2 bpf_preempt_enable(s) are missing") +int preempt_lock_missing_2_subprog(struct __sk_buff *ctx) +{ + preempt_disable(); + preempt_disable(); + return 0; +} + +SEC("?tc") +__failure __msg("1 bpf_preempt_enable is missing") +int preempt_lock_missing_2_minus_1_subprog(struct __sk_buff *ctx) +{ + preempt_disable(); + preempt_disable(); + preempt_enable(); + return 0; +} + +static __noinline void preempt_balance_subprog(void) +{ + preempt_disable(); + preempt_enable(); +} + +SEC("?tc") +__success int preempt_balance(struct __sk_buff *ctx) +{ + bpf_guard_preempt(); + return 0; +} + +SEC("?tc") +__success int preempt_balance_subprog_test(struct __sk_buff *ctx) +{ + preempt_balance_subprog(); + return 0; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__failure __msg("sleepable helper bpf_copy_from_user#") +int preempt_sleepable_helper(void *ctx) +{ + u32 data; + + bpf_preempt_disable(); + 
bpf_copy_from_user(&data, sizeof(data), NULL); + bpf_preempt_enable(); + return 0; +} + +int __noinline preempt_global_subprog(void) +{ + preempt_balance_subprog(); + return 0; +} + +SEC("?tc") +__failure __msg("global function calls are not allowed with preemption disabled") +int preempt_global_subprog_test(struct __sk_buff *ctx) +{ + preempt_disable(); + preempt_global_subprog(); + preempt_enable(); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/sample_map_ret0.c b/tools/testing/selftests/bpf/progs/sample_map_ret0.c deleted file mode 100644 index 495990d355..0000000000 --- a/tools/testing/selftests/bpf/progs/sample_map_ret0.c +++ /dev/null @@ -1,34 +0,0 @@ -/* SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) */ -#include -#include - -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __type(key, __u32); - __type(value, long); - __uint(max_entries, 2); -} htab SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __type(key, __u32); - __type(value, long); - __uint(max_entries, 2); -} array SEC(".maps"); - -/* Sample program which should always load for testing control paths. */ -SEC(".text") int func() -{ - __u64 key64 = 0; - __u32 key = 0; - long *value; - - value = bpf_map_lookup_elem(&htab, &key); - if (!value) - return 1; - value = bpf_map_lookup_elem(&array, &key64); - if (!value) - return 1; - - return 0; -} diff --git a/tools/testing/selftests/bpf/progs/sample_ret0.c b/tools/testing/selftests/bpf/progs/sample_ret0.c deleted file mode 100644 index fec99750d6..0000000000 --- a/tools/testing/selftests/bpf/progs/sample_ret0.c +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) */ - -/* Sample program which should always load for testing control paths. 
*/ -int func() -{ - return 0; -} diff --git a/tools/testing/selftests/bpf/progs/sendmsg4_prog.c b/tools/testing/selftests/bpf/progs/sendmsg4_prog.c index 351e79aef2..edc159598a 100644 --- a/tools/testing/selftests/bpf/progs/sendmsg4_prog.c +++ b/tools/testing/selftests/bpf/progs/sendmsg4_prog.c @@ -49,4 +49,10 @@ int sendmsg_v4_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/sendmsg4") +int sendmsg_v4_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/sendmsg6_prog.c b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c index bf9b46b806..36a7f96079 100644 --- a/tools/testing/selftests/bpf/progs/sendmsg6_prog.c +++ b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c @@ -20,6 +20,11 @@ #define DST_REWRITE_IP6_2 0 #define DST_REWRITE_IP6_3 1 +#define DST_REWRITE_IP6_V4_MAPPED_0 0 +#define DST_REWRITE_IP6_V4_MAPPED_1 0 +#define DST_REWRITE_IP6_V4_MAPPED_2 0x0000FFFF +#define DST_REWRITE_IP6_V4_MAPPED_3 0xc0a80004 // 192.168.0.4 + #define DST_REWRITE_PORT6 6666 SEC("cgroup/sendmsg6") @@ -59,4 +64,56 @@ int sendmsg_v6_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/sendmsg6") +int sendmsg_v6_v4mapped_prog(struct bpf_sock_addr *ctx) +{ + /* Rewrite source. */ + ctx->msg_src_ip6[0] = bpf_htonl(SRC_REWRITE_IP6_0); + ctx->msg_src_ip6[1] = bpf_htonl(SRC_REWRITE_IP6_1); + ctx->msg_src_ip6[2] = bpf_htonl(SRC_REWRITE_IP6_2); + ctx->msg_src_ip6[3] = bpf_htonl(SRC_REWRITE_IP6_3); + + /* Rewrite destination. */ + ctx->user_ip6[0] = bpf_htonl(DST_REWRITE_IP6_V4_MAPPED_0); + ctx->user_ip6[1] = bpf_htonl(DST_REWRITE_IP6_V4_MAPPED_1); + ctx->user_ip6[2] = bpf_htonl(DST_REWRITE_IP6_V4_MAPPED_2); + ctx->user_ip6[3] = bpf_htonl(DST_REWRITE_IP6_V4_MAPPED_3); + + ctx->user_port = bpf_htons(DST_REWRITE_PORT6); + + return 1; +} + +SEC("cgroup/sendmsg6") +int sendmsg_v6_wildcard_prog(struct bpf_sock_addr *ctx) +{ + /* Rewrite source. 
*/ + ctx->msg_src_ip6[0] = bpf_htonl(SRC_REWRITE_IP6_0); + ctx->msg_src_ip6[1] = bpf_htonl(SRC_REWRITE_IP6_1); + ctx->msg_src_ip6[2] = bpf_htonl(SRC_REWRITE_IP6_2); + ctx->msg_src_ip6[3] = bpf_htonl(SRC_REWRITE_IP6_3); + + /* Rewrite destination. */ + ctx->user_ip6[0] = bpf_htonl(0); + ctx->user_ip6[1] = bpf_htonl(0); + ctx->user_ip6[2] = bpf_htonl(0); + ctx->user_ip6[3] = bpf_htonl(0); + + ctx->user_port = bpf_htons(DST_REWRITE_PORT6); + + return 1; +} + +SEC("cgroup/sendmsg6") +int sendmsg_v6_preserve_dst_prog(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/sendmsg6") +int sendmsg_v6_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/sendmsg_unix_prog.c b/tools/testing/selftests/bpf/progs/sendmsg_unix_prog.c index d8869b03dd..332d0eb111 100644 --- a/tools/testing/selftests/bpf/progs/sendmsg_unix_prog.c +++ b/tools/testing/selftests/bpf/progs/sendmsg_unix_prog.c @@ -36,4 +36,10 @@ int sendmsg_unix_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/sendmsg_unix") +int sendmsg_unix_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/skb_pkt_end.c b/tools/testing/selftests/bpf/progs/skb_pkt_end.c index 992b786100..db4abd2682 100644 --- a/tools/testing/selftests/bpf/progs/skb_pkt_end.c +++ b/tools/testing/selftests/bpf/progs/skb_pkt_end.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#ifndef BPF_NO_PRESERVE_ACCESS_INDEX #define BPF_NO_PRESERVE_ACCESS_INDEX +#endif #include #include #include diff --git a/tools/testing/selftests/bpf/progs/sock_addr_kern.c b/tools/testing/selftests/bpf/progs/sock_addr_kern.c new file mode 100644 index 0000000000..8386bb15cc --- /dev/null +++ b/tools/testing/selftests/bpf/progs/sock_addr_kern.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC */ +#include +#include +#include 
"../bpf_testmod/bpf_testmod_kfunc.h" + +SEC("syscall") +int init_sock(struct init_sock_args *args) +{ + bpf_kfunc_init_sock(args); + + return 0; +} + +SEC("syscall") +int close_sock(void *ctx) +{ + bpf_kfunc_close_sock(); + + return 0; +} + +SEC("syscall") +int kernel_connect(struct addr_args *args) +{ + return bpf_kfunc_call_kernel_connect(args); +} + +SEC("syscall") +int kernel_bind(struct addr_args *args) +{ + return bpf_kfunc_call_kernel_bind(args); +} + +SEC("syscall") +int kernel_listen(struct addr_args *args) +{ + return bpf_kfunc_call_kernel_listen(); +} + +SEC("syscall") +int kernel_sendmsg(struct sendmsg_args *args) +{ + return bpf_kfunc_call_kernel_sendmsg(args); +} + +SEC("syscall") +int sock_sendmsg(struct sendmsg_args *args) +{ + return bpf_kfunc_call_sock_sendmsg(args); +} + +SEC("syscall") +int kernel_getsockname(struct addr_args *args) +{ + return bpf_kfunc_call_kernel_getsockname(args); +} + +SEC("syscall") +int kernel_getpeername(struct addr_args *args) +{ + return bpf_kfunc_call_kernel_getpeername(args); +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c b/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c index 83753b00a5..5c3614333b 100644 --- a/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c +++ b/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c @@ -1,24 +1,20 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021 Facebook */ -#include -#include -#include -#include -#include -#include "bpf_tcp_helpers.h" +#include "bpf_tracing_net.h" char _license[] SEC("license") = "GPL"; __s32 page_size = 0; +const char cc_reno[TCP_CA_NAME_MAX] = "reno"; +const char cc_cubic[TCP_CA_NAME_MAX] = "cubic"; + SEC("cgroup/setsockopt") int sockopt_qos_to_cc(struct bpf_sockopt *ctx) { void *optval_end = ctx->optval_end; int *optval = ctx->optval; char buf[TCP_CA_NAME_MAX]; - char cc_reno[TCP_CA_NAME_MAX] = "reno"; - char cc_cubic[TCP_CA_NAME_MAX] = "cubic"; if (ctx->level != SOL_IPV6 
|| ctx->optname != IPV6_TCLASS) goto out; @@ -29,11 +25,11 @@ int sockopt_qos_to_cc(struct bpf_sockopt *ctx) if (bpf_getsockopt(ctx->sk, SOL_TCP, TCP_CONGESTION, &buf, sizeof(buf))) return 0; - if (!tcp_cc_eq(buf, cc_cubic)) + if (bpf_strncmp(buf, sizeof(buf), cc_cubic)) return 0; if (*optval == 0x2d) { - if (bpf_setsockopt(ctx->sk, SOL_TCP, TCP_CONGESTION, &cc_reno, + if (bpf_setsockopt(ctx->sk, SOL_TCP, TCP_CONGESTION, (void *)&cc_reno, sizeof(cc_reno))) return 0; } diff --git a/tools/testing/selftests/bpf/progs/struct_ops_forgotten_cb.c b/tools/testing/selftests/bpf/progs/struct_ops_forgotten_cb.c new file mode 100644 index 0000000000..3c822103bd --- /dev/null +++ b/tools/testing/selftests/bpf/progs/struct_ops_forgotten_cb.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include "../bpf_testmod/bpf_testmod.h" + +char _license[] SEC("license") = "GPL"; + +SEC("struct_ops/test_1") +int BPF_PROG(test_1_forgotten) +{ + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_ops ops = { + /* we forgot to reference test_1_forgotten above, oops */ +}; + diff --git a/tools/testing/selftests/bpf/progs/struct_ops_module.c b/tools/testing/selftests/bpf/progs/struct_ops_module.c index 026cabfa7f..4c56d4a9d9 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_module.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_module.c @@ -23,7 +23,7 @@ void BPF_PROG(test_2, int a, int b) test_2_result = a + b; } -SEC("struct_ops/test_3") +SEC("?struct_ops/test_3") int BPF_PROG(test_3, int a, int b) { test_2_result = a + b + 3; @@ -54,3 +54,37 @@ struct bpf_testmod_ops___v2 testmod_2 = { .test_1 = (void *)test_1, .test_2 = (void *)test_2_v2, }; + +struct bpf_testmod_ops___zeroed { + int (*test_1)(void); + void (*test_2)(int a, int b); + int (*test_maybe_null)(int dummy, struct task_struct *task); + void (*zeroed_op)(int a, int b); + int zeroed; +}; + +SEC("struct_ops/test_3") +int 
BPF_PROG(zeroed_op) +{ + return 1; +} + +SEC(".struct_ops.link") +struct bpf_testmod_ops___zeroed testmod_zeroed = { + .test_1 = (void *)test_1, + .test_2 = (void *)test_2_v2, + .zeroed_op = (void *)zeroed_op, +}; + +struct bpf_testmod_ops___incompatible { + int (*test_1)(void); + void (*test_2)(int *a); + int data; +}; + +SEC(".struct_ops.link") +struct bpf_testmod_ops___incompatible testmod_incompatible = { + .test_1 = (void *)test_1, + .test_2 = (void *)test_2, + .data = 3, +}; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_nulled_out_cb.c b/tools/testing/selftests/bpf/progs/struct_ops_nulled_out_cb.c new file mode 100644 index 0000000000..fa20213884 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/struct_ops_nulled_out_cb.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include "../bpf_testmod/bpf_testmod.h" + +char _license[] SEC("license") = "GPL"; + +int rand; +int arr[1]; + +SEC("struct_ops/test_1") +int BPF_PROG(test_1_turn_off) +{ + return arr[rand]; /* potentially way out of range access */ +} + +SEC(".struct_ops.link") +struct bpf_testmod_ops ops = { + .test_1 = (void *)test_1_turn_off, +}; + diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_common.h b/tools/testing/selftests/bpf/progs/task_kfunc_common.h index 41f2d44f49..6720c4b5be 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_common.h +++ b/tools/testing/selftests/bpf/progs/task_kfunc_common.h @@ -13,7 +13,7 @@ struct __tasks_kfunc_map_value { struct task_struct __kptr * task; }; -struct hash_map { +struct { __uint(type, BPF_MAP_TYPE_HASH); __type(key, int); __type(value, struct __tasks_kfunc_map_value); diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c b/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c index 7bb872fb22..0016c90e9c 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c +++ 
b/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c @@ -1,24 +1,18 @@ // SPDX-License-Identifier: GPL-2.0 -#include "vmlinux.h" - +#include "bpf_tracing_net.h" #include #include char _license[] SEC("license") = "GPL"; -static inline struct tcp_sock *tcp_sk(const struct sock *sk) -{ - return (struct tcp_sock *)sk; -} - -SEC("struct_ops/incompl_cong_ops_ssthresh") +SEC("struct_ops") __u32 BPF_PROG(incompl_cong_ops_ssthresh, struct sock *sk) { return tcp_sk(sk)->snd_ssthresh; } -SEC("struct_ops/incompl_cong_ops_undo_cwnd") +SEC("struct_ops") __u32 BPF_PROG(incompl_cong_ops_undo_cwnd, struct sock *sk) { return tcp_sk(sk)->snd_cwnd; diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c new file mode 100644 index 0000000000..f95862f570 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Facebook */ + +#include "vmlinux.h" +#include + +extern void bbr_init(struct sock *sk) __ksym; +extern void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) __ksym; +extern u32 bbr_sndbuf_expand(struct sock *sk) __ksym; +extern u32 bbr_undo_cwnd(struct sock *sk) __ksym; +extern void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym; +extern u32 bbr_ssthresh(struct sock *sk) __ksym; +extern u32 bbr_min_tso_segs(struct sock *sk) __ksym; +extern void bbr_set_state(struct sock *sk, u8 new_state) __ksym; + +extern void dctcp_init(struct sock *sk) __ksym; +extern void dctcp_update_alpha(struct sock *sk, u32 flags) __ksym; +extern void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) __ksym; +extern u32 dctcp_ssthresh(struct sock *sk) __ksym; +extern u32 dctcp_cwnd_undo(struct sock *sk) __ksym; +extern void dctcp_state(struct sock *sk, u8 new_state) __ksym; + +extern void cubictcp_init(struct sock *sk) __ksym; +extern u32 cubictcp_recalc_ssthresh(struct sock *sk) __ksym; +extern void 
cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) __ksym; +extern void cubictcp_state(struct sock *sk, u8 new_state) __ksym; +extern void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym; +extern void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) __ksym; + +SEC("struct_ops") +void BPF_PROG(init, struct sock *sk) +{ + bbr_init(sk); + dctcp_init(sk); + cubictcp_init(sk); +} + +SEC("struct_ops") +void BPF_PROG(in_ack_event, struct sock *sk, u32 flags) +{ + dctcp_update_alpha(sk, flags); +} + +SEC("struct_ops") +void BPF_PROG(cong_control, struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) +{ + bbr_main(sk, ack, flag, rs); +} + +SEC("struct_ops") +void BPF_PROG(cong_avoid, struct sock *sk, u32 ack, u32 acked) +{ + cubictcp_cong_avoid(sk, ack, acked); +} + +SEC("struct_ops") +u32 BPF_PROG(sndbuf_expand, struct sock *sk) +{ + return bbr_sndbuf_expand(sk); +} + +SEC("struct_ops") +u32 BPF_PROG(undo_cwnd, struct sock *sk) +{ + bbr_undo_cwnd(sk); + return dctcp_cwnd_undo(sk); +} + +SEC("struct_ops") +void BPF_PROG(cwnd_event, struct sock *sk, enum tcp_ca_event event) +{ + bbr_cwnd_event(sk, event); + dctcp_cwnd_event(sk, event); + cubictcp_cwnd_event(sk, event); +} + +SEC("struct_ops") +u32 BPF_PROG(ssthresh, struct sock *sk) +{ + bbr_ssthresh(sk); + dctcp_ssthresh(sk); + return cubictcp_recalc_ssthresh(sk); +} + +SEC("struct_ops") +u32 BPF_PROG(min_tso_segs, struct sock *sk) +{ + return bbr_min_tso_segs(sk); +} + +SEC("struct_ops") +void BPF_PROG(set_state, struct sock *sk, u8 new_state) +{ + bbr_set_state(sk, new_state); + dctcp_state(sk, new_state); + cubictcp_state(sk, new_state); +} + +SEC("struct_ops") +void BPF_PROG(pkts_acked, struct sock *sk, const struct ack_sample *sample) +{ + cubictcp_acked(sk, sample); +} + +SEC(".struct_ops") +struct tcp_congestion_ops tcp_ca_kfunc = { + .init = (void *)init, + .in_ack_event = (void *)in_ack_event, + .cong_control = (void *)cong_control, + .cong_avoid = (void 
*)cong_avoid, + .sndbuf_expand = (void *)sndbuf_expand, + .undo_cwnd = (void *)undo_cwnd, + .cwnd_event = (void *)cwnd_event, + .ssthresh = (void *)ssthresh, + .min_tso_segs = (void *)min_tso_segs, + .set_state = (void *)set_state, + .pkts_acked = (void *)pkts_acked, + .name = "tcp_ca_kfunc", +}; + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c b/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c index c06f4a41c2..54f916a931 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c +++ b/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c @@ -7,7 +7,7 @@ char _license[] SEC("license") = "GPL"; -SEC("struct_ops/unsupp_cong_op_get_info") +SEC("struct_ops") size_t BPF_PROG(unsupp_cong_op_get_info, struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info) { diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_update.c b/tools/testing/selftests/bpf/progs/tcp_ca_update.c index b93a0ed330..e4bd82bc0d 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_update.c +++ b/tools/testing/selftests/bpf/progs/tcp_ca_update.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include "vmlinux.h" - +#include "bpf_tracing_net.h" #include #include @@ -10,36 +9,31 @@ char _license[] SEC("license") = "GPL"; int ca1_cnt = 0; int ca2_cnt = 0; -static inline struct tcp_sock *tcp_sk(const struct sock *sk) -{ - return (struct tcp_sock *)sk; -} - -SEC("struct_ops/ca_update_1_init") +SEC("struct_ops") void BPF_PROG(ca_update_1_init, struct sock *sk) { ca1_cnt++; } -SEC("struct_ops/ca_update_2_init") +SEC("struct_ops") void BPF_PROG(ca_update_2_init, struct sock *sk) { ca2_cnt++; } -SEC("struct_ops/ca_update_cong_control") +SEC("struct_ops") void BPF_PROG(ca_update_cong_control, struct sock *sk, const struct rate_sample *rs) { } -SEC("struct_ops/ca_update_ssthresh") +SEC("struct_ops") __u32 BPF_PROG(ca_update_ssthresh, struct sock *sk) { return tcp_sk(sk)->snd_ssthresh; } 
-SEC("struct_ops/ca_update_undo_cwnd") +SEC("struct_ops") __u32 BPF_PROG(ca_update_undo_cwnd, struct sock *sk) { return tcp_sk(sk)->snd_cwnd; diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c b/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c index 0724a79cec..a58b5194fc 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c +++ b/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include "vmlinux.h" - +#include "bpf_tracing_net.h" #include #include @@ -11,22 +10,17 @@ char _license[] SEC("license") = "GPL"; #define min(a, b) ((a) < (b) ? (a) : (b)) -static inline struct tcp_sock *tcp_sk(const struct sock *sk) -{ - return (struct tcp_sock *)sk; -} - -static inline unsigned int tcp_left_out(const struct tcp_sock *tp) +static unsigned int tcp_left_out(const struct tcp_sock *tp) { return tp->sacked_out + tp->lost_out; } -static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) +static unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) { return tp->packets_out - tcp_left_out(tp) + tp->retrans_out; } -SEC("struct_ops/write_sk_pacing_init") +SEC("struct_ops") void BPF_PROG(write_sk_pacing_init, struct sock *sk) { #ifdef ENABLE_ATOMICS_TESTS @@ -37,7 +31,7 @@ void BPF_PROG(write_sk_pacing_init, struct sock *sk) #endif } -SEC("struct_ops/write_sk_pacing_cong_control") +SEC("struct_ops") void BPF_PROG(write_sk_pacing_cong_control, struct sock *sk, const struct rate_sample *rs) { @@ -49,13 +43,13 @@ void BPF_PROG(write_sk_pacing_cong_control, struct sock *sk, tp->app_limited = (tp->delivered + tcp_packets_in_flight(tp)) ?: 1; } -SEC("struct_ops/write_sk_pacing_ssthresh") +SEC("struct_ops") __u32 BPF_PROG(write_sk_pacing_ssthresh, struct sock *sk) { return tcp_sk(sk)->snd_ssthresh; } -SEC("struct_ops/write_sk_pacing_undo_cwnd") +SEC("struct_ops") __u32 BPF_PROG(write_sk_pacing_undo_cwnd, struct sock *sk) { return tcp_sk(sk)->snd_cwnd; 
diff --git a/tools/testing/selftests/bpf/progs/tcp_rtt.c b/tools/testing/selftests/bpf/progs/tcp_rtt.c index 0988d79f15..42c729f855 100644 --- a/tools/testing/selftests/bpf/progs/tcp_rtt.c +++ b/tools/testing/selftests/bpf/progs/tcp_rtt.c @@ -10,6 +10,9 @@ struct tcp_rtt_storage { __u32 delivered; __u32 delivered_ce; __u32 icsk_retransmits; + + __u32 mrtt_us; /* args[0] */ + __u32 srtt; /* args[1] */ }; struct { @@ -55,5 +58,8 @@ int _sockops(struct bpf_sock_ops *ctx) storage->delivered_ce = tcp_sk->delivered_ce; storage->icsk_retransmits = tcp_sk->icsk_retransmits; + storage->mrtt_us = ctx->args[0]; + storage->srtt = ctx->args[1]; + return 1; } diff --git a/tools/testing/selftests/bpf/progs/test_access_variable_array.c b/tools/testing/selftests/bpf/progs/test_access_variable_array.c index 808c49b798..326b7d1f49 100644 --- a/tools/testing/selftests/bpf/progs/test_access_variable_array.c +++ b/tools/testing/selftests/bpf/progs/test_access_variable_array.c @@ -7,7 +7,7 @@ unsigned long span = 0; -SEC("fentry/load_balance") +SEC("fentry/sched_balance_rq") int BPF_PROG(fentry_fentry, int this_cpu, struct rq *this_rq, struct sched_domain *sd) { diff --git a/tools/testing/selftests/bpf/progs/test_bpf_cookie.c b/tools/testing/selftests/bpf/progs/test_bpf_cookie.c index 5a3a80f751..c83142b55f 100644 --- a/tools/testing/selftests/bpf/progs/test_bpf_cookie.c +++ b/tools/testing/selftests/bpf/progs/test_bpf_cookie.c @@ -15,6 +15,8 @@ __u64 uprobe_res; __u64 uretprobe_res; __u64 tp_res; __u64 pe_res; +__u64 raw_tp_res; +__u64 tp_btf_res; __u64 fentry_res; __u64 fexit_res; __u64 fmod_ret_res; @@ -87,6 +89,20 @@ int handle_pe(struct pt_regs *ctx) return 0; } +SEC("raw_tp/sys_enter") +int handle_raw_tp(void *ctx) +{ + update(ctx, &raw_tp_res); + return 0; +} + +SEC("tp_btf/sys_enter") +int handle_tp_btf(void *ctx) +{ + update(ctx, &tp_btf_res); + return 0; +} + SEC("fentry/bpf_fentry_test1") int BPF_PROG(fentry_test1, int a) { diff --git 
a/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c b/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c index e2bea4da19..f0759efff6 100644 --- a/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c +++ b/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c @@ -1,19 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - +#include "bpf_tracing_net.h" #include #include -#include "bpf_tcp_helpers.h" + +#ifndef ENOENT +#define ENOENT 2 +#endif struct sockaddr_in6 srv_sa6 = {}; __u16 listen_tp_sport = 0; diff --git a/tools/testing/selftests/bpf/progs/test_global_func10.c b/tools/testing/selftests/bpf/progs/test_global_func10.c index 8fba3f3649..5da001ca57 100644 --- a/tools/testing/selftests/bpf/progs/test_global_func10.c +++ b/tools/testing/selftests/bpf/progs/test_global_func10.c @@ -4,6 +4,10 @@ #include #include "bpf_misc.h" +#if !defined(__clang__) +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + struct Small { long x; }; diff --git a/tools/testing/selftests/bpf/progs/test_lwt_redirect.c b/tools/testing/selftests/bpf/progs/test_lwt_redirect.c index 8c895122f2..83439b87b7 100644 --- a/tools/testing/selftests/bpf/progs/test_lwt_redirect.c +++ b/tools/testing/selftests/bpf/progs/test_lwt_redirect.c @@ -3,7 +3,7 @@ #include #include #include -#include "bpf_tracing_net.h" +#include /* We don't care about whether the packet can be received by network stack. 
* Just care if the packet is sent to the correct device at correct direction diff --git a/tools/testing/selftests/bpf/progs/test_module_attach.c b/tools/testing/selftests/bpf/progs/test_module_attach.c index 8a1b50f3a0..cc1a012d03 100644 --- a/tools/testing/selftests/bpf/progs/test_module_attach.c +++ b/tools/testing/selftests/bpf/progs/test_module_attach.c @@ -73,6 +73,29 @@ int BPF_PROG(handle_fentry_manual, return 0; } +__u32 fentry_explicit_read_sz = 0; + +SEC("fentry/bpf_testmod:bpf_testmod_test_read") +int BPF_PROG(handle_fentry_explicit, + struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) +{ + fentry_explicit_read_sz = len; + return 0; +} + + +__u32 fentry_explicit_manual_read_sz = 0; + +SEC("fentry") +int BPF_PROG(handle_fentry_explicit_manual, + struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) +{ + fentry_explicit_manual_read_sz = len; + return 0; +} + __u32 fexit_read_sz = 0; int fexit_ret = 0; diff --git a/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c b/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c index 0763d49f9c..386315afad 100644 --- a/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c +++ b/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c @@ -5,23 +5,48 @@ #include #include +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 2); + __type(key, __u32); + __type(value, __u32); +} sock_map SEC(".maps"); + __u64 user_pid = 0; __u64 user_tgid = 0; __u64 dev = 0; __u64 ino = 0; -SEC("tracepoint/syscalls/sys_enter_nanosleep") -int handler(const void *ctx) +static void get_pid_tgid(void) { struct bpf_pidns_info nsdata; if (bpf_get_ns_current_pid_tgid(dev, ino, &nsdata, sizeof(struct bpf_pidns_info))) - return 0; + return; user_pid = nsdata.pid; user_tgid = nsdata.tgid; +} +SEC("?tracepoint/syscalls/sys_enter_nanosleep") +int tp_handler(const void *ctx) +{ + 
get_pid_tgid(); return 0; } +SEC("?cgroup/bind4") +int cgroup_bind4(struct bpf_sock_addr *ctx) +{ + get_pid_tgid(); + return 1; +} + +SEC("?sk_msg") +int sk_msg(struct sk_msg_md *msg) +{ + get_pid_tgid(); + return SK_PASS; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_perf_skip.c b/tools/testing/selftests/bpf/progs/test_perf_skip.c new file mode 100644 index 0000000000..7eb8b6de7a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_perf_skip.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include +#include + +uintptr_t ip; + +SEC("perf_event") +int handler(struct bpf_perf_event_data *data) +{ + /* Skip events that have the correct ip. */ + return ip != PT_REGS_IP(&data->regs); +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf_n.c b/tools/testing/selftests/bpf/progs/test_ringbuf_n.c new file mode 100644 index 0000000000..8669eb42db --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ringbuf_n.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2024 Andrea Righi + +#include +#include +#include +#include +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +#define TASK_COMM_LEN 16 + +struct sample { + int pid; + long value; + char comm[16]; +}; + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); +} ringbuf SEC(".maps"); + +int pid = 0; +long value = 0; + +SEC("fentry/" SYS_PREFIX "sys_getpgid") +int test_ringbuf_n(void *ctx) +{ + int cur_pid = bpf_get_current_pid_tgid() >> 32; + struct sample *sample; + + if (cur_pid != pid) + return 0; + + sample = bpf_ringbuf_reserve(&ringbuf, sizeof(*sample), 0); + if (!sample) + return 0; + + sample->pid = pid; + sample->value = value; + bpf_get_current_comm(sample->comm, sizeof(sample->comm)); + + bpf_ringbuf_submit(sample, 0); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf_write.c 
b/tools/testing/selftests/bpf/progs/test_ringbuf_write.c new file mode 100644 index 0000000000..350513c0e4 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ringbuf_write.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); +} ringbuf SEC(".maps"); + +/* inputs */ +int pid = 0; + +/* outputs */ +long passed = 0; +long discarded = 0; + +SEC("fentry/" SYS_PREFIX "sys_getpgid") +int test_ringbuf_write(void *ctx) +{ + int *foo, cur_pid = bpf_get_current_pid_tgid() >> 32; + void *sample1, *sample2; + + if (cur_pid != pid) + return 0; + + sample1 = bpf_ringbuf_reserve(&ringbuf, 0x3000, 0); + if (!sample1) + return 0; + /* first one can pass */ + sample2 = bpf_ringbuf_reserve(&ringbuf, 0x3000, 0); + if (!sample2) { + bpf_ringbuf_discard(sample1, 0); + __sync_fetch_and_add(&discarded, 1); + return 0; + } + /* second one must not */ + __sync_fetch_and_add(&passed, 1); + foo = sample2 + 4084; + *foo = 256; + bpf_ringbuf_discard(sample1, 0); + bpf_ringbuf_discard(sample2, 0); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c b/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c index 02e718f06e..40531e5677 100644 --- a/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c +++ b/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c @@ -84,7 +84,7 @@ int BPF_PROG(trace_tcp_connect, struct sock *sk) } SEC("fexit/inet_csk_accept") -int BPF_PROG(inet_csk_accept, struct sock *sk, int flags, int *err, bool kern, +int BPF_PROG(inet_csk_accept, struct sock *sk, struct proto_accept_arg *arg, struct sock *accepted_sk) { set_task_info(accepted_sk); diff --git a/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c b/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c index 45e8fc75a7..996b177324 100644 --- a/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c 
+++ b/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c @@ -24,8 +24,7 @@ struct { __type(value, __u64); } socket_storage SEC(".maps"); -SEC("sk_msg") -int prog_msg_verdict(struct sk_msg_md *msg) +static int prog_msg_verdict_common(struct sk_msg_md *msg) { struct task_struct *task = (struct task_struct *)bpf_get_current_task(); int verdict = SK_PASS; @@ -44,4 +43,28 @@ int prog_msg_verdict(struct sk_msg_md *msg) return verdict; } +SEC("sk_msg") +int prog_msg_verdict(struct sk_msg_md *msg) +{ + return prog_msg_verdict_common(msg); +} + +SEC("sk_msg") +int prog_msg_verdict_clone(struct sk_msg_md *msg) +{ + return prog_msg_verdict_common(msg); +} + +SEC("sk_msg") +int prog_msg_verdict_clone2(struct sk_msg_md *msg) +{ + return prog_msg_verdict_common(msg); +} + +SEC("sk_skb/stream_verdict") +int prog_skb_verdict(struct __sk_buff *skb) +{ + return SK_PASS; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_sock_fields.c b/tools/testing/selftests/bpf/progs/test_sock_fields.c index f75e531bf3..196844be34 100644 --- a/tools/testing/selftests/bpf/progs/test_sock_fields.c +++ b/tools/testing/selftests/bpf/progs/test_sock_fields.c @@ -7,7 +7,6 @@ #include #include -#include "bpf_tcp_helpers.h" enum bpf_linum_array_idx { EGRESS_LINUM_IDX, @@ -42,6 +41,10 @@ struct { __type(value, struct bpf_spinlock_cnt); } sk_pkt_out_cnt10 SEC(".maps"); +struct tcp_sock { + __u32 lsndtime; +} __attribute__((preserve_access_index)); + struct bpf_tcp_sock listen_tp = {}; struct sockaddr_in6 srv_sa6 = {}; struct bpf_tcp_sock cli_tp = {}; diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c b/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c index 1d86a717a2..69aacc96db 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c +++ b/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c @@ -23,10 +23,25 @@ struct { __type(value, int); } sock_map_msg SEC(".maps"); -SEC("sk_skb") 
+SEC("sk_skb/stream_verdict") int prog_skb_verdict(struct __sk_buff *skb) { return SK_PASS; } +int clone_called; + +SEC("sk_skb/stream_verdict") +int prog_skb_verdict_clone(struct __sk_buff *skb) +{ + clone_called = 1; + return SK_PASS; +} + +SEC("sk_skb/stream_parser") +int prog_skb_parser(struct __sk_buff *skb) +{ + return SK_PASS; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c b/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c index 3c69aa9717..d25b0bb30f 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c +++ b/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c @@ -9,7 +9,7 @@ struct { __type(value, __u64); } sock_map SEC(".maps"); -SEC("sk_skb") +SEC("sk_skb/verdict") int prog_skb_verdict(struct __sk_buff *skb) { return SK_DROP; diff --git a/tools/testing/selftests/bpf/progs/test_tc_link.c b/tools/testing/selftests/bpf/progs/test_tc_link.c index 992400acb9..ab3eae3d6a 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_link.c +++ b/tools/testing/selftests/bpf/progs/test_tc_link.c @@ -4,7 +4,8 @@ #include #include - +#include +#include #include #include @@ -16,7 +17,13 @@ bool seen_tc3; bool seen_tc4; bool seen_tc5; bool seen_tc6; +bool seen_tc7; + +bool set_type; + bool seen_eth; +bool seen_host; +bool seen_mcast; SEC("tc/ingress") int tc1(struct __sk_buff *skb) @@ -28,8 +35,16 @@ int tc1(struct __sk_buff *skb) if (bpf_skb_load_bytes(skb, 0, &eth, sizeof(eth))) goto out; seen_eth = eth.h_proto == bpf_htons(ETH_P_IP); + seen_host = skb->pkt_type == PACKET_HOST; + if (seen_host && set_type) { + eth.h_dest[0] = 4; + if (bpf_skb_store_bytes(skb, 0, &eth, sizeof(eth), 0)) + goto fail; + bpf_skb_change_type(skb, PACKET_MULTICAST); + } out: seen_tc1 = true; +fail: return TCX_NEXT; } @@ -67,3 +82,21 @@ int tc6(struct __sk_buff *skb) seen_tc6 = true; return TCX_PASS; } + +SEC("tc/ingress") +int tc7(struct __sk_buff *skb) +{ + 
struct ethhdr eth = {}; + + if (skb->protocol != __bpf_constant_htons(ETH_P_IP)) + goto out; + if (bpf_skb_load_bytes(skb, 0, &eth, sizeof(eth))) + goto out; + if (eth.h_dest[0] == 4 && set_type) { + seen_mcast = skb->pkt_type == PACKET_MULTICAST; + bpf_skb_change_type(skb, PACKET_HOST); + } +out: + seen_tc7 = true; + return TCX_PASS; +} diff --git a/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c index a3f3f43fc1..6935f32eeb 100644 --- a/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c @@ -1,18 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "bpf_tracing_net.h" #include #include -#include "bpf_tcp_helpers.h" #include "test_tcpbpf.h" struct tcpbpf_globals global = {}; diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c index 3e436e6f73..3f5abcf3ff 100644 --- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c @@ -567,12 +567,18 @@ int ip6vxlan_get_tunnel_src(struct __sk_buff *skb) return TC_ACT_OK; } +struct local_geneve_opt { + struct geneve_opt gopt; + int data; +}; + SEC("tc") int geneve_set_tunnel(struct __sk_buff *skb) { int ret; struct bpf_tunnel_key key; - struct geneve_opt gopt; + struct local_geneve_opt local_gopt; + struct geneve_opt *gopt = (struct geneve_opt *) &local_gopt; __builtin_memset(&key, 0x0, sizeof(key)); key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */ @@ -580,14 +586,14 @@ int geneve_set_tunnel(struct __sk_buff *skb) key.tunnel_tos = 0; key.tunnel_ttl = 64; - __builtin_memset(&gopt, 0x0, sizeof(gopt)); - gopt.opt_class = bpf_htons(0x102); /* Open Virtual Networking (OVN) */ - gopt.type = 0x08; - gopt.r1 = 0; - gopt.r2 = 0; - gopt.r3 = 0; - gopt.length = 2; /* 4-byte multiple */ - 
*(int *) &gopt.opt_data = bpf_htonl(0xdeadbeef); + __builtin_memset(gopt, 0x0, sizeof(local_gopt)); + gopt->opt_class = bpf_htons(0x102); /* Open Virtual Networking (OVN) */ + gopt->type = 0x08; + gopt->r1 = 0; + gopt->r2 = 0; + gopt->r3 = 0; + gopt->length = 2; /* 4-byte multiple */ + *(int *) &gopt->opt_data = bpf_htonl(0xdeadbeef); ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_ZERO_CSUM_TX); @@ -596,7 +602,7 @@ int geneve_set_tunnel(struct __sk_buff *skb) return TC_ACT_SHOT; } - ret = bpf_skb_set_tunnel_opt(skb, &gopt, sizeof(gopt)); + ret = bpf_skb_set_tunnel_opt(skb, gopt, sizeof(local_gopt)); if (ret < 0) { log_err(ret); return TC_ACT_SHOT; @@ -631,7 +637,8 @@ SEC("tc") int ip6geneve_set_tunnel(struct __sk_buff *skb) { struct bpf_tunnel_key key; - struct geneve_opt gopt; + struct local_geneve_opt local_gopt; + struct geneve_opt *gopt = (struct geneve_opt *) &local_gopt; int ret; __builtin_memset(&key, 0x0, sizeof(key)); @@ -647,16 +654,16 @@ int ip6geneve_set_tunnel(struct __sk_buff *skb) return TC_ACT_SHOT; } - __builtin_memset(&gopt, 0x0, sizeof(gopt)); - gopt.opt_class = bpf_htons(0x102); /* Open Virtual Networking (OVN) */ - gopt.type = 0x08; - gopt.r1 = 0; - gopt.r2 = 0; - gopt.r3 = 0; - gopt.length = 2; /* 4-byte multiple */ - *(int *) &gopt.opt_data = bpf_htonl(0xfeedbeef); + __builtin_memset(gopt, 0x0, sizeof(local_gopt)); + gopt->opt_class = bpf_htons(0x102); /* Open Virtual Networking (OVN) */ + gopt->type = 0x08; + gopt->r1 = 0; + gopt->r2 = 0; + gopt->r3 = 0; + gopt->length = 2; /* 4-byte multiple */ + *(int *) &gopt->opt_data = bpf_htonl(0xfeedbeef); - ret = bpf_skb_set_tunnel_opt(skb, &gopt, sizeof(gopt)); + ret = bpf_skb_set_tunnel_opt(skb, gopt, sizeof(gopt)); if (ret < 0) { log_err(ret); return TC_ACT_SHOT; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c index 5c7e4758a0..fad94e41ce 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c +++ 
b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c @@ -318,6 +318,14 @@ bool encap_v6(struct xdp_md *xdp, struct ctl_value *cval, return true; } +#ifndef __clang__ +#pragma GCC push_options +/* GCC optimization collapses functions and increases the number of arguments + * beyond the compatible amount supported by BPF. + */ +#pragma GCC optimize("-fno-ipa-sra") +#endif + static __attribute__ ((noinline)) bool encap_v4(struct xdp_md *xdp, struct ctl_value *cval, struct packet_description *pckt, @@ -372,6 +380,10 @@ bool encap_v4(struct xdp_md *xdp, struct ctl_value *cval, return true; } +#ifndef __clang__ +#pragma GCC pop_options +#endif + static __attribute__ ((noinline)) int swap_mac_and_send(void *data, void *data_end) { @@ -588,12 +600,13 @@ static void connection_table_lookup(struct real_definition **real, __attribute__ ((noinline)) static int process_l3_headers_v6(struct packet_description *pckt, __u8 *protocol, __u64 off, - __u16 *pkt_bytes, void *data, - void *data_end) + __u16 *pkt_bytes, void *extra_args[2]) { struct ipv6hdr *ip6h; __u64 iph_len; int action; + void *data = extra_args[0]; + void *data_end = extra_args[1]; ip6h = data + off; if (ip6h + 1 > data_end) @@ -619,11 +632,12 @@ static int process_l3_headers_v6(struct packet_description *pckt, __attribute__ ((noinline)) static int process_l3_headers_v4(struct packet_description *pckt, __u8 *protocol, __u64 off, - __u16 *pkt_bytes, void *data, - void *data_end) + __u16 *pkt_bytes, void *extra_args[2]) { struct iphdr *iph; int action; + void *data = extra_args[0]; + void *data_end = extra_args[1]; iph = data + off; if (iph + 1 > data_end) @@ -666,13 +680,14 @@ static int process_packet(void *data, __u64 off, void *data_end, __u8 protocol; __u32 vip_num; int action; + void *extra_args[2] = { data, data_end }; if (is_ipv6) action = process_l3_headers_v6(&pckt, &protocol, off, - &pkt_bytes, data, data_end); + &pkt_bytes, extra_args); else action = process_l3_headers_v4(&pckt, &protocol, off, - 
&pkt_bytes, data, data_end); + &pkt_bytes, extra_args); if (action >= 0) return action; protocol = pckt.flow.proto; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_vlan.c b/tools/testing/selftests/bpf/progs/test_xdp_vlan.c index f3ec808648..a758830226 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_vlan.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_vlan.c @@ -160,7 +160,7 @@ int xdp_prognum1(struct xdp_md *ctx) /* Modifying VLAN, preserve top 4 bits */ vlan_hdr->h_vlan_TCI = - bpf_htons((bpf_ntohs(vlan_hdr->h_vlan_TCI) & 0xf000) + bpf_htons((bpf_ntohs(vlan_hdr->h_vlan_TCI) & 0xf000U) | TO_VLAN); } diff --git a/tools/testing/selftests/bpf/progs/timer.c b/tools/testing/selftests/bpf/progs/timer.c index f615da97df..4c677c0012 100644 --- a/tools/testing/selftests/bpf/progs/timer.c +++ b/tools/testing/selftests/bpf/progs/timer.c @@ -2,9 +2,10 @@ /* Copyright (c) 2021 Facebook */ #include #include +#include #include #include -#include "bpf_tcp_helpers.h" +#include char _license[] SEC("license") = "GPL"; struct hmap_elem { diff --git a/tools/testing/selftests/bpf/progs/timer_failure.c b/tools/testing/selftests/bpf/progs/timer_failure.c index 0996c2486f..5a2e9dabf1 100644 --- a/tools/testing/selftests/bpf/progs/timer_failure.c +++ b/tools/testing/selftests/bpf/progs/timer_failure.c @@ -5,8 +5,8 @@ #include #include #include +#include #include "bpf_misc.h" -#include "bpf_tcp_helpers.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/timer_lockup.c b/tools/testing/selftests/bpf/progs/timer_lockup.c new file mode 100644 index 0000000000..3e52013328 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/timer_lockup.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +struct elem { + struct bpf_timer t; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, 
int); + __type(value, struct elem); +} timer1_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} timer2_map SEC(".maps"); + +int timer1_err; +int timer2_err; + +static int timer_cb1(void *map, int *k, struct elem *v) +{ + struct bpf_timer *timer; + int key = 0; + + timer = bpf_map_lookup_elem(&timer2_map, &key); + if (timer) + timer2_err = bpf_timer_cancel(timer); + + return 0; +} + +static int timer_cb2(void *map, int *k, struct elem *v) +{ + struct bpf_timer *timer; + int key = 0; + + timer = bpf_map_lookup_elem(&timer1_map, &key); + if (timer) + timer1_err = bpf_timer_cancel(timer); + + return 0; +} + +SEC("tc") +int timer1_prog(void *ctx) +{ + struct bpf_timer *timer; + int key = 0; + + timer = bpf_map_lookup_elem(&timer1_map, &key); + if (timer) { + bpf_timer_init(timer, &timer1_map, CLOCK_BOOTTIME); + bpf_timer_set_callback(timer, timer_cb1); + bpf_timer_start(timer, 1, BPF_F_TIMER_CPU_PIN); + } + + return 0; +} + +SEC("tc") +int timer2_prog(void *ctx) +{ + struct bpf_timer *timer; + int key = 0; + + timer = bpf_map_lookup_elem(&timer2_map, &key); + if (timer) { + bpf_timer_init(timer, &timer2_map, CLOCK_BOOTTIME); + bpf_timer_set_callback(timer, timer_cb2); + bpf_timer_start(timer, 1, BPF_F_TIMER_CPU_PIN); + } + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/timer_mim.c b/tools/testing/selftests/bpf/progs/timer_mim.c index 2fee7ab105..50ebc3f685 100644 --- a/tools/testing/selftests/bpf/progs/timer_mim.c +++ b/tools/testing/selftests/bpf/progs/timer_mim.c @@ -4,7 +4,7 @@ #include #include #include -#include "bpf_tcp_helpers.h" +#include char _license[] SEC("license") = "GPL"; struct hmap_elem { diff --git a/tools/testing/selftests/bpf/progs/timer_mim_reject.c b/tools/testing/selftests/bpf/progs/timer_mim_reject.c index 5d648e3d8a..dd3f1ed6d6 100644 --- a/tools/testing/selftests/bpf/progs/timer_mim_reject.c +++ 
b/tools/testing/selftests/bpf/progs/timer_mim_reject.c @@ -4,7 +4,7 @@ #include #include #include -#include "bpf_tcp_helpers.h" +#include char _license[] SEC("license") = "GPL"; struct hmap_elem { diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c index 5fda439010..2619ed193c 100644 --- a/tools/testing/selftests/bpf/progs/trigger_bench.c +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2020 Facebook - #include #include #include @@ -9,82 +8,126 @@ char _license[] SEC("license") = "GPL"; -long hits = 0; +#define CPU_MASK 255 +#define MAX_CPUS (CPU_MASK + 1) /* should match MAX_BUCKETS in benchs/bench_trigger.c */ -SEC("tp/syscalls/sys_enter_getpgid") -int bench_trigger_tp(void *ctx) +/* matches struct counter in bench.h */ +struct counter { + long value; +} __attribute__((aligned(128))); + +struct counter hits[MAX_CPUS]; + +static __always_inline void inc_counter(void) +{ + int cpu = bpf_get_smp_processor_id(); + + __sync_add_and_fetch(&hits[cpu & CPU_MASK].value, 1); +} + +SEC("?uprobe") +int bench_trigger_uprobe(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } -SEC("raw_tp/sys_enter") -int BPF_PROG(bench_trigger_raw_tp, struct pt_regs *regs, long id) +const volatile int batch_iters = 0; + +SEC("?raw_tp") +int trigger_count(void *ctx) { - if (id == __NR_getpgid) - __sync_add_and_fetch(&hits, 1); + int i; + + for (i = 0; i < batch_iters; i++) + inc_counter(); + return 0; } -SEC("kprobe/" SYS_PREFIX "sys_getpgid") +SEC("?raw_tp") +int trigger_driver(void *ctx) +{ + int i; + + for (i = 0; i < batch_iters; i++) + (void)bpf_get_numa_node_id(); /* attach point for benchmarking */ + + return 0; +} + +extern int bpf_modify_return_test_tp(int nonce) __ksym __weak; + +SEC("?raw_tp") +int trigger_driver_kfunc(void *ctx) +{ + int i; + + for (i = 0; i < batch_iters; i++) + (void)bpf_modify_return_test_tp(0); /* 
attach point for benchmarking */ + + return 0; +} + +SEC("?kprobe/bpf_get_numa_node_id") int bench_trigger_kprobe(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } -SEC("kretprobe/" SYS_PREFIX "sys_getpgid") +SEC("?kretprobe/bpf_get_numa_node_id") int bench_trigger_kretprobe(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } -SEC("kprobe.multi/" SYS_PREFIX "sys_getpgid") +SEC("?kprobe.multi/bpf_get_numa_node_id") int bench_trigger_kprobe_multi(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } -SEC("kretprobe.multi/" SYS_PREFIX "sys_getpgid") +SEC("?kretprobe.multi/bpf_get_numa_node_id") int bench_trigger_kretprobe_multi(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } -SEC("fentry/" SYS_PREFIX "sys_getpgid") +SEC("?fentry/bpf_get_numa_node_id") int bench_trigger_fentry(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } -SEC("fexit/" SYS_PREFIX "sys_getpgid") +SEC("?fexit/bpf_get_numa_node_id") int bench_trigger_fexit(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } -SEC("fentry.s/" SYS_PREFIX "sys_getpgid") -int bench_trigger_fentry_sleep(void *ctx) +SEC("?fmod_ret/bpf_modify_return_test_tp") +int bench_trigger_fmodret(void *ctx) { - __sync_add_and_fetch(&hits, 1); - return 0; + inc_counter(); + return -22; } -SEC("fmod_ret/" SYS_PREFIX "sys_getpgid") -int bench_trigger_fmodret(void *ctx) +SEC("?tp/bpf_test_run/bpf_trigger_tp") +int bench_trigger_tp(void *ctx) { - __sync_add_and_fetch(&hits, 1); - return -22; + inc_counter(); + return 0; } -SEC("uprobe") -int bench_trigger_uprobe(void *ctx) +SEC("?raw_tp/bpf_trigger_tp") +int bench_trigger_rawtp(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi.c b/tools/testing/selftests/bpf/progs/uprobe_multi.c index 419d9aa28f..44190efcdb 100644 --- 
a/tools/testing/selftests/bpf/progs/uprobe_multi.c +++ b/tools/testing/selftests/bpf/progs/uprobe_multi.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include "vmlinux.h" #include #include -#include +#include char _license[] SEC("license") = "GPL"; @@ -22,6 +22,13 @@ __u64 uprobe_multi_sleep_result = 0; int pid = 0; int child_pid = 0; +int child_tid = 0; +int child_pid_usdt = 0; +int child_tid_usdt = 0; + +int expect_pid = 0; +bool bad_pid_seen = false; +bool bad_pid_seen_usdt = false; bool test_cookie = false; void *user_ptr = 0; @@ -36,11 +43,19 @@ static __always_inline bool verify_sleepable_user_copy(void) static void uprobe_multi_check(void *ctx, bool is_return, bool is_sleep) { - child_pid = bpf_get_current_pid_tgid() >> 32; + __u64 cur_pid_tgid = bpf_get_current_pid_tgid(); + __u32 cur_pid; - if (pid && child_pid != pid) + cur_pid = cur_pid_tgid >> 32; + if (pid && cur_pid != pid) return; + if (expect_pid && cur_pid != expect_pid) + bad_pid_seen = true; + + child_pid = cur_pid_tgid >> 32; + child_tid = (__u32)cur_pid_tgid; + __u64 cookie = test_cookie ? 
bpf_get_attach_cookie(ctx) : 0; __u64 addr = bpf_get_func_ip(ctx); @@ -97,5 +112,32 @@ int uretprobe_sleep(struct pt_regs *ctx) SEC("uprobe.multi//proc/self/exe:uprobe_multi_func_*") int uprobe_extra(struct pt_regs *ctx) { + /* we need this one just to mix PID-filtered and global uprobes */ + return 0; +} + +SEC("usdt") +int usdt_pid(struct pt_regs *ctx) +{ + __u64 cur_pid_tgid = bpf_get_current_pid_tgid(); + __u32 cur_pid; + + cur_pid = cur_pid_tgid >> 32; + if (pid && cur_pid != pid) + return 0; + + if (expect_pid && cur_pid != expect_pid) + bad_pid_seen_usdt = true; + + child_pid_usdt = cur_pid_tgid >> 32; + child_tid_usdt = (__u32)cur_pid_tgid; + + return 0; +} + +SEC("usdt") +int usdt_extra(struct pt_regs *ctx) +{ + /* we need this one just to mix PID-filtered and global USDT probes */ return 0; } diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index 960998f163..a0bb7fb40e 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -885,6 +885,69 @@ l1_%=: r0 = 0; \ : __clobber_all); } +SEC("socket") +__description("bounds check for non const xor src dst") +__success __log_level(2) +__msg("5: (af) r0 ^= r6 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))") +__naked void non_const_xor_src_dst(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r6 = r0; \ + call %[bpf_get_prandom_u32]; \ + r6 &= 0xaf; \ + r0 &= 0x1a0; \ + r0 ^= r6; \ + exit; \ +" : + : __imm(bpf_map_lookup_elem), + __imm_addr(map_hash_8b), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("bounds check for non const or src dst") +__success __log_level(2) +__msg("5: (4f) r0 |= r6 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))") +__naked void non_const_or_src_dst(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r6 = r0; \ + call 
%[bpf_get_prandom_u32]; \ + r6 &= 0xaf; \ + r0 &= 0x1a0; \ + r0 |= r6; \ + exit; \ +" : + : __imm(bpf_map_lookup_elem), + __imm_addr(map_hash_8b), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("bounds check for non const mul regs") +__success __log_level(2) +__msg("5: (2f) r0 *= r6 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=3825,var_off=(0x0; 0xfff))") +__naked void non_const_mul_regs(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r6 = r0; \ + call %[bpf_get_prandom_u32]; \ + r6 &= 0xff; \ + r0 &= 0x0f; \ + r0 *= r6; \ + exit; \ +" : + : __imm(bpf_map_lookup_elem), + __imm_addr(map_hash_8b), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + SEC("socket") __description("bounds checks after 32-bit truncation. test 1") __success __failure_unpriv __msg_unpriv("R0 leaks addr") diff --git a/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c b/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c index 0ede0ccd09..059aa716e3 100644 --- a/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c +++ b/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c @@ -30,7 +30,7 @@ struct { SEC("kprobe") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_KPROBE") -__failure __msg("unknown func bpf_ktime_get_coarse_ns") +__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") __naked void in_bpf_prog_type_kprobe_1(void) { asm volatile (" \ @@ -44,7 +44,7 @@ __naked void in_bpf_prog_type_kprobe_1(void) SEC("tracepoint") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_TRACEPOINT") -__failure __msg("unknown func bpf_ktime_get_coarse_ns") +__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") __naked void in_bpf_prog_type_tracepoint_1(void) { asm volatile (" \ @@ -58,7 +58,7 @@ __naked void in_bpf_prog_type_tracepoint_1(void) SEC("perf_event") __description("bpf_ktime_get_coarse_ns is forbidden 
in BPF_PROG_TYPE_PERF_EVENT") -__failure __msg("unknown func bpf_ktime_get_coarse_ns") +__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") __naked void bpf_prog_type_perf_event_1(void) { asm volatile (" \ @@ -72,7 +72,7 @@ __naked void bpf_prog_type_perf_event_1(void) SEC("raw_tracepoint") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_RAW_TRACEPOINT") -__failure __msg("unknown func bpf_ktime_get_coarse_ns") +__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") __naked void bpf_prog_type_raw_tracepoint_1(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c index 99e561f18f..80c737b6d3 100644 --- a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c +++ b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c @@ -274,6 +274,58 @@ static __naked void iter_limit_bug_cb(void) ); } +int tmp_var; +SEC("socket") +__failure __msg("infinite loop detected at insn 2") +__naked void jgt_imm64_and_may_goto(void) +{ + asm volatile (" \ + r0 = %[tmp_var] ll; \ +l0_%=: .byte 0xe5; /* may_goto */ \ + .byte 0; /* regs */ \ + .short -3; /* off -3 */ \ + .long 0; /* imm */ \ + if r0 > 10 goto l0_%=; \ + r0 = 0; \ + exit; \ +" :: __imm_addr(tmp_var) + : __clobber_all); +} + +SEC("socket") +__failure __msg("infinite loop detected at insn 1") +__naked void may_goto_self(void) +{ + asm volatile (" \ + r0 = *(u32 *)(r10 - 4); \ +l0_%=: .byte 0xe5; /* may_goto */ \ + .byte 0; /* regs */ \ + .short -1; /* off -1 */ \ + .long 0; /* imm */ \ + if r0 > 10 goto l0_%=; \ + r0 = 0; \ + exit; \ +" ::: __clobber_all); +} + +SEC("socket") +__success __retval(0) +__naked void may_goto_neg_off(void) +{ + asm volatile (" \ + r0 = *(u32 *)(r10 - 4); \ + goto l0_%=; \ + goto l1_%=; \ +l0_%=: .byte 0xe5; /* may_goto */ \ + .byte 0; /* regs */ \ + .short -2; /* off -2 */ \ + .long 0; /* 
imm */ \ + if r0 > 10 goto l0_%=; \ +l1_%=: r0 = 0; \ + exit; \ +" ::: __clobber_all); +} + SEC("tc") __failure __flag(BPF_F_TEST_STATE_FREQ) @@ -307,6 +359,100 @@ int iter_limit_bug(struct __sk_buff *skb) return 0; } +SEC("socket") +__success __retval(0) +__naked void ja_and_may_goto(void) +{ + asm volatile (" \ +l0_%=: .byte 0xe5; /* may_goto */ \ + .byte 0; /* regs */ \ + .short 1; /* off 1 */ \ + .long 0; /* imm */ \ + goto l0_%=; \ + r0 = 0; \ + exit; \ +" ::: __clobber_common); +} + +SEC("socket") +__success __retval(0) +__naked void ja_and_may_goto2(void) +{ + asm volatile (" \ +l0_%=: r0 = 0; \ + .byte 0xe5; /* may_goto */ \ + .byte 0; /* regs */ \ + .short 1; /* off 1 */ \ + .long 0; /* imm */ \ + goto l0_%=; \ + r0 = 0; \ + exit; \ +" ::: __clobber_common); +} + +SEC("socket") +__success __retval(0) +__naked void jlt_and_may_goto(void) +{ + asm volatile (" \ +l0_%=: call %[bpf_jiffies64]; \ + .byte 0xe5; /* may_goto */ \ + .byte 0; /* regs */ \ + .short 1; /* off 1 */ \ + .long 0; /* imm */ \ + if r0 < 10 goto l0_%=; \ + r0 = 0; \ + exit; \ +" :: __imm(bpf_jiffies64) + : __clobber_all); +} + +#if (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ + (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64) || \ + defined(__TARGET_ARCH_arm) || defined(__TARGET_ARCH_s390) || \ + defined(__TARGET_ARCH_loongarch)) && \ + __clang_major__ >= 18 +SEC("socket") +__success __retval(0) +__naked void gotol_and_may_goto(void) +{ + asm volatile (" \ +l0_%=: r0 = 0; \ + .byte 0xe5; /* may_goto */ \ + .byte 0; /* regs */ \ + .short 1; /* off 1 */ \ + .long 0; /* imm */ \ + gotol l0_%=; \ + r0 = 0; \ + exit; \ +" ::: __clobber_common); +} +#endif + +SEC("socket") +__success __retval(0) +__naked void ja_and_may_goto_subprog(void) +{ + asm volatile (" \ + call subprog_with_may_goto; \ + exit; \ +" ::: __clobber_all); +} + +static __naked __noinline __used +void subprog_with_may_goto(void) +{ + asm volatile (" \ +l0_%=: .byte 0xe5; /* may_goto */ \ + .byte 0; /* 
regs */ \ + .short 1; /* off 1 */ \ + .long 0; /* imm */ \ + goto l0_%=; \ + r0 = 0; \ + exit; \ +" ::: __clobber_all); +} + #define ARR_SZ 1000000 int zero; char arr[ARR_SZ]; @@ -318,7 +464,7 @@ int cond_break1(const void *ctx) unsigned long i; unsigned int sum = 0; - for (i = zero; i < ARR_SZ; cond_break, i++) + for (i = zero; i < ARR_SZ && can_loop; i++) sum += i; for (i = zero; i < ARR_SZ; i++) { barrier_var(i); @@ -336,12 +482,11 @@ int cond_break2(const void *ctx) int i, j; int sum = 0; - for (i = zero; i < 1000; cond_break, i++) + for (i = zero; i < 1000 && can_loop; i++) for (j = zero; j < 1000; j++) { sum += i + j; cond_break; - } - + } return sum; } @@ -349,7 +494,7 @@ static __noinline int loop(void) { int i, sum = 0; - for (i = zero; i <= 1000000; i++, cond_break) + for (i = zero; i <= 1000000 && can_loop; i++) sum += i; return sum; diff --git a/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c b/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c new file mode 100644 index 0000000000..cb32b0cfc8 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ + +#include +#include +#include + +#include "bpf_misc.h" +#include "cgrp_kfunc_common.h" +#include "cpumask_common.h" +#include "task_kfunc_common.h" + +char _license[] SEC("license") = "GPL"; + +/*************** + * Task kfuncs * + ***************/ + +static void task_kfunc_load_test(void) +{ + struct task_struct *current, *ref_1, *ref_2; + + current = bpf_get_current_task_btf(); + ref_1 = bpf_task_from_pid(current->pid); + if (!ref_1) + return; + + ref_2 = bpf_task_acquire(ref_1); + if (ref_2) + bpf_task_release(ref_2); + bpf_task_release(ref_1); +} + +SEC("raw_tp") +__failure __msg("calling kernel function") +int BPF_PROG(task_kfunc_raw_tp) +{ + task_kfunc_load_test(); + return 0; +} + +SEC("syscall") +__success +int BPF_PROG(task_kfunc_syscall) +{ + task_kfunc_load_test(); + return 0; +} + +/***************** + * cgroup kfuncs * + *****************/ + +static void cgrp_kfunc_load_test(void) +{ + struct cgroup *cgrp, *ref; + + cgrp = bpf_cgroup_from_id(0); + if (!cgrp) + return; + + ref = bpf_cgroup_acquire(cgrp); + if (!ref) { + bpf_cgroup_release(cgrp); + return; + } + + bpf_cgroup_release(ref); + bpf_cgroup_release(cgrp); +} + +SEC("raw_tp") +__failure __msg("calling kernel function") +int BPF_PROG(cgrp_kfunc_raw_tp) +{ + cgrp_kfunc_load_test(); + return 0; +} + +SEC("syscall") +__success +int BPF_PROG(cgrp_kfunc_syscall) +{ + cgrp_kfunc_load_test(); + return 0; +} + +/****************** + * cpumask kfuncs * + ******************/ + +static void cpumask_kfunc_load_test(void) +{ + struct bpf_cpumask *alloc, *ref; + + alloc = bpf_cpumask_create(); + if (!alloc) + return; + + ref = bpf_cpumask_acquire(alloc); + bpf_cpumask_set_cpu(0, alloc); + bpf_cpumask_test_cpu(0, (const struct cpumask *)ref); + + bpf_cpumask_release(ref); + bpf_cpumask_release(alloc); +} + +SEC("raw_tp") +__failure __msg("calling kernel function") +int BPF_PROG(cpumask_kfunc_raw_tp) +{ + cpumask_kfunc_load_test(); + return 0; +} + +SEC("syscall") +__success +int 
BPF_PROG(cpumask_kfunc_syscall) +{ + cpumask_kfunc_load_test(); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/verifier_movsx.c b/tools/testing/selftests/bpf/progs/verifier_movsx.c index cbb9d6714f..028ec85558 100644 --- a/tools/testing/selftests/bpf/progs/verifier_movsx.c +++ b/tools/testing/selftests/bpf/progs/verifier_movsx.c @@ -224,6 +224,69 @@ l0_%=: \ : __clobber_all); } +SEC("socket") +__description("MOV32SX, S8, var_off u32_max") +__failure __msg("infinite loop detected") +__failure_unpriv __msg_unpriv("back-edge from insn 2 to 0") +__naked void mov64sx_s32_varoff_1(void) +{ + asm volatile (" \ +l0_%=: \ + r3 = *(u8 *)(r10 -387); \ + w7 = (s8)w3; \ + if w7 >= 0x2533823b goto l0_%=; \ + w0 = 0; \ + exit; \ +" : + : + : __clobber_all); +} + +SEC("socket") +__description("MOV32SX, S8, var_off not u32_max, positive after s8 extension") +__success __retval(0) +__failure_unpriv __msg_unpriv("frame pointer is read only") +__naked void mov64sx_s32_varoff_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r3 = r0; \ + r3 &= 0xf; \ + w7 = (s8)w3; \ + if w7 s>= 16 goto l0_%=; \ + w0 = 0; \ + exit; \ +l0_%=: \ + r10 = 1; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("MOV32SX, S8, var_off not u32_max, negative after s8 extension") +__success __retval(0) +__failure_unpriv __msg_unpriv("frame pointer is read only") +__naked void mov64sx_s32_varoff_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r3 = r0; \ + r3 &= 0xf; \ + r3 |= 0x80; \ + w7 = (s8)w3; \ + if w7 s>= -5 goto l0_%=; \ + w0 = 0; \ + exit; \ +l0_%=: \ + r10 = 1; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + #else SEC("socket") diff --git a/tools/testing/selftests/bpf/progs/verifier_or_jmp32_k.c b/tools/testing/selftests/bpf/progs/verifier_or_jmp32_k.c new file mode 100644 index 0000000000..f37713a265 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_or_jmp32_k.c @@ 
-0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" + +SEC("socket") +__description("or_jmp32_k: bit ops + branch on unknown value") +__failure +__msg("R0 invalid mem access 'scalar'") +__naked void or_jmp32_k(void) +{ + asm volatile (" \ + r0 = 0xffffffff; \ + r0 /= 1; \ + r1 = 0; \ + w1 = -1; \ + w1 >>= 1; \ + w0 &= w1; \ + w0 |= 2; \ + if w0 != 0x7ffffffd goto l1; \ + r0 = 1; \ + exit; \ +l3: \ + r0 = 5; \ + *(u64*)(r0 - 8) = r0; \ + exit; \ +l2: \ + w0 -= 0xe; \ + if w0 == 1 goto l3; \ + r0 = 4; \ + exit; \ +l1: \ + w0 -= 0x7ffffff0; \ + if w0 s>= 0xe goto l2; \ + r0 = 3; \ + exit; \ +" ::: __clobber_all); +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_sock_addr.c b/tools/testing/selftests/bpf/progs/verifier_sock_addr.c new file mode 100644 index 0000000000..9c31448a0f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_sock_addr.c @@ -0,0 +1,331 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC */ + +#include +#include +#include +#include "bpf_misc.h" + +SEC("cgroup/recvmsg4") +__success +int recvmsg4_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/recvmsg4") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int recvmsg4_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/recvmsg6") +__success +int recvmsg6_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/recvmsg6") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int recvmsg6_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/recvmsg_unix") +__success +int recvmsg_unix_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/recvmsg_unix") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int 
recvmsg_unix_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/sendmsg4") +__success +int sendmsg4_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/sendmsg4") +__success +int sendmsg4_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/sendmsg4") +__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]") +int sendmsg4_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 2; +} + +SEC("cgroup/sendmsg6") +__success +int sendmsg6_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/sendmsg6") +__success +int sendmsg6_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/sendmsg6") +__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]") +int sendmsg6_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 2; +} + +SEC("cgroup/sendmsg_unix") +__success +int sendmsg_unix_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/sendmsg_unix") +__success +int sendmsg_unix_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/sendmsg_unix") +__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]") +int sendmsg_unix_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 2; +} + +SEC("cgroup/getpeername4") +__success +int getpeername4_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/getpeername4") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int getpeername4_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/getpeername6") +__success +int getpeername6_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/getpeername6") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int getpeername6_bad_return_code(struct 
bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/getpeername_unix") +__success +int getpeername_unix_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/getpeername_unix") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int getpeername_unix_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/getsockname4") +__success +int getsockname4_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/getsockname4") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int getsockname4_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/getsockname6") +__success +int getsockname6_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/getsockname6") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int getsockname6_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/getsockname_unix") +__success +int getsockname_unix_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/getsockname_unix") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int getsockname_unix_unix_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/bind4") +__success +int bind4_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/bind4") +__success +int bind4_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/bind4") +__success +int bind4_good_return_code_2(struct bpf_sock_addr *ctx) +{ + return 2; +} + +SEC("cgroup/bind4") +__success +int bind4_good_return_code_3(struct bpf_sock_addr *ctx) +{ + return 3; +} + +SEC("cgroup/bind4") +__failure __msg("At program exit the register R0 has smin=4 smax=4 should have been in [0, 3]") +int bind4_bad_return_code(struct bpf_sock_addr *ctx) +{ + 
return 4; +} + +SEC("cgroup/bind6") +__success +int bind6_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/bind6") +__success +int bind6_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/bind6") +__success +int bind6_good_return_code_2(struct bpf_sock_addr *ctx) +{ + return 2; +} + +SEC("cgroup/bind6") +__success +int bind6_good_return_code_3(struct bpf_sock_addr *ctx) +{ + return 3; +} + +SEC("cgroup/bind6") +__failure __msg("At program exit the register R0 has smin=4 smax=4 should have been in [0, 3]") +int bind6_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 4; +} + +SEC("cgroup/connect4") +__success +int connect4_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/connect4") +__success +int connect4_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/connect4") +__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]") +int connect4_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 2; +} + +SEC("cgroup/connect6") +__success +int connect6_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/connect6") +__success +int connect6_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/connect6") +__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]") +int connect6_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 2; +} + +SEC("cgroup/connect_unix") +__success +int connect_unix_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/connect_unix") +__success +int connect_unix_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/connect_unix") +__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]") +int connect_unix_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 2; +} + +char _license[] SEC("license") = "GPL"; diff 
--git a/tools/testing/selftests/bpf/progs/verifier_sockmap_mutate.c b/tools/testing/selftests/bpf/progs/verifier_sockmap_mutate.c new file mode 100644 index 0000000000..fe4b123187 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_sockmap_mutate.c @@ -0,0 +1,187 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include + +#include "bpf_misc.h" + +#define __always_unused __attribute__((unused)) + +char _license[] SEC("license") = "GPL"; + +struct sock { +} __attribute__((preserve_access_index)); + +struct bpf_iter__sockmap { + union { + struct sock *sk; + }; +} __attribute__((preserve_access_index)); + +struct { + __uint(type, BPF_MAP_TYPE_SOCKHASH); + __uint(max_entries, 1); + __type(key, int); + __type(value, int); +} sockhash SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 1); + __type(key, int); + __type(value, int); +} sockmap SEC(".maps"); + +enum { CG_OK = 1 }; + +int zero = 0; + +static __always_inline void test_sockmap_delete(void) +{ + bpf_map_delete_elem(&sockmap, &zero); + bpf_map_delete_elem(&sockhash, &zero); +} + +static __always_inline void test_sockmap_update(void *sk) +{ + if (sk) { + bpf_map_update_elem(&sockmap, &zero, sk, BPF_ANY); + bpf_map_update_elem(&sockhash, &zero, sk, BPF_ANY); + } +} + +static __always_inline void test_sockmap_lookup_and_update(void) +{ + struct bpf_sock *sk = bpf_map_lookup_elem(&sockmap, &zero); + + if (sk) { + test_sockmap_update(sk); + bpf_sk_release(sk); + } +} + +static __always_inline void test_sockmap_mutate(void *sk) +{ + test_sockmap_delete(); + test_sockmap_update(sk); +} + +static __always_inline void test_sockmap_lookup_and_mutate(void) +{ + test_sockmap_delete(); + test_sockmap_lookup_and_update(); +} + +SEC("action") +__success +int test_sched_act(struct __sk_buff *skb) +{ + test_sockmap_mutate(skb->sk); + return 0; +} + +SEC("classifier") +__success +int test_sched_cls(struct __sk_buff *skb) +{ + test_sockmap_mutate(skb->sk); + return 0; +} + 
+SEC("flow_dissector") +__success +int test_flow_dissector_delete(struct __sk_buff *skb __always_unused) +{ + test_sockmap_delete(); + return 0; +} + +SEC("flow_dissector") +__failure __msg("program of this type cannot use helper bpf_sk_release") +int test_flow_dissector_update(struct __sk_buff *skb __always_unused) +{ + test_sockmap_lookup_and_update(); /* no access to skb->sk */ + return 0; +} + +SEC("iter/sockmap") +__success +int test_trace_iter(struct bpf_iter__sockmap *ctx) +{ + test_sockmap_mutate(ctx->sk); + return 0; +} + +SEC("raw_tp/kfree") +__failure __msg("cannot update sockmap in this context") +int test_raw_tp_delete(const void *ctx __always_unused) +{ + test_sockmap_delete(); + return 0; +} + +SEC("raw_tp/kfree") +__failure __msg("cannot update sockmap in this context") +int test_raw_tp_update(const void *ctx __always_unused) +{ + test_sockmap_lookup_and_update(); + return 0; +} + +SEC("sk_lookup") +__success +int test_sk_lookup(struct bpf_sk_lookup *ctx) +{ + test_sockmap_mutate(ctx->sk); + return 0; +} + +SEC("sk_reuseport") +__success +int test_sk_reuseport(struct sk_reuseport_md *ctx) +{ + test_sockmap_mutate(ctx->sk); + return 0; +} + +SEC("socket") +__success +int test_socket_filter(struct __sk_buff *skb) +{ + test_sockmap_mutate(skb->sk); + return 0; +} + +SEC("sockops") +__success +int test_sockops_delete(struct bpf_sock_ops *ctx __always_unused) +{ + test_sockmap_delete(); + return CG_OK; +} + +SEC("sockops") +__failure __msg("cannot update sockmap in this context") +int test_sockops_update(struct bpf_sock_ops *ctx) +{ + test_sockmap_update(ctx->sk); + return CG_OK; +} + +SEC("sockops") +__success +int test_sockops_update_dedicated(struct bpf_sock_ops *ctx) +{ + bpf_sock_map_update(ctx, &sockmap, &zero, BPF_ANY); + bpf_sock_hash_update(ctx, &sockhash, &zero, BPF_ANY); + return CG_OK; +} + +SEC("xdp") +__success +int test_xdp(struct xdp_md *ctx __always_unused) +{ + test_sockmap_lookup_and_mutate(); + return XDP_PASS; +} diff --git 
a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c index 6f5d19665c..4a58e0398e 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c @@ -6,6 +6,7 @@ #include #include #include "bpf_misc.h" +#include <../../../tools/include/linux/filter.h> #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0])) @@ -76,6 +77,94 @@ __naked int subprog_result_precise(void) ); } +__naked __noinline __used +static unsigned long fp_leaking_subprog() +{ + asm volatile ( + ".8byte %[r0_eq_r10_cast_s8];" + "exit;" + :: __imm_insn(r0_eq_r10_cast_s8, BPF_MOVSX64_REG(BPF_REG_0, BPF_REG_10, 8)) + ); +} + +__naked __noinline __used +static unsigned long sneaky_fp_leaking_subprog() +{ + asm volatile ( + "r1 = r10;" + ".8byte %[r0_eq_r1_cast_s8];" + "exit;" + :: __imm_insn(r0_eq_r1_cast_s8, BPF_MOVSX64_REG(BPF_REG_0, BPF_REG_1, 8)) + ); +} + +SEC("?raw_tp") +__success __log_level(2) +__msg("6: (0f) r1 += r0") +__msg("mark_precise: frame0: last_idx 6 first_idx 0 subseq_idx -1") +__msg("mark_precise: frame0: regs=r0 stack= before 5: (bf) r1 = r6") +__msg("mark_precise: frame0: regs=r0 stack= before 4: (27) r0 *= 4") +__msg("mark_precise: frame0: regs=r0 stack= before 3: (57) r0 &= 3") +__msg("mark_precise: frame0: regs=r0 stack= before 10: (95) exit") +__msg("mark_precise: frame1: regs=r0 stack= before 9: (bf) r0 = (s8)r10") +__msg("7: R0_w=scalar") +__naked int fp_precise_subprog_result(void) +{ + asm volatile ( + "call fp_leaking_subprog;" + /* use subprog's returned value (which is derived from r10=fp + * register), as index into vals array, forcing all of that to + * be known precisely + */ + "r0 &= 3;" + "r0 *= 4;" + "r1 = %[vals];" + /* force precision marking */ + "r1 += r0;" + "r0 = *(u32 *)(r1 + 0);" + "exit;" + : + : __imm_ptr(vals) + : __clobber_common + ); +} + +SEC("?raw_tp") +__success __log_level(2) +__msg("6: (0f) r1 += 
r0") +__msg("mark_precise: frame0: last_idx 6 first_idx 0 subseq_idx -1") +__msg("mark_precise: frame0: regs=r0 stack= before 5: (bf) r1 = r6") +__msg("mark_precise: frame0: regs=r0 stack= before 4: (27) r0 *= 4") +__msg("mark_precise: frame0: regs=r0 stack= before 3: (57) r0 &= 3") +__msg("mark_precise: frame0: regs=r0 stack= before 11: (95) exit") +__msg("mark_precise: frame1: regs=r0 stack= before 10: (bf) r0 = (s8)r1") +/* here r1 is marked precise, even though it's fp register, but that's fine + * because by the time we get out of subprogram it has to be derived from r10 + * anyways, at which point we'll break precision chain + */ +__msg("mark_precise: frame1: regs=r1 stack= before 9: (bf) r1 = r10") +__msg("7: R0_w=scalar") +__naked int sneaky_fp_precise_subprog_result(void) +{ + asm volatile ( + "call sneaky_fp_leaking_subprog;" + /* use subprog's returned value (which is derived from r10=fp + * register), as index into vals array, forcing all of that to + * be known precisely + */ + "r0 &= 3;" + "r0 *= 4;" + "r1 = %[vals];" + /* force precision marking */ + "r1 += r0;" + "r0 = *(u32 *)(r1 + 0);" + "exit;" + : + : __imm_ptr(vals) + : __clobber_common + ); +} + SEC("?raw_tp") __success __log_level(2) __msg("9: (0f) r1 += r0") diff --git a/tools/testing/selftests/bpf/progs/wq.c b/tools/testing/selftests/bpf/progs/wq.c new file mode 100644 index 0000000000..49e712acbf --- /dev/null +++ b/tools/testing/selftests/bpf/progs/wq.c @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Benjamin Tissoires + */ + +#include "bpf_experimental.h" +#include +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +struct hmap_elem { + int counter; + struct bpf_timer timer; /* unused */ + struct bpf_spin_lock lock; /* unused */ + struct bpf_wq work; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1000); + __type(key, int); + __type(value, struct hmap_elem); +} hmap 
SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(map_flags, BPF_F_NO_PREALLOC); + __uint(max_entries, 1000); + __type(key, int); + __type(value, struct hmap_elem); +} hmap_malloc SEC(".maps"); + +struct elem { + struct bpf_wq w; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 2); + __type(key, int); + __type(value, struct elem); +} array SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __uint(max_entries, 4); + __type(key, int); + __type(value, struct elem); +} lru SEC(".maps"); + +__u32 ok; +__u32 ok_sleepable; + +static int test_elem_callback(void *map, int *key, + int (callback_fn)(void *map, int *key, struct bpf_wq *wq)) +{ + struct elem init = {}, *val; + struct bpf_wq *wq; + + if ((ok & (1 << *key) || + (ok_sleepable & (1 << *key)))) + return -22; + + if (map == &lru && + bpf_map_update_elem(map, key, &init, 0)) + return -1; + + val = bpf_map_lookup_elem(map, key); + if (!val) + return -2; + + wq = &val->w; + if (bpf_wq_init(wq, map, 0) != 0) + return -3; + + if (bpf_wq_set_callback(wq, callback_fn, 0)) + return -4; + + if (bpf_wq_start(wq, 0)) + return -5; + + return 0; +} + +static int test_hmap_elem_callback(void *map, int *key, + int (callback_fn)(void *map, int *key, struct bpf_wq *wq)) +{ + struct hmap_elem init = {}, *val; + struct bpf_wq *wq; + + if ((ok & (1 << *key) || + (ok_sleepable & (1 << *key)))) + return -22; + + if (bpf_map_update_elem(map, key, &init, 0)) + return -1; + + val = bpf_map_lookup_elem(map, key); + if (!val) + return -2; + + wq = &val->work; + if (bpf_wq_init(wq, map, 0) != 0) + return -3; + + if (bpf_wq_set_callback(wq, callback_fn, 0)) + return -4; + + if (bpf_wq_start(wq, 0)) + return -5; + + return 0; +} + +/* callback for non sleepable workqueue */ +static int wq_callback(void *map, int *key, struct bpf_wq *work) +{ + bpf_kfunc_common_test(); + ok |= (1 << *key); + return 0; +} + +/* callback for sleepable workqueue */ +static int wq_cb_sleepable(void *map, int 
*key, struct bpf_wq *work) +{ + bpf_kfunc_call_test_sleepable(); + ok_sleepable |= (1 << *key); + return 0; +} + +SEC("tc") +/* test that workqueues can be used from an array */ +__retval(0) +long test_call_array_sleepable(void *ctx) +{ + int key = 0; + + return test_elem_callback(&array, &key, wq_cb_sleepable); +} + +SEC("syscall") +/* Same test than above but from a sleepable context. */ +__retval(0) +long test_syscall_array_sleepable(void *ctx) +{ + int key = 1; + + return test_elem_callback(&array, &key, wq_cb_sleepable); +} + +SEC("tc") +/* test that workqueues can be used from a hashmap */ +__retval(0) +long test_call_hash_sleepable(void *ctx) +{ + int key = 2; + + return test_hmap_elem_callback(&hmap, &key, wq_callback); +} + +SEC("tc") +/* test that workqueues can be used from a hashmap with NO_PREALLOC. */ +__retval(0) +long test_call_hash_malloc_sleepable(void *ctx) +{ + int key = 3; + + return test_hmap_elem_callback(&hmap_malloc, &key, wq_callback); +} + +SEC("tc") +/* test that workqueues can be used from a LRU map */ +__retval(0) +long test_call_lru_sleepable(void *ctx) +{ + int key = 4; + + return test_elem_callback(&lru, &key, wq_callback); +} diff --git a/tools/testing/selftests/bpf/progs/wq_failures.c b/tools/testing/selftests/bpf/progs/wq_failures.c new file mode 100644 index 0000000000..4cbdb425f2 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/wq_failures.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Benjamin Tissoires + */ + +#include "bpf_experimental.h" +#include +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +struct elem { + struct bpf_wq w; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 2); + __type(key, int); + __type(value, struct elem); +} array SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __uint(max_entries, 4); + __type(key, int); + __type(value, struct elem); +} lru SEC(".maps"); 
+ +/* callback for non sleepable workqueue */ +static int wq_callback(void *map, int *key, struct bpf_wq *work) +{ + bpf_kfunc_common_test(); + return 0; +} + +/* callback for sleepable workqueue */ +static int wq_cb_sleepable(void *map, int *key, struct bpf_wq *work) +{ + bpf_kfunc_call_test_sleepable(); + return 0; +} + +SEC("tc") +/* test that bpf_wq_init takes a map as a second argument + */ +__log_level(2) +__flag(BPF_F_TEST_STATE_FREQ) +__failure +__msg(": (85) call bpf_wq_init#") /* anchor message */ +__msg("pointer in R2 isn't map pointer") +long test_wq_init_nomap(void *ctx) +{ + struct bpf_wq *wq; + struct elem *val; + int key = 0; + + val = bpf_map_lookup_elem(&array, &key); + if (!val) + return -1; + + wq = &val->w; + if (bpf_wq_init(wq, &key, 0) != 0) + return -3; + + return 0; +} + +SEC("tc") +/* test that the workqueue is part of the map in bpf_wq_init + */ +__log_level(2) +__flag(BPF_F_TEST_STATE_FREQ) +__failure +__msg(": (85) call bpf_wq_init#") /* anchor message */ +__msg("workqueue pointer in R1 map_uid=0 doesn't match map pointer in R2 map_uid=0") +long test_wq_init_wrong_map(void *ctx) +{ + struct bpf_wq *wq; + struct elem *val; + int key = 0; + + val = bpf_map_lookup_elem(&array, &key); + if (!val) + return -1; + + wq = &val->w; + if (bpf_wq_init(wq, &lru, 0) != 0) + return -3; + + return 0; +} + +SEC("?tc") +__log_level(2) +__failure +/* check that the first argument of bpf_wq_set_callback() + * is a correct bpf_wq pointer. 
+ */ +__msg(": (85) call bpf_wq_set_callback_impl#") /* anchor message */ +__msg("arg#0 doesn't point to a map value") +long test_wrong_wq_pointer(void *ctx) +{ + int key = 0; + struct bpf_wq *wq; + + wq = bpf_map_lookup_elem(&array, &key); + if (!wq) + return 1; + + if (bpf_wq_init(wq, &array, 0)) + return 2; + + if (bpf_wq_set_callback((void *)&wq, wq_callback, 0)) + return 3; + + return -22; +} + +SEC("?tc") +__log_level(2) +__failure +/* check that the first argument of bpf_wq_set_callback() + * is a correct bpf_wq pointer. + */ +__msg(": (85) call bpf_wq_set_callback_impl#") /* anchor message */ +__msg("off 1 doesn't point to 'struct bpf_wq' that is at 0") +long test_wrong_wq_pointer_offset(void *ctx) +{ + int key = 0; + struct bpf_wq *wq; + + wq = bpf_map_lookup_elem(&array, &key); + if (!wq) + return 1; + + if (bpf_wq_init(wq, &array, 0)) + return 2; + + if (bpf_wq_set_callback((void *)wq + 1, wq_cb_sleepable, 0)) + return 3; + + return -22; +} diff --git a/tools/testing/selftests/bpf/test_cpp.cpp b/tools/testing/selftests/bpf/test_cpp.cpp index f4936834f7..dde0bb16e7 100644 --- a/tools/testing/selftests/bpf/test_cpp.cpp +++ b/tools/testing/selftests/bpf/test_cpp.cpp @@ -7,6 +7,7 @@ #include #include #include "test_core_extern.skel.h" +#include "struct_ops_module.skel.h" template class Skeleton { @@ -98,6 +99,7 @@ int main(int argc, char *argv[]) { struct btf_dump_opts opts = { }; struct test_core_extern *skel; + struct struct_ops_module *skel2; struct btf *btf; int fd; @@ -118,6 +120,9 @@ int main(int argc, char *argv[]) skel = test_core_extern__open_and_load(); test_core_extern__destroy(skel); + skel2 = struct_ops_module__open_and_load(); + struct_ops_module__destroy(skel2); + fd = bpf_enable_stats(BPF_STATS_RUN_TIME); if (fd < 0) std::cout << "FAILED to enable stats: " << fd << std::endl; diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py deleted file mode 100755 index 6157f884d0..0000000000 --- 
a/tools/testing/selftests/bpf/test_offload.py +++ /dev/null @@ -1,1405 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (C) 2017 Netronome Systems, Inc. -# Copyright (c) 2019 Mellanox Technologies. All rights reserved -# -# This software is licensed under the GNU General License Version 2, -# June 1991 as shown in the file COPYING in the top-level directory of this -# source tree. -# -# THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" -# WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, -# BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -# FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE -# OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME -# THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - -from datetime import datetime -import argparse -import errno -import json -import os -import pprint -import random -import re -import stat -import string -import struct -import subprocess -import time -import traceback - -logfile = None -log_level = 1 -skip_extack = False -bpf_test_dir = os.path.dirname(os.path.realpath(__file__)) -pp = pprint.PrettyPrinter() -devs = [] # devices we created for clean up -files = [] # files to be removed -netns = [] # net namespaces to be removed - -def log_get_sec(level=0): - return "*" * (log_level + level) - -def log_level_inc(add=1): - global log_level - log_level += add - -def log_level_dec(sub=1): - global log_level - log_level -= sub - -def log_level_set(level): - global log_level - log_level = level - -def log(header, data, level=None): - """ - Output to an optional log. 
- """ - if logfile is None: - return - if level is not None: - log_level_set(level) - - if not isinstance(data, str): - data = pp.pformat(data) - - if len(header): - logfile.write("\n" + log_get_sec() + " ") - logfile.write(header) - if len(header) and len(data.strip()): - logfile.write("\n") - logfile.write(data) - -def skip(cond, msg): - if not cond: - return - print("SKIP: " + msg) - log("SKIP: " + msg, "", level=1) - os.sys.exit(0) - -def fail(cond, msg): - if not cond: - return - print("FAIL: " + msg) - tb = "".join(traceback.extract_stack().format()) - print(tb) - log("FAIL: " + msg, tb, level=1) - os.sys.exit(1) - -def start_test(msg): - log(msg, "", level=1) - log_level_inc() - print(msg) - -def cmd(cmd, shell=True, include_stderr=False, background=False, fail=True): - """ - Run a command in subprocess and return tuple of (retval, stdout); - optionally return stderr as well as third value. - """ - proc = subprocess.Popen(cmd, shell=shell, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - if background: - msg = "%s START: %s" % (log_get_sec(1), - datetime.now().strftime("%H:%M:%S.%f")) - log("BKG " + proc.args, msg) - return proc - - return cmd_result(proc, include_stderr=include_stderr, fail=fail) - -def cmd_result(proc, include_stderr=False, fail=False): - stdout, stderr = proc.communicate() - stdout = stdout.decode("utf-8") - stderr = stderr.decode("utf-8") - proc.stdout.close() - proc.stderr.close() - - stderr = "\n" + stderr - if stderr[-1] == "\n": - stderr = stderr[:-1] - - sec = log_get_sec(1) - log("CMD " + proc.args, - "RETCODE: %d\n%s STDOUT:\n%s%s STDERR:%s\n%s END: %s" % - (proc.returncode, sec, stdout, sec, stderr, - sec, datetime.now().strftime("%H:%M:%S.%f"))) - - if proc.returncode != 0 and fail: - if len(stderr) > 0 and stderr[-1] == "\n": - stderr = stderr[:-1] - raise Exception("Command failed: %s\n%s" % (proc.args, stderr)) - - if include_stderr: - return proc.returncode, stdout, stderr - else: - return proc.returncode, stdout - -def 
rm(f): - cmd("rm -f %s" % (f)) - if f in files: - files.remove(f) - -def tool(name, args, flags, JSON=True, ns="", fail=True, include_stderr=False): - params = "" - if JSON: - params += "%s " % (flags["json"]) - - if ns != "": - ns = "ip netns exec %s " % (ns) - - if include_stderr: - ret, stdout, stderr = cmd(ns + name + " " + params + args, - fail=fail, include_stderr=True) - else: - ret, stdout = cmd(ns + name + " " + params + args, - fail=fail, include_stderr=False) - - if JSON and len(stdout.strip()) != 0: - out = json.loads(stdout) - else: - out = stdout - - if include_stderr: - return ret, out, stderr - else: - return ret, out - -def bpftool(args, JSON=True, ns="", fail=True, include_stderr=False): - return tool("bpftool", args, {"json":"-p"}, JSON=JSON, ns=ns, - fail=fail, include_stderr=include_stderr) - -def bpftool_prog_list(expected=None, ns="", exclude_orphaned=True): - _, progs = bpftool("prog show", JSON=True, ns=ns, fail=True) - # Remove the base progs - for p in base_progs: - if p in progs: - progs.remove(p) - if exclude_orphaned: - progs = [ p for p in progs if not p['orphaned'] ] - if expected is not None: - if len(progs) != expected: - fail(True, "%d BPF programs loaded, expected %d" % - (len(progs), expected)) - return progs - -def bpftool_map_list(expected=None, ns=""): - _, maps = bpftool("map show", JSON=True, ns=ns, fail=True) - # Remove the base maps - maps = [m for m in maps if m not in base_maps and m.get('name') and m.get('name') not in base_map_names] - if expected is not None: - if len(maps) != expected: - fail(True, "%d BPF maps loaded, expected %d" % - (len(maps), expected)) - return maps - -def bpftool_prog_list_wait(expected=0, n_retry=20): - for i in range(n_retry): - nprogs = len(bpftool_prog_list()) - if nprogs == expected: - return - time.sleep(0.05) - raise Exception("Time out waiting for program counts to stabilize want %d, have %d" % (expected, nprogs)) - -def bpftool_map_list_wait(expected=0, n_retry=20): - for i in 
range(n_retry): - nmaps = len(bpftool_map_list()) - if nmaps == expected: - return - time.sleep(0.05) - raise Exception("Time out waiting for map counts to stabilize want %d, have %d" % (expected, nmaps)) - -def bpftool_prog_load(sample, file_name, maps=[], prog_type="xdp", dev=None, - fail=True, include_stderr=False): - args = "prog load %s %s" % (os.path.join(bpf_test_dir, sample), file_name) - if prog_type is not None: - args += " type " + prog_type - if dev is not None: - args += " dev " + dev - if len(maps): - args += " map " + " map ".join(maps) - - res = bpftool(args, fail=fail, include_stderr=include_stderr) - if res[0] == 0: - files.append(file_name) - return res - -def ip(args, force=False, JSON=True, ns="", fail=True, include_stderr=False): - if force: - args = "-force " + args - return tool("ip", args, {"json":"-j"}, JSON=JSON, ns=ns, - fail=fail, include_stderr=include_stderr) - -def tc(args, JSON=True, ns="", fail=True, include_stderr=False): - return tool("tc", args, {"json":"-p"}, JSON=JSON, ns=ns, - fail=fail, include_stderr=include_stderr) - -def ethtool(dev, opt, args, fail=True): - return cmd("ethtool %s %s %s" % (opt, dev["ifname"], args), fail=fail) - -def bpf_obj(name, sec=".text", path=bpf_test_dir,): - return "obj %s sec %s" % (os.path.join(path, name), sec) - -def bpf_pinned(name): - return "pinned %s" % (name) - -def bpf_bytecode(bytecode): - return "bytecode \"%s\"" % (bytecode) - -def mknetns(n_retry=10): - for i in range(n_retry): - name = ''.join([random.choice(string.ascii_letters) for i in range(8)]) - ret, _ = ip("netns add %s" % (name), fail=False) - if ret == 0: - netns.append(name) - return name - return None - -def int2str(fmt, val): - ret = [] - for b in struct.pack(fmt, val): - ret.append(int(b)) - return " ".join(map(lambda x: str(x), ret)) - -def str2int(strtab): - inttab = [] - for i in strtab: - inttab.append(int(i, 16)) - ba = bytearray(inttab) - if len(strtab) == 4: - fmt = "I" - elif len(strtab) == 8: - fmt = "Q" - 
else: - raise Exception("String array of len %d can't be unpacked to an int" % - (len(strtab))) - return struct.unpack(fmt, ba)[0] - -class DebugfsDir: - """ - Class for accessing DebugFS directories as a dictionary. - """ - - def __init__(self, path): - self.path = path - self._dict = self._debugfs_dir_read(path) - - def __len__(self): - return len(self._dict.keys()) - - def __getitem__(self, key): - if type(key) is int: - key = list(self._dict.keys())[key] - return self._dict[key] - - def __setitem__(self, key, value): - log("DebugFS set %s = %s" % (key, value), "") - log_level_inc() - - cmd("echo '%s' > %s/%s" % (value, self.path, key)) - log_level_dec() - - _, out = cmd('cat %s/%s' % (self.path, key)) - self._dict[key] = out.strip() - - def _debugfs_dir_read(self, path): - dfs = {} - - log("DebugFS state for %s" % (path), "") - log_level_inc(add=2) - - _, out = cmd('ls ' + path) - for f in out.split(): - if f == "ports": - continue - - p = os.path.join(path, f) - if not os.stat(p).st_mode & stat.S_IRUSR: - continue - - if os.path.isfile(p): - # We need to init trap_flow_action_cookie before read it - if f == "trap_flow_action_cookie": - cmd('echo deadbeef > %s/%s' % (path, f)) - _, out = cmd('cat %s/%s' % (path, f)) - dfs[f] = out.strip() - elif os.path.isdir(p): - dfs[f] = DebugfsDir(p) - else: - raise Exception("%s is neither file nor directory" % (p)) - - log_level_dec() - log("DebugFS state", dfs) - log_level_dec() - - return dfs - -class NetdevSimDev: - """ - Class for netdevsim bus device and its attributes. 
- """ - @staticmethod - def ctrl_write(path, val): - fullpath = os.path.join("/sys/bus/netdevsim/", path) - try: - with open(fullpath, "w") as f: - f.write(val) - except OSError as e: - log("WRITE %s: %r" % (fullpath, val), -e.errno) - raise e - log("WRITE %s: %r" % (fullpath, val), 0) - - def __init__(self, port_count=1): - addr = 0 - while True: - try: - self.ctrl_write("new_device", "%u %u" % (addr, port_count)) - except OSError as e: - if e.errno == errno.ENOSPC: - addr += 1 - continue - raise e - break - self.addr = addr - - # As probe of netdevsim device might happen from a workqueue, - # so wait here until all netdevs appear. - self.wait_for_netdevs(port_count) - - ret, out = cmd("udevadm settle", fail=False) - if ret: - raise Exception("udevadm settle failed") - ifnames = self.get_ifnames() - - devs.append(self) - self.dfs_dir = "/sys/kernel/debug/netdevsim/netdevsim%u/" % addr - - self.nsims = [] - for port_index in range(port_count): - self.nsims.append(NetdevSim(self, port_index, ifnames[port_index])) - - def get_ifnames(self): - ifnames = [] - listdir = os.listdir("/sys/bus/netdevsim/devices/netdevsim%u/net/" % self.addr) - for ifname in listdir: - ifnames.append(ifname) - ifnames.sort() - return ifnames - - def wait_for_netdevs(self, port_count): - timeout = 5 - timeout_start = time.time() - - while True: - try: - ifnames = self.get_ifnames() - except FileNotFoundError as e: - ifnames = [] - if len(ifnames) == port_count: - break - if time.time() < timeout_start + timeout: - continue - raise Exception("netdevices did not appear within timeout") - - def dfs_num_bound_progs(self): - path = os.path.join(self.dfs_dir, "bpf_bound_progs") - _, progs = cmd('ls %s' % (path)) - return len(progs.split()) - - def dfs_get_bound_progs(self, expected): - progs = DebugfsDir(os.path.join(self.dfs_dir, "bpf_bound_progs")) - if expected is not None: - if len(progs) != expected: - fail(True, "%d BPF programs bound, expected %d" % - (len(progs), expected)) - return progs 
- - def remove(self): - self.ctrl_write("del_device", "%u" % (self.addr, )) - devs.remove(self) - - def remove_nsim(self, nsim): - self.nsims.remove(nsim) - self.ctrl_write("devices/netdevsim%u/del_port" % (self.addr, ), - "%u" % (nsim.port_index, )) - -class NetdevSim: - """ - Class for netdevsim netdevice and its attributes. - """ - - def __init__(self, nsimdev, port_index, ifname): - # In case udev renamed the netdev to according to new schema, - # check if the name matches the port_index. - nsimnamere = re.compile("eni\d+np(\d+)") - match = nsimnamere.match(ifname) - if match and int(match.groups()[0]) != port_index + 1: - raise Exception("netdevice name mismatches the expected one") - - self.nsimdev = nsimdev - self.port_index = port_index - self.ns = "" - self.dfs_dir = "%s/ports/%u/" % (nsimdev.dfs_dir, port_index) - self.dfs_refresh() - _, [self.dev] = ip("link show dev %s" % ifname) - - def __getitem__(self, key): - return self.dev[key] - - def remove(self): - self.nsimdev.remove_nsim(self) - - def dfs_refresh(self): - self.dfs = DebugfsDir(self.dfs_dir) - return self.dfs - - def dfs_read(self, f): - path = os.path.join(self.dfs_dir, f) - _, data = cmd('cat %s' % (path)) - return data.strip() - - def wait_for_flush(self, bound=0, total=0, n_retry=20): - for i in range(n_retry): - nbound = self.nsimdev.dfs_num_bound_progs() - nprogs = len(bpftool_prog_list()) - if nbound == bound and nprogs == total: - return - time.sleep(0.05) - raise Exception("Time out waiting for program counts to stabilize want %d/%d, have %d bound, %d loaded" % (bound, total, nbound, nprogs)) - - def set_ns(self, ns): - name = "1" if ns == "" else ns - ip("link set dev %s netns %s" % (self.dev["ifname"], name), ns=self.ns) - self.ns = ns - - def set_mtu(self, mtu, fail=True): - return ip("link set dev %s mtu %d" % (self.dev["ifname"], mtu), - fail=fail) - - def set_xdp(self, bpf, mode, force=False, JSON=True, verbose=False, - fail=True, include_stderr=False): - if verbose: - bpf += " 
verbose" - return ip("link set dev %s xdp%s %s" % (self.dev["ifname"], mode, bpf), - force=force, JSON=JSON, - fail=fail, include_stderr=include_stderr) - - def unset_xdp(self, mode, force=False, JSON=True, - fail=True, include_stderr=False): - return ip("link set dev %s xdp%s off" % (self.dev["ifname"], mode), - force=force, JSON=JSON, - fail=fail, include_stderr=include_stderr) - - def ip_link_show(self, xdp): - _, link = ip("link show dev %s" % (self['ifname'])) - if len(link) > 1: - raise Exception("Multiple objects on ip link show") - if len(link) < 1: - return {} - fail(xdp != "xdp" in link, - "XDP program not reporting in iplink (reported %s, expected %s)" % - ("xdp" in link, xdp)) - return link[0] - - def tc_add_ingress(self): - tc("qdisc add dev %s ingress" % (self['ifname'])) - - def tc_del_ingress(self): - tc("qdisc del dev %s ingress" % (self['ifname'])) - - def tc_flush_filters(self, bound=0, total=0): - self.tc_del_ingress() - self.tc_add_ingress() - self.wait_for_flush(bound=bound, total=total) - - def tc_show_ingress(self, expected=None): - # No JSON support, oh well... 
- flags = ["skip_sw", "skip_hw", "in_hw"] - named = ["protocol", "pref", "chain", "handle", "id", "tag"] - - args = "-s filter show dev %s ingress" % (self['ifname']) - _, out = tc(args, JSON=False) - - filters = [] - lines = out.split('\n') - for line in lines: - words = line.split() - if "handle" not in words: - continue - fltr = {} - for flag in flags: - fltr[flag] = flag in words - for name in named: - try: - idx = words.index(name) - fltr[name] = words[idx + 1] - except ValueError: - pass - filters.append(fltr) - - if expected is not None: - fail(len(filters) != expected, - "%d ingress filters loaded, expected %d" % - (len(filters), expected)) - return filters - - def cls_filter_op(self, op, qdisc="ingress", prio=None, handle=None, - chain=None, cls="", params="", - fail=True, include_stderr=False): - spec = "" - if prio is not None: - spec += " prio %d" % (prio) - if handle: - spec += " handle %s" % (handle) - if chain is not None: - spec += " chain %d" % (chain) - - return tc("filter {op} dev {dev} {qdisc} {spec} {cls} {params}"\ - .format(op=op, dev=self['ifname'], qdisc=qdisc, spec=spec, - cls=cls, params=params), - fail=fail, include_stderr=include_stderr) - - def cls_bpf_add_filter(self, bpf, op="add", prio=None, handle=None, - chain=None, da=False, verbose=False, - skip_sw=False, skip_hw=False, - fail=True, include_stderr=False): - cls = "bpf " + bpf - - params = "" - if da: - params += " da" - if verbose: - params += " verbose" - if skip_sw: - params += " skip_sw" - if skip_hw: - params += " skip_hw" - - return self.cls_filter_op(op=op, prio=prio, handle=handle, cls=cls, - chain=chain, params=params, - fail=fail, include_stderr=include_stderr) - - def set_ethtool_tc_offloads(self, enable, fail=True): - args = "hw-tc-offload %s" % ("on" if enable else "off") - return ethtool(self, "-K", args, fail=fail) - -################################################################################ -def clean_up(): - global files, netns, devs - - for dev in devs: - 
dev.remove() - for f in files: - cmd("rm -f %s" % (f)) - for ns in netns: - cmd("ip netns delete %s" % (ns)) - files = [] - netns = [] - -def pin_prog(file_name, idx=0): - progs = bpftool_prog_list(expected=(idx + 1)) - prog = progs[idx] - bpftool("prog pin id %d %s" % (prog["id"], file_name)) - files.append(file_name) - - return file_name, bpf_pinned(file_name) - -def pin_map(file_name, idx=0, expected=1): - maps = bpftool_map_list(expected=expected) - m = maps[idx] - bpftool("map pin id %d %s" % (m["id"], file_name)) - files.append(file_name) - - return file_name, bpf_pinned(file_name) - -def check_dev_info_removed(prog_file=None, map_file=None): - bpftool_prog_list(expected=0) - bpftool_prog_list(expected=1, exclude_orphaned=False) - ret, err = bpftool("prog show pin %s" % (prog_file), fail=False) - fail(ret != 0, "failed to show prog with removed device") - - bpftool_map_list(expected=0) - ret, err = bpftool("map show pin %s" % (map_file), fail=False) - fail(ret == 0, "Showing map with removed device did not fail") - fail(err["error"].find("No such device") == -1, - "Showing map with removed device expected ENODEV, error is %s" % - (err["error"])) - -def check_dev_info(other_ns, ns, prog_file=None, map_file=None, removed=False): - progs = bpftool_prog_list(expected=1, ns=ns) - prog = progs[0] - - fail("dev" not in prog.keys(), "Device parameters not reported") - dev = prog["dev"] - fail("ifindex" not in dev.keys(), "Device parameters not reported") - fail("ns_dev" not in dev.keys(), "Device parameters not reported") - fail("ns_inode" not in dev.keys(), "Device parameters not reported") - - if not other_ns: - fail("ifname" not in dev.keys(), "Ifname not reported") - fail(dev["ifname"] != sim["ifname"], - "Ifname incorrect %s vs %s" % (dev["ifname"], sim["ifname"])) - else: - fail("ifname" in dev.keys(), "Ifname is reported for other ns") - - maps = bpftool_map_list(expected=2, ns=ns) - for m in maps: - fail("dev" not in m.keys(), "Device parameters not 
reported") - fail(dev != m["dev"], "Map's device different than program's") - -def check_extack(output, reference, args): - if skip_extack: - return - lines = output.split("\n") - comp = len(lines) >= 2 and lines[1] == 'Error: ' + reference - fail(not comp, "Missing or incorrect netlink extack message") - -def check_extack_nsim(output, reference, args): - check_extack(output, "netdevsim: " + reference, args) - -def check_no_extack(res, needle): - fail((res[1] + res[2]).count(needle) or (res[1] + res[2]).count("Warning:"), - "Found '%s' in command output, leaky extack?" % (needle)) - -def check_verifier_log(output, reference): - lines = output.split("\n") - for l in reversed(lines): - if l == reference: - return - fail(True, "Missing or incorrect message from netdevsim in verifier log") - -def check_multi_basic(two_xdps): - fail(two_xdps["mode"] != 4, "Bad mode reported with multiple programs") - fail("prog" in two_xdps, "Base program reported in multi program mode") - fail(len(two_xdps["attached"]) != 2, - "Wrong attached program count with two programs") - fail(two_xdps["attached"][0]["prog"]["id"] == - two_xdps["attached"][1]["prog"]["id"], - "Offloaded and other programs have the same id") - -def test_spurios_extack(sim, obj, skip_hw, needle): - res = sim.cls_bpf_add_filter(obj, prio=1, handle=1, skip_hw=skip_hw, - include_stderr=True) - check_no_extack(res, needle) - res = sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1, - skip_hw=skip_hw, include_stderr=True) - check_no_extack(res, needle) - res = sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf", - include_stderr=True) - check_no_extack(res, needle) - -def test_multi_prog(simdev, sim, obj, modename, modeid): - start_test("Test multi-attachment XDP - %s + offload..." 
% - (modename or "default", )) - sim.set_xdp(obj, "offload") - xdp = sim.ip_link_show(xdp=True)["xdp"] - offloaded = sim.dfs_read("bpf_offloaded_id") - fail("prog" not in xdp, "Base program not reported in single program mode") - fail(len(xdp["attached"]) != 1, - "Wrong attached program count with one program") - - sim.set_xdp(obj, modename) - two_xdps = sim.ip_link_show(xdp=True)["xdp"] - - fail(xdp["attached"][0] not in two_xdps["attached"], - "Offload program not reported after other activated") - check_multi_basic(two_xdps) - - offloaded2 = sim.dfs_read("bpf_offloaded_id") - fail(offloaded != offloaded2, - "Offload ID changed after loading other program") - - start_test("Test multi-attachment XDP - replace...") - ret, _, err = sim.set_xdp(obj, "offload", fail=False, include_stderr=True) - fail(ret == 0, "Replaced one of programs without -force") - check_extack(err, "XDP program already attached.", args) - - start_test("Test multi-attachment XDP - remove without mode...") - ret, _, err = sim.unset_xdp("", force=True, - fail=False, include_stderr=True) - fail(ret == 0, "Removed program without a mode flag") - check_extack(err, "More than one program loaded, unset mode is ambiguous.", args) - - sim.unset_xdp("offload") - xdp = sim.ip_link_show(xdp=True)["xdp"] - offloaded = sim.dfs_read("bpf_offloaded_id") - - fail(xdp["mode"] != modeid, "Bad mode reported after multiple programs") - fail("prog" not in xdp, - "Base program not reported after multi program mode") - fail(xdp["attached"][0] not in two_xdps["attached"], - "Offload program not reported after other activated") - fail(len(xdp["attached"]) != 1, - "Wrong attached program count with remaining programs") - fail(offloaded != "0", "Offload ID reported with only other program left") - - start_test("Test multi-attachment XDP - reattach...") - sim.set_xdp(obj, "offload") - two_xdps = sim.ip_link_show(xdp=True)["xdp"] - - fail(xdp["attached"][0] not in two_xdps["attached"], - "Other program not reported after 
offload activated") - check_multi_basic(two_xdps) - - start_test("Test multi-attachment XDP - device remove...") - simdev.remove() - - simdev = NetdevSimDev() - sim, = simdev.nsims - sim.set_ethtool_tc_offloads(True) - return [simdev, sim] - -# Parse command line -parser = argparse.ArgumentParser() -parser.add_argument("--log", help="output verbose log to given file") -args = parser.parse_args() -if args.log: - logfile = open(args.log, 'w+') - logfile.write("# -*-Org-*-") - -log("Prepare...", "", level=1) -log_level_inc() - -# Check permissions -skip(os.getuid() != 0, "test must be run as root") - -# Check tools -ret, progs = bpftool("prog", fail=False) -skip(ret != 0, "bpftool not installed") -base_progs = progs -_, base_maps = bpftool("map") -base_map_names = [ - 'pid_iter.rodata', # created on each bpftool invocation - 'libbpf_det_bind', # created on each bpftool invocation -] - -# Check netdevsim -if not os.path.isdir("/sys/bus/netdevsim/"): - ret, out = cmd("modprobe netdevsim", fail=False) - skip(ret != 0, "netdevsim module could not be loaded") - -# Check debugfs -_, out = cmd("mount") -if out.find("/sys/kernel/debug type debugfs") == -1: - cmd("mount -t debugfs none /sys/kernel/debug") - -# Check samples are compiled -samples = ["sample_ret0.bpf.o", "sample_map_ret0.bpf.o"] -for s in samples: - ret, out = cmd("ls %s/%s" % (bpf_test_dir, s), fail=False) - skip(ret != 0, "sample %s/%s not found, please compile it" % - (bpf_test_dir, s)) - -# Check if iproute2 is built with libmnl (needed by extack support) -_, _, err = cmd("tc qdisc delete dev lo handle 0", - fail=False, include_stderr=True) -if err.find("Error: Failed to find qdisc with specified handle.") == -1: - print("Warning: no extack message in iproute2 output, libmnl missing?") - log("Warning: no extack message in iproute2 output, libmnl missing?", "") - skip_extack = True - -# Check if net namespaces seem to work -ns = mknetns() -skip(ns is None, "Could not create a net namespace") -cmd("ip netns 
delete %s" % (ns)) -netns = [] - -try: - obj = bpf_obj("sample_ret0.bpf.o") - bytecode = bpf_bytecode("1,6 0 0 4294967295,") - - start_test("Test destruction of generic XDP...") - simdev = NetdevSimDev() - sim, = simdev.nsims - sim.set_xdp(obj, "generic") - simdev.remove() - bpftool_prog_list_wait(expected=0) - - simdev = NetdevSimDev() - sim, = simdev.nsims - sim.tc_add_ingress() - - start_test("Test TC non-offloaded...") - ret, _ = sim.cls_bpf_add_filter(obj, skip_hw=True, fail=False) - fail(ret != 0, "Software TC filter did not load") - - start_test("Test TC non-offloaded isn't getting bound...") - ret, _ = sim.cls_bpf_add_filter(obj, fail=False) - fail(ret != 0, "Software TC filter did not load") - simdev.dfs_get_bound_progs(expected=0) - - sim.tc_flush_filters() - - start_test("Test TC offloads are off by default...") - ret, _, err = sim.cls_bpf_add_filter(obj, skip_sw=True, - fail=False, include_stderr=True) - fail(ret == 0, "TC filter loaded without enabling TC offloads") - check_extack(err, "TC offload is disabled on net device.", args) - sim.wait_for_flush() - - sim.set_ethtool_tc_offloads(True) - sim.dfs["bpf_tc_non_bound_accept"] = "Y" - - start_test("Test TC offload by default...") - ret, _ = sim.cls_bpf_add_filter(obj, fail=False) - fail(ret != 0, "Software TC filter did not load") - simdev.dfs_get_bound_progs(expected=0) - ingress = sim.tc_show_ingress(expected=1) - fltr = ingress[0] - fail(not fltr["in_hw"], "Filter not offloaded by default") - - sim.tc_flush_filters() - - start_test("Test TC cBPF bytcode tries offload by default...") - ret, _ = sim.cls_bpf_add_filter(bytecode, fail=False) - fail(ret != 0, "Software TC filter did not load") - simdev.dfs_get_bound_progs(expected=0) - ingress = sim.tc_show_ingress(expected=1) - fltr = ingress[0] - fail(not fltr["in_hw"], "Bytecode not offloaded by default") - - sim.tc_flush_filters() - sim.dfs["bpf_tc_non_bound_accept"] = "N" - - start_test("Test TC cBPF unbound bytecode doesn't offload...") - ret, _, 
err = sim.cls_bpf_add_filter(bytecode, skip_sw=True, - fail=False, include_stderr=True) - fail(ret == 0, "TC bytecode loaded for offload") - check_extack_nsim(err, "netdevsim configured to reject unbound programs.", - args) - sim.wait_for_flush() - - start_test("Test non-0 chain offload...") - ret, _, err = sim.cls_bpf_add_filter(obj, chain=1, prio=1, handle=1, - skip_sw=True, - fail=False, include_stderr=True) - fail(ret == 0, "Offloaded a filter to chain other than 0") - check_extack(err, "Driver supports only offload of chain 0.", args) - sim.tc_flush_filters() - - start_test("Test TC replace...") - sim.cls_bpf_add_filter(obj, prio=1, handle=1) - sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1) - sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf") - - sim.cls_bpf_add_filter(obj, prio=1, handle=1, skip_sw=True) - sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1, skip_sw=True) - sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf") - - sim.cls_bpf_add_filter(obj, prio=1, handle=1, skip_hw=True) - sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1, skip_hw=True) - sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf") - - start_test("Test TC replace bad flags...") - for i in range(3): - for j in range(3): - ret, _ = sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1, - skip_sw=(j == 1), skip_hw=(j == 2), - fail=False) - fail(bool(ret) != bool(j), - "Software TC incorrect load in replace test, iteration %d" % - (j)) - sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf") - - start_test("Test spurious extack from the driver...") - test_spurios_extack(sim, obj, False, "netdevsim") - test_spurios_extack(sim, obj, True, "netdevsim") - - sim.set_ethtool_tc_offloads(False) - - test_spurios_extack(sim, obj, False, "TC offload is disabled") - test_spurios_extack(sim, obj, True, "TC offload is disabled") - - sim.set_ethtool_tc_offloads(True) - - sim.tc_flush_filters() - - start_test("Test TC offloads failure...") 
- sim.dfs["dev/bpf_bind_verifier_accept"] = 0 - ret, _, err = sim.cls_bpf_add_filter(obj, verbose=True, skip_sw=True, - fail=False, include_stderr=True) - fail(ret == 0, "TC filter did not reject with TC offloads enabled") - check_verifier_log(err, "[netdevsim] Hello from netdevsim!") - sim.dfs["dev/bpf_bind_verifier_accept"] = 1 - - start_test("Test TC offloads work...") - ret, _, err = sim.cls_bpf_add_filter(obj, verbose=True, skip_sw=True, - fail=False, include_stderr=True) - fail(ret != 0, "TC filter did not load with TC offloads enabled") - - start_test("Test TC offload basics...") - dfs = simdev.dfs_get_bound_progs(expected=1) - progs = bpftool_prog_list(expected=1) - ingress = sim.tc_show_ingress(expected=1) - - dprog = dfs[0] - prog = progs[0] - fltr = ingress[0] - fail(fltr["skip_hw"], "TC does reports 'skip_hw' on offloaded filter") - fail(not fltr["in_hw"], "TC does not report 'in_hw' for offloaded filter") - fail(not fltr["skip_sw"], "TC does not report 'skip_sw' back") - - start_test("Test TC offload is device-bound...") - fail(str(prog["id"]) != fltr["id"], "Program IDs don't match") - fail(prog["tag"] != fltr["tag"], "Program tags don't match") - fail(fltr["id"] != dprog["id"], "Program IDs don't match") - fail(dprog["state"] != "xlated", "Offloaded program state not translated") - fail(dprog["loaded"] != "Y", "Offloaded program is not loaded") - - start_test("Test disabling TC offloads is rejected while filters installed...") - ret, _ = sim.set_ethtool_tc_offloads(False, fail=False) - fail(ret == 0, "Driver should refuse to disable TC offloads with filters installed...") - sim.set_ethtool_tc_offloads(True) - - start_test("Test qdisc removal frees things...") - sim.tc_flush_filters() - sim.tc_show_ingress(expected=0) - - start_test("Test disabling TC offloads is OK without filters...") - ret, _ = sim.set_ethtool_tc_offloads(False, fail=False) - fail(ret != 0, - "Driver refused to disable TC offloads without filters installed...") - - 
sim.set_ethtool_tc_offloads(True) - - start_test("Test destroying device gets rid of TC filters...") - sim.cls_bpf_add_filter(obj, skip_sw=True) - simdev.remove() - bpftool_prog_list_wait(expected=0) - - simdev = NetdevSimDev() - sim, = simdev.nsims - sim.set_ethtool_tc_offloads(True) - - start_test("Test destroying device gets rid of XDP...") - sim.set_xdp(obj, "offload") - simdev.remove() - bpftool_prog_list_wait(expected=0) - - simdev = NetdevSimDev() - sim, = simdev.nsims - sim.set_ethtool_tc_offloads(True) - - start_test("Test XDP prog reporting...") - sim.set_xdp(obj, "drv") - ipl = sim.ip_link_show(xdp=True) - progs = bpftool_prog_list(expected=1) - fail(ipl["xdp"]["prog"]["id"] != progs[0]["id"], - "Loaded program has wrong ID") - - start_test("Test XDP prog replace without force...") - ret, _ = sim.set_xdp(obj, "drv", fail=False) - fail(ret == 0, "Replaced XDP program without -force") - sim.wait_for_flush(total=1) - - start_test("Test XDP prog replace with force...") - ret, _ = sim.set_xdp(obj, "drv", force=True, fail=False) - fail(ret != 0, "Could not replace XDP program with -force") - bpftool_prog_list_wait(expected=1) - ipl = sim.ip_link_show(xdp=True) - progs = bpftool_prog_list(expected=1) - fail(ipl["xdp"]["prog"]["id"] != progs[0]["id"], - "Loaded program has wrong ID") - fail("dev" in progs[0].keys(), - "Device parameters reported for non-offloaded program") - - start_test("Test XDP prog replace with bad flags...") - ret, _, err = sim.set_xdp(obj, "generic", force=True, - fail=False, include_stderr=True) - fail(ret == 0, "Replaced XDP program with a program in different mode") - check_extack(err, - "Native and generic XDP can't be active at the same time.", - args) - - start_test("Test MTU restrictions...") - ret, _ = sim.set_mtu(9000, fail=False) - fail(ret == 0, - "Driver should refuse to increase MTU to 9000 with XDP loaded...") - sim.unset_xdp("drv") - bpftool_prog_list_wait(expected=0) - sim.set_mtu(9000) - ret, _, err = sim.set_xdp(obj, 
"drv", fail=False, include_stderr=True) - fail(ret == 0, "Driver should refuse to load program with MTU of 9000...") - check_extack_nsim(err, "MTU too large w/ XDP enabled.", args) - sim.set_mtu(1500) - - sim.wait_for_flush() - start_test("Test non-offload XDP attaching to HW...") - bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/nooffload") - nooffload = bpf_pinned("/sys/fs/bpf/nooffload") - ret, _, err = sim.set_xdp(nooffload, "offload", - fail=False, include_stderr=True) - fail(ret == 0, "attached non-offloaded XDP program to HW") - check_extack_nsim(err, "xdpoffload of non-bound program.", args) - rm("/sys/fs/bpf/nooffload") - - start_test("Test offload XDP attaching to drv...") - bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/offload", - dev=sim['ifname']) - offload = bpf_pinned("/sys/fs/bpf/offload") - ret, _, err = sim.set_xdp(offload, "drv", fail=False, include_stderr=True) - fail(ret == 0, "attached offloaded XDP program to drv") - check_extack(err, "Using offloaded program without HW_MODE flag is not supported.", args) - rm("/sys/fs/bpf/offload") - sim.wait_for_flush() - - start_test("Test XDP load failure...") - sim.dfs["dev/bpf_bind_verifier_accept"] = 0 - ret, _, err = bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/offload", - dev=sim['ifname'], fail=False, include_stderr=True) - fail(ret == 0, "verifier should fail on load") - check_verifier_log(err, "[netdevsim] Hello from netdevsim!") - sim.dfs["dev/bpf_bind_verifier_accept"] = 1 - sim.wait_for_flush() - - start_test("Test XDP offload...") - _, _, err = sim.set_xdp(obj, "offload", verbose=True, include_stderr=True) - ipl = sim.ip_link_show(xdp=True) - link_xdp = ipl["xdp"]["prog"] - progs = bpftool_prog_list(expected=1) - prog = progs[0] - fail(link_xdp["id"] != prog["id"], "Loaded program has wrong ID") - - start_test("Test XDP offload is device bound...") - dfs = simdev.dfs_get_bound_progs(expected=1) - dprog = dfs[0] - - fail(prog["id"] != link_xdp["id"], "Program IDs don't match") 
- fail(prog["tag"] != link_xdp["tag"], "Program tags don't match") - fail(str(link_xdp["id"]) != dprog["id"], "Program IDs don't match") - fail(dprog["state"] != "xlated", "Offloaded program state not translated") - fail(dprog["loaded"] != "Y", "Offloaded program is not loaded") - - start_test("Test removing XDP program many times...") - sim.unset_xdp("offload") - sim.unset_xdp("offload") - sim.unset_xdp("drv") - sim.unset_xdp("drv") - sim.unset_xdp("") - sim.unset_xdp("") - bpftool_prog_list_wait(expected=0) - - start_test("Test attempt to use a program for a wrong device...") - simdev2 = NetdevSimDev() - sim2, = simdev2.nsims - sim2.set_xdp(obj, "offload") - pin_file, pinned = pin_prog("/sys/fs/bpf/tmp") - - ret, _, err = sim.set_xdp(pinned, "offload", - fail=False, include_stderr=True) - fail(ret == 0, "Pinned program loaded for a different device accepted") - check_extack(err, "Program bound to different device.", args) - simdev2.remove() - ret, _, err = sim.set_xdp(pinned, "offload", - fail=False, include_stderr=True) - fail(ret == 0, "Pinned program loaded for a removed device accepted") - check_extack(err, "Program bound to different device.", args) - rm(pin_file) - bpftool_prog_list_wait(expected=0) - - simdev, sim = test_multi_prog(simdev, sim, obj, "", 1) - simdev, sim = test_multi_prog(simdev, sim, obj, "drv", 1) - simdev, sim = test_multi_prog(simdev, sim, obj, "generic", 2) - - start_test("Test mixing of TC and XDP...") - sim.tc_add_ingress() - sim.set_xdp(obj, "offload") - ret, _, err = sim.cls_bpf_add_filter(obj, skip_sw=True, - fail=False, include_stderr=True) - fail(ret == 0, "Loading TC when XDP active should fail") - check_extack_nsim(err, "driver and netdev offload states mismatch.", args) - sim.unset_xdp("offload") - sim.wait_for_flush() - - sim.cls_bpf_add_filter(obj, skip_sw=True) - ret, _, err = sim.set_xdp(obj, "offload", fail=False, include_stderr=True) - fail(ret == 0, "Loading XDP when TC active should fail") - check_extack_nsim(err, "TC 
program is already loaded.", args) - - start_test("Test binding TC from pinned...") - pin_file, pinned = pin_prog("/sys/fs/bpf/tmp") - sim.tc_flush_filters(bound=1, total=1) - sim.cls_bpf_add_filter(pinned, da=True, skip_sw=True) - sim.tc_flush_filters(bound=1, total=1) - - start_test("Test binding XDP from pinned...") - sim.set_xdp(obj, "offload") - pin_file, pinned = pin_prog("/sys/fs/bpf/tmp2", idx=1) - - sim.set_xdp(pinned, "offload", force=True) - sim.unset_xdp("offload") - sim.set_xdp(pinned, "offload", force=True) - sim.unset_xdp("offload") - - start_test("Test offload of wrong type fails...") - ret, _ = sim.cls_bpf_add_filter(pinned, da=True, skip_sw=True, fail=False) - fail(ret == 0, "Managed to attach XDP program to TC") - - start_test("Test asking for TC offload of two filters...") - sim.cls_bpf_add_filter(obj, da=True, skip_sw=True) - ret, _, err = sim.cls_bpf_add_filter(obj, da=True, skip_sw=True, - fail=False, include_stderr=True) - fail(ret == 0, "Managed to offload two TC filters at the same time") - check_extack_nsim(err, "driver and netdev offload states mismatch.", args) - - sim.tc_flush_filters(bound=2, total=2) - - start_test("Test if netdev removal waits for translation...") - delay_msec = 500 - sim.dfs["dev/bpf_bind_verifier_delay"] = delay_msec - start = time.time() - cmd_line = "tc filter add dev %s ingress bpf %s da skip_sw" % \ - (sim['ifname'], obj) - tc_proc = cmd(cmd_line, background=True, fail=False) - # Wait for the verifier to start - while simdev.dfs_num_bound_progs() <= 2: - pass - simdev.remove() - end = time.time() - ret, _ = cmd_result(tc_proc, fail=False) - time_diff = end - start - log("Time", "start:\t%s\nend:\t%s\ndiff:\t%s" % (start, end, time_diff)) - - fail(ret == 0, "Managed to load TC filter on a unregistering device") - delay_sec = delay_msec * 0.001 - fail(time_diff < delay_sec, "Removal process took %s, expected %s" % - (time_diff, delay_sec)) - - # Remove all pinned files and reinstantiate the netdev - clean_up() - 
bpftool_prog_list_wait(expected=0) - - simdev = NetdevSimDev() - sim, = simdev.nsims - map_obj = bpf_obj("sample_map_ret0.bpf.o") - start_test("Test loading program with maps...") - sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON - - start_test("Test bpftool bound info reporting (own ns)...") - check_dev_info(False, "") - - start_test("Test bpftool bound info reporting (other ns)...") - ns = mknetns() - sim.set_ns(ns) - check_dev_info(True, "") - - start_test("Test bpftool bound info reporting (remote ns)...") - check_dev_info(False, ns) - - start_test("Test bpftool bound info reporting (back to own ns)...") - sim.set_ns("") - check_dev_info(False, "") - - prog_file, _ = pin_prog("/sys/fs/bpf/tmp_prog") - map_file, _ = pin_map("/sys/fs/bpf/tmp_map", idx=1, expected=2) - simdev.remove() - - start_test("Test bpftool bound info reporting (removed dev)...") - check_dev_info_removed(prog_file=prog_file, map_file=map_file) - - # Remove all pinned files and reinstantiate the netdev - clean_up() - bpftool_prog_list_wait(expected=0) - - simdev = NetdevSimDev() - sim, = simdev.nsims - - start_test("Test map update (no flags)...") - sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON - maps = bpftool_map_list(expected=2) - array = maps[0] if maps[0]["type"] == "array" else maps[1] - htab = maps[0] if maps[0]["type"] == "hash" else maps[1] - for m in maps: - for i in range(2): - bpftool("map update id %d key %s value %s" % - (m["id"], int2str("I", i), int2str("Q", i * 3))) - - for m in maps: - ret, _ = bpftool("map update id %d key %s value %s" % - (m["id"], int2str("I", 3), int2str("Q", 3 * 3)), - fail=False) - fail(ret == 0, "added too many entries") - - start_test("Test map update (exists)...") - for m in maps: - for i in range(2): - bpftool("map update id %d key %s value %s exist" % - (m["id"], int2str("I", i), int2str("Q", i * 3))) - - for m in maps: - ret, err = bpftool("map update id %d key %s value %s exist" % - (m["id"], 
int2str("I", 3), int2str("Q", 3 * 3)), - fail=False) - fail(ret == 0, "updated non-existing key") - fail(err["error"].find("No such file or directory") == -1, - "expected ENOENT, error is '%s'" % (err["error"])) - - start_test("Test map update (noexist)...") - for m in maps: - for i in range(2): - ret, err = bpftool("map update id %d key %s value %s noexist" % - (m["id"], int2str("I", i), int2str("Q", i * 3)), - fail=False) - fail(ret == 0, "updated existing key") - fail(err["error"].find("File exists") == -1, - "expected EEXIST, error is '%s'" % (err["error"])) - - start_test("Test map dump...") - for m in maps: - _, entries = bpftool("map dump id %d" % (m["id"])) - for i in range(2): - key = str2int(entries[i]["key"]) - fail(key != i, "expected key %d, got %d" % (key, i)) - val = str2int(entries[i]["value"]) - fail(val != i * 3, "expected value %d, got %d" % (val, i * 3)) - - start_test("Test map getnext...") - for m in maps: - _, entry = bpftool("map getnext id %d" % (m["id"])) - key = str2int(entry["next_key"]) - fail(key != 0, "next key %d, expected %d" % (key, 0)) - _, entry = bpftool("map getnext id %d key %s" % - (m["id"], int2str("I", 0))) - key = str2int(entry["next_key"]) - fail(key != 1, "next key %d, expected %d" % (key, 1)) - ret, err = bpftool("map getnext id %d key %s" % - (m["id"], int2str("I", 1)), fail=False) - fail(ret == 0, "got next key past the end of map") - fail(err["error"].find("No such file or directory") == -1, - "expected ENOENT, error is '%s'" % (err["error"])) - - start_test("Test map delete (htab)...") - for i in range(2): - bpftool("map delete id %d key %s" % (htab["id"], int2str("I", i))) - - start_test("Test map delete (array)...") - for i in range(2): - ret, err = bpftool("map delete id %d key %s" % - (htab["id"], int2str("I", i)), fail=False) - fail(ret == 0, "removed entry from an array") - fail(err["error"].find("No such file or directory") == -1, - "expected ENOENT, error is '%s'" % (err["error"])) - - start_test("Test map 
remove...") - sim.unset_xdp("offload") - bpftool_map_list_wait(expected=0) - simdev.remove() - - simdev = NetdevSimDev() - sim, = simdev.nsims - sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON - simdev.remove() - bpftool_map_list_wait(expected=0) - - start_test("Test map creation fail path...") - simdev = NetdevSimDev() - sim, = simdev.nsims - sim.dfs["bpf_map_accept"] = "N" - ret, _ = sim.set_xdp(map_obj, "offload", JSON=False, fail=False) - fail(ret == 0, - "netdevsim didn't refuse to create a map with offload disabled") - - simdev.remove() - - start_test("Test multi-dev ASIC program reuse...") - simdevA = NetdevSimDev() - simA, = simdevA.nsims - simdevB = NetdevSimDev(3) - simB1, simB2, simB3 = simdevB.nsims - sims = (simA, simB1, simB2, simB3) - simB = (simB1, simB2, simB3) - - bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimA", - dev=simA['ifname']) - progA = bpf_pinned("/sys/fs/bpf/nsimA") - bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimB", - dev=simB1['ifname']) - progB = bpf_pinned("/sys/fs/bpf/nsimB") - - simA.set_xdp(progA, "offload", JSON=False) - for d in simdevB.nsims: - d.set_xdp(progB, "offload", JSON=False) - - start_test("Test multi-dev ASIC cross-dev replace...") - ret, _ = simA.set_xdp(progB, "offload", force=True, JSON=False, fail=False) - fail(ret == 0, "cross-ASIC program allowed") - for d in simdevB.nsims: - ret, _ = d.set_xdp(progA, "offload", force=True, JSON=False, fail=False) - fail(ret == 0, "cross-ASIC program allowed") - - start_test("Test multi-dev ASIC cross-dev install...") - for d in sims: - d.unset_xdp("offload") - - ret, _, err = simA.set_xdp(progB, "offload", force=True, JSON=False, - fail=False, include_stderr=True) - fail(ret == 0, "cross-ASIC program allowed") - check_extack(err, "Program bound to different device.", args) - for d in simdevB.nsims: - ret, _, err = d.set_xdp(progA, "offload", force=True, JSON=False, - fail=False, include_stderr=True) - fail(ret == 0, 
"cross-ASIC program allowed") - check_extack(err, "Program bound to different device.", args) - - start_test("Test multi-dev ASIC cross-dev map reuse...") - - mapA = bpftool("prog show %s" % (progA))[1]["map_ids"][0] - mapB = bpftool("prog show %s" % (progB))[1]["map_ids"][0] - - ret, _ = bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimB_", - dev=simB3['ifname'], - maps=["idx 0 id %d" % (mapB)], - fail=False) - fail(ret != 0, "couldn't reuse a map on the same ASIC") - rm("/sys/fs/bpf/nsimB_") - - ret, _, err = bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimA_", - dev=simA['ifname'], - maps=["idx 0 id %d" % (mapB)], - fail=False, include_stderr=True) - fail(ret == 0, "could reuse a map on a different ASIC") - fail(err.count("offload device mismatch between prog and map") == 0, - "error message missing for cross-ASIC map") - - ret, _, err = bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimB_", - dev=simB1['ifname'], - maps=["idx 0 id %d" % (mapA)], - fail=False, include_stderr=True) - fail(ret == 0, "could reuse a map on a different ASIC") - fail(err.count("offload device mismatch between prog and map") == 0, - "error message missing for cross-ASIC map") - - start_test("Test multi-dev ASIC cross-dev destruction...") - bpftool_prog_list_wait(expected=2) - - simdevA.remove() - bpftool_prog_list_wait(expected=1) - - ifnameB = bpftool("prog show %s" % (progB))[1]["dev"]["ifname"] - fail(ifnameB != simB1['ifname'], "program not bound to original device") - simB1.remove() - bpftool_prog_list_wait(expected=1) - - start_test("Test multi-dev ASIC cross-dev destruction - move...") - ifnameB = bpftool("prog show %s" % (progB))[1]["dev"]["ifname"] - fail(ifnameB not in (simB2['ifname'], simB3['ifname']), - "program not bound to remaining devices") - - simB2.remove() - ifnameB = bpftool("prog show %s" % (progB))[1]["dev"]["ifname"] - fail(ifnameB != simB3['ifname'], "program not bound to remaining device") - - simB3.remove() - simdevB.remove() 
- bpftool_prog_list_wait(expected=0) - - start_test("Test multi-dev ASIC cross-dev destruction - orphaned...") - ret, out = bpftool("prog show %s" % (progB), fail=False) - fail(ret != 0, "couldn't get information about orphaned program") - - print("%s: OK" % (os.path.basename(__file__))) - -finally: - log("Clean up...", "", level=1) - log_level_inc() - clean_up() diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c deleted file mode 100644 index 80c42583f5..0000000000 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ /dev/null @@ -1,1434 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (c) 2018 Facebook - -#define _GNU_SOURCE - -#include -#include -#include - -#include -#include -#include -#include -#include - -#include - -#include -#include - -#include "cgroup_helpers.h" -#include "testing_helpers.h" -#include "bpf_util.h" - -#ifndef ENOTSUPP -# define ENOTSUPP 524 -#endif - -#define CG_PATH "/foo" -#define CONNECT4_PROG_PATH "./connect4_prog.bpf.o" -#define CONNECT6_PROG_PATH "./connect6_prog.bpf.o" -#define SENDMSG4_PROG_PATH "./sendmsg4_prog.bpf.o" -#define SENDMSG6_PROG_PATH "./sendmsg6_prog.bpf.o" -#define RECVMSG4_PROG_PATH "./recvmsg4_prog.bpf.o" -#define RECVMSG6_PROG_PATH "./recvmsg6_prog.bpf.o" -#define BIND4_PROG_PATH "./bind4_prog.bpf.o" -#define BIND6_PROG_PATH "./bind6_prog.bpf.o" - -#define SERV4_IP "192.168.1.254" -#define SERV4_REWRITE_IP "127.0.0.1" -#define SRC4_IP "172.16.0.1" -#define SRC4_REWRITE_IP "127.0.0.4" -#define SERV4_PORT 4040 -#define SERV4_REWRITE_PORT 4444 - -#define SERV6_IP "face:b00c:1234:5678::abcd" -#define SERV6_REWRITE_IP "::1" -#define SERV6_V4MAPPED_IP "::ffff:192.168.0.4" -#define SRC6_IP "::1" -#define SRC6_REWRITE_IP "::6" -#define WILDCARD6_IP "::" -#define SERV6_PORT 6060 -#define SERV6_REWRITE_PORT 6666 - -#define INET_NTOP_BUF 40 - -struct sock_addr_test; - -typedef int (*load_fn)(const struct sock_addr_test *test); -typedef int (*info_fn)(int, 
struct sockaddr *, socklen_t *); - -char bpf_log_buf[BPF_LOG_BUF_SIZE]; - -struct sock_addr_test { - const char *descr; - /* BPF prog properties */ - load_fn loadfn; - enum bpf_attach_type expected_attach_type; - enum bpf_attach_type attach_type; - /* Socket properties */ - int domain; - int type; - /* IP:port pairs for BPF prog to override */ - const char *requested_ip; - unsigned short requested_port; - const char *expected_ip; - unsigned short expected_port; - const char *expected_src_ip; - /* Expected test result */ - enum { - LOAD_REJECT, - ATTACH_REJECT, - ATTACH_OKAY, - SYSCALL_EPERM, - SYSCALL_ENOTSUPP, - SUCCESS, - } expected_result; -}; - -static int bind4_prog_load(const struct sock_addr_test *test); -static int bind6_prog_load(const struct sock_addr_test *test); -static int connect4_prog_load(const struct sock_addr_test *test); -static int connect6_prog_load(const struct sock_addr_test *test); -static int sendmsg_allow_prog_load(const struct sock_addr_test *test); -static int sendmsg_deny_prog_load(const struct sock_addr_test *test); -static int recvmsg_allow_prog_load(const struct sock_addr_test *test); -static int recvmsg_deny_prog_load(const struct sock_addr_test *test); -static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test); -static int recvmsg4_rw_c_prog_load(const struct sock_addr_test *test); -static int sendmsg4_rw_c_prog_load(const struct sock_addr_test *test); -static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test); -static int recvmsg6_rw_c_prog_load(const struct sock_addr_test *test); -static int sendmsg6_rw_c_prog_load(const struct sock_addr_test *test); -static int sendmsg6_rw_v4mapped_prog_load(const struct sock_addr_test *test); -static int sendmsg6_rw_wildcard_prog_load(const struct sock_addr_test *test); - -static struct sock_addr_test tests[] = { - /* bind */ - { - "bind4: load prog with wrong expected attach type", - bind4_prog_load, - BPF_CGROUP_INET6_BIND, - BPF_CGROUP_INET4_BIND, - AF_INET, - 
SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, - { - "bind4: attach prog with wrong attach type", - bind4_prog_load, - BPF_CGROUP_INET4_BIND, - BPF_CGROUP_INET6_BIND, - AF_INET, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_REJECT, - }, - { - "bind4: rewrite IP & TCP port in", - bind4_prog_load, - BPF_CGROUP_INET4_BIND, - BPF_CGROUP_INET4_BIND, - AF_INET, - SOCK_STREAM, - SERV4_IP, - SERV4_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - NULL, - SUCCESS, - }, - { - "bind4: rewrite IP & UDP port in", - bind4_prog_load, - BPF_CGROUP_INET4_BIND, - BPF_CGROUP_INET4_BIND, - AF_INET, - SOCK_DGRAM, - SERV4_IP, - SERV4_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - NULL, - SUCCESS, - }, - { - "bind6: load prog with wrong expected attach type", - bind6_prog_load, - BPF_CGROUP_INET4_BIND, - BPF_CGROUP_INET6_BIND, - AF_INET6, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, - { - "bind6: attach prog with wrong attach type", - bind6_prog_load, - BPF_CGROUP_INET6_BIND, - BPF_CGROUP_INET4_BIND, - AF_INET, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_REJECT, - }, - { - "bind6: rewrite IP & TCP port in", - bind6_prog_load, - BPF_CGROUP_INET6_BIND, - BPF_CGROUP_INET6_BIND, - AF_INET6, - SOCK_STREAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - NULL, - SUCCESS, - }, - { - "bind6: rewrite IP & UDP port in", - bind6_prog_load, - BPF_CGROUP_INET6_BIND, - BPF_CGROUP_INET6_BIND, - AF_INET6, - SOCK_DGRAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - NULL, - SUCCESS, - }, - - /* connect */ - { - "connect4: load prog with wrong expected attach type", - connect4_prog_load, - BPF_CGROUP_INET6_CONNECT, - BPF_CGROUP_INET4_CONNECT, - AF_INET, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, - { - "connect4: attach prog with wrong attach type", - connect4_prog_load, - BPF_CGROUP_INET4_CONNECT, - BPF_CGROUP_INET6_CONNECT, - AF_INET, - SOCK_STREAM, - NULL, - 0, 
- NULL, - 0, - NULL, - ATTACH_REJECT, - }, - { - "connect4: rewrite IP & TCP port", - connect4_prog_load, - BPF_CGROUP_INET4_CONNECT, - BPF_CGROUP_INET4_CONNECT, - AF_INET, - SOCK_STREAM, - SERV4_IP, - SERV4_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - SRC4_REWRITE_IP, - SUCCESS, - }, - { - "connect4: rewrite IP & UDP port", - connect4_prog_load, - BPF_CGROUP_INET4_CONNECT, - BPF_CGROUP_INET4_CONNECT, - AF_INET, - SOCK_DGRAM, - SERV4_IP, - SERV4_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - SRC4_REWRITE_IP, - SUCCESS, - }, - { - "connect6: load prog with wrong expected attach type", - connect6_prog_load, - BPF_CGROUP_INET4_CONNECT, - BPF_CGROUP_INET6_CONNECT, - AF_INET6, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, - { - "connect6: attach prog with wrong attach type", - connect6_prog_load, - BPF_CGROUP_INET6_CONNECT, - BPF_CGROUP_INET4_CONNECT, - AF_INET, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_REJECT, - }, - { - "connect6: rewrite IP & TCP port", - connect6_prog_load, - BPF_CGROUP_INET6_CONNECT, - BPF_CGROUP_INET6_CONNECT, - AF_INET6, - SOCK_STREAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SRC6_REWRITE_IP, - SUCCESS, - }, - { - "connect6: rewrite IP & UDP port", - connect6_prog_load, - BPF_CGROUP_INET6_CONNECT, - BPF_CGROUP_INET6_CONNECT, - AF_INET6, - SOCK_DGRAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SRC6_REWRITE_IP, - SUCCESS, - }, - - /* sendmsg */ - { - "sendmsg4: load prog with wrong expected attach type", - sendmsg4_rw_asm_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP4_SENDMSG, - AF_INET, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, - { - "sendmsg4: attach prog with wrong attach type", - sendmsg4_rw_asm_prog_load, - BPF_CGROUP_UDP4_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_REJECT, - }, - { - "sendmsg4: rewrite IP & port (asm)", - sendmsg4_rw_asm_prog_load, 
- BPF_CGROUP_UDP4_SENDMSG, - BPF_CGROUP_UDP4_SENDMSG, - AF_INET, - SOCK_DGRAM, - SERV4_IP, - SERV4_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - SRC4_REWRITE_IP, - SUCCESS, - }, - { - "sendmsg4: rewrite IP & port (C)", - sendmsg4_rw_c_prog_load, - BPF_CGROUP_UDP4_SENDMSG, - BPF_CGROUP_UDP4_SENDMSG, - AF_INET, - SOCK_DGRAM, - SERV4_IP, - SERV4_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - SRC4_REWRITE_IP, - SUCCESS, - }, - { - "sendmsg4: deny call", - sendmsg_deny_prog_load, - BPF_CGROUP_UDP4_SENDMSG, - BPF_CGROUP_UDP4_SENDMSG, - AF_INET, - SOCK_DGRAM, - SERV4_IP, - SERV4_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - SRC4_REWRITE_IP, - SYSCALL_EPERM, - }, - { - "sendmsg6: load prog with wrong expected attach type", - sendmsg6_rw_asm_prog_load, - BPF_CGROUP_UDP4_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET6, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, - { - "sendmsg6: attach prog with wrong attach type", - sendmsg6_rw_asm_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP4_SENDMSG, - AF_INET6, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_REJECT, - }, - { - "sendmsg6: rewrite IP & port (asm)", - sendmsg6_rw_asm_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET6, - SOCK_DGRAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SRC6_REWRITE_IP, - SUCCESS, - }, - { - "sendmsg6: rewrite IP & port (C)", - sendmsg6_rw_c_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET6, - SOCK_DGRAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SRC6_REWRITE_IP, - SUCCESS, - }, - { - "sendmsg6: IPv4-mapped IPv6", - sendmsg6_rw_v4mapped_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET6, - SOCK_DGRAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SRC6_REWRITE_IP, - SYSCALL_ENOTSUPP, - }, - { - "sendmsg6: set dst IP = [::] (BSD'ism)", - sendmsg6_rw_wildcard_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - 
BPF_CGROUP_UDP6_SENDMSG, - AF_INET6, - SOCK_DGRAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SRC6_REWRITE_IP, - SUCCESS, - }, - { - "sendmsg6: preserve dst IP = [::] (BSD'ism)", - sendmsg_allow_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET6, - SOCK_DGRAM, - WILDCARD6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_PORT, - SRC6_IP, - SUCCESS, - }, - { - "sendmsg6: deny call", - sendmsg_deny_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET6, - SOCK_DGRAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SRC6_REWRITE_IP, - SYSCALL_EPERM, - }, - - /* recvmsg */ - { - "recvmsg4: return code ok", - recvmsg_allow_prog_load, - BPF_CGROUP_UDP4_RECVMSG, - BPF_CGROUP_UDP4_RECVMSG, - AF_INET, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_OKAY, - }, - { - "recvmsg4: return code !ok", - recvmsg_deny_prog_load, - BPF_CGROUP_UDP4_RECVMSG, - BPF_CGROUP_UDP4_RECVMSG, - AF_INET, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, - { - "recvmsg6: return code ok", - recvmsg_allow_prog_load, - BPF_CGROUP_UDP6_RECVMSG, - BPF_CGROUP_UDP6_RECVMSG, - AF_INET6, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_OKAY, - }, - { - "recvmsg6: return code !ok", - recvmsg_deny_prog_load, - BPF_CGROUP_UDP6_RECVMSG, - BPF_CGROUP_UDP6_RECVMSG, - AF_INET6, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, - { - "recvmsg4: rewrite IP & port (C)", - recvmsg4_rw_c_prog_load, - BPF_CGROUP_UDP4_RECVMSG, - BPF_CGROUP_UDP4_RECVMSG, - AF_INET, - SOCK_DGRAM, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - SERV4_IP, - SUCCESS, - }, - { - "recvmsg6: rewrite IP & port (C)", - recvmsg6_rw_c_prog_load, - BPF_CGROUP_UDP6_RECVMSG, - BPF_CGROUP_UDP6_RECVMSG, - AF_INET6, - SOCK_DGRAM, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SERV6_IP, - SUCCESS, - }, -}; - -static int mk_sockaddr(int 
domain, const char *ip, unsigned short port, - struct sockaddr *addr, socklen_t addr_len) -{ - struct sockaddr_in6 *addr6; - struct sockaddr_in *addr4; - - if (domain != AF_INET && domain != AF_INET6) { - log_err("Unsupported address family"); - return -1; - } - - memset(addr, 0, addr_len); - - if (domain == AF_INET) { - if (addr_len < sizeof(struct sockaddr_in)) - return -1; - addr4 = (struct sockaddr_in *)addr; - addr4->sin_family = domain; - addr4->sin_port = htons(port); - if (inet_pton(domain, ip, (void *)&addr4->sin_addr) != 1) { - log_err("Invalid IPv4: %s", ip); - return -1; - } - } else if (domain == AF_INET6) { - if (addr_len < sizeof(struct sockaddr_in6)) - return -1; - addr6 = (struct sockaddr_in6 *)addr; - addr6->sin6_family = domain; - addr6->sin6_port = htons(port); - if (inet_pton(domain, ip, (void *)&addr6->sin6_addr) != 1) { - log_err("Invalid IPv6: %s", ip); - return -1; - } - } - - return 0; -} - -static int load_insns(const struct sock_addr_test *test, - const struct bpf_insn *insns, size_t insns_cnt) -{ - LIBBPF_OPTS(bpf_prog_load_opts, opts); - int ret; - - opts.expected_attach_type = test->expected_attach_type; - opts.log_buf = bpf_log_buf; - opts.log_size = BPF_LOG_BUF_SIZE; - - ret = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, NULL, "GPL", insns, insns_cnt, &opts); - if (ret < 0 && test->expected_result != LOAD_REJECT) { - log_err(">>> Loading program error.\n" - ">>> Verifier output:\n%s\n-------\n", bpf_log_buf); - } - - return ret; -} - -static int load_path(const struct sock_addr_test *test, const char *path) -{ - struct bpf_object *obj; - struct bpf_program *prog; - int err; - - obj = bpf_object__open_file(path, NULL); - err = libbpf_get_error(obj); - if (err) { - log_err(">>> Opening BPF object (%s) error.\n", path); - return -1; - } - - prog = bpf_object__next_program(obj, NULL); - if (!prog) - goto err_out; - - bpf_program__set_type(prog, BPF_PROG_TYPE_CGROUP_SOCK_ADDR); - bpf_program__set_expected_attach_type(prog, 
test->expected_attach_type); - bpf_program__set_flags(prog, testing_prog_flags()); - - err = bpf_object__load(obj); - if (err) { - if (test->expected_result != LOAD_REJECT) - log_err(">>> Loading program (%s) error.\n", path); - goto err_out; - } - - return bpf_program__fd(prog); -err_out: - bpf_object__close(obj); - return -1; -} - -static int bind4_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, BIND4_PROG_PATH); -} - -static int bind6_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, BIND6_PROG_PATH); -} - -static int connect4_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, CONNECT4_PROG_PATH); -} - -static int connect6_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, CONNECT6_PROG_PATH); -} - -static int xmsg_ret_only_prog_load(const struct sock_addr_test *test, - int32_t rc) -{ - struct bpf_insn insns[] = { - /* return rc */ - BPF_MOV64_IMM(BPF_REG_0, rc), - BPF_EXIT_INSN(), - }; - return load_insns(test, insns, ARRAY_SIZE(insns)); -} - -static int sendmsg_allow_prog_load(const struct sock_addr_test *test) -{ - return xmsg_ret_only_prog_load(test, /*rc*/ 1); -} - -static int sendmsg_deny_prog_load(const struct sock_addr_test *test) -{ - return xmsg_ret_only_prog_load(test, /*rc*/ 0); -} - -static int recvmsg_allow_prog_load(const struct sock_addr_test *test) -{ - return xmsg_ret_only_prog_load(test, /*rc*/ 1); -} - -static int recvmsg_deny_prog_load(const struct sock_addr_test *test) -{ - return xmsg_ret_only_prog_load(test, /*rc*/ 0); -} - -static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test) -{ - struct sockaddr_in dst4_rw_addr; - struct in_addr src4_rw_ip; - - if (inet_pton(AF_INET, SRC4_REWRITE_IP, (void *)&src4_rw_ip) != 1) { - log_err("Invalid IPv4: %s", SRC4_REWRITE_IP); - return -1; - } - - if (mk_sockaddr(AF_INET, SERV4_REWRITE_IP, SERV4_REWRITE_PORT, - (struct sockaddr *)&dst4_rw_addr, - sizeof(dst4_rw_addr)) == -1) - return -1; - - 
struct bpf_insn insns[] = { - BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), - - /* if (sk.family == AF_INET && */ - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, family)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET, 8), - - /* sk.type == SOCK_DGRAM) { */ - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, type)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_DGRAM, 6), - - /* msg_src_ip4 = src4_rw_ip */ - BPF_MOV32_IMM(BPF_REG_7, src4_rw_ip.s_addr), - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, - offsetof(struct bpf_sock_addr, msg_src_ip4)), - - /* user_ip4 = dst4_rw_addr.sin_addr */ - BPF_MOV32_IMM(BPF_REG_7, dst4_rw_addr.sin_addr.s_addr), - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, - offsetof(struct bpf_sock_addr, user_ip4)), - - /* user_port = dst4_rw_addr.sin_port */ - BPF_MOV32_IMM(BPF_REG_7, dst4_rw_addr.sin_port), - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, - offsetof(struct bpf_sock_addr, user_port)), - /* } */ - - /* return 1 */ - BPF_MOV64_IMM(BPF_REG_0, 1), - BPF_EXIT_INSN(), - }; - - return load_insns(test, insns, ARRAY_SIZE(insns)); -} - -static int recvmsg4_rw_c_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, RECVMSG4_PROG_PATH); -} - -static int sendmsg4_rw_c_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, SENDMSG4_PROG_PATH); -} - -static int sendmsg6_rw_dst_asm_prog_load(const struct sock_addr_test *test, - const char *rw_dst_ip) -{ - struct sockaddr_in6 dst6_rw_addr; - struct in6_addr src6_rw_ip; - - if (inet_pton(AF_INET6, SRC6_REWRITE_IP, (void *)&src6_rw_ip) != 1) { - log_err("Invalid IPv6: %s", SRC6_REWRITE_IP); - return -1; - } - - if (mk_sockaddr(AF_INET6, rw_dst_ip, SERV6_REWRITE_PORT, - (struct sockaddr *)&dst6_rw_addr, - sizeof(dst6_rw_addr)) == -1) - return -1; - - struct bpf_insn insns[] = { - BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), - - /* if (sk.family == AF_INET6) { */ - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, family)), - 
BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET6, 18), - -#define STORE_IPV6_WORD_N(DST, SRC, N) \ - BPF_MOV32_IMM(BPF_REG_7, SRC[N]), \ - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, \ - offsetof(struct bpf_sock_addr, DST[N])) - -#define STORE_IPV6(DST, SRC) \ - STORE_IPV6_WORD_N(DST, SRC, 0), \ - STORE_IPV6_WORD_N(DST, SRC, 1), \ - STORE_IPV6_WORD_N(DST, SRC, 2), \ - STORE_IPV6_WORD_N(DST, SRC, 3) - - STORE_IPV6(msg_src_ip6, src6_rw_ip.s6_addr32), - STORE_IPV6(user_ip6, dst6_rw_addr.sin6_addr.s6_addr32), - - /* user_port = dst6_rw_addr.sin6_port */ - BPF_MOV32_IMM(BPF_REG_7, dst6_rw_addr.sin6_port), - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, - offsetof(struct bpf_sock_addr, user_port)), - - /* } */ - - /* return 1 */ - BPF_MOV64_IMM(BPF_REG_0, 1), - BPF_EXIT_INSN(), - }; - - return load_insns(test, insns, ARRAY_SIZE(insns)); -} - -static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test) -{ - return sendmsg6_rw_dst_asm_prog_load(test, SERV6_REWRITE_IP); -} - -static int recvmsg6_rw_c_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, RECVMSG6_PROG_PATH); -} - -static int sendmsg6_rw_v4mapped_prog_load(const struct sock_addr_test *test) -{ - return sendmsg6_rw_dst_asm_prog_load(test, SERV6_V4MAPPED_IP); -} - -static int sendmsg6_rw_wildcard_prog_load(const struct sock_addr_test *test) -{ - return sendmsg6_rw_dst_asm_prog_load(test, WILDCARD6_IP); -} - -static int sendmsg6_rw_c_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, SENDMSG6_PROG_PATH); -} - -static int cmp_addr(const struct sockaddr_storage *addr1, - const struct sockaddr_storage *addr2, int cmp_port) -{ - const struct sockaddr_in *four1, *four2; - const struct sockaddr_in6 *six1, *six2; - - if (addr1->ss_family != addr2->ss_family) - return -1; - - if (addr1->ss_family == AF_INET) { - four1 = (const struct sockaddr_in *)addr1; - four2 = (const struct sockaddr_in *)addr2; - return !((four1->sin_port == four2->sin_port || !cmp_port) && - 
four1->sin_addr.s_addr == four2->sin_addr.s_addr); - } else if (addr1->ss_family == AF_INET6) { - six1 = (const struct sockaddr_in6 *)addr1; - six2 = (const struct sockaddr_in6 *)addr2; - return !((six1->sin6_port == six2->sin6_port || !cmp_port) && - !memcmp(&six1->sin6_addr, &six2->sin6_addr, - sizeof(struct in6_addr))); - } - - return -1; -} - -static int cmp_sock_addr(info_fn fn, int sock1, - const struct sockaddr_storage *addr2, int cmp_port) -{ - struct sockaddr_storage addr1; - socklen_t len1 = sizeof(addr1); - - memset(&addr1, 0, len1); - if (fn(sock1, (struct sockaddr *)&addr1, (socklen_t *)&len1) != 0) - return -1; - - return cmp_addr(&addr1, addr2, cmp_port); -} - -static int cmp_local_ip(int sock1, const struct sockaddr_storage *addr2) -{ - return cmp_sock_addr(getsockname, sock1, addr2, /*cmp_port*/ 0); -} - -static int cmp_local_addr(int sock1, const struct sockaddr_storage *addr2) -{ - return cmp_sock_addr(getsockname, sock1, addr2, /*cmp_port*/ 1); -} - -static int cmp_peer_addr(int sock1, const struct sockaddr_storage *addr2) -{ - return cmp_sock_addr(getpeername, sock1, addr2, /*cmp_port*/ 1); -} - -static int start_server(int type, const struct sockaddr_storage *addr, - socklen_t addr_len) -{ - int fd; - - fd = socket(addr->ss_family, type, 0); - if (fd == -1) { - log_err("Failed to create server socket"); - goto out; - } - - if (bind(fd, (const struct sockaddr *)addr, addr_len) == -1) { - log_err("Failed to bind server socket"); - goto close_out; - } - - if (type == SOCK_STREAM) { - if (listen(fd, 128) == -1) { - log_err("Failed to listen on server socket"); - goto close_out; - } - } - - goto out; -close_out: - close(fd); - fd = -1; -out: - return fd; -} - -static int connect_to_server(int type, const struct sockaddr_storage *addr, - socklen_t addr_len) -{ - int domain; - int fd = -1; - - domain = addr->ss_family; - - if (domain != AF_INET && domain != AF_INET6) { - log_err("Unsupported address family"); - goto err; - } - - fd = socket(domain, 
type, 0); - if (fd == -1) { - log_err("Failed to create client socket"); - goto err; - } - - if (connect(fd, (const struct sockaddr *)addr, addr_len) == -1) { - log_err("Fail to connect to server"); - goto err; - } - - goto out; -err: - close(fd); - fd = -1; -out: - return fd; -} - -int init_pktinfo(int domain, struct cmsghdr *cmsg) -{ - struct in6_pktinfo *pktinfo6; - struct in_pktinfo *pktinfo4; - - if (domain == AF_INET) { - cmsg->cmsg_level = SOL_IP; - cmsg->cmsg_type = IP_PKTINFO; - cmsg->cmsg_len = CMSG_LEN(sizeof(struct in_pktinfo)); - pktinfo4 = (struct in_pktinfo *)CMSG_DATA(cmsg); - memset(pktinfo4, 0, sizeof(struct in_pktinfo)); - if (inet_pton(domain, SRC4_IP, - (void *)&pktinfo4->ipi_spec_dst) != 1) - return -1; - } else if (domain == AF_INET6) { - cmsg->cmsg_level = SOL_IPV6; - cmsg->cmsg_type = IPV6_PKTINFO; - cmsg->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo)); - pktinfo6 = (struct in6_pktinfo *)CMSG_DATA(cmsg); - memset(pktinfo6, 0, sizeof(struct in6_pktinfo)); - if (inet_pton(domain, SRC6_IP, - (void *)&pktinfo6->ipi6_addr) != 1) - return -1; - } else { - return -1; - } - - return 0; -} - -static int sendmsg_to_server(int type, const struct sockaddr_storage *addr, - socklen_t addr_len, int set_cmsg, int flags, - int *syscall_err) -{ - union { - char buf[CMSG_SPACE(sizeof(struct in6_pktinfo))]; - struct cmsghdr align; - } control6; - union { - char buf[CMSG_SPACE(sizeof(struct in_pktinfo))]; - struct cmsghdr align; - } control4; - struct msghdr hdr; - struct iovec iov; - char data = 'a'; - int domain; - int fd = -1; - - domain = addr->ss_family; - - if (domain != AF_INET && domain != AF_INET6) { - log_err("Unsupported address family"); - goto err; - } - - fd = socket(domain, type, 0); - if (fd == -1) { - log_err("Failed to create client socket"); - goto err; - } - - memset(&iov, 0, sizeof(iov)); - iov.iov_base = &data; - iov.iov_len = sizeof(data); - - memset(&hdr, 0, sizeof(hdr)); - hdr.msg_name = (void *)addr; - hdr.msg_namelen = addr_len; - 
hdr.msg_iov = &iov; - hdr.msg_iovlen = 1; - - if (set_cmsg) { - if (domain == AF_INET) { - hdr.msg_control = &control4; - hdr.msg_controllen = sizeof(control4.buf); - } else if (domain == AF_INET6) { - hdr.msg_control = &control6; - hdr.msg_controllen = sizeof(control6.buf); - } - if (init_pktinfo(domain, CMSG_FIRSTHDR(&hdr))) { - log_err("Fail to init pktinfo"); - goto err; - } - } - - if (sendmsg(fd, &hdr, flags) != sizeof(data)) { - log_err("Fail to send message to server"); - *syscall_err = errno; - goto err; - } - - goto out; -err: - close(fd); - fd = -1; -out: - return fd; -} - -static int fastconnect_to_server(const struct sockaddr_storage *addr, - socklen_t addr_len) -{ - int sendmsg_err; - - return sendmsg_to_server(SOCK_STREAM, addr, addr_len, /*set_cmsg*/0, - MSG_FASTOPEN, &sendmsg_err); -} - -static int recvmsg_from_client(int sockfd, struct sockaddr_storage *src_addr) -{ - struct timeval tv; - struct msghdr hdr; - struct iovec iov; - char data[64]; - fd_set rfds; - - FD_ZERO(&rfds); - FD_SET(sockfd, &rfds); - - tv.tv_sec = 2; - tv.tv_usec = 0; - - if (select(sockfd + 1, &rfds, NULL, NULL, &tv) <= 0 || - !FD_ISSET(sockfd, &rfds)) - return -1; - - memset(&iov, 0, sizeof(iov)); - iov.iov_base = data; - iov.iov_len = sizeof(data); - - memset(&hdr, 0, sizeof(hdr)); - hdr.msg_name = src_addr; - hdr.msg_namelen = sizeof(struct sockaddr_storage); - hdr.msg_iov = &iov; - hdr.msg_iovlen = 1; - - return recvmsg(sockfd, &hdr, 0); -} - -static int init_addrs(const struct sock_addr_test *test, - struct sockaddr_storage *requested_addr, - struct sockaddr_storage *expected_addr, - struct sockaddr_storage *expected_src_addr) -{ - socklen_t addr_len = sizeof(struct sockaddr_storage); - - if (mk_sockaddr(test->domain, test->expected_ip, test->expected_port, - (struct sockaddr *)expected_addr, addr_len) == -1) - goto err; - - if (mk_sockaddr(test->domain, test->requested_ip, test->requested_port, - (struct sockaddr *)requested_addr, addr_len) == -1) - goto err; - - if 
(test->expected_src_ip && - mk_sockaddr(test->domain, test->expected_src_ip, 0, - (struct sockaddr *)expected_src_addr, addr_len) == -1) - goto err; - - return 0; -err: - return -1; -} - -static int run_bind_test_case(const struct sock_addr_test *test) -{ - socklen_t addr_len = sizeof(struct sockaddr_storage); - struct sockaddr_storage requested_addr; - struct sockaddr_storage expected_addr; - int clientfd = -1; - int servfd = -1; - int err = 0; - - if (init_addrs(test, &requested_addr, &expected_addr, NULL)) - goto err; - - servfd = start_server(test->type, &requested_addr, addr_len); - if (servfd == -1) - goto err; - - if (cmp_local_addr(servfd, &expected_addr)) - goto err; - - /* Try to connect to server just in case */ - clientfd = connect_to_server(test->type, &expected_addr, addr_len); - if (clientfd == -1) - goto err; - - goto out; -err: - err = -1; -out: - close(clientfd); - close(servfd); - return err; -} - -static int run_connect_test_case(const struct sock_addr_test *test) -{ - socklen_t addr_len = sizeof(struct sockaddr_storage); - struct sockaddr_storage expected_src_addr; - struct sockaddr_storage requested_addr; - struct sockaddr_storage expected_addr; - int clientfd = -1; - int servfd = -1; - int err = 0; - - if (init_addrs(test, &requested_addr, &expected_addr, - &expected_src_addr)) - goto err; - - /* Prepare server to connect to */ - servfd = start_server(test->type, &expected_addr, addr_len); - if (servfd == -1) - goto err; - - clientfd = connect_to_server(test->type, &requested_addr, addr_len); - if (clientfd == -1) - goto err; - - /* Make sure src and dst addrs were overridden properly */ - if (cmp_peer_addr(clientfd, &expected_addr)) - goto err; - - if (cmp_local_ip(clientfd, &expected_src_addr)) - goto err; - - if (test->type == SOCK_STREAM) { - /* Test TCP Fast Open scenario */ - clientfd = fastconnect_to_server(&requested_addr, addr_len); - if (clientfd == -1) - goto err; - - /* Make sure src and dst addrs were overridden properly */ - if 
(cmp_peer_addr(clientfd, &expected_addr)) - goto err; - - if (cmp_local_ip(clientfd, &expected_src_addr)) - goto err; - } - - goto out; -err: - err = -1; -out: - close(clientfd); - close(servfd); - return err; -} - -static int run_xmsg_test_case(const struct sock_addr_test *test, int max_cmsg) -{ - socklen_t addr_len = sizeof(struct sockaddr_storage); - struct sockaddr_storage expected_addr; - struct sockaddr_storage server_addr; - struct sockaddr_storage sendmsg_addr; - struct sockaddr_storage recvmsg_addr; - int clientfd = -1; - int servfd = -1; - int set_cmsg; - int err = 0; - - if (test->type != SOCK_DGRAM) - goto err; - - if (init_addrs(test, &sendmsg_addr, &server_addr, &expected_addr)) - goto err; - - /* Prepare server to sendmsg to */ - servfd = start_server(test->type, &server_addr, addr_len); - if (servfd == -1) - goto err; - - for (set_cmsg = 0; set_cmsg <= max_cmsg; ++set_cmsg) { - if (clientfd >= 0) - close(clientfd); - - clientfd = sendmsg_to_server(test->type, &sendmsg_addr, - addr_len, set_cmsg, /*flags*/0, - &err); - if (err) - goto out; - else if (clientfd == -1) - goto err; - - /* Try to receive message on server instead of using - * getpeername(2) on client socket, to check that client's - * destination address was rewritten properly, since - * getpeername(2) doesn't work with unconnected datagram - * sockets. - * - * Get source address from recvmsg(2) as well to make sure - * source was rewritten properly: getsockname(2) can't be used - * since socket is unconnected and source defined for one - * specific packet may differ from the one used by default and - * returned by getsockname(2). 
- */ - if (recvmsg_from_client(servfd, &recvmsg_addr) == -1) - goto err; - - if (cmp_addr(&recvmsg_addr, &expected_addr, /*cmp_port*/0)) - goto err; - } - - goto out; -err: - err = -1; -out: - close(clientfd); - close(servfd); - return err; -} - -static int run_test_case(int cgfd, const struct sock_addr_test *test) -{ - int progfd = -1; - int err = 0; - - printf("Test case: %s .. ", test->descr); - - progfd = test->loadfn(test); - if (test->expected_result == LOAD_REJECT && progfd < 0) - goto out; - else if (test->expected_result == LOAD_REJECT || progfd < 0) - goto err; - - err = bpf_prog_attach(progfd, cgfd, test->attach_type, - BPF_F_ALLOW_OVERRIDE); - if (test->expected_result == ATTACH_REJECT && err) { - err = 0; /* error was expected, reset it */ - goto out; - } else if (test->expected_result == ATTACH_REJECT || err) { - goto err; - } else if (test->expected_result == ATTACH_OKAY) { - err = 0; - goto out; - } - - switch (test->attach_type) { - case BPF_CGROUP_INET4_BIND: - case BPF_CGROUP_INET6_BIND: - err = run_bind_test_case(test); - break; - case BPF_CGROUP_INET4_CONNECT: - case BPF_CGROUP_INET6_CONNECT: - err = run_connect_test_case(test); - break; - case BPF_CGROUP_UDP4_SENDMSG: - case BPF_CGROUP_UDP6_SENDMSG: - err = run_xmsg_test_case(test, 1); - break; - case BPF_CGROUP_UDP4_RECVMSG: - case BPF_CGROUP_UDP6_RECVMSG: - err = run_xmsg_test_case(test, 0); - break; - default: - goto err; - } - - if (test->expected_result == SYSCALL_EPERM && err == EPERM) { - err = 0; /* error was expected, reset it */ - goto out; - } - - if (test->expected_result == SYSCALL_ENOTSUPP && err == ENOTSUPP) { - err = 0; /* error was expected, reset it */ - goto out; - } - - if (err || test->expected_result != SUCCESS) - goto err; - - goto out; -err: - err = -1; -out: - /* Detaching w/o checking return code: best effort attempt. */ - if (progfd != -1) - bpf_prog_detach(cgfd, test->attach_type); - close(progfd); - printf("[%s]\n", err ? 
"FAIL" : "PASS"); - return err; -} - -static int run_tests(int cgfd) -{ - int passes = 0; - int fails = 0; - int i; - - for (i = 0; i < ARRAY_SIZE(tests); ++i) { - if (run_test_case(cgfd, &tests[i])) - ++fails; - else - ++passes; - } - printf("Summary: %d PASSED, %d FAILED\n", passes, fails); - return fails ? -1 : 0; -} - -int main(int argc, char **argv) -{ - int cgfd = -1; - int err = 0; - - if (argc < 2) { - fprintf(stderr, - "%s has to be run via %s.sh. Skip direct run.\n", - argv[0], argv[0]); - exit(err); - } - - cgfd = cgroup_setup_and_join(CG_PATH); - if (cgfd < 0) - goto err; - - /* Use libbpf 1.0 API mode */ - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - if (run_tests(cgfd)) - goto err; - - goto out; -err: - err = -1; -out: - close(cgfd); - cleanup_cgroup_environment(); - return err; -} diff --git a/tools/testing/selftests/bpf/test_sock_addr.sh b/tools/testing/selftests/bpf/test_sock_addr.sh deleted file mode 100755 index 3b9fdb8094..0000000000 --- a/tools/testing/selftests/bpf/test_sock_addr.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/sh - -set -eu - -ping_once() -{ - type ping${1} >/dev/null 2>&1 && PING="ping${1}" || PING="ping -${1}" - $PING -q -c 1 -W 1 ${2%%/*} >/dev/null 2>&1 -} - -wait_for_ip() -{ - local _i - echo -n "Wait for testing IPv4/IPv6 to become available " - for _i in $(seq ${MAX_PING_TRIES}); do - echo -n "." - if ping_once 4 ${TEST_IPv4} && ping_once 6 ${TEST_IPv6}; then - echo " OK" - return - fi - done - echo 1>&2 "ERROR: Timeout waiting for test IP to become available." - exit 1 -} - -setup() -{ - # Create testing interfaces not to interfere with current environment. 
- ip link add dev ${TEST_IF} type veth peer name ${TEST_IF_PEER} - ip link set ${TEST_IF} up - ip link set ${TEST_IF_PEER} up - - ip -4 addr add ${TEST_IPv4} dev ${TEST_IF} - ip -6 addr add ${TEST_IPv6} dev ${TEST_IF} - wait_for_ip -} - -cleanup() -{ - ip link del ${TEST_IF} 2>/dev/null || : - ip link del ${TEST_IF_PEER} 2>/dev/null || : -} - -main() -{ - trap cleanup EXIT 2 3 6 15 - setup - ./test_sock_addr setup_done -} - -BASENAME=$(basename $0 .sh) -TEST_IF="${BASENAME}1" -TEST_IF_PEER="${BASENAME}2" -TEST_IPv4="127.0.0.4/8" -TEST_IPv6="::6/128" -MAX_PING_TRIES=5 - -main diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 43612de44f..a709911cdd 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -63,7 +63,7 @@ int passed; int failed; int map_fd[9]; struct bpf_map *maps[9]; -int prog_fd[11]; +int prog_fd[9]; int txmsg_pass; int txmsg_redir; @@ -680,7 +680,8 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, } } - s->bytes_recvd += recv; + if (recv > 0) + s->bytes_recvd += recv; if (opt->check_recved_len && s->bytes_recvd > total_bytes) { errno = EMSGSIZE; @@ -1793,8 +1794,6 @@ int prog_attach_type[] = { BPF_SK_MSG_VERDICT, BPF_SK_MSG_VERDICT, BPF_SK_MSG_VERDICT, - BPF_SK_MSG_VERDICT, - BPF_SK_MSG_VERDICT, }; int prog_type[] = { @@ -1807,8 +1806,6 @@ int prog_type[] = { BPF_PROG_TYPE_SK_MSG, BPF_PROG_TYPE_SK_MSG, BPF_PROG_TYPE_SK_MSG, - BPF_PROG_TYPE_SK_MSG, - BPF_PROG_TYPE_SK_MSG, }; static int populate_progs(char *bpf_file) @@ -1887,10 +1884,13 @@ static int check_whitelist(struct _test *t, struct sockmap_options *opt) while (entry) { if ((opt->prepend && strstr(opt->prepend, entry) != 0) || strstr(opt->map, entry) != 0 || - strstr(t->title, entry) != 0) + strstr(t->title, entry) != 0) { + free(ptr); return 0; + } entry = strtok(NULL, ","); } + free(ptr); return -EINVAL; } @@ -1907,10 +1907,13 @@ static int check_blacklist(struct 
_test *t, struct sockmap_options *opt) while (entry) { if ((opt->prepend && strstr(opt->prepend, entry) != 0) || strstr(opt->map, entry) != 0 || - strstr(t->title, entry) != 0) + strstr(t->title, entry) != 0) { + free(ptr); return 0; + } entry = strtok(NULL, ","); } + free(ptr); return -EINVAL; } diff --git a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c index 32df937470..7b5fc98838 100644 --- a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c +++ b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c @@ -16,68 +16,7 @@ #include #include "cgroup_helpers.h" - -static int start_server(const struct sockaddr *addr, socklen_t len, bool dual) -{ - int mode = !dual; - int fd; - - fd = socket(addr->sa_family, SOCK_STREAM, 0); - if (fd == -1) { - log_err("Failed to create server socket"); - goto out; - } - - if (addr->sa_family == AF_INET6) { - if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, (char *)&mode, - sizeof(mode)) == -1) { - log_err("Failed to set the dual-stack mode"); - goto close_out; - } - } - - if (bind(fd, addr, len) == -1) { - log_err("Failed to bind server socket"); - goto close_out; - } - - if (listen(fd, 128) == -1) { - log_err("Failed to listen on server socket"); - goto close_out; - } - - goto out; - -close_out: - close(fd); - fd = -1; -out: - return fd; -} - -static int connect_to_server(const struct sockaddr *addr, socklen_t len) -{ - int fd = -1; - - fd = socket(addr->sa_family, SOCK_STREAM, 0); - if (fd == -1) { - log_err("Failed to create client socket"); - goto out; - } - - if (connect(fd, (const struct sockaddr *)addr, len) == -1) { - log_err("Fail to connect to server"); - goto close_out; - } - - goto out; - -close_out: - close(fd); - fd = -1; -out: - return fd; -} +#include "network_helpers.h" static int get_map_fd_by_prog_id(int prog_id, bool *xdp) { @@ -117,8 +56,7 @@ err: return map_fd; } -static int run_test(int server_fd, int results_fd, bool xdp, - 
const struct sockaddr *addr, socklen_t len) +static int run_test(int server_fd, int results_fd, bool xdp) { int client = -1, srv_client = -1; int ret = 0; @@ -144,7 +82,7 @@ static int run_test(int server_fd, int results_fd, bool xdp, goto err; } - client = connect_to_server(addr, len); + client = connect_to_fd(server_fd, 0); if (client == -1) goto err; @@ -201,23 +139,23 @@ out: return ret; } -static bool get_port(int server_fd, in_port_t *port) +static int v6only_true(int fd, const struct post_socket_opts *opts) { - struct sockaddr_in addr; - socklen_t len = sizeof(addr); + int mode = true; - if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) { - log_err("Failed to get server addr"); - return false; - } + return setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &mode, sizeof(mode)); +} + +static int v6only_false(int fd, const struct post_socket_opts *opts) +{ + int mode = false; - /* sin_port and sin6_port are located at the same offset. */ - *port = addr.sin_port; - return true; + return setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &mode, sizeof(mode)); } int main(int argc, char **argv) { + struct network_helper_opts opts = { 0 }; struct sockaddr_in addr4; struct sockaddr_in6 addr6; struct sockaddr_in addr4dual; @@ -259,31 +197,30 @@ int main(int argc, char **argv) addr6dual.sin6_addr = in6addr_any; addr6dual.sin6_port = 0; - server = start_server((const struct sockaddr *)&addr4, sizeof(addr4), - false); - if (server == -1 || !get_port(server, &addr4.sin_port)) + server = start_server_addr(SOCK_STREAM, (struct sockaddr_storage *)&addr4, + sizeof(addr4), NULL); + if (server == -1) goto err; - server_v6 = start_server((const struct sockaddr *)&addr6, - sizeof(addr6), false); - if (server_v6 == -1 || !get_port(server_v6, &addr6.sin6_port)) + opts.post_socket_cb = v6only_true; + server_v6 = start_server_addr(SOCK_STREAM, (struct sockaddr_storage *)&addr6, + sizeof(addr6), &opts); + if (server_v6 == -1) goto err; - server_dual = start_server((const struct sockaddr 
*)&addr6dual, - sizeof(addr6dual), true); - if (server_dual == -1 || !get_port(server_dual, &addr4dual.sin_port)) + opts.post_socket_cb = v6only_false; + server_dual = start_server_addr(SOCK_STREAM, (struct sockaddr_storage *)&addr6dual, + sizeof(addr6dual), &opts); + if (server_dual == -1) goto err; - if (run_test(server, results, xdp, - (const struct sockaddr *)&addr4, sizeof(addr4))) + if (run_test(server, results, xdp)) goto err; - if (run_test(server_v6, results, xdp, - (const struct sockaddr *)&addr6, sizeof(addr6))) + if (run_test(server_v6, results, xdp)) goto err; - if (run_test(server_dual, results, xdp, - (const struct sockaddr *)&addr4dual, sizeof(addr4dual))) + if (run_test(server_dual, results, xdp)) goto err; printf("ok\n"); diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c index 28b6646662..d5379a0e6d 100644 --- a/tools/testing/selftests/bpf/testing_helpers.c +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -368,9 +368,23 @@ int delete_module(const char *name, int flags) int unload_bpf_testmod(bool verbose) { + int ret, cnt = 0; + if (kern_sync_rcu()) fprintf(stdout, "Failed to trigger kernel-side RCU sync!\n"); - if (delete_module("bpf_testmod", 0)) { + + for (;;) { + ret = delete_module("bpf_testmod", 0); + if (!ret || errno != EAGAIN) + break; + if (++cnt > 10000) { + fprintf(stdout, "Unload of bpf_testmod timed out\n"); + break; + } + usleep(100); + } + + if (ret) { if (errno == ENOENT) { if (verbose) fprintf(stdout, "bpf_testmod.ko is already unloaded.\n"); diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 27fd7ed3e4..70e29f316f 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -61,12 +61,7 @@ void free_kallsyms_local(struct ksyms *ksyms) free(ksyms); } -static int ksym_cmp(const void *p1, const void *p2) -{ - return ((struct ksym *)p1)->addr - ((struct ksym 
*)p2)->addr; -} - -struct ksyms *load_kallsyms_local(void) +static struct ksyms *load_kallsyms_local_common(ksym_cmp_t cmp_cb) { FILE *f; char func[256], buf[256]; @@ -100,7 +95,7 @@ struct ksyms *load_kallsyms_local(void) goto error; } fclose(f); - qsort(ksyms->syms, ksyms->sym_cnt, sizeof(struct ksym), ksym_cmp); + qsort(ksyms->syms, ksyms->sym_cnt, sizeof(struct ksym), cmp_cb); return ksyms; error: @@ -109,6 +104,21 @@ error: return NULL; } +static int ksym_cmp(const void *p1, const void *p2) +{ + return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr; +} + +struct ksyms *load_kallsyms_local(void) +{ + return load_kallsyms_local_common(ksym_cmp); +} + +struct ksyms *load_kallsyms_custom_local(ksym_cmp_t cmp_cb) +{ + return load_kallsyms_local_common(cmp_cb); +} + int load_kallsyms(void) { pthread_mutex_lock(&ksyms_mutex); @@ -148,6 +158,28 @@ struct ksym *ksym_search_local(struct ksyms *ksyms, long key) return &ksyms->syms[0]; } +struct ksym *search_kallsyms_custom_local(struct ksyms *ksyms, const void *p, + ksym_search_cmp_t cmp_cb) +{ + int start = 0, mid, end = ksyms->sym_cnt; + struct ksym *ks; + int result; + + while (start < end) { + mid = start + (end - start) / 2; + ks = &ksyms->syms[mid]; + result = cmp_cb(p, ks); + if (result < 0) + end = mid; + else if (result > 0) + start = mid + 1; + else + return ks; + } + + return NULL; +} + struct ksym *ksym_search(long key) { if (!ksyms) @@ -201,29 +233,6 @@ out: return err; } -void read_trace_pipe(void) -{ - int trace_fd; - - if (access(TRACEFS_PIPE, F_OK) == 0) - trace_fd = open(TRACEFS_PIPE, O_RDONLY, 0); - else - trace_fd = open(DEBUGFS_PIPE, O_RDONLY, 0); - if (trace_fd < 0) - return; - - while (1) { - static char buf[4096]; - ssize_t sz; - - sz = read(trace_fd, buf, sizeof(buf) - 1); - if (sz > 0) { - buf[sz] = 0; - puts(buf); - } - } -} - ssize_t get_uprobe_offset(const void *addr) { size_t start, end, base; @@ -381,3 +390,43 @@ out: close(fd); return err; } + +int read_trace_pipe_iter(void 
(*cb)(const char *str, void *data), void *data, int iter) +{ + size_t buflen, n; + char *buf = NULL; + FILE *fp = NULL; + + if (access(TRACEFS_PIPE, F_OK) == 0) + fp = fopen(TRACEFS_PIPE, "r"); + else + fp = fopen(DEBUGFS_PIPE, "r"); + if (!fp) + return -1; + + /* We do not want to wait forever when iter is specified. */ + if (iter) + fcntl(fileno(fp), F_SETFL, O_NONBLOCK); + + while ((n = getline(&buf, &buflen, fp) >= 0) || errno == EAGAIN) { + if (n > 0) + cb(buf, data); + if (iter && !(--iter)) + break; + } + + free(buf); + if (fp) + fclose(fp); + return 0; +} + +static void trace_pipe_cb(const char *str, void *data) +{ + printf("%s", str); +} + +void read_trace_pipe(void) +{ + read_trace_pipe_iter(trace_pipe_cb, NULL, 0); +} diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index 04fd1da707..2ce873c9f9 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -13,6 +13,9 @@ struct ksym { }; struct ksyms; +typedef int (*ksym_cmp_t)(const void *p1, const void *p2); +typedef int (*ksym_search_cmp_t)(const void *p1, const struct ksym *p2); + int load_kallsyms(void); struct ksym *ksym_search(long key); long ksym_get_addr(const char *name); @@ -22,10 +25,16 @@ struct ksym *ksym_search_local(struct ksyms *ksyms, long key); long ksym_get_addr_local(struct ksyms *ksyms, const char *name); void free_kallsyms_local(struct ksyms *ksyms); +struct ksyms *load_kallsyms_custom_local(ksym_cmp_t cmp_cb); +struct ksym *search_kallsyms_custom_local(struct ksyms *ksyms, const void *p1, + ksym_search_cmp_t cmp_cb); + /* open kallsyms and find addresses on the fly, faster than load + search. 
*/ int kallsyms_find(const char *sym, unsigned long long *addr); void read_trace_pipe(void); +int read_trace_pipe_iter(void (*cb)(const char *str, void *data), + void *data, int iter); ssize_t get_uprobe_offset(const void *addr); ssize_t get_rel_offset(uintptr_t addr); diff --git a/tools/testing/selftests/bpf/uprobe_multi.c b/tools/testing/selftests/bpf/uprobe_multi.c index a61ceab60b..7ffa563ffe 100644 --- a/tools/testing/selftests/bpf/uprobe_multi.c +++ b/tools/testing/selftests/bpf/uprobe_multi.c @@ -9,7 +9,7 @@ #define NAME(name, idx) PASTE(name, idx) -#define DEF(name, idx) int NAME(name, idx)(void) { return 0; } +#define DEF(name, idx) int __attribute__((weak)) NAME(name, idx)(void) { return 0; } #define CALL(name, idx) NAME(name, idx)(); #define F(body, name, idx) body(name, idx) diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index 244d4996e0..b2854238d4 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -792,10 +792,13 @@ static int parse_stats(const char *stats_str, struct stat_specs *specs) while ((next = strtok_r(state ? 
NULL : input, ",", &state))) { err = parse_stat(next, specs); - if (err) + if (err) { + free(input); return err; + } } + free(input); return 0; } diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c index bdf5d81800..6f9956eed7 100644 --- a/tools/testing/selftests/bpf/xdp_hw_metadata.c +++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c @@ -495,20 +495,6 @@ peek: return 0; } -struct ethtool_channels { - __u32 cmd; - __u32 max_rx; - __u32 max_tx; - __u32 max_other; - __u32 max_combined; - __u32 rx_count; - __u32 tx_count; - __u32 other_count; - __u32 combined_count; -}; - -#define ETHTOOL_GCHANNELS 0x0000003c /* Get no of channels */ - static int rxq_num(const char *ifname) { struct ethtool_channels ch = { @@ -595,6 +581,8 @@ static void cleanup(void) if (bpf_obj) xdp_hw_metadata__destroy(bpf_obj); + + free((void *)saved_hwtstamp_ifname); } static void handle_signal(int sig) diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index b1102ee13f..2eac0895b0 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -81,6 +81,7 @@ #include #include #include +#include #include #include #include @@ -105,11 +106,15 @@ #include "../kselftest.h" #include "xsk_xdp_common.h" +#include + static bool opt_verbose; static bool opt_print_tests; static enum test_mode opt_mode = TEST_MODE_ALL; static u32 opt_run_test = RUN_ALL_TESTS; +void test__fail(void) { /* for network_helpers.c */ } + static void __exit_with_error(int error, const char *file, const char *func, int line) { ksft_test_result_fail("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, @@ -239,7 +244,7 @@ static void enable_busy_poll(struct xsk_socket_info *xsk) (void *)&sock_opt, sizeof(sock_opt)) < 0) exit_with_error(errno); - sock_opt = BATCH_SIZE; + sock_opt = xsk->batch_size; if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL_BUDGET, (void *)&sock_opt, 
sizeof(sock_opt)) < 0) exit_with_error(errno); @@ -409,6 +414,33 @@ static void parse_command_line(struct ifobject *ifobj_tx, struct ifobject *ifobj } } +static int set_ring_size(struct ifobject *ifobj) +{ + int ret; + u32 ctr = 0; + + while (ctr++ < SOCK_RECONF_CTR) { + ret = set_hw_ring_size(ifobj->ifname, &ifobj->ring); + if (!ret) + break; + + /* Retry if it fails */ + if (ctr >= SOCK_RECONF_CTR || errno != EBUSY) + return -errno; + + usleep(USLEEP_MAX); + } + + return ret; +} + +static int hw_ring_size_reset(struct ifobject *ifobj) +{ + ifobj->ring.tx_pending = ifobj->set_ring.default_tx; + ifobj->ring.rx_pending = ifobj->set_ring.default_rx; + return set_ring_size(ifobj); +} + static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, struct ifobject *ifobj_rx) { @@ -439,6 +471,7 @@ static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, for (j = 0; j < MAX_SOCKETS; j++) { memset(&ifobj->xsk_arr[j], 0, sizeof(ifobj->xsk_arr[j])); ifobj->xsk_arr[j].rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; + ifobj->xsk_arr[j].batch_size = DEFAULT_BATCH_SIZE; if (i == 0) ifobj->xsk_arr[j].pkt_stream = test->tx_pkt_stream_default; else @@ -451,12 +484,16 @@ static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, } } + if (ifobj_tx->hw_ring_size_supp) + hw_ring_size_reset(ifobj_tx); + test->ifobj_tx = ifobj_tx; test->ifobj_rx = ifobj_rx; test->current_step = 0; test->total_steps = 1; test->nb_sockets = 1; test->fail = false; + test->set_ring = false; test->mtu = MAX_ETH_PKT_SIZE; test->xdp_prog_rx = ifobj_rx->xdp_progs->progs.xsk_def_prog; test->xskmap_rx = ifobj_rx->xdp_progs->maps.xsk; @@ -1087,7 +1124,7 @@ static int __receive_pkts(struct test_spec *test, struct xsk_socket_info *xsk) return TEST_CONTINUE; } - rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx); + rcvd = xsk_ring_cons__peek(&xsk->rx, xsk->batch_size, &idx_rx); if (!rcvd) return TEST_CONTINUE; @@ -1239,7 +1276,8 @@ static int 
__send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b buffer_len = pkt_get_buffer_len(umem, pkt_stream->max_pkt_len); /* pkts_in_flight might be negative if many invalid packets are sent */ - if (pkts_in_flight >= (int)((umem_size(umem) - BATCH_SIZE * buffer_len) / buffer_len)) { + if (pkts_in_flight >= (int)((umem_size(umem) - xsk->batch_size * buffer_len) / + buffer_len)) { ret = kick_tx(xsk); if (ret) return TEST_FAILURE; @@ -1249,7 +1287,7 @@ static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b fds.fd = xsk_socket__fd(xsk->xsk); fds.events = POLLOUT; - while (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) < BATCH_SIZE) { + while (xsk_ring_prod__reserve(&xsk->tx, xsk->batch_size, &idx) < xsk->batch_size) { if (use_poll) { ret = poll(&fds, 1, POLL_TMOUT); if (timeout) { @@ -1269,10 +1307,10 @@ static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b } } - complete_pkts(xsk, BATCH_SIZE); + complete_pkts(xsk, xsk->batch_size); } - for (i = 0; i < BATCH_SIZE; i++) { + for (i = 0; i < xsk->batch_size; i++) { struct pkt *pkt = pkt_stream_get_next_tx_pkt(pkt_stream); u32 nb_frags_left, nb_frags, bytes_written = 0; @@ -1280,9 +1318,9 @@ static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b break; nb_frags = pkt_nb_frags(umem->frame_size, pkt_stream, pkt); - if (nb_frags > BATCH_SIZE - i) { + if (nb_frags > xsk->batch_size - i) { pkt_stream_cancel(pkt_stream); - xsk_ring_prod__cancel(&xsk->tx, BATCH_SIZE - i); + xsk_ring_prod__cancel(&xsk->tx, xsk->batch_size - i); break; } nb_frags_left = nb_frags; @@ -1370,7 +1408,7 @@ static int wait_for_tx_completion(struct xsk_socket_info *xsk) return TEST_FAILURE; } - complete_pkts(xsk, BATCH_SIZE); + complete_pkts(xsk, xsk->batch_size); } return TEST_PASS; @@ -1860,6 +1898,14 @@ static int testapp_validate_traffic(struct test_spec *test) return TEST_SKIP; } + if (test->set_ring) { + if (ifobj_tx->hw_ring_size_supp) + return 
set_ring_size(ifobj_tx); + + ksft_test_result_skip("Changing HW ring size not supported.\n"); + return TEST_SKIP; + } + xsk_attach_xdp_progs(test, ifobj_rx, ifobj_tx); return __testapp_validate_traffic(test, ifobj_rx, ifobj_tx); } @@ -2373,6 +2419,50 @@ static int testapp_xdp_metadata_mb(struct test_spec *test) return testapp_xdp_metadata_copy(test); } +static int testapp_hw_sw_min_ring_size(struct test_spec *test) +{ + int ret; + + test->set_ring = true; + test->total_steps = 2; + test->ifobj_tx->ring.tx_pending = DEFAULT_BATCH_SIZE; + test->ifobj_tx->ring.rx_pending = DEFAULT_BATCH_SIZE * 2; + test->ifobj_tx->xsk->batch_size = 1; + test->ifobj_rx->xsk->batch_size = 1; + ret = testapp_validate_traffic(test); + if (ret) + return ret; + + /* Set batch size to hw_ring_size - 1 */ + test->ifobj_tx->xsk->batch_size = DEFAULT_BATCH_SIZE - 1; + test->ifobj_rx->xsk->batch_size = DEFAULT_BATCH_SIZE - 1; + return testapp_validate_traffic(test); +} + +static int testapp_hw_sw_max_ring_size(struct test_spec *test) +{ + u32 max_descs = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2; + int ret; + + test->set_ring = true; + test->total_steps = 2; + test->ifobj_tx->ring.tx_pending = test->ifobj_tx->ring.tx_max_pending; + test->ifobj_tx->ring.rx_pending = test->ifobj_tx->ring.rx_max_pending; + test->ifobj_rx->umem->num_frames = max_descs; + test->ifobj_rx->xsk->rxqsize = max_descs; + test->ifobj_tx->xsk->batch_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + test->ifobj_rx->xsk->batch_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + + ret = testapp_validate_traffic(test); + if (ret) + return ret; + + /* Set batch_size to 4095 */ + test->ifobj_tx->xsk->batch_size = max_descs - 1; + test->ifobj_rx->xsk->batch_size = max_descs - 1; + return testapp_validate_traffic(test); +} + static void run_pkt_test(struct test_spec *test) { int ret; @@ -2477,7 +2567,9 @@ static const struct test_spec tests[] = { {.name = "ALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_aligned_inv_desc_mb}, {.name = 
"UNALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_unaligned_inv_desc_mb}, {.name = "TOO_MANY_FRAGS", .test_func = testapp_too_many_frags}, -}; + {.name = "HW_SW_MIN_RING_SIZE", .test_func = testapp_hw_sw_min_ring_size}, + {.name = "HW_SW_MAX_RING_SIZE", .test_func = testapp_hw_sw_max_ring_size}, + }; static void print_tests(void) { @@ -2497,6 +2589,7 @@ int main(int argc, char **argv) int modes = TEST_MODE_SKB + 1; struct test_spec test; bool shared_netdev; + int ret; /* Use libbpf 1.0 API mode */ libbpf_set_strict_mode(LIBBPF_STRICT_ALL); @@ -2534,6 +2627,13 @@ int main(int argc, char **argv) modes++; } + ret = get_hw_ring_size(ifobj_tx->ifname, &ifobj_tx->ring); + if (!ret) { + ifobj_tx->hw_ring_size_supp = true; + ifobj_tx->set_ring.default_tx = ifobj_tx->ring.tx_pending; + ifobj_tx->set_ring.default_rx = ifobj_tx->ring.rx_pending; + } + init_iface(ifobj_rx, worker_testapp_validate_rx); init_iface(ifobj_tx, worker_testapp_validate_tx); @@ -2581,6 +2681,9 @@ int main(int argc, char **argv) } } + if (ifobj_tx->hw_ring_size_supp) + hw_ring_size_reset(ifobj_tx); + pkt_stream_delete(tx_pkt_stream_default); pkt_stream_delete(rx_pkt_stream_default); xsk_unload_xdp_programs(ifobj_tx); diff --git a/tools/testing/selftests/bpf/xskxceiver.h b/tools/testing/selftests/bpf/xskxceiver.h index f174df2d69..906de5fab7 100644 --- a/tools/testing/selftests/bpf/xskxceiver.h +++ b/tools/testing/selftests/bpf/xskxceiver.h @@ -44,7 +44,7 @@ #define MAX_ETH_JUMBO_SIZE 9000 #define USLEEP_MAX 10000 #define SOCK_RECONF_CTR 10 -#define BATCH_SIZE 64 +#define DEFAULT_BATCH_SIZE 64 #define POLL_TMOUT 1000 #define THREAD_TMOUT 3 #define DEFAULT_PKT_CNT (4 * 1024) @@ -91,6 +91,7 @@ struct xsk_socket_info { struct pkt_stream *pkt_stream; u32 outstanding_tx; u32 rxqsize; + u32 batch_size; u8 dst_mac[ETH_ALEN]; u8 src_mac[ETH_ALEN]; }; @@ -113,6 +114,11 @@ struct pkt_stream { bool verbatim; }; +struct set_hw_ring { + u32 default_tx; + u32 default_rx; +}; + struct ifobject; struct test_spec; 
typedef int (*validation_func_t)(struct ifobject *ifobj); @@ -129,6 +135,8 @@ struct ifobject { struct xsk_xdp_progs *xdp_progs; struct bpf_map *xskmap; struct bpf_program *xdp_prog; + struct ethtool_ringparam ring; + struct set_hw_ring set_ring; enum test_mode mode; int ifindex; int mtu; @@ -145,6 +153,7 @@ struct ifobject { bool unaligned_supp; bool multi_buff_supp; bool multi_buff_zc_supp; + bool hw_ring_size_supp; }; struct test_spec { @@ -162,6 +171,7 @@ struct test_spec { u16 current_step; u16 nb_sockets; bool fail; + bool set_ring; enum test_mode mode; char name[MAX_TEST_NAME_SIZE]; }; diff --git a/tools/testing/selftests/capabilities/test_execve.c b/tools/testing/selftests/capabilities/test_execve.c index 7cde07a5df..47bad7ddc5 100644 --- a/tools/testing/selftests/capabilities/test_execve.c +++ b/tools/testing/selftests/capabilities/test_execve.c @@ -82,7 +82,7 @@ static bool create_and_enter_ns(uid_t inner_uid) { uid_t outer_uid; gid_t outer_gid; - int i; + int i, ret; bool have_outer_privilege; outer_uid = getuid(); @@ -97,7 +97,10 @@ static bool create_and_enter_ns(uid_t inner_uid) ksft_exit_fail_msg("setresuid - %s\n", strerror(errno)); // Re-enable effective caps - capng_get_caps_process(); + ret = capng_get_caps_process(); + if (ret == -1) + ksft_exit_fail_msg("capng_get_caps_process failed\n"); + for (i = 0; i < CAP_LAST_CAP; i++) if (capng_have_capability(CAPNG_PERMITTED, i)) capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, i); @@ -207,6 +210,7 @@ static void exec_validate_cap(bool eff, bool perm, bool inh, bool ambient) static int do_tests(int uid, const char *our_path) { + int ret; bool have_outer_privilege = create_and_enter_ns(uid); int ourpath_fd = open(our_path, O_RDONLY | O_DIRECTORY); @@ -250,7 +254,9 @@ static int do_tests(int uid, const char *our_path) ksft_exit_fail_msg("chmod - %s\n", strerror(errno)); } - capng_get_caps_process(); + ret = capng_get_caps_process(); + if (ret == -1) + ksft_exit_fail_msg("capng_get_caps_process failed\n"); /* Make 
sure that i starts out clear */ capng_update(CAPNG_DROP, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE); diff --git a/tools/testing/selftests/capabilities/validate_cap.c b/tools/testing/selftests/capabilities/validate_cap.c index 60b4e7b716..65f2a1c892 100644 --- a/tools/testing/selftests/capabilities/validate_cap.c +++ b/tools/testing/selftests/capabilities/validate_cap.c @@ -28,6 +28,7 @@ static bool bool_arg(char **argv, int i) int main(int argc, char **argv) { const char *atsec = ""; + int ret; /* * Be careful just in case a setgid or setcapped copy of this @@ -44,7 +45,11 @@ int main(int argc, char **argv) atsec = " (AT_SECURE is not set)"; #endif - capng_get_caps_process(); + ret = capng_get_caps_process(); + if (ret == -1) { + ksft_print_msg("capng_get_caps_process failed\n"); + return 1; + } if (capng_have_capability(CAPNG_EFFECTIVE, CAP_NET_BIND_SERVICE) != bool_arg(argv, 1)) { ksft_print_msg("Wrong effective state%s\n", atsec); diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile index 00b4419289..16461dc0ff 100644 --- a/tools/testing/selftests/cgroup/Makefile +++ b/tools/testing/selftests/cgroup/Makefile @@ -4,7 +4,7 @@ CFLAGS += -Wall -pthread all: ${HELPER_PROGS} TEST_FILES := with_stress.sh -TEST_PROGS := test_stress.sh test_cpuset_prs.sh +TEST_PROGS := test_stress.sh test_cpuset_prs.sh test_cpuset_v1_hp.sh TEST_GEN_FILES := wait_inotify TEST_GEN_PROGS = test_memcontrol TEST_GEN_PROGS += test_kmem diff --git a/tools/testing/selftests/cgroup/cgroup_util.h b/tools/testing/selftests/cgroup/cgroup_util.h index 89e8519fb2..e8d04ac9e3 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.h +++ b/tools/testing/selftests/cgroup/cgroup_util.h @@ -18,7 +18,7 @@ */ static inline int values_close(long a, long b, int err) { - return abs(a - b) <= (a + b) / 100 * err; + return labs(a - b) <= (a + b) / 100 * err; } extern int cg_find_unified_root(char *root, size_t len, bool *nsdelegate); diff --git 
a/tools/testing/selftests/cgroup/test_cpu.c b/tools/testing/selftests/cgroup/test_cpu.c index 186bf96f6a..dad2ed82f3 100644 --- a/tools/testing/selftests/cgroup/test_cpu.c +++ b/tools/testing/selftests/cgroup/test_cpu.c @@ -237,7 +237,7 @@ run_cpucg_weight_test( { int ret = KSFT_FAIL, i; char *parent = NULL; - struct cpu_hogger children[3] = {NULL}; + struct cpu_hogger children[3] = {}; parent = cg_name(root, "cpucg_test_0"); if (!parent) @@ -408,7 +408,7 @@ run_cpucg_nested_weight_test(const char *root, bool overprovisioned) { int ret = KSFT_FAIL, i; char *parent = NULL, *child = NULL; - struct cpu_hogger leaf[3] = {NULL}; + struct cpu_hogger leaf[3] = {}; long nested_leaf_usage, child_usage; int nprocs = get_nprocs(); diff --git a/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh b/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh new file mode 100755 index 0000000000..3f45512fb5 --- /dev/null +++ b/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh @@ -0,0 +1,46 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# Test the special cpuset v1 hotplug case where a cpuset become empty of +# CPUs will force migration of tasks out to an ancestor. +# + +skip_test() { + echo "$1" + echo "Test SKIPPED" + exit 4 # ksft_skip +} + +[[ $(id -u) -eq 0 ]] || skip_test "Test must be run as root!" + +# Find cpuset v1 mount point +CPUSET=$(mount -t cgroup | grep cpuset | head -1 | awk -e '{print $3}') +[[ -n "$CPUSET" ]] || skip_test "cpuset v1 mount point not found!" + +# +# Create a test cpuset, put a CPU and a task there and offline that CPU +# +TDIR=test$$ +[[ -d $CPUSET/$TDIR ]] || mkdir $CPUSET/$TDIR +echo 1 > $CPUSET/$TDIR/cpuset.cpus +echo 0 > $CPUSET/$TDIR/cpuset.mems +sleep 10& +TASK=$! +echo $TASK > $CPUSET/$TDIR/tasks +NEWCS=$(cat /proc/$TASK/cpuset) +[[ $NEWCS != "/$TDIR" ]] && { + echo "Unexpected cpuset $NEWCS, test FAILED!" 
+ exit 1 +} + +echo 0 > /sys/devices/system/cpu/cpu1/online +sleep 0.5 +echo 1 > /sys/devices/system/cpu/cpu1/online +NEWCS=$(cat /proc/$TASK/cpuset) +rmdir $CPUSET/$TDIR +[[ $NEWCS != "/" ]] && { + echo "cpuset $NEWCS, test FAILED!" + exit 1 +} +echo "Test PASSED" +exit 0 diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c index 137506db03..96693d8772 100644 --- a/tools/testing/selftests/cgroup/test_kmem.c +++ b/tools/testing/selftests/cgroup/test_kmem.c @@ -192,7 +192,7 @@ static int test_kmem_memcg_deletion(const char *root) goto cleanup; sum = anon + file + kernel + sock; - if (abs(sum - current) < MAX_VMSTAT_ERROR) { + if (labs(sum - current) < MAX_VMSTAT_ERROR) { ret = KSFT_PASS; } else { printf("memory.current = %ld\n", current); @@ -380,7 +380,7 @@ static int test_percpu_basic(const char *root) current = cg_read_long(parent, "memory.current"); percpu = cg_read_key_long(parent, "memory.stat", "percpu "); - if (current > 0 && percpu > 0 && abs(current - percpu) < + if (current > 0 && percpu > 0 && labs(current - percpu) < MAX_VMSTAT_ERROR) ret = KSFT_PASS; else diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index b462416b38..41ae8047b8 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -716,7 +716,9 @@ static bool reclaim_until(const char *memcg, long goal) */ static int test_memcg_reclaim(const char *root) { - int ret = KSFT_FAIL, fd, retries; + int ret = KSFT_FAIL; + int fd = -1; + int retries; char *memcg; long current, expected_usage; diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index ef7f395453..190096017f 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -50,7 +50,7 @@ static int get_zswap_stored_pages(size_t *value) return 
read_int("/sys/kernel/debug/zswap/stored_pages", value); } -static int get_cg_wb_count(const char *cg) +static long get_cg_wb_count(const char *cg) { return cg_read_key_long(cg, "memory.stat", "zswpwb"); } @@ -248,6 +248,132 @@ out: return ret; } +/* + * Attempt writeback with the following steps: + * 1. Allocate memory. + * 2. Reclaim memory equal to the amount that was allocated in step 1. + This will move it into zswap. + * 3. Save current zswap usage. + * 4. Move the memory allocated in step 1 back in from zswap. + * 5. Set zswap.max to half the amount that was recorded in step 3. + * 6. Attempt to reclaim memory equal to the amount that was allocated, + this will either trigger writeback if it's enabled, or reclamation + will fail if writeback is disabled as there isn't enough zswap space. + */ +static int attempt_writeback(const char *cgroup, void *arg) +{ + long pagesize = sysconf(_SC_PAGESIZE); + char *test_group = arg; + size_t memsize = MB(4); + char buf[pagesize]; + long zswap_usage; + bool wb_enabled; + int ret = -1; + char *mem; + + wb_enabled = cg_read_long(test_group, "memory.zswap.writeback"); + mem = (char *)malloc(memsize); + if (!mem) + return ret; + + /* + * Fill half of each page with increasing data, and keep other + * half empty, this will result in data that is still compressible + * and ends up in zswap, with material zswap usage. + */ + for (int i = 0; i < pagesize; i++) + buf[i] = i < pagesize/2 ? 
(char) i : 0; + + for (int i = 0; i < memsize; i += pagesize) + memcpy(&mem[i], buf, pagesize); + + /* Try and reclaim allocated memory */ + if (cg_write_numeric(test_group, "memory.reclaim", memsize)) { + ksft_print_msg("Failed to reclaim all of the requested memory\n"); + goto out; + } + + zswap_usage = cg_read_long(test_group, "memory.zswap.current"); + + /* zswpin */ + for (int i = 0; i < memsize; i += pagesize) { + if (memcmp(&mem[i], buf, pagesize)) { + ksft_print_msg("invalid memory\n"); + goto out; + } + } + + if (cg_write_numeric(test_group, "memory.zswap.max", zswap_usage/2)) + goto out; + + /* + * If writeback is enabled, trying to reclaim memory now will trigger a + * writeback as zswap.max is half of what was needed when reclaim ran the first time. + * If writeback is disabled, memory reclaim will fail as zswap is limited and + * it can't writeback to swap. + */ + ret = cg_write_numeric(test_group, "memory.reclaim", memsize); + if (!wb_enabled) + ret = (ret == -EAGAIN) ? 0 : -1; + +out: + free(mem); + return ret; +} + +/* Test to verify the zswap writeback path */ +static int test_zswap_writeback(const char *root, bool wb) +{ + long zswpwb_before, zswpwb_after; + int ret = KSFT_FAIL; + char *test_group; + + test_group = cg_name(root, "zswap_writeback_test"); + if (!test_group) + goto out; + if (cg_create(test_group)) + goto out; + if (cg_write(test_group, "memory.zswap.writeback", wb ? "1" : "0")) + goto out; + + zswpwb_before = get_cg_wb_count(test_group); + if (zswpwb_before != 0) { + ksft_print_msg("zswpwb_before = %ld instead of 0\n", zswpwb_before); + goto out; + } + + if (cg_run(test_group, attempt_writeback, (void *) test_group)) + goto out; + + /* Verify that zswap writeback occurred only if writeback was enabled */ + zswpwb_after = get_cg_wb_count(test_group); + if (zswpwb_after < 0) + goto out; + + if (wb != !!zswpwb_after) { + ksft_print_msg("zswpwb_after is %ld while wb is %s", + zswpwb_after, wb ? 
"enabled" : "disabled"); + goto out; + } + + ret = KSFT_PASS; + +out: + cg_destroy(test_group); + free(test_group); + return ret; +} + +static int test_zswap_writeback_enabled(const char *root) +{ + return test_zswap_writeback(root, true); +} + +static int test_zswap_writeback_disabled(const char *root) +{ + return test_zswap_writeback(root, false); +} + /* * When trying to store a memcg page in zswap, if the memcg hits its memory * limit in zswap, writeback should affect only the zswapped pages of that @@ -257,7 +383,7 @@ static int test_no_invasive_cgroup_shrink(const char *root) { int ret = KSFT_FAIL; size_t control_allocation_size = MB(10); - char *control_allocation, *wb_group = NULL, *control_group = NULL; + char *control_allocation = NULL, *wb_group = NULL, *control_group = NULL; wb_group = setup_test_group_1M(root, "per_memcg_wb_test1"); if (!wb_group) @@ -342,7 +468,7 @@ static int test_no_kmem_bypass(const char *root) struct sysinfo sys_info; int ret = KSFT_FAIL; int child_status; - char *test_group; + char *test_group = NULL; pid_t child_pid; /* Read sys info and compute test values accordingly */ @@ -364,8 +490,6 @@ static int test_no_kmem_bypass(const char *root) trigger_allocation_size = sys_info.totalram / 20; /* Set up test memcg */ - if (cg_write(root, "cgroup.subtree_control", "+memory")) - goto out; test_group = cg_name(root, "kmem_bypass_test"); if (!test_group) goto out; @@ -425,6 +549,8 @@ struct zswap_test { T(test_zswap_usage), T(test_swapin_nozswap), T(test_zswapin), + T(test_zswap_writeback_enabled), + T(test_zswap_writeback_disabled), T(test_no_kmem_bypass), T(test_no_invasive_cgroup_shrink), }; diff --git a/tools/testing/selftests/clone3/clone3.c b/tools/testing/selftests/clone3/clone3.c index 3c9bf0cd82..e61f07973c 100644 --- a/tools/testing/selftests/clone3/clone3.c +++ b/tools/testing/selftests/clone3/clone3.c @@ -95,9 +95,14 @@ static int call_clone3(uint64_t flags, size_t size, enum test_mode test_mode) getpid(), pid); if 
(waitpid(-1, &status, __WALL) < 0) { - ksft_print_msg("Child returned %s\n", strerror(errno)); + ksft_print_msg("waitpid() returned %s\n", strerror(errno)); return -errno; } + if (!WIFEXITED(status)) { + ksft_print_msg("Child did not exit normally, status 0x%x\n", + status); + return EXIT_FAILURE; + } if (WEXITSTATUS(status)) return WEXITSTATUS(status); diff --git a/tools/testing/selftests/clone3/clone3_clear_sighand.c b/tools/testing/selftests/clone3/clone3_clear_sighand.c index 54a8b2445b..ce04267868 100644 --- a/tools/testing/selftests/clone3/clone3_clear_sighand.c +++ b/tools/testing/selftests/clone3/clone3_clear_sighand.c @@ -120,5 +120,5 @@ int main(int argc, char **argv) test_clone3_clear_sighand(); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/clone3/clone3_set_tid.c b/tools/testing/selftests/clone3/clone3_set_tid.c index ed785afb60..bfb0da2b4f 100644 --- a/tools/testing/selftests/clone3/clone3_set_tid.c +++ b/tools/testing/selftests/clone3/clone3_set_tid.c @@ -114,7 +114,8 @@ static int call_clone3_set_tid(pid_t *set_tid, return WEXITSTATUS(status); } -static void test_clone3_set_tid(pid_t *set_tid, +static void test_clone3_set_tid(const char *desc, + pid_t *set_tid, size_t set_tid_size, int flags, int expected, @@ -129,17 +130,13 @@ static void test_clone3_set_tid(pid_t *set_tid, ret = call_clone3_set_tid(set_tid, set_tid_size, flags, expected_pid, wait_for_it); ksft_print_msg( - "[%d] clone3() with CLONE_SET_TID %d says :%d - expected %d\n", + "[%d] clone3() with CLONE_SET_TID %d says: %d - expected %d\n", getpid(), set_tid[0], ret, expected); - if (ret != expected) - ksft_test_result_fail( - "[%d] Result (%d) is different than expected (%d)\n", - getpid(), ret, expected); - else - ksft_test_result_pass( - "[%d] Result (%d) matches expectation (%d)\n", - getpid(), ret, expected); + + ksft_test_result(ret == expected, "%s with %zu TIDs and flags 0x%x\n", + desc, set_tid_size, flags); } + int main(int argc, char 
*argv[]) { FILE *f; @@ -172,73 +169,91 @@ int main(int argc, char *argv[]) /* Try invalid settings */ memset(&set_tid, 0, sizeof(set_tid)); - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL + 1, 0, -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, 0 TID", + set_tid, MAX_PID_NS_LEVEL + 1, 0, -EINVAL, 0, 0); - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 2, 0, -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, 0 TID", + set_tid, MAX_PID_NS_LEVEL * 2, 0, -EINVAL, 0, 0); - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 2 + 1, 0, - -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, 0 TID", + set_tid, MAX_PID_NS_LEVEL * 2 + 1, 0, + -EINVAL, 0, 0); - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 42, 0, -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, 0 TID", + set_tid, MAX_PID_NS_LEVEL * 42, 0, -EINVAL, 0, 0); /* * This can actually work if this test running in a MAX_PID_NS_LEVEL - 1 * nested PID namespace. */ - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL - 1, 0, -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, 0 TID", + set_tid, MAX_PID_NS_LEVEL - 1, 0, -EINVAL, 0, 0); memset(&set_tid, 0xff, sizeof(set_tid)); - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL + 1, 0, -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, TID all 1s", + set_tid, MAX_PID_NS_LEVEL + 1, 0, -EINVAL, 0, 0); - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 2, 0, -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, TID all 1s", + set_tid, MAX_PID_NS_LEVEL * 2, 0, -EINVAL, 0, 0); - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 2 + 1, 0, - -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, TID all 1s", + set_tid, MAX_PID_NS_LEVEL * 2 + 1, 0, + -EINVAL, 0, 0); - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 42, 0, -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, TID all 1s", + set_tid, MAX_PID_NS_LEVEL * 42, 0, -EINVAL, 0, 0); /* * This can actually work if this test running in a MAX_PID_NS_LEVEL - 1 * nested PID namespace. 
*/ - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL - 1, 0, -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, TID all 1s", + set_tid, MAX_PID_NS_LEVEL - 1, 0, -EINVAL, 0, 0); memset(&set_tid, 0, sizeof(set_tid)); /* Try with an invalid PID */ set_tid[0] = 0; - test_clone3_set_tid(set_tid, 1, 0, -EINVAL, 0, 0); + test_clone3_set_tid("valid size, 0 TID", + set_tid, 1, 0, -EINVAL, 0, 0); set_tid[0] = -1; - test_clone3_set_tid(set_tid, 1, 0, -EINVAL, 0, 0); + test_clone3_set_tid("valid size, -1 TID", + set_tid, 1, 0, -EINVAL, 0, 0); /* Claim that the set_tid array actually contains 2 elements. */ - test_clone3_set_tid(set_tid, 2, 0, -EINVAL, 0, 0); + test_clone3_set_tid("2 TIDs, -1 and 0", + set_tid, 2, 0, -EINVAL, 0, 0); /* Try it in a new PID namespace */ if (uid == 0) - test_clone3_set_tid(set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0); + test_clone3_set_tid("valid size, -1 TID", + set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0); else ksft_test_result_skip("Clone3() with set_tid requires root\n"); /* Try with a valid PID (1) this should return -EEXIST. 
*/ set_tid[0] = 1; if (uid == 0) - test_clone3_set_tid(set_tid, 1, 0, -EEXIST, 0, 0); + test_clone3_set_tid("duplicate PID 1", + set_tid, 1, 0, -EEXIST, 0, 0); else ksft_test_result_skip("Clone3() with set_tid requires root\n"); /* Try it in a new PID namespace */ if (uid == 0) - test_clone3_set_tid(set_tid, 1, CLONE_NEWPID, 0, 0, 0); + test_clone3_set_tid("duplicate PID 1", + set_tid, 1, CLONE_NEWPID, 0, 0, 0); else ksft_test_result_skip("Clone3() with set_tid requires root\n"); /* pid_max should fail everywhere */ set_tid[0] = pid_max; - test_clone3_set_tid(set_tid, 1, 0, -EINVAL, 0, 0); + test_clone3_set_tid("set TID to maximum", + set_tid, 1, 0, -EINVAL, 0, 0); if (uid == 0) - test_clone3_set_tid(set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0); + test_clone3_set_tid("set TID to maximum", + set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0); else ksft_test_result_skip("Clone3() with set_tid requires root\n"); @@ -262,10 +277,12 @@ int main(int argc, char *argv[]) /* After the child has finished, its PID should be free. 
*/ set_tid[0] = pid; - test_clone3_set_tid(set_tid, 1, 0, 0, 0, 0); + test_clone3_set_tid("reallocate child TID", + set_tid, 1, 0, 0, 0, 0); /* This should fail as there is no PID 1 in that namespace */ - test_clone3_set_tid(set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0); + test_clone3_set_tid("duplicate child TID", + set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0); /* * Creating a process with PID 1 in the newly created most nested @@ -274,7 +291,8 @@ int main(int argc, char *argv[]) */ set_tid[0] = 1; set_tid[1] = pid; - test_clone3_set_tid(set_tid, 2, CLONE_NEWPID, 0, pid, 0); + test_clone3_set_tid("create PID 1 in new NS", + set_tid, 2, CLONE_NEWPID, 0, pid, 0); ksft_print_msg("unshare PID namespace\n"); if (unshare(CLONE_NEWPID) == -1) @@ -284,7 +302,8 @@ int main(int argc, char *argv[]) set_tid[0] = pid; /* This should fail as there is no PID 1 in that namespace */ - test_clone3_set_tid(set_tid, 1, 0, -EINVAL, 0, 0); + test_clone3_set_tid("duplicate PID 1", + set_tid, 1, 0, -EINVAL, 0, 0); /* Let's create a PID 1 */ ns_pid = fork(); @@ -295,21 +314,25 @@ int main(int argc, char *argv[]) */ set_tid[0] = 43; set_tid[1] = -1; - test_clone3_set_tid(set_tid, 2, 0, -EINVAL, 0, 0); + test_clone3_set_tid("check leak on invalid TID -1", + set_tid, 2, 0, -EINVAL, 0, 0); set_tid[0] = 43; set_tid[1] = pid; - test_clone3_set_tid(set_tid, 2, 0, 0, 43, 0); + test_clone3_set_tid("check leak on invalid specific TID", + set_tid, 2, 0, 0, 43, 0); ksft_print_msg("Child in PID namespace has PID %d\n", getpid()); set_tid[0] = 2; - test_clone3_set_tid(set_tid, 1, 0, 0, 2, 0); + test_clone3_set_tid("create PID 2 in child NS", + set_tid, 1, 0, 0, 2, 0); set_tid[0] = 1; set_tid[1] = -1; set_tid[2] = pid; /* This should fail as there is invalid PID at level '1'. 
*/ - test_clone3_set_tid(set_tid, 3, CLONE_NEWPID, -EINVAL, 0, 0); + test_clone3_set_tid("fail due to invalid TID at level 1", + set_tid, 3, CLONE_NEWPID, -EINVAL, 0, 0); set_tid[0] = 1; set_tid[1] = 42; @@ -319,13 +342,15 @@ int main(int argc, char *argv[]) * namespaces. Again assuming this is running in the host's * PID namespace. Not yet nested. */ - test_clone3_set_tid(set_tid, 4, CLONE_NEWPID, -EINVAL, 0, 0); + test_clone3_set_tid("fail due to too few active PID NSs", + set_tid, 4, CLONE_NEWPID, -EINVAL, 0, 0); /* * This should work and from the parent we should see * something like 'NSpid: pid 42 1'. */ - test_clone3_set_tid(set_tid, 3, CLONE_NEWPID, 0, 42, true); + test_clone3_set_tid("verify that we have 3 PID NSs", + set_tid, 3, CLONE_NEWPID, 0, 42, true); child_exit(ksft_cnt.ksft_fail); } @@ -380,16 +405,14 @@ int main(int argc, char *argv[]) ksft_cnt.ksft_pass += 6 - (ksft_cnt.ksft_fail - WEXITSTATUS(status)); ksft_cnt.ksft_fail = WEXITSTATUS(status); - if (ns3 == pid && ns2 == 42 && ns1 == 1) - ksft_test_result_pass( - "PIDs in all namespaces as expected (%d,%d,%d)\n", - ns3, ns2, ns1); - else - ksft_test_result_fail( - "PIDs in all namespaces not as expected (%d,%d,%d)\n", - ns3, ns2, ns1); + ksft_print_msg("Expecting PIDs %d, 42, 1\n", pid); + ksft_print_msg("Have PIDs in namespaces: %d, %d, %d\n", ns3, ns2, ns1); + ksft_test_result(ns3 == pid && ns2 == 42 && ns1 == 1, + "PIDs in all namespaces as expected\n"); out: ret = 0; - return !ret ? 
ksft_exit_pass() : ksft_exit_fail(); + if (ret) + ksft_exit_fail(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c index c59e4adb90..991c473e38 100644 --- a/tools/testing/selftests/core/close_range_test.c +++ b/tools/testing/selftests/core/close_range_test.c @@ -17,6 +17,15 @@ #include "../kselftest_harness.h" #include "../clone3/clone3_selftests.h" + +#ifndef F_LINUX_SPECIFIC_BASE +#define F_LINUX_SPECIFIC_BASE 1024 +#endif + +#ifndef F_DUPFD_QUERY +#define F_DUPFD_QUERY (F_LINUX_SPECIFIC_BASE + 3) +#endif + static inline int sys_close_range(unsigned int fd, unsigned int max_fd, unsigned int flags) { @@ -45,6 +54,15 @@ TEST(core_close_range) SKIP(return, "close_range() syscall not supported"); } + for (i = 0; i < 100; i++) { + ret = fcntl(open_fds[i], F_DUPFD_QUERY, open_fds[i + 1]); + if (ret < 0) { + EXPECT_EQ(errno, EINVAL); + } else { + EXPECT_EQ(ret, 0); + } + } + EXPECT_EQ(0, sys_close_range(open_fds[0], open_fds[50], 0)); for (i = 0; i <= 50; i++) @@ -358,7 +376,7 @@ TEST(close_range_cloexec_unshare) */ TEST(close_range_cloexec_syzbot) { - int fd1, fd2, fd3, flags, ret, status; + int fd1, fd2, fd3, fd4, flags, ret, status; pid_t pid; struct __clone_args args = { .flags = CLONE_FILES, @@ -372,6 +390,13 @@ TEST(close_range_cloexec_syzbot) fd2 = dup2(fd1, 1000); EXPECT_GT(fd2, 0); + flags = fcntl(fd1, F_DUPFD_QUERY, fd2); + if (flags < 0) { + EXPECT_EQ(errno, EINVAL); + } else { + EXPECT_EQ(flags, 1); + } + pid = sys_clone3(&args, sizeof(args)); ASSERT_GE(pid, 0); @@ -396,6 +421,15 @@ TEST(close_range_cloexec_syzbot) fd3 = dup2(fd1, 42); EXPECT_GT(fd3, 0); + flags = fcntl(fd1, F_DUPFD_QUERY, fd3); + if (flags < 0) { + EXPECT_EQ(errno, EINVAL); + } else { + EXPECT_EQ(flags, 1); + } + + + /* * Duplicating the file descriptor must remove the * FD_CLOEXEC flag. 
@@ -426,6 +460,24 @@ TEST(close_range_cloexec_syzbot) fd3 = dup2(fd1, 42); EXPECT_GT(fd3, 0); + flags = fcntl(fd1, F_DUPFD_QUERY, fd3); + if (flags < 0) { + EXPECT_EQ(errno, EINVAL); + } else { + EXPECT_EQ(flags, 1); + } + + fd4 = open("/dev/null", O_RDWR); + EXPECT_GT(fd4, 0); + + /* Same inode, different file pointers. */ + flags = fcntl(fd1, F_DUPFD_QUERY, fd4); + if (flags < 0) { + EXPECT_EQ(errno, EINVAL); + } else { + EXPECT_EQ(flags, 0); + } + flags = fcntl(fd3, F_GETFD); EXPECT_GT(flags, -1); EXPECT_EQ(flags & FD_CLOEXEC, 0); @@ -433,6 +485,7 @@ TEST(close_range_cloexec_syzbot) EXPECT_EQ(close(fd1), 0); EXPECT_EQ(close(fd2), 0); EXPECT_EQ(close(fd3), 0); + EXPECT_EQ(close(fd4), 0); } /* diff --git a/tools/testing/selftests/cpufreq/cpufreq.sh b/tools/testing/selftests/cpufreq/cpufreq.sh index b583a2fb45..a8b1dbc0a3 100755 --- a/tools/testing/selftests/cpufreq/cpufreq.sh +++ b/tools/testing/selftests/cpufreq/cpufreq.sh @@ -178,8 +178,7 @@ cpufreq_basic_tests() count=$(count_cpufreq_managed_cpus) if [ $count = 0 ]; then - printf "No cpu is managed by cpufreq core, exiting\n" - exit; + ktap_exit_fail_msg "No cpu is managed by cpufreq core, exiting\n" else printf "CPUFreq manages: $count CPUs\n\n" fi diff --git a/tools/testing/selftests/cpufreq/main.sh b/tools/testing/selftests/cpufreq/main.sh index 60ce18ed06..a0eb84cf71 100755 --- a/tools/testing/selftests/cpufreq/main.sh +++ b/tools/testing/selftests/cpufreq/main.sh @@ -7,15 +7,15 @@ source governor.sh source module.sh source special-tests.sh +DIR="$(dirname $(readlink -f "$0"))" +source "${DIR}"/../kselftest/ktap_helpers.sh + FUNC=basic # do basic tests by default OUTFILE=cpufreq_selftest SYSFS= CPUROOT= CPUFREQROOT= -# Kselftest framework requirement - SKIP code is 4. 
-ksft_skip=4 - helpme() { printf "Usage: $0 [-h] [-todg args] @@ -32,7 +32,7 @@ helpme() [-d \"] [-g \"] \n" - exit 2 + exit "${KSFT_FAIL}" } prerequisite() @@ -40,8 +40,8 @@ prerequisite() msg="skip all tests:" if [ $UID != 0 ]; then - echo $msg must be run as root >&2 - exit $ksft_skip + ktap_skip_all "$msg must be run as root" + exit "${KSFT_SKIP}" fi taskset -p 01 $$ @@ -49,21 +49,21 @@ prerequisite() SYSFS=`mount -t sysfs | head -1 | awk '{ print $3 }'` if [ ! -d "$SYSFS" ]; then - echo $msg sysfs is not mounted >&2 - exit 2 + ktap_skip_all "$msg sysfs is not mounted" + exit "${KSFT_SKIP}" fi CPUROOT=$SYSFS/devices/system/cpu CPUFREQROOT="$CPUROOT/cpufreq" if ! ls $CPUROOT/cpu* > /dev/null 2>&1; then - echo $msg cpus not available in sysfs >&2 - exit 2 + ktap_skip_all "$msg cpus not available in sysfs" + exit "${KSFT_SKIP}" fi if ! ls $CPUROOT/cpufreq > /dev/null 2>&1; then - echo $msg cpufreq directory not available in sysfs >&2 - exit 2 + ktap_skip_all "$msg cpufreq directory not available in sysfs" + exit "${KSFT_SKIP}" fi } @@ -105,8 +105,7 @@ do_test() count=$(count_cpufreq_managed_cpus) if [ $count = 0 -a $FUNC != "modtest" ]; then - echo "No cpu is managed by cpufreq core, exiting" - exit 2; + ktap_exit_fail_msg "No cpu is managed by cpufreq core, exiting" fi case "$FUNC" in @@ -125,8 +124,7 @@ do_test() "modtest") # Do we have modules in place? 
if [ -z $DRIVER_MOD ] && [ -z $GOVERNOR_MOD ]; then - echo "No driver or governor module passed with -d or -g" - exit 2; + ktap_exit_fail_msg "No driver or governor module passed with -d or -g" fi if [ $DRIVER_MOD ]; then @@ -137,8 +135,7 @@ do_test() fi else if [ $count = 0 ]; then - echo "No cpu is managed by cpufreq core, exiting" - exit 2; + ktap_exit_fail_msg "No cpu is managed by cpufreq core, exiting" fi module_governor_test $GOVERNOR_MOD @@ -162,7 +159,7 @@ do_test() ;; *) - echo "Invalid [-f] function type" + ktap_print_msg "Invalid [-f] function type" helpme ;; esac @@ -186,13 +183,25 @@ dmesg_dumps() dmesg >> $1.dmesg_full.txt } +ktap_print_header + # Parse arguments parse_arguments $@ +ktap_set_plan 1 + # Make sure all requirements are met prerequisite # Run requested functions clear_dumps $OUTFILE do_test | tee -a $OUTFILE.txt +if [ "${PIPESTATUS[0]}" -ne 0 ]; then + exit ${PIPESTATUS[0]}; +fi dmesg_dumps $OUTFILE + +ktap_test_pass "Completed successfully" + +ktap_print_totals +exit "${KSFT_PASS}" diff --git a/tools/testing/selftests/cpufreq/module.sh b/tools/testing/selftests/cpufreq/module.sh index 22563cd122..7f2667e0ae 100755 --- a/tools/testing/selftests/cpufreq/module.sh +++ b/tools/testing/selftests/cpufreq/module.sh @@ -24,16 +24,14 @@ test_basic_insmod_rmmod() # insert module insmod $1 if [ $? != 0 ]; then - printf "Insmod $1 failed\n" - exit; + ktap_exit_fail_msg "Insmod $1 failed\n" fi printf "Removing $1 module\n" # remove module rmmod $1 if [ $? 
!= 0 ]; then - printf "rmmod $1 failed\n" - exit; + ktap_exit_fail_msg "rmmod $1 failed\n" fi printf "\n" diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 789d6949c2..29a22f50e7 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -7,16 +7,21 @@ TEST_GEN_FILES += debugfs_target_ids_pid_leak TEST_GEN_FILES += access_memory TEST_FILES = _chk_dependency.sh _debugfs_common.sh + +# functionality tests TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh +TEST_PROGS += sysfs.sh +TEST_PROGS += sysfs_update_schemes_tried_regions_wss_estimation.py +TEST_PROGS += damos_quota.py damos_quota_goal.py damos_apply_interval.py +TEST_PROGS += reclaim.sh lru_sort.sh + +# regression tests (reproducers of previously found bugs) TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh TEST_PROGS += debugfs_duplicate_context_creation.sh TEST_PROGS += debugfs_rm_non_contexts.sh TEST_PROGS += debugfs_target_ids_read_before_terminate_race.sh TEST_PROGS += debugfs_target_ids_pid_leak.sh -TEST_PROGS += sysfs.sh sysfs_update_removed_scheme_dir.sh +TEST_PROGS += sysfs_update_removed_scheme_dir.sh TEST_PROGS += sysfs_update_schemes_tried_regions_hang.py -TEST_PROGS += sysfs_update_schemes_tried_regions_wss_estimation.py -TEST_PROGS += damos_quota.py damos_apply_interval.py -TEST_PROGS += reclaim.sh lru_sort.sh include ../lib.mk diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index fe77d7e73a..2bd44c32be 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -2,7 +2,18 @@ import os -sysfs_root = '/sys/kernel/mm/damon/admin' +ksft_skip=4 + +sysfs_root = None +with open('/proc/mounts', 'r') as f: + for line in f: + dev_name, mount_point, dev_fs = line.split()[:3] + if dev_fs == 'sysfs': + sysfs_root = '%s/kernel/mm/damon/admin' % mount_point + break +if 
sysfs_root is None: + print('Seems sysfs not mounted?') + exit(ksft_skip) def write_file(path, string): "Returns error string if failed, or None otherwise" @@ -34,11 +45,11 @@ class DamosAccessPattern: self.nr_accesses = nr_accesses self.age = age - if self.size == None: + if self.size is None: self.size = [0, 2**64 - 1] - if self.nr_accesses == None: + if self.nr_accesses is None: self.nr_accesses = [0, 2**64 - 1] - if self.age == None: + if self.age is None: self.age = [0, 2**64 - 1] def sysfs_dir(self): @@ -47,55 +58,109 @@ class DamosAccessPattern: def stage(self): err = write_file( os.path.join(self.sysfs_dir(), 'sz', 'min'), self.size[0]) - if err != None: + if err is not None: return err err = write_file( os.path.join(self.sysfs_dir(), 'sz', 'max'), self.size[1]) - if err != None: + if err is not None: return err err = write_file(os.path.join(self.sysfs_dir(), 'nr_accesses', 'min'), self.nr_accesses[0]) - if err != None: + if err is not None: return err err = write_file(os.path.join(self.sysfs_dir(), 'nr_accesses', 'max'), self.nr_accesses[1]) - if err != None: + if err is not None: return err err = write_file( os.path.join(self.sysfs_dir(), 'age', 'min'), self.age[0]) - if err != None: + if err is not None: return err err = write_file( os.path.join(self.sysfs_dir(), 'age', 'max'), self.age[1]) - if err != None: + if err is not None: return err +qgoal_metric_user_input = 'user_input' +qgoal_metric_some_mem_psi_us = 'some_mem_psi_us' +qgoal_metrics = [qgoal_metric_user_input, qgoal_metric_some_mem_psi_us] + +class DamosQuotaGoal: + metric = None + target_value = None + current_value = None + effective_bytes = None + quota = None # owner quota + idx = None + + def __init__(self, metric, target_value=10000, current_value=0): + self.metric = metric + self.target_value = target_value + self.current_value = current_value + + def sysfs_dir(self): + return os.path.join(self.quota.sysfs_dir(), 'goals', '%d' % self.idx) + + def stage(self): + err = 
write_file(os.path.join(self.sysfs_dir(), 'target_metric'), + self.metric) + if err is not None: + return err + err = write_file(os.path.join(self.sysfs_dir(), 'target_value'), + self.target_value) + if err is not None: + return err + err = write_file(os.path.join(self.sysfs_dir(), 'current_value'), + self.current_value) + if err is not None: + return err + return None + class DamosQuota: sz = None # size quota, in bytes ms = None # time quota + goals = None # quota goals reset_interval_ms = None # quota reset interval scheme = None # owner scheme - def __init__(self, sz=0, ms=0, reset_interval_ms=0): + def __init__(self, sz=0, ms=0, goals=None, reset_interval_ms=0): self.sz = sz self.ms = ms self.reset_interval_ms = reset_interval_ms + self.goals = goals if goals is not None else [] + for idx, goal in enumerate(self.goals): + goal.idx = idx + goal.quota = self def sysfs_dir(self): return os.path.join(self.scheme.sysfs_dir(), 'quotas') def stage(self): err = write_file(os.path.join(self.sysfs_dir(), 'bytes'), self.sz) - if err != None: + if err is not None: return err err = write_file(os.path.join(self.sysfs_dir(), 'ms'), self.ms) - if err != None: + if err is not None: return err err = write_file(os.path.join(self.sysfs_dir(), 'reset_interval_ms'), self.reset_interval_ms) - if err != None: + if err is not None: return err + nr_goals_file = os.path.join(self.sysfs_dir(), 'goals', 'nr_goals') + content, err = read_file(nr_goals_file) + if err is not None: + return err + if int(content) != len(self.goals): + err = write_file(nr_goals_file, len(self.goals)) + if err is not None: + return err + for goal in self.goals: + err = goal.stage() + if err is not None: + return err + return None + class DamosStats: nr_tried = None sz_tried = None @@ -136,30 +201,30 @@ class Damos: def stage(self): err = write_file(os.path.join(self.sysfs_dir(), 'action'), self.action) - if err != None: + if err is not None: return err err = self.access_pattern.stage() - if err != None: + if err 
is not None: return err err = write_file(os.path.join(self.sysfs_dir(), 'apply_interval_us'), '%d' % self.apply_interval_us) - if err != None: + if err is not None: return err err = self.quota.stage() - if err != None: + if err is not None: return err # disable watermarks err = write_file( os.path.join(self.sysfs_dir(), 'watermarks', 'metric'), 'none') - if err != None: + if err is not None: return err # disable filters err = write_file( os.path.join(self.sysfs_dir(), 'filters', 'nr_filters'), '0') - if err != None: + if err is not None: return err class DamonTarget: @@ -178,7 +243,7 @@ class DamonTarget: def stage(self): err = write_file( os.path.join(self.sysfs_dir(), 'regions', 'nr_regions'), '0') - if err != None: + if err is not None: return err return write_file( os.path.join(self.sysfs_dir(), 'pid_target'), self.pid) @@ -210,27 +275,27 @@ class DamonAttrs: def stage(self): err = write_file(os.path.join(self.interval_sysfs_dir(), 'sample_us'), self.sample_us) - if err != None: + if err is not None: return err err = write_file(os.path.join(self.interval_sysfs_dir(), 'aggr_us'), self.aggr_us) - if err != None: + if err is not None: return err err = write_file(os.path.join(self.interval_sysfs_dir(), 'update_us'), self.update_us) - if err != None: + if err is not None: return err err = write_file( os.path.join(self.nr_regions_range_sysfs_dir(), 'min'), self.min_nr_regions) - if err != None: + if err is not None: return err err = write_file( os.path.join(self.nr_regions_range_sysfs_dir(), 'max'), self.max_nr_regions) - if err != None: + if err is not None: return err class DamonCtx: @@ -264,24 +329,24 @@ class DamonCtx: def stage(self): err = write_file( os.path.join(self.sysfs_dir(), 'operations'), self.ops) - if err != None: + if err is not None: return err err = self.monitoring_attrs.stage() - if err != None: + if err is not None: return err nr_targets_file = os.path.join( self.sysfs_dir(), 'targets', 'nr_targets') content, err = read_file(nr_targets_file) - if 
err != None: + if err is not None: return err if int(content) != len(self.targets): err = write_file(nr_targets_file, '%d' % len(self.targets)) - if err != None: + if err is not None: return err for target in self.targets: err = target.stage() - if err != None: + if err is not None: return err nr_schemes_file = os.path.join( @@ -291,11 +356,11 @@ class DamonCtx: return err if int(content) != len(self.schemes): err = write_file(nr_schemes_file, '%d' % len(self.schemes)) - if err != None: + if err is not None: return err for scheme in self.schemes: err = scheme.stage() - if err != None: + if err is not None: return err return None @@ -319,16 +384,16 @@ class Kdamond: nr_contexts_file = os.path.join(self.sysfs_dir(), 'contexts', 'nr_contexts') content, err = read_file(nr_contexts_file) - if err != None: + if err is not None: return err if int(content) != len(self.contexts): err = write_file(nr_contexts_file, '%d' % len(self.contexts)) - if err != None: + if err is not None: return err for context in self.contexts: err = context.stage() - if err != None: + if err is not None: return err err = write_file(os.path.join(self.sysfs_dir(), 'state'), 'on') return err @@ -336,20 +401,20 @@ class Kdamond: def update_schemes_tried_bytes(self): err = write_file(os.path.join(self.sysfs_dir(), 'state'), 'update_schemes_tried_bytes') - if err != None: + if err is not None: return err for context in self.contexts: for scheme in context.schemes: content, err = read_file(os.path.join(scheme.sysfs_dir(), 'tried_regions', 'total_bytes')) - if err != None: + if err is not None: return err scheme.tried_bytes = int(content) def update_schemes_stats(self): err = write_file(os.path.join(self.sysfs_dir(), 'state'), 'update_schemes_stats') - if err != None: + if err is not None: return err for context in self.contexts: for scheme in context.schemes: @@ -358,11 +423,39 @@ class Kdamond: 'sz_applied', 'qt_exceeds']: content, err = read_file( os.path.join(scheme.sysfs_dir(), 'stats', stat)) - if 
err != None: + if err is not None: return err stat_values.append(int(content)) scheme.stats = DamosStats(*stat_values) + def update_schemes_effective_quotas(self): + err = write_file(os.path.join(self.sysfs_dir(), 'state'), + 'update_schemes_effective_quotas') + if err is not None: + return err + for context in self.contexts: + for scheme in context.schemes: + for goal in scheme.quota.goals: + content, err = read_file( + os.path.join(scheme.quota.sysfs_dir(), + 'effective_bytes')) + if err is not None: + return err + goal.effective_bytes = int(content) + return None + + def commit_schemes_quota_goals(self): + for context in self.contexts: + for scheme in context.schemes: + for goal in scheme.quota.goals: + err = goal.stage() + if err is not None: + print('commit_schemes_quota_goals failed stagign: %s'% + err) + exit(1) + return write_file(os.path.join(self.sysfs_dir(), 'state'), + 'commit_schemes_quota_goals') + class Kdamonds: kdamonds = [] @@ -378,10 +471,10 @@ class Kdamonds: def start(self): err = write_file(os.path.join(self.sysfs_dir(), 'nr_kdamonds'), '%s' % len(self.kdamonds)) - if err != None: + if err is not None: return err for kdamond in self.kdamonds: err = kdamond.start() - if err != None: + if err is not None: return err return None diff --git a/tools/testing/selftests/damon/access_memory.c b/tools/testing/selftests/damon/access_memory.c index 585a2fa543..56b17e8fe1 100644 --- a/tools/testing/selftests/damon/access_memory.c +++ b/tools/testing/selftests/damon/access_memory.c @@ -35,7 +35,7 @@ int main(int argc, char *argv[]) start_clock = clock(); while ((clock() - start_clock) * 1000 / CLOCKS_PER_SEC < access_time_ms) - memset(regions[i], i, 1024 * 1024 * 10); + memset(regions[i], i, sz_region); } return 0; } diff --git a/tools/testing/selftests/damon/damos_quota_goal.py b/tools/testing/selftests/damon/damos_quota_goal.py new file mode 100644 index 0000000000..18246f3b62 --- /dev/null +++ b/tools/testing/selftests/damon/damos_quota_goal.py @@ -0,0 
+1,77 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import subprocess +import time + +import _damon_sysfs + +def main(): + # access two 10 MiB memory regions, 2 second per each + sz_region = 10 * 1024 * 1024 + proc = subprocess.Popen(['./access_memory', '2', '%d' % sz_region, '2000']) + + goal = _damon_sysfs.DamosQuotaGoal( + metric=_damon_sysfs.qgoal_metric_user_input, target_value=10000) + kdamonds = _damon_sysfs.Kdamonds([_damon_sysfs.Kdamond( + contexts=[_damon_sysfs.DamonCtx( + ops='vaddr', + targets=[_damon_sysfs.DamonTarget(pid=proc.pid)], + schemes=[_damon_sysfs.Damos( + action='stat', + quota=_damon_sysfs.DamosQuota( + goals=[goal], reset_interval_ms=100), + )] # schemes + )] # contexts + )]) # kdamonds + + err = kdamonds.start() + if err != None: + print('kdamond start failed: %s' % err) + exit(1) + + score_values_to_test = [0, 15000, 5000, 18000] + while proc.poll() == None: + if len(score_values_to_test) == 0: + time.sleep(0.1) + continue + + goal.current_value = score_values_to_test.pop(0) + expect_increase = goal.current_value < goal.target_value + + err = kdamonds.kdamonds[0].commit_schemes_quota_goals() + if err is not None: + print('commit_schemes_quota_goals failed: %s' % err) + exit(1) + + err = kdamonds.kdamonds[0].update_schemes_effective_quotas() + if err is not None: + print('before-update_schemes_effective_quotas failed: %s' % err) + exit(1) + last_effective_bytes = goal.effective_bytes + + time.sleep(0.5) + + err = kdamonds.kdamonds[0].update_schemes_effective_quotas() + if err is not None: + print('after-update_schemes_effective_quotas failed: %s' % err) + exit(1) + + print('score: %s, effective quota: %d -> %d (%.3fx)' % ( + goal.current_value, last_effective_bytes, goal.effective_bytes, + goal.effective_bytes / last_effective_bytes + if last_effective_bytes != 0 else -1.0)) + + if last_effective_bytes == goal.effective_bytes: + print('efective bytes not changed: %d' % goal.effective_bytes) + exit(1) + + increased = 
last_effective_bytes < goal.effective_bytes + if expect_increase != increased: + print('expectation of increase (%s) != increased (%s)' % + (expect_increase, increased)) + exit(1) + last_effective_bytes = goal.effective_bytes + +if __name__ == '__main__': + main() diff --git a/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c b/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c index 890a8236a8..5f54152236 100644 --- a/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c +++ b/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c @@ -15,6 +15,7 @@ #include #include #include +#include "../kselftest.h" #define DEVPATH "/dev/dma_heap" @@ -90,14 +91,13 @@ static int dmabuf_heap_open(char *name) char buf[256]; ret = snprintf(buf, 256, "%s/%s", DEVPATH, name); - if (ret < 0) { - printf("snprintf failed!\n"); - return ret; - } + if (ret < 0) + ksft_exit_fail_msg("snprintf failed! %d\n", ret); fd = open(buf, O_RDWR); if (fd < 0) - printf("open %s failed!\n", buf); + ksft_exit_fail_msg("open %s failed: %s\n", buf, strerror(errno)); + return fd; } @@ -140,7 +140,7 @@ static int dmabuf_sync(int fd, int start_stop) #define ONE_MEG (1024 * 1024) -static int test_alloc_and_import(char *heap_name) +static void test_alloc_and_import(char *heap_name) { int heap_fd = -1, dmabuf_fd = -1, importer_fd = -1; uint32_t handle = 0; @@ -148,27 +148,19 @@ static int test_alloc_and_import(char *heap_name) int ret; heap_fd = dmabuf_heap_open(heap_name); - if (heap_fd < 0) - return -1; - printf(" Testing allocation and importing: "); + ksft_print_msg("Testing allocation and importing:\n"); ret = dmabuf_heap_alloc(heap_fd, ONE_MEG, 0, &dmabuf_fd); if (ret) { - printf("FAIL (Allocation Failed!)\n"); - ret = -1; - goto out; + ksft_test_result_fail("FAIL (Allocation Failed!) 
%d\n", ret); + return; } + /* mmap and write a simple pattern */ - p = mmap(NULL, - ONE_MEG, - PROT_READ | PROT_WRITE, - MAP_SHARED, - dmabuf_fd, - 0); + p = mmap(NULL, ONE_MEG, PROT_READ | PROT_WRITE, MAP_SHARED, dmabuf_fd, 0); if (p == MAP_FAILED) { - printf("FAIL (mmap() failed)\n"); - ret = -1; - goto out; + ksft_test_result_fail("FAIL (mmap() failed): %s\n", strerror(errno)); + goto close_and_return; } dmabuf_sync(dmabuf_fd, DMA_BUF_SYNC_START); @@ -178,71 +170,64 @@ static int test_alloc_and_import(char *heap_name) importer_fd = open_vgem(); if (importer_fd < 0) { - ret = importer_fd; - printf("(Could not open vgem - skipping): "); + ksft_test_result_skip("Could not open vgem %d\n", importer_fd); } else { ret = import_vgem_fd(importer_fd, dmabuf_fd, &handle); - if (ret < 0) { - printf("FAIL (Failed to import buffer)\n"); - goto out; - } + ksft_test_result(ret >= 0, "Import buffer %d\n", ret); } ret = dmabuf_sync(dmabuf_fd, DMA_BUF_SYNC_START); if (ret < 0) { - printf("FAIL (DMA_BUF_SYNC_START failed!)\n"); + ksft_print_msg("FAIL (DMA_BUF_SYNC_START failed!) %d\n", ret); goto out; } memset(p, 0xff, ONE_MEG); ret = dmabuf_sync(dmabuf_fd, DMA_BUF_SYNC_END); if (ret < 0) { - printf("FAIL (DMA_BUF_SYNC_END failed!)\n"); + ksft_print_msg("FAIL (DMA_BUF_SYNC_END failed!) 
%d\n", ret); goto out; } close_handle(importer_fd, handle); - ret = 0; - printf(" OK\n"); + ksft_test_result_pass("%s dmabuf sync succeeded\n", __func__); + return; + out: - if (p) - munmap(p, ONE_MEG); - if (importer_fd >= 0) - close(importer_fd); - if (dmabuf_fd >= 0) - close(dmabuf_fd); - if (heap_fd >= 0) - close(heap_fd); + ksft_test_result_fail("%s dmabuf sync failed\n", __func__); + munmap(p, ONE_MEG); + close(importer_fd); - return ret; +close_and_return: + close(dmabuf_fd); + close(heap_fd); } -static int test_alloc_zeroed(char *heap_name, size_t size) +static void test_alloc_zeroed(char *heap_name, size_t size) { int heap_fd = -1, dmabuf_fd[32]; - int i, j, ret; + int i, j, k, ret; void *p = NULL; char *c; - printf(" Testing alloced %ldk buffers are zeroed: ", size / 1024); + ksft_print_msg("Testing alloced %ldk buffers are zeroed:\n", size / 1024); heap_fd = dmabuf_heap_open(heap_name); - if (heap_fd < 0) - return -1; /* Allocate and fill a bunch of buffers */ for (i = 0; i < 32; i++) { ret = dmabuf_heap_alloc(heap_fd, size, 0, &dmabuf_fd[i]); - if (ret < 0) { - printf("FAIL (Allocation (%i) failed)\n", i); - goto out; + if (ret) { + ksft_test_result_fail("FAIL (Allocation (%i) failed) %d\n", i, ret); + goto close_and_return; } + /* mmap and fill with simple pattern */ p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, dmabuf_fd[i], 0); if (p == MAP_FAILED) { - printf("FAIL (mmap() failed!)\n"); - ret = -1; - goto out; + ksft_test_result_fail("FAIL (mmap() failed!): %s\n", strerror(errno)); + goto close_and_return; } + dmabuf_sync(dmabuf_fd[i], DMA_BUF_SYNC_START); memset(p, 0xff, size); dmabuf_sync(dmabuf_fd[i], DMA_BUF_SYNC_END); @@ -251,48 +236,47 @@ static int test_alloc_zeroed(char *heap_name, size_t size) /* close them all */ for (i = 0; i < 32; i++) close(dmabuf_fd[i]); + ksft_test_result_pass("Allocate and fill a bunch of buffers\n"); /* Allocate and validate all buffers are zeroed */ for (i = 0; i < 32; i++) { ret = 
dmabuf_heap_alloc(heap_fd, size, 0, &dmabuf_fd[i]); if (ret < 0) { - printf("FAIL (Allocation (%i) failed)\n", i); - goto out; + ksft_test_result_fail("FAIL (Allocation (%i) failed) %d\n", i, ret); + goto close_and_return; } /* mmap and validate everything is zero */ p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, dmabuf_fd[i], 0); if (p == MAP_FAILED) { - printf("FAIL (mmap() failed!)\n"); - ret = -1; - goto out; + ksft_test_result_fail("FAIL (mmap() failed!): %s\n", strerror(errno)); + goto close_and_return; } + dmabuf_sync(dmabuf_fd[i], DMA_BUF_SYNC_START); c = (char *)p; for (j = 0; j < size; j++) { if (c[j] != 0) { - printf("FAIL (Allocated buffer not zeroed @ %i)\n", j); - break; + ksft_print_msg("FAIL (Allocated buffer not zeroed @ %i)\n", j); + dmabuf_sync(dmabuf_fd[i], DMA_BUF_SYNC_END); + munmap(p, size); + goto out; } } dmabuf_sync(dmabuf_fd[i], DMA_BUF_SYNC_END); munmap(p, size); } - /* close them all */ - for (i = 0; i < 32; i++) - close(dmabuf_fd[i]); - - close(heap_fd); - printf("OK\n"); - return 0; out: - while (i > 0) { - close(dmabuf_fd[i]); - i--; - } + ksft_test_result(i == 32, "Allocate and validate all buffers are zeroed\n"); + +close_and_return: + /* close them all */ + for (k = 0; k < i; k++) + close(dmabuf_fd[k]); + close(heap_fd); - return ret; + return; } /* Test the ioctl version compatibility w/ a smaller structure then expected */ @@ -360,126 +344,97 @@ static int dmabuf_heap_alloc_newer(int fd, size_t len, unsigned int flags, return ret; } -static int test_alloc_compat(char *heap_name) +static void test_alloc_compat(char *heap_name) { - int heap_fd = -1, dmabuf_fd = -1; - int ret; + int ret, heap_fd = -1, dmabuf_fd = -1; heap_fd = dmabuf_heap_open(heap_name); - if (heap_fd < 0) - return -1; - printf(" Testing (theoretical)older alloc compat: "); + ksft_print_msg("Testing (theoretical) older alloc compat:\n"); ret = dmabuf_heap_alloc_older(heap_fd, ONE_MEG, 0, &dmabuf_fd); - if (ret) { - printf("FAIL (Older compat allocation 
failed!)\n"); - ret = -1; - goto out; - } - close(dmabuf_fd); - printf("OK\n"); + if (dmabuf_fd >= 0) + close(dmabuf_fd); + ksft_test_result(!ret, "dmabuf_heap_alloc_older\n"); - printf(" Testing (theoretical)newer alloc compat: "); + ksft_print_msg("Testing (theoretical) newer alloc compat:\n"); ret = dmabuf_heap_alloc_newer(heap_fd, ONE_MEG, 0, &dmabuf_fd); - if (ret) { - printf("FAIL (Newer compat allocation failed!)\n"); - ret = -1; - goto out; - } - printf("OK\n"); -out: if (dmabuf_fd >= 0) close(dmabuf_fd); - if (heap_fd >= 0) - close(heap_fd); + ksft_test_result(!ret, "dmabuf_heap_alloc_newer\n"); - return ret; + close(heap_fd); } -static int test_alloc_errors(char *heap_name) +static void test_alloc_errors(char *heap_name) { int heap_fd = -1, dmabuf_fd = -1; int ret; heap_fd = dmabuf_heap_open(heap_name); - if (heap_fd < 0) - return -1; - printf(" Testing expected error cases: "); + ksft_print_msg("Testing expected error cases:\n"); ret = dmabuf_heap_alloc(0, ONE_MEG, 0x111111, &dmabuf_fd); - if (!ret) { - printf("FAIL (Did not see expected error (invalid fd)!)\n"); - ret = -1; - goto out; - } + ksft_test_result(ret, "Error expected on invalid fd %d\n", ret); ret = dmabuf_heap_alloc(heap_fd, ONE_MEG, 0x111111, &dmabuf_fd); - if (!ret) { - printf("FAIL (Did not see expected error (invalid heap flags)!)\n"); - ret = -1; - goto out; - } + ksft_test_result(ret, "Error expected on invalid heap flags %d\n", ret); ret = dmabuf_heap_alloc_fdflags(heap_fd, ONE_MEG, ~(O_RDWR | O_CLOEXEC), 0, &dmabuf_fd); - if (!ret) { - printf("FAIL (Did not see expected error (invalid fd flags)!)\n"); - ret = -1; - goto out; - } + ksft_test_result(ret, "Error expected on invalid heap flags %d\n", ret); - printf("OK\n"); - ret = 0; -out: if (dmabuf_fd >= 0) close(dmabuf_fd); - if (heap_fd >= 0) - close(heap_fd); + close(heap_fd); +} - return ret; +static int numer_of_heaps(void) +{ + DIR *d = opendir(DEVPATH); + struct dirent *dir; + int heaps = 0; + + while ((dir = readdir(d))) { + 
if (!strncmp(dir->d_name, ".", 2)) + continue; + if (!strncmp(dir->d_name, "..", 3)) + continue; + heaps++; + } + + return heaps; } int main(void) { - DIR *d; struct dirent *dir; - int ret = -1; + DIR *d; + + ksft_print_header(); d = opendir(DEVPATH); if (!d) { - printf("No %s directory?\n", DEVPATH); - return -1; + ksft_print_msg("No %s directory?\n", DEVPATH); + return KSFT_SKIP; } - while ((dir = readdir(d)) != NULL) { + ksft_set_plan(11 * numer_of_heaps()); + + while ((dir = readdir(d))) { if (!strncmp(dir->d_name, ".", 2)) continue; if (!strncmp(dir->d_name, "..", 3)) continue; - printf("Testing heap: %s\n", dir->d_name); - printf("=======================================\n"); - ret = test_alloc_and_import(dir->d_name); - if (ret) - break; - - ret = test_alloc_zeroed(dir->d_name, 4 * 1024); - if (ret) - break; - - ret = test_alloc_zeroed(dir->d_name, ONE_MEG); - if (ret) - break; - - ret = test_alloc_compat(dir->d_name); - if (ret) - break; - - ret = test_alloc_errors(dir->d_name); - if (ret) - break; + ksft_print_msg("Testing heap: %s\n", dir->d_name); + ksft_print_msg("=======================================\n"); + test_alloc_and_import(dir->d_name); + test_alloc_zeroed(dir->d_name, 4 * 1024); + test_alloc_zeroed(dir->d_name, ONE_MEG); + test_alloc_compat(dir->d_name); + test_alloc_errors(dir->d_name); } closedir(d); - return ret; + ksft_finished(); } diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile new file mode 100644 index 0000000000..e54f382bcb --- /dev/null +++ b/tools/testing/selftests/drivers/net/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 + +TEST_INCLUDES := $(wildcard lib/py/*.py) + +TEST_PROGS := \ + ping.py \ + queues.py \ + stats.py \ +# end of TEST_PROGS + +include ../../lib.mk diff --git a/tools/testing/selftests/drivers/net/README.rst b/tools/testing/selftests/drivers/net/README.rst new file mode 100644 index 0000000000..3b6a29e656 --- /dev/null +++ 
b/tools/testing/selftests/drivers/net/README.rst @@ -0,0 +1,136 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Running driver tests +==================== + +Networking driver tests are executed within the kselftest framework like any +other tests. They support testing both real device drivers and emulated / +software drivers (the latter mostly to test the core parts of the stack). + +SW mode +~~~~~~~ + +By default, when no extra parameters are set or exported, tests execute +against software drivers such as netdevsim. No extra preparation is required; +the software devices are created and destroyed as part of the test. +In this mode the tests are indistinguishable from other selftests and +(for example) can be run under ``virtme-ng`` like the core networking selftests. + +HW mode +~~~~~~~ + +Executing tests against a real device requires external preparation. +The netdevice against which tests will be run must exist, be running +(in UP state) and be configured with an IP address. + +Refer to the list of :ref:`Variables` later in this file to set up running +the tests against a real device. + +Both modes required +~~~~~~~~~~~~~~~~~~~ + +All tests in drivers/net must support running both against a software device +and a real device. SW-only tests should instead be placed in net/ or +drivers/net/netdevsim, HW-only tests in drivers/net/hw. + +Variables +========= + +The variables can be set in the environment or by creating a net.config +file in the same directory as this README file. Example:: + + $ NETIF=eth0 ./some_test.sh + +or:: + + $ cat tools/testing/selftests/drivers/net/net.config + # Variable set in a file + NETIF=eth0 + +Local tests (which don't require an endpoint for sending / receiving traffic) +need only the ``NETIF`` variable. Remaining variables define the endpoint +and communication method. + +NETIF +~~~~~ + +Name of the netdevice against which the test should be executed. +When empty or not set, software devices will be used. 
+ +LOCAL_V4, LOCAL_V6, REMOTE_V4, REMOTE_V6 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Local and remote endpoint IP addresses. + +REMOTE_TYPE +~~~~~~~~~~~ + +Communication method used to run commands on the remote endpoint. +Test framework has built-in support for ``netns`` and ``ssh`` channels. +``netns`` assumes the "remote" interface is part of the same +host, just moved to the specified netns. +``ssh`` communicates with remote endpoint over ``ssh`` and ``scp``. +Using persistent SSH connections is strongly encouraged to avoid +the latency of SSH connection setup on every command. + +Communication methods are defined by classes in ``lib/py/remote_{name}.py``. +It should be possible to add a new method without modifying any of +the framework, by simply adding an appropriately named file to ``lib/py``. + +REMOTE_ARGS +~~~~~~~~~~~ + +Arguments used to construct the communication channel. +Communication channel dependent:: + + for netns - name of the "remote" namespace + for ssh - name/address of the remote host + +Example +======= + +Build the selftests:: + + # make -C tools/testing/selftests/ TARGETS="drivers/net drivers/net/hw" + +"Install" the tests and copy them over to the target machine:: + + # make -C tools/testing/selftests/ TARGETS="drivers/net drivers/net/hw" \ + install INSTALL_PATH=/tmp/ksft-net-drv + + # rsync -ra --delete /tmp/ksft-net-drv root@192.168.1.1:/root/ + +On the target machine, running the tests will use netdevsim by default:: + + [/root] # ./ksft-net-drv/run_kselftest.sh -t drivers/net:ping.py + TAP version 13 + 1..1 + # timeout set to 45 + # selftests: drivers/net: ping.py + # KTAP version 1 + # 1..3 + # ok 1 ping.test_v4 + # ok 2 ping.test_v6 + # ok 3 ping.test_tcp + # # Totals: pass:3 fail:0 xfail:0 xpass:0 skip:0 error:0 + ok 1 selftests: drivers/net: ping.py + +Create a config with remote info:: + + [/root] # cat > ./ksft-net-drv/drivers/net/net.config < None: + """Test whether Tx and Rx checksum offload are enabled. 
+ + If the device under test has either off, then skip the relevant tests.""" + cfg.have_tx_csum_generic = False + cfg.have_tx_csum_ipv4 = False + cfg.have_tx_csum_ipv6 = False + cfg.have_rx_csum = False + + ethnl = EthtoolFamily() + features = ethnl.features_get({"header": {"dev-index": cfg.ifindex}}) + for f in features["active"]["bits"]["bit"]: + if f["name"] == "tx-checksum-ip-generic": + cfg.have_tx_csum_generic = True + elif f["name"] == "tx-checksum-ipv4": + cfg.have_tx_csum_ipv4 = True + elif f["name"] == "tx-checksum-ipv6": + cfg.have_tx_csum_ipv6 = True + elif f["name"] == "rx-checksum": + cfg.have_rx_csum = True + + +def main() -> None: + with NetDrvEpEnv(__file__, nsim_test=False) as cfg: + check_nic_features(cfg) + + cfg.bin_local = path.abspath(path.dirname(__file__) + "/../../../net/lib/csum") + cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) + + cases = [] + for ipv4 in [True, False]: + cases.append(test_builder("rx_tcp", cfg, ipv4, False, "-t")) + cases.append(test_builder("rx_tcp_invalid", cfg, ipv4, False, "-t -E")) + + cases.append(test_builder("rx_udp", cfg, ipv4, False, "")) + cases.append(test_builder("rx_udp_invalid", cfg, ipv4, False, "-E")) + + cases.append(test_builder("tx_udp_csum_offload", cfg, ipv4, True, "-U")) + cases.append(test_builder("tx_udp_zero_checksum", cfg, ipv4, True, "-U -Z")) + + ksft_run(cases=cases, args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/hw/devlink_port_split.py b/tools/testing/selftests/drivers/net/hw/devlink_port_split.py new file mode 100755 index 0000000000..2d84c7a0be --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/devlink_port_split.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +from subprocess import PIPE, Popen +import json +import time +import argparse +import collections +import sys + +# +# Test port split configuration using devlink-port lanes attribute. 
+# The test is skipped in case the attribute is not available. +# +# First, check that all the ports with 1 lane fail to split. +# Second, check that all the ports with more than 1 lane can be split +# to all valid configurations (e.g., split to 2, split to 4 etc.) +# + + +# Kselftest framework requirement - SKIP code is 4 +KSFT_SKIP=4 +Port = collections.namedtuple('Port', 'bus_info name') + + +def run_command(cmd, should_fail=False): + """ + Run a command in subprocess. + Return: Tuple of (stdout, stderr). + """ + + p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True) + stdout, stderr = p.communicate() + stdout, stderr = stdout.decode(), stderr.decode() + + if stderr != "" and not should_fail: + print("Error sending command: %s" % cmd) + print(stdout) + print(stderr) + return stdout, stderr + + +class devlink_ports(object): + """ + Class that holds information on the devlink ports, required to the tests; + if_names: A list of interfaces in the devlink ports. + """ + + def get_if_names(dev): + """ + Get a list of physical devlink ports. + Return: Array of tuples (bus_info/port, if_name). + """ + + arr = [] + + cmd = "devlink -j port show" + stdout, stderr = run_command(cmd) + assert stderr == "" + ports = json.loads(stdout)['port'] + + validate_devlink_output(ports, 'flavour') + + for port in ports: + if dev in port: + if ports[port]['flavour'] == 'physical': + arr.append(Port(bus_info=port, name=ports[port]['netdev'])) + + return arr + + def __init__(self, dev): + self.if_names = devlink_ports.get_if_names(dev) + + +def get_max_lanes(port): + """ + Get the $port's maximum number of lanes. + Return: number of lanes, e.g. 1, 2, 4 and 8. + """ + + cmd = "devlink -j port show %s" % port + stdout, stderr = run_command(cmd) + assert stderr == "" + values = list(json.loads(stdout)['port'].values())[0] + + if 'lanes' in values: + lanes = values['lanes'] + else: + lanes = 0 + return lanes + + +def get_split_ability(port): + """ + Get the $port split ability. 
+ Return: split ability, true or false. + """ + + cmd = "devlink -j port show %s" % port.name + stdout, stderr = run_command(cmd) + assert stderr == "" + values = list(json.loads(stdout)['port'].values())[0] + + return values['splittable'] + + +def split(k, port, should_fail=False): + """ + Split $port into $k ports. + If should_fail == True, the split should fail. Otherwise, should pass. + Return: Array of sub ports after splitting. + If the $port wasn't split, the array will be empty. + """ + + cmd = "devlink port split %s count %s" % (port.bus_info, k) + stdout, stderr = run_command(cmd, should_fail=should_fail) + + if should_fail: + if not test(stderr != "", "%s is unsplittable" % port.name): + print("split an unsplittable port %s" % port.name) + return create_split_group(port, k) + else: + if stderr == "": + return create_split_group(port, k) + print("didn't split a splittable port %s" % port.name) + + return [] + + +def unsplit(port): + """ + Unsplit $port. + """ + + cmd = "devlink port unsplit %s" % port + stdout, stderr = run_command(cmd) + test(stderr == "", "Unsplit port %s" % port) + + +def exists(port, dev): + """ + Check if $port exists in the devlink ports. + Return: True if so, False otherwise. + """ + + return any(dev_port.name == port + for dev_port in devlink_ports.get_if_names(dev)) + + +def exists_and_lanes(ports, lanes, dev): + """ + Check if every port in the list $ports exists in the devlink ports and has + $lanes number of lanes after splitting. + Return: True if both are True, False otherwise. + """ + + for port in ports: + if not exists(port, dev): + print("port %s doesn't exist in devlink ports" % port) + return False + max_lanes = get_max_lanes(port) + if max_lanes != lanes: + print("port %s has %d lanes, but %s were expected" + % (port, max_lanes, lanes)) + return False + return True + + +def test(cond, msg): + """ + Check $cond and print a message accordingly. + Return: True if pass, False otherwise.
+ """ + + if cond: + print("TEST: %-60s [ OK ]" % msg) + else: + print("TEST: %-60s [FAIL]" % msg) + + return cond + + +def create_split_group(port, k): + """ + Create the split group for $port. + Return: Array with $k elements, which are the split port group. + """ + + return list(port.name + "s" + str(i) for i in range(k)) + + +def split_unsplittable_port(port, k): + """ + Test that splitting of unsplittable port fails. + """ + + # split to max + new_split_group = split(k, port, should_fail=True) + + if new_split_group != []: + unsplit(port.bus_info) + + +def split_splittable_port(port, k, lanes, dev): + """ + Test that splitting of splittable port passes correctly. + """ + + new_split_group = split(k, port) + + # Once the split command ends, it takes some time to the sub ifaces' + # to get their names. Use udevadm to continue only when all current udev + # events are handled. + cmd = "udevadm settle" + stdout, stderr = run_command(cmd) + assert stderr == "" + + if new_split_group != []: + test(exists_and_lanes(new_split_group, lanes/k, dev), + "split port %s into %s" % (port.name, k)) + + unsplit(port.bus_info) + + +def validate_devlink_output(devlink_data, target_property=None): + """ + Determine if test should be skipped by checking: + 1. devlink_data contains values + 2. The target_property exist in devlink_data + """ + skip_reason = None + if any(devlink_data.values()): + if target_property: + skip_reason = "{} not found in devlink output, test skipped".format(target_property) + for key in devlink_data: + if target_property in devlink_data[key]: + skip_reason = None + else: + skip_reason = 'devlink output is empty, test skipped' + + if skip_reason: + print(skip_reason) + sys.exit(KSFT_SKIP) + + +def make_parser(): + parser = argparse.ArgumentParser(description='A test for port splitting.') + parser.add_argument('--dev', + help='The devlink handle of the device under test. 
' + + 'The default is the first registered devlink ' + + 'handle.') + + return parser + + +def main(cmdline=None): + parser = make_parser() + args = parser.parse_args(cmdline) + + dev = args.dev + if not dev: + cmd = "devlink -j dev show" + stdout, stderr = run_command(cmd) + assert stderr == "" + + validate_devlink_output(json.loads(stdout)) + devs = json.loads(stdout)['dev'] + dev = list(devs.keys())[0] + + cmd = "devlink dev show %s" % dev + stdout, stderr = run_command(cmd) + if stderr != "": + print("devlink device %s can not be found" % dev) + sys.exit(1) + + ports = devlink_ports(dev) + + found_max_lanes = False + for port in ports.if_names: + max_lanes = get_max_lanes(port.name) + + # If max lanes is 0, do not test port splitting at all + if max_lanes == 0: + continue + + # If 1 lane, shouldn't be able to split + elif max_lanes == 1: + test(not get_split_ability(port), + "%s should not be able to split" % port.name) + split_unsplittable_port(port, max_lanes) + + # Else, splitting should pass and all the split ports should exist. 
+ else: + lane = max_lanes + test(get_split_ability(port), + "%s should be able to split" % port.name) + while lane > 1: + split_splittable_port(port, lane, max_lanes, dev) + + lane //= 2 + found_max_lanes = True + + if not found_max_lanes: + print(f"Test not started, no port of device {dev} reports max_lanes") + sys.exit(KSFT_SKIP) + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/hw/ethtool.sh b/tools/testing/selftests/drivers/net/hw/ethtool.sh new file mode 100755 index 0000000000..fa6953de6b --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/ethtool.sh @@ -0,0 +1,297 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ALL_TESTS=" + same_speeds_autoneg_off + different_speeds_autoneg_off + combination_of_neg_on_and_off + advertise_subset_of_speeds + check_highest_speed_is_chosen + different_speeds_autoneg_on +" +NUM_NETIFS=2 +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/lib.sh +source ethtool_lib.sh + +h1_create() +{ + simple_if_init $h1 192.0.2.1/24 +} + +h1_destroy() +{ + simple_if_fini $h1 192.0.2.1/24 +} + +h2_create() +{ + simple_if_init $h2 192.0.2.2/24 +} + +h2_destroy() +{ + simple_if_fini $h2 192.0.2.2/24 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + h2=${NETIFS[p2]} + + h1_create + h2_create +} + +cleanup() +{ + pre_cleanup + + h2_destroy + h1_destroy +} + +same_speeds_autoneg_off() +{ + # Check that when each of the reported speeds is forced, the links come + # up and are operational. + local -a speeds_arr=($(common_speeds_get $h1 $h2 0 0)) + + for speed in "${speeds_arr[@]}"; do + RET=0 + ethtool_set $h1 speed $speed autoneg off + ethtool_set $h2 speed $speed autoneg off + + setup_wait_dev_with_timeout $h1 + setup_wait_dev_with_timeout $h2 + ping_do $h1 192.0.2.2 + check_err $? 
"ping with speed $speed autoneg off" + log_test "force speed $speed on both ends" + done + + ethtool -s $h2 autoneg on + ethtool -s $h1 autoneg on +} + +different_speeds_autoneg_off() +{ + # Test that when we force different speeds, links are not up and ping + # fails. + RET=0 + + local -a speeds_arr=($(different_speeds_get $h1 $h2 0 0)) + local speed1=${speeds_arr[0]} + local speed2=${speeds_arr[1]} + + ethtool_set $h1 speed $speed1 autoneg off + ethtool_set $h2 speed $speed2 autoneg off + + setup_wait_dev_with_timeout $h1 + setup_wait_dev_with_timeout $h2 + ping_do $h1 192.0.2.2 + check_fail $? "ping with different speeds" + + log_test "force of different speeds autoneg off" + + ethtool -s $h2 autoneg on + ethtool -s $h1 autoneg on +} + +combination_of_neg_on_and_off() +{ + # Test that when one device is forced to a speed supported by both + # endpoints and the other device is configured to autoneg on, the links + # are up and ping passes. + local -a speeds_arr=($(common_speeds_get $h1 $h2 0 1)) + + for speed in "${speeds_arr[@]}"; do + RET=0 + ethtool_set $h1 speed $speed autoneg off + + setup_wait_dev_with_timeout $h1 + setup_wait_dev_with_timeout $h2 + ping_do $h1 192.0.2.2 + check_err $? "ping with h1-speed=$speed autoneg off, h2 autoneg on" + log_test "force speed $speed vs. 
autoneg" + done + + ethtool -s $h1 autoneg on +} + +hex_speed_value_get() +{ + local speed=$1; shift + + local shift_size=${speed_values[$speed]} + speed=$((0x1 << $"shift_size")) + printf "%#x" "$speed" +} + +subset_of_common_speeds_get() +{ + local dev1=$1; shift + local dev2=$1; shift + local adver=$1; shift + + local -a speeds_arr=($(common_speeds_get $dev1 $dev2 0 $adver)) + local speed_to_advertise=0 + local speed_to_remove=${speeds_arr[0]} + speed_to_remove+='base' + + local -a speeds_mode_arr=($(common_speeds_get $dev1 $dev2 1 $adver)) + + for speed in ${speeds_mode_arr[@]}; do + if [[ $speed != $speed_to_remove* ]]; then + speed=$(hex_speed_value_get $speed) + speed_to_advertise=$(($speed_to_advertise | \ + $speed)) + fi + + done + + # Convert to hex. + printf "%#x" "$speed_to_advertise" +} + +speed_to_advertise_get() +{ + # The function returns the hex number that is composed by OR-ing all + # the modes corresponding to the provided speed. + local speed_without_mode=$1; shift + local supported_speeds=("$@"); shift + local speed_to_advertise=0 + + speed_without_mode+='base' + + for speed in ${supported_speeds[@]}; do + if [[ $speed == $speed_without_mode* ]]; then + speed=$(hex_speed_value_get $speed) + speed_to_advertise=$(($speed_to_advertise | \ + $speed)) + fi + + done + + # Convert to hex. + printf "%#x" "$speed_to_advertise" +} + +advertise_subset_of_speeds() +{ + # Test that when one device advertises a subset of speeds and another + # advertises a specific speed (but all modes of this speed), the links + # are up and ping passes. + RET=0 + + local speed_1_to_advertise=$(subset_of_common_speeds_get $h1 $h2 1) + ethtool_set $h1 advertise $speed_1_to_advertise + + if [ $RET != 0 ]; then + log_test "advertise subset of speeds" + return + fi + + local -a speeds_arr_without_mode=($(common_speeds_get $h1 $h2 0 1)) + # Check only speeds that h1 advertised. Remove the first speed. 
+ unset speeds_arr_without_mode[0] + local -a speeds_arr_with_mode=($(common_speeds_get $h1 $h2 1 1)) + + for speed_value in ${speeds_arr_without_mode[@]}; do + RET=0 + local speed_2_to_advertise=$(speed_to_advertise_get $speed_value \ + "${speeds_arr_with_mode[@]}") + ethtool_set $h2 advertise $speed_2_to_advertise + + setup_wait_dev_with_timeout $h1 + setup_wait_dev_with_timeout $h2 + ping_do $h1 192.0.2.2 + check_err $? "ping with h1=$speed_1_to_advertise, h2=$speed_2_to_advertise ($speed_value)" + + log_test "advertise $speed_1_to_advertise vs. $speed_2_to_advertise" + done + + ethtool -s $h2 autoneg on + ethtool -s $h1 autoneg on +} + +check_highest_speed_is_chosen() +{ + # Test that when one device advertises a subset of speeds, the other + # chooses the highest speed. This test checks configuration without + # traffic. + RET=0 + + local max_speed + local chosen_speed + local speed_to_advertise=$(subset_of_common_speeds_get $h1 $h2 1) + + ethtool_set $h1 advertise $speed_to_advertise + + if [ $RET != 0 ]; then + log_test "check highest speed" + return + fi + + local -a speeds_arr=($(common_speeds_get $h1 $h2 0 1)) + + max_speed=${speeds_arr[0]} + for current in ${speeds_arr[@]}; do + if [[ $current -gt $max_speed ]]; then + max_speed=$current + fi + done + + setup_wait_dev_with_timeout $h1 + setup_wait_dev_with_timeout $h2 + chosen_speed=$(ethtool $h1 | grep 'Speed:') + chosen_speed=${chosen_speed%"Mb/s"*} + chosen_speed=${chosen_speed#*"Speed: "} + ((chosen_speed == max_speed)) + check_err $? "h1 advertise $speed_to_advertise, h2 sync to speed $chosen_speed" + + log_test "check highest speed" + + ethtool -s $h2 autoneg on + ethtool -s $h1 autoneg on +} + +different_speeds_autoneg_on() +{ + # Test that when we configure links to advertise different speeds, + # links are not up and ping fails. 
+ RET=0 + + local -a speeds=($(different_speeds_get $h1 $h2 1 1)) + local speed1=${speeds[0]} + local speed2=${speeds[1]} + + speed1=$(hex_speed_value_get $speed1) + speed2=$(hex_speed_value_get $speed2) + + ethtool_set $h1 advertise $speed1 + ethtool_set $h2 advertise $speed2 + + if (($RET)); then + setup_wait_dev_with_timeout $h1 + setup_wait_dev_with_timeout $h2 + ping_do $h1 192.0.2.2 + check_fail $? "ping with different speeds autoneg on" + fi + + log_test "advertise different speeds autoneg on" + + ethtool -s $h2 autoneg on + ethtool -s $h1 autoneg on +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +declare -gA speed_values +eval "speed_values=($(speeds_arr_get))" + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/hw/ethtool_extended_state.sh b/tools/testing/selftests/drivers/net/hw/ethtool_extended_state.sh new file mode 100755 index 0000000000..a758444841 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/ethtool_extended_state.sh @@ -0,0 +1,116 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ALL_TESTS=" + autoneg + autoneg_force_mode + no_cable +" + +NUM_NETIFS=2 +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/lib.sh +source ethtool_lib.sh + +TIMEOUT=$((WAIT_TIMEOUT * 1000)) # ms + +setup_prepare() +{ + swp1=${NETIFS[p1]} + swp2=${NETIFS[p2]} + swp3=$NETIF_NO_CABLE +} + +ethtool_ext_state() +{ + local dev=$1; shift + local expected_ext_state=$1; shift + local expected_ext_substate=${1:-""}; shift + + local ext_state=$(ethtool $dev | grep "Link detected" \ + | cut -d "(" -f2 | cut -d ")" -f1) + local ext_substate=$(echo $ext_state | cut -sd "," -f2 \ + | sed -e 's/^[[:space:]]*//') + ext_state=$(echo $ext_state | cut -d "," -f1) + + if [[ $ext_state != $expected_ext_state ]]; then + echo "Expected \"$expected_ext_state\", got \"$ext_state\"" + return 1 + fi + if [[ $ext_substate != $expected_ext_substate ]]; then + echo "Expected \"$expected_ext_substate\", got \"$ext_substate\"" + 
return 1 + fi +} + +autoneg() +{ + local msg + + RET=0 + + ip link set dev $swp1 up + + msg=$(busywait $TIMEOUT ethtool_ext_state $swp1 \ + "Autoneg" "No partner detected") + check_err $? "$msg" + + log_test "Autoneg, No partner detected" + + ip link set dev $swp1 down +} + +autoneg_force_mode() +{ + local msg + + RET=0 + + ip link set dev $swp1 up + ip link set dev $swp2 up + + local -a speeds_arr=($(different_speeds_get $swp1 $swp2 0 0)) + local speed1=${speeds_arr[0]} + local speed2=${speeds_arr[1]} + + ethtool_set $swp1 speed $speed1 autoneg off + ethtool_set $swp2 speed $speed2 autoneg off + + msg=$(busywait $TIMEOUT ethtool_ext_state $swp1 \ + "Autoneg" "No partner detected during force mode") + check_err $? "$msg" + + msg=$(busywait $TIMEOUT ethtool_ext_state $swp2 \ + "Autoneg" "No partner detected during force mode") + check_err $? "$msg" + + log_test "Autoneg, No partner detected during force mode" + + ethtool -s $swp2 autoneg on + ethtool -s $swp1 autoneg on + + ip link set dev $swp2 down + ip link set dev $swp1 down +} + +no_cable() +{ + local msg + + RET=0 + + ip link set dev $swp3 up + + msg=$(busywait $TIMEOUT ethtool_ext_state $swp3 "No cable") + check_err $? 
"$msg" + + log_test "No cable" + + ip link set dev $swp3 down +} + +setup_prepare + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/hw/ethtool_lib.sh b/tools/testing/selftests/drivers/net/hw/ethtool_lib.sh new file mode 100644 index 0000000000..b9bfb45085 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/ethtool_lib.sh @@ -0,0 +1,120 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +speeds_arr_get() +{ + cmd='/ETHTOOL_LINK_MODE_[^[:space:]]*_BIT[[:space:]]+=[[:space:]]+/ \ + {sub(/,$/, "") \ + sub(/ETHTOOL_LINK_MODE_/,"") \ + sub(/_BIT/,"") \ + sub(/_Full/,"/Full") \ + sub(/_Half/,"/Half");\ + print "["$1"]="$3}' + + awk "${cmd}" /usr/include/linux/ethtool.h +} + +ethtool_set() +{ + local cmd="$@" + local out=$(ethtool -s $cmd 2>&1 | wc -l) + + check_err $out "error in configuration. $cmd" +} + +dev_linkmodes_params_get() +{ + local dev=$1; shift + local adver=$1; shift + local -a linkmodes_params + local param_count + local arr + + if (($adver)); then + mode="Advertised link modes" + else + mode="Supported link modes" + fi + + local -a dev_linkmodes=($(dev_speeds_get $dev 1 $adver)) + for ((i=0; i<${#dev_linkmodes[@]}; i++)); do + linkmodes_params[$i]=$(echo -e "${dev_linkmodes[$i]}" | \ + # Replaces all non numbers with spaces + sed -e 's/[^0-9]/ /g' | \ + # Squeeze spaces in sequence to 1 space + tr -s ' ') + # Count how many numbers were found in the linkmode + param_count=$(echo "${linkmodes_params[$i]}" | wc -w) + if [[ $param_count -eq 1 ]]; then + linkmodes_params[$i]="${linkmodes_params[$i]} 1" + elif [[ $param_count -ge 3 ]]; then + arr=(${linkmodes_params[$i]}) + # Take only first two params + linkmodes_params[$i]=$(echo "${arr[@]:0:2}") + fi + done + echo ${linkmodes_params[@]} +} + +dev_speeds_get() +{ + local dev=$1; shift + local with_mode=$1; shift + local adver=$1; shift + local speeds_str + + if (($adver)); then + mode="Advertised link modes" + else + mode="Supported link modes" + fi + + 
speeds_str=$(ethtool "$dev" | \ + # Snip everything before the link modes section. + sed -n '/'"$mode"':/,$p' | \ + # Quit processing the rest at the start of the next section. + # When checking, skip the header of this section (hence the 2,). + sed -n '2,${/^[\t][^ \t]/q};p' | \ + # Drop the section header of the current section. + cut -d':' -f2) + + local -a speeds_arr=($speeds_str) + if [[ $with_mode -eq 0 ]]; then + for ((i=0; i<${#speeds_arr[@]}; i++)); do + speeds_arr[$i]=${speeds_arr[$i]%base*} + done + fi + echo ${speeds_arr[@]} +} + +common_speeds_get() +{ + local dev1=$1; shift + local dev2=$1; shift + local with_mode=$1; shift + local adver=$1; shift + + local -a dev1_speeds=($(dev_speeds_get $dev1 $with_mode $adver)) + local -a dev2_speeds=($(dev_speeds_get $dev2 $with_mode $adver)) + + comm -12 \ + <(printf '%s\n' "${dev1_speeds[@]}" | sort -u) \ + <(printf '%s\n' "${dev2_speeds[@]}" | sort -u) +} + +different_speeds_get() +{ + local dev1=$1; shift + local dev2=$1; shift + local with_mode=$1; shift + local adver=$1; shift + + local -a speeds_arr + + speeds_arr=($(common_speeds_get $dev1 $dev2 $with_mode $adver)) + if [[ ${#speeds_arr[@]} -lt 2 ]]; then + check_err 1 "cannot check different speeds.
There are not enough speeds" + fi + + echo ${speeds_arr[0]} ${speeds_arr[1]} +} diff --git a/tools/testing/selftests/drivers/net/hw/ethtool_mm.sh b/tools/testing/selftests/drivers/net/hw/ethtool_mm.sh new file mode 100755 index 0000000000..c301e735c8 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/ethtool_mm.sh @@ -0,0 +1,341 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ALL_TESTS=" + manual_with_verification_h1_to_h2 + manual_with_verification_h2_to_h1 + manual_without_verification_h1_to_h2 + manual_without_verification_h2_to_h1 + manual_failed_verification_h1_to_h2 + manual_failed_verification_h2_to_h1 + lldp +" + +NUM_NETIFS=2 +REQUIRE_MZ=no +PREEMPTIBLE_PRIO=0 +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/lib.sh + +traffic_test() +{ + local if=$1; shift + local src=$1; shift + local num_pkts=10000 + local before= + local after= + local delta= + + if [ ${has_pmac_stats[$if]} = false ]; then + src="aggregate" + fi + + before=$(ethtool_std_stats_get $if "eth-mac" "FramesTransmittedOK" $src) + + $MZ $if -q -c $num_pkts -p 64 -b bcast -t ip -R $PREEMPTIBLE_PRIO + + after=$(ethtool_std_stats_get $if "eth-mac" "FramesTransmittedOK" $src) + + delta=$((after - before)) + + # Allow an extra 1% tolerance for random packets sent by the stack + [ $delta -ge $num_pkts ] && [ $delta -le $((num_pkts + 100)) ] +} + +manual_with_verification() +{ + local tx=$1; shift + local rx=$1; shift + + RET=0 + + # It isn't completely clear from IEEE 802.3-2018 Figure 99-5: Transmit + # Processing state diagram whether the "send_r" variable (send response + # to verification frame) should be taken into consideration while the + # MAC Merge TX direction is disabled. That being said, at least the + # NXP ENETC does not, and requires tx-enabled on in order to respond to + # the link partner's verification frames. 
+ ethtool --set-mm $rx tx-enabled on + ethtool --set-mm $tx verify-enabled on tx-enabled on + + # Wait for verification to finish + sleep 1 + + ethtool --json --show-mm $tx | jq -r '.[]."verify-status"' | \ + grep -q 'SUCCEEDED' + check_err "$?" "Verification did not succeed" + + ethtool --json --show-mm $tx | jq -r '.[]."tx-active"' | grep -q 'true' + check_err "$?" "pMAC TX is not active" + + traffic_test $tx "pmac" + check_err "$?" "Traffic did not get sent through $tx's pMAC" + + ethtool --set-mm $tx verify-enabled off tx-enabled off + ethtool --set-mm $rx tx-enabled off + + log_test "Manual configuration with verification: $tx to $rx" +} + +manual_with_verification_h1_to_h2() +{ + manual_with_verification $h1 $h2 +} + +manual_with_verification_h2_to_h1() +{ + manual_with_verification $h2 $h1 +} + +manual_without_verification() +{ + local tx=$1; shift + local rx=$1; shift + + RET=0 + + ethtool --set-mm $tx verify-enabled off tx-enabled on + + ethtool --json --show-mm $tx | jq -r '.[]."verify-status"' | \ + grep -q 'DISABLED' + check_err "$?" "Verification is not disabled" + + ethtool --json --show-mm $tx | jq -r '.[]."tx-active"' | grep -q 'true' + check_err "$?" "pMAC TX is not active" + + traffic_test $tx "pmac" + check_err "$?" "Traffic did not get sent through $tx's pMAC" + + ethtool --set-mm $tx verify-enabled off tx-enabled off + + log_test "Manual configuration without verification: $tx to $rx" +} + +manual_without_verification_h1_to_h2() +{ + manual_without_verification $h1 $h2 +} + +manual_without_verification_h2_to_h1() +{ + manual_without_verification $h2 $h1 +} + +manual_failed_verification() +{ + local tx=$1; shift + local rx=$1; shift + + RET=0 + + ethtool --set-mm $rx pmac-enabled off + ethtool --set-mm $tx verify-enabled on tx-enabled on + + # Wait for verification to time out + sleep 1 + + ethtool --json --show-mm $tx | jq -r '.[]."verify-status"' | \ + grep -q 'SUCCEEDED' + check_fail "$?" 
"Verification succeeded when it shouldn't have" + + ethtool --json --show-mm $tx | jq -r '.[]."tx-active"' | grep -q 'true' + check_fail "$?" "pMAC TX is active when it shouldn't have" + + traffic_test $tx "emac" + check_err "$?" "Traffic did not get sent through $tx's eMAC" + + ethtool --set-mm $tx verify-enabled off tx-enabled off + ethtool --set-mm $rx pmac-enabled on + + log_test "Manual configuration with failed verification: $tx to $rx" +} + +manual_failed_verification_h1_to_h2() +{ + manual_failed_verification $h1 $h2 +} + +manual_failed_verification_h2_to_h1() +{ + manual_failed_verification $h2 $h1 +} + +smallest_supported_add_frag_size() +{ + local iface=$1 + local rx_min_frag_size= + + rx_min_frag_size=$(ethtool --json --show-mm $iface | \ + jq '.[]."rx-min-frag-size"') + + if [ $rx_min_frag_size -le 60 ]; then + echo 0 + elif [ $rx_min_frag_size -le 124 ]; then + echo 1 + elif [ $rx_min_frag_size -le 188 ]; then + echo 2 + elif [ $rx_min_frag_size -le 252 ]; then + echo 3 + else + echo "$iface: RX min frag size $rx_min_frag_size cannot be advertised over LLDP" + exit 1 + fi +} + +expected_add_frag_size() +{ + local iface=$1 + local requested=$2 + local min=$(smallest_supported_add_frag_size $iface) + + [ $requested -le $min ] && echo $min || echo $requested +} + +lldp_change_add_frag_size() +{ + local add_frag_size=$1 + local pattern= + + lldptool -T -i $h1 -V addEthCaps addFragSize=$add_frag_size >/dev/null + # Wait for TLVs to be received + sleep 2 + pattern=$(printf "Additional fragment size: %d" \ + $(expected_add_frag_size $h1 $add_frag_size)) + lldptool -i $h2 -t -n -V addEthCaps | grep -q "$pattern" +} + +lldp() +{ + RET=0 + + systemctl start lldpad + + # Configure the interfaces to receive and transmit LLDPDUs + lldptool -L -i $h1 adminStatus=rxtx >/dev/null + lldptool -L -i $h2 adminStatus=rxtx >/dev/null + + # Enable the transmission of Additional Ethernet Capabilities TLV + lldptool -T -i $h1 -V addEthCaps enableTx=yes >/dev/null + lldptool 
-T -i $h2 -V addEthCaps enableTx=yes >/dev/null + + # Wait for TLVs to be received + sleep 2 + + lldptool -i $h1 -t -n -V addEthCaps | \ + grep -q "Preemption capability active" + check_err "$?" "$h1 pMAC TX is not active" + + lldptool -i $h2 -t -n -V addEthCaps | \ + grep -q "Preemption capability active" + check_err "$?" "$h2 pMAC TX is not active" + + lldp_change_add_frag_size 3 + check_err "$?" "addFragSize 3" + + lldp_change_add_frag_size 2 + check_err "$?" "addFragSize 2" + + lldp_change_add_frag_size 1 + check_err "$?" "addFragSize 1" + + lldp_change_add_frag_size 0 + check_err "$?" "addFragSize 0" + + traffic_test $h1 "pmac" + check_err "$?" "Traffic did not get sent through $h1's pMAC" + + traffic_test $h2 "pmac" + check_err "$?" "Traffic did not get sent through $h2's pMAC" + + systemctl stop lldpad + + log_test "LLDP" +} + +h1_create() +{ + ip link set dev $h1 up + + tc qdisc add dev $h1 root mqprio num_tc 4 map 0 1 2 3 \ + queues 1@0 1@1 1@2 1@3 \ + fp P E E E \ + hw 1 + + ethtool --set-mm $h1 pmac-enabled on tx-enabled off verify-enabled off +} + +h2_create() +{ + ip link set dev $h2 up + + ethtool --set-mm $h2 pmac-enabled on tx-enabled off verify-enabled off + + tc qdisc add dev $h2 root mqprio num_tc 4 map 0 1 2 3 \ + queues 1@0 1@1 1@2 1@3 \ + fp P E E E \ + hw 1 +} + +h1_destroy() +{ + ethtool --set-mm $h1 pmac-enabled off tx-enabled off verify-enabled off + + tc qdisc del dev $h1 root + + ip link set dev $h1 down +} + +h2_destroy() +{ + tc qdisc del dev $h2 root + + ethtool --set-mm $h2 pmac-enabled off tx-enabled off verify-enabled off + + ip link set dev $h2 down +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + h2=${NETIFS[p2]} + + h1_create + h2_create +} + +cleanup() +{ + pre_cleanup + + h2_destroy + h1_destroy +} + +check_ethtool_mm_support +check_tc_fp_support +require_command lldptool +bail_on_lldpad "autoconfigure the MAC Merge layer" "configure it manually" + +for netif in ${NETIFS[@]}; do + ethtool --show-mm $netif 2>&1 &> /dev/null + if 
[[ $? -ne 0 ]]; then + echo "SKIP: $netif does not support MAC Merge" + exit $ksft_skip + fi + + if check_ethtool_pmac_std_stats_support $netif eth-mac; then + has_pmac_stats[$netif]=true + else + has_pmac_stats[$netif]=false + echo "$netif does not report pMAC statistics, falling back to aggregate" + fi +done + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh b/tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh new file mode 100755 index 0000000000..8f60c1685a --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh @@ -0,0 +1,145 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ALL_TESTS=" + rmon_rx_histogram + rmon_tx_histogram +" + +NUM_NETIFS=2 +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/lib.sh + +ETH_FCS_LEN=4 +ETH_HLEN=$((6+6+2)) + +declare -A netif_mtu + +ensure_mtu() +{ + local iface=$1; shift + local len=$1; shift + local current=$(ip -j link show dev $iface | jq -r '.[0].mtu') + local required=$((len - ETH_HLEN - ETH_FCS_LEN)) + + if [ $current -lt $required ]; then + ip link set dev $iface mtu $required || return 1 + fi +} + +bucket_test() +{ + local iface=$1; shift + local neigh=$1; shift + local set=$1; shift + local bucket=$1; shift + local len=$1; shift + local num_rx=10000 + local num_tx=20000 + local expected= + local before= + local after= + local delta= + + # Mausezahn does not include FCS bytes in its length - but the + # histogram counters do + len=$((len - ETH_FCS_LEN)) + len=$((len > 0 ? 
len : 0)) + + before=$(ethtool --json -S $iface --groups rmon | \ + jq -r ".[0].rmon[\"${set}-pktsNtoM\"][$bucket].val") + + # Send 10k one way and 20k in the other, to detect counters + # mapped to the wrong direction + $MZ $neigh -q -c $num_rx -p $len -a own -b bcast -d 10us + $MZ $iface -q -c $num_tx -p $len -a own -b bcast -d 10us + + after=$(ethtool --json -S $iface --groups rmon | \ + jq -r ".[0].rmon[\"${set}-pktsNtoM\"][$bucket].val") + + delta=$((after - before)) + + expected=$([ $set = rx ] && echo $num_rx || echo $num_tx) + + # Allow some extra tolerance for other packets sent by the stack + [ $delta -ge $expected ] && [ $delta -le $((expected + 100)) ] +} + +rmon_histogram() +{ + local iface=$1; shift + local neigh=$1; shift + local set=$1; shift + local nbuckets=0 + local step= + + RET=0 + + while read -r -a bucket; do + step="$set-pkts${bucket[0]}to${bucket[1]} on $iface" + + for if in $iface $neigh; do + if ! ensure_mtu $if ${bucket[0]}; then + log_test_xfail "$if does not support the required MTU for $step" + return + fi + done + + if ! 
bucket_test $iface $neigh $set $nbuckets ${bucket[0]}; then + check_err 1 "$step failed" + return 1 + fi + log_test "$step" + nbuckets=$((nbuckets + 1)) + done < <(ethtool --json -S $iface --groups rmon | \ + jq -r ".[0].rmon[\"${set}-pktsNtoM\"][]|[.low, .high]|@tsv" 2>/dev/null) + + if [ $nbuckets -eq 0 ]; then + log_test_xfail "$iface does not support $set histogram counters" + return + fi +} + +rmon_rx_histogram() +{ + rmon_histogram $h1 $h2 rx + rmon_histogram $h2 $h1 rx +} + +rmon_tx_histogram() +{ + rmon_histogram $h1 $h2 tx + rmon_histogram $h2 $h1 tx +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + h2=${NETIFS[p2]} + + for iface in $h1 $h2; do + netif_mtu[$iface]=$(ip -j link show dev $iface | jq -r '.[0].mtu') + ip link set dev $iface up + done +} + +cleanup() +{ + pre_cleanup + + for iface in $h2 $h1; do + ip link set dev $iface \ + mtu ${netif_mtu[$iface]} \ + down + done +} + +check_ethtool_counter_group_support +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh b/tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh new file mode 100755 index 0000000000..67fafefc80 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh @@ -0,0 +1,334 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# +--------------------+ +----------------------+ +# | H1 | | H2 | +# | | | | +# | $h1.200 + | | + $h2.200 | +# | 192.0.2.1/28 | | | | 192.0.2.18/28 | +# | 2001:db8:1::1/64 | | | | 2001:db8:2::1/64 | +# | | | | | | +# | $h1 + | | + $h2 | +# | | | | | | +# +------------------|-+ +-|--------------------+ +# | | +# +------------------|-------------------------|--------------------+ +# | SW | | | +# | | | | +# | $rp1 + + $rp2 | +# | | | | +# | $rp1.200 + + $rp2.200 | +# | 192.0.2.2/28 192.0.2.17/28 | +# | 2001:db8:1::2/64 2001:db8:2::2/64 | +# | | +# +-----------------------------------------------------------------+ + +ALL_TESTS=" + ping_ipv4 + ping_ipv6 + 
test_stats_rx_ipv4 + test_stats_tx_ipv4 + test_stats_rx_ipv6 + test_stats_tx_ipv6 + respin_enablement + test_stats_rx_ipv4 + test_stats_tx_ipv4 + test_stats_rx_ipv6 + test_stats_tx_ipv6 + reapply_config + ping_ipv4 + ping_ipv6 + test_stats_rx_ipv4 + test_stats_tx_ipv4 + test_stats_rx_ipv6 + test_stats_tx_ipv6 + test_stats_report_rx + test_stats_report_tx + test_destroy_enabled + test_double_enable +" +NUM_NETIFS=4 +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/lib.sh +source "$lib_dir"/../../../net/forwarding/tc_common.sh + +h1_create() +{ + simple_if_init $h1 + vlan_create $h1 200 v$h1 192.0.2.1/28 2001:db8:1::1/64 + ip route add 192.0.2.16/28 vrf v$h1 nexthop via 192.0.2.2 + ip -6 route add 2001:db8:2::/64 vrf v$h1 nexthop via 2001:db8:1::2 +} + +h1_destroy() +{ + ip -6 route del 2001:db8:2::/64 vrf v$h1 nexthop via 2001:db8:1::2 + ip route del 192.0.2.16/28 vrf v$h1 nexthop via 192.0.2.2 + vlan_destroy $h1 200 + simple_if_fini $h1 +} + +h2_create() +{ + simple_if_init $h2 + vlan_create $h2 200 v$h2 192.0.2.18/28 2001:db8:2::1/64 + ip route add 192.0.2.0/28 vrf v$h2 nexthop via 192.0.2.17 + ip -6 route add 2001:db8:1::/64 vrf v$h2 nexthop via 2001:db8:2::2 +} + +h2_destroy() +{ + ip -6 route del 2001:db8:1::/64 vrf v$h2 nexthop via 2001:db8:2::2 + ip route del 192.0.2.0/28 vrf v$h2 nexthop via 192.0.2.17 + vlan_destroy $h2 200 + simple_if_fini $h2 +} + +router_rp1_200_create() +{ + ip link add name $rp1.200 link $rp1 type vlan id 200 + ip link set dev $rp1.200 addrgenmode eui64 + ip link set dev $rp1.200 up + ip address add dev $rp1.200 192.0.2.2/28 + ip address add dev $rp1.200 2001:db8:1::2/64 + ip stats set dev $rp1.200 l3_stats on +} + +router_rp1_200_destroy() +{ + ip stats set dev $rp1.200 l3_stats off + ip address del dev $rp1.200 2001:db8:1::2/64 + ip address del dev $rp1.200 192.0.2.2/28 + ip link del dev $rp1.200 +} + +router_create() +{ + ip link set dev $rp1 up + router_rp1_200_create + + ip link set dev $rp2 up + vlan_create $rp2 
200 "" 192.0.2.17/28 2001:db8:2::2/64 +} + +router_destroy() +{ + vlan_destroy $rp2 200 + ip link set dev $rp2 down + + router_rp1_200_destroy + ip link set dev $rp1 down +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + rp1=${NETIFS[p2]} + + rp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + rp1mac=$(mac_get $rp1) + rp2mac=$(mac_get $rp2) + + vrf_prepare + + h1_create + h2_create + + router_create + + forwarding_enable +} + +cleanup() +{ + pre_cleanup + + forwarding_restore + + router_destroy + + h2_destroy + h1_destroy + + vrf_cleanup +} + +ping_ipv4() +{ + ping_test $h1.200 192.0.2.18 " IPv4" +} + +ping_ipv6() +{ + ping_test $h1.200 2001:db8:2::1 " IPv6" +} + +send_packets_rx_ipv4() +{ + # Send 21 packets instead of 20, because the first one might trap and go + # through the SW datapath, which might not bump the HW counter. + $MZ $h1.200 -c 21 -d 20msec -p 100 \ + -a own -b $rp1mac -A 192.0.2.1 -B 192.0.2.18 \ + -q -t udp sp=54321,dp=12345 +} + +send_packets_rx_ipv6() +{ + $MZ $h1.200 -6 -c 21 -d 20msec -p 100 \ + -a own -b $rp1mac -A 2001:db8:1::1 -B 2001:db8:2::1 \ + -q -t udp sp=54321,dp=12345 +} + +send_packets_tx_ipv4() +{ + $MZ $h2.200 -c 21 -d 20msec -p 100 \ + -a own -b $rp2mac -A 192.0.2.18 -B 192.0.2.1 \ + -q -t udp sp=54321,dp=12345 +} + +send_packets_tx_ipv6() +{ + $MZ $h2.200 -6 -c 21 -d 20msec -p 100 \ + -a own -b $rp2mac -A 2001:db8:2::1 -B 2001:db8:1::1 \ + -q -t udp sp=54321,dp=12345 +} + +___test_stats() +{ + local dir=$1; shift + local prot=$1; shift + + local a + local b + + a=$(hw_stats_get l3_stats $rp1.200 ${dir} packets) + send_packets_${dir}_${prot} + "$@" + b=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= $a + 20" \ + hw_stats_get l3_stats $rp1.200 ${dir} packets) + check_err $? 
"Traffic not reflected in the counter: $a -> $b" +} + +__test_stats() +{ + local dir=$1; shift + local prot=$1; shift + + RET=0 + ___test_stats "$dir" "$prot" + log_test "Test $dir packets: $prot" +} + +test_stats_rx_ipv4() +{ + __test_stats rx ipv4 +} + +test_stats_tx_ipv4() +{ + __test_stats tx ipv4 +} + +test_stats_rx_ipv6() +{ + __test_stats rx ipv6 +} + +test_stats_tx_ipv6() +{ + __test_stats tx ipv6 +} + +# Make sure everything works well even after stats have been disabled and +# reenabled on the same device without touching the L3 configuration. +respin_enablement() +{ + log_info "Turning stats off and on again" + ip stats set dev $rp1.200 l3_stats off + ip stats set dev $rp1.200 l3_stats on +} + +# For the initial run, l3_stats is enabled on a completely set up netdevice. Now +# do it the other way around: enabling the L3 stats on an L2 netdevice, and only +# then apply the L3 configuration. +reapply_config() +{ + log_info "Reapplying configuration" + + router_rp1_200_destroy + + ip link add name $rp1.200 link $rp1 type vlan id 200 + ip link set dev $rp1.200 addrgenmode none + ip stats set dev $rp1.200 l3_stats on + ip link set dev $rp1.200 addrgenmode eui64 + ip link set dev $rp1.200 up + ip address add dev $rp1.200 192.0.2.2/28 + ip address add dev $rp1.200 2001:db8:1::2/64 +} + +__test_stats_report() +{ + local dir=$1; shift + local prot=$1; shift + + local a + local b + + RET=0 + + a=$(hw_stats_get l3_stats $rp1.200 ${dir} packets) + send_packets_${dir}_${prot} + ip address flush dev $rp1.200 + b=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= $a + 20" \ + hw_stats_get l3_stats $rp1.200 ${dir} packets) + check_err $? 
"Traffic not reflected in the counter: $a -> $b" + log_test "Test ${dir} packets: stats pushed on loss of L3" + + ip stats set dev $rp1.200 l3_stats off + ip link del dev $rp1.200 + router_rp1_200_create +} + +test_stats_report_rx() +{ + __test_stats_report rx ipv4 +} + +test_stats_report_tx() +{ + __test_stats_report tx ipv4 +} + +test_destroy_enabled() +{ + RET=0 + + ip link del dev $rp1.200 + router_rp1_200_create + + log_test "Destroy l3_stats-enabled netdev" +} + +test_double_enable() +{ + RET=0 + ___test_stats rx ipv4 \ + ip stats set dev $rp1.200 l3_stats on + log_test "Test stat retention across a spurious enablement" +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +used=$(ip -j stats show dev $rp1.200 group offload subgroup hw_stats_info | + jq '.[].info.l3_stats.used') +[[ $used = true ]] +check_err $? "hw_stats_info.used=$used" +log_test "l3_stats offloaded" +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh b/tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh new file mode 100755 index 0000000000..a94d92e1ab --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh @@ -0,0 +1,111 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Test L3 stats on IP-in-IP GRE tunnel without key. + +# This test uses flat topology for IP tunneling tests. See ipip_lib.sh for more +# details. 
+ +ALL_TESTS=" + ping_ipv4 + test_stats_rx + test_stats_tx +" +NUM_NETIFS=6 +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/lib.sh +source "$lib_dir"/../../../net/forwarding/ipip_lib.sh +source "$lib_dir"/../../../net/forwarding/tc_common.sh + +setup_prepare() +{ + h1=${NETIFS[p1]} + ol1=${NETIFS[p2]} + + ul1=${NETIFS[p3]} + ul2=${NETIFS[p4]} + + ol2=${NETIFS[p5]} + h2=${NETIFS[p6]} + + ol1mac=$(mac_get $ol1) + + forwarding_enable + vrf_prepare + h1_create + h2_create + sw1_flat_create gre $ol1 $ul1 + sw2_flat_create gre $ol2 $ul2 + ip stats set dev g1a l3_stats on + ip stats set dev g2a l3_stats on +} + +cleanup() +{ + pre_cleanup + + ip stats set dev g1a l3_stats off + ip stats set dev g2a l3_stats off + + sw2_flat_destroy $ol2 $ul2 + sw1_flat_destroy $ol1 $ul1 + h2_destroy + h1_destroy + + vrf_cleanup + forwarding_restore +} + +ping_ipv4() +{ + RET=0 + + ping_test $h1 192.0.2.18 " gre flat" +} + +send_packets_ipv4() +{ + # Send 21 packets instead of 20, because the first one might trap and go + # through the SW datapath, which might not bump the HW counter. + $MZ $h1 -c 21 -d 20msec -p 100 \ + -a own -b $ol1mac -A 192.0.2.1 -B 192.0.2.18 \ + -q -t udp sp=54321,dp=12345 +} + +test_stats() +{ + local dev=$1; shift + local dir=$1; shift + + local a + local b + + RET=0 + + a=$(hw_stats_get l3_stats $dev $dir packets) + send_packets_ipv4 + b=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= $a + 20" \ + hw_stats_get l3_stats $dev $dir packets) + check_err $? 
"Traffic not reflected in the counter: $a -> $b" + + log_test "Test $dir packets: $prot" +} + +test_stats_tx() +{ + test_stats g1a tx +} + +test_stats_rx() +{ + test_stats g2a rx +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py new file mode 100644 index 0000000000..b582885786 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0 + +import sys +from pathlib import Path + +KSFT_DIR = (Path(__file__).parent / "../../../../..").resolve() + +try: + sys.path.append(KSFT_DIR.as_posix()) + from net.lib.py import * + from drivers.net.lib.py import * +except ModuleNotFoundError as e: + ksft_pr("Failed importing `net` library from kernel sources") + ksft_pr(str(e)) + ktap_result(True, comment="SKIP") + sys.exit(4) diff --git a/tools/testing/selftests/drivers/net/hw/loopback.sh b/tools/testing/selftests/drivers/net/hw/loopback.sh new file mode 100755 index 0000000000..5acc3ff820 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/loopback.sh @@ -0,0 +1,103 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. 
+ksft_skip=4 + +ALL_TESTS="loopback_test" +NUM_NETIFS=2 +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/tc_common.sh +source "$lib_dir"/../../../net/forwarding/lib.sh + +h1_create() +{ + simple_if_init $h1 192.0.2.1/24 + tc qdisc add dev $h1 clsact +} + +h1_destroy() +{ + tc qdisc del dev $h1 clsact + simple_if_fini $h1 192.0.2.1/24 +} + +h2_create() +{ + simple_if_init $h2 +} + +h2_destroy() +{ + simple_if_fini $h2 +} + +loopback_test() +{ + RET=0 + + tc filter add dev $h1 ingress protocol arp pref 1 handle 101 flower \ + skip_hw arp_op reply arp_tip 192.0.2.1 action drop + + $MZ $h1 -c 1 -t arp -q + + tc_check_packets "dev $h1 ingress" 101 1 + check_fail $? "Matched on a filter without loopback setup" + + ethtool -K $h1 loopback on + check_err $? "Failed to enable loopback" + + setup_wait_dev $h1 + + $MZ $h1 -c 1 -t arp -q + + tc_check_packets "dev $h1 ingress" 101 1 + check_err $? "Did not match on filter with loopback" + + ethtool -K $h1 loopback off + check_err $? "Failed to disable loopback" + + $MZ $h1 -c 1 -t arp -q + + tc_check_packets "dev $h1 ingress" 101 2 + check_fail $? 
"Matched on a filter after loopback was removed" + + tc filter del dev $h1 ingress protocol arp pref 1 handle 101 flower + + log_test "loopback" +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + h2=${NETIFS[p2]} + + vrf_prepare + + h1_create + h2_create + + if ethtool -k $h1 | grep loopback | grep -q fixed; then + log_test "SKIP: dev $h1 does not support loopback feature" + exit $ksft_skip + fi +} + +cleanup() +{ + pre_cleanup + + h2_destroy + h1_destroy + + vrf_cleanup +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py b/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py new file mode 100755 index 0000000000..026d98976c --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import time +import os +from lib.py import ksft_run, ksft_exit, ksft_pr +from lib.py import KsftSkipEx, KsftFailEx +from lib.py import NetdevFamily, NlError +from lib.py import NetDrvEpEnv +from lib.py import cmd, tool, GenerateTraffic + + +def _write_fail_config(config): + for key, value in config.items(): + with open("/sys/kernel/debug/fail_function/" + key, "w") as fp: + fp.write(str(value) + "\n") + + +def _enable_pp_allocation_fail(): + if not os.path.exists("/sys/kernel/debug/fail_function"): + raise KsftSkipEx("Kernel built without function error injection (or DebugFS)") + + if not os.path.exists("/sys/kernel/debug/fail_function/page_pool_alloc_pages"): + with open("/sys/kernel/debug/fail_function/inject", "w") as fp: + fp.write("page_pool_alloc_pages\n") + + _write_fail_config({ + "verbose": 0, + "interval": 511, + "probability": 100, + "times": -1, + }) + + +def _disable_pp_allocation_fail(): + if not os.path.exists("/sys/kernel/debug/fail_function"): + return + + if os.path.exists("/sys/kernel/debug/fail_function/page_pool_alloc_pages"): + with 
open("/sys/kernel/debug/fail_function/inject", "w") as fp: + fp.write("\n") + + _write_fail_config({ + "probability": 0, + "times": 0, + }) + + +def test_pp_alloc(cfg, netdevnl): + def get_stats(): + return netdevnl.qstats_get({"ifindex": cfg.ifindex}, dump=True)[0] + + def check_traffic_flowing(): + stat1 = get_stats() + time.sleep(1) + stat2 = get_stats() + if stat2['rx-packets'] - stat1['rx-packets'] < 15000: + raise KsftFailEx("Traffic seems low:", stat2['rx-packets'] - stat1['rx-packets']) + + + try: + stats = get_stats() + except NlError as e: + if e.nl_msg.error == -95: + stats = {} + else: + raise + if 'rx-alloc-fail' not in stats: + raise KsftSkipEx("Driver does not report 'rx-alloc-fail' via qstats") + + set_g = False + traffic = None + try: + traffic = GenerateTraffic(cfg) + + check_traffic_flowing() + + _enable_pp_allocation_fail() + + s1 = get_stats() + time.sleep(3) + s2 = get_stats() + + if s2['rx-alloc-fail'] - s1['rx-alloc-fail'] < 1: + raise KsftSkipEx("Allocation failures not increasing") + if s2['rx-alloc-fail'] - s1['rx-alloc-fail'] < 100: + raise KsftSkipEx("Allocation increasing too slowly", s2['rx-alloc-fail'] - s1['rx-alloc-fail'], + "packets:", s2['rx-packets'] - s1['rx-packets']) + + # Basic failures are fine, try to wobble some settings to catch extra failures + check_traffic_flowing() + g = tool("ethtool", "-g " + cfg.ifname, json=True)[0] + if 'rx' in g and g["rx"] * 2 <= g["rx-max"]: + new_g = g['rx'] * 2 + elif 'rx' in g: + new_g = g['rx'] // 2 + else: + new_g = None + + if new_g: + set_g = cmd(f"ethtool -G {cfg.ifname} rx {new_g}", fail=False).ret == 0 + if set_g: + ksft_pr("ethtool -G change retval: success") + else: + ksft_pr("ethtool -G change retval: did not succeed", new_g) + else: + ksft_pr("ethtool -G change retval: did not try") + + time.sleep(0.1) + check_traffic_flowing() + finally: + _disable_pp_allocation_fail() + if traffic: + traffic.stop() + time.sleep(0.1) + if set_g: + cmd(f"ethtool -G {cfg.ifname} rx {g['rx']}") + 
+ +def main() -> None: + netdevnl = NetdevFamily() + with NetDrvEpEnv(__file__, nsim_test=False) as cfg: + + ksft_run([test_pp_alloc], args=(cfg, netdevnl, )) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/hw/settings b/tools/testing/selftests/drivers/net/hw/settings new file mode 100644 index 0000000000..e7b9417537 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/settings @@ -0,0 +1 @@ +timeout=0 diff --git a/tools/testing/selftests/drivers/net/lib/py/__init__.py b/tools/testing/selftests/drivers/net/lib/py/__init__.py new file mode 100644 index 0000000000..401e70f7f1 --- /dev/null +++ b/tools/testing/selftests/drivers/net/lib/py/__init__.py @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0 + +import sys +from pathlib import Path + +KSFT_DIR = (Path(__file__).parent / "../../../..").resolve() + +try: + sys.path.append(KSFT_DIR.as_posix()) + from net.lib.py import * +except ModuleNotFoundError as e: + ksft_pr("Failed importing `net` library from kernel sources") + ksft_pr(str(e)) + ktap_result(True, comment="SKIP") + sys.exit(4) + +from .env import * +from .load import * +from .remote import Remote diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py new file mode 100644 index 0000000000..edcedd7bff --- /dev/null +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -0,0 +1,224 @@ +# SPDX-License-Identifier: GPL-2.0 + +import os +from pathlib import Path +from lib.py import KsftSkipEx, KsftXfailEx +from lib.py import cmd, ip +from lib.py import NetNS, NetdevSimDev +from .remote import Remote + + +def _load_env_file(src_path): + env = os.environ.copy() + + src_dir = Path(src_path).parent.resolve() + if not (src_dir / "net.config").exists(): + return env + + with open((src_dir / "net.config").as_posix(), 'r') as fp: + for line in fp.readlines(): + full_file = line + # Strip comments + pos = line.find("#") + if pos >= 0: + line = 
line[:pos] + line = line.strip() + if not line: + continue + pair = line.split('=', maxsplit=1) + if len(pair) != 2: + raise Exception("Can't parse configuration line:", full_file) + env[pair[0]] = pair[1] + return env + + +class NetDrvEnv: + """ + Class for a single NIC / host env, with no remote end + """ + def __init__(self, src_path, **kwargs): + self._ns = None + + self.env = _load_env_file(src_path) + + if 'NETIF' in self.env: + self.dev = ip("link show dev " + self.env['NETIF'], json=True)[0] + else: + self._ns = NetdevSimDev(**kwargs) + self.dev = self._ns.nsims[0].dev + self.ifindex = self.dev['ifindex'] + + def __enter__(self): + ip(f"link set dev {self.dev['ifname']} up") + + return self + + def __exit__(self, ex_type, ex_value, ex_tb): + """ + __exit__ gets called at the end of a "with" block. + """ + self.__del__() + + def __del__(self): + if self._ns: + self._ns.remove() + self._ns = None + + +class NetDrvEpEnv: + """ + Class for an environment with a local device and "remote endpoint" + which can be used to send traffic in. + + For local testing it creates two network namespaces and a pair + of netdevsim devices. + """ + + # Network prefixes used for local tests + nsim_v4_pfx = "192.0.2." 
+ nsim_v6_pfx = "2001:db8::" + + def __init__(self, src_path, nsim_test=None): + + self.env = _load_env_file(src_path) + + # Things we try to destroy + self.remote = None + # These are for local testing state + self._netns = None + self._ns = None + self._ns_peer = None + + if "NETIF" in self.env: + if nsim_test is True: + raise KsftXfailEx("Test only works on netdevsim") + self._check_env() + + self.dev = ip("link show dev " + self.env['NETIF'], json=True)[0] + + self.v4 = self.env.get("LOCAL_V4") + self.v6 = self.env.get("LOCAL_V6") + self.remote_v4 = self.env.get("REMOTE_V4") + self.remote_v6 = self.env.get("REMOTE_V6") + kind = self.env["REMOTE_TYPE"] + args = self.env["REMOTE_ARGS"] + else: + if nsim_test is False: + raise KsftXfailEx("Test does not work on netdevsim") + + self.create_local() + + self.dev = self._ns.nsims[0].dev + + self.v4 = self.nsim_v4_pfx + "1" + self.v6 = self.nsim_v6_pfx + "1" + self.remote_v4 = self.nsim_v4_pfx + "2" + self.remote_v6 = self.nsim_v6_pfx + "2" + kind = "netns" + args = self._netns.name + + self.remote = Remote(kind, args, src_path) + + self.addr = self.v6 if self.v6 else self.v4 + self.remote_addr = self.remote_v6 if self.remote_v6 else self.remote_v4 + + self.addr_ipver = "6" if self.v6 else "4" + # Bracketed addresses, some commands need IPv6 to be inside [] + self.baddr = f"[{self.v6}]" if self.v6 else self.v4 + self.remote_baddr = f"[{self.remote_v6}]" if self.remote_v6 else self.remote_v4 + + self.ifname = self.dev['ifname'] + self.ifindex = self.dev['ifindex'] + + self._required_cmd = {} + + def create_local(self): + self._netns = NetNS() + self._ns = NetdevSimDev() + self._ns_peer = NetdevSimDev(ns=self._netns) + + with open("/proc/self/ns/net") as nsfd0, \ + open("/var/run/netns/" + self._netns.name) as nsfd1: + ifi0 = self._ns.nsims[0].ifindex + ifi1 = self._ns_peer.nsims[0].ifindex + NetdevSimDev.ctrl_write('link_device', + f'{nsfd0.fileno()}:{ifi0} {nsfd1.fileno()}:{ifi1}') + + ip(f" addr add dev 
{self._ns.nsims[0].ifname} {self.nsim_v4_pfx}1/24") + ip(f"-6 addr add dev {self._ns.nsims[0].ifname} {self.nsim_v6_pfx}1/64 nodad") + ip(f" link set dev {self._ns.nsims[0].ifname} up") + + ip(f" addr add dev {self._ns_peer.nsims[0].ifname} {self.nsim_v4_pfx}2/24", ns=self._netns) + ip(f"-6 addr add dev {self._ns_peer.nsims[0].ifname} {self.nsim_v6_pfx}2/64 nodad", ns=self._netns) + ip(f" link set dev {self._ns_peer.nsims[0].ifname} up", ns=self._netns) + + def _check_env(self): + vars_needed = [ + ["LOCAL_V4", "LOCAL_V6"], + ["REMOTE_V4", "REMOTE_V6"], + ["REMOTE_TYPE"], + ["REMOTE_ARGS"] + ] + missing = [] + + for choice in vars_needed: + for entry in choice: + if entry in self.env: + break + else: + missing.append(choice) + # Make sure v4 / v6 configs are symmetric + if ("LOCAL_V6" in self.env) != ("REMOTE_V6" in self.env): + missing.append(["LOCAL_V6", "REMOTE_V6"]) + if ("LOCAL_V4" in self.env) != ("REMOTE_V4" in self.env): + missing.append(["LOCAL_V4", "REMOTE_V4"]) + if missing: + raise Exception("Invalid environment, missing configuration:", missing, + "Please see tools/testing/selftests/drivers/net/README.rst") + + def __enter__(self): + return self + + def __exit__(self, ex_type, ex_value, ex_tb): + """ + __exit__ gets called at the end of a "with" block. 
+ """ + self.__del__() + + def __del__(self): + if self._ns: + self._ns.remove() + self._ns = None + if self._ns_peer: + self._ns_peer.remove() + self._ns_peer = None + if self._netns: + del self._netns + self._netns = None + if self.remote: + del self.remote + self.remote = None + + def require_v4(self): + if not self.v4 or not self.remote_v4: + raise KsftSkipEx("Test requires IPv4 connectivity") + + def require_v6(self): + if not self.v6 or not self.remote_v6: + raise KsftSkipEx("Test requires IPv6 connectivity") + + def _require_cmd(self, comm, key, host=None): + cached = self._required_cmd.get(comm, {}) + if cached.get(key) is None: + cached[key] = cmd("command -v -- " + comm, fail=False, + shell=True, host=host).ret == 0 + self._required_cmd[comm] = cached + return cached[key] + + def require_cmd(self, comm, local=True, remote=False): + if local: + if not self._require_cmd(comm, "local"): + raise KsftSkipEx("Test requires command: " + comm) + if remote: + if not self._require_cmd(comm, "remote"): + raise KsftSkipEx("Test requires (remote) command: " + comm) diff --git a/tools/testing/selftests/drivers/net/lib/py/load.py b/tools/testing/selftests/drivers/net/lib/py/load.py new file mode 100644 index 0000000000..abdb677bdb --- /dev/null +++ b/tools/testing/selftests/drivers/net/lib/py/load.py @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: GPL-2.0 + +import time + +from lib.py import ksft_pr, cmd, ip, rand_port, wait_port_listen + +class GenerateTraffic: + def __init__(self, env): + env.require_cmd("iperf3", remote=True) + + self.env = env + + port = rand_port() + self._iperf_server = cmd(f"iperf3 -s -p {port}", background=True) + wait_port_listen(port) + time.sleep(0.1) + self._iperf_client = cmd(f"iperf3 -c {env.addr} -P 16 -p {port} -t 86400", + background=True, host=env.remote) + + # Wait for traffic to ramp up + pkt = ip("-s link show dev " + env.ifname, json=True)[0]["stats64"]["rx"]["packets"] + for _ in range(50): + time.sleep(0.1) + now = ip("-s link show 
dev " + env.ifname, json=True)[0]["stats64"]["rx"]["packets"] + if now - pkt > 1000: + return + pkt = now + self.stop(verbose=True) + raise Exception("iperf3 traffic did not ramp up") + + def stop(self, verbose=None): + self._iperf_client.process(terminate=True) + if verbose: + ksft_pr(">> Client:") + ksft_pr(self._iperf_client.stdout) + ksft_pr(self._iperf_client.stderr) + self._iperf_server.process(terminate=True) + if verbose: + ksft_pr(">> Server:") + ksft_pr(self._iperf_server.stdout) + ksft_pr(self._iperf_server.stderr) diff --git a/tools/testing/selftests/drivers/net/lib/py/remote.py b/tools/testing/selftests/drivers/net/lib/py/remote.py new file mode 100644 index 0000000000..b1780b9877 --- /dev/null +++ b/tools/testing/selftests/drivers/net/lib/py/remote.py @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0 + +import os +import importlib + +_modules = {} + +def Remote(kind, args, src_path): + global _modules + + if kind not in _modules: + _modules[kind] = importlib.import_module("..remote_" + kind, __name__) + + dir_path = os.path.abspath(src_path + "/../") + return getattr(_modules[kind], "Remote")(args, dir_path) diff --git a/tools/testing/selftests/drivers/net/lib/py/remote_netns.py b/tools/testing/selftests/drivers/net/lib/py/remote_netns.py new file mode 100644 index 0000000000..7d5eeb0271 --- /dev/null +++ b/tools/testing/selftests/drivers/net/lib/py/remote_netns.py @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: GPL-2.0 + +import os +import subprocess + +from lib.py import cmd + + +class Remote: + def __init__(self, name, dir_path): + self.name = name + self.dir_path = dir_path + + def cmd(self, comm): + return subprocess.Popen(["ip", "netns", "exec", self.name, "bash", "-c", comm], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + def deploy(self, what): + if os.path.isabs(what): + return what + return os.path.abspath(self.dir_path + "/" + what) diff --git a/tools/testing/selftests/drivers/net/lib/py/remote_ssh.py 
b/tools/testing/selftests/drivers/net/lib/py/remote_ssh.py new file mode 100644 index 0000000000..924addde19 --- /dev/null +++ b/tools/testing/selftests/drivers/net/lib/py/remote_ssh.py @@ -0,0 +1,39 @@ +# SPDX-License-Identifier: GPL-2.0 + +import os +import string +import subprocess +import random + +from lib.py import cmd + + +class Remote: + def __init__(self, name, dir_path): + self.name = name + self.dir_path = dir_path + self._tmpdir = None + + def __del__(self): + if self._tmpdir: + cmd("rm -rf " + self._tmpdir, host=self) + self._tmpdir = None + + def cmd(self, comm): + return subprocess.Popen(["ssh", "-q", self.name, comm], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + def _mktmp(self): + return ''.join(random.choice(string.ascii_lowercase) for _ in range(8)) + + def deploy(self, what): + if not self._tmpdir: + self._tmpdir = "/tmp/" + self._mktmp() + cmd("mkdir " + self._tmpdir, host=self) + file_name = self._tmpdir + "/" + self._mktmp() + os.path.basename(what) + + if not os.path.isabs(what): + what = os.path.abspath(self.dir_path + "/" + what) + + cmd(f"scp {what} {self.name}:{file_name}") + return file_name diff --git a/tools/testing/selftests/drivers/net/microchip/ksz9477_qos.sh b/tools/testing/selftests/drivers/net/microchip/ksz9477_qos.sh new file mode 100755 index 0000000000..82be5d0133 --- /dev/null +++ b/tools/testing/selftests/drivers/net/microchip/ksz9477_qos.sh @@ -0,0 +1,668 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2024 Pengutronix, Oleksij Rempel + +# The script is adopted to work with the Microchip KSZ switch driver. 
+ +ETH_FCS_LEN=4 + +WAIT_TIME=1 +NUM_NETIFS=4 +REQUIRE_JQ="yes" +REQUIRE_MZ="yes" +STABLE_MAC_ADDRS=yes +NETIF_CREATE=no +lib_dir=$(dirname $0)/../../../net/forwarding +source $lib_dir/tc_common.sh +source $lib_dir/lib.sh + +require_command dcb + +h1=${NETIFS[p1]} +swp1=${NETIFS[p2]} +swp2=${NETIFS[p3]} +h2=${NETIFS[p4]} + +H1_IPV4="192.0.2.1" +H2_IPV4="192.0.2.2" +H1_IPV6="2001:db8:1::1" +H2_IPV6="2001:db8:1::2" + +# On h1_ and h2_create do not set IP addresses to avoid interaction with the +# system, to keep packet counters clean. +h1_create() +{ + simple_if_init $h1 + sysctl_set net.ipv6.conf.${h1}.disable_ipv6 1 + # Get the MAC address of the interface to use it with mausezahn + h1_mac=$(ip -j link show dev ${h1} | jq -e '.[].address') +} + +h1_destroy() +{ + sysctl_restore net.ipv6.conf.${h1}.disable_ipv6 + simple_if_fini $h1 +} + +h2_create() +{ + simple_if_init $h2 + sysctl_set net.ipv6.conf.${h2}.disable_ipv6 1 + h2_mac=$(ip -j link show dev ${h2} | jq -e '.[].address') +} + +h2_destroy() +{ + sysctl_restore net.ipv6.conf.${h2}.disable_ipv6 + simple_if_fini $h2 +} + +switch_create() +{ + ip link set ${swp1} up + ip link set ${swp2} up + sysctl_set net.ipv6.conf.${swp1}.disable_ipv6 1 + sysctl_set net.ipv6.conf.${swp2}.disable_ipv6 1 + + # Ports should trust VLAN PCP even with vlan_filtering=0 + ip link add br0 type bridge + ip link set ${swp1} master br0 + ip link set ${swp2} master br0 + ip link set br0 up + sysctl_set net.ipv6.conf.br0.disable_ipv6 1 +} + +switch_destroy() +{ + sysctl_restore net.ipv6.conf.${swp2}.disable_ipv6 + sysctl_restore net.ipv6.conf.${swp1}.disable_ipv6 + + ip link del br0 +} + +setup_prepare() +{ + vrf_prepare + + h1_create + h2_create + switch_create +} + +cleanup() +{ + pre_cleanup + + h2_destroy + h1_destroy + switch_destroy + + vrf_cleanup +} + +set_apptrust_order() +{ + local if_name=$1 + local order=$2 + + dcb apptrust set dev ${if_name} order ${order} +} + +# Function to extract a specified field from a given JSON stats 
string +extract_network_stat() { + local stats_json=$1 + local field_name=$2 + + echo $(echo "$stats_json" | jq -r "$field_name") +} + +run_test() +{ + local test_name=$1; + local apptrust_order=$2; + local port_prio=$3; + local dscp_ipv=$4; + local dscp=$5; + local have_vlan=$6; + local pcp_ipv=$7; + local vlan_pcp=$8; + local ip_v6=$9 + + local rx_ipv + local tx_ipv + + RET=0 + + # Send some packet to populate the switch MAC table + $MZ ${h2} -a ${h2_mac} -b ${h1_mac} -p 64 -t icmp echores -c 1 + + # Based on the apptrust order, set the expected Internal Priority values + # for the RX and TX paths. + if [ "${apptrust_order}" == "" ]; then + echo "Apptrust order not set." + rx_ipv=${port_prio} + tx_ipv=${port_prio} + elif [ "${apptrust_order}" == "dscp" ]; then + echo "Apptrust order is DSCP." + rx_ipv=${dscp_ipv} + tx_ipv=${dscp_ipv} + elif [ "${apptrust_order}" == "pcp" ]; then + echo "Apptrust order is PCP." + rx_ipv=${pcp_ipv} + tx_ipv=${pcp_ipv} + elif [ "${apptrust_order}" == "pcp dscp" ]; then + echo "Apptrust order is PCP DSCP." + if [ ${have_vlan} -eq 1 ]; then + rx_ipv=$((dscp_ipv > pcp_ipv ? dscp_ipv : pcp_ipv)) + tx_ipv=${pcp_ipv} + else + rx_ipv=${dscp_ipv} + tx_ipv=${dscp_ipv} + fi + else + RET=1 + echo "Error: Unknown apptrust order ${apptrust_order}" + log_test "${test_name}" + return + fi + + # Most/all? of the KSZ switches do not provide per-TC counters. There + # are only tx_hi and rx_hi counters, which are used to count packets + # which are considered as high priority and most likely not assigned + # to the queue 0. + # On the ingress path, packets seem to get high priority status + # independently of the DSCP or PCP global mapping. On the egress path, + # the high priority status is assigned based on the DSCP or PCP global + # map configuration. 
+ # The thresholds for the high priority status are not documented, but + # it seems that the switch considers packets as high priority on the + # ingress path if detected Internal Priority is greater than 0. On the + # egress path, the switch considers packets as high priority if + # detected Internal Priority is greater than 1. + if [ ${rx_ipv} -ge 1 ]; then + local expect_rx_high_prio=1 + else + local expect_rx_high_prio=0 + fi + + if [ ${tx_ipv} -ge 2 ]; then + local expect_tx_high_prio=1 + else + local expect_tx_high_prio=0 + fi + + # Use ip tool to get the current switch packet counters. ethool stats + # need to be recalculated to get the correct values. + local swp1_stats=$(ip -s -j link show dev ${swp1}) + local swp2_stats=$(ip -s -j link show dev ${swp2}) + local swp1_rx_packets_before=$(extract_network_stat "$swp1_stats" \ + '.[0].stats64.rx.packets') + local swp1_rx_bytes_before=$(extract_network_stat "$swp1_stats" \ + '.[0].stats64.rx.bytes') + local swp2_tx_packets_before=$(extract_network_stat "$swp2_stats" \ + '.[0].stats64.tx.packets') + local swp2_tx_bytes_before=$(extract_network_stat "$swp2_stats" \ + '.[0].stats64.tx.bytes') + local swp1_rx_hi_before=$(ethtool_stats_get ${swp1} "rx_hi") + local swp2_tx_hi_before=$(ethtool_stats_get ${swp2} "tx_hi") + + # Assamble the mausezahn command based on the test parameters + # For the testis with ipv4 or ipv6, use icmp response packets, + # to avoid interaction with the system, to keep packet counters + # clean. 
+ if [ ${ip_v6} -eq 0 ]; then + local ip="-a ${h1_mac} -b ${h2_mac} -A ${H1_IPV4} \ + -B ${H2_IPV4} -t icmp unreach,code=1,dscp=${dscp}" + else + local ip="-6 -a ${h1_mac} -b ${h2_mac} -A ${H1_IPV6} \ + -B ${H2_IPV6} -t icmp6 type=1,code=0,dscp=${dscp}" + fi + + if [ ${have_vlan} -eq 1 ]; then + local vlan_pcp_opt="-Q ${vlan_pcp}:0" + else + local vlan_pcp_opt="" + fi + $MZ ${h1} ${ip} -c ${PING_COUNT} -d 10msec ${vlan_pcp_opt} + + # Wait until the switch packet counters are updated + sleep 6 + + local swp1_stats=$(ip -s -j link show dev ${swp1}) + local swp2_stats=$(ip -s -j link show dev ${swp2}) + + local swp1_rx_packets_after=$(extract_network_stat "$swp1_stats" \ + '.[0].stats64.rx.packets') + local swp1_rx_bytes_after=$(extract_network_stat "$swp1_stats" \ + '.[0].stats64.rx.bytes') + local swp2_tx_packets_after=$(extract_network_stat "$swp2_stats" \ + '.[0].stats64.tx.packets') + local swp2_tx_bytes_after=$(extract_network_stat "$swp2_stats" \ + '.[0].stats64.tx.bytes') + + local swp1_rx_packets_diff=$((${swp1_rx_packets_after} - \ + ${swp1_rx_packets_before})) + local swp2_tx_packets_diff=$((${swp2_tx_packets_after} - \ + ${swp2_tx_packets_before})) + + local swp1_rx_hi_after=$(ethtool_stats_get ${swp1} "rx_hi") + local swp2_tx_hi_after=$(ethtool_stats_get ${swp2} "tx_hi") + + # Test if any packets were received on swp1, we will rx before and after + if [ ${swp1_rx_packets_diff} -lt ${PING_COUNT} ]; then + echo "Not expected amount of received packets on ${swp1}" + echo "before ${swp1_rx_packets_before} after ${swp1_rx_packets_after}" + RET=1 + fi + + # Test if any packets were transmitted on swp2, we will tx before and after + if [ ${swp2_tx_packets_diff} -lt ${PING_COUNT} ]; then + echo "Not expected amount of transmitted packets on ${swp2}" + echo "before ${swp2_tx_packets_before} after ${swp2_tx_packets_after}" + RET=1 + fi + + # tx/rx_hi counted in bytes. 
So, we need to compare the difference in bytes + local swp1_rx_bytes_diff=$(($swp1_rx_bytes_after - $swp1_rx_bytes_before)) + local swp2_tx_bytes_diff=$(($swp2_tx_bytes_after - $swp2_tx_bytes_before)) + local swp1_rx_hi_diff=$(($swp1_rx_hi_after - $swp1_rx_hi_before)) + local swp2_tx_hi_diff=$(($swp2_tx_hi_after - $swp2_tx_hi_before)) + + if [ ${expect_rx_high_prio} -eq 1 ]; then + swp1_rx_hi_diff=$((${swp1_rx_hi_diff} - \ + ${swp1_rx_packets_diff} * ${ETH_FCS_LEN})) + if [ ${swp1_rx_hi_diff} -ne ${swp1_rx_bytes_diff} ]; then + echo "Not expected amount of high priority packets received on ${swp1}" + echo "RX hi diff: ${swp1_rx_hi_diff}, expected RX bytes diff: ${swp1_rx_bytes_diff}" + RET=1 + fi + else + if [ ${swp1_rx_hi_diff} -ne 0 ]; then + echo "Unexpected amount of high priority packets received on ${swp1}" + echo "RX hi diff: ${swp1_rx_hi_diff}, expected 0" + RET=1 + fi + fi + + if [ ${expect_tx_high_prio} -eq 1 ]; then + swp2_tx_hi_diff=$((${swp2_tx_hi_diff} - \ + ${swp2_tx_packets_diff} * ${ETH_FCS_LEN})) + if [ ${swp2_tx_hi_diff} -ne ${swp2_tx_bytes_diff} ]; then + echo "Not expected amount of high priority packets transmitted on ${swp2}" + echo "TX hi diff: ${swp2_tx_hi_diff}, expected TX bytes diff: ${swp2_tx_bytes_diff}" + RET=1 + fi + else + if [ ${swp2_tx_hi_diff} -ne 0 ]; then + echo "Unexpected amount of high priority packets transmitted on ${swp2}" + echo "TX hi diff: ${swp2_tx_hi_diff}, expected 0" + RET=1 + fi + fi + + log_test "${test_name}" +} + +run_test_dscp() +{ + # IPv4 test + run_test "$1" "$2" "$3" "$4" "$5" 0 0 0 0 + # IPv6 test + run_test "$1" "$2" "$3" "$4" "$5" 0 0 0 1 +} + +run_test_dscp_pcp() +{ + # IPv4 test + run_test "$1" "$2" "$3" "$4" "$5" 1 "$6" "$7" 0 + # IPv6 test + run_test "$1" "$2" "$3" "$4" "$5" 1 "$6" "$7" 1 +} + +port_default_prio_get() +{ + local if_name=$1 + local prio + + prio="$(dcb -j app show dev ${if_name} default-prio | \ + jq '.default_prio[]')" + if [ -z "${prio}" ]; then + prio=0 + fi + + echo ${prio} +} + 
+test_port_default() +{ + local orig_apptrust=$(port_get_default_apptrust ${swp1}) + local orig_prio=$(port_default_prio_get ${swp1}) + local apptrust_order="" + + RET=0 + + # Make sure no other priority sources will interfere with the test + set_apptrust_order ${swp1} "${apptrust_order}" + + for val in $(seq 0 7); do + dcb app replace dev ${swp1} default-prio ${val} + if [ $val -ne $(port_default_prio_get ${swp1}) ]; then + RET=1 + break + fi + + run_test_dscp "Port-default QoS classification, prio: ${val}" \ + "${apptrust_order}" ${val} 0 0 + done + + set_apptrust_order ${swp1} "${orig_apptrust}" + if [[ "$orig_apptrust" != "$(port_get_default_apptrust ${swp1})" ]]; then + RET=1 + fi + + dcb app replace dev ${swp1} default-prio ${orig_prio} + if [ $orig_prio -ne $(port_default_prio_get ${swp1}) ]; then + RET=1 + fi + + log_test "Port-default QoS classification" +} + +port_get_default_apptrust() +{ + local if_name=$1 + + dcb -j apptrust show dev ${if_name} | jq -r '.order[]' | \ + tr '\n' ' ' | xargs +} + +test_port_apptrust() +{ + local original_dscp_prios_swp1=$(get_dscp_prios ${swp1}) + local orig_apptrust=$(port_get_default_apptrust ${swp1}) + local orig_port_prio=$(port_default_prio_get ${swp1}) + local order_variants=("pcp dscp" "dscp" "pcp") + local apptrust_order + local port_prio + local dscp_prio + local pcp_prio + local dscp + local pcp + + RET=0 + + # First, test if apptrust configuration as taken by the kernel + for order in "${order_variants[@]}"; do + set_apptrust_order ${swp1} "${order}" + if [[ "$order" != "$(port_get_default_apptrust ${swp1})" ]]; then + RET=1 + break + fi + done + + log_test "Apptrust, supported variants" + + # To test if the apptrust configuration is working as expected, we need + # to set DSCP priorities for the switch port. + init_dscp_prios "${swp1}" "${original_dscp_prios_swp1}" + + # Start with a simple test where all apptrust sources are disabled + # default port priority is 0, DSCP priority is mapped to 7. 
+ # No high priority packets should be received or transmitted. + port_prio=0 + dscp_prio=7 + dscp=4 + + dcb app replace dev ${swp1} default-prio ${port_prio} + dcb app replace dev ${swp1} dscp-prio ${dscp}:${dscp_prio} + + apptrust_order="" + set_apptrust_order ${swp1} "${apptrust_order}" + # Test with apptrust sources disabled, Packets should get port default + # priority which is 0 + run_test_dscp "Apptrust, all disabled. DSCP-prio ${dscp}:${dscp_prio}" \ + "${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} + + apptrust_order="pcp" + set_apptrust_order ${swp1} "${apptrust_order}" + # If PCP is enabled, packets should get PCP priority, which is not + # set in this test (no VLAN tags are present in the packet). No high + # priority packets should be received or transmitted. + run_test_dscp "Apptrust, PCP enabled. DSCP-prio ${dscp}:${dscp_prio}" \ + "${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} + + apptrust_order="dscp" + set_apptrust_order ${swp1} "${apptrust_order}" + # If DSCP is enabled, packets should get DSCP priority which is set to 7 + # in this test. High priority packets should be received and transmitted. + run_test_dscp "Apptrust, DSCP enabled. DSCP-prio ${dscp}:${dscp_prio}" \ + "${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} + + apptrust_order="pcp dscp" + set_apptrust_order ${swp1} "${apptrust_order}" + # If PCP and DSCP are enabled, PCP would have higher apptrust priority + # so packets should get PCP priority. But in this test VLAN PCP is not + # set, so it should get DSCP priority which is set to 7. High priority + # packets should be received and transmitted. + run_test_dscp "Apptrust, PCP and DSCP are enabled. DSCP-prio ${dscp}:${dscp_prio}" \ + "${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} + + # If VLAN PCP is set, it should have higher apptrust priority than DSCP + # so packets should get VLAN PCP priority. Send packets with VLAN PCP + # set to 0, DSCP set to 7. Packets should get VLAN PCP priority. 
+ # No high priority packets should be transmitted. Due to nature of the + # switch, high priority packets will be received. + pcp_prio=0 + pcp=0 + run_test_dscp_pcp "Apptrust, PCP and DSCP are enabled. PCP ${pcp_prio}, DSCP-prio ${dscp}:${dscp_prio}" \ + "${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} ${pcp_prio} ${pcp} + + # If VLAN PCP is set to 7, it should have higher apptrust priority than + # DSCP so packets should get VLAN PCP priority. Send packets with VLAN + # PCP set to 7, DSCP set to 7. Packets should get VLAN PCP priority. + # High priority packets should be received and transmitted. + pcp_prio=7 + pcp=7 + run_test_dscp_pcp "Apptrust, PCP and DSCP are enabled. PCP ${pcp_prio}, DSCP-prio ${dscp}:${dscp_prio}" \ + "${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} ${pcp_prio} ${pcp} + # Now make sure that the switch is able to handle the case where DSCP + # priority is set to 0 and PCP priority is set to 7. Packets should get + # PCP priority. High priority packets should be received and transmitted. + dscp_prio=0 + dcb app replace dev ${swp1} dscp-prio ${dscp}:${dscp_prio} + run_test_dscp_pcp "Apptrust, PCP and DSCP are enabled. PCP ${pcp_prio}, DSCP-prio ${dscp}:${dscp_prio}" \ + "${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} ${pcp_prio} ${pcp} + # If both VLAN PCP and DSCP are set to 0, packets should get 0 priority. + # No high priority packets should be received or transmitted. + pcp_prio=0 + pcp=0 + run_test_dscp_pcp "Apptrust, PCP and DSCP are enabled. PCP ${pcp_prio}, DSCP-prio ${dscp}:${dscp_prio}" \ + "${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} ${pcp_prio} ${pcp} + + # Restore original priorities + if ! 
restore_priorities "${swp1}" "${original_dscp_prios_swp1}"; then + RET=1 + fi + + set_apptrust_order ${swp1} "${orig_apptrust}" + if [ "$orig_apptrust" != "$(port_get_default_apptrust ${swp1})" ]; then + RET=1 + fi + + dcb app replace dev ${swp1} default-prio ${orig_port_prio} + if [ $orig_port_prio -ne $(port_default_prio_get ${swp1}) ]; then + RET=1 + fi + + log_test "Apptrust, restore original settings" +} + +# Function to get current DSCP priorities +get_dscp_prios() { + local if_name=$1 + dcb -j app show dev ${if_name} | jq -c '.dscp_prio' +} + +# Function to set a specific DSCP priority on a device +replace_dscp_prio() { + local if_name=$1 + local dscp=$2 + local prio=$3 + dcb app replace dev ${if_name} dscp-prio ${dscp}:${prio} +} + +# Function to compare DSCP maps +compare_dscp_maps() { + local old_json=$1 + local new_json=$2 + local dscp=$3 + local prio=$4 + + # Create a modified old_json with the expected change for comparison + local modified_old_json=$(echo "$old_json" | + jq --argjson dscp $dscp --argjson prio $prio \ + 'map(if .[0] == $dscp then [$dscp, $prio] else . end)' | + tr -d " \n") + + # Compare new_json with the modified_old_json + if [[ "$modified_old_json" == "$new_json" ]]; then + return 0 + else + return 1 + fi +} + +# Function to set DSCP priorities +set_and_verify_dscp() { + local port=$1 + local dscp=$2 + local new_prio=$3 + + local old_prios=$(get_dscp_prios $port) + + replace_dscp_prio "$port" $dscp $new_prio + + # Fetch current settings and compare + local current_prios=$(get_dscp_prios $port) + if ! compare_dscp_maps "$old_prios" "$current_prios" $dscp $new_prio; then + echo "Error: Unintended changes detected in DSCP map for $port after setting DSCP $dscp to $new_prio." 
+ return 1 + fi + return 0 +} + +# Function to restore original priorities +restore_priorities() { + local port=$1 + local original_prios=$2 + + echo "Removing test artifacts for $port" + local current_prios=$(get_dscp_prios $port) + local prio_str=$(echo "$current_prios" | + jq -r 'map("\(.[0]):\(.[1])") | join(" ")') + dcb app del dev $port dscp-prio $prio_str + + echo "Restoring original DSCP priorities for $port" + local restore_str=$(echo "$original_prios" | + jq -r 'map("\(.[0]):\(.[1])") | join(" ")') + dcb app add dev $port dscp-prio $restore_str + + local current_prios=$(get_dscp_prios $port) + if [[ "$original_prios" != "$current_prios" ]]; then + echo "Error: Failed to restore original DSCP priorities for $port" + return 1 + fi + return 0 +} + +# Initialize DSCP priorities. Set them to predictable values for testing. +init_dscp_prios() { + local port=$1 + local original_prios=$2 + + echo "Removing any existing DSCP priority mappins for $port" + local prio_str=$(echo "$original_prios" | + jq -r 'map("\(.[0]):\(.[1])") | join(" ")') + dcb app del dev $port dscp-prio $prio_str + + # Initialize DSCP priorities list + local dscp_prios="" + for dscp in {0..63}; do + dscp_prios+=("$dscp:0") + done + + echo "Setting initial DSCP priorities map to 0 for $port" + dcb app add dev $port dscp-prio ${dscp_prios[@]} +} + +# Main function to test global DSCP map across specified ports +test_global_dscp_map() { + local ports=("$swp1" "$swp2") + local original_dscp_prios_port0=$(get_dscp_prios ${ports[0]}) + local orig_apptrust=$(port_get_default_apptrust ${swp1}) + local orig_port_prio=$(port_default_prio_get ${swp1}) + local apptrust_order="dscp" + local port_prio=0 + local dscp_prio + local dscp + + RET=0 + + set_apptrust_order ${swp1} "${apptrust_order}" + dcb app replace dev ${swp1} default-prio ${port_prio} + + # Initialize DSCP priorities + init_dscp_prios "${ports[0]}" "$original_dscp_prios_port0" + + # Loop over each DSCP index + for dscp in {0..63}; do + # and 
test each Internal Priority value + for dscp_prio in {0..7}; do + # do it for each port. This is to test if the global DSCP map + # is accessible from all ports. + for port in "${ports[@]}"; do + if ! set_and_verify_dscp "$port" $dscp $dscp_prio; then + RET=1 + fi + done + + # Test if the DSCP priority is correctly applied to the packets + run_test_dscp "DSCP (${dscp}) QoS classification, prio: ${dscp_prio}" \ + "${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} + if [ ${RET} -eq 1 ]; then + break + fi + done + done + + # Restore original priorities + if ! restore_priorities "${ports[0]}" "${original_dscp_prios_port0}"; then + RET=1 + fi + + set_apptrust_order ${swp1} "${orig_apptrust}" + if [[ "$orig_apptrust" != "$(port_get_default_apptrust ${swp1})" ]]; then + RET=1 + fi + + dcb app replace dev ${swp1} default-prio ${orig_port_prio} + if [ $orig_port_prio -ne $(port_default_prio_get ${swp1}) ]; then + RET=1 + fi + + log_test "DSCP global map" +} + +trap cleanup EXIT + +ALL_TESTS=" + test_port_default + test_port_apptrust + test_global_dscp_map +" + +setup_prepare +setup_wait +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh b/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh index 91891b9418..877cd6df94 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh @@ -24,8 +24,8 @@ setup_prepare() busywait "$TIMEOUT" wait_for_port_up ethtool $swp2 check_err $? "ports did not come up" - local lanes_exist=$(ethtool $swp1 | grep 'Lanes:') - if [[ -z $lanes_exist ]]; then + busywait $TIMEOUT sh -c "ethtool $swp1 | grep -q Lanes:" + if [[ $? -ne 0 ]]; then log_test "SKIP: driver does not support lanes setting" exit 1 fi @@ -122,8 +122,9 @@ autoneg() ethtool_set $swp1 speed $max_speed lanes $lanes ip link set dev $swp1 up ip link set dev $swp2 up - busywait "$TIMEOUT" wait_for_port_up ethtool $swp2 - check_err $? 
"ports did not come up" + + busywait $TIMEOUT sh -c "ethtool $swp1 | grep -q Lanes:" + check_err $? "Lanes parameter is not presented on time" check_lanes $swp1 $lanes $max_speed log_test "$lanes lanes is autonegotiated" @@ -160,8 +161,9 @@ autoneg_force_mode() ethtool_set $swp2 speed $max_speed lanes $lanes autoneg off ip link set dev $swp1 up ip link set dev $swp2 up - busywait "$TIMEOUT" wait_for_port_up ethtool $swp2 - check_err $? "ports did not come up" + + busywait $TIMEOUT sh -c "ethtool $swp1 | grep -q Lanes:" + check_err $? "Lanes parameter is not presented on time" check_lanes $swp1 $lanes $max_speed log_test "Autoneg off, $lanes lanes detected during force mode" diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower.sh index 31252bc877..4994bea5da 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower.sh @@ -11,7 +11,7 @@ ALL_TESTS="single_mask_test identical_filters_test two_masks_test \ multiple_masks_test ctcam_edge_cases_test delta_simple_test \ delta_two_masks_one_key_test delta_simple_rehash_test \ bloom_simple_test bloom_complex_test bloom_delta_test \ - max_erp_entries_test max_group_size_test" + max_erp_entries_test max_group_size_test collision_test" NUM_NETIFS=2 source $lib_dir/lib.sh source $lib_dir/tc_common.sh @@ -457,7 +457,7 @@ delta_two_masks_one_key_test() { # If 2 keys are the same and only differ in mask in a way that # they belong under the same ERP (second is delta of the first), - # there should be no C-TCAM spill. + # there should be C-TCAM spill. RET=0 @@ -474,8 +474,8 @@ delta_two_masks_one_key_test() tp_record "mlxsw:*" "tc filter add dev $h2 ingress protocol ip \ pref 2 handle 102 flower $tcflags dst_ip 192.0.2.2 \ action drop" - tp_check_hits "mlxsw:mlxsw_sp_acl_atcam_entry_add_ctcam_spill" 0 - check_err $? 
"incorrect C-TCAM spill while inserting the second rule" + tp_check_hits "mlxsw:mlxsw_sp_acl_atcam_entry_add_ctcam_spill" 1 + check_err $? "C-TCAM spill did not happen while inserting the second rule" $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \ -t ip -q @@ -1087,6 +1087,53 @@ max_group_size_test() log_test "max ACL group size test ($tcflags). max size $max_size" } +collision_test() +{ + # Filters cannot share an eRP if in the common unmasked part (i.e., + # without the delta bits) they have the same values. If the driver does + # not prevent such configuration (by spilling into the C-TCAM), then + # multiple entries will be present in the device with the same key, + # leading to collisions and a reduced scale. + # + # Create such a scenario and make sure all the filters are successfully + # added. + + RET=0 + + local ret + + if [[ "$tcflags" != "skip_sw" ]]; then + return 0; + fi + + # Add a single dst_ip/24 filter and multiple dst_ip/32 filters that all + # have the same values in the common unmasked part (dst_ip/24). + + tc filter add dev $h2 ingress pref 1 proto ipv4 handle 101 \ + flower $tcflags dst_ip 198.51.100.0/24 \ + action drop + + for i in {0..255}; do + tc filter add dev $h2 ingress pref 2 proto ipv4 \ + handle $((102 + i)) \ + flower $tcflags dst_ip 198.51.100.${i}/32 \ + action drop + ret=$? 
+ [[ $ret -ne 0 ]] && break + done + + check_err $ret "failed to add all the filters" + + for i in {255..0}; do + tc filter del dev $h2 ingress pref 2 proto ipv4 \ + handle $((102 + i)) flower + done + + tc filter del dev $h2 ingress pref 1 proto ipv4 handle 101 flower + + log_test "collision test ($tcflags)" +} + setup_prepare() { h1=${NETIFS[p1]} diff --git a/tools/testing/selftests/drivers/net/ping.py b/tools/testing/selftests/drivers/net/ping.py new file mode 100755 index 0000000000..eb83e7b487 --- /dev/null +++ b/tools/testing/selftests/drivers/net/ping.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +from lib.py import ksft_run, ksft_exit +from lib.py import ksft_eq +from lib.py import NetDrvEpEnv +from lib.py import bkg, cmd, wait_port_listen, rand_port + + +def test_v4(cfg) -> None: + cfg.require_v4() + + cmd(f"ping -c 1 -W0.5 {cfg.remote_v4}") + cmd(f"ping -c 1 -W0.5 {cfg.v4}", host=cfg.remote) + + +def test_v6(cfg) -> None: + cfg.require_v6() + + cmd(f"ping -c 1 -W0.5 {cfg.remote_v6}") + cmd(f"ping -c 1 -W0.5 {cfg.v6}", host=cfg.remote) + + +def test_tcp(cfg) -> None: + cfg.require_cmd("socat", remote=True) + + port = rand_port() + listen_cmd = f"socat -{cfg.addr_ipver} -t 2 -u TCP-LISTEN:{port},reuseport STDOUT" + + with bkg(listen_cmd, exit_wait=True) as nc: + wait_port_listen(port) + + cmd(f"echo ping | socat -t 2 -u STDIN TCP:{cfg.baddr}:{port}", + shell=True, host=cfg.remote) + ksft_eq(nc.stdout.strip(), "ping") + + with bkg(listen_cmd, host=cfg.remote, exit_wait=True) as nc: + wait_port_listen(port, host=cfg.remote) + + cmd(f"echo ping | socat -t 2 -u STDIN TCP:{cfg.remote_baddr}:{port}", shell=True) + ksft_eq(nc.stdout.strip(), "ping") + + +def main() -> None: + with NetDrvEpEnv(__file__) as cfg: + ksft_run(globs=globals(), case_pfx={"test_"}, args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/queues.py 
b/tools/testing/selftests/drivers/net/queues.py new file mode 100755 index 0000000000..30f29096e2 --- /dev/null +++ b/tools/testing/selftests/drivers/net/queues.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +from lib.py import ksft_run, ksft_exit, ksft_eq, KsftSkipEx +from lib.py import EthtoolFamily, NetdevFamily +from lib.py import NetDrvEnv +from lib.py import cmd +import glob + + +def sys_get_queues(ifname) -> int: + folders = glob.glob(f'/sys/class/net/{ifname}/queues/rx-*') + return len(folders) + + +def nl_get_queues(cfg, nl): + queues = nl.queue_get({'ifindex': cfg.ifindex}, dump=True) + if queues: + return len([q for q in queues if q['type'] == 'rx']) + return None + + +def get_queues(cfg, nl) -> None: + queues = nl_get_queues(cfg, nl) + if not queues: + raise KsftSkipEx('queue-get not supported by device') + + expected = sys_get_queues(cfg.dev['ifname']) + ksft_eq(queues, expected) + + +def addremove_queues(cfg, nl) -> None: + queues = nl_get_queues(cfg, nl) + if not queues: + raise KsftSkipEx('queue-get not supported by device') + + curr_queues = sys_get_queues(cfg.dev['ifname']) + if curr_queues == 1: + raise KsftSkipEx('cannot decrement queue: already at 1') + + netnl = EthtoolFamily() + channels = netnl.channels_get({'header': {'dev-index': cfg.ifindex}}) + if channels['combined-count'] == 0: + rx_type = 'rx' + else: + rx_type = 'combined' + + expected = curr_queues - 1 + cmd(f"ethtool -L {cfg.dev['ifname']} {rx_type} {expected}", timeout=10) + queues = nl_get_queues(cfg, nl) + ksft_eq(queues, expected) + + expected = curr_queues + cmd(f"ethtool -L {cfg.dev['ifname']} {rx_type} {expected}", timeout=10) + queues = nl_get_queues(cfg, nl) + ksft_eq(queues, expected) + + +def main() -> None: + with NetDrvEnv(__file__, queue_count=3) as cfg: + ksft_run([get_queues, addremove_queues], args=(cfg, NetdevFamily())) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git 
a/tools/testing/selftests/drivers/net/stats.py b/tools/testing/selftests/drivers/net/stats.py new file mode 100755 index 0000000000..820b8e0a22 --- /dev/null +++ b/tools/testing/selftests/drivers/net/stats.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +from lib.py import ksft_run, ksft_exit, ksft_pr +from lib.py import ksft_ge, ksft_eq, ksft_in, ksft_true, ksft_raises, KsftSkipEx, KsftXfailEx +from lib.py import EthtoolFamily, NetdevFamily, RtnlFamily, NlError +from lib.py import NetDrvEnv + +ethnl = EthtoolFamily() +netfam = NetdevFamily() +rtnl = RtnlFamily() + + +def check_pause(cfg) -> None: + global ethnl + + try: + ethnl.pause_get({"header": {"dev-index": cfg.ifindex}}) + except NlError as e: + if e.error == 95: + raise KsftXfailEx("pause not supported by the device") + raise + + data = ethnl.pause_get({"header": {"dev-index": cfg.ifindex, + "flags": {'stats'}}}) + ksft_true(data['stats'], "driver does not report stats") + + +def check_fec(cfg) -> None: + global ethnl + + try: + ethnl.fec_get({"header": {"dev-index": cfg.ifindex}}) + except NlError as e: + if e.error == 95: + raise KsftXfailEx("FEC not supported by the device") + raise + + data = ethnl.fec_get({"header": {"dev-index": cfg.ifindex, + "flags": {'stats'}}}) + ksft_true(data['stats'], "driver does not report stats") + + +def pkt_byte_sum(cfg) -> None: + global netfam, rtnl + + def get_qstat(test): + global netfam + stats = netfam.qstats_get({}, dump=True) + if stats: + for qs in stats: + if qs["ifindex"]== test.ifindex: + return qs + + qstat = get_qstat(cfg) + if qstat is None: + raise KsftSkipEx("qstats not supported by the device") + + for key in ['tx-packets', 'tx-bytes', 'rx-packets', 'rx-bytes']: + ksft_in(key, qstat, "Drivers should always report basic keys") + + # Compare stats, rtnl stats and qstats must match, + # but the interface may be up, so do a series of dumps + # each time the more "recent" stats must be higher or same. 
+ def stat_cmp(rstat, qstat): + for key in ['tx-packets', 'tx-bytes', 'rx-packets', 'rx-bytes']: + if rstat[key] != qstat[key]: + return rstat[key] - qstat[key] + return 0 + + for _ in range(10): + rtstat = rtnl.getlink({"ifi-index": cfg.ifindex})['stats64'] + if stat_cmp(rtstat, qstat) < 0: + raise Exception("RTNL stats are lower, fetched later") + qstat = get_qstat(cfg) + if stat_cmp(rtstat, qstat) > 0: + raise Exception("Qstats are lower, fetched later") + + +def qstat_by_ifindex(cfg) -> None: + global netfam + global rtnl + + # Construct a map ifindex -> [dump, by-index, dump] + ifindexes = {} + stats = netfam.qstats_get({}, dump=True) + for entry in stats: + ifindexes[entry['ifindex']] = [entry, None, None] + + for ifindex in ifindexes.keys(): + entry = netfam.qstats_get({"ifindex": ifindex}, dump=True) + ksft_eq(len(entry), 1) + ifindexes[entry[0]['ifindex']][1] = entry[0] + + stats = netfam.qstats_get({}, dump=True) + for entry in stats: + ifindexes[entry['ifindex']][2] = entry + + if len(ifindexes) == 0: + raise KsftSkipEx("No ifindex supports qstats") + + # Now make sure the stats match/make sense + for ifindex, triple in ifindexes.items(): + all_keys = triple[0].keys() | triple[1].keys() | triple[2].keys() + + for key in all_keys: + ksft_ge(triple[1][key], triple[0][key], comment="bad key: " + key) + ksft_ge(triple[2][key], triple[1][key], comment="bad key: " + key) + + # Test invalid dumps + # 0 is invalid + with ksft_raises(NlError) as cm: + netfam.qstats_get({"ifindex": 0}, dump=True) + ksft_eq(cm.exception.nl_msg.error, -34) + ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex') + + # loopback has no stats + with ksft_raises(NlError) as cm: + netfam.qstats_get({"ifindex": 1}, dump=True) + ksft_eq(cm.exception.nl_msg.error, -95) + ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex') + + # Try to get stats for lowest unused ifindex but not 0 + devs = rtnl.getlink({}, dump=True) + all_ifindexes = set([dev["ifi-index"] for dev in devs]) + 
lowest = 2 + while lowest in all_ifindexes: + lowest += 1 + + with ksft_raises(NlError) as cm: + netfam.qstats_get({"ifindex": lowest}, dump=True) + ksft_eq(cm.exception.nl_msg.error, -19) + ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex') + + +def main() -> None: + with NetDrvEnv(__file__) as cfg: + ksft_run([check_pause, check_fec, pkt_byte_sum, qstat_by_ifindex], + args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/virtio_net/Makefile b/tools/testing/selftests/drivers/net/virtio_net/Makefile new file mode 100644 index 0000000000..7ec7cd3ab2 --- /dev/null +++ b/tools/testing/selftests/drivers/net/virtio_net/Makefile @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0+ OR MIT + +TEST_PROGS = basic_features.sh \ + # + +TEST_FILES = \ + virtio_net_common.sh \ + # + +TEST_INCLUDES = \ + ../../../net/forwarding/lib.sh \ + ../../../net/lib.sh \ + # + +include ../../../lib.mk diff --git a/tools/testing/selftests/drivers/net/virtio_net/basic_features.sh b/tools/testing/selftests/drivers/net/virtio_net/basic_features.sh new file mode 100755 index 0000000000..cf8cf816ed --- /dev/null +++ b/tools/testing/selftests/drivers/net/virtio_net/basic_features.sh @@ -0,0 +1,131 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# See virtio_net_common.sh comments for more details about assumed setup + +ALL_TESTS=" + initial_ping_test + f_mac_test +" + +source virtio_net_common.sh + +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/lib.sh + +h1=${NETIFS[p1]} +h2=${NETIFS[p2]} + +h1_create() +{ + simple_if_init $h1 $H1_IPV4/24 $H1_IPV6/64 +} + +h1_destroy() +{ + simple_if_fini $h1 $H1_IPV4/24 $H1_IPV6/64 +} + +h2_create() +{ + simple_if_init $h2 $H2_IPV4/24 $H2_IPV6/64 +} + +h2_destroy() +{ + simple_if_fini $h2 $H2_IPV4/24 $H2_IPV6/64 +} + +initial_ping_test() +{ + setup_cleanup + setup_prepare + ping_test $h1 $H2_IPV4 " simple" +} + +f_mac_test() +{ + RET=0 + local test_name="mac 
feature filtered" + + virtio_feature_present $h1 $VIRTIO_NET_F_MAC + if [ $? -ne 0 ]; then + log_test_skip "$test_name" "Device $h1 is missing feature $VIRTIO_NET_F_MAC." + return 0 + fi + virtio_feature_present $h1 $VIRTIO_NET_F_MAC + if [ $? -ne 0 ]; then + log_test_skip "$test_name" "Device $h2 is missing feature $VIRTIO_NET_F_MAC." + return 0 + fi + + setup_cleanup + setup_prepare + + grep -q 0 /sys/class/net/$h1/addr_assign_type + check_err $? "Permanent address assign type for $h1 is not set" + grep -q 0 /sys/class/net/$h2/addr_assign_type + check_err $? "Permanent address assign type for $h2 is not set" + + setup_cleanup + virtio_filter_feature_add $h1 $VIRTIO_NET_F_MAC + virtio_filter_feature_add $h2 $VIRTIO_NET_F_MAC + setup_prepare + + grep -q 0 /sys/class/net/$h1/addr_assign_type + check_fail $? "Permanent address assign type for $h1 is set when F_MAC feature is filtered" + grep -q 0 /sys/class/net/$h2/addr_assign_type + check_fail $? "Permanent address assign type for $h2 is set when F_MAC feature is filtered" + + ping_do $h1 $H2_IPV4 + check_err $? 
"Ping failed" + + log_test "$test_name" +} + +setup_prepare() +{ + virtio_device_rebind $h1 + virtio_device_rebind $h2 + wait_for_dev $h1 + wait_for_dev $h2 + + vrf_prepare + + h1_create + h2_create +} + +setup_cleanup() +{ + h2_destroy + h1_destroy + + vrf_cleanup + + virtio_filter_features_clear $h1 + virtio_filter_features_clear $h2 + virtio_device_rebind $h1 + virtio_device_rebind $h2 + wait_for_dev $h1 + wait_for_dev $h2 +} + +cleanup() +{ + pre_cleanup + setup_cleanup +} + +check_driver $h1 "virtio_net" +check_driver $h2 "virtio_net" +check_virtio_debugfs $h1 +check_virtio_debugfs $h2 + +trap cleanup EXIT + +setup_prepare + +tests_run + +exit "$EXIT_STATUS" diff --git a/tools/testing/selftests/drivers/net/virtio_net/config b/tools/testing/selftests/drivers/net/virtio_net/config new file mode 100644 index 0000000000..bcf7555eaf --- /dev/null +++ b/tools/testing/selftests/drivers/net/virtio_net/config @@ -0,0 +1,8 @@ +CONFIG_BPF_SYSCALL=y +CONFIG_CGROUP_BPF=y +CONFIG_IPV6=y +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_NET_L3_MASTER_DEV=y +CONFIG_NET_VRF=m +CONFIG_VIRTIO_DEBUG=y +CONFIG_VIRTIO_NET=y diff --git a/tools/testing/selftests/drivers/net/virtio_net/virtio_net_common.sh b/tools/testing/selftests/drivers/net/virtio_net/virtio_net_common.sh new file mode 100644 index 0000000000..57bd8055e2 --- /dev/null +++ b/tools/testing/selftests/drivers/net/virtio_net/virtio_net_common.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This assumes running on a host with two virtio interfaces connected +# back to back. 
Example script to do such wire-up of tap devices would +# look like this: +# +# ======================================================================================================= +# #!/bin/bash +# +# DEV1="$1" +# DEV2="$2" +# +# sudo tc qdisc add dev $DEV1 clsact +# sudo tc qdisc add dev $DEV2 clsact +# sudo tc filter add dev $DEV1 ingress protocol all pref 1 matchall action mirred egress redirect dev $DEV2 +# sudo tc filter add dev $DEV2 ingress protocol all pref 1 matchall action mirred egress redirect dev $DEV1 +# sudo ip link set $DEV1 up +# sudo ip link set $DEV2 up +# ======================================================================================================= + +REQUIRE_MZ="no" +NETIF_CREATE="no" +NETIF_FIND_DRIVER="virtio_net" +NUM_NETIFS=2 + +H1_IPV4="192.0.2.1" +H2_IPV4="192.0.2.2" +H1_IPV6="2001:db8:1::1" +H2_IPV6="2001:db8:1::2" + +VIRTIO_NET_F_MAC=5 + +virtio_device_get() +{ + local dev=$1; shift + local device_path="/sys/class/net/$dev/device/" + + basename `realpath $device_path` +} + +virtio_device_rebind() +{ + local dev=$1; shift + local device=`virtio_device_get $dev` + + echo "$device" > /sys/bus/virtio/drivers/virtio_net/unbind + echo "$device" > /sys/bus/virtio/drivers/virtio_net/bind +} + +virtio_debugfs_get() +{ + local dev=$1; shift + local device=`virtio_device_get $dev` + + echo /sys/kernel/debug/virtio/$device/ +} + +check_virtio_debugfs() +{ + local dev=$1; shift + local debugfs=`virtio_debugfs_get $dev` + + if [ ! -f "$debugfs/device_features" ] || + [ ! -f "$debugfs/filter_feature_add" ] || + [ ! -f "$debugfs/filter_feature_del" ] || + [ ! -f "$debugfs/filter_features" ] || + [ ! -f "$debugfs/filter_features_clear" ]; then + echo "SKIP: not possible to access debugfs for $dev" + exit $ksft_skip + fi +} + +virtio_feature_present() +{ + local dev=$1; shift + local feature=$1; shift + local debugfs=`virtio_debugfs_get $dev` + + cat $debugfs/device_features |grep "^$feature$" &> /dev/null + return $? 
+} + +virtio_filter_features_clear() +{ + local dev=$1; shift + local debugfs=`virtio_debugfs_get $dev` + + echo "1" > $debugfs/filter_features_clear +} + +virtio_filter_feature_add() +{ + local dev=$1; shift + local feature=$1; shift + local debugfs=`virtio_debugfs_get $dev` + + echo "$feature" > $debugfs/filter_feature_add +} diff --git a/tools/testing/selftests/exec/recursion-depth.c b/tools/testing/selftests/exec/recursion-depth.c index b2f37d86a5..438c8ff2fd 100644 --- a/tools/testing/selftests/exec/recursion-depth.c +++ b/tools/testing/selftests/exec/recursion-depth.c @@ -37,25 +37,25 @@ int main(void) ksft_test_result_skip("error: unshare, errno %d\n", errno); ksft_finished(); } - ksft_exit_fail_msg("error: unshare, errno %d\n", errno); + ksft_exit_fail_perror("error: unshare"); } if (mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL) == -1) - ksft_exit_fail_msg("error: mount '/', errno %d\n", errno); + ksft_exit_fail_perror("error: mount '/'"); /* Require "exec" filesystem. */ if (mount(NULL, "/tmp", "ramfs", 0, NULL) == -1) - ksft_exit_fail_msg("error: mount ramfs, errno %d\n", errno); + ksft_exit_fail_perror("error: mount ramfs"); #define FILENAME "/tmp/1" fd = creat(FILENAME, 0700); if (fd == -1) - ksft_exit_fail_msg("error: creat, errno %d\n", errno); + ksft_exit_fail_perror("error: creat"); #define S "#!" 
FILENAME "\n" if (write(fd, S, strlen(S)) != strlen(S)) - ksft_exit_fail_msg("error: write, errno %d\n", errno); + ksft_exit_fail_perror("error: write"); close(fd); diff --git a/tools/testing/selftests/fchmodat2/Makefile b/tools/testing/selftests/fchmodat2/Makefile index 71ec34bf15..4373cea79b 100644 --- a/tools/testing/selftests/fchmodat2/Makefile +++ b/tools/testing/selftests/fchmodat2/Makefile @@ -1,6 +1,15 @@ # SPDX-License-Identifier: GPL-2.0-or-later -CFLAGS += -Wall -O2 -g -fsanitize=address -fsanitize=undefined -static-libasan $(KHDR_INCLUDES) +CFLAGS += -Wall -O2 -g -fsanitize=address -fsanitize=undefined $(KHDR_INCLUDES) + +# gcc requires -static-libasan in order to ensure that Address Sanitizer's +# library is the first one loaded. However, clang already statically links the +# Address Sanitizer if -fsanitize is specified. Therefore, simply omit +# -static-libasan for clang builds. +ifeq ($(LLVM),) + CFLAGS += -static-libasan +endif + TEST_GEN_PROGS := fchmodat2_test include ../lib.mk diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test.c b/tools/testing/selftests/filesystems/statmount/statmount_test.c index 3eafd7da58..e8c019d72c 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount_test.c +++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c @@ -3,6 +3,7 @@ #define _GNU_SOURCE #include +#include #include #include #include @@ -124,8 +125,16 @@ static uint32_t old_root_id, old_parent_id; static void cleanup_namespace(void) { - fchdir(orig_root); - chroot("."); + int ret; + + ret = fchdir(orig_root); + if (ret == -1) + ksft_perror("fchdir to original root"); + + ret = chroot("."); + if (ret == -1) + ksft_perror("chroot to original root"); + umount2(root_mntpoint, MNT_DETACH); rmdir(root_mntpoint); } diff --git a/tools/testing/selftests/ftrace/config b/tools/testing/selftests/ftrace/config index e59d985eef..048a312abf 100644 --- a/tools/testing/selftests/ftrace/config +++ 
b/tools/testing/selftests/ftrace/config @@ -1,16 +1,28 @@ -CONFIG_KPROBES=y +CONFIG_BPF_SYSCALL=y +CONFIG_DEBUG_INFO_BTF=y +CONFIG_DEBUG_INFO_DWARF4=y +CONFIG_EPROBE_EVENTS=y +CONFIG_FPROBE=y +CONFIG_FPROBE_EVENTS=y CONFIG_FTRACE=y +CONFIG_FTRACE_SYSCALLS=y +CONFIG_FUNCTION_GRAPH_RETVAL=y CONFIG_FUNCTION_PROFILER=y -CONFIG_TRACER_SNAPSHOT=y -CONFIG_STACK_TRACER=y CONFIG_HIST_TRIGGERS=y -CONFIG_SCHED_TRACER=y -CONFIG_PREEMPT_TRACER=y CONFIG_IRQSOFF_TRACER=y -CONFIG_PREEMPTIRQ_DELAY_TEST=m +CONFIG_KALLSYMS_ALL=y +CONFIG_KPROBES=y +CONFIG_KPROBE_EVENTS=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y +CONFIG_PREEMPTIRQ_DELAY_TEST=m +CONFIG_PREEMPT_TRACER=y +CONFIG_PROBE_EVENTS_BTF_ARGS=y CONFIG_SAMPLES=y CONFIG_SAMPLE_FTRACE_DIRECT=m CONFIG_SAMPLE_TRACE_PRINTK=m -CONFIG_KALLSYMS_ALL=y +CONFIG_SCHED_TRACER=y +CONFIG_STACK_TRACER=y +CONFIG_TRACER_SNAPSHOT=y +CONFIG_UPROBES=y +CONFIG_UPROBE_EVENTS=y diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest index 25d4e0fca3..cce72f8b03 100755 --- a/tools/testing/selftests/ftrace/ftracetest +++ b/tools/testing/selftests/ftrace/ftracetest @@ -255,7 +255,13 @@ prlog() { # messages [ "$LOG_FILE" ] && printf "$*$newline" | strip_esc >> $LOG_FILE } catlog() { #file - cat $1 + if [ "${KTAP}" = "1" ]; then + cat $1 | while read line ; do + echo "# $line" + done + else + cat $1 + fi [ "$LOG_FILE" ] && cat $1 | strip_esc >> $LOG_FILE } prlog "=== Ftrace unit tests ===" diff --git a/tools/testing/selftests/ftrace/ftracetest-ktap b/tools/testing/selftests/ftrace/ftracetest-ktap index b3284679ef..14e62ef3f3 100755 --- a/tools/testing/selftests/ftrace/ftracetest-ktap +++ b/tools/testing/selftests/ftrace/ftracetest-ktap @@ -5,4 +5,4 @@ # # Copyright (C) Arm Ltd., 2023 -./ftracetest -K +./ftracetest -K -v diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc new file mode 100644 index 0000000000..c6a9d2466a --- 
/dev/null +++ b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc @@ -0,0 +1,41 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Fprobe event VFS type argument +# requires: dynamic_events "%pd/%pD":README "f[:[/][]] [%return] []":README + + +: "Test argument %pd with name for fprobe" +echo 'f:testprobe dput name=$arg1:%pd' > dynamic_events +echo 1 > events/fprobes/testprobe/enable +grep -q "1" events/fprobes/testprobe/enable +echo 0 > events/fprobes/testprobe/enable +grep "dput" trace | grep -q "enable" +echo "" > dynamic_events +echo "" > trace + +: "Test argument %pd without name for fprobe" +echo 'f:testprobe dput $arg1:%pd' > dynamic_events +echo 1 > events/fprobes/testprobe/enable +grep -q "1" events/fprobes/testprobe/enable +echo 0 > events/fprobes/testprobe/enable +grep "dput" trace | grep -q "enable" +echo "" > dynamic_events +echo "" > trace + +: "Test argument %pD with name for fprobe" +echo 'f:testprobe vfs_read name=$arg1:%pD' > dynamic_events +echo 1 > events/fprobes/testprobe/enable +grep -q "1" events/fprobes/testprobe/enable +echo 0 > events/fprobes/testprobe/enable +grep "vfs_read" trace | grep -q "enable" +echo "" > dynamic_events +echo "" > trace + +: "Test argument %pD without name for fprobe" +echo 'f:testprobe vfs_read $arg1:%pD' > dynamic_events +echo 1 > events/fprobes/testprobe/enable +grep -q "1" events/fprobes/testprobe/enable +echo 0 > events/fprobes/testprobe/enable +grep "vfs_read" trace | grep -q "enable" +echo "" > dynamic_events +echo "" > trace diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc index 25432b8cd5..073a748b93 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc @@ -19,7 +19,7 @@ fail() { # mesg FILTER=set_ftrace_filter FUNC1="schedule" -FUNC2="scheduler_tick" +FUNC2="sched_tick" 
ALL_FUNCS="#### all functions enabled ####" diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc new file mode 100644 index 0000000000..21a54be689 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc @@ -0,0 +1,40 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Kprobe event VFS type argument +# requires: kprobe_events "%pd/%pD":README + +: "Test argument %pd with name" +echo 'p:testprobe dput name=$arg1:%pd' > kprobe_events +echo 1 > events/kprobes/testprobe/enable +grep -q "1" events/kprobes/testprobe/enable +echo 0 > events/kprobes/testprobe/enable +grep "dput" trace | grep -q "enable" +echo "" > kprobe_events +echo "" > trace + +: "Test argument %pd without name" +echo 'p:testprobe dput $arg1:%pd' > kprobe_events +echo 1 > events/kprobes/testprobe/enable +grep -q "1" events/kprobes/testprobe/enable +echo 0 > events/kprobes/testprobe/enable +grep "dput" trace | grep -q "enable" +echo "" > kprobe_events +echo "" > trace + +: "Test argument %pD with name" +echo 'p:testprobe vfs_read name=$arg1:%pD' > kprobe_events +echo 1 > events/kprobes/testprobe/enable +grep -q "1" events/kprobes/testprobe/enable +echo 0 > events/kprobes/testprobe/enable +grep "vfs_read" trace | grep -q "enable" +echo "" > kprobe_events +echo "" > trace + +: "Test argument %pD without name" +echo 'p:testprobe vfs_read $arg1:%pD' > kprobe_events +echo 1 > events/kprobes/testprobe/enable +grep -q "1" events/kprobes/testprobe/enable +echo 0 > events/kprobes/testprobe/enable +grep "vfs_read" trace | grep -q "enable" +echo "" > kprobe_events +echo "" > trace diff --git a/tools/testing/selftests/futex/Makefile b/tools/testing/selftests/futex/Makefile index 11e157d753..78ab2cd111 100644 --- a/tools/testing/selftests/futex/Makefile +++ b/tools/testing/selftests/futex/Makefile @@ -3,8 +3,6 @@ SUBDIRS := functional TEST_PROGS := run.sh -.PHONY: all clean - include 
../lib.mk all: diff --git a/tools/testing/selftests/hid/config.common b/tools/testing/selftests/hid/config.common index 0f456dbab6..45b5570441 100644 --- a/tools/testing/selftests/hid/config.common +++ b/tools/testing/selftests/hid/config.common @@ -238,3 +238,4 @@ CONFIG_VLAN_8021Q=y CONFIG_XFRM_SUB_POLICY=y CONFIG_XFRM_USER=y CONFIG_ZEROPLUS_FF=y +CONFIG_KASAN=y diff --git a/tools/testing/selftests/hid/hid_bpf.c b/tools/testing/selftests/hid/hid_bpf.c index 2cf96f818f..f825623e3e 100644 --- a/tools/testing/selftests/hid/hid_bpf.c +++ b/tools/testing/selftests/hid/hid_bpf.c @@ -16,6 +16,11 @@ #define SHOW_UHID_DEBUG 0 +#define min(a, b) \ + ({ __typeof__(a) _a = (a); \ + __typeof__(b) _b = (b); \ + _a < _b ? _a : _b; }) + static unsigned char rdesc[] = { 0x06, 0x00, 0xff, /* Usage Page (Vendor Defined Page 1) */ 0x09, 0x21, /* Usage (Vendor Usage 0x21) */ @@ -111,6 +116,10 @@ struct hid_hw_request_syscall_args { static pthread_mutex_t uhid_started_mtx = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t uhid_started = PTHREAD_COND_INITIALIZER; +static pthread_mutex_t uhid_output_mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t uhid_output_cond = PTHREAD_COND_INITIALIZER; +static unsigned char output_report[10]; + /* no need to protect uhid_stopped, only one thread accesses it */ static bool uhid_stopped; @@ -205,6 +214,13 @@ static int uhid_event(struct __test_metadata *_metadata, int fd) break; case UHID_OUTPUT: UHID_LOG("UHID_OUTPUT from uhid-dev"); + + pthread_mutex_lock(&uhid_output_mtx); + memcpy(output_report, + ev.u.output.data, + min(ev.u.output.size, sizeof(output_report))); + pthread_cond_signal(&uhid_output_cond); + pthread_mutex_unlock(&uhid_output_mtx); break; case UHID_GET_REPORT: UHID_LOG("UHID_GET_REPORT from uhid-dev"); @@ -734,8 +750,100 @@ TEST_F(hid_bpf, test_hid_change_report) } /* - * Attach hid_user_raw_request to the given uhid device, - * call the bpf program from userspace + * Call hid_bpf_input_report against the given uhid device, 
+ * check that the program is called and does the expected. + */ +TEST_F(hid_bpf, test_hid_user_input_report_call) +{ + struct hid_hw_request_syscall_args args = { + .retval = -1, + .size = 10, + }; + DECLARE_LIBBPF_OPTS(bpf_test_run_opts, tattrs, + .ctx_in = &args, + .ctx_size_in = sizeof(args), + ); + __u8 buf[10] = {0}; + int err, prog_fd; + + LOAD_BPF; + + args.hid = self->hid_id; + args.data[0] = 1; /* report ID */ + args.data[1] = 2; /* report ID */ + args.data[2] = 42; /* report ID */ + + prog_fd = bpf_program__fd(self->skel->progs.hid_user_input_report); + + /* check that there is no data to read from hidraw */ + memset(buf, 0, sizeof(buf)); + err = read(self->hidraw_fd, buf, sizeof(buf)); + ASSERT_EQ(err, -1) TH_LOG("read_hidraw"); + + err = bpf_prog_test_run_opts(prog_fd, &tattrs); + + ASSERT_OK(err) TH_LOG("error while calling bpf_prog_test_run_opts"); + + ASSERT_EQ(args.retval, 0); + + /* read the data from hidraw */ + memset(buf, 0, sizeof(buf)); + err = read(self->hidraw_fd, buf, sizeof(buf)); + ASSERT_EQ(err, 6) TH_LOG("read_hidraw"); + ASSERT_EQ(buf[0], 1); + ASSERT_EQ(buf[1], 2); + ASSERT_EQ(buf[2], 42); +} + +/* + * Call hid_bpf_hw_output_report against the given uhid device, + * check that the program is called and does the expected. 
+ */ +TEST_F(hid_bpf, test_hid_user_output_report_call) +{ + struct hid_hw_request_syscall_args args = { + .retval = -1, + .size = 10, + }; + DECLARE_LIBBPF_OPTS(bpf_test_run_opts, tattrs, + .ctx_in = &args, + .ctx_size_in = sizeof(args), + ); + int err, cond_err, prog_fd; + struct timespec time_to_wait; + + LOAD_BPF; + + args.hid = self->hid_id; + args.data[0] = 1; /* report ID */ + args.data[1] = 2; /* report ID */ + args.data[2] = 42; /* report ID */ + + prog_fd = bpf_program__fd(self->skel->progs.hid_user_output_report); + + pthread_mutex_lock(&uhid_output_mtx); + + memset(output_report, 0, sizeof(output_report)); + clock_gettime(CLOCK_REALTIME, &time_to_wait); + time_to_wait.tv_sec += 2; + + err = bpf_prog_test_run_opts(prog_fd, &tattrs); + cond_err = pthread_cond_timedwait(&uhid_output_cond, &uhid_output_mtx, &time_to_wait); + + ASSERT_OK(err) TH_LOG("error while calling bpf_prog_test_run_opts"); + ASSERT_OK(cond_err) TH_LOG("error while calling waiting for the condition"); + + ASSERT_EQ(args.retval, 3); + + ASSERT_EQ(output_report[0], 1); + ASSERT_EQ(output_report[1], 2); + ASSERT_EQ(output_report[2], 42); + + pthread_mutex_unlock(&uhid_output_mtx); +} + +/* + * Call hid_hw_raw_request against the given uhid device, * check that the program is called and does the expected. 
*/ TEST_F(hid_bpf, test_hid_user_raw_request_call) diff --git a/tools/testing/selftests/hid/progs/hid.c b/tools/testing/selftests/hid/progs/hid.c index 1e558826b8..f67d35def1 100644 --- a/tools/testing/selftests/hid/progs/hid.c +++ b/tools/testing/selftests/hid/progs/hid.c @@ -101,6 +101,52 @@ int hid_user_raw_request(struct hid_hw_request_syscall_args *args) return 0; } +SEC("syscall") +int hid_user_output_report(struct hid_hw_request_syscall_args *args) +{ + struct hid_bpf_ctx *ctx; + const size_t size = args->size; + int i, ret = 0; + + if (size > sizeof(args->data)) + return -7; /* -E2BIG */ + + ctx = hid_bpf_allocate_context(args->hid); + if (!ctx) + return -1; /* EPERM check */ + + ret = hid_bpf_hw_output_report(ctx, + args->data, + size); + args->retval = ret; + + hid_bpf_release_context(ctx); + + return 0; +} + +SEC("syscall") +int hid_user_input_report(struct hid_hw_request_syscall_args *args) +{ + struct hid_bpf_ctx *ctx; + const size_t size = args->size; + int i, ret = 0; + + if (size > sizeof(args->data)) + return -7; /* -E2BIG */ + + ctx = hid_bpf_allocate_context(args->hid); + if (!ctx) + return -1; /* EPERM check */ + + ret = hid_bpf_input_report(ctx, HID_INPUT_REPORT, args->data, size); + args->retval = ret; + + hid_bpf_release_context(ctx); + + return 0; +} + static const __u8 rdesc[] = { 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ 0x09, 0x32, /* USAGE (Z) */ diff --git a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h index 65e657ac11..9cd56821d0 100644 --- a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h +++ b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h @@ -94,5 +94,11 @@ extern int hid_bpf_hw_request(struct hid_bpf_ctx *ctx, size_t buf__sz, enum hid_report_type type, enum hid_class_request reqtype) __ksym; +extern int hid_bpf_hw_output_report(struct hid_bpf_ctx *ctx, + __u8 *buf, size_t buf__sz) __ksym; +extern int hid_bpf_input_report(struct hid_bpf_ctx *ctx, + enum 
hid_report_type type, + __u8 *data, + size_t buf__sz) __ksym; #endif /* __HID_BPF_HELPERS_H */ diff --git a/tools/testing/selftests/hid/tests/base.py b/tools/testing/selftests/hid/tests/base.py index 51433063b2..3a465768e5 100644 --- a/tools/testing/selftests/hid/tests/base.py +++ b/tools/testing/selftests/hid/tests/base.py @@ -8,11 +8,13 @@ import libevdev import os import pytest +import shutil +import subprocess import time import logging -from hidtools.device.base_device import BaseDevice, EvdevMatch, SysfsFile +from .base_device import BaseDevice, EvdevMatch, SysfsFile from pathlib import Path from typing import Final, List, Tuple @@ -157,6 +159,17 @@ class BaseTestCase: # for example ("playstation", "hid-playstation") kernel_modules: List[Tuple[str, str]] = [] + # List of in kernel HID-BPF object files to load + # before starting the test + # Any existing pre-loaded HID-BPF module will be removed + # before the ones in this list will be manually loaded. + # Each Element is a tuple '(hid_bpf_object, rdesc_fixup_present)', + # for example '("xppen-ArtistPro16Gen2.bpf.o", True)' + # If 'rdesc_fixup_present' is True, the test needs to wait + # for one unbind and rebind before it can be sure the kernel is + # ready + hid_bpfs: List[Tuple[str, bool]] = [] + def assertInputEventsIn(self, expected_events, effective_events): effective_events = effective_events.copy() for ev in expected_events: @@ -211,8 +224,6 @@ class BaseTestCase: # we don't know beforehand the name of the module from modinfo sysfs_path = Path("/sys/module") / kernel_module.replace("-", "_") if not sysfs_path.exists(): - import subprocess - ret = subprocess.run(["/usr/sbin/modprobe", kernel_module]) if ret.returncode != 0: pytest.skip( @@ -225,6 +236,64 @@ class BaseTestCase: self._load_kernel_module(kernel_driver, kernel_module) yield + def load_hid_bpfs(self): + script_dir = Path(os.path.dirname(os.path.realpath(__file__))) + root_dir = (script_dir / "../../../../..").resolve() + bpf_dir = root_dir 
/ "drivers/hid/bpf/progs" + + udev_hid_bpf = shutil.which("udev-hid-bpf") + if not udev_hid_bpf: + pytest.skip("udev-hid-bpf not found in $PATH, skipping") + + wait = False + for _, rdesc_fixup in self.hid_bpfs: + if rdesc_fixup: + wait = True + + for hid_bpf, _ in self.hid_bpfs: + # We need to start `udev-hid-bpf` in the background + # and dispatch uhid events in case the kernel needs + # to fetch features on the device + process = subprocess.Popen( + [ + "udev-hid-bpf", + "--verbose", + "add", + str(self.uhdev.sys_path), + str(bpf_dir / hid_bpf), + ], + ) + while process.poll() is None: + self.uhdev.dispatch(1) + + if process.poll() != 0: + pytest.fail( + f"Couldn't insert hid-bpf program '{hid_bpf}', marking the test as failed" + ) + + if wait: + # the HID-BPF program exports a rdesc fixup, so it needs to be + # unbound by the kernel and then rebound. + # Ensure we get the bound event exactly 2 times (one for the normal + # uhid loading, and then the reload from HID-BPF) + now = time.time() + while self.uhdev.kernel_ready_count < 2 and time.time() - now < 2: + self.uhdev.dispatch(1) + + if self.uhdev.kernel_ready_count < 2: + pytest.fail( + f"Couldn't insert hid-bpf programs, marking the test as failed" + ) + + def unload_hid_bpfs(self): + ret = subprocess.run( + ["udev-hid-bpf", "--verbose", "remove", str(self.uhdev.sys_path)], + ) + if ret.returncode != 0: + pytest.fail( + f"Couldn't unload hid-bpf programs, marking the test as failed" + ) + @pytest.fixture() def new_uhdev(self, load_kernel_module): return self.create_device() @@ -248,12 +317,18 @@ class BaseTestCase: now = time.time() while not self.uhdev.is_ready() and time.time() - now < 5: self.uhdev.dispatch(1) + + if self.hid_bpfs: + self.load_hid_bpfs() + if self.uhdev.get_evdev() is None: logger.warning( f"available list of input nodes: (default application is '{self.uhdev.application}')" ) logger.warning(self.uhdev.input_nodes) yield + if self.hid_bpfs: + self.unload_hid_bpfs() self.uhdev = None 
except PermissionError: pytest.skip("Insufficient permissions, run me as root") @@ -313,8 +388,6 @@ class HIDTestUdevRule(object): self.reload_udev_rules() def reload_udev_rules(self): - import subprocess - subprocess.run("udevadm control --reload-rules".split()) subprocess.run("systemd-hwdb update".split()) @@ -330,10 +403,11 @@ class HIDTestUdevRule(object): delete=False, ) as f: f.write( - 'KERNELS=="*input*", ATTRS{name}=="*uhid test *", ENV{LIBINPUT_IGNORE_DEVICE}="1"\n' - ) - f.write( - 'KERNELS=="*input*", ATTRS{name}=="*uhid test * System Multi Axis", ENV{ID_INPUT_TOUCHSCREEN}="", ENV{ID_INPUT_SYSTEM_MULTIAXIS}="1"\n' + """ +KERNELS=="*input*", ATTRS{name}=="*uhid test *", ENV{LIBINPUT_IGNORE_DEVICE}="1" +KERNELS=="*hid*", ENV{HID_NAME}=="*uhid test *", ENV{HID_BPF_IGNORE_DEVICE}="1" +KERNELS=="*input*", ATTRS{name}=="*uhid test * System Multi Axis", ENV{ID_INPUT_TOUCHSCREEN}="", ENV{ID_INPUT_SYSTEM_MULTIAXIS}="1" +""" ) self.rulesfile = f diff --git a/tools/testing/selftests/hid/tests/base_device.py b/tools/testing/selftests/hid/tests/base_device.py new file mode 100644 index 0000000000..e0515be97f --- /dev/null +++ b/tools/testing/selftests/hid/tests/base_device.py @@ -0,0 +1,421 @@ +#!/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# -*- coding: utf-8 -*- +# +# Copyright (c) 2017 Benjamin Tissoires +# Copyright (c) 2017 Red Hat, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+ +import fcntl +import functools +import libevdev +import os + +try: + import pyudev +except ImportError: + raise ImportError("UHID is not supported due to missing pyudev dependency") + +import logging + +import hidtools.hid as hid +from hidtools.uhid import UHIDDevice +from hidtools.util import BusType + +from pathlib import Path +from typing import Any, ClassVar, Dict, List, Optional, Tuple, Type, Union + +logger = logging.getLogger("hidtools.device.base_device") + + +class SysfsFile(object): + def __init__(self, path): + self.path = path + + def __set_value(self, value): + with open(self.path, "w") as f: + return f.write(f"{value}\n") + + def __get_value(self): + with open(self.path) as f: + return f.read().strip() + + @property + def int_value(self) -> int: + return int(self.__get_value()) + + @int_value.setter + def int_value(self, v: int) -> None: + self.__set_value(v) + + @property + def str_value(self) -> str: + return self.__get_value() + + @str_value.setter + def str_value(self, v: str) -> None: + self.__set_value(v) + + +class LED(object): + def __init__(self, sys_path): + self.max_brightness = SysfsFile(sys_path / "max_brightness").int_value + self.__brightness = SysfsFile(sys_path / "brightness") + + @property + def brightness(self) -> int: + return self.__brightness.int_value + + @brightness.setter + def brightness(self, value: int) -> None: + self.__brightness.int_value = value + + +class PowerSupply(object): + """Represents Linux power_supply_class sysfs nodes.""" + + def __init__(self, sys_path): + self._capacity = SysfsFile(sys_path / "capacity") + self._status = SysfsFile(sys_path / "status") + self._type = SysfsFile(sys_path / "type") + + @property + def capacity(self) -> int: + return self._capacity.int_value + + @property + def status(self) -> str: + return self._status.str_value + + @property + def type(self) -> str: + return self._type.str_value + + +class HIDIsReady(object): + """ + Companion class that binds to a kernel mechanism + and 
that allows to know when a uhid device is ready or not. + + See :meth:`is_ready` for details. + """ + + def __init__(self: "HIDIsReady", uhid: UHIDDevice) -> None: + self.uhid = uhid + + def is_ready(self: "HIDIsReady") -> bool: + """ + Overwrite in subclasses: should return True or False whether + the attached uhid device is ready or not. + """ + return False + + +class UdevHIDIsReady(HIDIsReady): + _pyudev_context: ClassVar[Optional[pyudev.Context]] = None + _pyudev_monitor: ClassVar[Optional[pyudev.Monitor]] = None + _uhid_devices: ClassVar[Dict[int, Tuple[bool, int]]] = {} + + def __init__(self: "UdevHIDIsReady", uhid: UHIDDevice) -> None: + super().__init__(uhid) + self._init_pyudev() + + @classmethod + def _init_pyudev(cls: Type["UdevHIDIsReady"]) -> None: + if cls._pyudev_context is None: + cls._pyudev_context = pyudev.Context() + cls._pyudev_monitor = pyudev.Monitor.from_netlink(cls._pyudev_context) + cls._pyudev_monitor.filter_by("hid") + cls._pyudev_monitor.start() + + UHIDDevice._append_fd_to_poll( + cls._pyudev_monitor.fileno(), cls._cls_udev_event_callback + ) + + @classmethod + def _cls_udev_event_callback(cls: Type["UdevHIDIsReady"]) -> None: + if cls._pyudev_monitor is None: + return + event: pyudev.Device + for event in iter(functools.partial(cls._pyudev_monitor.poll, 0.02), None): + if event.action not in ["bind", "remove", "unbind"]: + return + + logger.debug(f"udev event: {event.action} -> {event}") + + id = int(event.sys_path.strip().split(".")[-1], 16) + + device_ready, count = cls._uhid_devices.get(id, (False, 0)) + + ready = event.action == "bind" + if not device_ready and ready: + count += 1 + cls._uhid_devices[id] = (ready, count) + + def is_ready(self: "UdevHIDIsReady") -> Tuple[bool, int]: + try: + return self._uhid_devices[self.uhid.hid_id] + except KeyError: + return (False, 0) + + +class EvdevMatch(object): + def __init__( + self: "EvdevMatch", + *, + requires: List[Any] = [], + excludes: List[Any] = [], + req_properties: List[Any] = 
[], + excl_properties: List[Any] = [], + ) -> None: + self.requires = requires + self.excludes = excludes + self.req_properties = req_properties + self.excl_properties = excl_properties + + def is_a_match(self: "EvdevMatch", evdev: libevdev.Device) -> bool: + for m in self.requires: + if not evdev.has(m): + return False + for m in self.excludes: + if evdev.has(m): + return False + for p in self.req_properties: + if not evdev.has_property(p): + return False + for p in self.excl_properties: + if evdev.has_property(p): + return False + return True + + +class EvdevDevice(object): + """ + Represents an Evdev node and its properties. + This is a stub for the libevdev devices, as they are relying on + uevent to get the data, saving us some ioctls to fetch the names + and properties. + """ + + def __init__(self: "EvdevDevice", sysfs: Path) -> None: + self.sysfs = sysfs + self.event_node: Any = None + self.libevdev: Optional[libevdev.Device] = None + + self.uevents = {} + # all of the interesting properties are stored in the input uevent, so in the parent + # so convert the uevent file of the parent input node into a dict + with open(sysfs.parent / "uevent") as f: + for line in f.readlines(): + key, value = line.strip().split("=") + self.uevents[key] = value.strip('"') + + # we open all evdev nodes in order to not miss any event + self.open() + + @property + def name(self: "EvdevDevice") -> str: + assert "NAME" in self.uevents + + return self.uevents["NAME"] + + @property + def evdev(self: "EvdevDevice") -> Path: + return Path("/dev/input") / self.sysfs.name + + def matches_application( + self: "EvdevDevice", application: str, matches: Dict[str, EvdevMatch] + ) -> bool: + if self.libevdev is None: + return False + + if application in matches: + return matches[application].is_a_match(self.libevdev) + + logger.error( + f"application '{application}' is unknown, please update/fix hid-tools" + ) + assert False # hid-tools likely needs an update + + def open(self: "EvdevDevice") 
-> libevdev.Device: + self.event_node = open(self.evdev, "rb") + self.libevdev = libevdev.Device(self.event_node) + + assert self.libevdev.fd is not None + + fd = self.libevdev.fd.fileno() + flag = fcntl.fcntl(fd, fcntl.F_GETFD) + fcntl.fcntl(fd, fcntl.F_SETFL, flag | os.O_NONBLOCK) + + return self.libevdev + + def close(self: "EvdevDevice") -> None: + if self.libevdev is not None and self.libevdev.fd is not None: + self.libevdev.fd.close() + self.libevdev = None + if self.event_node is not None: + self.event_node.close() + self.event_node = None + + +class BaseDevice(UHIDDevice): + # default _application_matches that matches nothing. This needs + # to be set in the subclasses to have get_evdev() working + _application_matches: Dict[str, EvdevMatch] = {} + + def __init__( + self, + name, + application, + rdesc_str: Optional[str] = None, + rdesc: Optional[Union[hid.ReportDescriptor, str, bytes]] = None, + input_info=None, + ) -> None: + self._kernel_is_ready: HIDIsReady = UdevHIDIsReady(self) + if rdesc_str is None and rdesc is None: + raise Exception("Please provide at least a rdesc or rdesc_str") + super().__init__() + if name is None: + name = f"uhid gamepad test {self.__class__.__name__}" + if input_info is None: + input_info = (BusType.USB, 1, 2) + self.name = name + self.info = input_info + self.default_reportID = None + self.opened = False + self.started = False + self.application = application + self._input_nodes: Optional[list[EvdevDevice]] = None + if rdesc is None: + assert rdesc_str is not None + self.rdesc = hid.ReportDescriptor.from_human_descr(rdesc_str) # type: ignore + else: + self.rdesc = rdesc # type: ignore + + @property + def power_supply_class(self: "BaseDevice") -> Optional[PowerSupply]: + ps = self.walk_sysfs("power_supply", "power_supply/*") + if ps is None or len(ps) < 1: + return None + + return PowerSupply(ps[0]) + + @property + def led_classes(self: "BaseDevice") -> List[LED]: + leds = self.walk_sysfs("led", "**/max_brightness") + if 
leds is None: + return [] + + return [LED(led.parent) for led in leds] + + @property + def kernel_is_ready(self: "BaseDevice") -> bool: + return self._kernel_is_ready.is_ready()[0] and self.started + + @property + def kernel_ready_count(self: "BaseDevice") -> int: + return self._kernel_is_ready.is_ready()[1] + + @property + def input_nodes(self: "BaseDevice") -> List[EvdevDevice]: + if self._input_nodes is not None: + return self._input_nodes + + if not self.kernel_is_ready or not self.started: + return [] + + self._input_nodes = [ + EvdevDevice(path) + for path in self.walk_sysfs("input", "input/input*/event*") + ] + return self._input_nodes + + def match_evdev_rule(self, application, evdev): + """Replace this in subclasses if the device has multiple reports + of the same type and we need to filter based on the actual evdev + node. + + returning True will append the corresponding report to + `self.input_nodes[type]` + returning False will ignore this report / type combination + for the device. 
+ """ + return True + + def open(self): + self.opened = True + + def _close_all_opened_evdev(self): + if self._input_nodes is not None: + for e in self._input_nodes: + e.close() + + def __del__(self): + self._close_all_opened_evdev() + + def close(self): + self.opened = False + + def start(self, flags): + self.started = True + + def stop(self): + self.started = False + self._close_all_opened_evdev() + + def next_sync_events(self, application=None): + evdev = self.get_evdev(application) + if evdev is not None: + return list(evdev.events()) + return [] + + @property + def application_matches(self: "BaseDevice") -> Dict[str, EvdevMatch]: + return self._application_matches + + @application_matches.setter + def application_matches(self: "BaseDevice", data: Dict[str, EvdevMatch]) -> None: + self._application_matches = data + + def get_evdev(self, application=None): + if application is None: + application = self.application + + if len(self.input_nodes) == 0: + return None + + assert self._input_nodes is not None + + if len(self._input_nodes) == 1: + evdev = self._input_nodes[0] + if self.match_evdev_rule(application, evdev.libevdev): + return evdev.libevdev + else: + for _evdev in self._input_nodes: + if _evdev.matches_application(application, self.application_matches): + if self.match_evdev_rule(application, _evdev.libevdev): + return _evdev.libevdev + + def is_ready(self): + """Returns whether a UHID device is ready. Can be overwritten in + subclasses to add extra conditions on when to consider a UHID + device ready. 
This can be: + + - we need to wait on different types of input devices to be ready + (Touch Screen and Pen for example) + - we need to have at least 4 LEDs present + (len(self.uhdev.leds_classes) == 4) + - or any other combinations""" + return self.kernel_is_ready diff --git a/tools/testing/selftests/hid/tests/base_gamepad.py b/tools/testing/selftests/hid/tests/base_gamepad.py new file mode 100644 index 0000000000..ec74d75767 --- /dev/null +++ b/tools/testing/selftests/hid/tests/base_gamepad.py @@ -0,0 +1,238 @@ +# SPDX-License-Identifier: GPL-2.0 +import libevdev + +from .base_device import BaseDevice +from hidtools.util import BusType + + +class InvalidHIDCommunication(Exception): + pass + + +class GamepadData(object): + pass + + +class AxisMapping(object): + """Represents a mapping between a HID type + and an evdev event""" + + def __init__(self, hid, evdev=None): + self.hid = hid.lower() + + if evdev is None: + evdev = f"ABS_{hid.upper()}" + + self.evdev = libevdev.evbit("EV_ABS", evdev) + + +class BaseGamepad(BaseDevice): + buttons_map = { + 1: "BTN_SOUTH", + 2: "BTN_EAST", + 3: "BTN_C", + 4: "BTN_NORTH", + 5: "BTN_WEST", + 6: "BTN_Z", + 7: "BTN_TL", + 8: "BTN_TR", + 9: "BTN_TL2", + 10: "BTN_TR2", + 11: "BTN_SELECT", + 12: "BTN_START", + 13: "BTN_MODE", + 14: "BTN_THUMBL", + 15: "BTN_THUMBR", + } + + axes_map = { + "left_stick": { + "x": AxisMapping("x"), + "y": AxisMapping("y"), + }, + "right_stick": { + "x": AxisMapping("z"), + "y": AxisMapping("Rz"), + }, + } + + def __init__(self, rdesc, application="Game Pad", name=None, input_info=None): + assert rdesc is not None + super().__init__(name, application, input_info=input_info, rdesc=rdesc) + self.buttons = (1, 2, 3) + self._buttons = {} + self.left = (127, 127) + self.right = (127, 127) + self.hat_switch = 15 + assert self.parsed_rdesc is not None + + self.fields = [] + for r in self.parsed_rdesc.input_reports.values(): + if r.application_name == self.application: + self.fields.extend([f.usage_name for f in 
r]) + + def store_axes(self, which, gamepad, data): + amap = self.axes_map[which] + x, y = data + setattr(gamepad, amap["x"].hid, x) + setattr(gamepad, amap["y"].hid, y) + + def create_report( + self, + *, + left=(None, None), + right=(None, None), + hat_switch=None, + buttons=None, + reportID=None, + application="Game Pad", + ): + """ + Return an input report for this device. + + :param left: a tuple of absolute (x, y) value of the left joypad + where ``None`` is "leave unchanged" + :param right: a tuple of absolute (x, y) value of the right joypad + where ``None`` is "leave unchanged" + :param hat_switch: an absolute angular value of the hat switch + (expressed in 1/8 of circle, 0 being North, 2 East) + where ``None`` is "leave unchanged" + :param buttons: a dict of index/bool for the button states, + where ``None`` is "leave unchanged" + :param reportID: the numeric report ID for this report, if needed + :param application: the application used to report the values + """ + if buttons is not None: + for i, b in buttons.items(): + if i not in self.buttons: + raise InvalidHIDCommunication( + f"button {i} is not part of this {self.application}" + ) + if b is not None: + self._buttons[i] = b + + def replace_none_in_tuple(item, default): + if item is None: + item = (None, None) + + if None in item: + if item[0] is None: + item = (default[0], item[1]) + if item[1] is None: + item = (item[0], default[1]) + + return item + + right = replace_none_in_tuple(right, self.right) + self.right = right + left = replace_none_in_tuple(left, self.left) + self.left = left + + if hat_switch is None: + hat_switch = self.hat_switch + else: + self.hat_switch = hat_switch + + reportID = reportID or self.default_reportID + + gamepad = GamepadData() + for i, b in self._buttons.items(): + gamepad.__setattr__(f"b{i}", int(b) if b is not None else 0) + + self.store_axes("left_stick", gamepad, left) + self.store_axes("right_stick", gamepad, right) + gamepad.hatswitch = hat_switch # type: ignore 
### gamepad is by default empty + return super().create_report( + gamepad, reportID=reportID, application=application + ) + + def event( + self, *, left=(None, None), right=(None, None), hat_switch=None, buttons=None + ): + """ + Send an input event on the default report ID. + + :param left: a tuple of absolute (x, y) value of the left joypad + where ``None`` is "leave unchanged" + :param right: a tuple of absolute (x, y) value of the right joypad + where ``None`` is "leave unchanged" + :param hat_switch: an absolute angular value of the hat switch + where ``None`` is "leave unchanged" + :param buttons: a dict of index/bool for the button states, + where ``None`` is "leave unchanged" + """ + r = self.create_report( + left=left, right=right, hat_switch=hat_switch, buttons=buttons + ) + self.call_input_event(r) + return [r] + + +class JoystickGamepad(BaseGamepad): + buttons_map = { + 1: "BTN_TRIGGER", + 2: "BTN_THUMB", + 3: "BTN_THUMB2", + 4: "BTN_TOP", + 5: "BTN_TOP2", + 6: "BTN_PINKIE", + 7: "BTN_BASE", + 8: "BTN_BASE2", + 9: "BTN_BASE3", + 10: "BTN_BASE4", + 11: "BTN_BASE5", + 12: "BTN_BASE6", + 13: "BTN_DEAD", + } + + axes_map = { + "left_stick": { + "x": AxisMapping("x"), + "y": AxisMapping("y"), + }, + "right_stick": { + "x": AxisMapping("rudder"), + "y": AxisMapping("throttle"), + }, + } + + def __init__(self, rdesc, application="Joystick", name=None, input_info=None): + super().__init__(rdesc, application, name, input_info) + + def create_report( + self, + *, + left=(None, None), + right=(None, None), + hat_switch=None, + buttons=None, + reportID=None, + application=None, + ): + """ + Return an input report for this device. 
+ + :param left: a tuple of absolute (x, y) value of the left joypad + where ``None`` is "leave unchanged" + :param right: a tuple of absolute (x, y) value of the right joypad + where ``None`` is "leave unchanged" + :param hat_switch: an absolute angular value of the hat switch + where ``None`` is "leave unchanged" + :param buttons: a dict of index/bool for the button states, + where ``None`` is "leave unchanged" + :param reportID: the numeric report ID for this report, if needed + :param application: the application for this report, if needed + """ + if application is None: + application = "Joystick" + return super().create_report( + left=left, + right=right, + hat_switch=hat_switch, + buttons=buttons, + reportID=reportID, + application=application, + ) + + def store_right_joystick(self, gamepad, data): + gamepad.rudder, gamepad.throttle = data diff --git a/tools/testing/selftests/hid/tests/test_gamepad.py b/tools/testing/selftests/hid/tests/test_gamepad.py index 26c74040b7..8d5b5ffdae 100644 --- a/tools/testing/selftests/hid/tests/test_gamepad.py +++ b/tools/testing/selftests/hid/tests/test_gamepad.py @@ -10,7 +10,8 @@ from . 
import base import libevdev import pytest -from hidtools.device.base_gamepad import AsusGamepad, SaitekGamepad +from .base_gamepad import BaseGamepad, JoystickGamepad, AxisMapping +from hidtools.util import BusType import logging @@ -199,6 +200,449 @@ class BaseTest: ) +class SaitekGamepad(JoystickGamepad): + # fmt: off + report_descriptor = [ + 0x05, 0x01, # Usage Page (Generic Desktop) 0 + 0x09, 0x04, # Usage (Joystick) 2 + 0xa1, 0x01, # Collection (Application) 4 + 0x09, 0x01, # .Usage (Pointer) 6 + 0xa1, 0x00, # .Collection (Physical) 8 + 0x85, 0x01, # ..Report ID (1) 10 + 0x09, 0x30, # ..Usage (X) 12 + 0x15, 0x00, # ..Logical Minimum (0) 14 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 16 + 0x35, 0x00, # ..Physical Minimum (0) 19 + 0x46, 0xff, 0x00, # ..Physical Maximum (255) 21 + 0x75, 0x08, # ..Report Size (8) 24 + 0x95, 0x01, # ..Report Count (1) 26 + 0x81, 0x02, # ..Input (Data,Var,Abs) 28 + 0x09, 0x31, # ..Usage (Y) 30 + 0x81, 0x02, # ..Input (Data,Var,Abs) 32 + 0x05, 0x02, # ..Usage Page (Simulation Controls) 34 + 0x09, 0xba, # ..Usage (Rudder) 36 + 0x81, 0x02, # ..Input (Data,Var,Abs) 38 + 0x09, 0xbb, # ..Usage (Throttle) 40 + 0x81, 0x02, # ..Input (Data,Var,Abs) 42 + 0x05, 0x09, # ..Usage Page (Button) 44 + 0x19, 0x01, # ..Usage Minimum (1) 46 + 0x29, 0x0c, # ..Usage Maximum (12) 48 + 0x25, 0x01, # ..Logical Maximum (1) 50 + 0x45, 0x01, # ..Physical Maximum (1) 52 + 0x75, 0x01, # ..Report Size (1) 54 + 0x95, 0x0c, # ..Report Count (12) 56 + 0x81, 0x02, # ..Input (Data,Var,Abs) 58 + 0x95, 0x01, # ..Report Count (1) 60 + 0x75, 0x00, # ..Report Size (0) 62 + 0x81, 0x03, # ..Input (Cnst,Var,Abs) 64 + 0x05, 0x01, # ..Usage Page (Generic Desktop) 66 + 0x09, 0x39, # ..Usage (Hat switch) 68 + 0x25, 0x07, # ..Logical Maximum (7) 70 + 0x46, 0x3b, 0x01, # ..Physical Maximum (315) 72 + 0x55, 0x00, # ..Unit Exponent (0) 75 + 0x65, 0x44, # ..Unit (Degrees^4,EngRotation) 77 + 0x75, 0x04, # ..Report Size (4) 79 + 0x81, 0x42, # ..Input (Data,Var,Abs,Null) 81 + 0x65, 
0x00, # ..Unit (None) 83 + 0xc0, # .End Collection 85 + 0x05, 0x0f, # .Usage Page (Vendor Usage Page 0x0f) 86 + 0x09, 0x92, # .Usage (Vendor Usage 0x92) 88 + 0xa1, 0x02, # .Collection (Logical) 90 + 0x85, 0x02, # ..Report ID (2) 92 + 0x09, 0xa0, # ..Usage (Vendor Usage 0xa0) 94 + 0x09, 0x9f, # ..Usage (Vendor Usage 0x9f) 96 + 0x25, 0x01, # ..Logical Maximum (1) 98 + 0x45, 0x00, # ..Physical Maximum (0) 100 + 0x75, 0x01, # ..Report Size (1) 102 + 0x95, 0x02, # ..Report Count (2) 104 + 0x81, 0x02, # ..Input (Data,Var,Abs) 106 + 0x75, 0x06, # ..Report Size (6) 108 + 0x95, 0x01, # ..Report Count (1) 110 + 0x81, 0x03, # ..Input (Cnst,Var,Abs) 112 + 0x09, 0x22, # ..Usage (Vendor Usage 0x22) 114 + 0x75, 0x07, # ..Report Size (7) 116 + 0x25, 0x7f, # ..Logical Maximum (127) 118 + 0x81, 0x02, # ..Input (Data,Var,Abs) 120 + 0x09, 0x94, # ..Usage (Vendor Usage 0x94) 122 + 0x75, 0x01, # ..Report Size (1) 124 + 0x25, 0x01, # ..Logical Maximum (1) 126 + 0x81, 0x02, # ..Input (Data,Var,Abs) 128 + 0xc0, # .End Collection 130 + 0x09, 0x21, # .Usage (Vendor Usage 0x21) 131 + 0xa1, 0x02, # .Collection (Logical) 133 + 0x85, 0x0b, # ..Report ID (11) 135 + 0x09, 0x22, # ..Usage (Vendor Usage 0x22) 137 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 139 + 0x75, 0x08, # ..Report Size (8) 142 + 0x91, 0x02, # ..Output (Data,Var,Abs) 144 + 0x09, 0x53, # ..Usage (Vendor Usage 0x53) 146 + 0x25, 0x0a, # ..Logical Maximum (10) 148 + 0x91, 0x02, # ..Output (Data,Var,Abs) 150 + 0x09, 0x50, # ..Usage (Vendor Usage 0x50) 152 + 0x27, 0xfe, 0xff, 0x00, 0x00, # ..Logical Maximum (65534) 154 + 0x47, 0xfe, 0xff, 0x00, 0x00, # ..Physical Maximum (65534) 159 + 0x75, 0x10, # ..Report Size (16) 164 + 0x55, 0xfd, # ..Unit Exponent (237) 166 + 0x66, 0x01, 0x10, # ..Unit (Seconds,SILinear) 168 + 0x91, 0x02, # ..Output (Data,Var,Abs) 171 + 0x55, 0x00, # ..Unit Exponent (0) 173 + 0x65, 0x00, # ..Unit (None) 175 + 0x09, 0x54, # ..Usage (Vendor Usage 0x54) 177 + 0x55, 0xfd, # ..Unit Exponent (237) 179 + 0x66, 0x01, 
0x10, # ..Unit (Seconds,SILinear) 181 + 0x91, 0x02, # ..Output (Data,Var,Abs) 184 + 0x55, 0x00, # ..Unit Exponent (0) 186 + 0x65, 0x00, # ..Unit (None) 188 + 0x09, 0xa7, # ..Usage (Vendor Usage 0xa7) 190 + 0x55, 0xfd, # ..Unit Exponent (237) 192 + 0x66, 0x01, 0x10, # ..Unit (Seconds,SILinear) 194 + 0x91, 0x02, # ..Output (Data,Var,Abs) 197 + 0x55, 0x00, # ..Unit Exponent (0) 199 + 0x65, 0x00, # ..Unit (None) 201 + 0xc0, # .End Collection 203 + 0x09, 0x5a, # .Usage (Vendor Usage 0x5a) 204 + 0xa1, 0x02, # .Collection (Logical) 206 + 0x85, 0x0c, # ..Report ID (12) 208 + 0x09, 0x22, # ..Usage (Vendor Usage 0x22) 210 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 212 + 0x45, 0x00, # ..Physical Maximum (0) 215 + 0x75, 0x08, # ..Report Size (8) 217 + 0x91, 0x02, # ..Output (Data,Var,Abs) 219 + 0x09, 0x5c, # ..Usage (Vendor Usage 0x5c) 221 + 0x26, 0x10, 0x27, # ..Logical Maximum (10000) 223 + 0x46, 0x10, 0x27, # ..Physical Maximum (10000) 226 + 0x75, 0x10, # ..Report Size (16) 229 + 0x55, 0xfd, # ..Unit Exponent (237) 231 + 0x66, 0x01, 0x10, # ..Unit (Seconds,SILinear) 233 + 0x91, 0x02, # ..Output (Data,Var,Abs) 236 + 0x55, 0x00, # ..Unit Exponent (0) 238 + 0x65, 0x00, # ..Unit (None) 240 + 0x09, 0x5b, # ..Usage (Vendor Usage 0x5b) 242 + 0x25, 0x7f, # ..Logical Maximum (127) 244 + 0x75, 0x08, # ..Report Size (8) 246 + 0x91, 0x02, # ..Output (Data,Var,Abs) 248 + 0x09, 0x5e, # ..Usage (Vendor Usage 0x5e) 250 + 0x26, 0x10, 0x27, # ..Logical Maximum (10000) 252 + 0x75, 0x10, # ..Report Size (16) 255 + 0x55, 0xfd, # ..Unit Exponent (237) 257 + 0x66, 0x01, 0x10, # ..Unit (Seconds,SILinear) 259 + 0x91, 0x02, # ..Output (Data,Var,Abs) 262 + 0x55, 0x00, # ..Unit Exponent (0) 264 + 0x65, 0x00, # ..Unit (None) 266 + 0x09, 0x5d, # ..Usage (Vendor Usage 0x5d) 268 + 0x25, 0x7f, # ..Logical Maximum (127) 270 + 0x75, 0x08, # ..Report Size (8) 272 + 0x91, 0x02, # ..Output (Data,Var,Abs) 274 + 0xc0, # .End Collection 276 + 0x09, 0x73, # .Usage (Vendor Usage 0x73) 277 + 0xa1, 0x02, # 
.Collection (Logical) 279 + 0x85, 0x0d, # ..Report ID (13) 281 + 0x09, 0x22, # ..Usage (Vendor Usage 0x22) 283 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 285 + 0x45, 0x00, # ..Physical Maximum (0) 288 + 0x91, 0x02, # ..Output (Data,Var,Abs) 290 + 0x09, 0x70, # ..Usage (Vendor Usage 0x70) 292 + 0x15, 0x81, # ..Logical Minimum (-127) 294 + 0x25, 0x7f, # ..Logical Maximum (127) 296 + 0x36, 0xf0, 0xd8, # ..Physical Minimum (-10000) 298 + 0x46, 0x10, 0x27, # ..Physical Maximum (10000) 301 + 0x91, 0x02, # ..Output (Data,Var,Abs) 304 + 0xc0, # .End Collection 306 + 0x09, 0x6e, # .Usage (Vendor Usage 0x6e) 307 + 0xa1, 0x02, # .Collection (Logical) 309 + 0x85, 0x0e, # ..Report ID (14) 311 + 0x09, 0x22, # ..Usage (Vendor Usage 0x22) 313 + 0x15, 0x00, # ..Logical Minimum (0) 315 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 317 + 0x35, 0x00, # ..Physical Minimum (0) 320 + 0x45, 0x00, # ..Physical Maximum (0) 322 + 0x91, 0x02, # ..Output (Data,Var,Abs) 324 + 0x09, 0x70, # ..Usage (Vendor Usage 0x70) 326 + 0x25, 0x7f, # ..Logical Maximum (127) 328 + 0x46, 0x10, 0x27, # ..Physical Maximum (10000) 330 + 0x91, 0x02, # ..Output (Data,Var,Abs) 333 + 0x09, 0x6f, # ..Usage (Vendor Usage 0x6f) 335 + 0x15, 0x81, # ..Logical Minimum (-127) 337 + 0x36, 0xf0, 0xd8, # ..Physical Minimum (-10000) 339 + 0x91, 0x02, # ..Output (Data,Var,Abs) 342 + 0x09, 0x71, # ..Usage (Vendor Usage 0x71) 344 + 0x15, 0x00, # ..Logical Minimum (0) 346 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 348 + 0x35, 0x00, # ..Physical Minimum (0) 351 + 0x46, 0x68, 0x01, # ..Physical Maximum (360) 353 + 0x91, 0x02, # ..Output (Data,Var,Abs) 356 + 0x09, 0x72, # ..Usage (Vendor Usage 0x72) 358 + 0x75, 0x10, # ..Report Size (16) 360 + 0x26, 0x10, 0x27, # ..Logical Maximum (10000) 362 + 0x46, 0x10, 0x27, # ..Physical Maximum (10000) 365 + 0x55, 0xfd, # ..Unit Exponent (237) 368 + 0x66, 0x01, 0x10, # ..Unit (Seconds,SILinear) 370 + 0x91, 0x02, # ..Output (Data,Var,Abs) 373 + 0x55, 0x00, # ..Unit Exponent (0) 375 + 0x65, 
0x00, # ..Unit (None) 377 + 0xc0, # .End Collection 379 + 0x09, 0x77, # .Usage (Vendor Usage 0x77) 380 + 0xa1, 0x02, # .Collection (Logical) 382 + 0x85, 0x51, # ..Report ID (81) 384 + 0x09, 0x22, # ..Usage (Vendor Usage 0x22) 386 + 0x25, 0x7f, # ..Logical Maximum (127) 388 + 0x45, 0x00, # ..Physical Maximum (0) 390 + 0x75, 0x08, # ..Report Size (8) 392 + 0x91, 0x02, # ..Output (Data,Var,Abs) 394 + 0x09, 0x78, # ..Usage (Vendor Usage 0x78) 396 + 0xa1, 0x02, # ..Collection (Logical) 398 + 0x09, 0x7b, # ...Usage (Vendor Usage 0x7b) 400 + 0x09, 0x79, # ...Usage (Vendor Usage 0x79) 402 + 0x09, 0x7a, # ...Usage (Vendor Usage 0x7a) 404 + 0x15, 0x01, # ...Logical Minimum (1) 406 + 0x25, 0x03, # ...Logical Maximum (3) 408 + 0x91, 0x00, # ...Output (Data,Arr,Abs) 410 + 0xc0, # ..End Collection 412 + 0x09, 0x7c, # ..Usage (Vendor Usage 0x7c) 413 + 0x15, 0x00, # ..Logical Minimum (0) 415 + 0x26, 0xfe, 0x00, # ..Logical Maximum (254) 417 + 0x91, 0x02, # ..Output (Data,Var,Abs) 420 + 0xc0, # .End Collection 422 + 0x09, 0x92, # .Usage (Vendor Usage 0x92) 423 + 0xa1, 0x02, # .Collection (Logical) 425 + 0x85, 0x52, # ..Report ID (82) 427 + 0x09, 0x96, # ..Usage (Vendor Usage 0x96) 429 + 0xa1, 0x02, # ..Collection (Logical) 431 + 0x09, 0x9a, # ...Usage (Vendor Usage 0x9a) 433 + 0x09, 0x99, # ...Usage (Vendor Usage 0x99) 435 + 0x09, 0x97, # ...Usage (Vendor Usage 0x97) 437 + 0x09, 0x98, # ...Usage (Vendor Usage 0x98) 439 + 0x09, 0x9b, # ...Usage (Vendor Usage 0x9b) 441 + 0x09, 0x9c, # ...Usage (Vendor Usage 0x9c) 443 + 0x15, 0x01, # ...Logical Minimum (1) 445 + 0x25, 0x06, # ...Logical Maximum (6) 447 + 0x91, 0x00, # ...Output (Data,Arr,Abs) 449 + 0xc0, # ..End Collection 451 + 0xc0, # .End Collection 452 + 0x05, 0xff, # .Usage Page (Vendor Usage Page 0xff) 453 + 0x0a, 0x01, 0x03, # .Usage (Vendor Usage 0x301) 455 + 0xa1, 0x02, # .Collection (Logical) 458 + 0x85, 0x40, # ..Report ID (64) 460 + 0x0a, 0x02, 0x03, # ..Usage (Vendor Usage 0x302) 462 + 0xa1, 0x02, # ..Collection (Logical) 
465 + 0x1a, 0x11, 0x03, # ...Usage Minimum (785) 467 + 0x2a, 0x20, 0x03, # ...Usage Maximum (800) 470 + 0x25, 0x10, # ...Logical Maximum (16) 473 + 0x91, 0x00, # ...Output (Data,Arr,Abs) 475 + 0xc0, # ..End Collection 477 + 0x0a, 0x03, 0x03, # ..Usage (Vendor Usage 0x303) 478 + 0x15, 0x00, # ..Logical Minimum (0) 481 + 0x27, 0xff, 0xff, 0x00, 0x00, # ..Logical Maximum (65535) 483 + 0x75, 0x10, # ..Report Size (16) 488 + 0x91, 0x02, # ..Output (Data,Var,Abs) 490 + 0xc0, # .End Collection 492 + 0x05, 0x0f, # .Usage Page (Vendor Usage Page 0x0f) 493 + 0x09, 0x7d, # .Usage (Vendor Usage 0x7d) 495 + 0xa1, 0x02, # .Collection (Logical) 497 + 0x85, 0x43, # ..Report ID (67) 499 + 0x09, 0x7e, # ..Usage (Vendor Usage 0x7e) 501 + 0x26, 0x80, 0x00, # ..Logical Maximum (128) 503 + 0x46, 0x10, 0x27, # ..Physical Maximum (10000) 506 + 0x75, 0x08, # ..Report Size (8) 509 + 0x91, 0x02, # ..Output (Data,Var,Abs) 511 + 0xc0, # .End Collection 513 + 0x09, 0x7f, # .Usage (Vendor Usage 0x7f) 514 + 0xa1, 0x02, # .Collection (Logical) 516 + 0x85, 0x0b, # ..Report ID (11) 518 + 0x09, 0x80, # ..Usage (Vendor Usage 0x80) 520 + 0x26, 0xff, 0x7f, # ..Logical Maximum (32767) 522 + 0x45, 0x00, # ..Physical Maximum (0) 525 + 0x75, 0x0f, # ..Report Size (15) 527 + 0xb1, 0x03, # ..Feature (Cnst,Var,Abs) 529 + 0x09, 0xa9, # ..Usage (Vendor Usage 0xa9) 531 + 0x25, 0x01, # ..Logical Maximum (1) 533 + 0x75, 0x01, # ..Report Size (1) 535 + 0xb1, 0x03, # ..Feature (Cnst,Var,Abs) 537 + 0x09, 0x83, # ..Usage (Vendor Usage 0x83) 539 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 541 + 0x75, 0x08, # ..Report Size (8) 544 + 0xb1, 0x03, # ..Feature (Cnst,Var,Abs) 546 + 0xc0, # .End Collection 548 + 0x09, 0xab, # .Usage (Vendor Usage 0xab) 549 + 0xa1, 0x03, # .Collection (Report) 551 + 0x85, 0x15, # ..Report ID (21) 553 + 0x09, 0x25, # ..Usage (Vendor Usage 0x25) 555 + 0xa1, 0x02, # ..Collection (Logical) 557 + 0x09, 0x26, # ...Usage (Vendor Usage 0x26) 559 + 0x09, 0x30, # ...Usage (Vendor Usage 0x30) 561 + 
0x09, 0x32, # ...Usage (Vendor Usage 0x32) 563 + 0x09, 0x31, # ...Usage (Vendor Usage 0x31) 565 + 0x09, 0x33, # ...Usage (Vendor Usage 0x33) 567 + 0x09, 0x34, # ...Usage (Vendor Usage 0x34) 569 + 0x15, 0x01, # ...Logical Minimum (1) 571 + 0x25, 0x06, # ...Logical Maximum (6) 573 + 0xb1, 0x00, # ...Feature (Data,Arr,Abs) 575 + 0xc0, # ..End Collection 577 + 0xc0, # .End Collection 578 + 0x09, 0x89, # .Usage (Vendor Usage 0x89) 579 + 0xa1, 0x03, # .Collection (Report) 581 + 0x85, 0x16, # ..Report ID (22) 583 + 0x09, 0x8b, # ..Usage (Vendor Usage 0x8b) 585 + 0xa1, 0x02, # ..Collection (Logical) 587 + 0x09, 0x8c, # ...Usage (Vendor Usage 0x8c) 589 + 0x09, 0x8d, # ...Usage (Vendor Usage 0x8d) 591 + 0x09, 0x8e, # ...Usage (Vendor Usage 0x8e) 593 + 0x25, 0x03, # ...Logical Maximum (3) 595 + 0xb1, 0x00, # ...Feature (Data,Arr,Abs) 597 + 0xc0, # ..End Collection 599 + 0x09, 0x22, # ..Usage (Vendor Usage 0x22) 600 + 0x15, 0x00, # ..Logical Minimum (0) 602 + 0x26, 0xfe, 0x00, # ..Logical Maximum (254) 604 + 0xb1, 0x02, # ..Feature (Data,Var,Abs) 607 + 0xc0, # .End Collection 609 + 0x09, 0x90, # .Usage (Vendor Usage 0x90) 610 + 0xa1, 0x03, # .Collection (Report) 612 + 0x85, 0x50, # ..Report ID (80) 614 + 0x09, 0x22, # ..Usage (Vendor Usage 0x22) 616 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 618 + 0x91, 0x02, # ..Output (Data,Var,Abs) 621 + 0xc0, # .End Collection 623 + 0xc0, # End Collection 624 + ] + # fmt: on + + def __init__(self, rdesc=report_descriptor, name=None): + super().__init__(rdesc, name=name, input_info=(BusType.USB, 0x06A3, 0xFF0D)) + self.buttons = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12) + + +class AsusGamepad(BaseGamepad): + # fmt: off + report_descriptor = [ + 0x05, 0x01, # Usage Page (Generic Desktop) 0 + 0x09, 0x05, # Usage (Game Pad) 2 + 0xa1, 0x01, # Collection (Application) 4 + 0x85, 0x01, # .Report ID (1) 6 + 0x05, 0x09, # .Usage Page (Button) 8 + 0x0a, 0x01, 0x00, # .Usage (Vendor Usage 0x01) 10 + 0x0a, 0x02, 0x00, # .Usage (Vendor Usage 0x02) 13 + 
0x0a, 0x04, 0x00, # .Usage (Vendor Usage 0x04) 16 + 0x0a, 0x05, 0x00, # .Usage (Vendor Usage 0x05) 19 + 0x0a, 0x07, 0x00, # .Usage (Vendor Usage 0x07) 22 + 0x0a, 0x08, 0x00, # .Usage (Vendor Usage 0x08) 25 + 0x0a, 0x0e, 0x00, # .Usage (Vendor Usage 0x0e) 28 + 0x0a, 0x0f, 0x00, # .Usage (Vendor Usage 0x0f) 31 + 0x0a, 0x0d, 0x00, # .Usage (Vendor Usage 0x0d) 34 + 0x05, 0x0c, # .Usage Page (Consumer Devices) 37 + 0x0a, 0x24, 0x02, # .Usage (AC Back) 39 + 0x0a, 0x23, 0x02, # .Usage (AC Home) 42 + 0x15, 0x00, # .Logical Minimum (0) 45 + 0x25, 0x01, # .Logical Maximum (1) 47 + 0x75, 0x01, # .Report Size (1) 49 + 0x95, 0x0b, # .Report Count (11) 51 + 0x81, 0x02, # .Input (Data,Var,Abs) 53 + 0x75, 0x01, # .Report Size (1) 55 + 0x95, 0x01, # .Report Count (1) 57 + 0x81, 0x03, # .Input (Cnst,Var,Abs) 59 + 0x05, 0x01, # .Usage Page (Generic Desktop) 61 + 0x75, 0x04, # .Report Size (4) 63 + 0x95, 0x01, # .Report Count (1) 65 + 0x25, 0x07, # .Logical Maximum (7) 67 + 0x46, 0x3b, 0x01, # .Physical Maximum (315) 69 + 0x66, 0x14, 0x00, # .Unit (Degrees,EngRotation) 72 + 0x09, 0x39, # .Usage (Hat switch) 75 + 0x81, 0x42, # .Input (Data,Var,Abs,Null) 77 + 0x66, 0x00, 0x00, # .Unit (None) 79 + 0x09, 0x01, # .Usage (Pointer) 82 + 0xa1, 0x00, # .Collection (Physical) 84 + 0x09, 0x30, # ..Usage (X) 86 + 0x09, 0x31, # ..Usage (Y) 88 + 0x09, 0x32, # ..Usage (Z) 90 + 0x09, 0x35, # ..Usage (Rz) 92 + 0x05, 0x02, # ..Usage Page (Simulation Controls) 94 + 0x09, 0xc5, # ..Usage (Brake) 96 + 0x09, 0xc4, # ..Usage (Accelerator) 98 + 0x15, 0x00, # ..Logical Minimum (0) 100 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 102 + 0x35, 0x00, # ..Physical Minimum (0) 105 + 0x46, 0xff, 0x00, # ..Physical Maximum (255) 107 + 0x75, 0x08, # ..Report Size (8) 110 + 0x95, 0x06, # ..Report Count (6) 112 + 0x81, 0x02, # ..Input (Data,Var,Abs) 114 + 0xc0, # .End Collection 116 + 0x85, 0x02, # .Report ID (2) 117 + 0x05, 0x08, # .Usage Page (LEDs) 119 + 0x0a, 0x01, 0x00, # .Usage (Num Lock) 121 + 0x0a, 0x02, 0x00, 
# .Usage (Caps Lock) 124 + 0x0a, 0x03, 0x00, # .Usage (Scroll Lock) 127 + 0x0a, 0x04, 0x00, # .Usage (Compose) 130 + 0x15, 0x00, # .Logical Minimum (0) 133 + 0x25, 0x01, # .Logical Maximum (1) 135 + 0x75, 0x01, # .Report Size (1) 137 + 0x95, 0x04, # .Report Count (4) 139 + 0x91, 0x02, # .Output (Data,Var,Abs) 141 + 0x75, 0x04, # .Report Size (4) 143 + 0x95, 0x01, # .Report Count (1) 145 + 0x91, 0x03, # .Output (Cnst,Var,Abs) 147 + 0xc0, # End Collection 149 + 0x05, 0x0c, # Usage Page (Consumer Devices) 150 + 0x09, 0x01, # Usage (Consumer Control) 152 + 0xa1, 0x01, # Collection (Application) 154 + 0x85, 0x03, # .Report ID (3) 156 + 0x05, 0x01, # .Usage Page (Generic Desktop) 158 + 0x09, 0x06, # .Usage (Keyboard) 160 + 0xa1, 0x02, # .Collection (Logical) 162 + 0x05, 0x06, # ..Usage Page (Generic Device Controls) 164 + 0x09, 0x20, # ..Usage (Battery Strength) 166 + 0x15, 0x00, # ..Logical Minimum (0) 168 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 170 + 0x75, 0x08, # ..Report Size (8) 173 + 0x95, 0x01, # ..Report Count (1) 175 + 0x81, 0x02, # ..Input (Data,Var,Abs) 177 + 0x06, 0xbc, 0xff, # ..Usage Page (Vendor Usage Page 0xffbc) 179 + 0x0a, 0xad, 0xbd, # ..Usage (Vendor Usage 0xbdad) 182 + 0x75, 0x08, # ..Report Size (8) 185 + 0x95, 0x06, # ..Report Count (6) 187 + 0x81, 0x02, # ..Input (Data,Var,Abs) 189 + 0xc0, # .End Collection 191 + 0xc0, # End Collection 192 + ] + # fmt: on + + def __init__(self, rdesc=report_descriptor, name=None): + super().__init__(rdesc, name=name, input_info=(BusType.USB, 0x18D1, 0x2C40)) + self.buttons = (1, 2, 4, 5, 7, 8, 14, 15, 13) + + +class RaptorMach2Joystick(JoystickGamepad): + axes_map = { + "left_stick": { + "x": AxisMapping("x"), + "y": AxisMapping("y"), + }, + "right_stick": { + "x": AxisMapping("z"), + "y": AxisMapping("Rz"), + }, + } + + def __init__( + self, + name, + rdesc=None, + application="Joystick", + input_info=(BusType.USB, 0x11C0, 0x5606), + ): + super().__init__(rdesc, application, name, input_info) + self.buttons 
= (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12) + self.hat_switch = 240 # null value is 240 as max is 239 + + def event( + self, *, left=(None, None), right=(None, None), hat_switch=None, buttons=None + ): + if hat_switch is not None: + hat_switch *= 30 + + return super().event( + left=left, right=right, hat_switch=hat_switch, buttons=buttons + ) + + class TestSaitekGamepad(BaseTest.TestGamepad): def create_device(self): return SaitekGamepad() @@ -207,3 +651,14 @@ class TestSaitekGamepad(BaseTest.TestGamepad): class TestAsusGamepad(BaseTest.TestGamepad): def create_device(self): return AsusGamepad() + + +class TestRaptorMach2Joystick(BaseTest.TestGamepad): + hid_bpfs = [("FR-TEC__Raptor-Mach-2.bpf.o", True)] + + def create_device(self): + return RaptorMach2Joystick( + "uhid test Sanmos Group FR-TEC Raptor MACH 2", + rdesc="05 01 09 04 a1 01 05 01 85 01 05 01 09 30 75 10 95 01 15 00 26 ff 07 46 ff 07 81 02 05 01 09 31 75 10 95 01 15 00 26 ff 07 46 ff 07 81 02 05 01 09 33 75 10 95 01 15 00 26 ff 03 46 ff 03 81 02 05 00 09 00 75 10 95 01 15 00 26 ff 03 46 ff 03 81 02 05 01 09 32 75 10 95 01 15 00 26 ff 03 46 ff 03 81 02 05 01 09 35 75 10 95 01 15 00 26 ff 03 46 ff 03 81 02 05 01 09 34 75 10 95 01 15 00 26 ff 07 46 ff 07 81 02 05 01 09 36 75 10 95 01 15 00 26 ff 03 46 ff 03 81 02 05 09 19 01 2a 1d 00 15 00 25 01 75 01 96 80 00 81 02 05 01 09 39 26 ef 00 46 68 01 65 14 75 10 95 01 81 42 05 01 09 00 75 08 95 1d 81 01 15 00 26 ef 00 85 58 26 ff 00 46 ff 00 75 08 95 3f 09 00 91 02 85 59 75 08 95 80 09 00 b1 02 c0", + input_info=(BusType.USB, 0x11C0, 0x5606), + ) diff --git a/tools/testing/selftests/hid/tests/test_tablet.py b/tools/testing/selftests/hid/tests/test_tablet.py index 903f19f7cb..a9e2de1e88 100644 --- a/tools/testing/selftests/hid/tests/test_tablet.py +++ b/tools/testing/selftests/hid/tests/test_tablet.py @@ -35,6 +35,7 @@ class BtnPressed(Enum): PRIMARY_PRESSED = libevdev.EV_KEY.BTN_STYLUS SECONDARY_PRESSED = libevdev.EV_KEY.BTN_STYLUS2 + THIRD_PRESSED = 
libevdev.EV_KEY.BTN_STYLUS3 class PenState(Enum): @@ -44,58 +45,28 @@ class PenState(Enum): We extend it with the various buttons when we need to check them. """ - PEN_IS_OUT_OF_RANGE = BtnTouch.UP, None, None - PEN_IS_IN_RANGE = BtnTouch.UP, ToolType.PEN, None - PEN_IS_IN_RANGE_WITH_BUTTON = BtnTouch.UP, ToolType.PEN, BtnPressed.PRIMARY_PRESSED - PEN_IS_IN_RANGE_WITH_SECOND_BUTTON = ( - BtnTouch.UP, - ToolType.PEN, - BtnPressed.SECONDARY_PRESSED, - ) - PEN_IS_IN_CONTACT = BtnTouch.DOWN, ToolType.PEN, None - PEN_IS_IN_CONTACT_WITH_BUTTON = ( - BtnTouch.DOWN, - ToolType.PEN, - BtnPressed.PRIMARY_PRESSED, - ) - PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON = ( - BtnTouch.DOWN, - ToolType.PEN, - BtnPressed.SECONDARY_PRESSED, - ) - PEN_IS_IN_RANGE_WITH_ERASING_INTENT = BtnTouch.UP, ToolType.RUBBER, None - PEN_IS_IN_RANGE_WITH_ERASING_INTENT_WITH_BUTTON = ( - BtnTouch.UP, - ToolType.RUBBER, - BtnPressed.PRIMARY_PRESSED, - ) - PEN_IS_IN_RANGE_WITH_ERASING_INTENT_WITH_SECOND_BUTTON = ( - BtnTouch.UP, - ToolType.RUBBER, - BtnPressed.SECONDARY_PRESSED, - ) - PEN_IS_ERASING = BtnTouch.DOWN, ToolType.RUBBER, None - PEN_IS_ERASING_WITH_BUTTON = ( - BtnTouch.DOWN, - ToolType.RUBBER, - BtnPressed.PRIMARY_PRESSED, - ) - PEN_IS_ERASING_WITH_SECOND_BUTTON = ( - BtnTouch.DOWN, - ToolType.RUBBER, - BtnPressed.SECONDARY_PRESSED, - ) - - def __init__(self, touch: BtnTouch, tool: Optional[ToolType], button: Optional[BtnPressed]): + PEN_IS_OUT_OF_RANGE = BtnTouch.UP, None, False + PEN_IS_IN_RANGE = BtnTouch.UP, ToolType.PEN, False + PEN_IS_IN_RANGE_WITH_BUTTON = BtnTouch.UP, ToolType.PEN, True + PEN_IS_IN_CONTACT = BtnTouch.DOWN, ToolType.PEN, False + PEN_IS_IN_CONTACT_WITH_BUTTON = BtnTouch.DOWN, ToolType.PEN, True + PEN_IS_IN_RANGE_WITH_ERASING_INTENT = BtnTouch.UP, ToolType.RUBBER, False + PEN_IS_IN_RANGE_WITH_ERASING_INTENT_WITH_BUTTON = BtnTouch.UP, ToolType.RUBBER, True + PEN_IS_ERASING = BtnTouch.DOWN, ToolType.RUBBER, False + PEN_IS_ERASING_WITH_BUTTON = BtnTouch.DOWN, ToolType.RUBBER, 
True + + def __init__( + self, touch: BtnTouch, tool: Optional[ToolType], button: Optional[bool] + ): self.touch = touch # type: ignore self.tool = tool # type: ignore self.button = button # type: ignore @classmethod - def from_evdev(cls, evdev) -> "PenState": + def from_evdev(cls, evdev, test_button) -> "PenState": touch = BtnTouch(evdev.value[libevdev.EV_KEY.BTN_TOUCH]) tool = None - button = None + button = False if ( evdev.value[libevdev.EV_KEY.BTN_TOOL_RUBBER] and not evdev.value[libevdev.EV_KEY.BTN_TOOL_PEN] @@ -112,19 +83,20 @@ class PenState(Enum): ): raise ValueError("2 tools are not allowed") - # we take only the highest button in account - for b in [libevdev.EV_KEY.BTN_STYLUS, libevdev.EV_KEY.BTN_STYLUS2]: - if bool(evdev.value[b]): - button = BtnPressed(b) + # we take only the provided button into account + if test_button is not None: + button = bool(evdev.value[test_button.value]) # the kernel tends to insert an EV_SYN once removing the tool, so # the button will be released after if tool is None: - button = None + button = False return cls((touch, tool, button)) # type: ignore - def apply(self, events: List[libevdev.InputEvent], strict: bool) -> "PenState": + def apply( + self, events: List[libevdev.InputEvent], strict: bool, test_button: BtnPressed + ) -> "PenState": if libevdev.EV_SYN.SYN_REPORT in events: raise ValueError("EV_SYN is in the event sequence") touch = self.touch @@ -148,19 +120,16 @@ class PenState(Enum): raise ValueError(f"duplicated BTN_TOOL_* in {events}") tool_found = True tool = ToolType(ev.code) if ev.value else None - elif ev in ( - libevdev.InputEvent(libevdev.EV_KEY.BTN_STYLUS), - libevdev.InputEvent(libevdev.EV_KEY.BTN_STYLUS2), - ): + elif test_button is not None and ev in (test_button.value,): if button_found: raise ValueError(f"duplicated BTN_STYLUS* in {events}") button_found = True - button = BtnPressed(ev.code) if ev.value else None + button = bool(ev.value) # the kernel tends to insert an EV_SYN once removing the tool, 
so # the button will be released after if tool is None: - button = None + button = False new_state = PenState((touch, tool, button)) # type: ignore if strict: @@ -183,11 +152,9 @@ class PenState(Enum): PenState.PEN_IS_OUT_OF_RANGE, PenState.PEN_IS_IN_RANGE, PenState.PEN_IS_IN_RANGE_WITH_BUTTON, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT, PenState.PEN_IS_IN_CONTACT, PenState.PEN_IS_IN_CONTACT_WITH_BUTTON, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, PenState.PEN_IS_ERASING, ) @@ -195,7 +162,6 @@ class PenState(Enum): return ( PenState.PEN_IS_IN_RANGE, PenState.PEN_IS_IN_RANGE_WITH_BUTTON, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, PenState.PEN_IS_OUT_OF_RANGE, PenState.PEN_IS_IN_CONTACT, ) @@ -204,7 +170,6 @@ class PenState(Enum): return ( PenState.PEN_IS_IN_CONTACT, PenState.PEN_IS_IN_CONTACT_WITH_BUTTON, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, PenState.PEN_IS_IN_RANGE, ) @@ -236,21 +201,6 @@ class PenState(Enum): PenState.PEN_IS_IN_RANGE_WITH_BUTTON, ) - if self == PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON: - return ( - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_RANGE, - PenState.PEN_IS_OUT_OF_RANGE, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, - ) - - if self == PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON: - return ( - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_CONTACT, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - ) - return tuple() def historically_tolerated_transitions(self) -> Tuple["PenState", ...]: @@ -263,11 +213,9 @@ class PenState(Enum): PenState.PEN_IS_OUT_OF_RANGE, PenState.PEN_IS_IN_RANGE, PenState.PEN_IS_IN_RANGE_WITH_BUTTON, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT, PenState.PEN_IS_IN_CONTACT, PenState.PEN_IS_IN_CONTACT_WITH_BUTTON, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, PenState.PEN_IS_ERASING, ) @@ -275,7 +223,6 @@ class PenState(Enum): return ( 
PenState.PEN_IS_IN_RANGE, PenState.PEN_IS_IN_RANGE_WITH_BUTTON, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, PenState.PEN_IS_OUT_OF_RANGE, PenState.PEN_IS_IN_CONTACT, ) @@ -284,7 +231,6 @@ class PenState(Enum): return ( PenState.PEN_IS_IN_CONTACT, PenState.PEN_IS_IN_CONTACT_WITH_BUTTON, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, PenState.PEN_IS_IN_RANGE, PenState.PEN_IS_OUT_OF_RANGE, ) @@ -319,22 +265,6 @@ class PenState(Enum): PenState.PEN_IS_OUT_OF_RANGE, ) - if self == PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON: - return ( - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_RANGE, - PenState.PEN_IS_OUT_OF_RANGE, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, - ) - - if self == PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON: - return ( - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_CONTACT, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - PenState.PEN_IS_OUT_OF_RANGE, - ) - return tuple() @staticmethod @@ -402,9 +332,9 @@ class PenState(Enum): } @staticmethod - def legal_transitions_with_primary_button() -> Dict[str, Tuple["PenState", ...]]: + def legal_transitions_with_button() -> Dict[str, Tuple["PenState", ...]]: """We revisit the Windows Pen Implementation state machine: - we now have a primary button. + we now have a button. """ return { "hover-button": (PenState.PEN_IS_IN_RANGE_WITH_BUTTON,), @@ -450,56 +380,6 @@ class PenState(Enum): ), } - @staticmethod - def legal_transitions_with_secondary_button() -> Dict[str, Tuple["PenState", ...]]: - """We revisit the Windows Pen Implementation state machine: - we now have a secondary button. - Note: we don't looks for 2 buttons interactions. 
- """ - return { - "hover-button": (PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON,), - "hover-button -> out-of-range": ( - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - PenState.PEN_IS_OUT_OF_RANGE, - ), - "in-range -> button-press": ( - PenState.PEN_IS_IN_RANGE, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - ), - "in-range -> button-press -> button-release": ( - PenState.PEN_IS_IN_RANGE, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_RANGE, - ), - "in-range -> touch -> button-press -> button-release": ( - PenState.PEN_IS_IN_RANGE, - PenState.PEN_IS_IN_CONTACT, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_CONTACT, - ), - "in-range -> touch -> button-press -> release -> button-release": ( - PenState.PEN_IS_IN_RANGE, - PenState.PEN_IS_IN_CONTACT, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_RANGE, - ), - "in-range -> button-press -> touch -> release -> button-release": ( - PenState.PEN_IS_IN_RANGE, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_RANGE, - ), - "in-range -> button-press -> touch -> button-release -> release": ( - PenState.PEN_IS_IN_RANGE, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_CONTACT, - PenState.PEN_IS_IN_RANGE, - ), - } - @staticmethod def tolerated_transitions() -> Dict[str, Tuple["PenState", ...]]: """This is not adhering to the Windows Pen Implementation state machine @@ -616,10 +496,22 @@ class Pen(object): evdev.value[axis] == value ), f"assert evdev.value[{axis}] ({evdev.value[axis]}) != {value}" - def assert_expected_input_events(self, evdev): + def assert_expected_input_events(self, evdev, button): assert evdev.value[libevdev.EV_ABS.ABS_X] == self.x assert evdev.value[libevdev.EV_ABS.ABS_Y] == self.y - assert self.current_state == 
PenState.from_evdev(evdev) + + # assert no other buttons than the tested ones are set + buttons = [ + BtnPressed.PRIMARY_PRESSED, + BtnPressed.SECONDARY_PRESSED, + BtnPressed.THIRD_PRESSED, + ] + if button is not None: + buttons.remove(button) + for b in buttons: + assert evdev.value[b.value] is None or evdev.value[b.value] == False + + assert self.current_state == PenState.from_evdev(evdev, button) class PenDigitizer(base.UHIDTestDevice): @@ -647,7 +539,7 @@ class PenDigitizer(base.UHIDTestDevice): continue self.fields = [f.usage_name for f in r] - def move_to(self, pen, state): + def move_to(self, pen, state, button): # fill in the previous values if pen.current_state == PenState.PEN_IS_OUT_OF_RANGE: pen.restore() @@ -690,29 +582,17 @@ class PenDigitizer(base.UHIDTestDevice): pen.inrange = True pen.invert = False pen.eraser = False - pen.barrelswitch = True - pen.secondarybarrelswitch = False + assert button is not None + pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED + pen.secondarybarrelswitch = button == BtnPressed.SECONDARY_PRESSED elif state == PenState.PEN_IS_IN_CONTACT_WITH_BUTTON: pen.tipswitch = True pen.inrange = True pen.invert = False pen.eraser = False - pen.barrelswitch = True - pen.secondarybarrelswitch = False - elif state == PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON: - pen.tipswitch = False - pen.inrange = True - pen.invert = False - pen.eraser = False - pen.barrelswitch = False - pen.secondarybarrelswitch = True - elif state == PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON: - pen.tipswitch = True - pen.inrange = True - pen.invert = False - pen.eraser = False - pen.barrelswitch = False - pen.secondarybarrelswitch = True + assert button is not None + pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED + pen.secondarybarrelswitch = button == BtnPressed.SECONDARY_PRESSED elif state == PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT: pen.tipswitch = False pen.inrange = True @@ -730,7 +610,7 @@ class PenDigitizer(base.UHIDTestDevice): 
pen.current_state = state - def event(self, pen): + def event(self, pen, button): rs = [] r = self.create_report(application=self.cur_application, data=pen) self.call_input_event(r) @@ -771,17 +651,17 @@ class BaseTest: def create_device(self): raise Exception("please reimplement me in subclasses") - def post(self, uhdev, pen): - r = uhdev.event(pen) + def post(self, uhdev, pen, test_button): + r = uhdev.event(pen, test_button) events = uhdev.next_sync_events() self.debug_reports(r, uhdev, events) return events def validate_transitions( - self, from_state, pen, evdev, events, allow_intermediate_states + self, from_state, pen, evdev, events, allow_intermediate_states, button ): # check that the final state is correct - pen.assert_expected_input_events(evdev) + pen.assert_expected_input_events(evdev, button) state = from_state @@ -794,12 +674,14 @@ class BaseTest: events = events[idx + 1 :] # now check for a valid transition - state = state.apply(sync_events, not allow_intermediate_states) + state = state.apply(sync_events, not allow_intermediate_states, button) if events: - state = state.apply(sync_events, not allow_intermediate_states) + state = state.apply(sync_events, not allow_intermediate_states, button) - def _test_states(self, state_list, scribble, allow_intermediate_states): + def _test_states( + self, state_list, scribble, allow_intermediate_states, button=None + ): """Internal method to test against a list of transition between states. 
state_list is a list of PenState objects @@ -812,10 +694,10 @@ class BaseTest: cur_state = PenState.PEN_IS_OUT_OF_RANGE p = Pen(50, 60) - uhdev.move_to(p, PenState.PEN_IS_OUT_OF_RANGE) - events = self.post(uhdev, p) + uhdev.move_to(p, PenState.PEN_IS_OUT_OF_RANGE, button) + events = self.post(uhdev, p, button) self.validate_transitions( - cur_state, p, evdev, events, allow_intermediate_states + cur_state, p, evdev, events, allow_intermediate_states, button ) cur_state = p.current_state @@ -824,18 +706,18 @@ class BaseTest: if scribble and cur_state != PenState.PEN_IS_OUT_OF_RANGE: p.x += 1 p.y -= 1 - events = self.post(uhdev, p) + events = self.post(uhdev, p, button) self.validate_transitions( - cur_state, p, evdev, events, allow_intermediate_states + cur_state, p, evdev, events, allow_intermediate_states, button ) assert len(events) >= 3 # X, Y, SYN - uhdev.move_to(p, state) + uhdev.move_to(p, state, button) if scribble and state != PenState.PEN_IS_OUT_OF_RANGE: p.x += 1 p.y -= 1 - events = self.post(uhdev, p) + events = self.post(uhdev, p, button) self.validate_transitions( - cur_state, p, evdev, events, allow_intermediate_states + cur_state, p, evdev, events, allow_intermediate_states, button ) cur_state = p.current_state @@ -874,12 +756,17 @@ class BaseTest: "state_list", [ pytest.param(v, id=k) - for k, v in PenState.legal_transitions_with_primary_button().items() + for k, v in PenState.legal_transitions_with_button().items() ], ) def test_valid_primary_button_pen_states(self, state_list, scribble): """Rework the transition state machine by adding the primary button.""" - self._test_states(state_list, scribble, allow_intermediate_states=False) + self._test_states( + state_list, + scribble, + allow_intermediate_states=False, + button=BtnPressed.PRIMARY_PRESSED, + ) @pytest.mark.skip_if_uhdev( lambda uhdev: "Secondary Barrel Switch" not in uhdev.fields, @@ -890,12 +777,38 @@ class BaseTest: "state_list", [ pytest.param(v, id=k) - for k, v in 
PenState.legal_transitions_with_secondary_button().items() + for k, v in PenState.legal_transitions_with_button().items() ], ) def test_valid_secondary_button_pen_states(self, state_list, scribble): """Rework the transition state machine by adding the secondary button.""" - self._test_states(state_list, scribble, allow_intermediate_states=False) + self._test_states( + state_list, + scribble, + allow_intermediate_states=False, + button=BtnPressed.SECONDARY_PRESSED, + ) + + @pytest.mark.skip_if_uhdev( + lambda uhdev: "Third Barrel Switch" not in uhdev.fields, + "Device not compatible, missing Third Barrel Switch usage", + ) + @pytest.mark.parametrize("scribble", [True, False], ids=["scribble", "static"]) + @pytest.mark.parametrize( + "state_list", + [ + pytest.param(v, id=k) + for k, v in PenState.legal_transitions_with_button().items() + ], + ) + def test_valid_third_button_pen_states(self, state_list, scribble): + """Rework the transition state machine by adding the secondary button.""" + self._test_states( + state_list, + scribble, + allow_intermediate_states=False, + button=BtnPressed.THIRD_PRESSED, + ) @pytest.mark.skip_if_uhdev( lambda uhdev: "Invert" not in uhdev.fields, @@ -956,7 +869,7 @@ class BaseTest: class GXTP_pen(PenDigitizer): - def event(self, pen): + def event(self, pen, test_button): if not hasattr(self, "prev_tip_state"): self.prev_tip_state = False @@ -977,13 +890,407 @@ class GXTP_pen(PenDigitizer): if pen.eraser: internal_pen.invert = False - return super().event(internal_pen) + return super().event(internal_pen, test_button) class USIPen(PenDigitizer): pass +class XPPen_ArtistPro16Gen2_28bd_095b(PenDigitizer): + """ + Pen with two buttons and a rubber end, but which reports + the second button as an eraser + """ + + def __init__( + self, + name, + rdesc_str=None, + rdesc=None, + application="Pen", + physical="Stylus", + input_info=(BusType.USB, 0x28BD, 0x095B), + evdev_name_suffix=None, + ): + super().__init__( + name, rdesc_str, rdesc, 
application, physical, input_info, evdev_name_suffix + ) + self.fields.append("Secondary Barrel Switch") + + def move_to(self, pen, state, button): + # fill in the previous values + if pen.current_state == PenState.PEN_IS_OUT_OF_RANGE: + pen.restore() + + print(f"\n *** pen is moving to {state} ***") + + if state == PenState.PEN_IS_OUT_OF_RANGE: + pen.backup() + pen.x = 0 + pen.y = 0 + pen.tipswitch = False + pen.tippressure = 0 + pen.azimuth = 0 + pen.inrange = False + pen.width = 0 + pen.height = 0 + pen.invert = False + pen.eraser = False + pen.xtilt = 0 + pen.ytilt = 0 + pen.twist = 0 + pen.barrelswitch = False + elif state == PenState.PEN_IS_IN_RANGE: + pen.tipswitch = False + pen.inrange = True + pen.invert = False + pen.eraser = False + pen.barrelswitch = False + elif state == PenState.PEN_IS_IN_CONTACT: + pen.tipswitch = True + pen.inrange = True + pen.invert = False + pen.eraser = False + pen.barrelswitch = False + elif state == PenState.PEN_IS_IN_RANGE_WITH_BUTTON: + pen.tipswitch = False + pen.inrange = True + pen.invert = False + assert button is not None + pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED + pen.eraser = button == BtnPressed.SECONDARY_PRESSED + elif state == PenState.PEN_IS_IN_CONTACT_WITH_BUTTON: + pen.tipswitch = True + pen.inrange = True + pen.invert = False + assert button is not None + pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED + pen.eraser = button == BtnPressed.SECONDARY_PRESSED + elif state == PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT: + pen.tipswitch = False + pen.inrange = True + pen.invert = True + pen.eraser = False + pen.barrelswitch = False + elif state == PenState.PEN_IS_ERASING: + pen.tipswitch = True + pen.inrange = True + pen.invert = True + pen.eraser = False + pen.barrelswitch = False + + pen.current_state = state + + def event(self, pen, test_button): + import math + + pen_copy = copy.copy(pen) + width = 13.567 + height = 8.480 + tip_height = 0.055677699 + hx = tip_height * (32767 / width) + hy 
= tip_height * (32767 / height) + if pen_copy.xtilt != 0: + pen_copy.x += round(hx * math.sin(math.radians(pen_copy.xtilt))) + if pen_copy.ytilt != 0: + pen_copy.y += round(hy * math.sin(math.radians(pen_copy.ytilt))) + + return super().event(pen_copy, test_button) + + +class XPPen_Artist24_28bd_093a(PenDigitizer): + """ + Pen that reports secondary barrel switch through eraser + """ + + def __init__( + self, + name, + rdesc_str=None, + rdesc=None, + application="Pen", + physical="Stylus", + input_info=(BusType.USB, 0x28BD, 0x093A), + evdev_name_suffix=None, + ): + super().__init__( + name, rdesc_str, rdesc, application, physical, input_info, evdev_name_suffix + ) + self.fields.append("Secondary Barrel Switch") + self.previous_state = PenState.PEN_IS_OUT_OF_RANGE + + def move_to(self, pen, state, button, debug=True): + # fill in the previous values + if pen.current_state == PenState.PEN_IS_OUT_OF_RANGE: + pen.restore() + + if debug: + print(f"\n *** pen is moving to {state} ***") + + if state == PenState.PEN_IS_OUT_OF_RANGE: + pen.backup() + pen.tipswitch = False + pen.tippressure = 0 + pen.azimuth = 0 + pen.inrange = False + pen.width = 0 + pen.height = 0 + pen.invert = False + pen.eraser = False + pen.xtilt = 0 + pen.ytilt = 0 + pen.twist = 0 + pen.barrelswitch = False + elif state == PenState.PEN_IS_IN_RANGE: + pen.tipswitch = False + pen.inrange = True + pen.invert = False + pen.eraser = False + pen.barrelswitch = False + elif state == PenState.PEN_IS_IN_CONTACT: + pen.tipswitch = True + pen.inrange = True + pen.invert = False + pen.eraser = False + pen.barrelswitch = False + elif state == PenState.PEN_IS_IN_RANGE_WITH_BUTTON: + pen.tipswitch = False + pen.inrange = True + pen.invert = False + assert button is not None + pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED + pen.eraser = button == BtnPressed.SECONDARY_PRESSED + elif state == PenState.PEN_IS_IN_CONTACT_WITH_BUTTON: + pen.tipswitch = True + pen.inrange = True + pen.invert = False + assert 
button is not None + pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED + pen.eraser = button == BtnPressed.SECONDARY_PRESSED + + pen.current_state = state + + def send_intermediate_state(self, pen, state, button): + intermediate_pen = copy.copy(pen) + self.move_to(intermediate_pen, state, button, debug=False) + return super().event(intermediate_pen, button) + + def event(self, pen, button): + rs = [] + + # the pen reliably sends in-range events in a normal case (non emulation of eraser mode) + if self.previous_state == PenState.PEN_IS_IN_CONTACT: + if pen.current_state == PenState.PEN_IS_OUT_OF_RANGE: + rs.extend( + self.send_intermediate_state(pen, PenState.PEN_IS_IN_RANGE, button) + ) + + if button == BtnPressed.SECONDARY_PRESSED: + if self.previous_state == PenState.PEN_IS_IN_RANGE: + if pen.current_state == PenState.PEN_IS_IN_RANGE_WITH_BUTTON: + rs.extend( + self.send_intermediate_state( + pen, PenState.PEN_IS_OUT_OF_RANGE, button + ) + ) + + if self.previous_state == PenState.PEN_IS_IN_RANGE_WITH_BUTTON: + if pen.current_state == PenState.PEN_IS_IN_RANGE: + rs.extend( + self.send_intermediate_state( + pen, PenState.PEN_IS_OUT_OF_RANGE, button + ) + ) + + if self.previous_state == PenState.PEN_IS_IN_CONTACT: + if pen.current_state == PenState.PEN_IS_IN_CONTACT_WITH_BUTTON: + rs.extend( + self.send_intermediate_state( + pen, PenState.PEN_IS_OUT_OF_RANGE, button + ) + ) + rs.extend( + self.send_intermediate_state( + pen, PenState.PEN_IS_IN_RANGE_WITH_BUTTON, button + ) + ) + + if self.previous_state == PenState.PEN_IS_IN_CONTACT_WITH_BUTTON: + if pen.current_state == PenState.PEN_IS_IN_CONTACT: + rs.extend( + self.send_intermediate_state( + pen, PenState.PEN_IS_OUT_OF_RANGE, button + ) + ) + rs.extend( + self.send_intermediate_state( + pen, PenState.PEN_IS_IN_RANGE, button + ) + ) + + rs.extend(super().event(pen, button)) + self.previous_state = pen.current_state + return rs + + +class Huion_Kamvas_Pro_19_256c_006b(PenDigitizer): + """ + Pen that reports 
secondary barrel switch through secondary TipSwtich + and 3rd button through Invert + """ + + def __init__( + self, + name, + rdesc_str=None, + rdesc=None, + application="Stylus", + physical=None, + input_info=(BusType.USB, 0x256C, 0x006B), + evdev_name_suffix=None, + ): + super().__init__( + name, rdesc_str, rdesc, application, physical, input_info, evdev_name_suffix + ) + self.fields.append("Secondary Barrel Switch") + self.fields.append("Third Barrel Switch") + self.previous_state = PenState.PEN_IS_OUT_OF_RANGE + + def move_to(self, pen, state, button, debug=True): + # fill in the previous values + if pen.current_state == PenState.PEN_IS_OUT_OF_RANGE: + pen.restore() + + if debug: + print(f"\n *** pen is moving to {state} ***") + + if state == PenState.PEN_IS_OUT_OF_RANGE: + pen.backup() + pen.tipswitch = False + pen.tippressure = 0 + pen.azimuth = 0 + pen.inrange = False + pen.width = 0 + pen.height = 0 + pen.invert = False + pen.eraser = False + pen.xtilt = 0 + pen.ytilt = 0 + pen.twist = 0 + pen.barrelswitch = False + pen.secondarytipswitch = False + elif state == PenState.PEN_IS_IN_RANGE: + pen.tipswitch = False + pen.inrange = True + pen.invert = False + pen.eraser = False + pen.barrelswitch = False + pen.secondarytipswitch = False + elif state == PenState.PEN_IS_IN_CONTACT: + pen.tipswitch = True + pen.inrange = True + pen.invert = False + pen.eraser = False + pen.barrelswitch = False + pen.secondarytipswitch = False + elif state == PenState.PEN_IS_IN_RANGE_WITH_BUTTON: + pen.tipswitch = False + pen.inrange = True + pen.eraser = False + assert button is not None + pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED + pen.secondarytipswitch = button == BtnPressed.SECONDARY_PRESSED + pen.invert = button == BtnPressed.THIRD_PRESSED + elif state == PenState.PEN_IS_IN_CONTACT_WITH_BUTTON: + pen.tipswitch = True + pen.inrange = True + pen.eraser = False + assert button is not None + pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED + 
pen.secondarytipswitch = button == BtnPressed.SECONDARY_PRESSED + pen.invert = button == BtnPressed.THIRD_PRESSED + elif state == PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT: + pen.tipswitch = False + pen.inrange = True + pen.invert = True + pen.eraser = False + pen.barrelswitch = False + pen.secondarytipswitch = False + elif state == PenState.PEN_IS_ERASING: + pen.tipswitch = False + pen.inrange = True + pen.invert = False + pen.eraser = True + pen.barrelswitch = False + pen.secondarytipswitch = False + + pen.current_state = state + + def call_input_event(self, report): + if report[0] == 0x0a: + # ensures the original second Eraser usage is null + report[1] &= 0xdf + + # ensures the original last bit is equal to bit 6 (In Range) + if report[1] & 0x40: + report[1] |= 0x80 + + super().call_input_event(report) + + def send_intermediate_state(self, pen, state, test_button): + intermediate_pen = copy.copy(pen) + self.move_to(intermediate_pen, state, test_button, debug=False) + return super().event(intermediate_pen, test_button) + + def event(self, pen, button): + rs = [] + + # it's not possible to go between eraser mode or not without + # going out-of-prox: the eraser mode is activated by presenting + # the tail of the pen + if self.previous_state in ( + PenState.PEN_IS_IN_RANGE, + PenState.PEN_IS_IN_RANGE_WITH_BUTTON, + PenState.PEN_IS_IN_CONTACT, + PenState.PEN_IS_IN_CONTACT_WITH_BUTTON, + ) and pen.current_state in ( + PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT, + PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT_WITH_BUTTON, + PenState.PEN_IS_ERASING, + PenState.PEN_IS_ERASING_WITH_BUTTON, + ): + rs.extend( + self.send_intermediate_state(pen, PenState.PEN_IS_OUT_OF_RANGE, button) + ) + + # same than above except from eraser to normal + if self.previous_state in ( + PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT, + PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT_WITH_BUTTON, + PenState.PEN_IS_ERASING, + PenState.PEN_IS_ERASING_WITH_BUTTON, + ) and pen.current_state in ( + 
PenState.PEN_IS_IN_RANGE, + PenState.PEN_IS_IN_RANGE_WITH_BUTTON, + PenState.PEN_IS_IN_CONTACT, + PenState.PEN_IS_IN_CONTACT_WITH_BUTTON, + ): + rs.extend( + self.send_intermediate_state(pen, PenState.PEN_IS_OUT_OF_RANGE, button) + ) + + if self.previous_state == PenState.PEN_IS_OUT_OF_RANGE: + if pen.current_state == PenState.PEN_IS_IN_RANGE_WITH_BUTTON: + rs.extend( + self.send_intermediate_state(pen, PenState.PEN_IS_IN_RANGE, button) + ) + + rs.extend(super().event(pen, button)) + self.previous_state = pen.current_state + return rs + + ################################################################################ # # Windows 7 compatible devices @@ -1162,3 +1469,37 @@ class TestGoodix_27c6_0e00(BaseTest.TestTablet): rdesc="05 0d 09 04 a1 01 85 01 09 22 a1 02 55 0e 65 11 35 00 15 00 09 42 25 01 75 01 95 01 81 02 25 7f 09 30 75 07 81 42 95 01 75 08 09 51 81 02 75 10 05 01 26 04 20 46 e6 09 09 30 81 02 26 60 15 46 9a 06 09 31 81 02 05 0d 55 0f 75 08 25 ff 45 ff 09 48 81 42 09 49 81 42 55 0e c0 09 22 a1 02 09 42 25 01 75 01 95 01 81 02 25 7f 09 30 75 07 81 42 95 01 75 08 09 51 81 02 75 10 05 01 26 04 20 46 e6 09 09 30 81 02 26 60 15 46 9a 06 09 31 81 02 05 0d 55 0f 75 08 25 ff 45 ff 09 48 81 42 09 49 81 42 55 0e c0 09 22 a1 02 09 42 25 01 75 01 95 01 81 02 25 7f 09 30 75 07 81 42 95 01 75 08 09 51 81 02 75 10 05 01 26 04 20 46 e6 09 09 30 81 02 26 60 15 46 9a 06 09 31 81 02 05 0d 55 0f 75 08 25 ff 45 ff 09 48 81 42 09 49 81 42 55 0e c0 09 22 a1 02 09 42 15 00 25 01 75 01 95 01 81 02 25 7f 09 30 75 07 81 42 75 08 09 51 95 01 81 02 05 01 26 04 20 75 10 55 0e 65 11 09 30 35 00 46 e6 09 81 02 26 60 15 46 9a 06 09 31 81 02 05 0d 55 0f 75 08 25 ff 45 ff 09 48 81 42 09 49 81 42 55 0e c0 09 22 a1 02 09 42 15 00 25 01 75 01 95 01 81 02 25 7f 09 30 75 07 81 42 75 08 09 51 95 01 81 02 05 01 26 04 20 75 10 55 0e 65 11 09 30 35 00 46 e6 09 81 02 26 60 15 46 9a 06 09 31 81 02 05 0d 55 0f 75 08 25 ff 45 ff 09 48 81 42 09 49 81 42 55 0e c0 09 54 15 00 25 7f 75 08 95 01 81 02 85 
02 09 55 95 01 25 0a b1 02 85 03 06 00 ff 09 c5 15 00 26 ff 00 75 08 96 00 01 b1 02 c0 05 0d 09 02 a1 01 09 20 a1 00 85 08 05 01 a4 09 30 35 00 46 e6 09 15 00 26 04 20 55 0d 65 13 75 10 95 01 81 02 09 31 46 9a 06 26 60 15 81 02 b4 05 0d 09 38 95 01 75 08 15 00 25 01 81 02 09 30 75 10 26 ff 0f 81 02 09 31 81 02 09 42 09 44 09 5a 09 3c 09 45 09 32 75 01 95 06 25 01 81 02 95 02 81 03 09 3d 55 0e 65 14 36 d8 dc 46 28 23 16 d8 dc 26 28 23 95 01 75 10 81 02 09 3e 81 02 09 41 15 00 27 a0 8c 00 00 35 00 47 a0 8c 00 00 81 02 05 20 0a 53 04 65 00 16 01 f8 26 ff 07 75 10 95 01 81 02 0a 54 04 81 02 0a 55 04 81 02 0a 57 04 81 02 0a 58 04 81 02 0a 59 04 81 02 0a 72 04 81 02 0a 73 04 81 02 0a 74 04 81 02 05 0d 09 3b 15 00 25 64 75 08 81 02 09 5b 25 ff 75 40 81 02 06 00 ff 09 5b 75 20 81 02 05 0d 09 5c 26 ff 00 75 08 81 02 09 5e 81 02 09 70 a1 02 15 01 25 06 09 72 09 73 09 74 09 75 09 76 09 77 81 20 c0 06 00 ff 09 01 15 00 27 ff ff 00 00 75 10 95 01 81 02 85 09 09 81 a1 02 09 81 15 01 25 04 09 82 09 83 09 84 09 85 81 20 c0 85 10 09 5c a1 02 15 00 25 01 75 08 95 01 09 38 b1 02 09 5c 26 ff 00 b1 02 09 5d 75 01 95 01 25 01 b1 02 95 07 b1 03 c0 85 11 09 5e a1 02 09 38 15 00 25 01 75 08 95 01 b1 02 09 5e 26 ff 00 b1 02 09 5f 75 01 25 01 b1 02 75 07 b1 03 c0 85 12 09 70 a1 02 75 08 95 01 15 00 25 01 09 38 b1 02 09 70 a1 02 25 06 09 72 09 73 09 74 09 75 09 76 09 77 b1 20 c0 09 71 75 01 25 01 b1 02 75 07 b1 03 c0 85 13 09 80 15 00 25 ff 75 40 95 01 b1 02 85 14 09 44 a1 02 09 38 75 08 95 01 25 01 b1 02 15 01 25 03 09 44 a1 02 09 a4 09 44 09 5a 09 45 09 a3 b1 20 c0 09 5a a1 02 09 a4 09 44 09 5a 09 45 09 a3 b1 20 c0 09 45 a1 02 09 a4 09 44 09 5a 09 45 09 a3 b1 20 c0 c0 85 15 75 08 95 01 05 0d 09 90 a1 02 09 38 25 01 b1 02 09 91 75 10 26 ff 0f b1 02 09 92 75 40 25 ff b1 02 05 06 09 2a 75 08 26 ff 00 a1 02 09 2d b1 02 09 2e b1 02 c0 c0 85 16 05 06 09 2b a1 02 05 0d 25 01 09 38 b1 02 05 06 09 2b a1 02 09 2d 26 ff 00 b1 02 09 2e b1 02 c0 c0 85 17 06 00 ff 09 01 a1 02 05 0d 09 38 75 08 95 01 25 
01 b1 02 06 00 ff 09 01 75 10 27 ff ff 00 00 b1 02 c0 85 18 05 0d 09 38 75 08 95 01 15 00 25 01 b1 02 c0 c0 06 f0 ff 09 01 a1 01 85 0e 09 01 15 00 25 ff 75 08 95 40 91 02 09 01 15 00 25 ff 75 08 95 40 81 02 c0", input_info=(BusType.I2C, 0x27C6, 0x0E00), ) + + +class TestXPPen_ArtistPro16Gen2_28bd_095b(BaseTest.TestTablet): + hid_bpfs = [("XPPen__ArtistPro16Gen2.bpf.o", True)] + + def create_device(self): + dev = XPPen_ArtistPro16Gen2_28bd_095b( + "uhid test XPPen Artist Pro 16 Gen2 28bd 095b", + rdesc="05 0d 09 02 a1 01 85 07 09 20 a1 00 09 42 09 44 09 45 09 3c 15 00 25 01 75 01 95 04 81 02 95 01 81 03 09 32 15 00 25 01 95 01 81 02 95 02 81 03 75 10 95 01 35 00 a4 05 01 09 30 65 13 55 0d 46 ff 34 26 ff 7f 81 02 09 31 46 20 21 26 ff 7f 81 02 b4 09 30 45 00 26 ff 3f 81 42 09 3d 15 81 25 7f 75 08 95 01 81 02 09 3e 15 81 25 7f 81 02 c0 c0", + input_info=(BusType.USB, 0x28BD, 0x095B), + ) + return dev + + +class TestXPPen_Artist24_28bd_093a(BaseTest.TestTablet): + hid_bpfs = [("XPPen__Artist24.bpf.o", True)] + + def create_device(self): + return XPPen_Artist24_28bd_093a( + "uhid test XPPen Artist 24 28bd 093a", + rdesc="05 0d 09 02 a1 01 85 07 09 20 a1 00 09 42 09 44 09 45 15 00 25 01 75 01 95 03 81 02 95 02 81 03 09 32 95 01 81 02 95 02 81 03 75 10 95 01 35 00 a4 05 01 09 30 65 13 55 0d 46 f0 50 26 ff 7f 81 02 09 31 46 91 2d 26 ff 7f 81 02 b4 09 30 45 00 26 ff 1f 81 42 09 3d 15 81 25 7f 75 08 95 01 81 02 09 3e 15 81 25 7f 81 02 c0 c0", + input_info=(BusType.USB, 0x28BD, 0x093A), + ) + + +class TestHuion_Kamvas_Pro_19_256c_006b(BaseTest.TestTablet): + hid_bpfs = [("Huion__Kamvas-Pro-19.bpf.o", True)] + + def create_device(self): + return Huion_Kamvas_Pro_19_256c_006b( + "uhid test HUION Huion Tablet_GT1902", + rdesc="05 0d 09 02 a1 01 85 0a 09 20 a1 01 09 42 09 44 09 43 09 3c 09 45 15 00 25 01 75 01 95 06 81 02 09 32 75 01 95 01 81 02 81 03 05 01 09 30 09 31 55 0d 65 33 26 ff 7f 35 00 46 00 08 75 10 95 02 81 02 05 0d 09 30 26 ff 3f 75 10 95 01 81 02 09 3d 09 3e 15 a6 25 
5a 75 08 95 02 81 02 c0 c0 05 0d 09 04 a1 01 85 04 09 22 a1 02 05 0d 95 01 75 06 09 51 15 00 25 3f 81 02 09 42 25 01 75 01 95 01 81 02 75 01 95 01 81 03 05 01 75 10 55 0e 65 11 09 30 26 ff 7f 35 00 46 15 0c 81 42 09 31 26 ff 7f 46 cb 06 81 42 05 0d 09 30 26 ff 1f 75 10 95 01 81 02 c0 05 0d 09 22 a1 02 05 0d 95 01 75 06 09 51 15 00 25 3f 81 02 09 42 25 01 75 01 95 01 81 02 75 01 95 01 81 03 05 01 75 10 55 0e 65 11 09 30 26 ff 7f 35 00 46 15 0c 81 42 09 31 26 ff 7f 46 cb 06 81 42 05 0d 09 30 26 ff 1f 75 10 95 01 81 02 c0 05 0d 09 56 55 00 65 00 27 ff ff ff 7f 95 01 75 20 81 02 09 54 25 7f 95 01 75 08 81 02 75 08 95 08 81 03 85 05 09 55 25 0a 75 08 95 01 b1 02 06 00 ff 09 c5 85 06 15 00 26 ff 00 75 08 96 00 01 b1 02 c0", + input_info=(BusType.USB, 0x256C, 0x006B), + ) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index edf1c99c99..5f7d5a5ba8 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -1722,10 +1722,17 @@ FIXTURE_VARIANT(iommufd_dirty_tracking) FIXTURE_SETUP(iommufd_dirty_tracking) { + unsigned long size; int mmap_flags; void *vrc; int rc; + if (variant->buffer_size < MOCK_PAGE_SIZE) { + SKIP(return, + "Skipping buffer_size=%lu, less than MOCK_PAGE_SIZE=%lu", + variant->buffer_size, MOCK_PAGE_SIZE); + } + self->fd = open("/dev/iommu", O_RDWR); ASSERT_NE(-1, self->fd); @@ -1749,12 +1756,11 @@ FIXTURE_SETUP(iommufd_dirty_tracking) assert(vrc == self->buffer); self->page_size = MOCK_PAGE_SIZE; - self->bitmap_size = - variant->buffer_size / self->page_size / BITS_PER_BYTE; + self->bitmap_size = variant->buffer_size / self->page_size; /* Provision with an extra (PAGE_SIZE) for the unaligned case */ - rc = posix_memalign(&self->bitmap, PAGE_SIZE, - self->bitmap_size + PAGE_SIZE); + size = DIV_ROUND_UP(self->bitmap_size, BITS_PER_BYTE); + rc = posix_memalign(&self->bitmap, PAGE_SIZE, size + PAGE_SIZE); assert(!rc); assert(self->bitmap); assert((uintptr_t)self->bitmap % 
PAGE_SIZE == 0); @@ -1775,51 +1781,63 @@ FIXTURE_SETUP(iommufd_dirty_tracking) FIXTURE_TEARDOWN(iommufd_dirty_tracking) { munmap(self->buffer, variant->buffer_size); - munmap(self->bitmap, self->bitmap_size); + munmap(self->bitmap, DIV_ROUND_UP(self->bitmap_size, BITS_PER_BYTE)); teardown_iommufd(self->fd, _metadata); } -FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty128k) +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty8k) +{ + /* half of an u8 index bitmap */ + .buffer_size = 8UL * 1024UL, +}; + +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty16k) +{ + /* one u8 index bitmap */ + .buffer_size = 16UL * 1024UL, +}; + +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty64k) { /* one u32 index bitmap */ - .buffer_size = 128UL * 1024UL, + .buffer_size = 64UL * 1024UL, }; -FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty256k) +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty128k) { /* one u64 index bitmap */ - .buffer_size = 256UL * 1024UL, + .buffer_size = 128UL * 1024UL, }; -FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty640k) +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty320k) { /* two u64 index and trailing end bitmap */ - .buffer_size = 640UL * 1024UL, + .buffer_size = 320UL * 1024UL, }; -FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty128M) +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty64M) { - /* 4K bitmap (128M IOVA range) */ - .buffer_size = 128UL * 1024UL * 1024UL, + /* 4K bitmap (64M IOVA range) */ + .buffer_size = 64UL * 1024UL * 1024UL, }; -FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty128M_huge) +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty64M_huge) { - /* 4K bitmap (128M IOVA range) */ - .buffer_size = 128UL * 1024UL * 1024UL, + /* 4K bitmap (64M IOVA range) */ + .buffer_size = 64UL * 1024UL * 1024UL, .hugepages = true, }; -FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty256M) +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty128M) 
{ - /* 8K bitmap (256M IOVA range) */ - .buffer_size = 256UL * 1024UL * 1024UL, + /* 8K bitmap (128M IOVA range) */ + .buffer_size = 128UL * 1024UL * 1024UL, }; -FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty256M_huge) +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty128M_huge) { - /* 8K bitmap (256M IOVA range) */ - .buffer_size = 256UL * 1024UL * 1024UL, + /* 8K bitmap (128M IOVA range) */ + .buffer_size = 128UL * 1024UL * 1024UL, .hugepages = true, }; diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 8d2b46b211..c612fbf019 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -22,6 +22,8 @@ #define BIT_MASK(nr) (1UL << ((nr) % __BITS_PER_LONG)) #define BIT_WORD(nr) ((nr) / __BITS_PER_LONG) +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) + static inline void set_bit(unsigned int nr, unsigned long *addr) { unsigned long mask = BIT_MASK(nr); @@ -346,12 +348,12 @@ static int _test_cmd_mock_domain_set_dirty(int fd, __u32 hwpt_id, size_t length, static int _test_mock_dirty_bitmaps(int fd, __u32 hwpt_id, size_t length, __u64 iova, size_t page_size, size_t pte_page_size, __u64 *bitmap, - __u64 bitmap_size, __u32 flags, + __u64 nbits, __u32 flags, struct __test_metadata *_metadata) { unsigned long npte = pte_page_size / page_size, pteset = 2 * npte; - unsigned long nbits = bitmap_size * BITS_PER_BYTE; unsigned long j, i, nr = nbits / pteset ?: 1; + unsigned long bitmap_size = DIV_ROUND_UP(nbits, BITS_PER_BYTE); __u64 out_dirty = 0; /* Mark all even bits as dirty in the mock domain */ diff --git a/tools/testing/selftests/ipc/msgque.c b/tools/testing/selftests/ipc/msgque.c index 656c43c240..c75ea40948 100644 --- a/tools/testing/selftests/ipc/msgque.c +++ b/tools/testing/selftests/ipc/msgque.c @@ -198,13 +198,12 @@ int main(int argc, char **argv) struct msgque_data msgque; if (getuid() != 0) - return ksft_exit_skip( - "Please run 
the test as root - Exiting.\n"); + ksft_exit_skip("Please run the test as root - Exiting.\n"); msgque.key = ftok(argv[0], 822155650); if (msgque.key == -1) { printf("Can't make key: %d\n", -errno); - return ksft_exit_fail(); + ksft_exit_fail(); } msgque.msq_id = msgget(msgque.key, IPC_CREAT | IPC_EXCL | 0666); @@ -243,13 +242,13 @@ int main(int argc, char **argv) printf("Failed to test queue: %d\n", err); goto err_out; } - return ksft_exit_pass(); + ksft_exit_pass(); err_destroy: if (msgctl(msgque.msq_id, IPC_RMID, NULL)) { printf("Failed to destroy queue: %d\n", -errno); - return ksft_exit_fail(); + ksft_exit_fail(); } err_out: - return ksft_exit_fail(); + ksft_exit_fail(); } diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 14bbab0cce..76c2a6945d 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -16,10 +16,12 @@ * For each test, report any progress, debugging, etc with: * * ksft_print_msg(fmt, ...); + * ksft_perror(msg); * * and finally report the pass/fail/skip/xfail state of the test with one of: * * ksft_test_result(condition, fmt, ...); + * ksft_test_result_report(result, fmt, ...); * ksft_test_result_pass(fmt, ...); * ksft_test_result_fail(fmt, ...); * ksft_test_result_skip(fmt, ...); @@ -39,6 +41,7 @@ * the program is aborting before finishing all tests): * * ksft_exit_fail_msg(fmt, ...); + * ksft_exit_fail_perror(msg); * */ #ifndef __KSELFTEST_H @@ -305,13 +308,34 @@ void ksft_test_result_code(int exit_code, const char *test_name, printf("\n"); } -static inline __noreturn int ksft_exit_pass(void) +/** + * ksft_test_result() - Report test success based on truth of condition + * + * @condition: if true, report test success, otherwise failure. + */ +#define ksft_test_result_report(result, fmt, ...) 
do { \ + switch (result) { \ + case KSFT_PASS: \ + ksft_test_result_pass(fmt, ##__VA_ARGS__); \ + break; \ + case KSFT_FAIL: \ + ksft_test_result_fail(fmt, ##__VA_ARGS__); \ + break; \ + case KSFT_XFAIL: \ + ksft_test_result_xfail(fmt, ##__VA_ARGS__); \ + break; \ + case KSFT_SKIP: \ + ksft_test_result_skip(fmt, ##__VA_ARGS__); \ + break; \ + } } while (0) + +static inline __noreturn void ksft_exit_pass(void) { ksft_print_cnts(); exit(KSFT_PASS); } -static inline __noreturn int ksft_exit_fail(void) +static inline __noreturn void ksft_exit_fail(void) { ksft_print_cnts(); exit(KSFT_FAIL); @@ -338,7 +362,7 @@ static inline __noreturn int ksft_exit_fail(void) ksft_cnt.ksft_xfail + \ ksft_cnt.ksft_xskip) -static inline __noreturn __printf(1, 2) int ksft_exit_fail_msg(const char *msg, ...) +static inline __noreturn __printf(1, 2) void ksft_exit_fail_msg(const char *msg, ...) { int saved_errno = errno; va_list args; @@ -353,19 +377,32 @@ static inline __noreturn __printf(1, 2) int ksft_exit_fail_msg(const char *msg, exit(KSFT_FAIL); } -static inline __noreturn int ksft_exit_xfail(void) +static inline __noreturn void ksft_exit_fail_perror(const char *msg) +{ +#ifndef NOLIBC + ksft_exit_fail_msg("%s: %s (%d)\n", msg, strerror(errno), errno); +#else + /* + * nolibc doesn't provide strerror() and it seems + * inappropriate to add one, just print the errno. + */ + ksft_exit_fail_msg("%s: %d)\n", msg, errno); +#endif +} + +static inline __noreturn void ksft_exit_xfail(void) { ksft_print_cnts(); exit(KSFT_XFAIL); } -static inline __noreturn int ksft_exit_xpass(void) +static inline __noreturn void ksft_exit_xpass(void) { ksft_print_cnts(); exit(KSFT_XPASS); } -static inline __noreturn __printf(1, 2) int ksft_exit_skip(const char *msg, ...) +static inline __noreturn __printf(1, 2) void ksft_exit_skip(const char *msg, ...) 
{ int saved_errno = errno; va_list args; diff --git a/tools/testing/selftests/kselftest_deps.sh b/tools/testing/selftests/kselftest_deps.sh index de59cc8f03..487e49fdf2 100755 --- a/tools/testing/selftests/kselftest_deps.sh +++ b/tools/testing/selftests/kselftest_deps.sh @@ -244,6 +244,7 @@ l4_test() l5_test() { tests=$(find $(dirname "$test") -type f -name "*.mk") + [[ -z "${tests// }" ]] && return test_libs=$(grep "^IOURING_EXTRA_LIBS +\?=" $tests | \ cut -d "=" -f 2) diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 741c7dc16a..ac280dcba9 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -45,6 +45,7 @@ LIBKVM_x86_64 += lib/x86_64/vmx.c LIBKVM_aarch64 += lib/aarch64/gic.c LIBKVM_aarch64 += lib/aarch64/gic_v3.c +LIBKVM_aarch64 += lib/aarch64/gic_v3_its.c LIBKVM_aarch64 += lib/aarch64/handlers.S LIBKVM_aarch64 += lib/aarch64/processor.c LIBKVM_aarch64 += lib/aarch64/spinlock.c @@ -120,6 +121,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_caps_test TEST_GEN_PROGS_x86_64 += x86_64/xen_shinfo_test TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test +TEST_GEN_PROGS_x86_64 += x86_64/sev_init2_tests TEST_GEN_PROGS_x86_64 += x86_64/sev_migrate_tests TEST_GEN_PROGS_x86_64 += x86_64/sev_smoke_test TEST_GEN_PROGS_x86_64 += x86_64/amx_test @@ -157,6 +159,7 @@ TEST_GEN_PROGS_aarch64 += aarch64/smccc_filter TEST_GEN_PROGS_aarch64 += aarch64/vcpu_width_config TEST_GEN_PROGS_aarch64 += aarch64/vgic_init TEST_GEN_PROGS_aarch64 += aarch64/vgic_irq +TEST_GEN_PROGS_aarch64 += aarch64/vgic_lpi_stress TEST_GEN_PROGS_aarch64 += aarch64/vpmu_counter_access TEST_GEN_PROGS_aarch64 += access_tracking_perf_test TEST_GEN_PROGS_aarch64 += arch_timer @@ -180,6 +183,7 @@ TEST_GEN_PROGS_s390x += s390x/sync_regs_test TEST_GEN_PROGS_s390x += s390x/tprot TEST_GEN_PROGS_s390x += s390x/cmma_test TEST_GEN_PROGS_s390x += s390x/debug_test +TEST_GEN_PROGS_s390x += 
s390x/shared_zeropage_test TEST_GEN_PROGS_s390x += demand_paging_test TEST_GEN_PROGS_s390x += dirty_log_test TEST_GEN_PROGS_s390x += guest_print_test @@ -189,6 +193,8 @@ TEST_GEN_PROGS_s390x += rseq_test TEST_GEN_PROGS_s390x += set_memory_region_test TEST_GEN_PROGS_s390x += kvm_binary_stats_test +TEST_GEN_PROGS_riscv += riscv/sbi_pmu_test +TEST_GEN_PROGS_riscv += riscv/ebreak_test TEST_GEN_PROGS_riscv += arch_timer TEST_GEN_PROGS_riscv += demand_paging_test TEST_GEN_PROGS_riscv += dirty_log_test @@ -225,8 +231,8 @@ LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include endif CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \ -Wno-gnu-variable-sized-type-not-at-end -MD -MP -DCONFIG_64BIT \ - -fno-builtin-memcmp -fno-builtin-memcpy -fno-builtin-memset \ - -fno-builtin-strnlen \ + -D_GNU_SOURCE -fno-builtin-memcmp -fno-builtin-memcpy \ + -fno-builtin-memset -fno-builtin-strnlen \ -fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) \ -I$(LINUX_TOOL_ARCH_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude \ -I$(= 0, "Failed to create vgic-v3"); /* Make all the test's cmdline args visible to the guest */ diff --git a/tools/testing/selftests/kvm/aarch64/page_fault_test.c b/tools/testing/selftests/kvm/aarch64/page_fault_test.c index 5972905275..d29b08198b 100644 --- a/tools/testing/selftests/kvm/aarch64/page_fault_test.c +++ b/tools/testing/selftests/kvm/aarch64/page_fault_test.c @@ -7,7 +7,6 @@ * hugetlbfs with a hole). It checks that the expected handling method is * called (e.g., uffd faults with the right address and write/read flag). 
*/ -#define _GNU_SOURCE #include #include #include @@ -375,14 +374,14 @@ static void setup_uffd(struct kvm_vm *vm, struct test_params *p, *pt_uffd = uffd_setup_demand_paging(uffd_mode, 0, pt_args.hva, pt_args.paging_size, - test->uffd_pt_handler); + 1, test->uffd_pt_handler); *data_uffd = NULL; if (test->uffd_data_handler) *data_uffd = uffd_setup_demand_paging(uffd_mode, 0, data_args.hva, data_args.paging_size, - test->uffd_data_handler); + 1, test->uffd_data_handler); } static void free_uffd(struct test_desc *test, struct uffd_desc *pt_uffd, diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c index 9b004905d1..61731a950d 100644 --- a/tools/testing/selftests/kvm/aarch64/psci_test.c +++ b/tools/testing/selftests/kvm/aarch64/psci_test.c @@ -11,9 +11,9 @@ * KVM_SYSTEM_EVENT_SUSPEND UAPI. */ -#define _GNU_SOURCE - +#include #include +#include #include "kvm_util.h" #include "processor.h" diff --git a/tools/testing/selftests/kvm/aarch64/set_id_regs.c b/tools/testing/selftests/kvm/aarch64/set_id_regs.c index 16e2338686..a7de39fa2a 100644 --- a/tools/testing/selftests/kvm/aarch64/set_id_regs.c +++ b/tools/testing/selftests/kvm/aarch64/set_id_regs.c @@ -327,8 +327,8 @@ uint64_t get_invalid_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr) return ftr; } -static void test_reg_set_success(struct kvm_vcpu *vcpu, uint64_t reg, - const struct reg_ftr_bits *ftr_bits) +static uint64_t test_reg_set_success(struct kvm_vcpu *vcpu, uint64_t reg, + const struct reg_ftr_bits *ftr_bits) { uint8_t shift = ftr_bits->shift; uint64_t mask = ftr_bits->mask; @@ -346,6 +346,8 @@ static void test_reg_set_success(struct kvm_vcpu *vcpu, uint64_t reg, vcpu_set_reg(vcpu, reg, val); vcpu_get_reg(vcpu, reg, &new_val); TEST_ASSERT_EQ(new_val, val); + + return new_val; } static void test_reg_set_fail(struct kvm_vcpu *vcpu, uint64_t reg, @@ -374,7 +376,15 @@ static void test_reg_set_fail(struct kvm_vcpu *vcpu, uint64_t reg, 
TEST_ASSERT_EQ(val, old_val); } -static void test_user_set_reg(struct kvm_vcpu *vcpu, bool aarch64_only) +static uint64_t test_reg_vals[KVM_ARM_FEATURE_ID_RANGE_SIZE]; + +#define encoding_to_range_idx(encoding) \ + KVM_ARM_FEATURE_ID_RANGE_IDX(sys_reg_Op0(encoding), sys_reg_Op1(encoding), \ + sys_reg_CRn(encoding), sys_reg_CRm(encoding), \ + sys_reg_Op2(encoding)) + + +static void test_vm_ftr_id_regs(struct kvm_vcpu *vcpu, bool aarch64_only) { uint64_t masks[KVM_ARM_FEATURE_ID_RANGE_SIZE]; struct reg_mask_range range = { @@ -398,9 +408,7 @@ static void test_user_set_reg(struct kvm_vcpu *vcpu, bool aarch64_only) int idx; /* Get the index to masks array for the idreg */ - idx = KVM_ARM_FEATURE_ID_RANGE_IDX(sys_reg_Op0(reg_id), sys_reg_Op1(reg_id), - sys_reg_CRn(reg_id), sys_reg_CRm(reg_id), - sys_reg_Op2(reg_id)); + idx = encoding_to_range_idx(reg_id); for (int j = 0; ftr_bits[j].type != FTR_END; j++) { /* Skip aarch32 reg on aarch64 only system, since they are RAZ/WI. */ @@ -414,7 +422,9 @@ static void test_user_set_reg(struct kvm_vcpu *vcpu, bool aarch64_only) TEST_ASSERT_EQ(masks[idx] & ftr_bits[j].mask, ftr_bits[j].mask); test_reg_set_fail(vcpu, reg, &ftr_bits[j]); - test_reg_set_success(vcpu, reg, &ftr_bits[j]); + + test_reg_vals[idx] = test_reg_set_success(vcpu, reg, + &ftr_bits[j]); ksft_test_result_pass("%s\n", ftr_bits[j].name); } @@ -425,7 +435,6 @@ static void test_guest_reg_read(struct kvm_vcpu *vcpu) { bool done = false; struct ucall uc; - uint64_t val; while (!done) { vcpu_run(vcpu); @@ -436,8 +445,8 @@ static void test_guest_reg_read(struct kvm_vcpu *vcpu) break; case UCALL_SYNC: /* Make sure the written values are seen by guest */ - vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(uc.args[2]), &val); - TEST_ASSERT_EQ(val, uc.args[3]); + TEST_ASSERT_EQ(test_reg_vals[encoding_to_range_idx(uc.args[2])], + uc.args[3]); break; case UCALL_DONE: done = true; @@ -448,13 +457,85 @@ static void test_guest_reg_read(struct kvm_vcpu *vcpu) } } +/* Politely lifted from 
arch/arm64/include/asm/cache.h */ +/* Ctypen, bits[3(n - 1) + 2 : 3(n - 1)], for n = 1 to 7 */ +#define CLIDR_CTYPE_SHIFT(level) (3 * (level - 1)) +#define CLIDR_CTYPE_MASK(level) (7 << CLIDR_CTYPE_SHIFT(level)) +#define CLIDR_CTYPE(clidr, level) \ + (((clidr) & CLIDR_CTYPE_MASK(level)) >> CLIDR_CTYPE_SHIFT(level)) + +static void test_clidr(struct kvm_vcpu *vcpu) +{ + uint64_t clidr; + int level; + + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CLIDR_EL1), &clidr); + + /* find the first empty level in the cache hierarchy */ + for (level = 1; level < 7; level++) { + if (!CLIDR_CTYPE(clidr, level)) + break; + } + + /* + * If you have a mind-boggling 7 levels of cache, congratulations, you + * get to fix this. + */ + TEST_ASSERT(level <= 7, "can't find an empty level in cache hierarchy"); + + /* stick in a unified cache level */ + clidr |= BIT(2) << CLIDR_CTYPE_SHIFT(level); + + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CLIDR_EL1), clidr); + test_reg_vals[encoding_to_range_idx(SYS_CLIDR_EL1)] = clidr; +} + +static void test_vcpu_ftr_id_regs(struct kvm_vcpu *vcpu) +{ + u64 val; + + test_clidr(vcpu); + + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1), &val); + val++; + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1), val); + + test_reg_vals[encoding_to_range_idx(SYS_MPIDR_EL1)] = val; + ksft_test_result_pass("%s\n", __func__); +} + +static void test_assert_id_reg_unchanged(struct kvm_vcpu *vcpu, uint32_t encoding) +{ + size_t idx = encoding_to_range_idx(encoding); + uint64_t observed; + + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(encoding), &observed); + TEST_ASSERT_EQ(test_reg_vals[idx], observed); +} + +static void test_reset_preserves_id_regs(struct kvm_vcpu *vcpu) +{ + /* + * Calls KVM_ARM_VCPU_INIT behind the scenes, which will do an + * architectural reset of the vCPU. 
+ */ + aarch64_vcpu_setup(vcpu, NULL); + + for (int i = 0; i < ARRAY_SIZE(test_regs); i++) + test_assert_id_reg_unchanged(vcpu, test_regs[i].reg); + + test_assert_id_reg_unchanged(vcpu, SYS_CLIDR_EL1); + + ksft_test_result_pass("%s\n", __func__); +} + int main(void) { struct kvm_vcpu *vcpu; struct kvm_vm *vm; bool aarch64_only; uint64_t val, el0; - int ftr_cnt; + int test_cnt; TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES)); @@ -467,18 +548,22 @@ int main(void) ksft_print_header(); - ftr_cnt = ARRAY_SIZE(ftr_id_aa64dfr0_el1) + ARRAY_SIZE(ftr_id_dfr0_el1) + - ARRAY_SIZE(ftr_id_aa64isar0_el1) + ARRAY_SIZE(ftr_id_aa64isar1_el1) + - ARRAY_SIZE(ftr_id_aa64isar2_el1) + ARRAY_SIZE(ftr_id_aa64pfr0_el1) + - ARRAY_SIZE(ftr_id_aa64mmfr0_el1) + ARRAY_SIZE(ftr_id_aa64mmfr1_el1) + - ARRAY_SIZE(ftr_id_aa64mmfr2_el1) + ARRAY_SIZE(ftr_id_aa64zfr0_el1) - - ARRAY_SIZE(test_regs); + test_cnt = ARRAY_SIZE(ftr_id_aa64dfr0_el1) + ARRAY_SIZE(ftr_id_dfr0_el1) + + ARRAY_SIZE(ftr_id_aa64isar0_el1) + ARRAY_SIZE(ftr_id_aa64isar1_el1) + + ARRAY_SIZE(ftr_id_aa64isar2_el1) + ARRAY_SIZE(ftr_id_aa64pfr0_el1) + + ARRAY_SIZE(ftr_id_aa64mmfr0_el1) + ARRAY_SIZE(ftr_id_aa64mmfr1_el1) + + ARRAY_SIZE(ftr_id_aa64mmfr2_el1) + ARRAY_SIZE(ftr_id_aa64zfr0_el1) - + ARRAY_SIZE(test_regs) + 2; - ksft_set_plan(ftr_cnt); + ksft_set_plan(test_cnt); + + test_vm_ftr_id_regs(vcpu, aarch64_only); + test_vcpu_ftr_id_regs(vcpu); - test_user_set_reg(vcpu, aarch64_only); test_guest_reg_read(vcpu); + test_reset_preserves_id_regs(vcpu); + kvm_vm_free(vm); ksft_finished(); diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c index ca917c71ff..b3b5fb0ff0 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_init.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c @@ -4,7 +4,6 @@ * * Copyright (C) 2020, Red Hat, Inc. 
*/ -#define _GNU_SOURCE #include #include #include diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c index 2e64b4856e..a51dbd2a5f 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c @@ -19,9 +19,6 @@ #include "gic_v3.h" #include "vgic.h" -#define GICD_BASE_GPA 0x08000000ULL -#define GICR_BASE_GPA 0x080A0000ULL - /* * Stores the user specified args; it's passed to the guest and to every test * function. @@ -49,9 +46,6 @@ struct test_args { #define IRQ_DEFAULT_PRIO (LOWEST_PRIO - 1) #define IRQ_DEFAULT_PRIO_REG (IRQ_DEFAULT_PRIO << KVM_PRIO_SHIFT) /* 0xf0 */ -static void *dist = (void *)GICD_BASE_GPA; -static void *redist = (void *)GICR_BASE_GPA; - /* * The kvm_inject_* utilities are used by the guest to ask the host to inject * interrupts (e.g., using the KVM_IRQ_LINE ioctl). @@ -152,7 +146,7 @@ static void reset_stats(void) static uint64_t gic_read_ap1r0(void) { - uint64_t reg = read_sysreg_s(SYS_ICV_AP1R0_EL1); + uint64_t reg = read_sysreg_s(SYS_ICC_AP1R0_EL1); dsb(sy); return reg; @@ -160,7 +154,7 @@ static uint64_t gic_read_ap1r0(void) static void gic_write_ap1r0(uint64_t val) { - write_sysreg_s(val, SYS_ICV_AP1R0_EL1); + write_sysreg_s(val, SYS_ICC_AP1R0_EL1); isb(); } @@ -478,7 +472,7 @@ static void guest_code(struct test_args *args) bool level_sensitive = args->level_sensitive; struct kvm_inject_desc *f, *inject_fns; - gic_init(GIC_V3, 1, dist, redist); + gic_init(GIC_V3, 1); for (i = 0; i < nr_irqs; i++) gic_irq_enable(i); @@ -764,8 +758,7 @@ static void test_vgic(uint32_t nr_irqs, bool level_sensitive, bool eoi_split) memcpy(addr_gva2hva(vm, args_gva), &args, sizeof(args)); vcpu_args_set(vcpu, 1, args_gva); - gic_fd = vgic_v3_setup(vm, 1, nr_irqs, - GICD_BASE_GPA, GICR_BASE_GPA); + gic_fd = vgic_v3_setup(vm, 1, nr_irqs); __TEST_REQUIRE(gic_fd >= 0, "Failed to create vgic-v3, skipping"); vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT, 
diff --git a/tools/testing/selftests/kvm/aarch64/vgic_lpi_stress.c b/tools/testing/selftests/kvm/aarch64/vgic_lpi_stress.c new file mode 100644 index 0000000000..fc4fe52fb6 --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/vgic_lpi_stress.c @@ -0,0 +1,410 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * vgic_lpi_stress - Stress test for KVM's ITS emulation + * + * Copyright (c) 2024 Google LLC + */ + +#include +#include +#include +#include + +#include "kvm_util.h" +#include "gic.h" +#include "gic_v3.h" +#include "gic_v3_its.h" +#include "processor.h" +#include "ucall.h" +#include "vgic.h" + +#define TEST_MEMSLOT_INDEX 1 + +#define GIC_LPI_OFFSET 8192 + +static size_t nr_iterations = 1000; +static vm_paddr_t gpa_base; + +static struct kvm_vm *vm; +static struct kvm_vcpu **vcpus; +static int gic_fd, its_fd; + +static struct test_data { + bool request_vcpus_stop; + u32 nr_cpus; + u32 nr_devices; + u32 nr_event_ids; + + vm_paddr_t device_table; + vm_paddr_t collection_table; + vm_paddr_t cmdq_base; + void *cmdq_base_va; + vm_paddr_t itt_tables; + + vm_paddr_t lpi_prop_table; + vm_paddr_t lpi_pend_tables; +} test_data = { + .nr_cpus = 1, + .nr_devices = 1, + .nr_event_ids = 16, +}; + +static void guest_irq_handler(struct ex_regs *regs) +{ + u32 intid = gic_get_and_ack_irq(); + + if (intid == IAR_SPURIOUS) + return; + + GUEST_ASSERT(intid >= GIC_LPI_OFFSET); + gic_set_eoi(intid); +} + +static void guest_setup_its_mappings(void) +{ + u32 coll_id, device_id, event_id, intid = GIC_LPI_OFFSET; + u32 nr_events = test_data.nr_event_ids; + u32 nr_devices = test_data.nr_devices; + u32 nr_cpus = test_data.nr_cpus; + + for (coll_id = 0; coll_id < nr_cpus; coll_id++) + its_send_mapc_cmd(test_data.cmdq_base_va, coll_id, coll_id, true); + + /* Round-robin the LPIs to all of the vCPUs in the VM */ + coll_id = 0; + for (device_id = 0; device_id < nr_devices; device_id++) { + vm_paddr_t itt_base = test_data.itt_tables + (device_id * SZ_64K); + + 
its_send_mapd_cmd(test_data.cmdq_base_va, device_id, + itt_base, SZ_64K, true); + + for (event_id = 0; event_id < nr_events; event_id++) { + its_send_mapti_cmd(test_data.cmdq_base_va, device_id, + event_id, coll_id, intid++); + + coll_id = (coll_id + 1) % test_data.nr_cpus; + } + } +} + +static void guest_invalidate_all_rdists(void) +{ + int i; + + for (i = 0; i < test_data.nr_cpus; i++) + its_send_invall_cmd(test_data.cmdq_base_va, i); +} + +static void guest_setup_gic(void) +{ + static atomic_int nr_cpus_ready = 0; + u32 cpuid = guest_get_vcpuid(); + + gic_init(GIC_V3, test_data.nr_cpus); + gic_rdist_enable_lpis(test_data.lpi_prop_table, SZ_64K, + test_data.lpi_pend_tables + (cpuid * SZ_64K)); + + atomic_fetch_add(&nr_cpus_ready, 1); + + if (cpuid > 0) + return; + + while (atomic_load(&nr_cpus_ready) < test_data.nr_cpus) + cpu_relax(); + + its_init(test_data.collection_table, SZ_64K, + test_data.device_table, SZ_64K, + test_data.cmdq_base, SZ_64K); + + guest_setup_its_mappings(); + guest_invalidate_all_rdists(); +} + +static void guest_code(size_t nr_lpis) +{ + guest_setup_gic(); + + GUEST_SYNC(0); + + /* + * Don't use WFI here to avoid blocking the vCPU thread indefinitely and + * never getting the stop signal. 
+ */ + while (!READ_ONCE(test_data.request_vcpus_stop)) + cpu_relax(); + + GUEST_DONE(); +} + +static void setup_memslot(void) +{ + size_t pages; + size_t sz; + + /* + * For the ITS: + * - A single level device table + * - A single level collection table + * - The command queue + * - An ITT for each device + */ + sz = (3 + test_data.nr_devices) * SZ_64K; + + /* + * For the redistributors: + * - A shared LPI configuration table + * - An LPI pending table for each vCPU + */ + sz += (1 + test_data.nr_cpus) * SZ_64K; + + pages = sz / vm->page_size; + gpa_base = ((vm_compute_max_gfn(vm) + 1) * vm->page_size) - sz; + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, gpa_base, + TEST_MEMSLOT_INDEX, pages, 0); +} + +#define LPI_PROP_DEFAULT_PRIO 0xa0 + +static void configure_lpis(void) +{ + size_t nr_lpis = test_data.nr_devices * test_data.nr_event_ids; + u8 *tbl = addr_gpa2hva(vm, test_data.lpi_prop_table); + size_t i; + + for (i = 0; i < nr_lpis; i++) { + tbl[i] = LPI_PROP_DEFAULT_PRIO | + LPI_PROP_GROUP1 | + LPI_PROP_ENABLED; + } +} + +static void setup_test_data(void) +{ + size_t pages_per_64k = vm_calc_num_guest_pages(vm->mode, SZ_64K); + u32 nr_devices = test_data.nr_devices; + u32 nr_cpus = test_data.nr_cpus; + vm_paddr_t cmdq_base; + + test_data.device_table = vm_phy_pages_alloc(vm, pages_per_64k, + gpa_base, + TEST_MEMSLOT_INDEX); + + test_data.collection_table = vm_phy_pages_alloc(vm, pages_per_64k, + gpa_base, + TEST_MEMSLOT_INDEX); + + cmdq_base = vm_phy_pages_alloc(vm, pages_per_64k, gpa_base, + TEST_MEMSLOT_INDEX); + virt_map(vm, cmdq_base, cmdq_base, pages_per_64k); + test_data.cmdq_base = cmdq_base; + test_data.cmdq_base_va = (void *)cmdq_base; + + test_data.itt_tables = vm_phy_pages_alloc(vm, pages_per_64k * nr_devices, + gpa_base, TEST_MEMSLOT_INDEX); + + test_data.lpi_prop_table = vm_phy_pages_alloc(vm, pages_per_64k, + gpa_base, TEST_MEMSLOT_INDEX); + configure_lpis(); + + test_data.lpi_pend_tables = vm_phy_pages_alloc(vm, pages_per_64k * nr_cpus, + 
gpa_base, TEST_MEMSLOT_INDEX); + + sync_global_to_guest(vm, test_data); +} + +static void setup_gic(void) +{ + gic_fd = vgic_v3_setup(vm, test_data.nr_cpus, 64); + __TEST_REQUIRE(gic_fd >= 0, "Failed to create GICv3"); + + its_fd = vgic_its_setup(vm); +} + +static void signal_lpi(u32 device_id, u32 event_id) +{ + vm_paddr_t db_addr = GITS_BASE_GPA + GITS_TRANSLATER; + + struct kvm_msi msi = { + .address_lo = db_addr, + .address_hi = db_addr >> 32, + .data = event_id, + .devid = device_id, + .flags = KVM_MSI_VALID_DEVID, + }; + + /* + * KVM_SIGNAL_MSI returns 1 if the MSI wasn't 'blocked' by the VM, + * which for arm64 implies having a valid translation in the ITS. + */ + TEST_ASSERT(__vm_ioctl(vm, KVM_SIGNAL_MSI, &msi) == 1, + "KVM_SIGNAL_MSI ioctl failed"); +} + +static pthread_barrier_t test_setup_barrier; + +static void *lpi_worker_thread(void *data) +{ + u32 device_id = (size_t)data; + u32 event_id; + size_t i; + + pthread_barrier_wait(&test_setup_barrier); + + for (i = 0; i < nr_iterations; i++) + for (event_id = 0; event_id < test_data.nr_event_ids; event_id++) + signal_lpi(device_id, event_id); + + return NULL; +} + +static void *vcpu_worker_thread(void *data) +{ + struct kvm_vcpu *vcpu = data; + struct ucall uc; + + while (true) { + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + pthread_barrier_wait(&test_setup_barrier); + continue; + case UCALL_DONE: + return NULL; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + default: + TEST_FAIL("Unknown ucall: %lu", uc.cmd); + } + } + + return NULL; +} + +static void report_stats(struct timespec delta) +{ + double nr_lpis; + double time; + + nr_lpis = test_data.nr_devices * test_data.nr_event_ids * nr_iterations; + + time = delta.tv_sec; + time += ((double)delta.tv_nsec) / NSEC_PER_SEC; + + pr_info("Rate: %.2f LPIs/sec\n", nr_lpis / time); +} + +static void run_test(void) +{ + u32 nr_devices = test_data.nr_devices; + u32 nr_vcpus = test_data.nr_cpus; + pthread_t *lpi_threads = 
malloc(nr_devices * sizeof(pthread_t)); + pthread_t *vcpu_threads = malloc(nr_vcpus * sizeof(pthread_t)); + struct timespec start, delta; + size_t i; + + TEST_ASSERT(lpi_threads && vcpu_threads, "Failed to allocate pthread arrays"); + + pthread_barrier_init(&test_setup_barrier, NULL, nr_vcpus + nr_devices + 1); + + for (i = 0; i < nr_vcpus; i++) + pthread_create(&vcpu_threads[i], NULL, vcpu_worker_thread, vcpus[i]); + + for (i = 0; i < nr_devices; i++) + pthread_create(&lpi_threads[i], NULL, lpi_worker_thread, (void *)i); + + pthread_barrier_wait(&test_setup_barrier); + + clock_gettime(CLOCK_MONOTONIC, &start); + + for (i = 0; i < nr_devices; i++) + pthread_join(lpi_threads[i], NULL); + + delta = timespec_elapsed(start); + write_guest_global(vm, test_data.request_vcpus_stop, true); + + for (i = 0; i < nr_vcpus; i++) + pthread_join(vcpu_threads[i], NULL); + + report_stats(delta); +} + +static void setup_vm(void) +{ + int i; + + vcpus = malloc(test_data.nr_cpus * sizeof(struct kvm_vcpu)); + TEST_ASSERT(vcpus, "Failed to allocate vCPU array"); + + vm = vm_create_with_vcpus(test_data.nr_cpus, guest_code, vcpus); + + vm_init_descriptor_tables(vm); + for (i = 0; i < test_data.nr_cpus; i++) + vcpu_init_descriptor_tables(vcpus[i]); + + vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT, guest_irq_handler); + + setup_memslot(); + + setup_gic(); + + setup_test_data(); +} + +static void destroy_vm(void) +{ + close(its_fd); + close(gic_fd); + kvm_vm_free(vm); + free(vcpus); +} + +static void pr_usage(const char *name) +{ + pr_info("%s [-v NR_VCPUS] [-d NR_DEVICES] [-e NR_EVENTS] [-i ITERS] -h\n", name); + pr_info(" -v:\tnumber of vCPUs (default: %u)\n", test_data.nr_cpus); + pr_info(" -d:\tnumber of devices (default: %u)\n", test_data.nr_devices); + pr_info(" -e:\tnumber of event IDs per device (default: %u)\n", test_data.nr_event_ids); + pr_info(" -i:\tnumber of iterations (default: %lu)\n", nr_iterations); +} + +int main(int argc, char **argv) +{ + u32 nr_threads; + int c; 
+ + while ((c = getopt(argc, argv, "hv:d:e:i:")) != -1) { + switch (c) { + case 'v': + test_data.nr_cpus = atoi(optarg); + break; + case 'd': + test_data.nr_devices = atoi(optarg); + break; + case 'e': + test_data.nr_event_ids = atoi(optarg); + break; + case 'i': + nr_iterations = strtoul(optarg, NULL, 0); + break; + case 'h': + default: + pr_usage(argv[0]); + return 1; + } + } + + nr_threads = test_data.nr_cpus + test_data.nr_devices; + if (nr_threads > get_nprocs()) + pr_info("WARNING: running %u threads on %d CPUs; performance is degraded.\n", + nr_threads, get_nprocs()); + + setup_vm(); + + run_test(); + + destroy_vm(); + + return 0; +} diff --git a/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c b/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c index f2fb0e3f14..d31b9f64ba 100644 --- a/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c +++ b/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c @@ -404,9 +404,6 @@ static void guest_code(uint64_t expected_pmcr_n) GUEST_DONE(); } -#define GICD_BASE_GPA 0x8000000ULL -#define GICR_BASE_GPA 0x80A0000ULL - /* Create a VM that has one vCPU with PMUv3 configured. */ static void create_vpmu_vm(void *guest_code) { @@ -438,8 +435,7 @@ static void create_vpmu_vm(void *guest_code) init.features[0] |= (1 << KVM_ARM_VCPU_PMU_V3); vpmu_vm.vcpu = aarch64_vcpu_add(vpmu_vm.vm, 0, &init, guest_code); vcpu_init_descriptor_tables(vpmu_vm.vcpu); - vpmu_vm.gic_fd = vgic_v3_setup(vpmu_vm.vm, 1, 64, - GICD_BASE_GPA, GICR_BASE_GPA); + vpmu_vm.gic_fd = vgic_v3_setup(vpmu_vm.vm, 1, 64); __TEST_REQUIRE(vpmu_vm.gic_fd >= 0, "Failed to create vgic-v3, skipping"); diff --git a/tools/testing/selftests/kvm/arch_timer.c b/tools/testing/selftests/kvm/arch_timer.c index ae1f1a6d83..acb2cb5963 100644 --- a/tools/testing/selftests/kvm/arch_timer.c +++ b/tools/testing/selftests/kvm/arch_timer.c @@ -19,9 +19,6 @@ * * Copyright (c) 2021, Google LLC. 
*/ - -#define _GNU_SOURCE - #include #include #include @@ -29,6 +26,7 @@ #include #include "timer_test.h" +#include "ucall_common.h" struct test_args test_args = { .nr_vcpus = NR_VCPUS_DEF, diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index bf3609f718..0202b78f86 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -6,14 +6,10 @@ * Copyright (C) 2018, Red Hat, Inc. * Copyright (C) 2019, Google, Inc. */ - -#define _GNU_SOURCE /* for pipe2 */ - #include #include #include #include -#include #include #include #include @@ -22,6 +18,7 @@ #include "test_util.h" #include "memstress.h" #include "guest_modes.h" +#include "ucall_common.h" #include "userfaultfd_util.h" #ifdef __NR_userfaultfd @@ -77,8 +74,20 @@ static int handle_uffd_page_request(int uffd_mode, int uffd, copy.mode = 0; r = ioctl(uffd, UFFDIO_COPY, ©); - if (r == -1) { - pr_info("Failed UFFDIO_COPY in 0x%lx from thread %d with errno: %d\n", + /* + * With multiple vCPU threads fault on a single page and there are + * multiple readers for the UFFD, at least one of the UFFDIO_COPYs + * will fail with EEXIST: handle that case without signaling an + * error. + * + * Note that this also suppress any EEXISTs occurring from, + * e.g., the first UFFDIO_COPY/CONTINUEs on a page. That never + * happens here, but a realistic VMM might potentially maintain + * some external state to correctly surface EEXISTs to userspace + * (or prevent duplicate COPY/CONTINUEs in the first place). 
+ */ + if (r == -1 && errno != EEXIST) { + pr_info("Failed UFFDIO_COPY in 0x%lx from thread %d, errno = %d\n", addr, tid, errno); return r; } @@ -89,8 +98,20 @@ static int handle_uffd_page_request(int uffd_mode, int uffd, cont.range.len = demand_paging_size; r = ioctl(uffd, UFFDIO_CONTINUE, &cont); - if (r == -1) { - pr_info("Failed UFFDIO_CONTINUE in 0x%lx from thread %d with errno: %d\n", + /* + * With multiple vCPU threads fault on a single page and there are + * multiple readers for the UFFD, at least one of the UFFDIO_COPYs + * will fail with EEXIST: handle that case without signaling an + * error. + * + * Note that this also suppress any EEXISTs occurring from, + * e.g., the first UFFDIO_COPY/CONTINUEs on a page. That never + * happens here, but a realistic VMM might potentially maintain + * some external state to correctly surface EEXISTs to userspace + * (or prevent duplicate COPY/CONTINUEs in the first place). + */ + if (r == -1 && errno != EEXIST) { + pr_info("Failed UFFDIO_CONTINUE in 0x%lx, thread %d, errno = %d\n", addr, tid, errno); return r; } @@ -110,7 +131,9 @@ static int handle_uffd_page_request(int uffd_mode, int uffd, struct test_params { int uffd_mode; + bool single_uffd; useconds_t uffd_delay; + int readers_per_uffd; enum vm_mem_backing_src_type src_type; bool partition_vcpu_memory_access; }; @@ -131,10 +154,12 @@ static void run_test(enum vm_guest_mode mode, void *arg) struct memstress_vcpu_args *vcpu_args; struct test_params *p = arg; struct uffd_desc **uffd_descs = NULL; + uint64_t uffd_region_size; struct timespec start; struct timespec ts_diff; + double vcpu_paging_rate; struct kvm_vm *vm; - int i; + int i, num_uffds = 0; vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, p->src_type, p->partition_vcpu_memory_access); @@ -147,7 +172,8 @@ static void run_test(enum vm_guest_mode mode, void *arg) memset(guest_data_prototype, 0xAB, demand_paging_size); if (p->uffd_mode == UFFDIO_REGISTER_MODE_MINOR) { - for (i = 0; i < 
nr_vcpus; i++) { + num_uffds = p->single_uffd ? 1 : nr_vcpus; + for (i = 0; i < num_uffds; i++) { vcpu_args = &memstress_args.vcpu_args[i]; prefault_mem(addr_gpa2alias(vm, vcpu_args->gpa), vcpu_args->pages * memstress_args.guest_page_size); @@ -155,9 +181,13 @@ static void run_test(enum vm_guest_mode mode, void *arg) } if (p->uffd_mode) { - uffd_descs = malloc(nr_vcpus * sizeof(struct uffd_desc *)); + num_uffds = p->single_uffd ? 1 : nr_vcpus; + uffd_region_size = nr_vcpus * guest_percpu_mem_size / num_uffds; + + uffd_descs = malloc(num_uffds * sizeof(struct uffd_desc *)); TEST_ASSERT(uffd_descs, "Memory allocation failed"); - for (i = 0; i < nr_vcpus; i++) { + for (i = 0; i < num_uffds; i++) { + struct memstress_vcpu_args *vcpu_args; void *vcpu_hva; vcpu_args = &memstress_args.vcpu_args[i]; @@ -170,7 +200,8 @@ static void run_test(enum vm_guest_mode mode, void *arg) */ uffd_descs[i] = uffd_setup_demand_paging( p->uffd_mode, p->uffd_delay, vcpu_hva, - vcpu_args->pages * memstress_args.guest_page_size, + uffd_region_size, + p->readers_per_uffd, &handle_uffd_page_request); } } @@ -187,15 +218,19 @@ static void run_test(enum vm_guest_mode mode, void *arg) if (p->uffd_mode) { /* Tell the user fault fd handler threads to quit */ - for (i = 0; i < nr_vcpus; i++) + for (i = 0; i < num_uffds; i++) uffd_stop_demand_paging(uffd_descs[i]); } - pr_info("Total guest execution time: %ld.%.9lds\n", + pr_info("Total guest execution time:\t%ld.%.9lds\n", ts_diff.tv_sec, ts_diff.tv_nsec); - pr_info("Overall demand paging rate: %f pgs/sec\n", - memstress_args.vcpu_args[0].pages * nr_vcpus / - ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / NSEC_PER_SEC)); + + vcpu_paging_rate = memstress_args.vcpu_args[0].pages / + ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / NSEC_PER_SEC); + pr_info("Per-vcpu demand paging rate:\t%f pgs/sec/vcpu\n", + vcpu_paging_rate); + pr_info("Overall demand paging rate:\t%f pgs/sec\n", + vcpu_paging_rate * nr_vcpus); memstress_destroy_vm(vm); @@ 
-207,15 +242,20 @@ static void run_test(enum vm_guest_mode mode, void *arg) static void help(char *name) { puts(""); - printf("usage: %s [-h] [-m vm_mode] [-u uffd_mode] [-d uffd_delay_usec]\n" - " [-b memory] [-s type] [-v vcpus] [-c cpu_list] [-o]\n", name); + printf("usage: %s [-h] [-m vm_mode] [-u uffd_mode] [-a]\n" + " [-d uffd_delay_usec] [-r readers_per_uffd] [-b memory]\n" + " [-s type] [-v vcpus] [-c cpu_list] [-o]\n", name); guest_modes_help(); printf(" -u: use userfaultfd to handle vCPU page faults. Mode is a\n" " UFFD registration mode: 'MISSING' or 'MINOR'.\n"); kvm_print_vcpu_pinning_help(); + printf(" -a: Use a single userfaultfd for all of guest memory, instead of\n" + " creating one for each region paged by a unique vCPU\n" + " Set implicitly with -o, and no effect without -u.\n"); printf(" -d: add a delay in usec to the User Fault\n" " FD handler to simulate demand paging\n" " overheads. Ignored without -u.\n"); + printf(" -r: Set the number of reader threads per uffd.\n"); printf(" -b: specify the size of the memory region which should be\n" " demand paged by each vCPU. e.g. 
10M or 3G.\n" " Default: 1G\n"); @@ -234,12 +274,14 @@ int main(int argc, char *argv[]) struct test_params p = { .src_type = DEFAULT_VM_MEM_SRC, .partition_vcpu_memory_access = true, + .readers_per_uffd = 1, + .single_uffd = false, }; int opt; guest_modes_append_default(); - while ((opt = getopt(argc, argv, "hm:u:d:b:s:v:c:o")) != -1) { + while ((opt = getopt(argc, argv, "ahom:u:d:b:s:v:c:r:")) != -1) { switch (opt) { case 'm': guest_modes_cmdline(optarg); @@ -251,6 +293,9 @@ int main(int argc, char *argv[]) p.uffd_mode = UFFDIO_REGISTER_MODE_MINOR; TEST_ASSERT(p.uffd_mode, "UFFD mode must be 'MISSING' or 'MINOR'."); break; + case 'a': + p.single_uffd = true; + break; case 'd': p.uffd_delay = strtoul(optarg, NULL, 0); TEST_ASSERT(p.uffd_delay >= 0, "A negative UFFD delay is not supported."); @@ -271,6 +316,13 @@ int main(int argc, char *argv[]) break; case 'o': p.partition_vcpu_memory_access = false; + p.single_uffd = true; + break; + case 'r': + p.readers_per_uffd = atoi(optarg); + TEST_ASSERT(p.readers_per_uffd >= 1, + "Invalid number of readers per uffd %d: must be >=1", + p.readers_per_uffd); break; case 'h': default: diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 504f6fe980..9f24303acb 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -18,13 +18,11 @@ #include "test_util.h" #include "memstress.h" #include "guest_modes.h" +#include "ucall_common.h" #ifdef __aarch64__ #include "aarch64/vgic.h" -#define GICD_BASE_GPA 0x8000000ULL -#define GICR_BASE_GPA 0x80A0000ULL - static int gic_fd; static void arch_setup_vm(struct kvm_vm *vm, unsigned int nr_vcpus) @@ -33,7 +31,7 @@ static void arch_setup_vm(struct kvm_vm *vm, unsigned int nr_vcpus) * The test can still run even if hardware does not support GICv3, as it * is only an optimization to reduce guest exits. 
*/ - gic_fd = vgic_v3_setup(vm, nr_vcpus, 64, GICD_BASE_GPA, GICR_BASE_GPA); + gic_fd = vgic_v3_setup(vm, nr_vcpus, 64); } static void arch_cleanup_vm(struct kvm_vm *vm) @@ -132,7 +130,6 @@ struct test_params { enum vm_mem_backing_src_type backing_src; int slots; uint32_t write_percent; - uint32_t random_seed; bool random_access; }; @@ -156,8 +153,6 @@ static void run_test(enum vm_guest_mode mode, void *arg) p->slots, p->backing_src, p->partition_vcpu_memory_access); - pr_info("Random seed: %u\n", p->random_seed); - memstress_set_random_seed(vm, p->random_seed); memstress_set_write_percent(vm, p->write_percent); guest_num_pages = (nr_vcpus * guest_percpu_mem_size) >> vm->page_shift; @@ -346,11 +341,13 @@ int main(int argc, char *argv[]) .partition_vcpu_memory_access = true, .backing_src = DEFAULT_VM_MEM_SRC, .slots = 1, - .random_seed = 1, .write_percent = 100, }; int opt; + /* Override the seed to be deterministic by default. */ + guest_random_seed = 1; + dirty_log_manual_caps = kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | @@ -395,7 +392,7 @@ int main(int argc, char *argv[]) p.phys_offset = strtoull(optarg, NULL, 0); break; case 'r': - p.random_seed = atoi_positive("Random seed", optarg); + guest_random_seed = atoi_positive("Random seed", optarg); break; case 's': p.backing_src = parse_backing_src_type(optarg); diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index eaad5b2085..aacf80f574 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -4,9 +4,6 @@ * * Copyright (C) 2018, Red Hat, Inc. 
*/ - -#define _GNU_SOURCE /* for program_invocation_name */ - #include #include #include @@ -23,6 +20,7 @@ #include "test_util.h" #include "guest_modes.h" #include "processor.h" +#include "ucall_common.h" #define DIRTY_MEM_BITS 30 /* 1G */ #define PAGE_SHIFT_4K 12 @@ -76,7 +74,6 @@ static uint64_t host_page_size; static uint64_t guest_page_size; static uint64_t guest_num_pages; -static uint64_t random_array[TEST_PAGES_PER_LOOP]; static uint64_t iteration; /* @@ -109,19 +106,19 @@ static void guest_code(void) */ for (i = 0; i < guest_num_pages; i++) { addr = guest_test_virt_mem + i * guest_page_size; - *(uint64_t *)addr = READ_ONCE(iteration); + vcpu_arch_put_guest(*(uint64_t *)addr, READ_ONCE(iteration)); } while (true) { for (i = 0; i < TEST_PAGES_PER_LOOP; i++) { addr = guest_test_virt_mem; - addr += (READ_ONCE(random_array[i]) % guest_num_pages) + addr += (guest_random_u64(&guest_rng) % guest_num_pages) * guest_page_size; addr = align_down(addr, host_page_size); - *(uint64_t *)addr = READ_ONCE(iteration); + + vcpu_arch_put_guest(*(uint64_t *)addr, READ_ONCE(iteration)); } - /* Tell the host that we need more random numbers */ GUEST_SYNC(1); } } @@ -508,20 +505,10 @@ static void log_mode_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err) mode->after_vcpu_run(vcpu, ret, err); } -static void generate_random_array(uint64_t *guest_array, uint64_t size) -{ - uint64_t i; - - for (i = 0; i < size; i++) - guest_array[i] = random(); -} - static void *vcpu_worker(void *data) { int ret; struct kvm_vcpu *vcpu = data; - struct kvm_vm *vm = vcpu->vm; - uint64_t *guest_array; uint64_t pages_count = 0; struct kvm_signal_mask *sigmask = alloca(offsetof(struct kvm_signal_mask, sigset) + sizeof(sigset_t)); @@ -540,11 +527,8 @@ static void *vcpu_worker(void *data) sigemptyset(sigset); sigaddset(sigset, SIG_IPI); - guest_array = addr_gva2hva(vm, (vm_vaddr_t)random_array); - while (!READ_ONCE(host_quit)) { /* Clear any existing kick signals */ - generate_random_array(guest_array, 
TEST_PAGES_PER_LOOP); pages_count += TEST_PAGES_PER_LOOP; /* Let the guest dirty the random pages */ ret = __vcpu_run(vcpu); diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index 92eae206ba..ba0c8e9960 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -4,8 +4,6 @@ * * Author: Chao Peng */ - -#define _GNU_SOURCE #include #include #include @@ -19,8 +17,8 @@ #include #include +#include "kvm_util.h" #include "test_util.h" -#include "kvm_util_base.h" static void test_file_read_write(int fd) { diff --git a/tools/testing/selftests/kvm/guest_print_test.c b/tools/testing/selftests/kvm/guest_print_test.c index 3502caa359..8092c2d0f5 100644 --- a/tools/testing/selftests/kvm/guest_print_test.c +++ b/tools/testing/selftests/kvm/guest_print_test.c @@ -13,6 +13,7 @@ #include "test_util.h" #include "kvm_util.h" #include "processor.h" +#include "ucall_common.h" struct guest_vals { uint64_t a; diff --git a/tools/testing/selftests/kvm/hardware_disable_test.c b/tools/testing/selftests/kvm/hardware_disable_test.c index decc521fc7..bce73bcb97 100644 --- a/tools/testing/selftests/kvm/hardware_disable_test.c +++ b/tools/testing/selftests/kvm/hardware_disable_test.c @@ -4,9 +4,6 @@ * kvm_arch_hardware_disable is called and it attempts to unregister the user * return notifiers. */ - -#define _GNU_SOURCE - #include #include #include diff --git a/tools/testing/selftests/kvm/include/aarch64/gic.h b/tools/testing/selftests/kvm/include/aarch64/gic.h index b217ea17ca..baeb3c8593 100644 --- a/tools/testing/selftests/kvm/include/aarch64/gic.h +++ b/tools/testing/selftests/kvm/include/aarch64/gic.h @@ -6,11 +6,26 @@ #ifndef SELFTEST_KVM_GIC_H #define SELFTEST_KVM_GIC_H +#include + enum gic_type { GIC_V3, GIC_TYPE_MAX, }; +/* + * Note that the redistributor frames are at the end, as the range scales + * with the number of vCPUs in the VM. 
+ */ +#define GITS_BASE_GPA 0x8000000ULL +#define GICD_BASE_GPA (GITS_BASE_GPA + KVM_VGIC_V3_ITS_SIZE) +#define GICR_BASE_GPA (GICD_BASE_GPA + KVM_VGIC_V3_DIST_SIZE) + +/* The GIC is identity-mapped into the guest at the time of setup. */ +#define GITS_BASE_GVA ((volatile void *)GITS_BASE_GPA) +#define GICD_BASE_GVA ((volatile void *)GICD_BASE_GPA) +#define GICR_BASE_GVA ((volatile void *)GICR_BASE_GPA) + #define MIN_SGI 0 #define MIN_PPI 16 #define MIN_SPI 32 @@ -21,8 +36,7 @@ enum gic_type { #define INTID_IS_PPI(intid) (MIN_PPI <= (intid) && (intid) < MIN_SPI) #define INTID_IS_SPI(intid) (MIN_SPI <= (intid) && (intid) <= MAX_SPI) -void gic_init(enum gic_type type, unsigned int nr_cpus, - void *dist_base, void *redist_base); +void gic_init(enum gic_type type, unsigned int nr_cpus); void gic_irq_enable(unsigned int intid); void gic_irq_disable(unsigned int intid); unsigned int gic_get_and_ack_irq(void); @@ -44,4 +58,7 @@ void gic_irq_clear_pending(unsigned int intid); bool gic_irq_get_pending(unsigned int intid); void gic_irq_set_config(unsigned int intid, bool is_edge); +void gic_rdist_enable_lpis(vm_paddr_t cfg_table, size_t cfg_table_size, + vm_paddr_t pend_table); + #endif /* SELFTEST_KVM_GIC_H */ diff --git a/tools/testing/selftests/kvm/include/aarch64/gic_v3.h b/tools/testing/selftests/kvm/include/aarch64/gic_v3.h index ba0886e8a2..a76615fa39 100644 --- a/tools/testing/selftests/kvm/include/aarch64/gic_v3.h +++ b/tools/testing/selftests/kvm/include/aarch64/gic_v3.h @@ -1,82 +1,604 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0-only */ /* - * ARM Generic Interrupt Controller (GIC) v3 specific defines + * Copyright (C) 2013, 2014 ARM Limited, All Rights Reserved. + * Author: Marc Zyngier */ - -#ifndef SELFTEST_KVM_GICV3_H -#define SELFTEST_KVM_GICV3_H - -#include +#ifndef __SELFTESTS_GIC_V3_H +#define __SELFTESTS_GIC_V3_H /* - * Distributor registers + * Distributor registers. 
We assume we're running non-secure, with ARE + * being set. Secure-only and non-ARE registers are not described. */ #define GICD_CTLR 0x0000 #define GICD_TYPER 0x0004 +#define GICD_IIDR 0x0008 +#define GICD_TYPER2 0x000C +#define GICD_STATUSR 0x0010 +#define GICD_SETSPI_NSR 0x0040 +#define GICD_CLRSPI_NSR 0x0048 +#define GICD_SETSPI_SR 0x0050 +#define GICD_CLRSPI_SR 0x0058 #define GICD_IGROUPR 0x0080 #define GICD_ISENABLER 0x0100 #define GICD_ICENABLER 0x0180 #define GICD_ISPENDR 0x0200 #define GICD_ICPENDR 0x0280 -#define GICD_ICACTIVER 0x0380 #define GICD_ISACTIVER 0x0300 +#define GICD_ICACTIVER 0x0380 #define GICD_IPRIORITYR 0x0400 #define GICD_ICFGR 0x0C00 +#define GICD_IGRPMODR 0x0D00 +#define GICD_NSACR 0x0E00 +#define GICD_IGROUPRnE 0x1000 +#define GICD_ISENABLERnE 0x1200 +#define GICD_ICENABLERnE 0x1400 +#define GICD_ISPENDRnE 0x1600 +#define GICD_ICPENDRnE 0x1800 +#define GICD_ISACTIVERnE 0x1A00 +#define GICD_ICACTIVERnE 0x1C00 +#define GICD_IPRIORITYRnE 0x2000 +#define GICD_ICFGRnE 0x3000 +#define GICD_IROUTER 0x6000 +#define GICD_IROUTERnE 0x8000 +#define GICD_IDREGS 0xFFD0 +#define GICD_PIDR2 0xFFE8 + +#define ESPI_BASE_INTID 4096 /* - * The assumption is that the guest runs in a non-secure mode. - * The following bits of GICD_CTLR are defined accordingly. + * Those registers are actually from GICv2, but the spec demands that they + * are implemented as RES0 if ARE is 1 (which we do in KVM's emulated GICv3). 
*/ +#define GICD_ITARGETSR 0x0800 +#define GICD_SGIR 0x0F00 +#define GICD_CPENDSGIR 0x0F10 +#define GICD_SPENDSGIR 0x0F20 + #define GICD_CTLR_RWP (1U << 31) #define GICD_CTLR_nASSGIreq (1U << 8) +#define GICD_CTLR_DS (1U << 6) #define GICD_CTLR_ARE_NS (1U << 4) #define GICD_CTLR_ENABLE_G1A (1U << 1) #define GICD_CTLR_ENABLE_G1 (1U << 0) +#define GICD_IIDR_IMPLEMENTER_SHIFT 0 +#define GICD_IIDR_IMPLEMENTER_MASK (0xfff << GICD_IIDR_IMPLEMENTER_SHIFT) +#define GICD_IIDR_REVISION_SHIFT 12 +#define GICD_IIDR_REVISION_MASK (0xf << GICD_IIDR_REVISION_SHIFT) +#define GICD_IIDR_VARIANT_SHIFT 16 +#define GICD_IIDR_VARIANT_MASK (0xf << GICD_IIDR_VARIANT_SHIFT) +#define GICD_IIDR_PRODUCT_ID_SHIFT 24 +#define GICD_IIDR_PRODUCT_ID_MASK (0xff << GICD_IIDR_PRODUCT_ID_SHIFT) + + +/* + * In systems with a single security state (what we emulate in KVM) + * the meaning of the interrupt group enable bits is slightly different + */ +#define GICD_CTLR_ENABLE_SS_G1 (1U << 1) +#define GICD_CTLR_ENABLE_SS_G0 (1U << 0) + +#define GICD_TYPER_RSS (1U << 26) +#define GICD_TYPER_LPIS (1U << 17) +#define GICD_TYPER_MBIS (1U << 16) +#define GICD_TYPER_ESPI (1U << 8) + +#define GICD_TYPER_ID_BITS(typer) ((((typer) >> 19) & 0x1f) + 1) +#define GICD_TYPER_NUM_LPIS(typer) ((((typer) >> 11) & 0x1f) + 1) #define GICD_TYPER_SPIS(typer) ((((typer) & 0x1f) + 1) * 32) -#define GICD_INT_DEF_PRI_X4 0xa0a0a0a0 +#define GICD_TYPER_ESPIS(typer) \ + (((typer) & GICD_TYPER_ESPI) ? 
GICD_TYPER_SPIS((typer) >> 27) : 0) + +#define GICD_TYPER2_nASSGIcap (1U << 8) +#define GICD_TYPER2_VIL (1U << 7) +#define GICD_TYPER2_VID GENMASK(4, 0) + +#define GICD_IROUTER_SPI_MODE_ONE (0U << 31) +#define GICD_IROUTER_SPI_MODE_ANY (1U << 31) + +#define GIC_PIDR2_ARCH_MASK 0xf0 +#define GIC_PIDR2_ARCH_GICv3 0x30 +#define GIC_PIDR2_ARCH_GICv4 0x40 + +#define GIC_V3_DIST_SIZE 0x10000 + +#define GIC_PAGE_SIZE_4K 0ULL +#define GIC_PAGE_SIZE_16K 1ULL +#define GIC_PAGE_SIZE_64K 2ULL +#define GIC_PAGE_SIZE_MASK 3ULL /* - * Redistributor registers + * Re-Distributor registers, offsets from RD_base */ -#define GICR_CTLR 0x000 -#define GICR_WAKER 0x014 +#define GICR_CTLR GICD_CTLR +#define GICR_IIDR 0x0004 +#define GICR_TYPER 0x0008 +#define GICR_STATUSR GICD_STATUSR +#define GICR_WAKER 0x0014 +#define GICR_SETLPIR 0x0040 +#define GICR_CLRLPIR 0x0048 +#define GICR_PROPBASER 0x0070 +#define GICR_PENDBASER 0x0078 +#define GICR_INVLPIR 0x00A0 +#define GICR_INVALLR 0x00B0 +#define GICR_SYNCR 0x00C0 +#define GICR_IDREGS GICD_IDREGS +#define GICR_PIDR2 GICD_PIDR2 + +#define GICR_CTLR_ENABLE_LPIS (1UL << 0) +#define GICR_CTLR_CES (1UL << 1) +#define GICR_CTLR_IR (1UL << 2) +#define GICR_CTLR_RWP (1UL << 3) -#define GICR_CTLR_RWP (1U << 3) +#define GICR_TYPER_CPU_NUMBER(r) (((r) >> 8) & 0xffff) + +#define EPPI_BASE_INTID 1056 + +#define GICR_TYPER_NR_PPIS(r) \ + ({ \ + unsigned int __ppinum = ((r) >> 27) & 0x1f; \ + unsigned int __nr_ppis = 16; \ + if (__ppinum == 1 || __ppinum == 2) \ + __nr_ppis += __ppinum * 32; \ + \ + __nr_ppis; \ + }) #define GICR_WAKER_ProcessorSleep (1U << 1) #define GICR_WAKER_ChildrenAsleep (1U << 2) +#define GIC_BASER_CACHE_nCnB 0ULL +#define GIC_BASER_CACHE_SameAsInner 0ULL +#define GIC_BASER_CACHE_nC 1ULL +#define GIC_BASER_CACHE_RaWt 2ULL +#define GIC_BASER_CACHE_RaWb 3ULL +#define GIC_BASER_CACHE_WaWt 4ULL +#define GIC_BASER_CACHE_WaWb 5ULL +#define GIC_BASER_CACHE_RaWaWt 6ULL +#define GIC_BASER_CACHE_RaWaWb 7ULL +#define GIC_BASER_CACHE_MASK 7ULL 
+#define GIC_BASER_NonShareable 0ULL +#define GIC_BASER_InnerShareable 1ULL +#define GIC_BASER_OuterShareable 2ULL +#define GIC_BASER_SHAREABILITY_MASK 3ULL + +#define GIC_BASER_CACHEABILITY(reg, inner_outer, type) \ + (GIC_BASER_CACHE_##type << reg##_##inner_outer##_CACHEABILITY_SHIFT) + +#define GIC_BASER_SHAREABILITY(reg, type) \ + (GIC_BASER_##type << reg##_SHAREABILITY_SHIFT) + +/* encode a size field of width @w containing @n - 1 units */ +#define GIC_ENCODE_SZ(n, w) (((unsigned long)(n) - 1) & GENMASK_ULL(((w) - 1), 0)) + +#define GICR_PROPBASER_SHAREABILITY_SHIFT (10) +#define GICR_PROPBASER_INNER_CACHEABILITY_SHIFT (7) +#define GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT (56) +#define GICR_PROPBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GICR_PROPBASER, SHAREABILITY_MASK) +#define GICR_PROPBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, MASK) +#define GICR_PROPBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, MASK) +#define GICR_PROPBASER_CACHEABILITY_MASK GICR_PROPBASER_INNER_CACHEABILITY_MASK + +#define GICR_PROPBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable) + +#define GICR_PROPBASER_nCnB GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nCnB) +#define GICR_PROPBASER_nC GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nC) +#define GICR_PROPBASER_RaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWt) +#define GICR_PROPBASER_RaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb) +#define GICR_PROPBASER_WaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWt) +#define GICR_PROPBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWb) +#define GICR_PROPBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWt) +#define GICR_PROPBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWb) + +#define GICR_PROPBASER_IDBITS_MASK (0x1f) +#define GICR_PROPBASER_ADDRESS(x) ((x) & GENMASK_ULL(51, 12)) +#define GICR_PENDBASER_ADDRESS(x) ((x) & 
GENMASK_ULL(51, 16)) + +#define GICR_PENDBASER_SHAREABILITY_SHIFT (10) +#define GICR_PENDBASER_INNER_CACHEABILITY_SHIFT (7) +#define GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT (56) +#define GICR_PENDBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GICR_PENDBASER, SHAREABILITY_MASK) +#define GICR_PENDBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, MASK) +#define GICR_PENDBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, MASK) +#define GICR_PENDBASER_CACHEABILITY_MASK GICR_PENDBASER_INNER_CACHEABILITY_MASK + +#define GICR_PENDBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable) + +#define GICR_PENDBASER_nCnB GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nCnB) +#define GICR_PENDBASER_nC GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nC) +#define GICR_PENDBASER_RaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWt) +#define GICR_PENDBASER_RaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb) +#define GICR_PENDBASER_WaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWt) +#define GICR_PENDBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWb) +#define GICR_PENDBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWt) +#define GICR_PENDBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWb) + +#define GICR_PENDBASER_PTZ BIT_ULL(62) + /* - * Redistributor registers, offsets from SGI base + * Re-Distributor registers, offsets from SGI_base */ #define GICR_IGROUPR0 GICD_IGROUPR #define GICR_ISENABLER0 GICD_ISENABLER #define GICR_ICENABLER0 GICD_ICENABLER #define GICR_ISPENDR0 GICD_ISPENDR +#define GICR_ICPENDR0 GICD_ICPENDR #define GICR_ISACTIVER0 GICD_ISACTIVER #define GICR_ICACTIVER0 GICD_ICACTIVER -#define GICR_ICENABLER GICD_ICENABLER -#define GICR_ICACTIVER GICD_ICACTIVER #define GICR_IPRIORITYR0 GICD_IPRIORITYR +#define GICR_ICFGR0 GICD_ICFGR +#define GICR_IGRPMODR0 GICD_IGRPMODR +#define GICR_NSACR GICD_NSACR + +#define GICR_TYPER_PLPIS (1U << 0) 
+#define GICR_TYPER_VLPIS (1U << 1) +#define GICR_TYPER_DIRTY (1U << 2) +#define GICR_TYPER_DirectLPIS (1U << 3) +#define GICR_TYPER_LAST (1U << 4) +#define GICR_TYPER_RVPEID (1U << 7) +#define GICR_TYPER_COMMON_LPI_AFF GENMASK_ULL(25, 24) +#define GICR_TYPER_AFFINITY GENMASK_ULL(63, 32) + +#define GICR_INVLPIR_INTID GENMASK_ULL(31, 0) +#define GICR_INVLPIR_VPEID GENMASK_ULL(47, 32) +#define GICR_INVLPIR_V GENMASK_ULL(63, 63) + +#define GICR_INVALLR_VPEID GICR_INVLPIR_VPEID +#define GICR_INVALLR_V GICR_INVLPIR_V + +#define GIC_V3_REDIST_SIZE 0x20000 + +#define LPI_PROP_GROUP1 (1 << 1) +#define LPI_PROP_ENABLED (1 << 0) + +/* + * Re-Distributor registers, offsets from VLPI_base + */ +#define GICR_VPROPBASER 0x0070 + +#define GICR_VPROPBASER_IDBITS_MASK 0x1f + +#define GICR_VPROPBASER_SHAREABILITY_SHIFT (10) +#define GICR_VPROPBASER_INNER_CACHEABILITY_SHIFT (7) +#define GICR_VPROPBASER_OUTER_CACHEABILITY_SHIFT (56) + +#define GICR_VPROPBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GICR_VPROPBASER, SHAREABILITY_MASK) +#define GICR_VPROPBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, MASK) +#define GICR_VPROPBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_VPROPBASER, OUTER, MASK) +#define GICR_VPROPBASER_CACHEABILITY_MASK \ + GICR_VPROPBASER_INNER_CACHEABILITY_MASK + +#define GICR_VPROPBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GICR_VPROPBASER, InnerShareable) + +#define GICR_VPROPBASER_nCnB GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, nCnB) +#define GICR_VPROPBASER_nC GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, nC) +#define GICR_VPROPBASER_RaWt GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, RaWt) +#define GICR_VPROPBASER_RaWb GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, RaWb) +#define GICR_VPROPBASER_WaWt GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, WaWt) +#define GICR_VPROPBASER_WaWb GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, WaWb) +#define GICR_VPROPBASER_RaWaWt 
GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, RaWaWt) +#define GICR_VPROPBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, RaWaWb) + +/* + * GICv4.1 VPROPBASER reinvention. A subtle mix between the old + * VPROPBASER and ITS_BASER. Just not quite any of the two. + */ +#define GICR_VPROPBASER_4_1_VALID (1ULL << 63) +#define GICR_VPROPBASER_4_1_ENTRY_SIZE GENMASK_ULL(61, 59) +#define GICR_VPROPBASER_4_1_INDIRECT (1ULL << 55) +#define GICR_VPROPBASER_4_1_PAGE_SIZE GENMASK_ULL(54, 53) +#define GICR_VPROPBASER_4_1_Z (1ULL << 52) +#define GICR_VPROPBASER_4_1_ADDR GENMASK_ULL(51, 12) +#define GICR_VPROPBASER_4_1_SIZE GENMASK_ULL(6, 0) + +#define GICR_VPENDBASER 0x0078 + +#define GICR_VPENDBASER_SHAREABILITY_SHIFT (10) +#define GICR_VPENDBASER_INNER_CACHEABILITY_SHIFT (7) +#define GICR_VPENDBASER_OUTER_CACHEABILITY_SHIFT (56) +#define GICR_VPENDBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GICR_VPENDBASER, SHAREABILITY_MASK) +#define GICR_VPENDBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, MASK) +#define GICR_VPENDBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_VPENDBASER, OUTER, MASK) +#define GICR_VPENDBASER_CACHEABILITY_MASK \ + GICR_VPENDBASER_INNER_CACHEABILITY_MASK + +#define GICR_VPENDBASER_NonShareable \ + GIC_BASER_SHAREABILITY(GICR_VPENDBASER, NonShareable) + +#define GICR_VPENDBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GICR_VPENDBASER, InnerShareable) + +#define GICR_VPENDBASER_nCnB GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, nCnB) +#define GICR_VPENDBASER_nC GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, nC) +#define GICR_VPENDBASER_RaWt GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, RaWt) +#define GICR_VPENDBASER_RaWb GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, RaWb) +#define GICR_VPENDBASER_WaWt GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, WaWt) +#define GICR_VPENDBASER_WaWb GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, WaWb) +#define GICR_VPENDBASER_RaWaWt 
GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, RaWaWt) +#define GICR_VPENDBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, RaWaWb) + +#define GICR_VPENDBASER_Dirty (1ULL << 60) +#define GICR_VPENDBASER_PendingLast (1ULL << 61) +#define GICR_VPENDBASER_IDAI (1ULL << 62) +#define GICR_VPENDBASER_Valid (1ULL << 63) + +/* + * GICv4.1 VPENDBASER, used for VPE residency. On top of these fields, + * also use the above Valid, PendingLast and Dirty. + */ +#define GICR_VPENDBASER_4_1_DB (1ULL << 62) +#define GICR_VPENDBASER_4_1_VGRP0EN (1ULL << 59) +#define GICR_VPENDBASER_4_1_VGRP1EN (1ULL << 58) +#define GICR_VPENDBASER_4_1_VPEID GENMASK_ULL(15, 0) + +#define GICR_VSGIR 0x0080 + +#define GICR_VSGIR_VPEID GENMASK(15, 0) + +#define GICR_VSGIPENDR 0x0088 + +#define GICR_VSGIPENDR_BUSY (1U << 31) +#define GICR_VSGIPENDR_PENDING GENMASK(15, 0) + +/* + * ITS registers, offsets from ITS_base + */ +#define GITS_CTLR 0x0000 +#define GITS_IIDR 0x0004 +#define GITS_TYPER 0x0008 +#define GITS_MPIDR 0x0018 +#define GITS_CBASER 0x0080 +#define GITS_CWRITER 0x0088 +#define GITS_CREADR 0x0090 +#define GITS_BASER 0x0100 +#define GITS_IDREGS_BASE 0xffd0 +#define GITS_PIDR0 0xffe0 +#define GITS_PIDR1 0xffe4 +#define GITS_PIDR2 GICR_PIDR2 +#define GITS_PIDR4 0xffd0 +#define GITS_CIDR0 0xfff0 +#define GITS_CIDR1 0xfff4 +#define GITS_CIDR2 0xfff8 +#define GITS_CIDR3 0xfffc + +#define GITS_TRANSLATER 0x10040 + +#define GITS_SGIR 0x20020 + +#define GITS_SGIR_VPEID GENMASK_ULL(47, 32) +#define GITS_SGIR_VINTID GENMASK_ULL(3, 0) + +#define GITS_CTLR_ENABLE (1U << 0) +#define GITS_CTLR_ImDe (1U << 1) +#define GITS_CTLR_ITS_NUMBER_SHIFT 4 +#define GITS_CTLR_ITS_NUMBER (0xFU << GITS_CTLR_ITS_NUMBER_SHIFT) +#define GITS_CTLR_QUIESCENT (1U << 31) + +#define GITS_TYPER_PLPIS (1UL << 0) +#define GITS_TYPER_VLPIS (1UL << 1) +#define GITS_TYPER_ITT_ENTRY_SIZE_SHIFT 4 +#define GITS_TYPER_ITT_ENTRY_SIZE GENMASK_ULL(7, 4) +#define GITS_TYPER_IDBITS_SHIFT 8 +#define GITS_TYPER_DEVBITS_SHIFT 13 
+#define GITS_TYPER_DEVBITS GENMASK_ULL(17, 13) +#define GITS_TYPER_PTA (1UL << 19) +#define GITS_TYPER_HCC_SHIFT 24 +#define GITS_TYPER_HCC(r) (((r) >> GITS_TYPER_HCC_SHIFT) & 0xff) +#define GITS_TYPER_VMOVP (1ULL << 37) +#define GITS_TYPER_VMAPP (1ULL << 40) +#define GITS_TYPER_SVPET GENMASK_ULL(42, 41) -/* CPU interface registers */ -#define SYS_ICC_PMR_EL1 sys_reg(3, 0, 4, 6, 0) -#define SYS_ICC_IAR1_EL1 sys_reg(3, 0, 12, 12, 0) -#define SYS_ICC_EOIR1_EL1 sys_reg(3, 0, 12, 12, 1) -#define SYS_ICC_DIR_EL1 sys_reg(3, 0, 12, 11, 1) -#define SYS_ICC_CTLR_EL1 sys_reg(3, 0, 12, 12, 4) -#define SYS_ICC_SRE_EL1 sys_reg(3, 0, 12, 12, 5) -#define SYS_ICC_GRPEN1_EL1 sys_reg(3, 0, 12, 12, 7) +#define GITS_IIDR_REV_SHIFT 12 +#define GITS_IIDR_REV_MASK (0xf << GITS_IIDR_REV_SHIFT) +#define GITS_IIDR_REV(r) (((r) >> GITS_IIDR_REV_SHIFT) & 0xf) +#define GITS_IIDR_PRODUCTID_SHIFT 24 -#define SYS_ICV_AP1R0_EL1 sys_reg(3, 0, 12, 9, 0) +#define GITS_CBASER_VALID (1ULL << 63) +#define GITS_CBASER_SHAREABILITY_SHIFT (10) +#define GITS_CBASER_INNER_CACHEABILITY_SHIFT (59) +#define GITS_CBASER_OUTER_CACHEABILITY_SHIFT (53) +#define GITS_CBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GITS_CBASER, SHAREABILITY_MASK) +#define GITS_CBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, MASK) +#define GITS_CBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_CBASER, OUTER, MASK) +#define GITS_CBASER_CACHEABILITY_MASK GITS_CBASER_INNER_CACHEABILITY_MASK -#define ICC_PMR_DEF_PRIO 0xf0 +#define GITS_CBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GITS_CBASER, InnerShareable) +#define GITS_CBASER_nCnB GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nCnB) +#define GITS_CBASER_nC GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nC) +#define GITS_CBASER_RaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWt) +#define GITS_CBASER_RaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWb) +#define GITS_CBASER_WaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWt) +#define 
GITS_CBASER_WaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWb) +#define GITS_CBASER_RaWaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWt) +#define GITS_CBASER_RaWaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWb) + +#define GITS_CBASER_ADDRESS(cbaser) ((cbaser) & GENMASK_ULL(51, 12)) + +#define GITS_BASER_NR_REGS 8 + +#define GITS_BASER_VALID (1ULL << 63) +#define GITS_BASER_INDIRECT (1ULL << 62) + +#define GITS_BASER_INNER_CACHEABILITY_SHIFT (59) +#define GITS_BASER_OUTER_CACHEABILITY_SHIFT (53) +#define GITS_BASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_BASER, INNER, MASK) +#define GITS_BASER_CACHEABILITY_MASK GITS_BASER_INNER_CACHEABILITY_MASK +#define GITS_BASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, MASK) +#define GITS_BASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GITS_BASER, SHAREABILITY_MASK) + +#define GITS_BASER_nCnB GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nCnB) +#define GITS_BASER_nC GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nC) +#define GITS_BASER_RaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWt) +#define GITS_BASER_RaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb) +#define GITS_BASER_WaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWt) +#define GITS_BASER_WaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWb) +#define GITS_BASER_RaWaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWt) +#define GITS_BASER_RaWaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWb) + +#define GITS_BASER_TYPE_SHIFT (56) +#define GITS_BASER_TYPE(r) (((r) >> GITS_BASER_TYPE_SHIFT) & 7) +#define GITS_BASER_ENTRY_SIZE_SHIFT (48) +#define GITS_BASER_ENTRY_SIZE(r) ((((r) >> GITS_BASER_ENTRY_SIZE_SHIFT) & 0x1f) + 1) +#define GITS_BASER_ENTRY_SIZE_MASK GENMASK_ULL(52, 48) +#define GITS_BASER_PHYS_52_to_48(phys) \ + (((phys) & GENMASK_ULL(47, 16)) | (((phys) >> 48) & 0xf) << 12) +#define GITS_BASER_ADDR_48_to_52(baser) \ + (((baser) & GENMASK_ULL(47, 16)) | (((baser) >> 12) & 0xf) << 48) + +#define 
GITS_BASER_SHAREABILITY_SHIFT (10) +#define GITS_BASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) +#define GITS_BASER_PAGE_SIZE_SHIFT (8) +#define __GITS_BASER_PSZ(sz) (GIC_PAGE_SIZE_ ## sz << GITS_BASER_PAGE_SIZE_SHIFT) +#define GITS_BASER_PAGE_SIZE_4K __GITS_BASER_PSZ(4K) +#define GITS_BASER_PAGE_SIZE_16K __GITS_BASER_PSZ(16K) +#define GITS_BASER_PAGE_SIZE_64K __GITS_BASER_PSZ(64K) +#define GITS_BASER_PAGE_SIZE_MASK __GITS_BASER_PSZ(MASK) +#define GITS_BASER_PAGES_MAX 256 +#define GITS_BASER_PAGES_SHIFT (0) +#define GITS_BASER_NR_PAGES(r) (((r) & 0xff) + 1) + +#define GITS_BASER_TYPE_NONE 0 +#define GITS_BASER_TYPE_DEVICE 1 +#define GITS_BASER_TYPE_VCPU 2 +#define GITS_BASER_TYPE_RESERVED3 3 +#define GITS_BASER_TYPE_COLLECTION 4 +#define GITS_BASER_TYPE_RESERVED5 5 +#define GITS_BASER_TYPE_RESERVED6 6 +#define GITS_BASER_TYPE_RESERVED7 7 + +#define GITS_LVL1_ENTRY_SIZE (8UL) + +/* + * ITS commands + */ +#define GITS_CMD_MAPD 0x08 +#define GITS_CMD_MAPC 0x09 +#define GITS_CMD_MAPTI 0x0a +#define GITS_CMD_MAPI 0x0b +#define GITS_CMD_MOVI 0x01 +#define GITS_CMD_DISCARD 0x0f +#define GITS_CMD_INV 0x0c +#define GITS_CMD_MOVALL 0x0e +#define GITS_CMD_INVALL 0x0d +#define GITS_CMD_INT 0x03 +#define GITS_CMD_CLEAR 0x04 +#define GITS_CMD_SYNC 0x05 + +/* + * GICv4 ITS specific commands + */ +#define GITS_CMD_GICv4(x) ((x) | 0x20) +#define GITS_CMD_VINVALL GITS_CMD_GICv4(GITS_CMD_INVALL) +#define GITS_CMD_VMAPP GITS_CMD_GICv4(GITS_CMD_MAPC) +#define GITS_CMD_VMAPTI GITS_CMD_GICv4(GITS_CMD_MAPTI) +#define GITS_CMD_VMOVI GITS_CMD_GICv4(GITS_CMD_MOVI) +#define GITS_CMD_VSYNC GITS_CMD_GICv4(GITS_CMD_SYNC) +/* VMOVP, VSGI and INVDB are the odd ones, as they dont have a physical counterpart */ +#define GITS_CMD_VMOVP GITS_CMD_GICv4(2) +#define GITS_CMD_VSGI GITS_CMD_GICv4(3) +#define GITS_CMD_INVDB GITS_CMD_GICv4(0xe) + +/* + * ITS error numbers + */ +#define E_ITS_MOVI_UNMAPPED_INTERRUPT 0x010107 +#define E_ITS_MOVI_UNMAPPED_COLLECTION 0x010109 
+#define E_ITS_INT_UNMAPPED_INTERRUPT 0x010307 +#define E_ITS_CLEAR_UNMAPPED_INTERRUPT 0x010507 +#define E_ITS_MAPD_DEVICE_OOR 0x010801 +#define E_ITS_MAPD_ITTSIZE_OOR 0x010802 +#define E_ITS_MAPC_PROCNUM_OOR 0x010902 +#define E_ITS_MAPC_COLLECTION_OOR 0x010903 +#define E_ITS_MAPTI_UNMAPPED_DEVICE 0x010a04 +#define E_ITS_MAPTI_ID_OOR 0x010a05 +#define E_ITS_MAPTI_PHYSICALID_OOR 0x010a06 +#define E_ITS_INV_UNMAPPED_INTERRUPT 0x010c07 +#define E_ITS_INVALL_UNMAPPED_COLLECTION 0x010d09 +#define E_ITS_MOVALL_PROCNUM_OOR 0x010e01 +#define E_ITS_DISCARD_UNMAPPED_INTERRUPT 0x010f07 + +/* + * CPU interface registers + */ +#define ICC_CTLR_EL1_EOImode_SHIFT (1) +#define ICC_CTLR_EL1_EOImode_drop_dir (0U << ICC_CTLR_EL1_EOImode_SHIFT) +#define ICC_CTLR_EL1_EOImode_drop (1U << ICC_CTLR_EL1_EOImode_SHIFT) +#define ICC_CTLR_EL1_EOImode_MASK (1 << ICC_CTLR_EL1_EOImode_SHIFT) +#define ICC_CTLR_EL1_CBPR_SHIFT 0 +#define ICC_CTLR_EL1_CBPR_MASK (1 << ICC_CTLR_EL1_CBPR_SHIFT) +#define ICC_CTLR_EL1_PMHE_SHIFT 6 +#define ICC_CTLR_EL1_PMHE_MASK (1 << ICC_CTLR_EL1_PMHE_SHIFT) +#define ICC_CTLR_EL1_PRI_BITS_SHIFT 8 +#define ICC_CTLR_EL1_PRI_BITS_MASK (0x7 << ICC_CTLR_EL1_PRI_BITS_SHIFT) +#define ICC_CTLR_EL1_ID_BITS_SHIFT 11 +#define ICC_CTLR_EL1_ID_BITS_MASK (0x7 << ICC_CTLR_EL1_ID_BITS_SHIFT) +#define ICC_CTLR_EL1_SEIS_SHIFT 14 +#define ICC_CTLR_EL1_SEIS_MASK (0x1 << ICC_CTLR_EL1_SEIS_SHIFT) +#define ICC_CTLR_EL1_A3V_SHIFT 15 +#define ICC_CTLR_EL1_A3V_MASK (0x1 << ICC_CTLR_EL1_A3V_SHIFT) +#define ICC_CTLR_EL1_RSS (0x1 << 18) +#define ICC_CTLR_EL1_ExtRange (0x1 << 19) +#define ICC_PMR_EL1_SHIFT 0 +#define ICC_PMR_EL1_MASK (0xff << ICC_PMR_EL1_SHIFT) +#define ICC_BPR0_EL1_SHIFT 0 +#define ICC_BPR0_EL1_MASK (0x7 << ICC_BPR0_EL1_SHIFT) +#define ICC_BPR1_EL1_SHIFT 0 +#define ICC_BPR1_EL1_MASK (0x7 << ICC_BPR1_EL1_SHIFT) +#define ICC_IGRPEN0_EL1_SHIFT 0 +#define ICC_IGRPEN0_EL1_MASK (1 << ICC_IGRPEN0_EL1_SHIFT) +#define ICC_IGRPEN1_EL1_SHIFT 0 +#define ICC_IGRPEN1_EL1_MASK (1 << 
ICC_IGRPEN1_EL1_SHIFT) +#define ICC_SRE_EL1_DIB (1U << 2) +#define ICC_SRE_EL1_DFB (1U << 1) #define ICC_SRE_EL1_SRE (1U << 0) -#define ICC_IGRPEN1_EL1_ENABLE (1U << 0) +/* These are for GICv2 emulation only */ +#define GICH_LR_VIRTUALID (0x3ffUL << 0) +#define GICH_LR_PHYSID_CPUID_SHIFT (10) +#define GICH_LR_PHYSID_CPUID (7UL << GICH_LR_PHYSID_CPUID_SHIFT) + +#define ICC_IAR1_EL1_SPURIOUS 0x3ff + +#define ICC_SRE_EL2_SRE (1 << 0) +#define ICC_SRE_EL2_ENABLE (1 << 3) -#define GICV3_MAX_CPUS 512 +#define ICC_SGI1R_TARGET_LIST_SHIFT 0 +#define ICC_SGI1R_TARGET_LIST_MASK (0xffff << ICC_SGI1R_TARGET_LIST_SHIFT) +#define ICC_SGI1R_AFFINITY_1_SHIFT 16 +#define ICC_SGI1R_AFFINITY_1_MASK (0xff << ICC_SGI1R_AFFINITY_1_SHIFT) +#define ICC_SGI1R_SGI_ID_SHIFT 24 +#define ICC_SGI1R_SGI_ID_MASK (0xfULL << ICC_SGI1R_SGI_ID_SHIFT) +#define ICC_SGI1R_AFFINITY_2_SHIFT 32 +#define ICC_SGI1R_AFFINITY_2_MASK (0xffULL << ICC_SGI1R_AFFINITY_2_SHIFT) +#define ICC_SGI1R_IRQ_ROUTING_MODE_BIT 40 +#define ICC_SGI1R_RS_SHIFT 44 +#define ICC_SGI1R_RS_MASK (0xfULL << ICC_SGI1R_RS_SHIFT) +#define ICC_SGI1R_AFFINITY_3_SHIFT 48 +#define ICC_SGI1R_AFFINITY_3_MASK (0xffULL << ICC_SGI1R_AFFINITY_3_SHIFT) -#endif /* SELFTEST_KVM_GICV3_H */ +#endif diff --git a/tools/testing/selftests/kvm/include/aarch64/gic_v3_its.h b/tools/testing/selftests/kvm/include/aarch64/gic_v3_its.h new file mode 100644 index 0000000000..3722ed9c8f --- /dev/null +++ b/tools/testing/selftests/kvm/include/aarch64/gic_v3_its.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __SELFTESTS_GIC_V3_ITS_H__ +#define __SELFTESTS_GIC_V3_ITS_H__ + +#include + +void its_init(vm_paddr_t coll_tbl, size_t coll_tbl_sz, + vm_paddr_t device_tbl, size_t device_tbl_sz, + vm_paddr_t cmdq, size_t cmdq_size); + +void its_send_mapd_cmd(void *cmdq_base, u32 device_id, vm_paddr_t itt_base, + size_t itt_size, bool valid); +void its_send_mapc_cmd(void *cmdq_base, u32 vcpu_id, u32 collection_id, bool valid); +void its_send_mapti_cmd(void 
*cmdq_base, u32 device_id, u32 event_id, + u32 collection_id, u32 intid); +void its_send_invall_cmd(void *cmdq_base, u32 collection_id); + +#endif // __SELFTESTS_GIC_V3_ITS_H__ diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h index 9e518b5628..9b20a355d8 100644 --- a/tools/testing/selftests/kvm/include/aarch64/processor.h +++ b/tools/testing/selftests/kvm/include/aarch64/processor.h @@ -8,6 +8,8 @@ #define SELFTEST_KVM_PROCESSOR_H #include "kvm_util.h" +#include "ucall_common.h" + #include #include #include @@ -58,8 +60,6 @@ MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL) | \ MAIR_ATTRIDX(MAIR_ATTR_NORMAL_WT, MT_NORMAL_WT)) -#define MPIDR_HWID_BITMASK (0xff00fffffful) - void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init); struct kvm_vcpu *aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, struct kvm_vcpu_init *init, void *guest_code); @@ -177,11 +177,28 @@ static __always_inline u32 __raw_readl(const volatile void *addr) return val; } +static __always_inline void __raw_writeq(u64 val, volatile void *addr) +{ + asm volatile("str %0, [%1]" : : "rZ" (val), "r" (addr)); +} + +static __always_inline u64 __raw_readq(const volatile void *addr) +{ + u64 val; + asm volatile("ldr %0, [%1]" : "=r" (val) : "r" (addr)); + return val; +} + #define writel_relaxed(v,c) ((void)__raw_writel((__force u32)cpu_to_le32(v),(c))) #define readl_relaxed(c) ({ u32 __r = le32_to_cpu((__force __le32)__raw_readl(c)); __r; }) +#define writeq_relaxed(v,c) ((void)__raw_writeq((__force u64)cpu_to_le64(v),(c))) +#define readq_relaxed(c) ({ u64 __r = le64_to_cpu((__force __le64)__raw_readq(c)); __r; }) #define writel(v,c) ({ __iowmb(); writel_relaxed((v),(c));}) #define readl(c) ({ u32 __v = readl_relaxed(c); __iormb(__v); __v; }) +#define writeq(v,c) ({ __iowmb(); writeq_relaxed((v),(c));}) +#define readq(c) ({ u64 __v = readq_relaxed(c); __iormb(__v); __v; }) + static inline void local_irq_enable(void) 
{ diff --git a/tools/testing/selftests/kvm/include/aarch64/ucall.h b/tools/testing/selftests/kvm/include/aarch64/ucall.h index 4b68f37efd..4ec801f37f 100644 --- a/tools/testing/selftests/kvm/include/aarch64/ucall.h +++ b/tools/testing/selftests/kvm/include/aarch64/ucall.h @@ -2,7 +2,7 @@ #ifndef SELFTEST_KVM_UCALL_H #define SELFTEST_KVM_UCALL_H -#include "kvm_util_base.h" +#include "kvm_util.h" #define UCALL_EXIT_REASON KVM_EXIT_MMIO diff --git a/tools/testing/selftests/kvm/include/aarch64/vgic.h b/tools/testing/selftests/kvm/include/aarch64/vgic.h index 0ac6f05c63..c481d0c00a 100644 --- a/tools/testing/selftests/kvm/include/aarch64/vgic.h +++ b/tools/testing/selftests/kvm/include/aarch64/vgic.h @@ -16,8 +16,7 @@ ((uint64_t)(flags) << 12) | \ index) -int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs, - uint64_t gicd_base_gpa, uint64_t gicr_base_gpa); +int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs); #define VGIC_MAX_RESERVED 1023 @@ -33,4 +32,6 @@ void kvm_irq_write_isactiver(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu); #define KVM_IRQCHIP_NUM_PINS (1020 - 32) +int vgic_its_setup(struct kvm_vm *vm); + #endif // SELFTEST_KVM_VGIC_H diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index c9286811a4..63c2aaae51 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -1,13 +1,1116 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * tools/testing/selftests/kvm/include/kvm_util.h - * * Copyright (C) 2018, Google LLC. 
*/ #ifndef SELFTEST_KVM_UTIL_H #define SELFTEST_KVM_UTIL_H -#include "kvm_util_base.h" -#include "ucall_common.h" +#include "test_util.h" + +#include +#include "linux/hashtable.h" +#include "linux/list.h" +#include +#include +#include "linux/rbtree.h" +#include + +#include +#include + +#include + +#include "kvm_util_arch.h" +#include "kvm_util_types.h" +#include "sparsebit.h" + +#define KVM_DEV_PATH "/dev/kvm" +#define KVM_MAX_VCPUS 512 + +#define NSEC_PER_SEC 1000000000L + +struct userspace_mem_region { + struct kvm_userspace_memory_region2 region; + struct sparsebit *unused_phy_pages; + struct sparsebit *protected_phy_pages; + int fd; + off_t offset; + enum vm_mem_backing_src_type backing_src_type; + void *host_mem; + void *host_alias; + void *mmap_start; + void *mmap_alias; + size_t mmap_size; + struct rb_node gpa_node; + struct rb_node hva_node; + struct hlist_node slot_node; +}; + +struct kvm_vcpu { + struct list_head list; + uint32_t id; + int fd; + struct kvm_vm *vm; + struct kvm_run *run; +#ifdef __x86_64__ + struct kvm_cpuid2 *cpuid; +#endif + struct kvm_dirty_gfn *dirty_gfns; + uint32_t fetch_index; + uint32_t dirty_gfns_count; +}; + +struct userspace_mem_regions { + struct rb_root gpa_tree; + struct rb_root hva_tree; + DECLARE_HASHTABLE(slot_hash, 9); +}; + +enum kvm_mem_region_type { + MEM_REGION_CODE, + MEM_REGION_DATA, + MEM_REGION_PT, + MEM_REGION_TEST_DATA, + NR_MEM_REGIONS, +}; + +struct kvm_vm { + int mode; + unsigned long type; + int kvm_fd; + int fd; + unsigned int pgtable_levels; + unsigned int page_size; + unsigned int page_shift; + unsigned int pa_bits; + unsigned int va_bits; + uint64_t max_gfn; + struct list_head vcpus; + struct userspace_mem_regions regions; + struct sparsebit *vpages_valid; + struct sparsebit *vpages_mapped; + bool has_irqchip; + bool pgd_created; + vm_paddr_t ucall_mmio_addr; + vm_paddr_t pgd; + vm_vaddr_t handlers; + uint32_t dirty_ring_size; + uint64_t gpa_tag_mask; + + struct kvm_vm_arch arch; + + /* Cache of 
information for binary stats interface */ + int stats_fd; + struct kvm_stats_header stats_header; + struct kvm_stats_desc *stats_desc; + + /* + * KVM region slots. These are the default memslots used by page + * allocators, e.g., lib/elf uses the memslots[MEM_REGION_CODE] + * memslot. + */ + uint32_t memslots[NR_MEM_REGIONS]; +}; + +struct vcpu_reg_sublist { + const char *name; + long capability; + int feature; + int feature_type; + bool finalize; + __u64 *regs; + __u64 regs_n; + __u64 *rejects_set; + __u64 rejects_set_n; + __u64 *skips_set; + __u64 skips_set_n; +}; + +struct vcpu_reg_list { + char *name; + struct vcpu_reg_sublist sublists[]; +}; + +#define for_each_sublist(c, s) \ + for ((s) = &(c)->sublists[0]; (s)->regs; ++(s)) + +#define kvm_for_each_vcpu(vm, i, vcpu) \ + for ((i) = 0; (i) <= (vm)->last_vcpu_id; (i)++) \ + if (!((vcpu) = vm->vcpus[i])) \ + continue; \ + else + +struct userspace_mem_region * +memslot2region(struct kvm_vm *vm, uint32_t memslot); + +static inline struct userspace_mem_region *vm_get_mem_region(struct kvm_vm *vm, + enum kvm_mem_region_type type) +{ + assert(type < NR_MEM_REGIONS); + return memslot2region(vm, vm->memslots[type]); +} + +/* Minimum allocated guest virtual and physical addresses */ +#define KVM_UTIL_MIN_VADDR 0x2000 +#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 + +#define DEFAULT_GUEST_STACK_VADDR_MIN 0xab6000 +#define DEFAULT_STACK_PGS 5 + +enum vm_guest_mode { + VM_MODE_P52V48_4K, + VM_MODE_P52V48_16K, + VM_MODE_P52V48_64K, + VM_MODE_P48V48_4K, + VM_MODE_P48V48_16K, + VM_MODE_P48V48_64K, + VM_MODE_P40V48_4K, + VM_MODE_P40V48_16K, + VM_MODE_P40V48_64K, + VM_MODE_PXXV48_4K, /* For 48bits VA but ANY bits PA */ + VM_MODE_P47V64_4K, + VM_MODE_P44V64_4K, + VM_MODE_P36V48_4K, + VM_MODE_P36V48_16K, + VM_MODE_P36V48_64K, + VM_MODE_P36V47_16K, + NUM_VM_MODES, +}; + +struct vm_shape { + uint32_t type; + uint8_t mode; + uint8_t pad0; + uint16_t pad1; +}; + +kvm_static_assert(sizeof(struct vm_shape) == sizeof(uint64_t)); + 
+#define VM_TYPE_DEFAULT 0 + +#define VM_SHAPE(__mode) \ +({ \ + struct vm_shape shape = { \ + .mode = (__mode), \ + .type = VM_TYPE_DEFAULT \ + }; \ + \ + shape; \ +}) + +#if defined(__aarch64__) + +extern enum vm_guest_mode vm_mode_default; + +#define VM_MODE_DEFAULT vm_mode_default +#define MIN_PAGE_SHIFT 12U +#define ptes_per_page(page_size) ((page_size) / 8) + +#elif defined(__x86_64__) + +#define VM_MODE_DEFAULT VM_MODE_PXXV48_4K +#define MIN_PAGE_SHIFT 12U +#define ptes_per_page(page_size) ((page_size) / 8) + +#elif defined(__s390x__) + +#define VM_MODE_DEFAULT VM_MODE_P44V64_4K +#define MIN_PAGE_SHIFT 12U +#define ptes_per_page(page_size) ((page_size) / 16) + +#elif defined(__riscv) + +#if __riscv_xlen == 32 +#error "RISC-V 32-bit kvm selftests not supported" +#endif + +#define VM_MODE_DEFAULT VM_MODE_P40V48_4K +#define MIN_PAGE_SHIFT 12U +#define ptes_per_page(page_size) ((page_size) / 8) + +#endif + +#define VM_SHAPE_DEFAULT VM_SHAPE(VM_MODE_DEFAULT) + +#define MIN_PAGE_SIZE (1U << MIN_PAGE_SHIFT) +#define PTES_PER_MIN_PAGE ptes_per_page(MIN_PAGE_SIZE) + +struct vm_guest_mode_params { + unsigned int pa_bits; + unsigned int va_bits; + unsigned int page_size; + unsigned int page_shift; +}; +extern const struct vm_guest_mode_params vm_guest_mode_params[]; + +int open_path_or_exit(const char *path, int flags); +int open_kvm_dev_path_or_exit(void); + +bool get_kvm_param_bool(const char *param); +bool get_kvm_intel_param_bool(const char *param); +bool get_kvm_amd_param_bool(const char *param); + +int get_kvm_param_integer(const char *param); +int get_kvm_intel_param_integer(const char *param); +int get_kvm_amd_param_integer(const char *param); + +unsigned int kvm_check_cap(long cap); + +static inline bool kvm_has_cap(long cap) +{ + return kvm_check_cap(cap); +} + +#define __KVM_SYSCALL_ERROR(_name, _ret) \ + "%s failed, rc: %i errno: %i (%s)", (_name), (_ret), errno, strerror(errno) + +/* + * Use the "inner", double-underscore macro when reporting errors from 
within + * other macros so that the name of ioctl() and not its literal numeric value + * is printed on error. The "outer" macro is strongly preferred when reporting + * errors "directly", i.e. without an additional layer of macros, as it reduces + * the probability of passing in the wrong string. + */ +#define __KVM_IOCTL_ERROR(_name, _ret) __KVM_SYSCALL_ERROR(_name, _ret) +#define KVM_IOCTL_ERROR(_ioctl, _ret) __KVM_IOCTL_ERROR(#_ioctl, _ret) + +#define kvm_do_ioctl(fd, cmd, arg) \ +({ \ + kvm_static_assert(!_IOC_SIZE(cmd) || sizeof(*arg) == _IOC_SIZE(cmd)); \ + ioctl(fd, cmd, arg); \ +}) + +#define __kvm_ioctl(kvm_fd, cmd, arg) \ + kvm_do_ioctl(kvm_fd, cmd, arg) + +#define kvm_ioctl(kvm_fd, cmd, arg) \ +({ \ + int ret = __kvm_ioctl(kvm_fd, cmd, arg); \ + \ + TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(#cmd, ret)); \ +}) + +static __always_inline void static_assert_is_vm(struct kvm_vm *vm) { } + +#define __vm_ioctl(vm, cmd, arg) \ +({ \ + static_assert_is_vm(vm); \ + kvm_do_ioctl((vm)->fd, cmd, arg); \ +}) + +/* + * Assert that a VM or vCPU ioctl() succeeded, with extra magic to detect if + * the ioctl() failed because KVM killed/bugged the VM. To detect a dead VM, + * probe KVM_CAP_USER_MEMORY, which (a) has been supported by KVM since before + * selftests existed and (b) should never outright fail, i.e. is supposed to + * return 0 or 1. If KVM kills a VM, KVM returns -EIO for all ioctl()s for the + * VM and its vCPUs, including KVM_CHECK_EXTENSION. 
+ */ +#define __TEST_ASSERT_VM_VCPU_IOCTL(cond, name, ret, vm) \ +do { \ + int __errno = errno; \ + \ + static_assert_is_vm(vm); \ + \ + if (cond) \ + break; \ + \ + if (errno == EIO && \ + __vm_ioctl(vm, KVM_CHECK_EXTENSION, (void *)KVM_CAP_USER_MEMORY) < 0) { \ + TEST_ASSERT(errno == EIO, "KVM killed the VM, should return -EIO"); \ + TEST_FAIL("KVM killed/bugged the VM, check the kernel log for clues"); \ + } \ + errno = __errno; \ + TEST_ASSERT(cond, __KVM_IOCTL_ERROR(name, ret)); \ +} while (0) + +#define TEST_ASSERT_VM_VCPU_IOCTL(cond, cmd, ret, vm) \ + __TEST_ASSERT_VM_VCPU_IOCTL(cond, #cmd, ret, vm) + +#define vm_ioctl(vm, cmd, arg) \ +({ \ + int ret = __vm_ioctl(vm, cmd, arg); \ + \ + __TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd, ret, vm); \ +}) + +static __always_inline void static_assert_is_vcpu(struct kvm_vcpu *vcpu) { } + +#define __vcpu_ioctl(vcpu, cmd, arg) \ +({ \ + static_assert_is_vcpu(vcpu); \ + kvm_do_ioctl((vcpu)->fd, cmd, arg); \ +}) + +#define vcpu_ioctl(vcpu, cmd, arg) \ +({ \ + int ret = __vcpu_ioctl(vcpu, cmd, arg); \ + \ + __TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd, ret, (vcpu)->vm); \ +}) + +/* + * Looks up and returns the value corresponding to the capability + * (KVM_CAP_*) given by cap. 
+ */ +static inline int vm_check_cap(struct kvm_vm *vm, long cap) +{ + int ret = __vm_ioctl(vm, KVM_CHECK_EXTENSION, (void *)cap); + + TEST_ASSERT_VM_VCPU_IOCTL(ret >= 0, KVM_CHECK_EXTENSION, ret, vm); + return ret; +} + +static inline int __vm_enable_cap(struct kvm_vm *vm, uint32_t cap, uint64_t arg0) +{ + struct kvm_enable_cap enable_cap = { .cap = cap, .args = { arg0 } }; + + return __vm_ioctl(vm, KVM_ENABLE_CAP, &enable_cap); +} +static inline void vm_enable_cap(struct kvm_vm *vm, uint32_t cap, uint64_t arg0) +{ + struct kvm_enable_cap enable_cap = { .cap = cap, .args = { arg0 } }; + + vm_ioctl(vm, KVM_ENABLE_CAP, &enable_cap); +} + +static inline void vm_set_memory_attributes(struct kvm_vm *vm, uint64_t gpa, + uint64_t size, uint64_t attributes) +{ + struct kvm_memory_attributes attr = { + .attributes = attributes, + .address = gpa, + .size = size, + .flags = 0, + }; + + /* + * KVM_SET_MEMORY_ATTRIBUTES overwrites _all_ attributes. These flows + * need significant enhancements to support multiple attributes. 
+ */ + TEST_ASSERT(!attributes || attributes == KVM_MEMORY_ATTRIBUTE_PRIVATE, + "Update me to support multiple attributes!"); + + vm_ioctl(vm, KVM_SET_MEMORY_ATTRIBUTES, &attr); +} + + +static inline void vm_mem_set_private(struct kvm_vm *vm, uint64_t gpa, + uint64_t size) +{ + vm_set_memory_attributes(vm, gpa, size, KVM_MEMORY_ATTRIBUTE_PRIVATE); +} + +static inline void vm_mem_set_shared(struct kvm_vm *vm, uint64_t gpa, + uint64_t size) +{ + vm_set_memory_attributes(vm, gpa, size, 0); +} + +void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t gpa, uint64_t size, + bool punch_hole); + +static inline void vm_guest_mem_punch_hole(struct kvm_vm *vm, uint64_t gpa, + uint64_t size) +{ + vm_guest_mem_fallocate(vm, gpa, size, true); +} + +static inline void vm_guest_mem_allocate(struct kvm_vm *vm, uint64_t gpa, + uint64_t size) +{ + vm_guest_mem_fallocate(vm, gpa, size, false); +} + +void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size); +const char *vm_guest_mode_string(uint32_t i); + +void kvm_vm_free(struct kvm_vm *vmp); +void kvm_vm_restart(struct kvm_vm *vmp); +void kvm_vm_release(struct kvm_vm *vmp); +int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva, + size_t len); +void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename); +int kvm_memfd_alloc(size_t size, bool hugepages); + +void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); + +static inline void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log) +{ + struct kvm_dirty_log args = { .dirty_bitmap = log, .slot = slot }; + + vm_ioctl(vm, KVM_GET_DIRTY_LOG, &args); +} + +static inline void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, + uint64_t first_page, uint32_t num_pages) +{ + struct kvm_clear_dirty_log args = { + .dirty_bitmap = log, + .slot = slot, + .first_page = first_page, + .num_pages = num_pages + }; + + vm_ioctl(vm, KVM_CLEAR_DIRTY_LOG, &args); +} + +static inline uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm) +{ + 
return __vm_ioctl(vm, KVM_RESET_DIRTY_RINGS, NULL); +} + +static inline int vm_get_stats_fd(struct kvm_vm *vm) +{ + int fd = __vm_ioctl(vm, KVM_GET_STATS_FD, NULL); + + TEST_ASSERT_VM_VCPU_IOCTL(fd >= 0, KVM_GET_STATS_FD, fd, vm); + return fd; +} + +static inline void read_stats_header(int stats_fd, struct kvm_stats_header *header) +{ + ssize_t ret; + + ret = pread(stats_fd, header, sizeof(*header), 0); + TEST_ASSERT(ret == sizeof(*header), + "Failed to read '%lu' header bytes, ret = '%ld'", + sizeof(*header), ret); +} + +struct kvm_stats_desc *read_stats_descriptors(int stats_fd, + struct kvm_stats_header *header); + +static inline ssize_t get_stats_descriptor_size(struct kvm_stats_header *header) +{ + /* + * The base size of the descriptor is defined by KVM's ABI, but the + * size of the name field is variable, as far as KVM's ABI is + * concerned. For a given instance of KVM, the name field is the same + * size for all stats and is provided in the overall stats header. + */ + return sizeof(struct kvm_stats_desc) + header->name_size; +} + +static inline struct kvm_stats_desc *get_stats_descriptor(struct kvm_stats_desc *stats, + int index, + struct kvm_stats_header *header) +{ + /* + * Note, size_desc includes the size of the name field, which is + * variable. i.e. this is NOT equivalent to &stats_desc[i]. 
+ */ + return (void *)stats + index * get_stats_descriptor_size(header); +} + +void read_stat_data(int stats_fd, struct kvm_stats_header *header, + struct kvm_stats_desc *desc, uint64_t *data, + size_t max_elements); + +void __vm_get_stat(struct kvm_vm *vm, const char *stat_name, uint64_t *data, + size_t max_elements); + +static inline uint64_t vm_get_stat(struct kvm_vm *vm, const char *stat_name) +{ + uint64_t data; + + __vm_get_stat(vm, stat_name, &data, 1); + return data; +} + +void vm_create_irqchip(struct kvm_vm *vm); + +static inline int __vm_create_guest_memfd(struct kvm_vm *vm, uint64_t size, + uint64_t flags) +{ + struct kvm_create_guest_memfd guest_memfd = { + .size = size, + .flags = flags, + }; + + return __vm_ioctl(vm, KVM_CREATE_GUEST_MEMFD, &guest_memfd); +} + +static inline int vm_create_guest_memfd(struct kvm_vm *vm, uint64_t size, + uint64_t flags) +{ + int fd = __vm_create_guest_memfd(vm, size, flags); + + TEST_ASSERT(fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_GUEST_MEMFD, fd)); + return fd; +} + +void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva); +int __vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva); +void vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva, + uint32_t guest_memfd, uint64_t guest_memfd_offset); +int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva, + uint32_t guest_memfd, uint64_t guest_memfd_offset); + +void vm_userspace_mem_region_add(struct kvm_vm *vm, + enum vm_mem_backing_src_type src_type, + uint64_t guest_paddr, uint32_t slot, uint64_t npages, + uint32_t flags); +void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, + uint64_t guest_paddr, uint32_t slot, uint64_t npages, + uint32_t flags, int guest_memfd_fd, uint64_t 
guest_memfd_offset); + +#ifndef vm_arch_has_protected_memory +static inline bool vm_arch_has_protected_memory(struct kvm_vm *vm) +{ + return false; +} +#endif + +void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); +void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); +void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); +struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id); +void vm_populate_vaddr_bitmap(struct kvm_vm *vm); +vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min); +vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min); +vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, + enum kvm_mem_region_type type); +vm_vaddr_t vm_vaddr_alloc_shared(struct kvm_vm *vm, size_t sz, + vm_vaddr_t vaddr_min, + enum kvm_mem_region_type type); +vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages); +vm_vaddr_t __vm_vaddr_alloc_page(struct kvm_vm *vm, + enum kvm_mem_region_type type); +vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm); + +void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + unsigned int npages); +void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa); +void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva); +vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva); +void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa); + +#ifndef vcpu_arch_put_guest +#define vcpu_arch_put_guest(mem, val) do { (mem) = (val); } while (0) +#endif + +static inline vm_paddr_t vm_untag_gpa(struct kvm_vm *vm, vm_paddr_t gpa) +{ + return gpa & ~vm->gpa_tag_mask; +} + +void vcpu_run(struct kvm_vcpu *vcpu); +int _vcpu_run(struct kvm_vcpu *vcpu); + +static inline int __vcpu_run(struct kvm_vcpu *vcpu) +{ + return __vcpu_ioctl(vcpu, KVM_RUN, NULL); +} + +void vcpu_run_complete_io(struct kvm_vcpu *vcpu); +struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vcpu *vcpu); + +static inline void 
vcpu_enable_cap(struct kvm_vcpu *vcpu, uint32_t cap, + uint64_t arg0) +{ + struct kvm_enable_cap enable_cap = { .cap = cap, .args = { arg0 } }; + + vcpu_ioctl(vcpu, KVM_ENABLE_CAP, &enable_cap); +} + +static inline void vcpu_guest_debug_set(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *debug) +{ + vcpu_ioctl(vcpu, KVM_SET_GUEST_DEBUG, debug); +} + +static inline void vcpu_mp_state_get(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + vcpu_ioctl(vcpu, KVM_GET_MP_STATE, mp_state); +} +static inline void vcpu_mp_state_set(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + vcpu_ioctl(vcpu, KVM_SET_MP_STATE, mp_state); +} + +static inline void vcpu_regs_get(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + vcpu_ioctl(vcpu, KVM_GET_REGS, regs); +} + +static inline void vcpu_regs_set(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + vcpu_ioctl(vcpu, KVM_SET_REGS, regs); +} +static inline void vcpu_sregs_get(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + vcpu_ioctl(vcpu, KVM_GET_SREGS, sregs); + +} +static inline void vcpu_sregs_set(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + vcpu_ioctl(vcpu, KVM_SET_SREGS, sregs); +} +static inline int _vcpu_sregs_set(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + return __vcpu_ioctl(vcpu, KVM_SET_SREGS, sregs); +} +static inline void vcpu_fpu_get(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + vcpu_ioctl(vcpu, KVM_GET_FPU, fpu); +} +static inline void vcpu_fpu_set(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + vcpu_ioctl(vcpu, KVM_SET_FPU, fpu); +} + +static inline int __vcpu_get_reg(struct kvm_vcpu *vcpu, uint64_t id, void *addr) +{ + struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)addr }; + + return __vcpu_ioctl(vcpu, KVM_GET_ONE_REG, &reg); +} +static inline int __vcpu_set_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t val) +{ + struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)&val }; + + return __vcpu_ioctl(vcpu, KVM_SET_ONE_REG, &reg); +} +static inline void
vcpu_get_reg(struct kvm_vcpu *vcpu, uint64_t id, void *addr) +{ + struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)addr }; + + vcpu_ioctl(vcpu, KVM_GET_ONE_REG, &reg); +} +static inline void vcpu_set_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t val) +{ + struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)&val }; + + vcpu_ioctl(vcpu, KVM_SET_ONE_REG, &reg); +} + +#ifdef __KVM_HAVE_VCPU_EVENTS +static inline void vcpu_events_get(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + vcpu_ioctl(vcpu, KVM_GET_VCPU_EVENTS, events); +} +static inline void vcpu_events_set(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + vcpu_ioctl(vcpu, KVM_SET_VCPU_EVENTS, events); +} +#endif +#ifdef __x86_64__ +static inline void vcpu_nested_state_get(struct kvm_vcpu *vcpu, + struct kvm_nested_state *state) +{ + vcpu_ioctl(vcpu, KVM_GET_NESTED_STATE, state); +} +static inline int __vcpu_nested_state_set(struct kvm_vcpu *vcpu, + struct kvm_nested_state *state) +{ + return __vcpu_ioctl(vcpu, KVM_SET_NESTED_STATE, state); +} + +static inline void vcpu_nested_state_set(struct kvm_vcpu *vcpu, + struct kvm_nested_state *state) +{ + vcpu_ioctl(vcpu, KVM_SET_NESTED_STATE, state); +} +#endif +static inline int vcpu_get_stats_fd(struct kvm_vcpu *vcpu) +{ + int fd = __vcpu_ioctl(vcpu, KVM_GET_STATS_FD, NULL); + + TEST_ASSERT_VM_VCPU_IOCTL(fd >= 0, KVM_GET_STATS_FD, fd, vcpu->vm); + return fd; +} + +int __kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr); + +static inline void kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr) +{ + int ret = __kvm_has_device_attr(dev_fd, group, attr); + + TEST_ASSERT(!ret, "KVM_HAS_DEVICE_ATTR failed, rc: %i errno: %i", ret, errno); +} + +int __kvm_device_attr_get(int dev_fd, uint32_t group, uint64_t attr, void *val); + +static inline void kvm_device_attr_get(int dev_fd, uint32_t group, + uint64_t attr, void *val) +{ + int ret = __kvm_device_attr_get(dev_fd, group, attr, val); + + TEST_ASSERT(!ret,
KVM_IOCTL_ERROR(KVM_GET_DEVICE_ATTR, ret)); +} + +int __kvm_device_attr_set(int dev_fd, uint32_t group, uint64_t attr, void *val); + +static inline void kvm_device_attr_set(int dev_fd, uint32_t group, + uint64_t attr, void *val) +{ + int ret = __kvm_device_attr_set(dev_fd, group, attr, val); + + TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_SET_DEVICE_ATTR, ret)); +} + +static inline int __vcpu_has_device_attr(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr) +{ + return __kvm_has_device_attr(vcpu->fd, group, attr); +} + +static inline void vcpu_has_device_attr(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr) +{ + kvm_has_device_attr(vcpu->fd, group, attr); +} + +static inline int __vcpu_device_attr_get(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr, void *val) +{ + return __kvm_device_attr_get(vcpu->fd, group, attr, val); +} + +static inline void vcpu_device_attr_get(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr, void *val) +{ + kvm_device_attr_get(vcpu->fd, group, attr, val); +} + +static inline int __vcpu_device_attr_set(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr, void *val) +{ + return __kvm_device_attr_set(vcpu->fd, group, attr, val); +} + +static inline void vcpu_device_attr_set(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr, void *val) +{ + kvm_device_attr_set(vcpu->fd, group, attr, val); +} + +int __kvm_test_create_device(struct kvm_vm *vm, uint64_t type); +int __kvm_create_device(struct kvm_vm *vm, uint64_t type); + +static inline int kvm_create_device(struct kvm_vm *vm, uint64_t type) +{ + int fd = __kvm_create_device(vm, type); + + TEST_ASSERT(fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_DEVICE, fd)); + return fd; +} + +void *vcpu_map_dirty_ring(struct kvm_vcpu *vcpu); + +/* + * VM VCPU Args Set + * + * Input Args: + * vcpu - vCPU whose entry point arguments are set + * num - number of arguments + * ...
- arguments, each of type uint64_t + * + * Output Args: None + * + * Return: None + * + * Sets the first @num input parameters for the function at @vcpu's entry point, + * per the C calling convention of the architecture, to the values given as + * variable args. Each of the variable args is expected to be of type uint64_t. + * The maximum @num can be is specific to the architecture. + */ +void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...); + +void kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level); +int _kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level); + +#define KVM_MAX_IRQ_ROUTES 4096 + +struct kvm_irq_routing *kvm_gsi_routing_create(void); +void kvm_gsi_routing_irqchip_add(struct kvm_irq_routing *routing, + uint32_t gsi, uint32_t pin); +int _kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing); +void kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing); + +const char *exit_reason_str(unsigned int exit_reason); + +vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min, + uint32_t memslot); +vm_paddr_t __vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, + vm_paddr_t paddr_min, uint32_t memslot, + bool protected); +vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm); + +static inline vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, + vm_paddr_t paddr_min, uint32_t memslot) +{ + /* + * By default, allocate memory as protected for VMs that support + * protected memory, as the majority of memory for such VMs is + * protected, i.e. using shared memory is effectively opt-in. + */ + return __vm_phy_pages_alloc(vm, num, paddr_min, memslot, + vm_arch_has_protected_memory(vm)); +} + +/* + * ____vm_create() does KVM_CREATE_VM and little else. __vm_create() also + * loads the test binary into guest memory and creates an IRQ chip (x86 only). + * __vm_create() does NOT create vCPUs, @nr_runnable_vcpus is used purely to + * calculate the amount of memory needed for per-vCPU data, e.g. 
stacks. + */ +struct kvm_vm *____vm_create(struct vm_shape shape); +struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus, + uint64_t nr_extra_pages); + +static inline struct kvm_vm *vm_create_barebones(void) +{ + return ____vm_create(VM_SHAPE_DEFAULT); +} + +static inline struct kvm_vm *vm_create_barebones_type(unsigned long type) +{ + const struct vm_shape shape = { + .mode = VM_MODE_DEFAULT, + .type = type, + }; + + return ____vm_create(shape); +} + +static inline struct kvm_vm *vm_create(uint32_t nr_runnable_vcpus) +{ + return __vm_create(VM_SHAPE_DEFAULT, nr_runnable_vcpus, 0); +} + +struct kvm_vm *__vm_create_with_vcpus(struct vm_shape shape, uint32_t nr_vcpus, + uint64_t extra_mem_pages, + void *guest_code, struct kvm_vcpu *vcpus[]); + +static inline struct kvm_vm *vm_create_with_vcpus(uint32_t nr_vcpus, + void *guest_code, + struct kvm_vcpu *vcpus[]) +{ + return __vm_create_with_vcpus(VM_SHAPE_DEFAULT, nr_vcpus, 0, + guest_code, vcpus); +} + + +struct kvm_vm *__vm_create_shape_with_one_vcpu(struct vm_shape shape, + struct kvm_vcpu **vcpu, + uint64_t extra_mem_pages, + void *guest_code); + +/* + * Create a VM with a single vCPU with reasonable defaults and @extra_mem_pages + * additional pages of guest memory. Returns the VM and vCPU (via out param). 
+ */ +static inline struct kvm_vm *__vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, + uint64_t extra_mem_pages, + void *guest_code) +{ + return __vm_create_shape_with_one_vcpu(VM_SHAPE_DEFAULT, vcpu, + extra_mem_pages, guest_code); +} + +static inline struct kvm_vm *vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, + void *guest_code) +{ + return __vm_create_with_one_vcpu(vcpu, 0, guest_code); +} + +static inline struct kvm_vm *vm_create_shape_with_one_vcpu(struct vm_shape shape, + struct kvm_vcpu **vcpu, + void *guest_code) +{ + return __vm_create_shape_with_one_vcpu(shape, vcpu, 0, guest_code); +} + +struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm); + +void kvm_pin_this_task_to_pcpu(uint32_t pcpu); +void kvm_print_vcpu_pinning_help(void); +void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[], + int nr_vcpus); + +unsigned long vm_compute_max_gfn(struct kvm_vm *vm); +unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size); +unsigned int vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages); +unsigned int vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages); +static inline unsigned int +vm_adjust_num_guest_pages(enum vm_guest_mode mode, unsigned int num_guest_pages) +{ + unsigned int n; + n = vm_num_guest_pages(mode, vm_num_host_pages(mode, num_guest_pages)); +#ifdef __s390x__ + /* s390 requires 1M aligned guest sizes */ + n = (n + 255) & ~255; +#endif + return n; +} + +#define sync_global_to_guest(vm, g) ({ \ + typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \ + memcpy(_p, &(g), sizeof(g)); \ +}) + +#define sync_global_from_guest(vm, g) ({ \ + typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \ + memcpy(&(g), _p, sizeof(g)); \ +}) + +/* + * Write a global value, but only in the VM's (guest's) domain. 
Primarily used + * for "globals" that hold per-VM values (VMs always duplicate code and global + * data into their own region of physical memory), but can be used anytime it's + * undesirable to change the host's copy of the global. + */ +#define write_guest_global(vm, g, val) ({ \ + typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \ + typeof(g) _val = val; \ + \ + memcpy(_p, &(_val), sizeof(g)); \ +}) + +void assert_on_unhandled_exception(struct kvm_vcpu *vcpu); + +void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, + uint8_t indent); + +static inline void vcpu_dump(FILE *stream, struct kvm_vcpu *vcpu, + uint8_t indent) +{ + vcpu_arch_dump(stream, vcpu, indent); +} + +/* + * Adds a vCPU with reasonable defaults (e.g. a stack) + * + * Input Args: + * vm - Virtual Machine + * vcpu_id - The id of the VCPU to add to the VM. + */ +struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id); +void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code); + +static inline struct kvm_vcpu *vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, + void *guest_code) +{ + struct kvm_vcpu *vcpu = vm_arch_vcpu_add(vm, vcpu_id); + + vcpu_arch_set_entry_point(vcpu, guest_code); + + return vcpu; +} + +/* Re-create a vCPU after restarting a VM, e.g. for state save/restore tests. 
*/ +struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm, uint32_t vcpu_id); + +static inline struct kvm_vcpu *vm_vcpu_recreate(struct kvm_vm *vm, + uint32_t vcpu_id) +{ + return vm_arch_vcpu_recreate(vm, vcpu_id); +} + +void vcpu_arch_free(struct kvm_vcpu *vcpu); + +void virt_arch_pgd_alloc(struct kvm_vm *vm); + +static inline void virt_pgd_alloc(struct kvm_vm *vm) +{ + virt_arch_pgd_alloc(vm); +} + +/* + * VM Virtual Page Map + * + * Input Args: + * vm - Virtual Machine + * vaddr - VM Virtual Address + * paddr - VM Physical Address + * memslot - Memory region slot for new virtual translation tables + * + * Output Args: None + * + * Return: None + * + * Within @vm, creates a virtual translation for the page starting + * at @vaddr to the page starting at @paddr. + */ +void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr); + +static inline void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) +{ + virt_arch_pg_map(vm, vaddr, paddr); +} + + +/* + * Address Guest Virtual to Guest Physical + * + * Input Args: + * vm - Virtual Machine + * gva - VM virtual address + * + * Output Args: None + * + * Return: + * Equivalent VM physical address + * + * Returns the VM physical address of the translated VM virtual + * address given by @gva. + */ +vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva); + +static inline vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +{ + return addr_arch_gva2gpa(vm, gva); +} + +/* + * Virtual Translation Tables Dump + * + * Input Args: + * stream - Output FILE stream + * vm - Virtual Machine + * indent - Left margin indent amount + * + * Output Args: None + * + * Return: None + * + * Dumps to the FILE stream given by @stream, the contents of all the + * virtual translation tables for the VM given by @vm. 
+ */ +void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); + +static inline void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +{ + virt_arch_dump(stream, vm, indent); +} + + +static inline int __vm_disable_nx_huge_pages(struct kvm_vm *vm) +{ + return __vm_enable_cap(vm, KVM_CAP_VM_DISABLE_NX_HUGE_PAGES, 0); +} + +/* + * Arch hook that is invoked via a constructor, i.e. before executing main(), + * to allow for arch-specific setup that is common to all tests, e.g. computing + * the default guest "mode". + */ +void kvm_selftest_arch_init(void); + +void kvm_arch_vm_post_create(struct kvm_vm *vm); + +bool vm_is_gpa_protected(struct kvm_vm *vm, vm_paddr_t paddr); + +uint32_t guest_get_vcpuid(void); #endif /* SELFTEST_KVM_UTIL_H */ diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h deleted file mode 100644 index 3e0db283a4..0000000000 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ /dev/null @@ -1,1135 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * tools/testing/selftests/kvm/include/kvm_util_base.h - * - * Copyright (C) 2018, Google LLC. - */ -#ifndef SELFTEST_KVM_UTIL_BASE_H -#define SELFTEST_KVM_UTIL_BASE_H - -#include "test_util.h" - -#include -#include "linux/hashtable.h" -#include "linux/list.h" -#include -#include -#include "linux/rbtree.h" -#include - -#include -#include - -#include - -#include "kvm_util_arch.h" -#include "sparsebit.h" - -/* - * Provide a version of static_assert() that is guaranteed to have an optional - * message param. If _ISOC11_SOURCE is defined, glibc (/usr/include/assert.h) - * #undefs and #defines static_assert() as a direct alias to _Static_assert(), - * i.e. effectively makes the message mandatory. Many KVM selftests #define - * _GNU_SOURCE for various reasons, and _GNU_SOURCE implies _ISOC11_SOURCE. 
As - * a result, static_assert() behavior is non-deterministic and may or may not - * require a message depending on #include order. - */ -#define __kvm_static_assert(expr, msg, ...) _Static_assert(expr, msg) -#define kvm_static_assert(expr, ...) __kvm_static_assert(expr, ##__VA_ARGS__, #expr) - -#define KVM_DEV_PATH "/dev/kvm" -#define KVM_MAX_VCPUS 512 - -#define NSEC_PER_SEC 1000000000L - -typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */ -typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */ - -struct userspace_mem_region { - struct kvm_userspace_memory_region2 region; - struct sparsebit *unused_phy_pages; - struct sparsebit *protected_phy_pages; - int fd; - off_t offset; - enum vm_mem_backing_src_type backing_src_type; - void *host_mem; - void *host_alias; - void *mmap_start; - void *mmap_alias; - size_t mmap_size; - struct rb_node gpa_node; - struct rb_node hva_node; - struct hlist_node slot_node; -}; - -struct kvm_vcpu { - struct list_head list; - uint32_t id; - int fd; - struct kvm_vm *vm; - struct kvm_run *run; -#ifdef __x86_64__ - struct kvm_cpuid2 *cpuid; -#endif - struct kvm_dirty_gfn *dirty_gfns; - uint32_t fetch_index; - uint32_t dirty_gfns_count; -}; - -struct userspace_mem_regions { - struct rb_root gpa_tree; - struct rb_root hva_tree; - DECLARE_HASHTABLE(slot_hash, 9); -}; - -enum kvm_mem_region_type { - MEM_REGION_CODE, - MEM_REGION_DATA, - MEM_REGION_PT, - MEM_REGION_TEST_DATA, - NR_MEM_REGIONS, -}; - -struct kvm_vm { - int mode; - unsigned long type; - uint8_t subtype; - int kvm_fd; - int fd; - unsigned int pgtable_levels; - unsigned int page_size; - unsigned int page_shift; - unsigned int pa_bits; - unsigned int va_bits; - uint64_t max_gfn; - struct list_head vcpus; - struct userspace_mem_regions regions; - struct sparsebit *vpages_valid; - struct sparsebit *vpages_mapped; - bool has_irqchip; - bool pgd_created; - vm_paddr_t ucall_mmio_addr; - vm_paddr_t pgd; - vm_vaddr_t gdt; - vm_vaddr_t tss; - 
vm_vaddr_t idt; - vm_vaddr_t handlers; - uint32_t dirty_ring_size; - uint64_t gpa_tag_mask; - - struct kvm_vm_arch arch; - - /* Cache of information for binary stats interface */ - int stats_fd; - struct kvm_stats_header stats_header; - struct kvm_stats_desc *stats_desc; - - /* - * KVM region slots. These are the default memslots used by page - * allocators, e.g., lib/elf uses the memslots[MEM_REGION_CODE] - * memslot. - */ - uint32_t memslots[NR_MEM_REGIONS]; -}; - -struct vcpu_reg_sublist { - const char *name; - long capability; - int feature; - int feature_type; - bool finalize; - __u64 *regs; - __u64 regs_n; - __u64 *rejects_set; - __u64 rejects_set_n; - __u64 *skips_set; - __u64 skips_set_n; -}; - -struct vcpu_reg_list { - char *name; - struct vcpu_reg_sublist sublists[]; -}; - -#define for_each_sublist(c, s) \ - for ((s) = &(c)->sublists[0]; (s)->regs; ++(s)) - -#define kvm_for_each_vcpu(vm, i, vcpu) \ - for ((i) = 0; (i) <= (vm)->last_vcpu_id; (i)++) \ - if (!((vcpu) = vm->vcpus[i])) \ - continue; \ - else - -struct userspace_mem_region * -memslot2region(struct kvm_vm *vm, uint32_t memslot); - -static inline struct userspace_mem_region *vm_get_mem_region(struct kvm_vm *vm, - enum kvm_mem_region_type type) -{ - assert(type < NR_MEM_REGIONS); - return memslot2region(vm, vm->memslots[type]); -} - -/* Minimum allocated guest virtual and physical addresses */ -#define KVM_UTIL_MIN_VADDR 0x2000 -#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 - -#define DEFAULT_GUEST_STACK_VADDR_MIN 0xab6000 -#define DEFAULT_STACK_PGS 5 - -enum vm_guest_mode { - VM_MODE_P52V48_4K, - VM_MODE_P52V48_16K, - VM_MODE_P52V48_64K, - VM_MODE_P48V48_4K, - VM_MODE_P48V48_16K, - VM_MODE_P48V48_64K, - VM_MODE_P40V48_4K, - VM_MODE_P40V48_16K, - VM_MODE_P40V48_64K, - VM_MODE_PXXV48_4K, /* For 48bits VA but ANY bits PA */ - VM_MODE_P47V64_4K, - VM_MODE_P44V64_4K, - VM_MODE_P36V48_4K, - VM_MODE_P36V48_16K, - VM_MODE_P36V48_64K, - VM_MODE_P36V47_16K, - NUM_VM_MODES, -}; - -struct vm_shape { - 
uint32_t type; - uint8_t mode; - uint8_t subtype; - uint16_t padding; -}; - -kvm_static_assert(sizeof(struct vm_shape) == sizeof(uint64_t)); - -#define VM_TYPE_DEFAULT 0 - -#define VM_SHAPE(__mode) \ -({ \ - struct vm_shape shape = { \ - .mode = (__mode), \ - .type = VM_TYPE_DEFAULT \ - }; \ - \ - shape; \ -}) - -#if defined(__aarch64__) - -extern enum vm_guest_mode vm_mode_default; - -#define VM_MODE_DEFAULT vm_mode_default -#define MIN_PAGE_SHIFT 12U -#define ptes_per_page(page_size) ((page_size) / 8) - -#elif defined(__x86_64__) - -#define VM_MODE_DEFAULT VM_MODE_PXXV48_4K -#define MIN_PAGE_SHIFT 12U -#define ptes_per_page(page_size) ((page_size) / 8) - -#elif defined(__s390x__) - -#define VM_MODE_DEFAULT VM_MODE_P44V64_4K -#define MIN_PAGE_SHIFT 12U -#define ptes_per_page(page_size) ((page_size) / 16) - -#elif defined(__riscv) - -#if __riscv_xlen == 32 -#error "RISC-V 32-bit kvm selftests not supported" -#endif - -#define VM_MODE_DEFAULT VM_MODE_P40V48_4K -#define MIN_PAGE_SHIFT 12U -#define ptes_per_page(page_size) ((page_size) / 8) - -#endif - -#define VM_SHAPE_DEFAULT VM_SHAPE(VM_MODE_DEFAULT) - -#define MIN_PAGE_SIZE (1U << MIN_PAGE_SHIFT) -#define PTES_PER_MIN_PAGE ptes_per_page(MIN_PAGE_SIZE) - -struct vm_guest_mode_params { - unsigned int pa_bits; - unsigned int va_bits; - unsigned int page_size; - unsigned int page_shift; -}; -extern const struct vm_guest_mode_params vm_guest_mode_params[]; - -int open_path_or_exit(const char *path, int flags); -int open_kvm_dev_path_or_exit(void); - -bool get_kvm_param_bool(const char *param); -bool get_kvm_intel_param_bool(const char *param); -bool get_kvm_amd_param_bool(const char *param); - -int get_kvm_param_integer(const char *param); -int get_kvm_intel_param_integer(const char *param); -int get_kvm_amd_param_integer(const char *param); - -unsigned int kvm_check_cap(long cap); - -static inline bool kvm_has_cap(long cap) -{ - return kvm_check_cap(cap); -} - -#define __KVM_SYSCALL_ERROR(_name, _ret) \ - "%s failed, 
rc: %i errno: %i (%s)", (_name), (_ret), errno, strerror(errno) - -/* - * Use the "inner", double-underscore macro when reporting errors from within - * other macros so that the name of ioctl() and not its literal numeric value - * is printed on error. The "outer" macro is strongly preferred when reporting - * errors "directly", i.e. without an additional layer of macros, as it reduces - * the probability of passing in the wrong string. - */ -#define __KVM_IOCTL_ERROR(_name, _ret) __KVM_SYSCALL_ERROR(_name, _ret) -#define KVM_IOCTL_ERROR(_ioctl, _ret) __KVM_IOCTL_ERROR(#_ioctl, _ret) - -#define kvm_do_ioctl(fd, cmd, arg) \ -({ \ - kvm_static_assert(!_IOC_SIZE(cmd) || sizeof(*arg) == _IOC_SIZE(cmd)); \ - ioctl(fd, cmd, arg); \ -}) - -#define __kvm_ioctl(kvm_fd, cmd, arg) \ - kvm_do_ioctl(kvm_fd, cmd, arg) - -#define kvm_ioctl(kvm_fd, cmd, arg) \ -({ \ - int ret = __kvm_ioctl(kvm_fd, cmd, arg); \ - \ - TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(#cmd, ret)); \ -}) - -static __always_inline void static_assert_is_vm(struct kvm_vm *vm) { } - -#define __vm_ioctl(vm, cmd, arg) \ -({ \ - static_assert_is_vm(vm); \ - kvm_do_ioctl((vm)->fd, cmd, arg); \ -}) - -/* - * Assert that a VM or vCPU ioctl() succeeded, with extra magic to detect if - * the ioctl() failed because KVM killed/bugged the VM. To detect a dead VM, - * probe KVM_CAP_USER_MEMORY, which (a) has been supported by KVM since before - * selftests existed and (b) should never outright fail, i.e. is supposed to - * return 0 or 1. If KVM kills a VM, KVM returns -EIO for all ioctl()s for the - * VM and its vCPUs, including KVM_CHECK_EXTENSION. 
- */ -#define __TEST_ASSERT_VM_VCPU_IOCTL(cond, name, ret, vm) \ -do { \ - int __errno = errno; \ - \ - static_assert_is_vm(vm); \ - \ - if (cond) \ - break; \ - \ - if (errno == EIO && \ - __vm_ioctl(vm, KVM_CHECK_EXTENSION, (void *)KVM_CAP_USER_MEMORY) < 0) { \ - TEST_ASSERT(errno == EIO, "KVM killed the VM, should return -EIO"); \ - TEST_FAIL("KVM killed/bugged the VM, check the kernel log for clues"); \ - } \ - errno = __errno; \ - TEST_ASSERT(cond, __KVM_IOCTL_ERROR(name, ret)); \ -} while (0) - -#define TEST_ASSERT_VM_VCPU_IOCTL(cond, cmd, ret, vm) \ - __TEST_ASSERT_VM_VCPU_IOCTL(cond, #cmd, ret, vm) - -#define vm_ioctl(vm, cmd, arg) \ -({ \ - int ret = __vm_ioctl(vm, cmd, arg); \ - \ - __TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd, ret, vm); \ -}) - -static __always_inline void static_assert_is_vcpu(struct kvm_vcpu *vcpu) { } - -#define __vcpu_ioctl(vcpu, cmd, arg) \ -({ \ - static_assert_is_vcpu(vcpu); \ - kvm_do_ioctl((vcpu)->fd, cmd, arg); \ -}) - -#define vcpu_ioctl(vcpu, cmd, arg) \ -({ \ - int ret = __vcpu_ioctl(vcpu, cmd, arg); \ - \ - __TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd, ret, (vcpu)->vm); \ -}) - -/* - * Looks up and returns the value corresponding to the capability - * (KVM_CAP_*) given by cap. 
- */ -static inline int vm_check_cap(struct kvm_vm *vm, long cap) -{ - int ret = __vm_ioctl(vm, KVM_CHECK_EXTENSION, (void *)cap); - - TEST_ASSERT_VM_VCPU_IOCTL(ret >= 0, KVM_CHECK_EXTENSION, ret, vm); - return ret; -} - -static inline int __vm_enable_cap(struct kvm_vm *vm, uint32_t cap, uint64_t arg0) -{ - struct kvm_enable_cap enable_cap = { .cap = cap, .args = { arg0 } }; - - return __vm_ioctl(vm, KVM_ENABLE_CAP, &enable_cap); -} -static inline void vm_enable_cap(struct kvm_vm *vm, uint32_t cap, uint64_t arg0) -{ - struct kvm_enable_cap enable_cap = { .cap = cap, .args = { arg0 } }; - - vm_ioctl(vm, KVM_ENABLE_CAP, &enable_cap); -} - -static inline void vm_set_memory_attributes(struct kvm_vm *vm, uint64_t gpa, - uint64_t size, uint64_t attributes) -{ - struct kvm_memory_attributes attr = { - .attributes = attributes, - .address = gpa, - .size = size, - .flags = 0, - }; - - /* - * KVM_SET_MEMORY_ATTRIBUTES overwrites _all_ attributes. These flows - * need significant enhancements to support multiple attributes. 
- */ - TEST_ASSERT(!attributes || attributes == KVM_MEMORY_ATTRIBUTE_PRIVATE, - "Update me to support multiple attributes!"); - - vm_ioctl(vm, KVM_SET_MEMORY_ATTRIBUTES, &attr); -} - - -static inline void vm_mem_set_private(struct kvm_vm *vm, uint64_t gpa, - uint64_t size) -{ - vm_set_memory_attributes(vm, gpa, size, KVM_MEMORY_ATTRIBUTE_PRIVATE); -} - -static inline void vm_mem_set_shared(struct kvm_vm *vm, uint64_t gpa, - uint64_t size) -{ - vm_set_memory_attributes(vm, gpa, size, 0); -} - -void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t gpa, uint64_t size, - bool punch_hole); - -static inline void vm_guest_mem_punch_hole(struct kvm_vm *vm, uint64_t gpa, - uint64_t size) -{ - vm_guest_mem_fallocate(vm, gpa, size, true); -} - -static inline void vm_guest_mem_allocate(struct kvm_vm *vm, uint64_t gpa, - uint64_t size) -{ - vm_guest_mem_fallocate(vm, gpa, size, false); -} - -void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size); -const char *vm_guest_mode_string(uint32_t i); - -void kvm_vm_free(struct kvm_vm *vmp); -void kvm_vm_restart(struct kvm_vm *vmp); -void kvm_vm_release(struct kvm_vm *vmp); -int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva, - size_t len); -void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename); -int kvm_memfd_alloc(size_t size, bool hugepages); - -void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); - -static inline void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log) -{ - struct kvm_dirty_log args = { .dirty_bitmap = log, .slot = slot }; - - vm_ioctl(vm, KVM_GET_DIRTY_LOG, &args); -} - -static inline void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, - uint64_t first_page, uint32_t num_pages) -{ - struct kvm_clear_dirty_log args = { - .dirty_bitmap = log, - .slot = slot, - .first_page = first_page, - .num_pages = num_pages - }; - - vm_ioctl(vm, KVM_CLEAR_DIRTY_LOG, &args); -} - -static inline uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm) -{ - 
return __vm_ioctl(vm, KVM_RESET_DIRTY_RINGS, NULL); -} - -static inline int vm_get_stats_fd(struct kvm_vm *vm) -{ - int fd = __vm_ioctl(vm, KVM_GET_STATS_FD, NULL); - - TEST_ASSERT_VM_VCPU_IOCTL(fd >= 0, KVM_GET_STATS_FD, fd, vm); - return fd; -} - -static inline void read_stats_header(int stats_fd, struct kvm_stats_header *header) -{ - ssize_t ret; - - ret = pread(stats_fd, header, sizeof(*header), 0); - TEST_ASSERT(ret == sizeof(*header), - "Failed to read '%lu' header bytes, ret = '%ld'", - sizeof(*header), ret); -} - -struct kvm_stats_desc *read_stats_descriptors(int stats_fd, - struct kvm_stats_header *header); - -static inline ssize_t get_stats_descriptor_size(struct kvm_stats_header *header) -{ - /* - * The base size of the descriptor is defined by KVM's ABI, but the - * size of the name field is variable, as far as KVM's ABI is - * concerned. For a given instance of KVM, the name field is the same - * size for all stats and is provided in the overall stats header. - */ - return sizeof(struct kvm_stats_desc) + header->name_size; -} - -static inline struct kvm_stats_desc *get_stats_descriptor(struct kvm_stats_desc *stats, - int index, - struct kvm_stats_header *header) -{ - /* - * Note, size_desc includes the size of the name field, which is - * variable. i.e. this is NOT equivalent to &stats_desc[i]. 
- */ - return (void *)stats + index * get_stats_descriptor_size(header); -} - -void read_stat_data(int stats_fd, struct kvm_stats_header *header, - struct kvm_stats_desc *desc, uint64_t *data, - size_t max_elements); - -void __vm_get_stat(struct kvm_vm *vm, const char *stat_name, uint64_t *data, - size_t max_elements); - -static inline uint64_t vm_get_stat(struct kvm_vm *vm, const char *stat_name) -{ - uint64_t data; - - __vm_get_stat(vm, stat_name, &data, 1); - return data; -} - -void vm_create_irqchip(struct kvm_vm *vm); - -static inline int __vm_create_guest_memfd(struct kvm_vm *vm, uint64_t size, - uint64_t flags) -{ - struct kvm_create_guest_memfd guest_memfd = { - .size = size, - .flags = flags, - }; - - return __vm_ioctl(vm, KVM_CREATE_GUEST_MEMFD, &guest_memfd); -} - -static inline int vm_create_guest_memfd(struct kvm_vm *vm, uint64_t size, - uint64_t flags) -{ - int fd = __vm_create_guest_memfd(vm, size, flags); - - TEST_ASSERT(fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_GUEST_MEMFD, fd)); - return fd; -} - -void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, - uint64_t gpa, uint64_t size, void *hva); -int __vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, - uint64_t gpa, uint64_t size, void *hva); -void vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, - uint64_t gpa, uint64_t size, void *hva, - uint32_t guest_memfd, uint64_t guest_memfd_offset); -int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, - uint64_t gpa, uint64_t size, void *hva, - uint32_t guest_memfd, uint64_t guest_memfd_offset); - -void vm_userspace_mem_region_add(struct kvm_vm *vm, - enum vm_mem_backing_src_type src_type, - uint64_t guest_paddr, uint32_t slot, uint64_t npages, - uint32_t flags); -void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, - uint64_t guest_paddr, uint32_t slot, uint64_t npages, - uint32_t flags, int guest_memfd_fd, uint64_t 
guest_memfd_offset); - -#ifndef vm_arch_has_protected_memory -static inline bool vm_arch_has_protected_memory(struct kvm_vm *vm) -{ - return false; -} -#endif - -void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); -void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); -void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); -struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id); -void vm_populate_vaddr_bitmap(struct kvm_vm *vm); -vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min); -vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min); -vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, - enum kvm_mem_region_type type); -vm_vaddr_t vm_vaddr_alloc_shared(struct kvm_vm *vm, size_t sz, - vm_vaddr_t vaddr_min, - enum kvm_mem_region_type type); -vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages); -vm_vaddr_t __vm_vaddr_alloc_page(struct kvm_vm *vm, - enum kvm_mem_region_type type); -vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm); - -void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - unsigned int npages); -void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa); -void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva); -vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva); -void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa); - - -static inline vm_paddr_t vm_untag_gpa(struct kvm_vm *vm, vm_paddr_t gpa) -{ - return gpa & ~vm->gpa_tag_mask; -} - -void vcpu_run(struct kvm_vcpu *vcpu); -int _vcpu_run(struct kvm_vcpu *vcpu); - -static inline int __vcpu_run(struct kvm_vcpu *vcpu) -{ - return __vcpu_ioctl(vcpu, KVM_RUN, NULL); -} - -void vcpu_run_complete_io(struct kvm_vcpu *vcpu); -struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vcpu *vcpu); - -static inline void vcpu_enable_cap(struct kvm_vcpu *vcpu, uint32_t cap, - uint64_t arg0) -{ - struct kvm_enable_cap enable_cap = { .cap = cap, 
.args = { arg0 } }; - - vcpu_ioctl(vcpu, KVM_ENABLE_CAP, &enable_cap); -} - -static inline void vcpu_guest_debug_set(struct kvm_vcpu *vcpu, - struct kvm_guest_debug *debug) -{ - vcpu_ioctl(vcpu, KVM_SET_GUEST_DEBUG, debug); -} - -static inline void vcpu_mp_state_get(struct kvm_vcpu *vcpu, - struct kvm_mp_state *mp_state) -{ - vcpu_ioctl(vcpu, KVM_GET_MP_STATE, mp_state); -} -static inline void vcpu_mp_state_set(struct kvm_vcpu *vcpu, - struct kvm_mp_state *mp_state) -{ - vcpu_ioctl(vcpu, KVM_SET_MP_STATE, mp_state); -} - -static inline void vcpu_regs_get(struct kvm_vcpu *vcpu, struct kvm_regs *regs) -{ - vcpu_ioctl(vcpu, KVM_GET_REGS, regs); -} - -static inline void vcpu_regs_set(struct kvm_vcpu *vcpu, struct kvm_regs *regs) -{ - vcpu_ioctl(vcpu, KVM_SET_REGS, regs); -} -static inline void vcpu_sregs_get(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) -{ - vcpu_ioctl(vcpu, KVM_GET_SREGS, sregs); - -} -static inline void vcpu_sregs_set(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) -{ - vcpu_ioctl(vcpu, KVM_SET_SREGS, sregs); -} -static inline int _vcpu_sregs_set(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) -{ - return __vcpu_ioctl(vcpu, KVM_SET_SREGS, sregs); -} -static inline void vcpu_fpu_get(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) -{ - vcpu_ioctl(vcpu, KVM_GET_FPU, fpu); -} -static inline void vcpu_fpu_set(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) -{ - vcpu_ioctl(vcpu, KVM_SET_FPU, fpu); -} - -static inline int __vcpu_get_reg(struct kvm_vcpu *vcpu, uint64_t id, void *addr) -{ - struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)addr }; - - return __vcpu_ioctl(vcpu, KVM_GET_ONE_REG, &reg); -} -static inline int __vcpu_set_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t val) -{ - struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)&val }; - - return __vcpu_ioctl(vcpu, KVM_SET_ONE_REG, &reg); -} -static inline void vcpu_get_reg(struct kvm_vcpu *vcpu, uint64_t id, void *addr) -{ - struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)addr }; - - 
vcpu_ioctl(vcpu, KVM_GET_ONE_REG, &reg); -} -static inline void vcpu_set_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t val) -{ - struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)&val }; - - vcpu_ioctl(vcpu, KVM_SET_ONE_REG, &reg); -} - -#ifdef __KVM_HAVE_VCPU_EVENTS -static inline void vcpu_events_get(struct kvm_vcpu *vcpu, - struct kvm_vcpu_events *events) -{ - vcpu_ioctl(vcpu, KVM_GET_VCPU_EVENTS, events); -} -static inline void vcpu_events_set(struct kvm_vcpu *vcpu, - struct kvm_vcpu_events *events) -{ - vcpu_ioctl(vcpu, KVM_SET_VCPU_EVENTS, events); -} -#endif -#ifdef __x86_64__ -static inline void vcpu_nested_state_get(struct kvm_vcpu *vcpu, - struct kvm_nested_state *state) -{ - vcpu_ioctl(vcpu, KVM_GET_NESTED_STATE, state); -} -static inline int __vcpu_nested_state_set(struct kvm_vcpu *vcpu, - struct kvm_nested_state *state) -{ - return __vcpu_ioctl(vcpu, KVM_SET_NESTED_STATE, state); -} - -static inline void vcpu_nested_state_set(struct kvm_vcpu *vcpu, - struct kvm_nested_state *state) -{ - vcpu_ioctl(vcpu, KVM_SET_NESTED_STATE, state); -} -#endif -static inline int vcpu_get_stats_fd(struct kvm_vcpu *vcpu) -{ - int fd = __vcpu_ioctl(vcpu, KVM_GET_STATS_FD, NULL); - - TEST_ASSERT_VM_VCPU_IOCTL(fd >= 0, KVM_CHECK_EXTENSION, fd, vcpu->vm); - return fd; -} - -int __kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr); - -static inline void kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr) -{ - int ret = __kvm_has_device_attr(dev_fd, group, attr); - - TEST_ASSERT(!ret, "KVM_HAS_DEVICE_ATTR failed, rc: %i errno: %i", ret, errno); -} - -int __kvm_device_attr_get(int dev_fd, uint32_t group, uint64_t attr, void *val); - -static inline void kvm_device_attr_get(int dev_fd, uint32_t group, - uint64_t attr, void *val) -{ - int ret = __kvm_device_attr_get(dev_fd, group, attr, val); - - TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_GET_DEVICE_ATTR, ret)); -} - -int __kvm_device_attr_set(int dev_fd, uint32_t group, uint64_t attr, void *val); - -static 
inline void kvm_device_attr_set(int dev_fd, uint32_t group, - uint64_t attr, void *val) -{ - int ret = __kvm_device_attr_set(dev_fd, group, attr, val); - - TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_SET_DEVICE_ATTR, ret)); -} - -static inline int __vcpu_has_device_attr(struct kvm_vcpu *vcpu, uint32_t group, - uint64_t attr) -{ - return __kvm_has_device_attr(vcpu->fd, group, attr); -} - -static inline void vcpu_has_device_attr(struct kvm_vcpu *vcpu, uint32_t group, - uint64_t attr) -{ - kvm_has_device_attr(vcpu->fd, group, attr); -} - -static inline int __vcpu_device_attr_get(struct kvm_vcpu *vcpu, uint32_t group, - uint64_t attr, void *val) -{ - return __kvm_device_attr_get(vcpu->fd, group, attr, val); -} - -static inline void vcpu_device_attr_get(struct kvm_vcpu *vcpu, uint32_t group, - uint64_t attr, void *val) -{ - kvm_device_attr_get(vcpu->fd, group, attr, val); -} - -static inline int __vcpu_device_attr_set(struct kvm_vcpu *vcpu, uint32_t group, - uint64_t attr, void *val) -{ - return __kvm_device_attr_set(vcpu->fd, group, attr, val); -} - -static inline void vcpu_device_attr_set(struct kvm_vcpu *vcpu, uint32_t group, - uint64_t attr, void *val) -{ - kvm_device_attr_set(vcpu->fd, group, attr, val); -} - -int __kvm_test_create_device(struct kvm_vm *vm, uint64_t type); -int __kvm_create_device(struct kvm_vm *vm, uint64_t type); - -static inline int kvm_create_device(struct kvm_vm *vm, uint64_t type) -{ - int fd = __kvm_create_device(vm, type); - - TEST_ASSERT(fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_DEVICE, fd)); - return fd; -} - -void *vcpu_map_dirty_ring(struct kvm_vcpu *vcpu); - -/* - * VM VCPU Args Set - * - * Input Args: - * vm - Virtual Machine - * num - number of arguments - * ... - arguments, each of type uint64_t - * - * Output Args: None - * - * Return: None - * - * Sets the first @num input parameters for the function at @vcpu's entry point, - * per the C calling convention of the architecture, to the values given as - * variable args. 
Each of the variable args is expected to be of type uint64_t. - * The maximum @num can be is specific to the architecture. - */ -void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...); - -void kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level); -int _kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level); - -#define KVM_MAX_IRQ_ROUTES 4096 - -struct kvm_irq_routing *kvm_gsi_routing_create(void); -void kvm_gsi_routing_irqchip_add(struct kvm_irq_routing *routing, - uint32_t gsi, uint32_t pin); -int _kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing); -void kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing); - -const char *exit_reason_str(unsigned int exit_reason); - -vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min, - uint32_t memslot); -vm_paddr_t __vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, - vm_paddr_t paddr_min, uint32_t memslot, - bool protected); -vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm); - -static inline vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, - vm_paddr_t paddr_min, uint32_t memslot) -{ - /* - * By default, allocate memory as protected for VMs that support - * protected memory, as the majority of memory for such VMs is - * protected, i.e. using shared memory is effectively opt-in. - */ - return __vm_phy_pages_alloc(vm, num, paddr_min, memslot, - vm_arch_has_protected_memory(vm)); -} - -/* - * ____vm_create() does KVM_CREATE_VM and little else. __vm_create() also - * loads the test binary into guest memory and creates an IRQ chip (x86 only). - * __vm_create() does NOT create vCPUs, @nr_runnable_vcpus is used purely to - * calculate the amount of memory needed for per-vCPU data, e.g. stacks. 
- */ -struct kvm_vm *____vm_create(struct vm_shape shape); -struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus, - uint64_t nr_extra_pages); - -static inline struct kvm_vm *vm_create_barebones(void) -{ - return ____vm_create(VM_SHAPE_DEFAULT); -} - -#ifdef __x86_64__ -static inline struct kvm_vm *vm_create_barebones_protected_vm(void) -{ - const struct vm_shape shape = { - .mode = VM_MODE_DEFAULT, - .type = KVM_X86_SW_PROTECTED_VM, - }; - - return ____vm_create(shape); -} -#endif - -static inline struct kvm_vm *vm_create(uint32_t nr_runnable_vcpus) -{ - return __vm_create(VM_SHAPE_DEFAULT, nr_runnable_vcpus, 0); -} - -struct kvm_vm *__vm_create_with_vcpus(struct vm_shape shape, uint32_t nr_vcpus, - uint64_t extra_mem_pages, - void *guest_code, struct kvm_vcpu *vcpus[]); - -static inline struct kvm_vm *vm_create_with_vcpus(uint32_t nr_vcpus, - void *guest_code, - struct kvm_vcpu *vcpus[]) -{ - return __vm_create_with_vcpus(VM_SHAPE_DEFAULT, nr_vcpus, 0, - guest_code, vcpus); -} - - -struct kvm_vm *__vm_create_shape_with_one_vcpu(struct vm_shape shape, - struct kvm_vcpu **vcpu, - uint64_t extra_mem_pages, - void *guest_code); - -/* - * Create a VM with a single vCPU with reasonable defaults and @extra_mem_pages - * additional pages of guest memory. Returns the VM and vCPU (via out param). 
- */ -static inline struct kvm_vm *__vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, - uint64_t extra_mem_pages, - void *guest_code) -{ - return __vm_create_shape_with_one_vcpu(VM_SHAPE_DEFAULT, vcpu, - extra_mem_pages, guest_code); -} - -static inline struct kvm_vm *vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, - void *guest_code) -{ - return __vm_create_with_one_vcpu(vcpu, 0, guest_code); -} - -static inline struct kvm_vm *vm_create_shape_with_one_vcpu(struct vm_shape shape, - struct kvm_vcpu **vcpu, - void *guest_code) -{ - return __vm_create_shape_with_one_vcpu(shape, vcpu, 0, guest_code); -} - -struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm); - -void kvm_pin_this_task_to_pcpu(uint32_t pcpu); -void kvm_print_vcpu_pinning_help(void); -void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[], - int nr_vcpus); - -unsigned long vm_compute_max_gfn(struct kvm_vm *vm); -unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size); -unsigned int vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages); -unsigned int vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages); -static inline unsigned int -vm_adjust_num_guest_pages(enum vm_guest_mode mode, unsigned int num_guest_pages) -{ - unsigned int n; - n = vm_num_guest_pages(mode, vm_num_host_pages(mode, num_guest_pages)); -#ifdef __s390x__ - /* s390 requires 1M aligned guest sizes */ - n = (n + 255) & ~255; -#endif - return n; -} - -#define sync_global_to_guest(vm, g) ({ \ - typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \ - memcpy(_p, &(g), sizeof(g)); \ -}) - -#define sync_global_from_guest(vm, g) ({ \ - typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \ - memcpy(&(g), _p, sizeof(g)); \ -}) - -/* - * Write a global value, but only in the VM's (guest's) domain. 
Primarily used - * for "globals" that hold per-VM values (VMs always duplicate code and global - * data into their own region of physical memory), but can be used anytime it's - * undesirable to change the host's copy of the global. - */ -#define write_guest_global(vm, g, val) ({ \ - typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \ - typeof(g) _val = val; \ - \ - memcpy(_p, &(_val), sizeof(g)); \ -}) - -void assert_on_unhandled_exception(struct kvm_vcpu *vcpu); - -void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, - uint8_t indent); - -static inline void vcpu_dump(FILE *stream, struct kvm_vcpu *vcpu, - uint8_t indent) -{ - vcpu_arch_dump(stream, vcpu, indent); -} - -/* - * Adds a vCPU with reasonable defaults (e.g. a stack) - * - * Input Args: - * vm - Virtual Machine - * vcpu_id - The id of the VCPU to add to the VM. - */ -struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id); -void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code); - -static inline struct kvm_vcpu *vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, - void *guest_code) -{ - struct kvm_vcpu *vcpu = vm_arch_vcpu_add(vm, vcpu_id); - - vcpu_arch_set_entry_point(vcpu, guest_code); - - return vcpu; -} - -/* Re-create a vCPU after restarting a VM, e.g. for state save/restore tests. 
*/ -struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm, uint32_t vcpu_id); - -static inline struct kvm_vcpu *vm_vcpu_recreate(struct kvm_vm *vm, - uint32_t vcpu_id) -{ - return vm_arch_vcpu_recreate(vm, vcpu_id); -} - -void vcpu_arch_free(struct kvm_vcpu *vcpu); - -void virt_arch_pgd_alloc(struct kvm_vm *vm); - -static inline void virt_pgd_alloc(struct kvm_vm *vm) -{ - virt_arch_pgd_alloc(vm); -} - -/* - * VM Virtual Page Map - * - * Input Args: - * vm - Virtual Machine - * vaddr - VM Virtual Address - * paddr - VM Physical Address - * memslot - Memory region slot for new virtual translation tables - * - * Output Args: None - * - * Return: None - * - * Within @vm, creates a virtual translation for the page starting - * at @vaddr to the page starting at @paddr. - */ -void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr); - -static inline void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) -{ - virt_arch_pg_map(vm, vaddr, paddr); -} - - -/* - * Address Guest Virtual to Guest Physical - * - * Input Args: - * vm - Virtual Machine - * gva - VM virtual address - * - * Output Args: None - * - * Return: - * Equivalent VM physical address - * - * Returns the VM physical address of the translated VM virtual - * address given by @gva. - */ -vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva); - -static inline vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) -{ - return addr_arch_gva2gpa(vm, gva); -} - -/* - * Virtual Translation Tables Dump - * - * Input Args: - * stream - Output FILE stream - * vm - Virtual Machine - * indent - Left margin indent amount - * - * Output Args: None - * - * Return: None - * - * Dumps to the FILE stream given by @stream, the contents of all the - * virtual translation tables for the VM given by @vm. 
- */ -void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); - -static inline void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) -{ - virt_arch_dump(stream, vm, indent); -} - - -static inline int __vm_disable_nx_huge_pages(struct kvm_vm *vm) -{ - return __vm_enable_cap(vm, KVM_CAP_VM_DISABLE_NX_HUGE_PAGES, 0); -} - -/* - * Arch hook that is invoked via a constructor, i.e. before exeucting main(), - * to allow for arch-specific setup that is common to all tests, e.g. computing - * the default guest "mode". - */ -void kvm_selftest_arch_init(void); - -void kvm_arch_vm_post_create(struct kvm_vm *vm); - -bool vm_is_gpa_protected(struct kvm_vm *vm, vm_paddr_t paddr); - -uint32_t guest_get_vcpuid(void); - -#endif /* SELFTEST_KVM_UTIL_BASE_H */ diff --git a/tools/testing/selftests/kvm/include/kvm_util_types.h b/tools/testing/selftests/kvm/include/kvm_util_types.h new file mode 100644 index 0000000000..ec787b97cf --- /dev/null +++ b/tools/testing/selftests/kvm/include/kvm_util_types.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTEST_KVM_UTIL_TYPES_H +#define SELFTEST_KVM_UTIL_TYPES_H + +/* + * Provide a version of static_assert() that is guaranteed to have an optional + * message param. _GNU_SOURCE is defined for all KVM selftests, _GNU_SOURCE + * implies _ISOC11_SOURCE, and if _ISOC11_SOURCE is defined, glibc #undefs and + * #defines static_assert() as a direct alias to _Static_assert() (see + * usr/include/assert.h). Define a custom macro instead of redefining + * static_assert() to avoid creating non-deterministic behavior that is + * dependent on include order. + */ +#define __kvm_static_assert(expr, msg, ...) _Static_assert(expr, msg) +#define kvm_static_assert(expr, ...) 
__kvm_static_assert(expr, ##__VA_ARGS__, #expr) + +typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */ +typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */ + +#endif /* SELFTEST_KVM_UTIL_TYPES_H */ diff --git a/tools/testing/selftests/kvm/include/memstress.h b/tools/testing/selftests/kvm/include/memstress.h index ce4e603050..9071eb6dea 100644 --- a/tools/testing/selftests/kvm/include/memstress.h +++ b/tools/testing/selftests/kvm/include/memstress.h @@ -62,7 +62,6 @@ struct kvm_vm *memstress_create_vm(enum vm_guest_mode mode, int nr_vcpus, void memstress_destroy_vm(struct kvm_vm *vm); void memstress_set_write_percent(struct kvm_vm *vm, uint32_t write_percent); -void memstress_set_random_seed(struct kvm_vm *vm, uint32_t random_seed); void memstress_set_random_access(struct kvm_vm *vm, bool random_access); void memstress_start_vcpu_threads(int vcpus, void (*vcpu_fn)(struct memstress_vcpu_args *)); diff --git a/tools/testing/selftests/kvm/include/riscv/processor.h b/tools/testing/selftests/kvm/include/riscv/processor.h index ce473fe251..5f38916633 100644 --- a/tools/testing/selftests/kvm/include/riscv/processor.h +++ b/tools/testing/selftests/kvm/include/riscv/processor.h @@ -50,6 +50,16 @@ static inline uint64_t __kvm_reg_id(uint64_t type, uint64_t subtype, bool __vcpu_has_ext(struct kvm_vcpu *vcpu, uint64_t ext); +static inline bool __vcpu_has_isa_ext(struct kvm_vcpu *vcpu, uint64_t isa_ext) +{ + return __vcpu_has_ext(vcpu, RISCV_ISA_EXT_REG(isa_ext)); +} + +static inline bool __vcpu_has_sbi_ext(struct kvm_vcpu *vcpu, uint64_t sbi_ext) +{ + return __vcpu_has_ext(vcpu, RISCV_SBI_EXT_REG(sbi_ext)); +} + struct ex_regs { unsigned long ra; unsigned long sp; @@ -154,45 +164,6 @@ void vm_install_interrupt_handler(struct kvm_vm *vm, exception_handler_fn handle #define PGTBL_PAGE_SIZE PGTBL_L0_BLOCK_SIZE #define PGTBL_PAGE_SIZE_SHIFT PGTBL_L0_BLOCK_SHIFT -/* SBI return error codes */ -#define SBI_SUCCESS 0 -#define 
SBI_ERR_FAILURE -1 -#define SBI_ERR_NOT_SUPPORTED -2 -#define SBI_ERR_INVALID_PARAM -3 -#define SBI_ERR_DENIED -4 -#define SBI_ERR_INVALID_ADDRESS -5 -#define SBI_ERR_ALREADY_AVAILABLE -6 -#define SBI_ERR_ALREADY_STARTED -7 -#define SBI_ERR_ALREADY_STOPPED -8 - -#define SBI_EXT_EXPERIMENTAL_START 0x08000000 -#define SBI_EXT_EXPERIMENTAL_END 0x08FFFFFF - -#define KVM_RISCV_SELFTESTS_SBI_EXT SBI_EXT_EXPERIMENTAL_END -#define KVM_RISCV_SELFTESTS_SBI_UCALL 0 -#define KVM_RISCV_SELFTESTS_SBI_UNEXP 1 - -enum sbi_ext_id { - SBI_EXT_BASE = 0x10, - SBI_EXT_STA = 0x535441, -}; - -enum sbi_ext_base_fid { - SBI_EXT_BASE_PROBE_EXT = 3, -}; - -struct sbiret { - long error; - long value; -}; - -struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0, - unsigned long arg1, unsigned long arg2, - unsigned long arg3, unsigned long arg4, - unsigned long arg5); - -bool guest_sbi_probe_extension(int extid, long *out_val); - static inline void local_irq_enable(void) { csr_set(CSR_SSTATUS, SR_SIE); diff --git a/tools/testing/selftests/kvm/include/riscv/sbi.h b/tools/testing/selftests/kvm/include/riscv/sbi.h new file mode 100644 index 0000000000..046b432ae8 --- /dev/null +++ b/tools/testing/selftests/kvm/include/riscv/sbi.h @@ -0,0 +1,141 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * RISC-V SBI specific definitions + * + * Copyright (C) 2024 Rivos Inc. 
+ */ + +#ifndef SELFTEST_KVM_SBI_H +#define SELFTEST_KVM_SBI_H + +/* SBI spec version fields */ +#define SBI_SPEC_VERSION_DEFAULT 0x1 +#define SBI_SPEC_VERSION_MAJOR_SHIFT 24 +#define SBI_SPEC_VERSION_MAJOR_MASK 0x7f +#define SBI_SPEC_VERSION_MINOR_MASK 0xffffff + +/* SBI return error codes */ +#define SBI_SUCCESS 0 +#define SBI_ERR_FAILURE -1 +#define SBI_ERR_NOT_SUPPORTED -2 +#define SBI_ERR_INVALID_PARAM -3 +#define SBI_ERR_DENIED -4 +#define SBI_ERR_INVALID_ADDRESS -5 +#define SBI_ERR_ALREADY_AVAILABLE -6 +#define SBI_ERR_ALREADY_STARTED -7 +#define SBI_ERR_ALREADY_STOPPED -8 + +#define SBI_EXT_EXPERIMENTAL_START 0x08000000 +#define SBI_EXT_EXPERIMENTAL_END 0x08FFFFFF + +#define KVM_RISCV_SELFTESTS_SBI_EXT SBI_EXT_EXPERIMENTAL_END +#define KVM_RISCV_SELFTESTS_SBI_UCALL 0 +#define KVM_RISCV_SELFTESTS_SBI_UNEXP 1 + +enum sbi_ext_id { + SBI_EXT_BASE = 0x10, + SBI_EXT_STA = 0x535441, + SBI_EXT_PMU = 0x504D55, +}; + +enum sbi_ext_base_fid { + SBI_EXT_BASE_GET_SPEC_VERSION = 0, + SBI_EXT_BASE_GET_IMP_ID, + SBI_EXT_BASE_GET_IMP_VERSION, + SBI_EXT_BASE_PROBE_EXT = 3, +}; +enum sbi_ext_pmu_fid { + SBI_EXT_PMU_NUM_COUNTERS = 0, + SBI_EXT_PMU_COUNTER_GET_INFO, + SBI_EXT_PMU_COUNTER_CFG_MATCH, + SBI_EXT_PMU_COUNTER_START, + SBI_EXT_PMU_COUNTER_STOP, + SBI_EXT_PMU_COUNTER_FW_READ, + SBI_EXT_PMU_COUNTER_FW_READ_HI, + SBI_EXT_PMU_SNAPSHOT_SET_SHMEM, +}; + +union sbi_pmu_ctr_info { + unsigned long value; + struct { + unsigned long csr:12; + unsigned long width:6; +#if __riscv_xlen == 32 + unsigned long reserved:13; +#else + unsigned long reserved:45; +#endif + unsigned long type:1; + }; +}; + +struct riscv_pmu_snapshot_data { + u64 ctr_overflow_mask; + u64 ctr_values[64]; + u64 reserved[447]; +}; + +struct sbiret { + long error; + long value; +}; + +/** General pmu event codes specified in SBI PMU extension */ +enum sbi_pmu_hw_generic_events_t { + SBI_PMU_HW_NO_EVENT = 0, + SBI_PMU_HW_CPU_CYCLES = 1, + SBI_PMU_HW_INSTRUCTIONS = 2, + SBI_PMU_HW_CACHE_REFERENCES = 3, + 
SBI_PMU_HW_CACHE_MISSES = 4, + SBI_PMU_HW_BRANCH_INSTRUCTIONS = 5, + SBI_PMU_HW_BRANCH_MISSES = 6, + SBI_PMU_HW_BUS_CYCLES = 7, + SBI_PMU_HW_STALLED_CYCLES_FRONTEND = 8, + SBI_PMU_HW_STALLED_CYCLES_BACKEND = 9, + SBI_PMU_HW_REF_CPU_CYCLES = 10, + + SBI_PMU_HW_GENERAL_MAX, +}; + +/* SBI PMU counter types */ +enum sbi_pmu_ctr_type { + SBI_PMU_CTR_TYPE_HW = 0x0, + SBI_PMU_CTR_TYPE_FW, +}; + +/* Flags defined for config matching function */ +#define SBI_PMU_CFG_FLAG_SKIP_MATCH BIT(0) +#define SBI_PMU_CFG_FLAG_CLEAR_VALUE BIT(1) +#define SBI_PMU_CFG_FLAG_AUTO_START BIT(2) +#define SBI_PMU_CFG_FLAG_SET_VUINH BIT(3) +#define SBI_PMU_CFG_FLAG_SET_VSINH BIT(4) +#define SBI_PMU_CFG_FLAG_SET_UINH BIT(5) +#define SBI_PMU_CFG_FLAG_SET_SINH BIT(6) +#define SBI_PMU_CFG_FLAG_SET_MINH BIT(7) + +/* Flags defined for counter start function */ +#define SBI_PMU_START_FLAG_SET_INIT_VALUE BIT(0) +#define SBI_PMU_START_FLAG_INIT_SNAPSHOT BIT(1) + +/* Flags defined for counter stop function */ +#define SBI_PMU_STOP_FLAG_RESET BIT(0) +#define SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT BIT(1) + +struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0, + unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, + unsigned long arg5); + +bool guest_sbi_probe_extension(int extid, long *out_val); + +/* Make SBI version */ +static inline unsigned long sbi_mk_version(unsigned long major, + unsigned long minor) +{ + return ((major & SBI_SPEC_VERSION_MAJOR_MASK) << SBI_SPEC_VERSION_MAJOR_SHIFT) + | (minor & SBI_SPEC_VERSION_MINOR_MASK); +} + +unsigned long get_host_sbi_spec_version(void); + +#endif /* SELFTEST_KVM_SBI_H */ diff --git a/tools/testing/selftests/kvm/include/riscv/ucall.h b/tools/testing/selftests/kvm/include/riscv/ucall.h index be46eb32ec..a695ae36f3 100644 --- a/tools/testing/selftests/kvm/include/riscv/ucall.h +++ b/tools/testing/selftests/kvm/include/riscv/ucall.h @@ -3,6 +3,7 @@ #define SELFTEST_KVM_UCALL_H #include "processor.h" +#include "sbi.h" #define 
UCALL_EXIT_REASON KVM_EXIT_RISCV_SBI diff --git a/tools/testing/selftests/kvm/include/s390x/ucall.h b/tools/testing/selftests/kvm/include/s390x/ucall.h index b231bf2e49..8035a872a3 100644 --- a/tools/testing/selftests/kvm/include/s390x/ucall.h +++ b/tools/testing/selftests/kvm/include/s390x/ucall.h @@ -2,7 +2,7 @@ #ifndef SELFTEST_KVM_UCALL_H #define SELFTEST_KVM_UCALL_H -#include "kvm_util_base.h" +#include "kvm_util.h" #define UCALL_EXIT_REASON KVM_EXIT_S390_SIEIC diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index 8a6e30612c..3e47305884 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -91,9 +91,28 @@ struct guest_random_state { uint32_t seed; }; +extern uint32_t guest_random_seed; +extern struct guest_random_state guest_rng; + struct guest_random_state new_guest_random_state(uint32_t seed); uint32_t guest_random_u32(struct guest_random_state *state); +static inline bool __guest_random_bool(struct guest_random_state *state, + uint8_t percent) +{ + return (guest_random_u32(state) % 100) < percent; +} + +static inline bool guest_random_bool(struct guest_random_state *state) +{ + return __guest_random_bool(state, 50); +} + +static inline uint64_t guest_random_u64(struct guest_random_state *state) +{ + return ((uint64_t)guest_random_u32(state) << 32) | guest_random_u32(state); +} + enum vm_mem_backing_src_type { VM_MEM_SRC_ANONYMOUS, VM_MEM_SRC_ANONYMOUS_THP, diff --git a/tools/testing/selftests/kvm/include/userfaultfd_util.h b/tools/testing/selftests/kvm/include/userfaultfd_util.h index 877449c345..60f7f9d435 100644 --- a/tools/testing/selftests/kvm/include/userfaultfd_util.h +++ b/tools/testing/selftests/kvm/include/userfaultfd_util.h @@ -5,9 +5,6 @@ * Copyright (C) 2018, Red Hat, Inc. 
* Copyright (C) 2019-2022 Google LLC */ - -#define _GNU_SOURCE /* for pipe2 */ - #include #include #include @@ -17,17 +14,27 @@ typedef int (*uffd_handler_t)(int uffd_mode, int uffd, struct uffd_msg *msg); -struct uffd_desc { +struct uffd_reader_args { int uffd_mode; int uffd; - int pipefds[2]; useconds_t delay; uffd_handler_t handler; - pthread_t thread; + /* Holds the read end of the pipe for killing the reader. */ + int pipe; +}; + +struct uffd_desc { + int uffd; + uint64_t num_readers; + /* Holds the write ends of the pipes for killing the readers. */ + int *pipefds; + pthread_t *readers; + struct uffd_reader_args *reader_args; }; struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, void *hva, uint64_t len, + uint64_t num_readers, uffd_handler_t handler); void uffd_stop_demand_paging(struct uffd_desc *uffd); diff --git a/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h index 9f1725192a..972bb1c4ab 100644 --- a/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h @@ -5,7 +5,16 @@ #include #include +#include "kvm_util_types.h" +#include "test_util.h" + +extern bool is_forced_emulation_enabled; + struct kvm_vm_arch { + vm_vaddr_t gdt; + vm_vaddr_t tss; + vm_vaddr_t idt; + uint64_t c_bit; uint64_t s_bit; int sev_fd; @@ -20,4 +29,23 @@ static inline bool __vm_arch_has_protected_memory(struct kvm_vm_arch *arch) #define vm_arch_has_protected_memory(vm) \ __vm_arch_has_protected_memory(&(vm)->arch) +#define vcpu_arch_put_guest(mem, __val) \ +do { \ + const typeof(mem) val = (__val); \ + \ + if (!is_forced_emulation_enabled || guest_random_bool(&guest_rng)) { \ + (mem) = val; \ + } else if (guest_random_bool(&guest_rng)) { \ + __asm__ __volatile__(KVM_FEP "mov %1, %0" \ + : "+m" (mem) \ + : "r" (val) : "memory"); \ + } else { \ + uint64_t __old = READ_ONCE(mem); \ + \ + __asm__ __volatile__(KVM_FEP 
LOCK_PREFIX "cmpxchg %[new], %[ptr]" \ + : [ptr] "+m" (mem), [old] "+a" (__old) \ + : [new]"r" (val) : "memory", "cc"); \ + } \ +} while (0) + #endif // SELFTEST_KVM_UTIL_ARCH_H diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 81ce37ec40..c0c7c1fe93 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -18,17 +18,12 @@ #include #include -#include "../kvm_util.h" +#include "kvm_util.h" +#include "ucall_common.h" extern bool host_cpu_is_intel; extern bool host_cpu_is_amd; -enum vm_guest_x86_subtype { - VM_SUBTYPE_NONE = 0, - VM_SUBTYPE_SEV, - VM_SUBTYPE_SEV_ES, -}; - /* Forced emulation prefix, used to invoke the emulator unconditionally. */ #define KVM_FEP "ud2; .byte 'k', 'v', 'm';" @@ -282,6 +277,7 @@ struct kvm_x86_cpu_property { #define X86_PROPERTY_MAX_EXT_LEAF KVM_X86_CPU_PROPERTY(0x80000000, 0, EAX, 0, 31) #define X86_PROPERTY_MAX_PHY_ADDR KVM_X86_CPU_PROPERTY(0x80000008, 0, EAX, 0, 7) #define X86_PROPERTY_MAX_VIRT_ADDR KVM_X86_CPU_PROPERTY(0x80000008, 0, EAX, 8, 15) +#define X86_PROPERTY_GUEST_MAX_PHY_ADDR KVM_X86_CPU_PROPERTY(0x80000008, 0, EAX, 16, 23) #define X86_PROPERTY_SEV_C_BIT KVM_X86_CPU_PROPERTY(0x8000001F, 0, EBX, 0, 5) #define X86_PROPERTY_PHYS_ADDR_REDUCTION KVM_X86_CPU_PROPERTY(0x8000001F, 0, EBX, 6, 11) @@ -1139,8 +1135,6 @@ struct idt_entry { uint32_t offset2; uint32_t reserved; }; -void vm_init_descriptor_tables(struct kvm_vm *vm); -void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu); void vm_install_exception_handler(struct kvm_vm *vm, int vector, void (*handler)(struct ex_regs *)); diff --git a/tools/testing/selftests/kvm/include/x86_64/sev.h b/tools/testing/selftests/kvm/include/x86_64/sev.h index 8a1bf88474..82c11c81a9 100644 --- a/tools/testing/selftests/kvm/include/x86_64/sev.h +++ b/tools/testing/selftests/kvm/include/x86_64/sev.h @@ -31,8 +31,9 @@ void sev_vm_launch(struct 
kvm_vm *vm, uint32_t policy); void sev_vm_launch_measure(struct kvm_vm *vm, uint8_t *measurement); void sev_vm_launch_finish(struct kvm_vm *vm); -struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t policy, void *guest_code, +struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t type, void *guest_code, struct kvm_vcpu **cpu); +void vm_sev_launch(struct kvm_vm *vm, uint32_t policy, uint8_t *measurement); kvm_static_assert(SEV_RET_SUCCESS == 0); @@ -67,20 +68,8 @@ kvm_static_assert(SEV_RET_SUCCESS == 0); __TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd, ret, vm); \ }) -static inline void sev_vm_init(struct kvm_vm *vm) -{ - vm->arch.sev_fd = open_sev_dev_path_or_exit(); - - vm_sev_ioctl(vm, KVM_SEV_INIT, NULL); -} - - -static inline void sev_es_vm_init(struct kvm_vm *vm) -{ - vm->arch.sev_fd = open_sev_dev_path_or_exit(); - - vm_sev_ioctl(vm, KVM_SEV_ES_INIT, NULL); -} +void sev_vm_init(struct kvm_vm *vm); +void sev_es_vm_init(struct kvm_vm *vm); static inline void sev_register_encrypted_memory(struct kvm_vm *vm, struct userspace_mem_region *region) diff --git a/tools/testing/selftests/kvm/include/x86_64/ucall.h b/tools/testing/selftests/kvm/include/x86_64/ucall.h index 06b244bd06..d3825dcc3c 100644 --- a/tools/testing/selftests/kvm/include/x86_64/ucall.h +++ b/tools/testing/selftests/kvm/include/x86_64/ucall.h @@ -2,7 +2,7 @@ #ifndef SELFTEST_KVM_UCALL_H #define SELFTEST_KVM_UCALL_H -#include "kvm_util_base.h" +#include "kvm_util.h" #define UCALL_EXIT_REASON KVM_EXIT_IO diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index 698c1cfa31..f02355c3c4 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -6,8 +6,6 @@ * * Test the fd-based interface for KVM statistics. 
*/ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c index b9e23265e4..c78f34699f 100644 --- a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c +++ b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c @@ -6,8 +6,6 @@ * * Test for KVM_CAP_MAX_VCPUS and KVM_CAP_MAX_VCPU_ID. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index e0ba97ac1c..dd8b12f626 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -8,9 +8,6 @@ * page size have been pre-allocated on your system, if you are planning to * use hugepages to back the guest memory for testing. */ - -#define _GNU_SOURCE /* for program_invocation_name */ - #include #include #include @@ -21,6 +18,7 @@ #include "kvm_util.h" #include "processor.h" #include "guest_modes.h" +#include "ucall_common.h" #define TEST_MEM_SLOT_INDEX 1 diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic.c b/tools/testing/selftests/kvm/lib/aarch64/gic.c index 55668631d5..7abbf88665 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/gic.c +++ b/tools/testing/selftests/kvm/lib/aarch64/gic.c @@ -17,13 +17,12 @@ static const struct gic_common_ops *gic_common_ops; static struct spinlock gic_lock; -static void gic_cpu_init(unsigned int cpu, void *redist_base) +static void gic_cpu_init(unsigned int cpu) { - gic_common_ops->gic_cpu_init(cpu, redist_base); + gic_common_ops->gic_cpu_init(cpu); } -static void -gic_dist_init(enum gic_type type, unsigned int nr_cpus, void *dist_base) +static void gic_dist_init(enum gic_type type, unsigned int nr_cpus) { const struct gic_common_ops *gic_ops = NULL; @@ -40,7 +39,7 @@ gic_dist_init(enum gic_type type, unsigned int nr_cpus, void 
*dist_base) GUEST_ASSERT(gic_ops); - gic_ops->gic_init(nr_cpus, dist_base); + gic_ops->gic_init(nr_cpus); gic_common_ops = gic_ops; /* Make sure that the initialized data is visible to all the vCPUs */ @@ -49,18 +48,15 @@ gic_dist_init(enum gic_type type, unsigned int nr_cpus, void *dist_base) spin_unlock(&gic_lock); } -void gic_init(enum gic_type type, unsigned int nr_cpus, - void *dist_base, void *redist_base) +void gic_init(enum gic_type type, unsigned int nr_cpus) { uint32_t cpu = guest_get_vcpuid(); GUEST_ASSERT(type < GIC_TYPE_MAX); - GUEST_ASSERT(dist_base); - GUEST_ASSERT(redist_base); GUEST_ASSERT(nr_cpus); - gic_dist_init(type, nr_cpus, dist_base); - gic_cpu_init(cpu, redist_base); + gic_dist_init(type, nr_cpus); + gic_cpu_init(cpu); } void gic_irq_enable(unsigned int intid) diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic_private.h b/tools/testing/selftests/kvm/lib/aarch64/gic_private.h index 75d07313c8..d24e9ecc96 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/gic_private.h +++ b/tools/testing/selftests/kvm/lib/aarch64/gic_private.h @@ -8,8 +8,8 @@ #define SELFTEST_KVM_GIC_PRIVATE_H struct gic_common_ops { - void (*gic_init)(unsigned int nr_cpus, void *dist_base); - void (*gic_cpu_init)(unsigned int cpu, void *redist_base); + void (*gic_init)(unsigned int nr_cpus); + void (*gic_cpu_init)(unsigned int cpu); void (*gic_irq_enable)(unsigned int intid); void (*gic_irq_disable)(unsigned int intid); uint64_t (*gic_read_iar)(void); diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c b/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c index 263bf3ed8f..66d05506f7 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c +++ b/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c @@ -9,12 +9,21 @@ #include "processor.h" #include "delay.h" +#include "gic.h" #include "gic_v3.h" #include "gic_private.h" +#define GICV3_MAX_CPUS 512 + +#define GICD_INT_DEF_PRI 0xa0 +#define GICD_INT_DEF_PRI_X4 ((GICD_INT_DEF_PRI << 24) |\ + (GICD_INT_DEF_PRI << 
16) |\ + (GICD_INT_DEF_PRI << 8) |\ + GICD_INT_DEF_PRI) + +#define ICC_PMR_DEF_PRIO 0xf0 + struct gicv3_data { - void *dist_base; - void *redist_base[GICV3_MAX_CPUS]; unsigned int nr_cpus; unsigned int nr_spis; }; @@ -35,17 +44,23 @@ static void gicv3_gicd_wait_for_rwp(void) { unsigned int count = 100000; /* 1s */ - while (readl(gicv3_data.dist_base + GICD_CTLR) & GICD_CTLR_RWP) { + while (readl(GICD_BASE_GVA + GICD_CTLR) & GICD_CTLR_RWP) { GUEST_ASSERT(count--); udelay(10); } } -static void gicv3_gicr_wait_for_rwp(void *redist_base) +static inline volatile void *gicr_base_cpu(uint32_t cpu) +{ + /* Align all the redistributors sequentially */ + return GICR_BASE_GVA + cpu * SZ_64K * 2; +} + +static void gicv3_gicr_wait_for_rwp(uint32_t cpu) { unsigned int count = 100000; /* 1s */ - while (readl(redist_base + GICR_CTLR) & GICR_CTLR_RWP) { + while (readl(gicr_base_cpu(cpu) + GICR_CTLR) & GICR_CTLR_RWP) { GUEST_ASSERT(count--); udelay(10); } @@ -56,7 +71,7 @@ static void gicv3_wait_for_rwp(uint32_t cpu_or_dist) if (cpu_or_dist & DIST_BIT) gicv3_gicd_wait_for_rwp(); else - gicv3_gicr_wait_for_rwp(gicv3_data.redist_base[cpu_or_dist]); + gicv3_gicr_wait_for_rwp(cpu_or_dist); } static enum gicv3_intid_range get_intid_range(unsigned int intid) @@ -116,15 +131,15 @@ static void gicv3_set_eoi_split(bool split) uint32_t gicv3_reg_readl(uint32_t cpu_or_dist, uint64_t offset) { - void *base = cpu_or_dist & DIST_BIT ? gicv3_data.dist_base - : sgi_base_from_redist(gicv3_data.redist_base[cpu_or_dist]); + volatile void *base = cpu_or_dist & DIST_BIT ? GICD_BASE_GVA + : sgi_base_from_redist(gicr_base_cpu(cpu_or_dist)); return readl(base + offset); } void gicv3_reg_writel(uint32_t cpu_or_dist, uint64_t offset, uint32_t reg_val) { - void *base = cpu_or_dist & DIST_BIT ? gicv3_data.dist_base - : sgi_base_from_redist(gicv3_data.redist_base[cpu_or_dist]); + volatile void *base = cpu_or_dist & DIST_BIT ? 
GICD_BASE_GVA + : sgi_base_from_redist(gicr_base_cpu(cpu_or_dist)); writel(reg_val, base + offset); } @@ -263,7 +278,7 @@ static bool gicv3_irq_get_pending(uint32_t intid) return gicv3_read_reg(intid, GICD_ISPENDR, 32, 1); } -static void gicv3_enable_redist(void *redist_base) +static void gicv3_enable_redist(volatile void *redist_base) { uint32_t val = readl(redist_base + GICR_WAKER); unsigned int count = 100000; /* 1s */ @@ -278,21 +293,15 @@ static void gicv3_enable_redist(void *redist_base) } } -static inline void *gicr_base_cpu(void *redist_base, uint32_t cpu) +static void gicv3_cpu_init(unsigned int cpu) { - /* Align all the redistributors sequentially */ - return redist_base + cpu * SZ_64K * 2; -} - -static void gicv3_cpu_init(unsigned int cpu, void *redist_base) -{ - void *sgi_base; + volatile void *sgi_base; unsigned int i; - void *redist_base_cpu; + volatile void *redist_base_cpu; GUEST_ASSERT(cpu < gicv3_data.nr_cpus); - redist_base_cpu = gicr_base_cpu(redist_base, cpu); + redist_base_cpu = gicr_base_cpu(cpu); sgi_base = sgi_base_from_redist(redist_base_cpu); gicv3_enable_redist(redist_base_cpu); @@ -310,7 +319,7 @@ static void gicv3_cpu_init(unsigned int cpu, void *redist_base) writel(GICD_INT_DEF_PRI_X4, sgi_base + GICR_IPRIORITYR0 + i); - gicv3_gicr_wait_for_rwp(redist_base_cpu); + gicv3_gicr_wait_for_rwp(cpu); /* Enable the GIC system register (ICC_*) access */ write_sysreg_s(read_sysreg_s(SYS_ICC_SRE_EL1) | ICC_SRE_EL1_SRE, @@ -320,18 +329,15 @@ static void gicv3_cpu_init(unsigned int cpu, void *redist_base) write_sysreg_s(ICC_PMR_DEF_PRIO, SYS_ICC_PMR_EL1); /* Enable non-secure Group-1 interrupts */ - write_sysreg_s(ICC_IGRPEN1_EL1_ENABLE, SYS_ICC_GRPEN1_EL1); - - gicv3_data.redist_base[cpu] = redist_base_cpu; + write_sysreg_s(ICC_IGRPEN1_EL1_MASK, SYS_ICC_IGRPEN1_EL1); } static void gicv3_dist_init(void) { - void *dist_base = gicv3_data.dist_base; unsigned int i; /* Disable the distributor until we set things up */ - writel(0, dist_base + 
GICD_CTLR); + writel(0, GICD_BASE_GVA + GICD_CTLR); gicv3_gicd_wait_for_rwp(); /* @@ -339,33 +345,32 @@ static void gicv3_dist_init(void) * Also, deactivate and disable them. */ for (i = 32; i < gicv3_data.nr_spis; i += 32) { - writel(~0, dist_base + GICD_IGROUPR + i / 8); - writel(~0, dist_base + GICD_ICACTIVER + i / 8); - writel(~0, dist_base + GICD_ICENABLER + i / 8); + writel(~0, GICD_BASE_GVA + GICD_IGROUPR + i / 8); + writel(~0, GICD_BASE_GVA + GICD_ICACTIVER + i / 8); + writel(~0, GICD_BASE_GVA + GICD_ICENABLER + i / 8); } /* Set a default priority for all the SPIs */ for (i = 32; i < gicv3_data.nr_spis; i += 4) writel(GICD_INT_DEF_PRI_X4, - dist_base + GICD_IPRIORITYR + i); + GICD_BASE_GVA + GICD_IPRIORITYR + i); /* Wait for the settings to sync-in */ gicv3_gicd_wait_for_rwp(); /* Finally, enable the distributor globally with ARE */ writel(GICD_CTLR_ARE_NS | GICD_CTLR_ENABLE_G1A | - GICD_CTLR_ENABLE_G1, dist_base + GICD_CTLR); + GICD_CTLR_ENABLE_G1, GICD_BASE_GVA + GICD_CTLR); gicv3_gicd_wait_for_rwp(); } -static void gicv3_init(unsigned int nr_cpus, void *dist_base) +static void gicv3_init(unsigned int nr_cpus) { GUEST_ASSERT(nr_cpus <= GICV3_MAX_CPUS); gicv3_data.nr_cpus = nr_cpus; - gicv3_data.dist_base = dist_base; gicv3_data.nr_spis = GICD_TYPER_SPIS( - readl(gicv3_data.dist_base + GICD_TYPER)); + readl(GICD_BASE_GVA + GICD_TYPER)); if (gicv3_data.nr_spis > 1020) gicv3_data.nr_spis = 1020; @@ -396,3 +401,27 @@ const struct gic_common_ops gicv3_ops = { .gic_irq_get_pending = gicv3_irq_get_pending, .gic_irq_set_config = gicv3_irq_set_config, }; + +void gic_rdist_enable_lpis(vm_paddr_t cfg_table, size_t cfg_table_size, + vm_paddr_t pend_table) +{ + volatile void *rdist_base = gicr_base_cpu(guest_get_vcpuid()); + + u32 ctlr; + u64 val; + + val = (cfg_table | + GICR_PROPBASER_InnerShareable | + GICR_PROPBASER_RaWaWb | + ((ilog2(cfg_table_size) - 1) & GICR_PROPBASER_IDBITS_MASK)); + writeq_relaxed(val, rdist_base + GICR_PROPBASER); + + val = (pend_table | + 
GICR_PENDBASER_InnerShareable | + GICR_PENDBASER_RaWaWb); + writeq_relaxed(val, rdist_base + GICR_PENDBASER); + + ctlr = readl_relaxed(rdist_base + GICR_CTLR); + ctlr |= GICR_CTLR_ENABLE_LPIS; + writel_relaxed(ctlr, rdist_base + GICR_CTLR); +} diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic_v3_its.c b/tools/testing/selftests/kvm/lib/aarch64/gic_v3_its.c new file mode 100644 index 0000000000..09f2705456 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/aarch64/gic_v3_its.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Guest ITS library, generously donated by drivers/irqchip/irq-gic-v3-its.c + * over in the kernel tree. + */ + +#include +#include +#include +#include + +#include "kvm_util.h" +#include "vgic.h" +#include "gic.h" +#include "gic_v3.h" +#include "processor.h" + +static u64 its_read_u64(unsigned long offset) +{ + return readq_relaxed(GITS_BASE_GVA + offset); +} + +static void its_write_u64(unsigned long offset, u64 val) +{ + writeq_relaxed(val, GITS_BASE_GVA + offset); +} + +static u32 its_read_u32(unsigned long offset) +{ + return readl_relaxed(GITS_BASE_GVA + offset); +} + +static void its_write_u32(unsigned long offset, u32 val) +{ + writel_relaxed(val, GITS_BASE_GVA + offset); +} + +static unsigned long its_find_baser(unsigned int type) +{ + int i; + + for (i = 0; i < GITS_BASER_NR_REGS; i++) { + u64 baser; + unsigned long offset = GITS_BASER + (i * sizeof(baser)); + + baser = its_read_u64(offset); + if (GITS_BASER_TYPE(baser) == type) + return offset; + } + + GUEST_FAIL("Couldn't find an ITS BASER of type %u", type); + return -1; +} + +static void its_install_table(unsigned int type, vm_paddr_t base, size_t size) +{ + unsigned long offset = its_find_baser(type); + u64 baser; + + baser = ((size / SZ_64K) - 1) | + GITS_BASER_PAGE_SIZE_64K | + GITS_BASER_InnerShareable | + base | + GITS_BASER_RaWaWb | + GITS_BASER_VALID; + + its_write_u64(offset, baser); +} + +static void its_install_cmdq(vm_paddr_t base, size_t size) +{ + u64 
cbaser; + + cbaser = ((size / SZ_4K) - 1) | + GITS_CBASER_InnerShareable | + base | + GITS_CBASER_RaWaWb | + GITS_CBASER_VALID; + + its_write_u64(GITS_CBASER, cbaser); +} + +void its_init(vm_paddr_t coll_tbl, size_t coll_tbl_sz, + vm_paddr_t device_tbl, size_t device_tbl_sz, + vm_paddr_t cmdq, size_t cmdq_size) +{ + u32 ctlr; + + its_install_table(GITS_BASER_TYPE_COLLECTION, coll_tbl, coll_tbl_sz); + its_install_table(GITS_BASER_TYPE_DEVICE, device_tbl, device_tbl_sz); + its_install_cmdq(cmdq, cmdq_size); + + ctlr = its_read_u32(GITS_CTLR); + ctlr |= GITS_CTLR_ENABLE; + its_write_u32(GITS_CTLR, ctlr); +} + +struct its_cmd_block { + union { + u64 raw_cmd[4]; + __le64 raw_cmd_le[4]; + }; +}; + +static inline void its_fixup_cmd(struct its_cmd_block *cmd) +{ + /* Let's fixup BE commands */ + cmd->raw_cmd_le[0] = cpu_to_le64(cmd->raw_cmd[0]); + cmd->raw_cmd_le[1] = cpu_to_le64(cmd->raw_cmd[1]); + cmd->raw_cmd_le[2] = cpu_to_le64(cmd->raw_cmd[2]); + cmd->raw_cmd_le[3] = cpu_to_le64(cmd->raw_cmd[3]); +} + +static void its_mask_encode(u64 *raw_cmd, u64 val, int h, int l) +{ + u64 mask = GENMASK_ULL(h, l); + *raw_cmd &= ~mask; + *raw_cmd |= (val << l) & mask; +} + +static void its_encode_cmd(struct its_cmd_block *cmd, u8 cmd_nr) +{ + its_mask_encode(&cmd->raw_cmd[0], cmd_nr, 7, 0); +} + +static void its_encode_devid(struct its_cmd_block *cmd, u32 devid) +{ + its_mask_encode(&cmd->raw_cmd[0], devid, 63, 32); +} + +static void its_encode_event_id(struct its_cmd_block *cmd, u32 id) +{ + its_mask_encode(&cmd->raw_cmd[1], id, 31, 0); +} + +static void its_encode_phys_id(struct its_cmd_block *cmd, u32 phys_id) +{ + its_mask_encode(&cmd->raw_cmd[1], phys_id, 63, 32); +} + +static void its_encode_size(struct its_cmd_block *cmd, u8 size) +{ + its_mask_encode(&cmd->raw_cmd[1], size, 4, 0); +} + +static void its_encode_itt(struct its_cmd_block *cmd, u64 itt_addr) +{ + its_mask_encode(&cmd->raw_cmd[2], itt_addr >> 8, 51, 8); +} + +static void its_encode_valid(struct its_cmd_block *cmd, 
int valid) +{ + its_mask_encode(&cmd->raw_cmd[2], !!valid, 63, 63); +} + +static void its_encode_target(struct its_cmd_block *cmd, u64 target_addr) +{ + its_mask_encode(&cmd->raw_cmd[2], target_addr >> 16, 51, 16); +} + +static void its_encode_collection(struct its_cmd_block *cmd, u16 col) +{ + its_mask_encode(&cmd->raw_cmd[2], col, 15, 0); +} + +#define GITS_CMDQ_POLL_ITERATIONS 0 + +static void its_send_cmd(void *cmdq_base, struct its_cmd_block *cmd) +{ + u64 cwriter = its_read_u64(GITS_CWRITER); + struct its_cmd_block *dst = cmdq_base + cwriter; + u64 cbaser = its_read_u64(GITS_CBASER); + size_t cmdq_size; + u64 next; + int i; + + cmdq_size = ((cbaser & 0xFF) + 1) * SZ_4K; + + its_fixup_cmd(cmd); + + WRITE_ONCE(*dst, *cmd); + dsb(ishst); + next = (cwriter + sizeof(*cmd)) % cmdq_size; + its_write_u64(GITS_CWRITER, next); + + /* + * Polling isn't necessary considering KVM's ITS emulation at the time + * of writing this, as the CMDQ is processed synchronously after a write + * to CWRITER. 
+ */ + for (i = 0; its_read_u64(GITS_CREADR) != next; i++) { + __GUEST_ASSERT(i < GITS_CMDQ_POLL_ITERATIONS, + "ITS didn't process command at offset %lu after %d iterations\n", + cwriter, i); + + cpu_relax(); + } +} + +void its_send_mapd_cmd(void *cmdq_base, u32 device_id, vm_paddr_t itt_base, + size_t itt_size, bool valid) +{ + struct its_cmd_block cmd = {}; + + its_encode_cmd(&cmd, GITS_CMD_MAPD); + its_encode_devid(&cmd, device_id); + its_encode_size(&cmd, ilog2(itt_size) - 1); + its_encode_itt(&cmd, itt_base); + its_encode_valid(&cmd, valid); + + its_send_cmd(cmdq_base, &cmd); +} + +void its_send_mapc_cmd(void *cmdq_base, u32 vcpu_id, u32 collection_id, bool valid) +{ + struct its_cmd_block cmd = {}; + + its_encode_cmd(&cmd, GITS_CMD_MAPC); + its_encode_collection(&cmd, collection_id); + its_encode_target(&cmd, vcpu_id); + its_encode_valid(&cmd, valid); + + its_send_cmd(cmdq_base, &cmd); +} + +void its_send_mapti_cmd(void *cmdq_base, u32 device_id, u32 event_id, + u32 collection_id, u32 intid) +{ + struct its_cmd_block cmd = {}; + + its_encode_cmd(&cmd, GITS_CMD_MAPTI); + its_encode_devid(&cmd, device_id); + its_encode_event_id(&cmd, event_id); + its_encode_phys_id(&cmd, intid); + its_encode_collection(&cmd, collection_id); + + its_send_cmd(cmdq_base, &cmd); +} + +void its_send_invall_cmd(void *cmdq_base, u32 collection_id) +{ + struct its_cmd_block cmd = {}; + + its_encode_cmd(&cmd, GITS_CMD_INVALL); + its_encode_collection(&cmd, collection_id); + + its_send_cmd(cmdq_base, &cmd); +} diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index a9eb17295b..0ac7cc89f3 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -11,6 +11,8 @@ #include "guest_modes.h" #include "kvm_util.h" #include "processor.h" +#include "ucall_common.h" + #include #include diff --git a/tools/testing/selftests/kvm/lib/aarch64/vgic.c 
b/tools/testing/selftests/kvm/lib/aarch64/vgic.c index 184378d593..4427f43f73 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/vgic.c +++ b/tools/testing/selftests/kvm/lib/aarch64/vgic.c @@ -3,8 +3,10 @@ * ARM Generic Interrupt Controller (GIC) v3 host support */ +#include #include #include +#include #include #include @@ -19,8 +21,6 @@ * Input args: * vm - KVM VM * nr_vcpus - Number of vCPUs supported by this VM - * gicd_base_gpa - Guest Physical Address of the Distributor region - * gicr_base_gpa - Guest Physical Address of the Redistributor region * * Output args: None * @@ -30,11 +30,10 @@ * redistributor regions of the guest. Since it depends on the number of * vCPUs for the VM, it must be called after all the vCPUs have been created. */ -int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs, - uint64_t gicd_base_gpa, uint64_t gicr_base_gpa) +int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs) { int gic_fd; - uint64_t redist_attr; + uint64_t attr; struct list_head *iter; unsigned int nr_gic_pages, nr_vcpus_created = 0; @@ -60,18 +59,19 @@ int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs, kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); + attr = GICD_BASE_GPA; kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_DIST, &gicd_base_gpa); + KVM_VGIC_V3_ADDR_TYPE_DIST, &attr); nr_gic_pages = vm_calc_num_guest_pages(vm->mode, KVM_VGIC_V3_DIST_SIZE); - virt_map(vm, gicd_base_gpa, gicd_base_gpa, nr_gic_pages); + virt_map(vm, GICD_BASE_GPA, GICD_BASE_GPA, nr_gic_pages); /* Redistributor setup */ - redist_attr = REDIST_REGION_ATTR_ADDR(nr_vcpus, gicr_base_gpa, 0, 0); + attr = REDIST_REGION_ATTR_ADDR(nr_vcpus, GICR_BASE_GPA, 0, 0); kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &redist_attr); + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &attr); nr_gic_pages = 
vm_calc_num_guest_pages(vm->mode, KVM_VGIC_V3_REDIST_SIZE * nr_vcpus); - virt_map(vm, gicr_base_gpa, gicr_base_gpa, nr_gic_pages); + virt_map(vm, GICR_BASE_GPA, GICR_BASE_GPA, nr_gic_pages); kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); @@ -168,3 +168,21 @@ void kvm_irq_write_isactiver(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu) { vgic_poke_irq(gic_fd, intid, vcpu, GICD_ISACTIVER); } + +int vgic_its_setup(struct kvm_vm *vm) +{ + int its_fd = kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_ITS); + u64 attr; + + attr = GITS_BASE_GPA; + kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_ITS_ADDR_TYPE, &attr); + + kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); + + virt_map(vm, GITS_BASE_GPA, GITS_BASE_GPA, + vm_calc_num_guest_pages(vm->mode, KVM_VGIC_V3_ITS_SIZE)); + + return its_fd; +} diff --git a/tools/testing/selftests/kvm/lib/assert.c b/tools/testing/selftests/kvm/lib/assert.c index 2bd25b191d..b49690658c 100644 --- a/tools/testing/selftests/kvm/lib/assert.c +++ b/tools/testing/selftests/kvm/lib/assert.c @@ -4,9 +4,6 @@ * * Copyright (C) 2018, Google LLC. */ - -#define _GNU_SOURCE /* for getline(3) and strchrnul(3)*/ - #include "test_util.h" #include diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index b2262b5fad..ad00e47618 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -4,11 +4,10 @@ * * Copyright (C) 2018, Google LLC. 
*/ - -#define _GNU_SOURCE /* for program_invocation_name */ #include "test_util.h" #include "kvm_util.h" #include "processor.h" +#include "ucall_common.h" #include #include @@ -20,6 +19,9 @@ #define KVM_UTIL_MIN_PFN 2 +uint32_t guest_random_seed; +struct guest_random_state guest_rng; + static int vcpu_mmap_sz(void); int open_path_or_exit(const char *path, int flags) @@ -276,7 +278,6 @@ struct kvm_vm *____vm_create(struct vm_shape shape) vm->mode = shape.mode; vm->type = shape.type; - vm->subtype = shape.subtype; vm->pa_bits = vm_guest_mode_params[vm->mode].pa_bits; vm->va_bits = vm_guest_mode_params[vm->mode].va_bits; @@ -433,6 +434,10 @@ struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus, slot0 = memslot2region(vm, 0); ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size); + pr_info("Random seed: 0x%x\n", guest_random_seed); + guest_rng = new_guest_random_state(guest_random_seed); + sync_global_to_guest(vm, guest_rng); + kvm_arch_vm_post_create(vm); return vm; @@ -930,6 +935,10 @@ void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, errno, strerror(errno)); } +#define TEST_REQUIRE_SET_USER_MEMORY_REGION2() \ + __TEST_REQUIRE(kvm_has_cap(KVM_CAP_USER_MEMORY2), \ + "KVM selftests now require KVM_SET_USER_MEMORY_REGION2 (introduced in v6.8)") + int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, uint64_t gpa, uint64_t size, void *hva, uint32_t guest_memfd, uint64_t guest_memfd_offset) @@ -944,6 +953,8 @@ int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flag .guest_memfd_offset = guest_memfd_offset, }; + TEST_REQUIRE_SET_USER_MEMORY_REGION2(); + return ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION2, &region); } @@ -970,6 +981,8 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, size_t mem_size = npages * vm->page_size; size_t alignment; + TEST_REQUIRE_SET_USER_MEMORY_REGION2(); + 
TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages, "Number of guest pages is not compatible with the host. " "Try npages=%d", vm_adjust_num_guest_pages(vm->mode, npages)); @@ -2306,6 +2319,8 @@ void __attribute((constructor)) kvm_selftest_init(void) /* Tell stdout not to buffer its content. */ setbuf(stdout, NULL); + guest_random_seed = random(); + kvm_selftest_arch_init(); } diff --git a/tools/testing/selftests/kvm/lib/memstress.c b/tools/testing/selftests/kvm/lib/memstress.c index cf2c739713..313277486a 100644 --- a/tools/testing/selftests/kvm/lib/memstress.c +++ b/tools/testing/selftests/kvm/lib/memstress.c @@ -2,14 +2,13 @@ /* * Copyright (C) 2020, Google LLC. */ -#define _GNU_SOURCE - #include #include #include "kvm_util.h" #include "memstress.h" #include "processor.h" +#include "ucall_common.h" struct memstress_args memstress_args; @@ -56,7 +55,7 @@ void memstress_guest_code(uint32_t vcpu_idx) uint64_t page; int i; - rand_state = new_guest_random_state(args->random_seed + vcpu_idx); + rand_state = new_guest_random_state(guest_random_seed + vcpu_idx); gva = vcpu_args->gva; pages = vcpu_args->pages; @@ -76,7 +75,7 @@ void memstress_guest_code(uint32_t vcpu_idx) addr = gva + (page * args->guest_page_size); - if (guest_random_u32(&rand_state) % 100 < args->write_percent) + if (__guest_random_bool(&rand_state, args->write_percent)) *(uint64_t *)addr = 0x0123456789ABCDEF; else READ_ONCE(*(uint64_t *)addr); @@ -243,12 +242,6 @@ void memstress_set_write_percent(struct kvm_vm *vm, uint32_t write_percent) sync_global_to_guest(vm, memstress_args.write_percent); } -void memstress_set_random_seed(struct kvm_vm *vm, uint32_t random_seed) -{ - memstress_args.random_seed = random_seed; - sync_global_to_guest(vm, memstress_args.random_seed); -} - void memstress_set_random_access(struct kvm_vm *vm, bool random_access) { memstress_args.random_access = random_access; diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c 
b/tools/testing/selftests/kvm/lib/riscv/processor.c index e8211f5d68..6ae47b3d6b 100644 --- a/tools/testing/selftests/kvm/lib/riscv/processor.c +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -10,6 +10,7 @@ #include "kvm_util.h" #include "processor.h" +#include "ucall_common.h" #define DEFAULT_RISCV_GUEST_STACK_VADDR_MIN 0xac0000 @@ -502,3 +503,15 @@ bool guest_sbi_probe_extension(int extid, long *out_val) return true; } + +unsigned long get_host_sbi_spec_version(void) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_BASE, SBI_EXT_BASE_GET_SPEC_VERSION, 0, + 0, 0, 0, 0, 0); + + GUEST_ASSERT(!ret.error); + + return ret.value; +} diff --git a/tools/testing/selftests/kvm/lib/riscv/ucall.c b/tools/testing/selftests/kvm/lib/riscv/ucall.c index 14ee17151a..b5035c63d5 100644 --- a/tools/testing/selftests/kvm/lib/riscv/ucall.c +++ b/tools/testing/selftests/kvm/lib/riscv/ucall.c @@ -9,6 +9,7 @@ #include "kvm_util.h" #include "processor.h" +#include "sbi.h" void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu) { diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index 5a8f8becb1..8ed0b74ae8 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -4,8 +4,6 @@ * * Copyright (C) 2020, Google LLC. 
*/ - -#define _GNU_SOURCE #include #include #include diff --git a/tools/testing/selftests/kvm/lib/ucall_common.c b/tools/testing/selftests/kvm/lib/ucall_common.c index f5af65a41c..42151e5719 100644 --- a/tools/testing/selftests/kvm/lib/ucall_common.c +++ b/tools/testing/selftests/kvm/lib/ucall_common.c @@ -1,9 +1,12 @@ // SPDX-License-Identifier: GPL-2.0-only -#include "kvm_util.h" #include "linux/types.h" #include "linux/bitmap.h" #include "linux/atomic.h" +#include "kvm_util.h" +#include "ucall_common.h" + + #define GUEST_UCALL_FAILED -1 struct ucall_header { diff --git a/tools/testing/selftests/kvm/lib/userfaultfd_util.c b/tools/testing/selftests/kvm/lib/userfaultfd_util.c index f4eef6eb2d..7c9de84144 100644 --- a/tools/testing/selftests/kvm/lib/userfaultfd_util.c +++ b/tools/testing/selftests/kvm/lib/userfaultfd_util.c @@ -6,9 +6,6 @@ * Copyright (C) 2018, Red Hat, Inc. * Copyright (C) 2019-2022 Google LLC */ - -#define _GNU_SOURCE /* for pipe2 */ - #include #include #include @@ -16,6 +13,7 @@ #include #include #include +#include #include #include "kvm_util.h" @@ -27,76 +25,69 @@ static void *uffd_handler_thread_fn(void *arg) { - struct uffd_desc *uffd_desc = (struct uffd_desc *)arg; - int uffd = uffd_desc->uffd; - int pipefd = uffd_desc->pipefds[0]; - useconds_t delay = uffd_desc->delay; + struct uffd_reader_args *reader_args = (struct uffd_reader_args *)arg; + int uffd = reader_args->uffd; int64_t pages = 0; struct timespec start; struct timespec ts_diff; + struct epoll_event evt; + int epollfd; + + epollfd = epoll_create(1); + TEST_ASSERT(epollfd >= 0, "Failed to create epollfd."); + + evt.events = EPOLLIN | EPOLLEXCLUSIVE; + evt.data.u32 = 0; + TEST_ASSERT(!epoll_ctl(epollfd, EPOLL_CTL_ADD, uffd, &evt), + "Failed to add uffd to epollfd"); + + evt.events = EPOLLIN; + evt.data.u32 = 1; + TEST_ASSERT(!epoll_ctl(epollfd, EPOLL_CTL_ADD, reader_args->pipe, &evt), + "Failed to add pipe to epollfd"); clock_gettime(CLOCK_MONOTONIC, &start); while (1) { struct 
uffd_msg msg; - struct pollfd pollfd[2]; - char tmp_chr; int r; - pollfd[0].fd = uffd; - pollfd[0].events = POLLIN; - pollfd[1].fd = pipefd; - pollfd[1].events = POLLIN; + r = epoll_wait(epollfd, &evt, 1, -1); + TEST_ASSERT(r == 1, + "Unexpected number of events (%d) from epoll, errno = %d", + r, errno); - r = poll(pollfd, 2, -1); - switch (r) { - case -1: - pr_info("poll err"); - continue; - case 0: - continue; - case 1: - break; - default: - pr_info("Polling uffd returned %d", r); - return NULL; - } - - if (pollfd[0].revents & POLLERR) { - pr_info("uffd revents has POLLERR"); - return NULL; - } + if (evt.data.u32 == 1) { + char tmp_chr; - if (pollfd[1].revents & POLLIN) { - r = read(pollfd[1].fd, &tmp_chr, 1); + TEST_ASSERT(!(evt.events & (EPOLLERR | EPOLLHUP)), + "Reader thread received EPOLLERR or EPOLLHUP on pipe."); + r = read(reader_args->pipe, &tmp_chr, 1); TEST_ASSERT(r == 1, - "Error reading pipefd in UFFD thread"); + "Error reading pipefd in uffd reader thread"); break; } - if (!(pollfd[0].revents & POLLIN)) - continue; + TEST_ASSERT(!(evt.events & (EPOLLERR | EPOLLHUP)), + "Reader thread received EPOLLERR or EPOLLHUP on uffd."); r = read(uffd, &msg, sizeof(msg)); if (r == -1) { - if (errno == EAGAIN) - continue; - pr_info("Read of uffd got errno %d\n", errno); - return NULL; + TEST_ASSERT(errno == EAGAIN, + "Error reading from UFFD: errno = %d", errno); + continue; } - if (r != sizeof(msg)) { - pr_info("Read on uffd returned unexpected size: %d bytes", r); - return NULL; - } + TEST_ASSERT(r == sizeof(msg), + "Read on uffd returned unexpected number of bytes (%d)", r); if (!(msg.event & UFFD_EVENT_PAGEFAULT)) continue; - if (delay) - usleep(delay); - r = uffd_desc->handler(uffd_desc->uffd_mode, uffd, &msg); - if (r < 0) - return NULL; + if (reader_args->delay) + usleep(reader_args->delay); + r = reader_args->handler(reader_args->uffd_mode, uffd, &msg); + TEST_ASSERT(r >= 0, + "Reader thread handler fn returned negative value %d", r); pages++; } @@ -110,6 
+101,7 @@ static void *uffd_handler_thread_fn(void *arg) struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, void *hva, uint64_t len, + uint64_t num_readers, uffd_handler_t handler) { struct uffd_desc *uffd_desc; @@ -118,14 +110,25 @@ struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, struct uffdio_api uffdio_api; struct uffdio_register uffdio_register; uint64_t expected_ioctls = ((uint64_t) 1) << _UFFDIO_COPY; - int ret; + int ret, i; PER_PAGE_DEBUG("Userfaultfd %s mode, faults resolved with %s\n", is_minor ? "MINOR" : "MISSING", is_minor ? "UFFDIO_CONINUE" : "UFFDIO_COPY"); uffd_desc = malloc(sizeof(struct uffd_desc)); - TEST_ASSERT(uffd_desc, "malloc failed"); + TEST_ASSERT(uffd_desc, "Failed to malloc uffd descriptor"); + + uffd_desc->pipefds = calloc(sizeof(int), num_readers); + TEST_ASSERT(uffd_desc->pipefds, "Failed to alloc pipes"); + + uffd_desc->readers = calloc(sizeof(pthread_t), num_readers); + TEST_ASSERT(uffd_desc->readers, "Failed to alloc reader threads"); + + uffd_desc->reader_args = calloc(sizeof(struct uffd_reader_args), num_readers); + TEST_ASSERT(uffd_desc->reader_args, "Failed to alloc reader_args"); + + uffd_desc->num_readers = num_readers; /* In order to get minor faults, prefault via the alias. 
*/ if (is_minor) @@ -148,18 +151,28 @@ struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, TEST_ASSERT((uffdio_register.ioctls & expected_ioctls) == expected_ioctls, "missing userfaultfd ioctls"); - ret = pipe2(uffd_desc->pipefds, O_CLOEXEC | O_NONBLOCK); - TEST_ASSERT(!ret, "Failed to set up pipefd"); - - uffd_desc->uffd_mode = uffd_mode; uffd_desc->uffd = uffd; - uffd_desc->delay = delay; - uffd_desc->handler = handler; - pthread_create(&uffd_desc->thread, NULL, uffd_handler_thread_fn, - uffd_desc); + for (i = 0; i < uffd_desc->num_readers; ++i) { + int pipes[2]; - PER_VCPU_DEBUG("Created uffd thread for HVA range [%p, %p)\n", - hva, hva + len); + ret = pipe2((int *) &pipes, O_CLOEXEC | O_NONBLOCK); + TEST_ASSERT(!ret, "Failed to set up pipefd %i for uffd_desc %p", + i, uffd_desc); + + uffd_desc->pipefds[i] = pipes[1]; + + uffd_desc->reader_args[i].uffd_mode = uffd_mode; + uffd_desc->reader_args[i].uffd = uffd; + uffd_desc->reader_args[i].delay = delay; + uffd_desc->reader_args[i].handler = handler; + uffd_desc->reader_args[i].pipe = pipes[0]; + + pthread_create(&uffd_desc->readers[i], NULL, uffd_handler_thread_fn, + &uffd_desc->reader_args[i]); + + PER_VCPU_DEBUG("Created uffd thread %i for HVA range [%p, %p)\n", + i, hva, hva + len); + } return uffd_desc; } @@ -167,19 +180,26 @@ struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, void uffd_stop_demand_paging(struct uffd_desc *uffd) { char c = 0; - int ret; + int i; - ret = write(uffd->pipefds[1], &c, 1); - TEST_ASSERT(ret == 1, "Unable to write to pipefd"); + for (i = 0; i < uffd->num_readers; ++i) + TEST_ASSERT(write(uffd->pipefds[i], &c, 1) == 1, + "Unable to write to pipefd %i for uffd_desc %p", i, uffd); - ret = pthread_join(uffd->thread, NULL); - TEST_ASSERT(ret == 0, "Pthread_join failed."); + for (i = 0; i < uffd->num_readers; ++i) + TEST_ASSERT(!pthread_join(uffd->readers[i], NULL), + "Pthread_join failed on reader %i for uffd_desc %p", i, uffd); 
close(uffd->uffd); - close(uffd->pipefds[1]); - close(uffd->pipefds[0]); + for (i = 0; i < uffd->num_readers; ++i) { + close(uffd->pipefds[i]); + close(uffd->reader_args[i].pipe); + } + free(uffd->pipefds); + free(uffd->readers); + free(uffd->reader_args); free(uffd); } diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 74a4c736c9..594b061aef 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -15,14 +15,16 @@ #define NUM_INTERRUPTS 256 #endif -#define DEFAULT_CODE_SELECTOR 0x8 -#define DEFAULT_DATA_SELECTOR 0x10 +#define KERNEL_CS 0x8 +#define KERNEL_DS 0x10 +#define KERNEL_TSS 0x18 #define MAX_NR_CPUID_ENTRIES 100 vm_vaddr_t exception_handlers; bool host_cpu_is_amd; bool host_cpu_is_intel; +bool is_forced_emulation_enabled; static void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent) { @@ -417,7 +419,7 @@ static void kvm_seg_set_unusable(struct kvm_segment *segp) static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp) { - void *gdt = addr_gva2hva(vm, vm->gdt); + void *gdt = addr_gva2hva(vm, vm->arch.gdt); struct desc64 *desc = gdt + (segp->selector >> 3) * 8; desc->limit0 = segp->limit & 0xFFFF; @@ -437,27 +439,10 @@ static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp) desc->base3 = segp->base >> 32; } - -/* - * Set Long Mode Flat Kernel Code Segment - * - * Input Args: - * vm - VM whose GDT is being filled, or NULL to only write segp - * selector - selector value - * - * Output Args: - * segp - Pointer to KVM segment - * - * Return: None - * - * Sets up the KVM segment pointed to by @segp, to be a code segment - * with the selector value given by @selector. 
- */ -static void kvm_seg_set_kernel_code_64bit(struct kvm_vm *vm, uint16_t selector, - struct kvm_segment *segp) +static void kvm_seg_set_kernel_code_64bit(struct kvm_segment *segp) { memset(segp, 0, sizeof(*segp)); - segp->selector = selector; + segp->selector = KERNEL_CS; segp->limit = 0xFFFFFFFFu; segp->s = 0x1; /* kTypeCodeData */ segp->type = 0x08 | 0x01 | 0x02; /* kFlagCode | kFlagCodeAccessed @@ -466,30 +451,12 @@ static void kvm_seg_set_kernel_code_64bit(struct kvm_vm *vm, uint16_t selector, segp->g = true; segp->l = true; segp->present = 1; - if (vm) - kvm_seg_fill_gdt_64bit(vm, segp); } -/* - * Set Long Mode Flat Kernel Data Segment - * - * Input Args: - * vm - VM whose GDT is being filled, or NULL to only write segp - * selector - selector value - * - * Output Args: - * segp - Pointer to KVM segment - * - * Return: None - * - * Sets up the KVM segment pointed to by @segp, to be a data segment - * with the selector value given by @selector. - */ -static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector, - struct kvm_segment *segp) +static void kvm_seg_set_kernel_data_64bit(struct kvm_segment *segp) { memset(segp, 0, sizeof(*segp)); - segp->selector = selector; + segp->selector = KERNEL_DS; segp->limit = 0xFFFFFFFFu; segp->s = 0x1; /* kTypeCodeData */ segp->type = 0x00 | 0x01 | 0x02; /* kFlagData | kFlagDataAccessed @@ -497,8 +464,6 @@ static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector, */ segp->g = true; segp->present = true; - if (vm) - kvm_seg_fill_gdt_64bit(vm, segp); } vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) @@ -516,72 +481,153 @@ vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) return vm_untag_gpa(vm, PTE_GET_PA(*pte)) | (gva & ~HUGEPAGE_MASK(level)); } -static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt) -{ - if (!vm->gdt) - vm->gdt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); - - dt->base = vm->gdt; - dt->limit = getpagesize(); -} - -static 
void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp, - int selector) +static void kvm_seg_set_tss_64bit(vm_vaddr_t base, struct kvm_segment *segp) { - if (!vm->tss) - vm->tss = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); - memset(segp, 0, sizeof(*segp)); - segp->base = vm->tss; + segp->base = base; segp->limit = 0x67; - segp->selector = selector; + segp->selector = KERNEL_TSS; segp->type = 0xb; segp->present = 1; - kvm_seg_fill_gdt_64bit(vm, segp); } -static void vcpu_setup(struct kvm_vm *vm, struct kvm_vcpu *vcpu) +static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) { struct kvm_sregs sregs; + TEST_ASSERT_EQ(vm->mode, VM_MODE_PXXV48_4K); + /* Set mode specific system register values. */ vcpu_sregs_get(vcpu, &sregs); - sregs.idt.limit = 0; + sregs.idt.base = vm->arch.idt; + sregs.idt.limit = NUM_INTERRUPTS * sizeof(struct idt_entry) - 1; + sregs.gdt.base = vm->arch.gdt; + sregs.gdt.limit = getpagesize() - 1; + + sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG; + sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR; + sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); + + kvm_seg_set_unusable(&sregs.ldt); + kvm_seg_set_kernel_code_64bit(&sregs.cs); + kvm_seg_set_kernel_data_64bit(&sregs.ds); + kvm_seg_set_kernel_data_64bit(&sregs.es); + kvm_seg_set_kernel_data_64bit(&sregs.gs); + kvm_seg_set_tss_64bit(vm->arch.tss, &sregs.tr); + + sregs.cr3 = vm->pgd; + vcpu_sregs_set(vcpu, &sregs); +} + +static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr, + int dpl, unsigned short selector) +{ + struct idt_entry *base = + (struct idt_entry *)addr_gva2hva(vm, vm->arch.idt); + struct idt_entry *e = &base[vector]; + + memset(e, 0, sizeof(*e)); + e->offset0 = addr; + e->selector = selector; + e->ist = 0; + e->type = 14; + e->dpl = dpl; + e->p = 1; + e->offset1 = addr >> 16; + e->offset2 = addr >> 32; +} + +static bool kvm_fixup_exception(struct ex_regs *regs) +{ + if (regs->r9 != KVM_EXCEPTION_MAGIC || regs->rip != regs->r10) + return 
false; - kvm_setup_gdt(vm, &sregs.gdt); + if (regs->vector == DE_VECTOR) + return false; - switch (vm->mode) { - case VM_MODE_PXXV48_4K: - sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG; - sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR; - sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); + regs->rip = regs->r11; + regs->r9 = regs->vector; + regs->r10 = regs->error_code; + return true; +} - kvm_seg_set_unusable(&sregs.ldt); - kvm_seg_set_kernel_code_64bit(vm, DEFAULT_CODE_SELECTOR, &sregs.cs); - kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.ds); - kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.es); - kvm_setup_tss_64bit(vm, &sregs.tr, 0x18); - break; +void route_exception(struct ex_regs *regs) +{ + typedef void(*handler)(struct ex_regs *); + handler *handlers = (handler *)exception_handlers; - default: - TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); + if (handlers && handlers[regs->vector]) { + handlers[regs->vector](regs); + return; } - sregs.cr3 = vm->pgd; - vcpu_sregs_set(vcpu, &sregs); + if (kvm_fixup_exception(regs)) + return; + + ucall_assert(UCALL_UNHANDLED, + "Unhandled exception in guest", __FILE__, __LINE__, + "Unhandled exception '0x%lx' at guest RIP '0x%lx'", + regs->vector, regs->rip); +} + +static void vm_init_descriptor_tables(struct kvm_vm *vm) +{ + extern void *idt_handlers; + struct kvm_segment seg; + int i; + + vm->arch.gdt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); + vm->arch.idt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); + vm->handlers = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); + vm->arch.tss = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); + + /* Handlers have the same address in both address spaces.*/ + for (i = 0; i < NUM_INTERRUPTS; i++) + set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0, KERNEL_CS); + + *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; + + kvm_seg_set_kernel_code_64bit(&seg); + kvm_seg_fill_gdt_64bit(vm, &seg); + + 
kvm_seg_set_kernel_data_64bit(&seg); + kvm_seg_fill_gdt_64bit(vm, &seg); + + kvm_seg_set_tss_64bit(vm->arch.tss, &seg); + kvm_seg_fill_gdt_64bit(vm, &seg); +} + +void vm_install_exception_handler(struct kvm_vm *vm, int vector, + void (*handler)(struct ex_regs *)) +{ + vm_vaddr_t *handlers = (vm_vaddr_t *)addr_gva2hva(vm, vm->handlers); + + handlers[vector] = (vm_vaddr_t)handler; +} + +void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + if (get_ucall(vcpu, &uc) == UCALL_UNHANDLED) + REPORT_GUEST_ASSERT(uc); } void kvm_arch_vm_post_create(struct kvm_vm *vm) { vm_create_irqchip(vm); + vm_init_descriptor_tables(vm); + sync_global_to_guest(vm, host_cpu_is_intel); sync_global_to_guest(vm, host_cpu_is_amd); + sync_global_to_guest(vm, is_forced_emulation_enabled); + + if (vm->type == KVM_X86_SEV_VM || vm->type == KVM_X86_SEV_ES_VM) { + struct kvm_sev_init init = { 0 }; - if (vm->subtype == VM_SUBTYPE_SEV) - sev_vm_init(vm); - else if (vm->subtype == VM_SUBTYPE_SEV_ES) - sev_es_vm_init(vm); + vm_sev_ioctl(vm, KVM_SEV_INIT2, &init); + } } void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code) @@ -621,7 +667,7 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) vcpu = __vm_vcpu_add(vm, vcpu_id); vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid()); - vcpu_setup(vm, vcpu); + vcpu_init_sregs(vm, vcpu); /* Setup guest general purpose registers */ vcpu_regs_get(vcpu, &regs); @@ -1081,108 +1127,15 @@ void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits) void kvm_init_vm_address_properties(struct kvm_vm *vm) { - if (vm->subtype == VM_SUBTYPE_SEV || vm->subtype == VM_SUBTYPE_SEV_ES) { + if (vm->type == KVM_X86_SEV_VM || vm->type == KVM_X86_SEV_ES_VM) { + vm->arch.sev_fd = open_sev_dev_path_or_exit(); vm->arch.c_bit = BIT_ULL(this_cpu_property(X86_PROPERTY_SEV_C_BIT)); vm->gpa_tag_mask = vm->arch.c_bit; + } else { + vm->arch.sev_fd = -1; } } -static void set_idt_entry(struct kvm_vm *vm, int vector, 
unsigned long addr, - int dpl, unsigned short selector) -{ - struct idt_entry *base = - (struct idt_entry *)addr_gva2hva(vm, vm->idt); - struct idt_entry *e = &base[vector]; - - memset(e, 0, sizeof(*e)); - e->offset0 = addr; - e->selector = selector; - e->ist = 0; - e->type = 14; - e->dpl = dpl; - e->p = 1; - e->offset1 = addr >> 16; - e->offset2 = addr >> 32; -} - - -static bool kvm_fixup_exception(struct ex_regs *regs) -{ - if (regs->r9 != KVM_EXCEPTION_MAGIC || regs->rip != regs->r10) - return false; - - if (regs->vector == DE_VECTOR) - return false; - - regs->rip = regs->r11; - regs->r9 = regs->vector; - regs->r10 = regs->error_code; - return true; -} - -void route_exception(struct ex_regs *regs) -{ - typedef void(*handler)(struct ex_regs *); - handler *handlers = (handler *)exception_handlers; - - if (handlers && handlers[regs->vector]) { - handlers[regs->vector](regs); - return; - } - - if (kvm_fixup_exception(regs)) - return; - - ucall_assert(UCALL_UNHANDLED, - "Unhandled exception in guest", __FILE__, __LINE__, - "Unhandled exception '0x%lx' at guest RIP '0x%lx'", - regs->vector, regs->rip); -} - -void vm_init_descriptor_tables(struct kvm_vm *vm) -{ - extern void *idt_handlers; - int i; - - vm->idt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); - vm->handlers = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); - /* Handlers have the same address in both address spaces.*/ - for (i = 0; i < NUM_INTERRUPTS; i++) - set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0, - DEFAULT_CODE_SELECTOR); -} - -void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu) -{ - struct kvm_vm *vm = vcpu->vm; - struct kvm_sregs sregs; - - vcpu_sregs_get(vcpu, &sregs); - sregs.idt.base = vm->idt; - sregs.idt.limit = NUM_INTERRUPTS * sizeof(struct idt_entry) - 1; - sregs.gdt.base = vm->gdt; - sregs.gdt.limit = getpagesize() - 1; - kvm_seg_set_kernel_data_64bit(NULL, DEFAULT_DATA_SELECTOR, &sregs.gs); - vcpu_sregs_set(vcpu, &sregs); - *(vm_vaddr_t *)addr_gva2hva(vm, 
(vm_vaddr_t)(&exception_handlers)) = vm->handlers; -} - -void vm_install_exception_handler(struct kvm_vm *vm, int vector, - void (*handler)(struct ex_regs *)) -{ - vm_vaddr_t *handlers = (vm_vaddr_t *)addr_gva2hva(vm, vm->handlers); - - handlers[vector] = (vm_vaddr_t)handler; -} - -void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) -{ - struct ucall uc; - - if (get_ucall(vcpu, &uc) == UCALL_UNHANDLED) - REPORT_GUEST_ASSERT(uc); -} - const struct kvm_cpuid_entry2 *get_cpuid_entry(const struct kvm_cpuid2 *cpuid, uint32_t function, uint32_t index) { @@ -1294,9 +1247,20 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm) { const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */ unsigned long ht_gfn, max_gfn, max_pfn; - uint8_t maxphyaddr; + uint8_t maxphyaddr, guest_maxphyaddr; + + /* + * Use "guest MAXPHYADDR" from KVM if it's available. Guest MAXPHYADDR + * enumerates the max _mappable_ GPA, which can be less than the raw + * MAXPHYADDR, e.g. if MAXPHYADDR=52, KVM is using TDP, and the CPU + * doesn't support 5-level TDP. + */ + guest_maxphyaddr = kvm_cpu_property(X86_PROPERTY_GUEST_MAX_PHY_ADDR); + guest_maxphyaddr = guest_maxphyaddr ?: vm->pa_bits; + TEST_ASSERT(guest_maxphyaddr <= vm->pa_bits, + "Guest MAXPHYADDR should never be greater than raw MAXPHYADDR"); - max_gfn = (1ULL << (vm->pa_bits - vm->page_shift)) - 1; + max_gfn = (1ULL << (guest_maxphyaddr - vm->page_shift)) - 1; /* Avoid reserved HyperTransport region on AMD processors. 
*/ if (!host_cpu_is_amd) @@ -1344,6 +1308,7 @@ void kvm_selftest_arch_init(void) { host_cpu_is_intel = this_cpu_is_intel(); host_cpu_is_amd = this_cpu_is_amd(); + is_forced_emulation_enabled = kvm_is_forced_emulation_enabled(); } bool sys_clocksource_is_based_on_tsc(void) diff --git a/tools/testing/selftests/kvm/lib/x86_64/sev.c b/tools/testing/selftests/kvm/lib/x86_64/sev.c index e248d3364b..e9535ee20b 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/sev.c +++ b/tools/testing/selftests/kvm/lib/x86_64/sev.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0-only -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include @@ -35,6 +34,32 @@ static void encrypt_region(struct kvm_vm *vm, struct userspace_mem_region *regio } } +void sev_vm_init(struct kvm_vm *vm) +{ + if (vm->type == KVM_X86_DEFAULT_VM) { + assert(vm->arch.sev_fd == -1); + vm->arch.sev_fd = open_sev_dev_path_or_exit(); + vm_sev_ioctl(vm, KVM_SEV_INIT, NULL); + } else { + struct kvm_sev_init init = { 0 }; + assert(vm->type == KVM_X86_SEV_VM); + vm_sev_ioctl(vm, KVM_SEV_INIT2, &init); + } +} + +void sev_es_vm_init(struct kvm_vm *vm) +{ + if (vm->type == KVM_X86_DEFAULT_VM) { + assert(vm->arch.sev_fd == -1); + vm->arch.sev_fd = open_sev_dev_path_or_exit(); + vm_sev_ioctl(vm, KVM_SEV_ES_INIT, NULL); + } else { + struct kvm_sev_init init = { 0 }; + assert(vm->type == KVM_X86_SEV_ES_VM); + vm_sev_ioctl(vm, KVM_SEV_INIT2, &init); + } +} + void sev_vm_launch(struct kvm_vm *vm, uint32_t policy) { struct kvm_sev_launch_start launch_start = { @@ -87,28 +112,30 @@ void sev_vm_launch_finish(struct kvm_vm *vm) TEST_ASSERT_EQ(status.state, SEV_GUEST_STATE_RUNNING); } -struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t policy, void *guest_code, +struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t type, void *guest_code, struct kvm_vcpu **cpu) { struct vm_shape shape = { - .type = VM_TYPE_DEFAULT, .mode = VM_MODE_DEFAULT, - .subtype = policy & SEV_POLICY_ES ? 
VM_SUBTYPE_SEV_ES : - VM_SUBTYPE_SEV, + .type = type, }; struct kvm_vm *vm; struct kvm_vcpu *cpus[1]; - uint8_t measurement[512]; vm = __vm_create_with_vcpus(shape, 1, 0, guest_code, cpus); *cpu = cpus[0]; + return vm; +} + +void vm_sev_launch(struct kvm_vm *vm, uint32_t policy, uint8_t *measurement) +{ sev_vm_launch(vm, policy); - /* TODO: Validate the measurement is as expected. */ + if (!measurement) + measurement = alloca(256); + sev_vm_launch_measure(vm, measurement); sev_vm_launch_finish(vm); - - return vm; } diff --git a/tools/testing/selftests/kvm/max_guest_memory_test.c b/tools/testing/selftests/kvm/max_guest_memory_test.c index 1a6da7389b..0b9678858b 100644 --- a/tools/testing/selftests/kvm/max_guest_memory_test.c +++ b/tools/testing/selftests/kvm/max_guest_memory_test.c @@ -1,6 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -#define _GNU_SOURCE - #include #include #include diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index 1563619666..05fcf902e0 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -6,9 +6,6 @@ * Copyright (C) 2018, Red Hat, Inc. * Copyright (C) 2020, Google, Inc. */ - -#define _GNU_SOURCE /* for program_invocation_name */ - #include #include #include diff --git a/tools/testing/selftests/kvm/riscv/arch_timer.c b/tools/testing/selftests/kvm/riscv/arch_timer.c index 0f9cabd99f..2c792228ac 100644 --- a/tools/testing/selftests/kvm/riscv/arch_timer.c +++ b/tools/testing/selftests/kvm/riscv/arch_timer.c @@ -7,13 +7,11 @@ * * Copyright (c) 2024, Intel Corporation. 
*/ - -#define _GNU_SOURCE - #include "arch_timer.h" #include "kvm_util.h" #include "processor.h" #include "timer_test.h" +#include "ucall_common.h" static int timer_irq = IRQ_S_TIMER; @@ -85,7 +83,7 @@ struct kvm_vm *test_vm_create(void) int nr_vcpus = test_args.nr_vcpus; vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus); - __TEST_REQUIRE(__vcpu_has_ext(vcpus[0], RISCV_ISA_EXT_REG(KVM_RISCV_ISA_EXT_SSTC)), + __TEST_REQUIRE(__vcpu_has_isa_ext(vcpus[0], KVM_RISCV_ISA_EXT_SSTC), "SSTC not available, skipping test\n"); vm_init_vector_tables(vm); diff --git a/tools/testing/selftests/kvm/riscv/ebreak_test.c b/tools/testing/selftests/kvm/riscv/ebreak_test.c new file mode 100644 index 0000000000..0e07128549 --- /dev/null +++ b/tools/testing/selftests/kvm/riscv/ebreak_test.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * RISC-V KVM ebreak test. + * + * Copyright 2024 Beijing ESWIN Computing Technology Co., Ltd. + * + */ +#include "kvm_util.h" +#include "ucall_common.h" + +#define LABEL_ADDRESS(v) ((uint64_t)&(v)) + +extern unsigned char sw_bp_1, sw_bp_2; +static uint64_t sw_bp_addr; + +static void guest_code(void) +{ + asm volatile( + ".option push\n" + ".option norvc\n" + "sw_bp_1: ebreak\n" + "sw_bp_2: ebreak\n" + ".option pop\n" + ); + GUEST_ASSERT_EQ(READ_ONCE(sw_bp_addr), LABEL_ADDRESS(sw_bp_2)); + + GUEST_DONE(); +} + +static void guest_breakpoint_handler(struct ex_regs *regs) +{ + WRITE_ONCE(sw_bp_addr, regs->epc); + regs->epc += 4; +} + +int main(void) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + uint64_t pc; + struct kvm_guest_debug debug = { + .control = KVM_GUESTDBG_ENABLE, + }; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_SET_GUEST_DEBUG)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + vm_init_vector_tables(vm); + vcpu_init_vector_tables(vcpu); + vm_install_exception_handler(vm, EXC_BREAKPOINT, + guest_breakpoint_handler); + + /* + * Enable the guest debug. + * ebreak should exit to the VMM with KVM_EXIT_DEBUG reason. 
+ */ + vcpu_guest_debug_set(vcpu, &debug); + vcpu_run(vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_DEBUG); + + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.pc), &pc); + TEST_ASSERT_EQ(pc, LABEL_ADDRESS(sw_bp_1)); + + /* skip sw_bp_1 */ + vcpu_set_reg(vcpu, RISCV_CORE_REG(regs.pc), pc + 4); + + /* + * Disable all debug controls. + * Guest should handle the ebreak without exiting to the VMM. + */ + memset(&debug, 0, sizeof(debug)); + vcpu_guest_debug_set(vcpu, &debug); + + vcpu_run(vcpu); + + TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE); + + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/riscv/get-reg-list.c b/tools/testing/selftests/kvm/riscv/get-reg-list.c index b882b7b9b7..222198dd6d 100644 --- a/tools/testing/selftests/kvm/riscv/get-reg-list.c +++ b/tools/testing/selftests/kvm/riscv/get-reg-list.c @@ -43,6 +43,7 @@ bool filter_reg(__u64 reg) case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_V: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SMSTATEEN: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SSAIA: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SSCOFPMF: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SSTC: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVINVAL: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVNAPOT: @@ -408,6 +409,7 @@ static const char *isa_ext_single_id_to_str(__u64 reg_off) KVM_ISA_EXT_ARR(V), KVM_ISA_EXT_ARR(SMSTATEEN), KVM_ISA_EXT_ARR(SSAIA), + KVM_ISA_EXT_ARR(SSCOFPMF), KVM_ISA_EXT_ARR(SSTC), KVM_ISA_EXT_ARR(SVINVAL), KVM_ISA_EXT_ARR(SVNAPOT), @@ -931,6 +933,7 @@ KVM_ISA_EXT_SUBLIST_CONFIG(fp_f, FP_F); KVM_ISA_EXT_SUBLIST_CONFIG(fp_d, FP_D); KVM_ISA_EXT_SIMPLE_CONFIG(h, H); KVM_ISA_EXT_SUBLIST_CONFIG(smstateen, SMSTATEEN); +KVM_ISA_EXT_SIMPLE_CONFIG(sscofpmf, SSCOFPMF); KVM_ISA_EXT_SIMPLE_CONFIG(sstc, SSTC); 
KVM_ISA_EXT_SIMPLE_CONFIG(svinval, SVINVAL); KVM_ISA_EXT_SIMPLE_CONFIG(svnapot, SVNAPOT); @@ -986,6 +989,7 @@ struct vcpu_reg_list *vcpu_configs[] = { &config_fp_d, &config_h, &config_smstateen, + &config_sscofpmf, &config_sstc, &config_svinval, &config_svnapot, diff --git a/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c b/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c new file mode 100644 index 0000000000..f299cbfd23 --- /dev/null +++ b/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c @@ -0,0 +1,682 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * sbi_pmu_test.c - Tests the riscv64 SBI PMU functionality. + * + * Copyright (c) 2024, Rivos Inc. + */ + +#include +#include +#include +#include +#include +#include "kvm_util.h" +#include "test_util.h" +#include "processor.h" +#include "sbi.h" +#include "arch_timer.h" +#include "ucall_common.h" + +/* Maximum counters(firmware + hardware) */ +#define RISCV_MAX_PMU_COUNTERS 64 +union sbi_pmu_ctr_info ctrinfo_arr[RISCV_MAX_PMU_COUNTERS]; + +/* Snapshot shared memory data */ +#define PMU_SNAPSHOT_GPA_BASE BIT(30) +static void *snapshot_gva; +static vm_paddr_t snapshot_gpa; + +static int vcpu_shared_irq_count; +static int counter_in_use; + +/* Cache the available counters in a bitmask */ +static unsigned long counter_mask_available; + +static bool illegal_handler_invoked; + +#define SBI_PMU_TEST_BASIC BIT(0) +#define SBI_PMU_TEST_EVENTS BIT(1) +#define SBI_PMU_TEST_SNAPSHOT BIT(2) +#define SBI_PMU_TEST_OVERFLOW BIT(3) + +static int disabled_tests; + +unsigned long pmu_csr_read_num(int csr_num) +{ +#define switchcase_csr_read(__csr_num, __val) {\ + case __csr_num: \ + __val = csr_read(__csr_num); \ + break; } +#define switchcase_csr_read_2(__csr_num, __val) {\ + switchcase_csr_read(__csr_num + 0, __val) \ + switchcase_csr_read(__csr_num + 1, __val)} +#define switchcase_csr_read_4(__csr_num, __val) {\ + switchcase_csr_read_2(__csr_num + 0, __val) \ + switchcase_csr_read_2(__csr_num + 2, __val)} +#define 
switchcase_csr_read_8(__csr_num, __val) {\ + switchcase_csr_read_4(__csr_num + 0, __val) \ + switchcase_csr_read_4(__csr_num + 4, __val)} +#define switchcase_csr_read_16(__csr_num, __val) {\ + switchcase_csr_read_8(__csr_num + 0, __val) \ + switchcase_csr_read_8(__csr_num + 8, __val)} +#define switchcase_csr_read_32(__csr_num, __val) {\ + switchcase_csr_read_16(__csr_num + 0, __val) \ + switchcase_csr_read_16(__csr_num + 16, __val)} + + unsigned long ret = 0; + + switch (csr_num) { + switchcase_csr_read_32(CSR_CYCLE, ret) + switchcase_csr_read_32(CSR_CYCLEH, ret) + default : + break; + } + + return ret; +#undef switchcase_csr_read_32 +#undef switchcase_csr_read_16 +#undef switchcase_csr_read_8 +#undef switchcase_csr_read_4 +#undef switchcase_csr_read_2 +#undef switchcase_csr_read +} + +static inline void dummy_func_loop(uint64_t iter) +{ + int i = 0; + + while (i < iter) { + asm volatile("nop"); + i++; + } +} + +static void start_counter(unsigned long counter, unsigned long start_flags, + unsigned long ival) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_START, counter, 1, start_flags, + ival, 0, 0); + __GUEST_ASSERT(ret.error == 0, "Unable to start counter %ld\n", counter); +} + +/* This should be invoked only for reset counter use case */ +static void stop_reset_counter(unsigned long counter, unsigned long stop_flags) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, counter, 1, + stop_flags | SBI_PMU_STOP_FLAG_RESET, 0, 0, 0); + __GUEST_ASSERT(ret.error == SBI_ERR_ALREADY_STOPPED, + "Unable to stop counter %ld\n", counter); +} + +static void stop_counter(unsigned long counter, unsigned long stop_flags) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, counter, 1, stop_flags, + 0, 0, 0); + __GUEST_ASSERT(ret.error == 0, "Unable to stop counter %ld error %ld\n", + counter, ret.error); +} + +static void guest_illegal_exception_handler(struct ex_regs *regs) +{ + 
__GUEST_ASSERT(regs->cause == EXC_INST_ILLEGAL, + "Unexpected exception handler %lx\n", regs->cause); + + illegal_handler_invoked = true; + /* skip the trapping instruction */ + regs->epc += 4; +} + +static void guest_irq_handler(struct ex_regs *regs) +{ + unsigned int irq_num = regs->cause & ~CAUSE_IRQ_FLAG; + struct riscv_pmu_snapshot_data *snapshot_data = snapshot_gva; + unsigned long overflown_mask; + unsigned long counter_val = 0; + + /* Validate that we are in the correct irq handler */ + GUEST_ASSERT_EQ(irq_num, IRQ_PMU_OVF); + + /* Stop all counters first to avoid further interrupts */ + stop_counter(counter_in_use, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + + csr_clear(CSR_SIP, BIT(IRQ_PMU_OVF)); + + overflown_mask = READ_ONCE(snapshot_data->ctr_overflow_mask); + GUEST_ASSERT(overflown_mask & 0x01); + + WRITE_ONCE(vcpu_shared_irq_count, vcpu_shared_irq_count+1); + + counter_val = READ_ONCE(snapshot_data->ctr_values[0]); + /* Now start the counter to mimick the real driver behavior */ + start_counter(counter_in_use, SBI_PMU_START_FLAG_SET_INIT_VALUE, counter_val); +} + +static unsigned long get_counter_index(unsigned long cbase, unsigned long cmask, + unsigned long cflags, + unsigned long event) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_CFG_MATCH, cbase, cmask, + cflags, event, 0, 0); + __GUEST_ASSERT(ret.error == 0, "config matching failed %ld\n", ret.error); + GUEST_ASSERT(ret.value < RISCV_MAX_PMU_COUNTERS); + GUEST_ASSERT(BIT(ret.value) & counter_mask_available); + + return ret.value; +} + +static unsigned long get_num_counters(void) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_NUM_COUNTERS, 0, 0, 0, 0, 0, 0); + + __GUEST_ASSERT(ret.error == 0, "Unable to retrieve number of counters from SBI PMU"); + __GUEST_ASSERT(ret.value < RISCV_MAX_PMU_COUNTERS, + "Invalid number of counters %ld\n", ret.value); + + return ret.value; +} + +static void update_counter_info(int num_counters) +{ + int i = 0; + struct 
sbiret ret; + + for (i = 0; i < num_counters; i++) { + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_GET_INFO, i, 0, 0, 0, 0, 0); + + /* There can be gaps in logical counter indicies*/ + if (ret.error) + continue; + GUEST_ASSERT_NE(ret.value, 0); + + ctrinfo_arr[i].value = ret.value; + counter_mask_available |= BIT(i); + } + + GUEST_ASSERT(counter_mask_available > 0); +} + +static unsigned long read_fw_counter(int idx, union sbi_pmu_ctr_info ctrinfo) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_FW_READ, idx, 0, 0, 0, 0, 0); + GUEST_ASSERT(ret.error == 0); + return ret.value; +} + +static unsigned long read_counter(int idx, union sbi_pmu_ctr_info ctrinfo) +{ + unsigned long counter_val = 0; + + __GUEST_ASSERT(ctrinfo.type < 2, "Invalid counter type %d", ctrinfo.type); + + if (ctrinfo.type == SBI_PMU_CTR_TYPE_HW) + counter_val = pmu_csr_read_num(ctrinfo.csr); + else if (ctrinfo.type == SBI_PMU_CTR_TYPE_FW) + counter_val = read_fw_counter(idx, ctrinfo); + + return counter_val; +} + +static inline void verify_sbi_requirement_assert(void) +{ + long out_val = 0; + bool probe; + + probe = guest_sbi_probe_extension(SBI_EXT_PMU, &out_val); + GUEST_ASSERT(probe && out_val == 1); + + if (get_host_sbi_spec_version() < sbi_mk_version(2, 0)) + __GUEST_ASSERT(0, "SBI implementation version doesn't support PMU Snapshot"); +} + +static void snapshot_set_shmem(vm_paddr_t gpa, unsigned long flags) +{ + unsigned long lo = (unsigned long)gpa; +#if __riscv_xlen == 32 + unsigned long hi = (unsigned long)(gpa >> 32); +#else + unsigned long hi = gpa == -1 ? 
-1 : 0; +#endif + struct sbiret ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_SNAPSHOT_SET_SHMEM, + lo, hi, flags, 0, 0, 0); + + GUEST_ASSERT(ret.value == 0 && ret.error == 0); +} + +static void test_pmu_event(unsigned long event) +{ + unsigned long counter; + unsigned long counter_value_pre, counter_value_post; + unsigned long counter_init_value = 100; + + counter = get_counter_index(0, counter_mask_available, 0, event); + counter_value_pre = read_counter(counter, ctrinfo_arr[counter]); + + /* Do not set the initial value */ + start_counter(counter, 0, 0); + dummy_func_loop(10000); + stop_counter(counter, 0); + + counter_value_post = read_counter(counter, ctrinfo_arr[counter]); + __GUEST_ASSERT(counter_value_post > counter_value_pre, + "Event update verification failed: post [%lx] pre [%lx]\n", + counter_value_post, counter_value_pre); + + /* + * We can't just update the counter without starting it. + * Do start/stop twice to simulate that by first initializing to a very + * high value and a low value after that. 
+ */ + start_counter(counter, SBI_PMU_START_FLAG_SET_INIT_VALUE, ULONG_MAX/2); + stop_counter(counter, 0); + counter_value_pre = read_counter(counter, ctrinfo_arr[counter]); + + start_counter(counter, SBI_PMU_START_FLAG_SET_INIT_VALUE, counter_init_value); + stop_counter(counter, 0); + counter_value_post = read_counter(counter, ctrinfo_arr[counter]); + __GUEST_ASSERT(counter_value_pre > counter_value_post, + "Counter reinitialization verification failed : post [%lx] pre [%lx]\n", + counter_value_post, counter_value_pre); + + /* Now set the initial value and compare */ + start_counter(counter, SBI_PMU_START_FLAG_SET_INIT_VALUE, counter_init_value); + dummy_func_loop(10000); + stop_counter(counter, 0); + + counter_value_post = read_counter(counter, ctrinfo_arr[counter]); + __GUEST_ASSERT(counter_value_post > counter_init_value, + "Event update verification failed: post [%lx] pre [%lx]\n", + counter_value_post, counter_init_value); + + stop_reset_counter(counter, 0); +} + +static void test_pmu_event_snapshot(unsigned long event) +{ + unsigned long counter; + unsigned long counter_value_pre, counter_value_post; + unsigned long counter_init_value = 100; + struct riscv_pmu_snapshot_data *snapshot_data = snapshot_gva; + + counter = get_counter_index(0, counter_mask_available, 0, event); + counter_value_pre = read_counter(counter, ctrinfo_arr[counter]); + + /* Do not set the initial value */ + start_counter(counter, 0, 0); + dummy_func_loop(10000); + stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + + /* The counter value is updated w.r.t relative index of cbase */ + counter_value_post = READ_ONCE(snapshot_data->ctr_values[0]); + __GUEST_ASSERT(counter_value_post > counter_value_pre, + "Event update verification failed: post [%lx] pre [%lx]\n", + counter_value_post, counter_value_pre); + + /* + * We can't just update the counter without starting it. + * Do start/stop twice to simulate that by first initializing to a very + * high value and a low value after that. 
+ */ + WRITE_ONCE(snapshot_data->ctr_values[0], ULONG_MAX/2); + start_counter(counter, SBI_PMU_START_FLAG_INIT_SNAPSHOT, 0); + stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + counter_value_pre = READ_ONCE(snapshot_data->ctr_values[0]); + + WRITE_ONCE(snapshot_data->ctr_values[0], counter_init_value); + start_counter(counter, SBI_PMU_START_FLAG_INIT_SNAPSHOT, 0); + stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + counter_value_post = READ_ONCE(snapshot_data->ctr_values[0]); + __GUEST_ASSERT(counter_value_pre > counter_value_post, + "Counter reinitialization verification failed : post [%lx] pre [%lx]\n", + counter_value_post, counter_value_pre); + + /* Now set the initial value and compare */ + WRITE_ONCE(snapshot_data->ctr_values[0], counter_init_value); + start_counter(counter, SBI_PMU_START_FLAG_INIT_SNAPSHOT, 0); + dummy_func_loop(10000); + stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + + counter_value_post = READ_ONCE(snapshot_data->ctr_values[0]); + __GUEST_ASSERT(counter_value_post > counter_init_value, + "Event update verification failed: post [%lx] pre [%lx]\n", + counter_value_post, counter_init_value); + + stop_reset_counter(counter, 0); +} + +static void test_pmu_event_overflow(unsigned long event) +{ + unsigned long counter; + unsigned long counter_value_post; + unsigned long counter_init_value = ULONG_MAX - 10000; + struct riscv_pmu_snapshot_data *snapshot_data = snapshot_gva; + + counter = get_counter_index(0, counter_mask_available, 0, event); + counter_in_use = counter; + + /* The counter value is updated w.r.t relative index of cbase passed to start/stop */ + WRITE_ONCE(snapshot_data->ctr_values[0], counter_init_value); + start_counter(counter, SBI_PMU_START_FLAG_INIT_SNAPSHOT, 0); + dummy_func_loop(10000); + udelay(msecs_to_usecs(2000)); + /* irq handler should have stopped the counter */ + stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + + counter_value_post = READ_ONCE(snapshot_data->ctr_values[0]); + /* The 
counter value after stopping should be less the init value due to overflow */ + __GUEST_ASSERT(counter_value_post < counter_init_value, + "counter_value_post %lx counter_init_value %lx for counter\n", + counter_value_post, counter_init_value); + + stop_reset_counter(counter, 0); +} + +static void test_invalid_event(void) +{ + struct sbiret ret; + unsigned long event = 0x1234; /* A random event */ + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_CFG_MATCH, 0, + counter_mask_available, 0, event, 0, 0); + GUEST_ASSERT_EQ(ret.error, SBI_ERR_NOT_SUPPORTED); +} + +static void test_pmu_events(void) +{ + int num_counters = 0; + + /* Get the counter details */ + num_counters = get_num_counters(); + update_counter_info(num_counters); + + /* Sanity testing for any random invalid event */ + test_invalid_event(); + + /* Only these two events are guaranteed to be present */ + test_pmu_event(SBI_PMU_HW_CPU_CYCLES); + test_pmu_event(SBI_PMU_HW_INSTRUCTIONS); + + GUEST_DONE(); +} + +static void test_pmu_basic_sanity(void) +{ + long out_val = 0; + bool probe; + struct sbiret ret; + int num_counters = 0, i; + union sbi_pmu_ctr_info ctrinfo; + + probe = guest_sbi_probe_extension(SBI_EXT_PMU, &out_val); + GUEST_ASSERT(probe && out_val == 1); + + num_counters = get_num_counters(); + + for (i = 0; i < num_counters; i++) { + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_GET_INFO, i, + 0, 0, 0, 0, 0); + + /* There can be gaps in logical counter indicies*/ + if (ret.error) + continue; + GUEST_ASSERT_NE(ret.value, 0); + + ctrinfo.value = ret.value; + + /** + * Accessibility check of hardware and read capability of firmware counters. + * The spec doesn't mandate any initial value. No need to check any value. 
+ */ + if (ctrinfo.type == SBI_PMU_CTR_TYPE_HW) { + pmu_csr_read_num(ctrinfo.csr); + GUEST_ASSERT(illegal_handler_invoked); + } else if (ctrinfo.type == SBI_PMU_CTR_TYPE_FW) { + read_fw_counter(i, ctrinfo); + } + } + + GUEST_DONE(); +} + +static void test_pmu_events_snaphost(void) +{ + int num_counters = 0; + struct riscv_pmu_snapshot_data *snapshot_data = snapshot_gva; + int i; + + /* Verify presence of SBI PMU and minimum requrired SBI version */ + verify_sbi_requirement_assert(); + + snapshot_set_shmem(snapshot_gpa, 0); + + /* Get the counter details */ + num_counters = get_num_counters(); + update_counter_info(num_counters); + + /* Validate shared memory access */ + GUEST_ASSERT_EQ(READ_ONCE(snapshot_data->ctr_overflow_mask), 0); + for (i = 0; i < num_counters; i++) { + if (counter_mask_available & (BIT(i))) + GUEST_ASSERT_EQ(READ_ONCE(snapshot_data->ctr_values[i]), 0); + } + /* Only these two events are guranteed to be present */ + test_pmu_event_snapshot(SBI_PMU_HW_CPU_CYCLES); + test_pmu_event_snapshot(SBI_PMU_HW_INSTRUCTIONS); + + GUEST_DONE(); +} + +static void test_pmu_events_overflow(void) +{ + int num_counters = 0; + + /* Verify presence of SBI PMU and minimum requrired SBI version */ + verify_sbi_requirement_assert(); + + snapshot_set_shmem(snapshot_gpa, 0); + csr_set(CSR_IE, BIT(IRQ_PMU_OVF)); + local_irq_enable(); + + /* Get the counter details */ + num_counters = get_num_counters(); + update_counter_info(num_counters); + + /* + * Qemu supports overflow for cycle/instruction. + * This test may fail on any platform that do not support overflow for these two events. 
+ */ + test_pmu_event_overflow(SBI_PMU_HW_CPU_CYCLES); + GUEST_ASSERT_EQ(vcpu_shared_irq_count, 1); + + test_pmu_event_overflow(SBI_PMU_HW_INSTRUCTIONS); + GUEST_ASSERT_EQ(vcpu_shared_irq_count, 2); + + GUEST_DONE(); +} + +static void run_vcpu(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + vcpu_run(vcpu); + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + case UCALL_SYNC: + break; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + break; + } +} + +void test_vm_destroy(struct kvm_vm *vm) +{ + memset(ctrinfo_arr, 0, sizeof(union sbi_pmu_ctr_info) * RISCV_MAX_PMU_COUNTERS); + counter_mask_available = 0; + kvm_vm_free(vm); +} + +static void test_vm_basic_test(void *guest_code) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + __TEST_REQUIRE(__vcpu_has_sbi_ext(vcpu, KVM_RISCV_SBI_EXT_PMU), + "SBI PMU not available, skipping test"); + vm_init_vector_tables(vm); + /* Illegal instruction handler is required to verify read access without configuration */ + vm_install_exception_handler(vm, EXC_INST_ILLEGAL, guest_illegal_exception_handler); + + vcpu_init_vector_tables(vcpu); + run_vcpu(vcpu); + + test_vm_destroy(vm); +} + +static void test_vm_events_test(void *guest_code) +{ + struct kvm_vm *vm = NULL; + struct kvm_vcpu *vcpu = NULL; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + __TEST_REQUIRE(__vcpu_has_sbi_ext(vcpu, KVM_RISCV_SBI_EXT_PMU), + "SBI PMU not available, skipping test"); + run_vcpu(vcpu); + + test_vm_destroy(vm); +} + +static void test_vm_setup_snapshot_mem(struct kvm_vm *vm, struct kvm_vcpu *vcpu) +{ + /* PMU Snapshot requires single page only */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, PMU_SNAPSHOT_GPA_BASE, 1, 1, 0); + /* PMU_SNAPSHOT_GPA_BASE is identity mapped */ + virt_map(vm, PMU_SNAPSHOT_GPA_BASE, PMU_SNAPSHOT_GPA_BASE, 1); + + snapshot_gva = (void *)(PMU_SNAPSHOT_GPA_BASE); + snapshot_gpa = 
addr_gva2gpa(vcpu->vm, (vm_vaddr_t)snapshot_gva); + sync_global_to_guest(vcpu->vm, snapshot_gva); + sync_global_to_guest(vcpu->vm, snapshot_gpa); +} + +static void test_vm_events_snapshot_test(void *guest_code) +{ + struct kvm_vm *vm = NULL; + struct kvm_vcpu *vcpu; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + __TEST_REQUIRE(__vcpu_has_sbi_ext(vcpu, KVM_RISCV_SBI_EXT_PMU), + "SBI PMU not available, skipping test"); + + test_vm_setup_snapshot_mem(vm, vcpu); + + run_vcpu(vcpu); + + test_vm_destroy(vm); +} + +static void test_vm_events_overflow(void *guest_code) +{ + struct kvm_vm *vm = NULL; + struct kvm_vcpu *vcpu; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + __TEST_REQUIRE(__vcpu_has_sbi_ext(vcpu, KVM_RISCV_SBI_EXT_PMU), + "SBI PMU not available, skipping test"); + + __TEST_REQUIRE(__vcpu_has_isa_ext(vcpu, KVM_RISCV_ISA_EXT_SSCOFPMF), + "Sscofpmf is not available, skipping overflow test"); + + test_vm_setup_snapshot_mem(vm, vcpu); + vm_init_vector_tables(vm); + vm_install_interrupt_handler(vm, guest_irq_handler); + + vcpu_init_vector_tables(vcpu); + /* Initialize guest timer frequency. */ + vcpu_get_reg(vcpu, RISCV_TIMER_REG(frequency), &timer_freq); + sync_global_to_guest(vm, timer_freq); + + run_vcpu(vcpu); + + test_vm_destroy(vm); +} + +static void test_print_help(char *name) +{ + pr_info("Usage: %s [-h] [-d ]\n", name); + pr_info("\t-d: Test to disable. 
Available tests are 'basic', 'events', 'snapshot', 'overflow'\n"); + pr_info("\t-h: print this help screen\n"); +} + +static bool parse_args(int argc, char *argv[]) +{ + int opt; + + while ((opt = getopt(argc, argv, "hd:")) != -1) { + switch (opt) { + case 'd': + if (!strncmp("basic", optarg, 5)) + disabled_tests |= SBI_PMU_TEST_BASIC; + else if (!strncmp("events", optarg, 6)) + disabled_tests |= SBI_PMU_TEST_EVENTS; + else if (!strncmp("snapshot", optarg, 8)) + disabled_tests |= SBI_PMU_TEST_SNAPSHOT; + else if (!strncmp("overflow", optarg, 8)) + disabled_tests |= SBI_PMU_TEST_OVERFLOW; + else + goto done; + break; + case 'h': + default: + goto done; + } + } + + return true; +done: + test_print_help(argv[0]); + return false; +} + +int main(int argc, char *argv[]) +{ + if (!parse_args(argc, argv)) + exit(KSFT_SKIP); + + if (!(disabled_tests & SBI_PMU_TEST_BASIC)) { + test_vm_basic_test(test_pmu_basic_sanity); + pr_info("SBI PMU basic test : PASS\n"); + } + + if (!(disabled_tests & SBI_PMU_TEST_EVENTS)) { + test_vm_events_test(test_pmu_events); + pr_info("SBI PMU event verification test : PASS\n"); + } + + if (!(disabled_tests & SBI_PMU_TEST_SNAPSHOT)) { + test_vm_events_snapshot_test(test_pmu_events_snaphost); + pr_info("SBI PMU event verification with snapshot test : PASS\n"); + } + + if (!(disabled_tests & SBI_PMU_TEST_OVERFLOW)) { + test_vm_events_overflow(test_pmu_events_overflow); + pr_info("SBI PMU event verification with overflow test : PASS\n"); + } + + return 0; +} diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c index 28f97fb520..e5898678bf 100644 --- a/tools/testing/selftests/kvm/rseq_test.c +++ b/tools/testing/selftests/kvm/rseq_test.c @@ -1,5 +1,13 @@ // SPDX-License-Identifier: GPL-2.0-only -#define _GNU_SOURCE /* for program_invocation_short_name */ + +/* + * Include rseq.c without _GNU_SOURCE defined, before including any headers, so + * that rseq.c is compiled with its configuration, not KVM selftests' 
config. + */ +#undef _GNU_SOURCE +#include "../rseq/rseq.c" +#define _GNU_SOURCE + #include #include #include @@ -19,8 +27,7 @@ #include "kvm_util.h" #include "processor.h" #include "test_util.h" - -#include "../rseq/rseq.c" +#include "ucall_common.h" /* * Any bug related to task migration is likely to be timing-dependent; perform @@ -186,12 +193,35 @@ static void calc_min_max_cpu(void) "Only one usable CPU, task migration not possible"); } +static void help(const char *name) +{ + puts(""); + printf("usage: %s [-h] [-u]\n", name); + printf(" -u: Don't sanity check the number of successful KVM_RUNs\n"); + puts(""); + exit(0); +} + int main(int argc, char *argv[]) { + bool skip_sanity_check = false; int r, i, snapshot; struct kvm_vm *vm; struct kvm_vcpu *vcpu; u32 cpu, rseq_cpu; + int opt; + + while ((opt = getopt(argc, argv, "hu")) != -1) { + switch (opt) { + case 'u': + skip_sanity_check = true; + break; + case 'h': + default: + help(argv[0]); + break; + } + } r = sched_getaffinity(0, sizeof(possible_mask), &possible_mask); TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", errno, @@ -254,9 +284,17 @@ int main(int argc, char *argv[]) * getcpu() to stabilize. A 2:1 migration:KVM_RUN ratio is a fairly * conservative ratio on x86-64, which can do _more_ KVM_RUNs than * migrations given the 1us+ delay in the migration task. + * + * Another reason why it may have small migration:KVM_RUN ratio is that, + * on systems with large low power mode wakeup latency, it may happen + * quite often that the scheduler is not able to wake up the target CPU + * before the vCPU thread is scheduled to another CPU. */ - TEST_ASSERT(i > (NR_TASK_MIGRATIONS / 2), - "Only performed %d KVM_RUNs, task stalled too much?", i); + TEST_ASSERT(skip_sanity_check || i > (NR_TASK_MIGRATIONS / 2), + "Only performed %d KVM_RUNs, task stalled too much?\n\n" + " Try disabling deep sleep states to reduce CPU wakeup latency,\n" + " e.g. 
via cpuidle.off=1 or setting /dev/cpu_dma_latency to '0',\n" + " or run with -u to disable this sanity check.", i); pthread_join(migration_thread, NULL); diff --git a/tools/testing/selftests/kvm/s390x/cmma_test.c b/tools/testing/selftests/kvm/s390x/cmma_test.c index 626a2b8a20..b390338447 100644 --- a/tools/testing/selftests/kvm/s390x/cmma_test.c +++ b/tools/testing/selftests/kvm/s390x/cmma_test.c @@ -7,8 +7,6 @@ * Authors: * Nico Boehr */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include @@ -18,6 +16,7 @@ #include "test_util.h" #include "kvm_util.h" #include "kselftest.h" +#include "ucall_common.h" #define MAIN_PAGE_COUNT 512 diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c index 48cb910e66..f2df7416be 100644 --- a/tools/testing/selftests/kvm/s390x/memop.c +++ b/tools/testing/selftests/kvm/s390x/memop.c @@ -15,6 +15,7 @@ #include "test_util.h" #include "kvm_util.h" #include "kselftest.h" +#include "ucall_common.h" enum mop_target { LOGICAL, diff --git a/tools/testing/selftests/kvm/s390x/shared_zeropage_test.c b/tools/testing/selftests/kvm/s390x/shared_zeropage_test.c new file mode 100644 index 0000000000..bba0d9a6dc --- /dev/null +++ b/tools/testing/selftests/kvm/s390x/shared_zeropage_test.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Test shared zeropage handling (with/without storage keys) + * + * Copyright (C) 2024, Red Hat, Inc. + */ +#include + +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "kselftest.h" +#include "ucall_common.h" + +static void set_storage_key(void *addr, uint8_t skey) +{ + asm volatile("sske %0,%1" : : "d" (skey), "a" (addr)); +} + +static void guest_code(void) +{ + /* Issue some storage key instruction. */ + set_storage_key((void *)0, 0x98); + GUEST_DONE(); +} + +/* + * Returns 1 if the shared zeropage is mapped, 0 if something else is mapped. + * Returns < 0 on error or if nothing is mapped. 
+ */ +static int maps_shared_zeropage(int pagemap_fd, void *addr) +{ + struct page_region region; + struct pm_scan_arg arg = { + .start = (uintptr_t)addr, + .end = (uintptr_t)addr + 4096, + .vec = (uintptr_t)®ion, + .vec_len = 1, + .size = sizeof(struct pm_scan_arg), + .category_mask = PAGE_IS_PFNZERO, + .category_anyof_mask = PAGE_IS_PRESENT, + .return_mask = PAGE_IS_PFNZERO, + }; + return ioctl(pagemap_fd, PAGEMAP_SCAN, &arg); +} + +int main(int argc, char *argv[]) +{ + char *mem, *page0, *page1, *page2, tmp; + const size_t pagesize = getpagesize(); + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + int pagemap_fd; + + ksft_print_header(); + ksft_set_plan(3); + + /* + * We'll use memory that is not mapped into the VM for simplicity. + * Shared zeropages are enabled/disabled per-process. + */ + mem = mmap(0, 3 * pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0); + TEST_ASSERT(mem != MAP_FAILED, "mmap() failed"); + + /* Disable THP. Ignore errors on older kernels. */ + madvise(mem, 3 * pagesize, MADV_NOHUGEPAGE); + + page0 = mem; + page1 = page0 + pagesize; + page2 = page1 + pagesize; + + /* Can we even detect shared zeropages? */ + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + TEST_REQUIRE(pagemap_fd >= 0); + + tmp = *page0; + asm volatile("" : "+r" (tmp)); + TEST_REQUIRE(maps_shared_zeropage(pagemap_fd, page0) == 1); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + /* Verify that we get the shared zeropage after VM creation. */ + tmp = *page1; + asm volatile("" : "+r" (tmp)); + ksft_test_result(maps_shared_zeropage(pagemap_fd, page1) == 1, + "Shared zeropages should be enabled\n"); + + /* + * Let our VM execute a storage key instruction that should + * unshare all shared zeropages. + */ + vcpu_run(vcpu); + get_ucall(vcpu, &uc); + TEST_ASSERT_EQ(uc.cmd, UCALL_DONE); + + /* Verify that we don't have a shared zeropage anymore. 
*/ + ksft_test_result(!maps_shared_zeropage(pagemap_fd, page1), + "Shared zeropage should be gone\n"); + + /* Verify that we don't get any new shared zeropages. */ + tmp = *page2; + asm volatile("" : "+r" (tmp)); + ksft_test_result(!maps_shared_zeropage(pagemap_fd, page2), + "Shared zeropages should be disabled\n"); + + kvm_vm_free(vm); + + ksft_finished(); +} diff --git a/tools/testing/selftests/kvm/s390x/sync_regs_test.c b/tools/testing/selftests/kvm/s390x/sync_regs_test.c index 43fb25ddc3..53def355cc 100644 --- a/tools/testing/selftests/kvm/s390x/sync_regs_test.c +++ b/tools/testing/selftests/kvm/s390x/sync_regs_test.c @@ -10,8 +10,6 @@ * * Test expected behavior of the KVM_CAP_SYNC_REGS functionality. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/s390x/tprot.c b/tools/testing/selftests/kvm/s390x/tprot.c index c73f948c9b..7a742a673b 100644 --- a/tools/testing/selftests/kvm/s390x/tprot.c +++ b/tools/testing/selftests/kvm/s390x/tprot.c @@ -8,6 +8,7 @@ #include "test_util.h" #include "kvm_util.h" #include "kselftest.h" +#include "ucall_common.h" #define PAGE_SHIFT 12 #define PAGE_SIZE (1 << PAGE_SHIFT) diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index bd57d991e2..bb8002084f 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include @@ -221,8 +220,20 @@ static void test_move_memory_region(void) static void guest_code_delete_memory_region(void) { + struct desc_ptr idt; uint64_t val; + /* + * Clobber the IDT so that a #PF due to the memory region being deleted + * escalates to triple-fault shutdown. Because the memory region is + * deleted, there will be no valid mappings. 
As a result, KVM will + * repeatedly intercepts the state-2 page fault that occurs when trying + * to vector the guest's #PF. I.e. trying to actually handle the #PF + * in the guest will never succeed, and so isn't an option. + */ + memset(&idt, 0, sizeof(idt)); + __asm__ __volatile__("lidt %0" :: "m"(idt)); + GUEST_SYNC(0); /* Spin until the memory region is deleted. */ @@ -339,7 +350,7 @@ static void test_invalid_memory_region_flags(void) #ifdef __x86_64__ if (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM)) - vm = vm_create_barebones_protected_vm(); + vm = vm_create_barebones_type(KVM_X86_SW_PROTECTED_VM); else #endif vm = vm_create_barebones(); @@ -462,7 +473,7 @@ static void test_add_private_memory_region(void) pr_info("Testing ADD of KVM_MEM_GUEST_MEMFD memory regions\n"); - vm = vm_create_barebones_protected_vm(); + vm = vm_create_barebones_type(KVM_X86_SW_PROTECTED_VM); test_invalid_guest_memfd(vm, vm->kvm_fd, 0, "KVM fd should fail"); test_invalid_guest_memfd(vm, vm->fd, 0, "VM's fd should fail"); @@ -471,7 +482,7 @@ static void test_add_private_memory_region(void) test_invalid_guest_memfd(vm, memfd, 0, "Regular memfd() should fail"); close(memfd); - vm2 = vm_create_barebones_protected_vm(); + vm2 = vm_create_barebones_type(KVM_X86_SW_PROTECTED_VM); memfd = vm_create_guest_memfd(vm2, MEM_REGION_SIZE, 0); test_invalid_guest_memfd(vm, memfd, 0, "Other VM's guest_memfd() should fail"); @@ -499,7 +510,7 @@ static void test_add_overlapping_private_memory_regions(void) pr_info("Testing ADD of overlapping KVM_MEM_GUEST_MEMFD memory regions\n"); - vm = vm_create_barebones_protected_vm(); + vm = vm_create_barebones_type(KVM_X86_SW_PROTECTED_VM); memfd = vm_create_guest_memfd(vm, MEM_REGION_SIZE * 4, 0); diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index bae0c5026f..a8d3afa0b8 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -4,20 +4,22 @@ * * 
Copyright (C) 2020, Red Hat, Inc. */ -#define _GNU_SOURCE #include #include #include #include #include #include -#ifndef __riscv +#ifdef __riscv +#include "sbi.h" +#else #include #endif #include "test_util.h" #include "kvm_util.h" #include "processor.h" +#include "ucall_common.h" #define NR_VCPUS 4 #define ST_GPA_BASE (1 << 30) @@ -83,20 +85,18 @@ static void steal_time_init(struct kvm_vcpu *vcpu, uint32_t i) static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpu_idx) { struct kvm_steal_time *st = addr_gva2hva(vm, (ulong)st_gva[vcpu_idx]); - int i; - pr_info("VCPU%d:\n", vcpu_idx); - pr_info(" steal: %lld\n", st->steal); - pr_info(" version: %d\n", st->version); - pr_info(" flags: %d\n", st->flags); - pr_info(" preempted: %d\n", st->preempted); - pr_info(" u8_pad: "); - for (i = 0; i < 3; ++i) - pr_info("%d", st->u8_pad[i]); - pr_info("\n pad: "); - for (i = 0; i < 11; ++i) - pr_info("%d", st->pad[i]); - pr_info("\n"); + ksft_print_msg("VCPU%d:\n", vcpu_idx); + ksft_print_msg(" steal: %lld\n", st->steal); + ksft_print_msg(" version: %d\n", st->version); + ksft_print_msg(" flags: %d\n", st->flags); + ksft_print_msg(" preempted: %d\n", st->preempted); + ksft_print_msg(" u8_pad: %d %d %d\n", + st->u8_pad[0], st->u8_pad[1], st->u8_pad[2]); + ksft_print_msg(" pad: %d %d %d %d %d %d %d %d %d %d %d\n", + st->pad[0], st->pad[1], st->pad[2], st->pad[3], + st->pad[4], st->pad[5], st->pad[6], st->pad[7], + st->pad[8], st->pad[9], st->pad[10]); } #elif defined(__aarch64__) @@ -199,10 +199,10 @@ static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpu_idx) { struct st_time *st = addr_gva2hva(vm, (ulong)st_gva[vcpu_idx]); - pr_info("VCPU%d:\n", vcpu_idx); - pr_info(" rev: %d\n", st->rev); - pr_info(" attr: %d\n", st->attr); - pr_info(" st_time: %ld\n", st->st_time); + ksft_print_msg("VCPU%d:\n", vcpu_idx); + ksft_print_msg(" rev: %d\n", st->rev); + ksft_print_msg(" attr: %d\n", st->attr); + ksft_print_msg(" st_time: %ld\n", st->st_time); } #elif defined(__riscv) @@ 
-366,7 +366,9 @@ int main(int ac, char **av) vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, ST_GPA_BASE, 1, gpages, 0); virt_map(vm, ST_GPA_BASE, ST_GPA_BASE, gpages); + ksft_print_header(); TEST_REQUIRE(is_steal_time_supported(vcpus[0])); + ksft_set_plan(NR_VCPUS); /* Run test on each VCPU */ for (i = 0; i < NR_VCPUS; ++i) { @@ -407,14 +409,15 @@ int main(int ac, char **av) run_delay, stolen_time); if (verbose) { - pr_info("VCPU%d: total-stolen-time=%ld test-stolen-time=%ld", i, - guest_stolen_time[i], stolen_time); - if (stolen_time == run_delay) - pr_info(" (BONUS: guest test-stolen-time even exactly matches test-run_delay)"); - pr_info("\n"); + ksft_print_msg("VCPU%d: total-stolen-time=%ld test-stolen-time=%ld%s\n", + i, guest_stolen_time[i], stolen_time, + stolen_time == run_delay ? + " (BONUS: guest test-stolen-time even exactly matches test-run_delay)" : ""); steal_time_dump(vm, i); } + ksft_test_result_pass("vcpu%d\n", i); } - return 0; + /* Print results and exit() accordingly */ + ksft_finished(); } diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c index eae521f050..903940c54d 100644 --- a/tools/testing/selftests/kvm/x86_64/amx_test.c +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -6,8 +6,6 @@ * * Tests for amx #NM exception and save/restore. 
*/ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include @@ -246,8 +244,6 @@ int main(int argc, char *argv[]) vcpu_regs_get(vcpu, ®s1); /* Register #NM handler */ - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); vm_install_exception_handler(vm, NM_VECTOR, guest_nm_handler); /* amx cfg for guest_code */ diff --git a/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c b/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c index ee3b384b99..2929c067c2 100644 --- a/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c +++ b/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c @@ -17,6 +17,7 @@ #include "test_util.h" #include "memstress.h" #include "guest_modes.h" +#include "ucall_common.h" #define VCPUS 2 #define SLOTS 2 diff --git a/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c b/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c index 6c2e5e0ceb..81055476d3 100644 --- a/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c +++ b/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c @@ -4,12 +4,9 @@ * * Test for KVM_CAP_EXIT_ON_EMULATION_FAILURE. 
*/ - -#define _GNU_SOURCE /* for program_invocation_short_name */ - #include "flds_emulation.h" - #include "test_util.h" +#include "ucall_common.h" #define MMIO_GPA 0x700000000 #define MMIO_GVA MMIO_GPA diff --git a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c index f3c2239228..762628f7d4 100644 --- a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c +++ b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c @@ -110,8 +110,6 @@ static void test_fix_hypercall(struct kvm_vcpu *vcpu, bool disable_quirk) { struct kvm_vm *vm = vcpu->vm; - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); vm_install_exception_handler(vcpu->vm, UD_VECTOR, guest_ud_handler); if (disable_quirk) diff --git a/tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c b/tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c index df351ae170..10b1b0ba37 100644 --- a/tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c +++ b/tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c @@ -2,8 +2,6 @@ /* * Copyright (C) 2023, Google LLC. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include "test_util.h" diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c index 5c27efbf40..4f5881d4ef 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c @@ -7,8 +7,6 @@ * This work is licensed under the terms of the GNU GPL, version 2. * */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c b/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c index 4c7257ecd2..e192720bfe 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c @@ -4,7 +4,6 @@ * * Tests for Enlightened VMCS, including nested guest state. 
*/ -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include @@ -258,8 +257,6 @@ int main(int argc, char *argv[]) vcpu_args_set(vcpu, 3, vmx_pages_gva, hv_pages_gva, addr_gva2gpa(vm, hcall_page)); vcpu_set_msr(vcpu, HV_X64_MSR_VP_INDEX, vcpu->id); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); vm_install_exception_handler(vm, NMI_VECTOR, guest_nmi_handler); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c index b923a285e9..068e9c6971 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c @@ -156,9 +156,6 @@ static void guest_test_msrs_access(void) vcpu_init_cpuid(vcpu, prev_cpuid); } - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - /* TODO: Make this entire test easier to maintain. */ if (stage >= 21) vcpu_enable_cap(vcpu, KVM_CAP_HYPERV_SYNIC2, 0); @@ -532,9 +529,6 @@ static void guest_test_hcalls_access(void) while (true) { vm = vm_create_with_one_vcpu(&vcpu, guest_hcall); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - /* Hypercall input/output */ hcall_page = vm_vaddr_alloc_pages(vm, 2); memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize()); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c b/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c index f1617762c2..22c0c12458 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c @@ -5,8 +5,6 @@ * Copyright (C) 2022, Red Hat, Inc. 
* */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include @@ -256,16 +254,13 @@ int main(int argc, char *argv[]) hcall_page = vm_vaddr_alloc_pages(vm, 2); memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize()); - vm_init_descriptor_tables(vm); vcpu[1] = vm_vcpu_add(vm, RECEIVER_VCPU_ID_1, receiver_code); - vcpu_init_descriptor_tables(vcpu[1]); vcpu_args_set(vcpu[1], 2, hcall_page, addr_gva2gpa(vm, hcall_page)); vcpu_set_msr(vcpu[1], HV_X64_MSR_VP_INDEX, RECEIVER_VCPU_ID_1); vcpu_set_hv_cpuid(vcpu[1]); vcpu[2] = vm_vcpu_add(vm, RECEIVER_VCPU_ID_2, receiver_code); - vcpu_init_descriptor_tables(vcpu[2]); vcpu_args_set(vcpu[2], 2, hcall_page, addr_gva2gpa(vm, hcall_page)); vcpu_set_msr(vcpu[2], HV_X64_MSR_VP_INDEX, RECEIVER_VCPU_ID_2); vcpu_set_hv_cpuid(vcpu[2]); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c index c9b18707ed..b987a3d797 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c @@ -4,7 +4,6 @@ * * Tests for Hyper-V extensions to SVM. */ -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c b/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c index 05b56095cf..077cd0ec30 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c @@ -5,8 +5,6 @@ * Copyright (C) 2022, Red Hat, Inc. 
* */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c index 40cc59f4e6..78878b3a27 100644 --- a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c +++ b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c @@ -183,9 +183,6 @@ int main(void) vcpu_clear_cpuid_entry(vcpu, KVM_CPUID_FEATURES); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - enter_guest(vcpu); kvm_vm_free(vm); diff --git a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c index 853802641e..2b550eff35 100644 --- a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c +++ b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c @@ -75,14 +75,12 @@ int main(int argc, char *argv[]) struct ucall uc; int testcase; + TEST_REQUIRE(this_cpu_has(X86_FEATURE_MWAIT)); TEST_REQUIRE(kvm_has_cap(KVM_CAP_DISABLE_QUIRKS2)); vm = vm_create_with_one_vcpu(&vcpu, guest_code); vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_MWAIT); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - while (1) { vcpu_run(vcpu); TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); diff --git a/tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c b/tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c index 3670331adf..3eb0313ffa 100644 --- a/tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c +++ b/tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c @@ -1,6 +1,4 @@ // SPDX-License-Identifier: GPL-2.0-only -#define _GNU_SOURCE /* for program_invocation_short_name */ - #include "test_util.h" #include "kvm_util.h" #include "processor.h" diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c index 17bbb96fc4..e7efb2b35f 100644 --- a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c +++ 
b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c @@ -5,9 +5,6 @@ * * Copyright (C) 2022, Google LLC. */ - -#define _GNU_SOURCE - #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh index 7cbb409801..caad084b8b 100755 --- a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh +++ b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh @@ -13,10 +13,21 @@ NX_HUGE_PAGES_RECOVERY_RATIO=$(cat /sys/module/kvm/parameters/nx_huge_pages_reco NX_HUGE_PAGES_RECOVERY_PERIOD=$(cat /sys/module/kvm/parameters/nx_huge_pages_recovery_period_ms) HUGE_PAGES=$(cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages) +# If we're already root, the host might not have sudo. +if [ $(whoami) == "root" ]; then + function do_sudo () { + "$@" + } +else + function do_sudo () { + sudo "$@" + } +fi + set +e function sudo_echo () { - echo "$1" | sudo tee -a "$2" > /dev/null + echo "$1" | do_sudo tee -a "$2" > /dev/null } NXECUTABLE="$(dirname $0)/nx_huge_pages_test" diff --git a/tools/testing/selftests/kvm/x86_64/platform_info_test.c b/tools/testing/selftests/kvm/x86_64/platform_info_test.c index 87011965dc..eda88080c1 100644 --- a/tools/testing/selftests/kvm/x86_64/platform_info_test.c +++ b/tools/testing/selftests/kvm/x86_64/platform_info_test.c @@ -9,8 +9,6 @@ * Verifies expected behavior of controlling guest access to * MSR_PLATFORM_INFO. 
*/ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include @@ -26,36 +24,18 @@ static void guest_code(void) { uint64_t msr_platform_info; + uint8_t vector; - for (;;) { - msr_platform_info = rdmsr(MSR_PLATFORM_INFO); - GUEST_SYNC(msr_platform_info); - asm volatile ("inc %r11"); - } -} - -static void test_msr_platform_info_enabled(struct kvm_vcpu *vcpu) -{ - struct ucall uc; - - vm_enable_cap(vcpu->vm, KVM_CAP_MSR_PLATFORM_INFO, true); - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + GUEST_SYNC(true); + msr_platform_info = rdmsr(MSR_PLATFORM_INFO); + GUEST_ASSERT_EQ(msr_platform_info & MSR_PLATFORM_INFO_MAX_TURBO_RATIO, + MSR_PLATFORM_INFO_MAX_TURBO_RATIO); - get_ucall(vcpu, &uc); - TEST_ASSERT(uc.cmd == UCALL_SYNC, - "Received ucall other than UCALL_SYNC: %lu", uc.cmd); - TEST_ASSERT((uc.args[1] & MSR_PLATFORM_INFO_MAX_TURBO_RATIO) == - MSR_PLATFORM_INFO_MAX_TURBO_RATIO, - "Expected MSR_PLATFORM_INFO to have max turbo ratio mask: %i.", - MSR_PLATFORM_INFO_MAX_TURBO_RATIO); -} + GUEST_SYNC(false); + vector = rdmsr_safe(MSR_PLATFORM_INFO, &msr_platform_info); + GUEST_ASSERT_EQ(vector, GP_VECTOR); -static void test_msr_platform_info_disabled(struct kvm_vcpu *vcpu) -{ - vm_enable_cap(vcpu->vm, KVM_CAP_MSR_PLATFORM_INFO, false); - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_SHUTDOWN); + GUEST_DONE(); } int main(int argc, char *argv[]) @@ -63,6 +43,7 @@ int main(int argc, char *argv[]) struct kvm_vcpu *vcpu; struct kvm_vm *vm; uint64_t msr_platform_info; + struct ucall uc; TEST_REQUIRE(kvm_has_cap(KVM_CAP_MSR_PLATFORM_INFO)); @@ -71,8 +52,26 @@ int main(int argc, char *argv[]) msr_platform_info = vcpu_get_msr(vcpu, MSR_PLATFORM_INFO); vcpu_set_msr(vcpu, MSR_PLATFORM_INFO, msr_platform_info | MSR_PLATFORM_INFO_MAX_TURBO_RATIO); - test_msr_platform_info_enabled(vcpu); - test_msr_platform_info_disabled(vcpu); + + for (;;) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch 
(get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + vm_enable_cap(vm, KVM_CAP_MSR_PLATFORM_INFO, uc.args[1]); + break; + case UCALL_DONE: + goto done; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + default: + TEST_FAIL("Unexpected ucall %lu", uc.cmd); + break; + } + } + +done: vcpu_set_msr(vcpu, MSR_PLATFORM_INFO, msr_platform_info); kvm_vm_free(vm); diff --git a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c index 26c85815f7..96446134c0 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c @@ -2,8 +2,6 @@ /* * Copyright (C) 2023, Tencent, Inc. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include "pmu.h" @@ -21,7 +19,6 @@ static uint8_t kvm_pmu_version; static bool kvm_has_perf_caps; -static bool is_forced_emulation_enabled; static struct kvm_vm *pmu_vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, void *guest_code, @@ -31,11 +28,7 @@ static struct kvm_vm *pmu_vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, struct kvm_vm *vm; vm = vm_create_with_one_vcpu(vcpu, guest_code); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(*vcpu); - sync_global_to_guest(vm, kvm_pmu_version); - sync_global_to_guest(vm, is_forced_emulation_enabled); /* * Set PERF_CAPABILITIES before PMU version as KVM disallows enabling @@ -630,7 +623,6 @@ int main(int argc, char *argv[]) kvm_pmu_version = kvm_cpu_property(X86_PROPERTY_PMU_VERSION); kvm_has_perf_caps = kvm_cpu_has(X86_FEATURE_PDCM); - is_forced_emulation_enabled = kvm_is_forced_emulation_enabled(); test_intel_counters(); diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c index 3c85d1ae98..26b3e7efe5 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -9,9 +9,6 @@ * Verifies the expected 
behavior of allow lists and deny lists for * virtual PMU events. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ - #include "kvm_util.h" #include "pmu.h" #include "processor.h" @@ -337,9 +334,6 @@ static void test_pmu_config_disable(void (*guest_code)(void)) vm_enable_cap(vm, KVM_CAP_PMU_CAPABILITY, KVM_PMU_CAP_DISABLE); vcpu = vm_vcpu_add(vm, 0, guest_code); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - TEST_ASSERT(!sanity_check_pmu(vcpu), "Guest should not be able to use disabled PMU."); @@ -876,9 +870,6 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, guest_code); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - TEST_REQUIRE(sanity_check_pmu(vcpu)); if (use_amd_pmu()) diff --git a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c index e0f642d2a3..82a8d88b53 100644 --- a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c +++ b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2022, Google LLC. */ -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c index 366cf18600..d691d86e5b 100644 --- a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c +++ b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c @@ -4,7 +4,6 @@ * * Copyright (C) 2020, Red Hat, Inc. 
*/ -#define _GNU_SOURCE /* for program_invocation_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c index 3610981d91..c021c0795a 100644 --- a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c +++ b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c @@ -10,7 +10,6 @@ * That bug allowed a user-mode program that called the KVM_SET_SREGS * ioctl to put a VCPU's local APIC into an invalid state. */ -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c b/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c new file mode 100644 index 0000000000..3fb967f40c --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "kselftest.h" + +#define SVM_SEV_FEAT_DEBUG_SWAP 32u + +/* + * Some features may have hidden dependencies, or may only work + * for certain VM types. Err on the side of safety and don't + * expect that all supported features can be passed one by one + * to KVM_SEV_INIT2. + * + * (Well, right now there's only one...) 
+ */ +#define KNOWN_FEATURES SVM_SEV_FEAT_DEBUG_SWAP + +int kvm_fd; +u64 supported_vmsa_features; +bool have_sev_es; + +static int __sev_ioctl(int vm_fd, int cmd_id, void *data) +{ + struct kvm_sev_cmd cmd = { + .id = cmd_id, + .data = (uint64_t)data, + .sev_fd = open_sev_dev_path_or_exit(), + }; + int ret; + + ret = ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd); + TEST_ASSERT(ret < 0 || cmd.error == SEV_RET_SUCCESS, + "%d failed: fw error: %d\n", + cmd_id, cmd.error); + + return ret; +} + +static void test_init2(unsigned long vm_type, struct kvm_sev_init *init) +{ + struct kvm_vm *vm; + int ret; + + vm = vm_create_barebones_type(vm_type); + ret = __sev_ioctl(vm->fd, KVM_SEV_INIT2, init); + TEST_ASSERT(ret == 0, + "KVM_SEV_INIT2 return code is %d (expected 0), errno: %d", + ret, errno); + kvm_vm_free(vm); +} + +static void test_init2_invalid(unsigned long vm_type, struct kvm_sev_init *init, const char *msg) +{ + struct kvm_vm *vm; + int ret; + + vm = vm_create_barebones_type(vm_type); + ret = __sev_ioctl(vm->fd, KVM_SEV_INIT2, init); + TEST_ASSERT(ret == -1 && errno == EINVAL, + "KVM_SEV_INIT2 should fail, %s.", + msg); + kvm_vm_free(vm); +} + +void test_vm_types(void) +{ + test_init2(KVM_X86_SEV_VM, &(struct kvm_sev_init){}); + + /* + * TODO: check that unsupported types cannot be created. Probably + * a separate selftest. 
+ */ + if (have_sev_es) + test_init2(KVM_X86_SEV_ES_VM, &(struct kvm_sev_init){}); + + test_init2_invalid(0, &(struct kvm_sev_init){}, + "VM type is KVM_X86_DEFAULT_VM"); + if (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM)) + test_init2_invalid(KVM_X86_SW_PROTECTED_VM, &(struct kvm_sev_init){}, + "VM type is KVM_X86_SW_PROTECTED_VM"); +} + +void test_flags(uint32_t vm_type) +{ + int i; + + for (i = 0; i < 32; i++) + test_init2_invalid(vm_type, + &(struct kvm_sev_init){ .flags = BIT(i) }, + "invalid flag"); +} + +void test_features(uint32_t vm_type, uint64_t supported_features) +{ + int i; + + for (i = 0; i < 64; i++) { + if (!(supported_features & BIT_ULL(i))) + test_init2_invalid(vm_type, + &(struct kvm_sev_init){ .vmsa_features = BIT_ULL(i) }, + "unknown feature"); + else if (KNOWN_FEATURES & BIT_ULL(i)) + test_init2(vm_type, + &(struct kvm_sev_init){ .vmsa_features = BIT_ULL(i) }); + } +} + +int main(int argc, char *argv[]) +{ + int kvm_fd = open_kvm_dev_path_or_exit(); + bool have_sev; + + TEST_REQUIRE(__kvm_has_device_attr(kvm_fd, KVM_X86_GRP_SEV, + KVM_X86_SEV_VMSA_FEATURES) == 0); + kvm_device_attr_get(kvm_fd, KVM_X86_GRP_SEV, + KVM_X86_SEV_VMSA_FEATURES, + &supported_vmsa_features); + + have_sev = kvm_cpu_has(X86_FEATURE_SEV); + TEST_ASSERT(have_sev == !!(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_VM)), + "sev: KVM_CAP_VM_TYPES (%x) does not match cpuid (checking %x)", + kvm_check_cap(KVM_CAP_VM_TYPES), 1 << KVM_X86_SEV_VM); + + TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_VM)); + have_sev_es = kvm_cpu_has(X86_FEATURE_SEV_ES); + + TEST_ASSERT(have_sev_es == !!(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_ES_VM)), + "sev-es: KVM_CAP_VM_TYPES (%x) does not match cpuid (checking %x)", + kvm_check_cap(KVM_CAP_VM_TYPES), 1 << KVM_X86_SEV_ES_VM); + + test_vm_types(); + + test_flags(KVM_X86_SEV_VM); + if (have_sev_es) + test_flags(KVM_X86_SEV_ES_VM); + + test_features(KVM_X86_SEV_VM, 0); + if (have_sev_es) + 
test_features(KVM_X86_SEV_ES_VM, supported_vmsa_features); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c index 026779f3ed..7c70c0da4f 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c +++ b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "test_util.h" #include "kvm_util.h" @@ -13,6 +14,8 @@ #include "sev.h" +#define XFEATURE_MASK_X87_AVX (XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM) + static void guest_sev_es_code(void) { /* TODO: Check CPUID after GHCB-based hypercall support is added. */ @@ -35,13 +38,98 @@ static void guest_sev_code(void) GUEST_DONE(); } +/* Stash state passed via VMSA before any compiled code runs. */ +extern void guest_code_xsave(void); +asm("guest_code_xsave:\n" + "mov $-1, %eax\n" + "mov $-1, %edx\n" + "xsave (%rdi)\n" + "jmp guest_sev_es_code"); + +static void compare_xsave(u8 *from_host, u8 *from_guest) +{ + int i; + bool bad = false; + for (i = 0; i < 4095; i++) { + if (from_host[i] != from_guest[i]) { + printf("mismatch at %02hhx | %02hhx %02hhx\n", i, from_host[i], from_guest[i]); + bad = true; + } + } + + if (bad) + abort(); +} + +static void test_sync_vmsa(uint32_t policy) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + vm_vaddr_t gva; + void *hva; + + double x87val = M_PI; + struct kvm_xsave __attribute__((aligned(64))) xsave = { 0 }; + struct kvm_sregs sregs; + struct kvm_xcrs xcrs = { + .nr_xcrs = 1, + .xcrs[0].xcr = 0, + .xcrs[0].value = XFEATURE_MASK_X87_AVX, + }; + + vm = vm_sev_create_with_one_vcpu(KVM_X86_SEV_ES_VM, guest_code_xsave, &vcpu); + gva = vm_vaddr_alloc_shared(vm, PAGE_SIZE, KVM_UTIL_MIN_VADDR, + MEM_REGION_TEST_DATA); + hva = addr_gva2hva(vm, gva); + + vcpu_args_set(vcpu, 1, gva); + + vcpu_sregs_get(vcpu, &sregs); + sregs.cr4 |= X86_CR4_OSFXSR | X86_CR4_OSXSAVE; + vcpu_sregs_set(vcpu, &sregs); + + vcpu_xcrs_set(vcpu, &xcrs); + 
asm("fninit\n" + "vpcmpeqb %%ymm4, %%ymm4, %%ymm4\n" + "fldl %3\n" + "xsave (%2)\n" + "fstp %%st\n" + : "=m"(xsave) + : "A"(XFEATURE_MASK_X87_AVX), "r"(&xsave), "m" (x87val) + : "ymm4", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"); + vcpu_xsave_set(vcpu, &xsave); + + vm_sev_launch(vm, SEV_POLICY_ES | policy, NULL); + + /* This page is shared, so make it decrypted. */ + memset(hva, 0, 4096); + + vcpu_run(vcpu); + + TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_SYSTEM_EVENT, + "Wanted SYSTEM_EVENT, got %s", + exit_reason_str(vcpu->run->exit_reason)); + TEST_ASSERT_EQ(vcpu->run->system_event.type, KVM_SYSTEM_EVENT_SEV_TERM); + TEST_ASSERT_EQ(vcpu->run->system_event.ndata, 1); + TEST_ASSERT_EQ(vcpu->run->system_event.data[0], GHCB_MSR_TERM_REQ); + + compare_xsave((u8 *)&xsave, (u8 *)hva); + + kvm_vm_free(vm); +} + static void test_sev(void *guest_code, uint64_t policy) { struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct ucall uc; - vm = vm_sev_create_with_one_vcpu(policy, guest_code, &vcpu); + uint32_t type = policy & SEV_POLICY_ES ? KVM_X86_SEV_ES_VM : KVM_X86_SEV_VM; + + vm = vm_sev_create_with_one_vcpu(type, guest_code, &vcpu); + + /* TODO: Validate the measurement is as expected. 
*/ + vm_sev_launch(vm, policy, NULL); for (;;) { vcpu_run(vcpu); @@ -82,6 +170,12 @@ int main(int argc, char *argv[]) if (kvm_cpu_has(X86_FEATURE_SEV_ES)) { test_sev(guest_sev_es_code, SEV_POLICY_ES | SEV_POLICY_NO_DBG); test_sev(guest_sev_es_code, SEV_POLICY_ES); + + if (kvm_has_cap(KVM_CAP_XCRS) && + (xgetbv(0) & XFEATURE_MASK_X87_AVX) == XFEATURE_MASK_X87_AVX) { + test_sync_vmsa(0); + test_sync_vmsa(SEV_POLICY_NO_DBG); + } } return 0; diff --git a/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c index 416207c38a..fabeeaddfb 100644 --- a/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c +++ b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c @@ -5,9 +5,6 @@ * Test that KVM emulates instructions in response to EPT violations when * allow_smaller_maxphyaddr is enabled and guest.MAXPHYADDR < host.MAXPHYADDR. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ - #include "flds_emulation.h" #include "test_util.h" @@ -60,9 +57,6 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, guest_code); vcpu_args_set(vcpu, 1, kvm_is_tdp_enabled()); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - vcpu_set_cpuid_property(vcpu, X86_PROPERTY_MAX_PHY_ADDR, MAXPHYADDR); rc = kvm_check_cap(KVM_CAP_EXIT_ON_EMULATION_FAILURE); diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c index e18b86666e..55c88d664a 100644 --- a/tools/testing/selftests/kvm/x86_64/smm_test.c +++ b/tools/testing/selftests/kvm/x86_64/smm_test.c @@ -4,7 +4,6 @@ * * Tests for SMM. 
*/ -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c index 88b58aab72..1c756db329 100644 --- a/tools/testing/selftests/kvm/x86_64/state_test.c +++ b/tools/testing/selftests/kvm/x86_64/state_test.c @@ -6,7 +6,6 @@ * * Tests for vCPU state save/restore, including nested guest state. */ -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c b/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c index 32bef39bec..916e04248f 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c @@ -93,9 +93,6 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - vm_install_exception_handler(vm, VINTR_IRQ_NUMBER, vintr_irq_handler); vm_install_exception_handler(vm, INTR_IRQ_NUMBER, intr_irq_handler); diff --git a/tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c b/tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c index d6fcdcc3af..00135cbba3 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c @@ -48,12 +48,9 @@ int main(int argc, char *argv[]) TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - vcpu_alloc_svm(vm, &svm_gva); - vcpu_args_set(vcpu, 2, svm_gva, vm->idt); + vcpu_args_set(vcpu, 2, svm_gva, vm->arch.idt); vcpu_run(vcpu); TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_SHUTDOWN); diff --git a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c index 
0c7ce3d4e8..7b6481d6c0 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c @@ -152,9 +152,6 @@ static void run_test(bool is_nmi) vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - vm_install_exception_handler(vm, NMI_VECTOR, guest_nmi_handler); vm_install_exception_handler(vm, BP_VECTOR, guest_bp_handler); vm_install_exception_handler(vm, INT_NR, guest_int_handler); @@ -166,7 +163,7 @@ static void run_test(bool is_nmi) idt_alt_vm = vm_vaddr_alloc_page(vm); idt_alt = addr_gva2hva(vm, idt_alt_vm); - idt = addr_gva2hva(vm, vm->idt); + idt = addr_gva2hva(vm, vm->arch.idt); memcpy(idt_alt, idt, getpagesize()); } else { idt_alt_vm = 0; diff --git a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c index adb5593daf..8fa3948b01 100644 --- a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c +++ b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c @@ -8,8 +8,6 @@ * including requesting an invalid register set, updates to/from values * in kvm_run.s.regs when kvm_valid_regs and kvm_dirty_regs are toggled. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c b/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c index dcbb3c29fb..57f157c06b 100644 --- a/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c +++ b/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c @@ -17,14 +17,11 @@ * delivered into the guest or not. 
* */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include #include -#include "kvm_util_base.h" #include "kvm_util.h" #include "mce.h" #include "processor.h" @@ -285,10 +282,6 @@ int main(int argc, char *argv[]) cmcidis_vcpu = create_vcpu_with_mce_cap(vm, 1, false, cmci_disabled_guest_code); cmci_vcpu = create_vcpu_with_mce_cap(vm, 2, true, cmci_enabled_guest_code); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(ucna_vcpu); - vcpu_init_descriptor_tables(cmcidis_vcpu); - vcpu_init_descriptor_tables(cmci_vcpu); vm_install_exception_handler(vm, CMCI_VECTOR, guest_cmci_handler); vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); diff --git a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c index f4f61a2d24..32b2794b78 100644 --- a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c +++ b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c @@ -4,8 +4,6 @@ * * Tests for exiting into userspace on registered MSRs */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include "kvm_test_harness.h" @@ -13,8 +11,6 @@ #include "kvm_util.h" #include "vmx.h" -static bool fep_available; - #define MSR_NON_EXISTENT 0x474f4f00 static u64 deny_bits = 0; @@ -258,7 +254,7 @@ static void guest_code_filter_allow(void) GUEST_ASSERT(data == 2); GUEST_ASSERT(guest_exception_count == 0); - if (fep_available) { + if (is_forced_emulation_enabled) { /* Let userspace know we aren't done. 
*/ GUEST_SYNC(0); @@ -520,8 +516,6 @@ KVM_ONE_VCPU_TEST(user_msr, msr_filter_allow, guest_code_filter_allow) uint64_t cmd; int rc; - sync_global_to_guest(vm, fep_available); - rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR); TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available"); vm_enable_cap(vm, KVM_CAP_X86_USER_SPACE_MSR, KVM_MSR_EXIT_REASON_FILTER); @@ -531,9 +525,6 @@ KVM_ONE_VCPU_TEST(user_msr, msr_filter_allow, guest_code_filter_allow) vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_allow); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); /* Process guest code userspace exits. */ @@ -551,7 +542,7 @@ KVM_ONE_VCPU_TEST(user_msr, msr_filter_allow, guest_code_filter_allow) vcpu_run(vcpu); cmd = process_ucall(vcpu); - if (fep_available) { + if (is_forced_emulation_enabled) { TEST_ASSERT_EQ(cmd, UCALL_SYNC); vm_install_exception_handler(vm, GP_VECTOR, guest_fep_gp_handler); @@ -774,7 +765,5 @@ KVM_ONE_VCPU_TEST(user_msr, user_exit_msr_flags, NULL) int main(int argc, char *argv[]) { - fep_available = kvm_is_forced_emulation_enabled(); - return test_harness_run(argc, argv); } diff --git a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c index 977948fd52..fa512d0332 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c @@ -4,9 +4,6 @@ * * Copyright (C) 2018, Red Hat, Inc. 
*/ - -#define _GNU_SOURCE /* for program_invocation_name */ - #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c index fad3634fd9..3fd6eceab4 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c @@ -115,9 +115,6 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, guest_code); get_set_sigalrm_vcpu(vcpu); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); /* diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c index ea0cb3cae0..7c92536551 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c @@ -10,7 +10,6 @@ * and check it can be retrieved with KVM_GET_MSR, also test * the invalid LBR formats are rejected. 
*/ -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include @@ -86,9 +85,6 @@ KVM_ONE_VCPU_TEST(vmx_pmu_caps, guest_wrmsr_perf_capabilities, guest_code) struct ucall uc; int r, i; - vm_init_descriptor_tables(vcpu->vm); - vcpu_init_descriptor_tables(vcpu); - vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities); vcpu_args_set(vcpu, 1, host_cap.capabilities); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c index affc328001..00dd2ac07a 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c @@ -9,7 +9,6 @@ * value instead of partially decayed timer value * */ -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c index 725c206ba0..a76078a08f 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c @@ -19,8 +19,6 @@ * Migration is a command line option. When used on non-numa machines will * exit with error. Test is still usefull on non-numa for testing IPIs. 
*/ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include @@ -410,8 +408,6 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(¶ms[0].vcpu, halter_guest_code); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(params[0].vcpu); vm_install_exception_handler(vm, IPI_VECTOR, guest_ipi_handler); virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); diff --git a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c index ab75b873a4..69849acd95 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0-only -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c b/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c index 25a0b0db5c..95ce192d07 100644 --- a/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c +++ b/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c @@ -109,9 +109,6 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, guest_code); run = vcpu->run; - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - while (1) { vcpu_run(vcpu); diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index d2ea0435f4..a59b3c799b 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -125,7 +125,7 @@ struct compat_vcpu_runstate_info { uint32_t state; uint64_t state_entry_time; uint64_t time[5]; -} __attribute__((__packed__));; +} __attribute__((__packed__)); struct arch_vcpu_info { unsigned long cr2; @@ -171,8 +171,9 @@ static volatile bool guest_saw_irq; static void evtchn_handler(struct ex_regs *regs) { struct vcpu_info *vi = (void *)VCPU_INFO_VADDR; - 
vi->evtchn_upcall_pending = 0; - vi->evtchn_pending_sel = 0; + + vcpu_arch_put_guest(vi->evtchn_upcall_pending, 0); + vcpu_arch_put_guest(vi->evtchn_pending_sel, 0); guest_saw_irq = true; GUEST_SYNC(TEST_GUEST_SAW_IRQ); @@ -380,20 +381,6 @@ wait_for_timer: GUEST_SYNC(TEST_DONE); } -static int cmp_timespec(struct timespec *a, struct timespec *b) -{ - if (a->tv_sec > b->tv_sec) - return 1; - else if (a->tv_sec < b->tv_sec) - return -1; - else if (a->tv_nsec > b->tv_nsec) - return 1; - else if (a->tv_nsec < b->tv_nsec) - return -1; - else - return 0; -} - static struct shared_info *shinfo; static struct vcpu_info *vinfo; static struct kvm_vcpu *vcpu; @@ -449,7 +436,6 @@ static void *juggle_shinfo_state(void *arg) int main(int argc, char *argv[]) { - struct timespec min_ts, max_ts, vm_ts; struct kvm_xen_hvm_attr evt_reset; struct kvm_vm *vm; pthread_t thread; @@ -468,8 +454,6 @@ int main(int argc, char *argv[]) bool do_evtchn_tests = do_eventfd_tests && !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND); bool has_shinfo_hva = !!(xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA); - clock_gettime(CLOCK_REALTIME, &min_ts); - vm = vm_create_with_one_vcpu(&vcpu, guest_code); /* Map a region for the shared_info page */ @@ -553,8 +537,6 @@ int main(int argc, char *argv[]) }; vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &vec); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); vm_install_exception_handler(vm, EVTCHN_VECTOR, evtchn_handler); if (do_runstate_tests) { @@ -1010,7 +992,6 @@ int main(int argc, char *argv[]) vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &evt_reset); alarm(0); - clock_gettime(CLOCK_REALTIME, &max_ts); /* * Just a *really* basic check that things are being put in the @@ -1019,6 +1000,8 @@ int main(int argc, char *argv[]) */ struct pvclock_wall_clock *wc; struct pvclock_vcpu_time_info *ti, *ti2; + struct kvm_clock_data kcdata; + long long delta; wc = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0xc00); ti = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0x40 + 0x20); @@ -1034,12 
+1017,34 @@ int main(int argc, char *argv[]) ti2->tsc_shift, ti2->flags); } - vm_ts.tv_sec = wc->sec; - vm_ts.tv_nsec = wc->nsec; TEST_ASSERT(wc->version && !(wc->version & 1), "Bad wallclock version %x", wc->version); - TEST_ASSERT(cmp_timespec(&min_ts, &vm_ts) <= 0, "VM time too old"); - TEST_ASSERT(cmp_timespec(&max_ts, &vm_ts) >= 0, "VM time too new"); + + vm_ioctl(vm, KVM_GET_CLOCK, &kcdata); + + if (kcdata.flags & KVM_CLOCK_REALTIME) { + if (verbose) { + printf("KVM_GET_CLOCK clock: %lld.%09lld\n", + kcdata.clock / NSEC_PER_SEC, kcdata.clock % NSEC_PER_SEC); + printf("KVM_GET_CLOCK realtime: %lld.%09lld\n", + kcdata.realtime / NSEC_PER_SEC, kcdata.realtime % NSEC_PER_SEC); + } + + delta = (wc->sec * NSEC_PER_SEC + wc->nsec) - (kcdata.realtime - kcdata.clock); + + /* + * KVM_GET_CLOCK gives CLOCK_REALTIME which jumps on leap seconds updates but + * unfortunately KVM doesn't currently offer a CLOCK_TAI alternative. Accept 1s + * delta as testing clock accuracy is not the goal here. The test just needs to + * check that the value in shinfo is somewhat sane. + */ + TEST_ASSERT(llabs(delta) < NSEC_PER_SEC, + "Guest's epoch from shinfo %d.%09d differs from KVM_GET_CLOCK %lld.%lld", + wc->sec, wc->nsec, (kcdata.realtime - kcdata.clock) / NSEC_PER_SEC, + (kcdata.realtime - kcdata.clock) % NSEC_PER_SEC); + } else { + pr_info("Missing KVM_CLOCK_REALTIME, skipping shinfo epoch sanity check\n"); + } TEST_ASSERT(ti->version && !(ti->version & 1), "Bad time_info version %x", ti->version); diff --git a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c index 167c97abff..f331a4e9ba 100644 --- a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c +++ b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c @@ -4,8 +4,6 @@ * * Tests for the IA32_XSS MSR. 
*/ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include "test_util.h" diff --git a/tools/testing/selftests/landlock/base_test.c b/tools/testing/selftests/landlock/base_test.c index a6f89aaea7..3b26bf3cf5 100644 --- a/tools/testing/selftests/landlock/base_test.c +++ b/tools/testing/selftests/landlock/base_test.c @@ -9,6 +9,7 @@ #define _GNU_SOURCE #include #include +#include #include #include #include @@ -75,7 +76,7 @@ TEST(abi_version) const struct landlock_ruleset_attr ruleset_attr = { .handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE, }; - ASSERT_EQ(4, landlock_create_ruleset(NULL, 0, + ASSERT_EQ(5, landlock_create_ruleset(NULL, 0, LANDLOCK_CREATE_RULESET_VERSION)); ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr, 0, @@ -326,4 +327,77 @@ TEST(ruleset_fd_transfer) ASSERT_EQ(EXIT_SUCCESS, WEXITSTATUS(status)); } +TEST(cred_transfer) +{ + struct landlock_ruleset_attr ruleset_attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_READ_DIR, + }; + int ruleset_fd, dir_fd; + pid_t child; + int status; + + drop_caps(_metadata); + + dir_fd = open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC); + EXPECT_LE(0, dir_fd); + EXPECT_EQ(0, close(dir_fd)); + + /* Denies opening directories. */ + ruleset_fd = + landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0); + ASSERT_LE(0, ruleset_fd); + EXPECT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)); + ASSERT_EQ(0, landlock_restrict_self(ruleset_fd, 0)); + EXPECT_EQ(0, close(ruleset_fd)); + + /* Checks ruleset enforcement. */ + EXPECT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC)); + EXPECT_EQ(EACCES, errno); + + /* Needed for KEYCTL_SESSION_TO_PARENT permission checks */ + EXPECT_NE(-1, syscall(__NR_keyctl, KEYCTL_JOIN_SESSION_KEYRING, NULL, 0, + 0, 0)) + { + TH_LOG("Failed to join session keyring: %s", strerror(errno)); + } + + child = fork(); + ASSERT_LE(0, child); + if (child == 0) { + /* Checks ruleset enforcement. 
*/ + EXPECT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC)); + EXPECT_EQ(EACCES, errno); + + /* + * KEYCTL_SESSION_TO_PARENT is a no-op unless we have a + * different session keyring in the child, so make that happen. + */ + EXPECT_NE(-1, syscall(__NR_keyctl, KEYCTL_JOIN_SESSION_KEYRING, + NULL, 0, 0, 0)); + + /* + * KEYCTL_SESSION_TO_PARENT installs credentials on the parent + * that never go through the cred_prepare hook, this path uses + * cred_transfer instead. + */ + EXPECT_EQ(0, syscall(__NR_keyctl, KEYCTL_SESSION_TO_PARENT, 0, + 0, 0, 0)); + + /* Re-checks ruleset enforcement. */ + EXPECT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC)); + EXPECT_EQ(EACCES, errno); + + _exit(_metadata->exit_code); + return; + } + + EXPECT_EQ(child, waitpid(child, &status, 0)); + EXPECT_EQ(1, WIFEXITED(status)); + EXPECT_EQ(EXIT_SUCCESS, WEXITSTATUS(status)); + + /* Re-checks ruleset enforcement. */ + EXPECT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC)); + EXPECT_EQ(EACCES, errno); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/landlock/config b/tools/testing/selftests/landlock/config index 0086efaa7b..29af19c4e9 100644 --- a/tools/testing/selftests/landlock/config +++ b/tools/testing/selftests/landlock/config @@ -2,6 +2,7 @@ CONFIG_CGROUPS=y CONFIG_CGROUP_SCHED=y CONFIG_INET=y CONFIG_IPV6=y +CONFIG_KEYS=y CONFIG_NET=y CONFIG_NET_NS=y CONFIG_OVERLAY_FS=y diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index 27744524df..7d063c652b 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -8,22 +8,35 @@ */ #define _GNU_SOURCE +#include #include #include +#include #include #include #include +#include #include #include #include +#include #include #include #include +#include #include #include +#include #include #include +/* + * Intentionally included last to work around header conflict. 
+ * See https://sourceware.org/glibc/wiki/Synchronizing_Headers. + */ +#include +#include + #include "common.h" #ifndef renameat2 @@ -35,6 +48,13 @@ int renameat2(int olddirfd, const char *oldpath, int newdirfd, } #endif +#ifndef open_tree +int open_tree(int dfd, const char *filename, unsigned int flags) +{ + return syscall(__NR_open_tree, dfd, filename, flags); +} +#endif + #ifndef RENAME_EXCHANGE #define RENAME_EXCHANGE (1 << 1) #endif @@ -536,9 +556,10 @@ TEST_F_FORK(layout1, inval) LANDLOCK_ACCESS_FS_EXECUTE | \ LANDLOCK_ACCESS_FS_WRITE_FILE | \ LANDLOCK_ACCESS_FS_READ_FILE | \ - LANDLOCK_ACCESS_FS_TRUNCATE) + LANDLOCK_ACCESS_FS_TRUNCATE | \ + LANDLOCK_ACCESS_FS_IOCTL_DEV) -#define ACCESS_LAST LANDLOCK_ACCESS_FS_TRUNCATE +#define ACCESS_LAST LANDLOCK_ACCESS_FS_IOCTL_DEV #define ACCESS_ALL ( \ ACCESS_FILE | \ @@ -743,6 +764,9 @@ static int create_ruleset(struct __test_metadata *const _metadata, } for (i = 0; rules[i].path; i++) { + if (!rules[i].access) + continue; + add_path_beneath(_metadata, ruleset_fd, rules[i].access, rules[i].path); } @@ -2384,6 +2408,43 @@ TEST_F_FORK(layout1, refer_denied_by_default4) layer_dir_s1d1_refer); } +/* + * Tests walking through a denied root mount. + */ +TEST_F_FORK(layout1, refer_mount_root_deny) +{ + const struct landlock_ruleset_attr ruleset_attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_MAKE_DIR, + }; + int root_fd, ruleset_fd; + + /* Creates a mount object from a non-mount point. */ + set_cap(_metadata, CAP_SYS_ADMIN); + root_fd = + open_tree(AT_FDCWD, dir_s1d1, + AT_EMPTY_PATH | OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + clear_cap(_metadata, CAP_SYS_ADMIN); + ASSERT_LE(0, root_fd); + + ruleset_fd = + landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0); + ASSERT_LE(0, ruleset_fd); + + ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)); + ASSERT_EQ(0, landlock_restrict_self(ruleset_fd, 0)); + EXPECT_EQ(0, close(ruleset_fd)); + + /* Link denied by Landlock: EACCES. 
*/ + EXPECT_EQ(-1, linkat(root_fd, ".", root_fd, "does_not_exist", 0)); + EXPECT_EQ(EACCES, errno); + + /* renameat2() always returns EBUSY. */ + EXPECT_EQ(-1, renameat2(root_fd, ".", root_fd, "does_not_exist", 0)); + EXPECT_EQ(EBUSY, errno); + + EXPECT_EQ(0, close(root_fd)); +} + TEST_F_FORK(layout1, reparent_link) { const struct rule layer1[] = { @@ -3451,7 +3512,7 @@ TEST_F_FORK(layout1, truncate_unhandled) LANDLOCK_ACCESS_FS_WRITE_FILE; int ruleset_fd; - /* Enable Landlock. */ + /* Enables Landlock. */ ruleset_fd = create_ruleset(_metadata, handled, rules); ASSERT_LE(0, ruleset_fd); @@ -3534,7 +3595,7 @@ TEST_F_FORK(layout1, truncate) LANDLOCK_ACCESS_FS_TRUNCATE; int ruleset_fd; - /* Enable Landlock. */ + /* Enables Landlock. */ ruleset_fd = create_ruleset(_metadata, handled, rules); ASSERT_LE(0, ruleset_fd); @@ -3760,7 +3821,7 @@ TEST_F_FORK(ftruncate, open_and_ftruncate) }; int fd, ruleset_fd; - /* Enable Landlock. */ + /* Enables Landlock. */ ruleset_fd = create_ruleset(_metadata, variant->handled, rules); ASSERT_LE(0, ruleset_fd); enforce_ruleset(_metadata, ruleset_fd); @@ -3837,20 +3898,469 @@ TEST_F_FORK(ftruncate, open_and_ftruncate_in_different_processes) ASSERT_EQ(0, close(socket_fds[1])); } -TEST(memfd_ftruncate) +/* Invokes the FS_IOC_GETFLAGS IOCTL and returns its errno or 0. */ +static int test_fs_ioc_getflags_ioctl(int fd) { - int fd; + uint32_t flags; + + if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) + return errno; + return 0; +} + +TEST(memfd_ftruncate_and_ioctl) +{ + const struct landlock_ruleset_attr attr = { + .handled_access_fs = ACCESS_ALL, + }; + int ruleset_fd, fd, i; + + /* + * We exercise the same test both with and without Landlock enabled, to + * ensure that it behaves the same in both cases. + */ + for (i = 0; i < 2; i++) { + /* Creates a new memfd. 
*/ + fd = memfd_create("name", MFD_CLOEXEC); + ASSERT_LE(0, fd); + + /* + * Checks that operations associated with the opened file + * (ftruncate, ioctl) are permitted on file descriptors that are + * created in ways other than open(2). + */ + EXPECT_EQ(0, test_ftruncate(fd)); + EXPECT_EQ(0, test_fs_ioc_getflags_ioctl(fd)); + + ASSERT_EQ(0, close(fd)); + + /* Enables Landlock. */ + ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + } +} + +static int test_fionread_ioctl(int fd) +{ + size_t sz = 0; + + if (ioctl(fd, FIONREAD, &sz) < 0 && errno == EACCES) + return errno; + return 0; +} + +TEST_F_FORK(layout1, o_path_ftruncate_and_ioctl) +{ + const struct landlock_ruleset_attr attr = { + .handled_access_fs = ACCESS_ALL, + }; + int ruleset_fd, fd; + + /* + * Checks that for files opened with O_PATH, both ioctl(2) and + * ftruncate(2) yield EBADF, as it is documented in open(2) for the + * O_PATH flag. + */ + fd = open(dir_s1d1, O_PATH | O_CLOEXEC); + ASSERT_LE(0, fd); + + EXPECT_EQ(EBADF, test_ftruncate(fd)); + EXPECT_EQ(EBADF, test_fs_ioc_getflags_ioctl(fd)); + + ASSERT_EQ(0, close(fd)); + + /* Enables Landlock. */ + ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* + * Checks that after enabling Landlock, + * - the file can still be opened with O_PATH + * - both ioctl and truncate still yield EBADF (not EACCES). + */ + fd = open(dir_s1d1, O_PATH | O_CLOEXEC); + ASSERT_LE(0, fd); + + EXPECT_EQ(EBADF, test_ftruncate(fd)); + EXPECT_EQ(EBADF, test_fs_ioc_getflags_ioctl(fd)); + + ASSERT_EQ(0, close(fd)); +} + +/* + * ioctl_error - generically call the given ioctl with a pointer to a + * sufficiently large zeroed-out memory region. + * + * Returns the IOCTLs error, or 0. 
+ */ +static int ioctl_error(struct __test_metadata *const _metadata, int fd, + unsigned int cmd) +{ + char buf[128]; /* sufficiently large */ + int res, stdinbak_fd; + + /* + * Depending on the IOCTL command, parts of the zeroed-out buffer might + * be interpreted as file descriptor numbers. We do not want to + * accidentally operate on file descriptor 0 (stdin), so we temporarily + * move stdin to a different FD and close FD 0 for the IOCTL call. + */ + stdinbak_fd = dup(0); + ASSERT_LT(0, stdinbak_fd); + ASSERT_EQ(0, close(0)); + + /* Invokes the IOCTL with a zeroed-out buffer. */ + bzero(&buf, sizeof(buf)); + res = ioctl(fd, cmd, &buf); + + /* Restores the old FD 0 and closes the backup FD. */ + ASSERT_EQ(0, dup2(stdinbak_fd, 0)); + ASSERT_EQ(0, close(stdinbak_fd)); + + if (res < 0) + return errno; + + return 0; +} + +/* Define some linux/falloc.h IOCTL commands which are not available in uapi headers. */ +struct space_resv { + __s16 l_type; + __s16 l_whence; + __s64 l_start; + __s64 l_len; /* len == 0 means until end of file */ + __s32 l_sysid; + __u32 l_pid; + __s32 l_pad[4]; /* reserved area */ +}; + +#define FS_IOC_RESVSP _IOW('X', 40, struct space_resv) +#define FS_IOC_UNRESVSP _IOW('X', 41, struct space_resv) +#define FS_IOC_RESVSP64 _IOW('X', 42, struct space_resv) +#define FS_IOC_UNRESVSP64 _IOW('X', 43, struct space_resv) +#define FS_IOC_ZERO_RANGE _IOW('X', 57, struct space_resv) + +/* + * Tests a series of blanket-permitted and denied IOCTLs. + */ +TEST_F_FORK(layout1, blanket_permitted_ioctls) +{ + const struct landlock_ruleset_attr attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_IOCTL_DEV, + }; + int ruleset_fd, fd; + + /* Enables Landlock. 
*/ + ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); - fd = memfd_create("name", MFD_CLOEXEC); + fd = open("/dev/null", O_RDWR | O_CLOEXEC); ASSERT_LE(0, fd); /* - * Checks that ftruncate is permitted on file descriptors that are - * created in ways other than open(2). + * Checks permitted commands. + * These ones may return errors, but should not be blocked by Landlock. */ - EXPECT_EQ(0, test_ftruncate(fd)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIOCLEX)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIONCLEX)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIONBIO)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIOASYNC)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIOQSIZE)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIFREEZE)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FITHAW)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FS_IOC_FIEMAP)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIGETBSZ)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FICLONE)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FICLONERANGE)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIDEDUPERANGE)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FS_IOC_GETFSUUID)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FS_IOC_GETFSSYSFSPATH)); + + /* + * Checks blocked commands. + * A call to a blocked IOCTL command always returns EACCES. 
+ */ + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FIONREAD)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_GETFLAGS)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_SETFLAGS)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_FSGETXATTR)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_FSSETXATTR)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FIBMAP)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_RESVSP)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_RESVSP64)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_UNRESVSP)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_UNRESVSP64)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_ZERO_RANGE)); + + /* Default case is also blocked. */ + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, 0xc00ffeee)); + + ASSERT_EQ(0, close(fd)); +} + +/* + * Named pipes are not governed by the LANDLOCK_ACCESS_FS_IOCTL_DEV right, + * because they are not character or block devices. + */ +TEST_F_FORK(layout1, named_pipe_ioctl) +{ + pid_t child_pid; + int fd, ruleset_fd; + const char *const path = file1_s1d1; + const struct landlock_ruleset_attr attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_IOCTL_DEV, + }; + + ASSERT_EQ(0, unlink(path)); + ASSERT_EQ(0, mkfifo(path, 0600)); + + /* Enables Landlock. */ + ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* The child process opens the pipe for writing. */ + child_pid = fork(); + ASSERT_NE(-1, child_pid); + if (child_pid == 0) { + fd = open(path, O_WRONLY); + close(fd); + exit(0); + } + + fd = open(path, O_RDONLY); + ASSERT_LE(0, fd); + + /* FIONREAD is implemented by pipefifo_fops. */ + EXPECT_EQ(0, test_fionread_ioctl(fd)); ASSERT_EQ(0, close(fd)); + ASSERT_EQ(0, unlink(path)); + + ASSERT_EQ(child_pid, waitpid(child_pid, NULL, 0)); +} + +/* For named UNIX domain sockets, no IOCTL restrictions apply. 
*/ +TEST_F_FORK(layout1, named_unix_domain_socket_ioctl) +{ + const char *const path = file1_s1d1; + int srv_fd, cli_fd, ruleset_fd; + socklen_t size; + struct sockaddr_un srv_un, cli_un; + const struct landlock_ruleset_attr attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_IOCTL_DEV, + }; + + /* Sets up a server */ + srv_un.sun_family = AF_UNIX; + strncpy(srv_un.sun_path, path, sizeof(srv_un.sun_path)); + + ASSERT_EQ(0, unlink(path)); + srv_fd = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_LE(0, srv_fd); + + size = offsetof(struct sockaddr_un, sun_path) + strlen(srv_un.sun_path); + ASSERT_EQ(0, bind(srv_fd, (struct sockaddr *)&srv_un, size)); + ASSERT_EQ(0, listen(srv_fd, 10 /* qlen */)); + + /* Enables Landlock. */ + ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Sets up a client connection to it */ + cli_un.sun_family = AF_UNIX; + cli_fd = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_LE(0, cli_fd); + + size = offsetof(struct sockaddr_un, sun_path) + strlen(cli_un.sun_path); + ASSERT_EQ(0, bind(cli_fd, (struct sockaddr *)&cli_un, size)); + + bzero(&cli_un, sizeof(cli_un)); + cli_un.sun_family = AF_UNIX; + strncpy(cli_un.sun_path, path, sizeof(cli_un.sun_path)); + size = offsetof(struct sockaddr_un, sun_path) + strlen(cli_un.sun_path); + + ASSERT_EQ(0, connect(cli_fd, (struct sockaddr *)&cli_un, size)); + + /* FIONREAD and other IOCTLs should not be forbidden. */ + EXPECT_EQ(0, test_fionread_ioctl(cli_fd)); + + ASSERT_EQ(0, close(cli_fd)); +} + +/* clang-format off */ +FIXTURE(ioctl) {}; + +FIXTURE_SETUP(ioctl) {}; + +FIXTURE_TEARDOWN(ioctl) {}; +/* clang-format on */ + +FIXTURE_VARIANT(ioctl) +{ + const __u64 handled; + const __u64 allowed; + const mode_t open_mode; + /* + * FIONREAD is used as a characteristic device-specific IOCTL command. + * It is implemented in fs/ioctl.c for regular files, + * but we do not blanket-permit it for devices. 
+ */ + const int expected_fionread_result; +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_i_allowed_none) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_IOCTL_DEV, + .allowed = 0, + .open_mode = O_RDWR, + .expected_fionread_result = EACCES, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_i_allowed_i) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_IOCTL_DEV, + .allowed = LANDLOCK_ACCESS_FS_IOCTL_DEV, + .open_mode = O_RDWR, + .expected_fionread_result = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, unhandled) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_EXECUTE, + .allowed = LANDLOCK_ACCESS_FS_EXECUTE, + .open_mode = O_RDWR, + .expected_fionread_result = 0, +}; + +TEST_F_FORK(ioctl, handle_dir_access_file) +{ + const int flag = 0; + const struct rule rules[] = { + { + .path = "/dev", + .access = variant->allowed, + }, + {}, + }; + int file_fd, ruleset_fd; + + /* Enables Landlock. */ + ruleset_fd = create_ruleset(_metadata, variant->handled, rules); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + file_fd = open("/dev/zero", variant->open_mode); + ASSERT_LE(0, file_fd); + + /* Checks that IOCTL commands return the expected errors. */ + EXPECT_EQ(variant->expected_fionread_result, + test_fionread_ioctl(file_fd)); + + /* Checks that unrestrictable commands are unrestricted. */ + EXPECT_EQ(0, ioctl(file_fd, FIOCLEX)); + EXPECT_EQ(0, ioctl(file_fd, FIONCLEX)); + EXPECT_EQ(0, ioctl(file_fd, FIONBIO, &flag)); + EXPECT_EQ(0, ioctl(file_fd, FIOASYNC, &flag)); + EXPECT_EQ(0, ioctl(file_fd, FIGETBSZ, &flag)); + + ASSERT_EQ(0, close(file_fd)); +} + +TEST_F_FORK(ioctl, handle_dir_access_dir) +{ + const int flag = 0; + const struct rule rules[] = { + { + .path = "/dev", + .access = variant->allowed, + }, + {}, + }; + int dir_fd, ruleset_fd; + + /* Enables Landlock. 
*/ + ruleset_fd = create_ruleset(_metadata, variant->handled, rules); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* + * Ignore variant->open_mode for this test, as we intend to open a + * directory. If the directory can not be opened, the variant is + * infeasible to test with an opened directory. + */ + dir_fd = open("/dev", O_RDONLY); + if (dir_fd < 0) + return; + + /* + * Checks that IOCTL commands return the expected errors. + * We do not use the expected values from the fixture here. + * + * When using IOCTL on a directory, no Landlock restrictions apply. + */ + EXPECT_EQ(0, test_fionread_ioctl(dir_fd)); + + /* Checks that unrestrictable commands are unrestricted. */ + EXPECT_EQ(0, ioctl(dir_fd, FIOCLEX)); + EXPECT_EQ(0, ioctl(dir_fd, FIONCLEX)); + EXPECT_EQ(0, ioctl(dir_fd, FIONBIO, &flag)); + EXPECT_EQ(0, ioctl(dir_fd, FIOASYNC, &flag)); + EXPECT_EQ(0, ioctl(dir_fd, FIGETBSZ, &flag)); + + ASSERT_EQ(0, close(dir_fd)); +} + +TEST_F_FORK(ioctl, handle_file_access_file) +{ + const int flag = 0; + const struct rule rules[] = { + { + .path = "/dev/zero", + .access = variant->allowed, + }, + {}, + }; + int file_fd, ruleset_fd; + + /* Enables Landlock. */ + ruleset_fd = create_ruleset(_metadata, variant->handled, rules); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + file_fd = open("/dev/zero", variant->open_mode); + ASSERT_LE(0, file_fd) + { + TH_LOG("Failed to open /dev/zero: %s", strerror(errno)); + } + + /* Checks that IOCTL commands return the expected errors. */ + EXPECT_EQ(variant->expected_fionread_result, + test_fionread_ioctl(file_fd)); + + /* Checks that unrestrictable commands are unrestricted. 
*/ + EXPECT_EQ(0, ioctl(file_fd, FIOCLEX)); + EXPECT_EQ(0, ioctl(file_fd, FIONCLEX)); + EXPECT_EQ(0, ioctl(file_fd, FIONBIO, &flag)); + EXPECT_EQ(0, ioctl(file_fd, FIOASYNC, &flag)); + EXPECT_EQ(0, ioctl(file_fd, FIGETBSZ, &flag)); + + ASSERT_EQ(0, close(file_fd)); } /* clang-format off */ diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index 8ae203d8ed..429535816d 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -52,10 +52,33 @@ endif selfdir = $(realpath $(dir $(filter %/lib.mk,$(MAKEFILE_LIST)))) top_srcdir = $(selfdir)/../../.. +# msg: emit succinct information message describing current building step +# $1 - generic step name (e.g., CC, LINK, etc); +# $2 - optional "flavor" specifier; if provided, will be emitted as [flavor]; +# $3 - target (assumed to be file); only file name will be emitted; +# $4 - optional extra arg, emitted as-is, if provided. +ifeq ($(V),1) +Q = +msg = +else +Q = @ +msg = @printf ' %-8s%s %s%s\n' "$(1)" "$(if $(2), [$(2)])" "$(notdir $(3))" "$(if $(4), $(4))"; +MAKEFLAGS += --no-print-directory +endif + ifeq ($(KHDR_INCLUDES),) KHDR_INCLUDES := -isystem $(top_srcdir)/usr/include endif +# In order to use newer items that haven't yet been added to the user's system +# header files, add $(TOOLS_INCLUDES) to the compiler invocation in each +# selftest. +# You may need to add files to that location, or to refresh an existing file. In +# order to do that, run "make headers" from $(top_srcdir), then copy the +# header file that you want from $(top_srcdir)/usr/include/... , to the matching +# subdir in $(TOOLS_INCLUDES). +TOOLS_INCLUDES := -isystem $(top_srcdir)/tools/include/uapi + # The following are built by lib.mk common compile rules. # TEST_CUSTOM_PROGS should be used by tests that require # custom build rule and prevent common build rule use. 
@@ -184,7 +207,8 @@ endif ifeq ($(OVERRIDE_TARGETS),) LOCAL_HDRS += $(selfdir)/kselftest_harness.h $(selfdir)/kselftest.h $(OUTPUT)/%:%.c $(LOCAL_HDRS) - $(LINK.c) $(filter-out $(LOCAL_HDRS),$^) $(LDLIBS) -o $@ + $(call msg,CC,,$@) + $(Q)$(LINK.c) $(filter-out $(LOCAL_HDRS),$^) $(LDLIBS) -o $@ $(OUTPUT)/%.o:%.S $(COMPILE.S) $^ -o $@ diff --git a/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c b/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c index a9cc17facf..4e14dba812 100644 --- a/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c +++ b/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c @@ -69,5 +69,5 @@ int main(int argc, char **argv) /* Multi-threaded */ test_mt_membarrier(); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/membarrier/membarrier_test_single_thread.c b/tools/testing/selftests/membarrier/membarrier_test_single_thread.c index 4cdc8b1d12..fa3f1d6c37 100644 --- a/tools/testing/selftests/membarrier/membarrier_test_single_thread.c +++ b/tools/testing/selftests/membarrier/membarrier_test_single_thread.c @@ -24,5 +24,5 @@ int main(int argc, char **argv) test_membarrier_get_registrations(/*cmd=*/0); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/memfd/fuse_test.c b/tools/testing/selftests/memfd/fuse_test.c index 93798c8c5d..dbc171a380 100644 --- a/tools/testing/selftests/memfd/fuse_test.c +++ b/tools/testing/selftests/memfd/fuse_test.c @@ -306,7 +306,7 @@ int main(int argc, char **argv) * then the kernel did a page-replacement or canceled the read() (or * whatever magic it did..). In that case, the memfd object is still * all zero. - * In case the memfd-object was *not* sealed, the read() was successfull + * In case the memfd-object was *not* sealed, the read() was successful * and the memfd object must *not* be all zero. 
* Note that in real scenarios, there might be a mixture of both, but * in this test-cases, we have explicit 200ms delays which should be diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 18f585684e..95af2d78fd 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -1528,7 +1528,7 @@ static void test_share_open(char *banner, char *b_suffix) /* * Test sharing via fork() - * Test whether seal-modifications work as expected with forked childs. + * Test whether seal-modifications work as expected with forked children. */ static void test_share_fork(char *banner, char *b_suffix) { diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index d26e962f2a..0b9ab98760 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -47,3 +47,5 @@ mkdirty va_high_addr_switch hugetlb_fault_after_madv hugetlb_madv_vs_map +mseal_test +seal_elf diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 410495e0a6..3b49bc3d0a 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -32,7 +32,7 @@ endif # LDLIBS. 
MAKEFLAGS += --no-builtin-rules -CFLAGS = -Wall -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES) +CFLAGS = -Wall -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES) $(TOOLS_INCLUDES) LDLIBS = -lrt -lpthread -lm TEST_GEN_FILES = cow @@ -59,6 +59,8 @@ TEST_GEN_FILES += mlock2-tests TEST_GEN_FILES += mrelease_test TEST_GEN_FILES += mremap_dontunmap TEST_GEN_FILES += mremap_test +TEST_GEN_FILES += mseal_test +TEST_GEN_FILES += seal_elf TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += pagemap_ioctl TEST_GEN_FILES += thuge-gen diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index fe078d6e18..32c6ccc2a6 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -199,7 +199,7 @@ static int child_vmsplice_memcmp_fn(char *mem, size_t size, typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes); static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect, - child_fn fn) + child_fn fn, bool xfail) { struct comm_pipes comm_pipes; char buf; @@ -247,33 +247,47 @@ static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect, else ret = -EINVAL; - ksft_test_result(!ret, "No leak from parent into child\n"); + if (!ret) { + ksft_test_result_pass("No leak from parent into child\n"); + } else if (xfail) { + /* + * With hugetlb, some vmsplice() tests are currently expected to + * fail because (a) harder to fix and (b) nobody really cares. + * Flag them as expected failure for now. 
+ */ + ksft_test_result_xfail("Leak from parent into child\n"); + } else { + ksft_test_result_fail("Leak from parent into child\n"); + } close_comm_pipes: close_comm_pipes(&comm_pipes); } -static void test_cow_in_parent(char *mem, size_t size) +static void test_cow_in_parent(char *mem, size_t size, bool is_hugetlb) { - do_test_cow_in_parent(mem, size, false, child_memcmp_fn); + do_test_cow_in_parent(mem, size, false, child_memcmp_fn, false); } -static void test_cow_in_parent_mprotect(char *mem, size_t size) +static void test_cow_in_parent_mprotect(char *mem, size_t size, bool is_hugetlb) { - do_test_cow_in_parent(mem, size, true, child_memcmp_fn); + do_test_cow_in_parent(mem, size, true, child_memcmp_fn, false); } -static void test_vmsplice_in_child(char *mem, size_t size) +static void test_vmsplice_in_child(char *mem, size_t size, bool is_hugetlb) { - do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn); + do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn, + is_hugetlb); } -static void test_vmsplice_in_child_mprotect(char *mem, size_t size) +static void test_vmsplice_in_child_mprotect(char *mem, size_t size, + bool is_hugetlb) { - do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn); + do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn, + is_hugetlb); } static void do_test_vmsplice_in_parent(char *mem, size_t size, - bool before_fork) + bool before_fork, bool xfail) { struct iovec iov = { .iov_base = mem, @@ -355,8 +369,18 @@ static void do_test_vmsplice_in_parent(char *mem, size_t size, } } - ksft_test_result(!memcmp(old, new, transferred), - "No leak from child into parent\n"); + if (!memcmp(old, new, transferred)) { + ksft_test_result_pass("No leak from child into parent\n"); + } else if (xfail) { + /* + * With hugetlb, some vmsplice() tests are currently expected to + * fail because (a) harder to fix and (b) nobody really cares. + * Flag them as expected failure for now. 
+ */ + ksft_test_result_xfail("Leak from child into parent\n"); + } else { + ksft_test_result_fail("Leak from child into parent\n"); + } close_pipe: close(fds[0]); close(fds[1]); @@ -367,14 +391,14 @@ free: free(new); } -static void test_vmsplice_before_fork(char *mem, size_t size) +static void test_vmsplice_before_fork(char *mem, size_t size, bool is_hugetlb) { - do_test_vmsplice_in_parent(mem, size, true); + do_test_vmsplice_in_parent(mem, size, true, is_hugetlb); } -static void test_vmsplice_after_fork(char *mem, size_t size) +static void test_vmsplice_after_fork(char *mem, size_t size, bool is_hugetlb) { - do_test_vmsplice_in_parent(mem, size, false); + do_test_vmsplice_in_parent(mem, size, false, is_hugetlb); } #ifdef LOCAL_CONFIG_HAVE_LIBURING @@ -529,12 +553,12 @@ close_comm_pipes: close_comm_pipes(&comm_pipes); } -static void test_iouring_ro(char *mem, size_t size) +static void test_iouring_ro(char *mem, size_t size, bool is_hugetlb) { do_test_iouring(mem, size, false); } -static void test_iouring_fork(char *mem, size_t size) +static void test_iouring_fork(char *mem, size_t size, bool is_hugetlb) { do_test_iouring(mem, size, true); } @@ -678,37 +702,41 @@ free_tmp: free(tmp); } -static void test_ro_pin_on_shared(char *mem, size_t size) +static void test_ro_pin_on_shared(char *mem, size_t size, bool is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false); } -static void test_ro_fast_pin_on_shared(char *mem, size_t size) +static void test_ro_fast_pin_on_shared(char *mem, size_t size, bool is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true); } -static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size) +static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size, + bool is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false); } -static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size) +static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size, + bool 
is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true); } -static void test_ro_pin_on_ro_exclusive(char *mem, size_t size) +static void test_ro_pin_on_ro_exclusive(char *mem, size_t size, + bool is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false); } -static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size) +static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size, + bool is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true); } -typedef void (*test_fn)(char *mem, size_t size); +typedef void (*test_fn)(char *mem, size_t size, bool hugetlb); static void do_run_with_base_page(test_fn fn, bool swapout) { @@ -740,7 +768,7 @@ static void do_run_with_base_page(test_fn fn, bool swapout) } } - fn(mem, pagesize); + fn(mem, pagesize, false); munmap: munmap(mem, pagesize); } @@ -904,7 +932,7 @@ static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize) break; } - fn(mem, size); + fn(mem, size, false); munmap: munmap(mmap_mem, mmap_size); if (mremap_mem != MAP_FAILED) @@ -997,7 +1025,7 @@ static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize) } munmap(dummy, hugetlbsize); - fn(mem, hugetlbsize); + fn(mem, hugetlbsize, true); munmap: munmap(mem, hugetlbsize); } @@ -1036,7 +1064,7 @@ static const struct test_case anon_test_cases[] = { */ { "vmsplice() + unmap in child", - test_vmsplice_in_child + test_vmsplice_in_child, }, /* * vmsplice() test, but do an additional mprotect(PROT_READ)+ @@ -1044,7 +1072,7 @@ static const struct test_case anon_test_cases[] = { */ { "vmsplice() + unmap in child with mprotect() optimization", - test_vmsplice_in_child_mprotect + test_vmsplice_in_child_mprotect, }, /* * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after @@ -1322,23 +1350,31 @@ close_comm_pipes: close_comm_pipes(&comm_pipes); } -static void test_anon_thp_collapse_unshared(char *mem, size_t size) +static void test_anon_thp_collapse_unshared(char 
*mem, size_t size, + bool is_hugetlb) { + assert(!is_hugetlb); do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED); } -static void test_anon_thp_collapse_fully_shared(char *mem, size_t size) +static void test_anon_thp_collapse_fully_shared(char *mem, size_t size, + bool is_hugetlb) { + assert(!is_hugetlb); do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED); } -static void test_anon_thp_collapse_lower_shared(char *mem, size_t size) +static void test_anon_thp_collapse_lower_shared(char *mem, size_t size, + bool is_hugetlb) { + assert(!is_hugetlb); do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED); } -static void test_anon_thp_collapse_upper_shared(char *mem, size_t size) +static void test_anon_thp_collapse_upper_shared(char *mem, size_t size, + bool is_hugetlb) { + assert(!is_hugetlb); do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED); } diff --git a/tools/testing/selftests/mm/gup_longterm.c b/tools/testing/selftests/mm/gup_longterm.c index d7eaca5bbe..9423ad439a 100644 --- a/tools/testing/selftests/mm/gup_longterm.c +++ b/tools/testing/selftests/mm/gup_longterm.c @@ -118,15 +118,22 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared) return; } - /* - * Fault in the page writable such that GUP-fast can eventually pin - * it immediately. - */ + /* Fault in the page such that GUP-fast can pin it directly. */ memset(mem, 0, size); switch (type) { case TEST_TYPE_RO: case TEST_TYPE_RO_FAST: + /* + * Cover more cases regarding unsharing decisions when + * long-term R/O pinning by mapping the page R/O. 
+ */ + ret = mprotect(mem, size, PROT_READ); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto munmap; + } + /* FALLTHROUGH */ case TEST_TYPE_RW: case TEST_TYPE_RW_FAST: { struct pin_longterm_test args; @@ -228,6 +235,7 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared) assert(false); } +munmap: munmap(mem, size); } diff --git a/tools/testing/selftests/mm/hugetlb_madv_vs_map.c b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c index d01e8d4901..8f122a0f08 100644 --- a/tools/testing/selftests/mm/hugetlb_madv_vs_map.c +++ b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c @@ -27,9 +27,9 @@ #include "vm_util.h" #include "../kselftest.h" -#define MMAP_SIZE (1 << 21) #define INLOOP_ITER 100 +size_t mmap_size; char *huge_ptr; /* Touch the memory while it is being madvised() */ @@ -44,7 +44,7 @@ void *touch(void *unused) void *madv(void *unused) { for (int i = 0; i < INLOOP_ITER; i++) - madvise(huge_ptr, MMAP_SIZE, MADV_DONTNEED); + madvise(huge_ptr, mmap_size, MADV_DONTNEED); return NULL; } @@ -59,7 +59,7 @@ void *map_extra(void *unused) void *ptr; for (int i = 0; i < INLOOP_ITER; i++) { - ptr = mmap(NULL, MMAP_SIZE, PROT_READ | PROT_WRITE, + ptr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); @@ -93,14 +93,16 @@ int main(void) free_hugepages); } + mmap_size = default_huge_page_size(); + while (max--) { - huge_ptr = mmap(NULL, MMAP_SIZE, PROT_READ | PROT_WRITE, + huge_ptr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); if ((unsigned long)huge_ptr == -1) { - ksft_exit_skip("Failed to allocated huge page\n"); - return KSFT_SKIP; + ksft_test_result_fail("Failed to allocate huge page\n"); + return KSFT_FAIL; } pthread_create(&thread1, NULL, madv, NULL); @@ -117,7 +119,7 @@ int main(void) } /* Unmap and restart */ - munmap(huge_ptr, MMAP_SIZE); + munmap(huge_ptr, mmap_size); } return KSFT_PASS; diff --git 
a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index 508287560c..b61803e36d 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -28,6 +28,15 @@ #define MiB (1024 * KiB) #define FORK_EXEC_CHILD_PRG_NAME "ksm_fork_exec_child" +#define MAP_MERGE_FAIL ((void *)-1) +#define MAP_MERGE_SKIP ((void *)-2) + +enum ksm_merge_mode { + KSM_MERGE_PRCTL, + KSM_MERGE_MADVISE, + KSM_MERGE_NONE, /* PRCTL already set */ +}; + static int mem_fd; static int ksm_fd; static int ksm_full_scans_fd; @@ -146,33 +155,34 @@ static int ksm_unmerge(void) return 0; } -static char *mmap_and_merge_range(char val, unsigned long size, int prot, - bool use_prctl) +static char *__mmap_and_merge_range(char val, unsigned long size, int prot, + enum ksm_merge_mode mode) { char *map; + char *err_map = MAP_MERGE_FAIL; int ret; /* Stabilize accounting by disabling KSM completely. */ if (ksm_unmerge()) { - ksft_test_result_fail("Disabling (unmerging) KSM failed\n"); - return MAP_FAILED; + ksft_print_msg("Disabling (unmerging) KSM failed\n"); + return err_map; } if (get_my_merging_pages() > 0) { - ksft_test_result_fail("Still pages merged\n"); - return MAP_FAILED; + ksft_print_msg("Still pages merged\n"); + return err_map; } map = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0); if (map == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - return MAP_FAILED; + ksft_print_msg("mmap() failed\n"); + return err_map; } /* Don't use THP. Ignore if THP are not around on a kernel. 
*/ if (madvise(map, size, MADV_NOHUGEPAGE) && errno != EINVAL) { - ksft_test_result_fail("MADV_NOHUGEPAGE failed\n"); + ksft_print_msg("MADV_NOHUGEPAGE failed\n"); goto unmap; } @@ -180,27 +190,36 @@ static char *mmap_and_merge_range(char val, unsigned long size, int prot, memset(map, val, size); if (mprotect(map, size, prot)) { - ksft_test_result_skip("mprotect() failed\n"); + ksft_print_msg("mprotect() failed\n"); + err_map = MAP_MERGE_SKIP; goto unmap; } - if (use_prctl) { + switch (mode) { + case KSM_MERGE_PRCTL: ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0); if (ret < 0 && errno == EINVAL) { - ksft_test_result_skip("PR_SET_MEMORY_MERGE not supported\n"); + ksft_print_msg("PR_SET_MEMORY_MERGE not supported\n"); + err_map = MAP_MERGE_SKIP; goto unmap; } else if (ret) { - ksft_test_result_fail("PR_SET_MEMORY_MERGE=1 failed\n"); + ksft_print_msg("PR_SET_MEMORY_MERGE=1 failed\n"); goto unmap; } - } else if (madvise(map, size, MADV_MERGEABLE)) { - ksft_test_result_fail("MADV_MERGEABLE failed\n"); - goto unmap; + break; + case KSM_MERGE_MADVISE: + if (madvise(map, size, MADV_MERGEABLE)) { + ksft_print_msg("MADV_MERGEABLE failed\n"); + goto unmap; + } + break; + case KSM_MERGE_NONE: + break; } /* Run KSM to trigger merging and wait. */ if (ksm_merge()) { - ksft_test_result_fail("Running KSM failed\n"); + ksft_print_msg("Running KSM failed\n"); goto unmap; } @@ -209,14 +228,31 @@ static char *mmap_and_merge_range(char val, unsigned long size, int prot, * accounted differently (depending on kernel support). 
*/ if (val && !get_my_merging_pages()) { - ksft_test_result_fail("No pages got merged\n"); + ksft_print_msg("No pages got merged\n"); goto unmap; } return map; unmap: munmap(map, size); - return MAP_FAILED; + return err_map; +} + +static char *mmap_and_merge_range(char val, unsigned long size, int prot, + enum ksm_merge_mode mode) +{ + char *map; + char *ret = MAP_FAILED; + + map = __mmap_and_merge_range(val, size, prot, mode); + if (map == MAP_MERGE_FAIL) + ksft_test_result_fail("Merging memory failed"); + else if (map == MAP_MERGE_SKIP) + ksft_test_result_skip("Merging memory skipped"); + else + ret = map; + + return ret; } static void test_unmerge(void) @@ -226,7 +262,7 @@ static void test_unmerge(void) ksft_print_msg("[RUN] %s\n", __func__); - map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, false); + map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE); if (map == MAP_FAILED) return; @@ -264,7 +300,7 @@ static void test_unmerge_zero_pages(void) } /* Let KSM deduplicate zero pages. 
*/ - map = mmap_and_merge_range(0x00, size, PROT_READ | PROT_WRITE, false); + map = mmap_and_merge_range(0x00, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE); if (map == MAP_FAILED) return; @@ -312,7 +348,7 @@ static void test_unmerge_discarded(void) ksft_print_msg("[RUN] %s\n", __func__); - map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, false); + map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE); if (map == MAP_FAILED) return; @@ -344,7 +380,7 @@ static void test_unmerge_uffd_wp(void) ksft_print_msg("[RUN] %s\n", __func__); - map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, false); + map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE); if (map == MAP_FAILED) return; @@ -439,6 +475,36 @@ static void test_prctl(void) ksft_test_result_pass("Setting/clearing PR_SET_MEMORY_MERGE works\n"); } +static int test_child_ksm(void) +{ + const unsigned int size = 2 * MiB; + char *map; + + /* Test if KSM is enabled for the process. */ + if (prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0) != 1) + return -1; + + /* Test if merge could really happen. */ + map = __mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_NONE); + if (map == MAP_MERGE_FAIL) + return -2; + else if (map == MAP_MERGE_SKIP) + return -3; + + munmap(map, size); + return 0; +} + +static void test_child_ksm_err(int status) +{ + if (status == -1) + ksft_test_result_fail("unexpected PR_GET_MEMORY_MERGE result in child\n"); + else if (status == -2) + ksft_test_result_fail("Merge in child failed\n"); + else if (status == -3) + ksft_test_result_skip("Merge in child skipped\n"); +} + /* Verify that prctl ksm flag is inherited. 
*/ static void test_prctl_fork(void) { @@ -458,7 +524,7 @@ static void test_prctl_fork(void) child_pid = fork(); if (!child_pid) { - exit(prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0)); + exit(test_child_ksm()); } else if (child_pid < 0) { ksft_test_result_fail("fork() failed\n"); return; @@ -467,8 +533,11 @@ static void test_prctl_fork(void) if (waitpid(child_pid, &status, 0) < 0) { ksft_test_result_fail("waitpid() failed\n"); return; - } else if (WEXITSTATUS(status) != 1) { - ksft_test_result_fail("unexpected PR_GET_MEMORY_MERGE result in child\n"); + } + + status = WEXITSTATUS(status); + if (status) { + test_child_ksm_err(status); return; } @@ -480,12 +549,6 @@ static void test_prctl_fork(void) ksft_test_result_pass("PR_SET_MEMORY_MERGE value is inherited\n"); } -static int ksm_fork_exec_child(void) -{ - /* Test if KSM is enabled for the process. */ - return prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0) == 1; -} - static void test_prctl_fork_exec(void) { int ret, status; @@ -518,7 +581,7 @@ static void test_prctl_fork_exec(void) if (WIFEXITED(status)) { status = WEXITSTATUS(status); if (status) { - ksft_test_result_fail("KSM not enabled\n"); + test_child_ksm_err(status); return; } } else { @@ -545,7 +608,7 @@ static void test_prctl_unmerge(void) ksft_print_msg("[RUN] %s\n", __func__); - map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, true); + map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_PRCTL); if (map == MAP_FAILED) return; @@ -568,7 +631,7 @@ static void test_prot_none(void) ksft_print_msg("[RUN] %s\n", __func__); - map = mmap_and_merge_range(0x11, size, PROT_NONE, false); + map = mmap_and_merge_range(0x11, size, PROT_NONE, KSM_MERGE_MADVISE); if (map == MAP_FAILED) goto unmap; @@ -593,13 +656,34 @@ unmap: munmap(map, size); } +static void init_global_file_handles(void) +{ + mem_fd = open("/proc/self/mem", O_RDWR); + if (mem_fd < 0) + ksft_exit_fail_msg("opening /proc/self/mem failed\n"); + ksm_fd = open("/sys/kernel/mm/ksm/run", 
O_RDWR); + if (ksm_fd < 0) + ksft_exit_skip("open(\"/sys/kernel/mm/ksm/run\") failed\n"); + ksm_full_scans_fd = open("/sys/kernel/mm/ksm/full_scans", O_RDONLY); + if (ksm_full_scans_fd < 0) + ksft_exit_skip("open(\"/sys/kernel/mm/ksm/full_scans\") failed\n"); + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd < 0) + ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n"); + proc_self_ksm_stat_fd = open("/proc/self/ksm_stat", O_RDONLY); + proc_self_ksm_merging_pages_fd = open("/proc/self/ksm_merging_pages", + O_RDONLY); + ksm_use_zero_pages_fd = open("/sys/kernel/mm/ksm/use_zero_pages", O_RDWR); +} + int main(int argc, char **argv) { unsigned int tests = 8; int err; if (argc > 1 && !strcmp(argv[1], FORK_EXEC_CHILD_PRG_NAME)) { - exit(ksm_fork_exec_child() == 1 ? 0 : 1); + init_global_file_handles(); + exit(test_child_ksm()); } #ifdef __NR_userfaultfd @@ -611,22 +695,7 @@ int main(int argc, char **argv) pagesize = getpagesize(); - mem_fd = open("/proc/self/mem", O_RDWR); - if (mem_fd < 0) - ksft_exit_fail_msg("opening /proc/self/mem failed\n"); - ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR); - if (ksm_fd < 0) - ksft_exit_skip("open(\"/sys/kernel/mm/ksm/run\") failed\n"); - ksm_full_scans_fd = open("/sys/kernel/mm/ksm/full_scans", O_RDONLY); - if (ksm_full_scans_fd < 0) - ksft_exit_skip("open(\"/sys/kernel/mm/ksm/full_scans\") failed\n"); - pagemap_fd = open("/proc/self/pagemap", O_RDONLY); - if (pagemap_fd < 0) - ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n"); - proc_self_ksm_stat_fd = open("/proc/self/ksm_stat", O_RDONLY); - proc_self_ksm_merging_pages_fd = open("/proc/self/ksm_merging_pages", - O_RDONLY); - ksm_use_zero_pages_fd = open("/sys/kernel/mm/ksm/use_zero_pages", O_RDWR); + init_global_file_handles(); test_unmerge(); test_unmerge_zero_pages(); diff --git a/tools/testing/selftests/mm/map_fixed_noreplace.c b/tools/testing/selftests/mm/map_fixed_noreplace.c index b74813fdc9..d53de24860 100644 --- 
a/tools/testing/selftests/mm/map_fixed_noreplace.c +++ b/tools/testing/selftests/mm/map_fixed_noreplace.c @@ -67,7 +67,8 @@ int main(void) dump_maps(); ksft_exit_fail_msg("Error: munmap failed!?\n"); } - ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_test_result_pass("mmap() 5*PAGE_SIZE at base\n"); addr = base_addr + page_size; size = 3 * page_size; @@ -76,7 +77,8 @@ int main(void) dump_maps(); ksft_exit_fail_msg("Error: first mmap() failed unexpectedly\n"); } - ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_test_result_pass("mmap() 3*PAGE_SIZE at base+PAGE_SIZE\n"); /* * Exact same mapping again: @@ -93,7 +95,8 @@ int main(void) dump_maps(); ksft_exit_fail_msg("Error:1: mmap() succeeded when it shouldn't have\n"); } - ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_test_result_pass("mmap() 5*PAGE_SIZE at base\n"); /* * Second mapping contained within first: @@ -111,7 +114,8 @@ int main(void) dump_maps(); ksft_exit_fail_msg("Error:2: mmap() succeeded when it shouldn't have\n"); } - ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_test_result_pass("mmap() 2*PAGE_SIZE at base+PAGE_SIZE\n"); /* * Overlap end of existing mapping: @@ -128,7 +132,8 @@ int main(void) dump_maps(); ksft_exit_fail_msg("Error:3: mmap() succeeded when it shouldn't have\n"); } - ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_test_result_pass("mmap() 2*PAGE_SIZE at 
base+(3*PAGE_SIZE)\n"); /* * Overlap start of existing mapping: @@ -145,7 +150,8 @@ int main(void) dump_maps(); ksft_exit_fail_msg("Error:4: mmap() succeeded when it shouldn't have\n"); } - ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_test_result_pass("mmap() 2*PAGE_SIZE bytes at base\n"); /* * Adjacent to start of existing mapping: @@ -162,7 +168,8 @@ int main(void) dump_maps(); ksft_exit_fail_msg("Error:5: mmap() failed when it shouldn't have\n"); } - ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_test_result_pass("mmap() PAGE_SIZE at base\n"); /* * Adjacent to end of existing mapping: @@ -179,7 +186,8 @@ int main(void) dump_maps(); ksft_exit_fail_msg("Error:6: mmap() failed when it shouldn't have\n"); } - ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_test_result_pass("mmap() PAGE_SIZE at base+(4*PAGE_SIZE)\n"); addr = base_addr; size = 5 * page_size; diff --git a/tools/testing/selftests/mm/memfd_secret.c b/tools/testing/selftests/mm/memfd_secret.c index 9b298f6a04..9a0597310a 100644 --- a/tools/testing/selftests/mm/memfd_secret.c +++ b/tools/testing/selftests/mm/memfd_secret.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "../kselftest.h" @@ -83,6 +84,45 @@ static void test_mlock_limit(int fd) pass("mlock limit is respected\n"); } +static void test_vmsplice(int fd, const char *desc) +{ + ssize_t transferred; + struct iovec iov; + int pipefd[2]; + char *mem; + + if (pipe(pipefd)) { + fail("pipe failed: %s\n", strerror(errno)); + return; + } + + mem = mmap(NULL, page_size, prot, mode, fd, 0); + if (mem == MAP_FAILED) { + fail("Unable to mmap secret memory\n"); + goto 
close_pipe; + } + + /* + * vmsplice() may use GUP-fast, which must also fail. Prefault the + * page table, so GUP-fast could find it. + */ + memset(mem, PATTERN, page_size); + + iov.iov_base = mem; + iov.iov_len = page_size; + transferred = vmsplice(pipefd[1], &iov, 1, 0); + + if (transferred < 0 && errno == EFAULT) + pass("vmsplice is blocked as expected with %s\n", desc); + else + fail("vmsplice: unexpected memory access with %s\n", desc); + + munmap(mem, page_size); +close_pipe: + close(pipefd[0]); + close(pipefd[1]); +} + static void try_process_vm_read(int fd, int pipefd[2]) { struct iovec liov, riov; @@ -187,7 +227,6 @@ static void test_remote_access(int fd, const char *name, return; } - ftruncate(fd, page_size); memset(mem, PATTERN, page_size); if (write(pipefd[1], &mem, sizeof(mem)) < 0) { @@ -258,7 +297,7 @@ static void prepare(void) strerror(errno)); } -#define NUM_TESTS 4 +#define NUM_TESTS 6 int main(int argc, char *argv[]) { @@ -277,9 +316,17 @@ int main(int argc, char *argv[]) ksft_exit_fail_msg("memfd_secret failed: %s\n", strerror(errno)); } + if (ftruncate(fd, page_size)) + ksft_exit_fail_msg("ftruncate failed: %s\n", strerror(errno)); test_mlock_limit(fd); test_file_apis(fd); + /* + * We have to run the first vmsplice test before any secretmem page was + * allocated for this fd. 
+ */ + test_vmsplice(fd, "fresh page"); + test_vmsplice(fd, "existing page"); test_process_vm_read(fd); test_ptrace(fd); diff --git a/tools/testing/selftests/mm/mlock2-tests.c b/tools/testing/selftests/mm/mlock2-tests.c index 26f744188a..7f0d50fa36 100644 --- a/tools/testing/selftests/mm/mlock2-tests.c +++ b/tools/testing/selftests/mm/mlock2-tests.c @@ -20,8 +20,6 @@ static int get_vm_area(unsigned long addr, struct vm_boundaries *area) FILE *file; int ret = 1; char line[1024] = {0}; - char *end_addr; - char *stop; unsigned long start; unsigned long end; @@ -37,21 +35,10 @@ static int get_vm_area(unsigned long addr, struct vm_boundaries *area) memset(area, 0, sizeof(struct vm_boundaries)); while(fgets(line, 1024, file)) { - end_addr = strchr(line, '-'); - if (!end_addr) { + if (sscanf(line, "%lx-%lx", &start, &end) != 2) { ksft_print_msg("cannot parse /proc/self/maps\n"); goto out; } - *end_addr = '\0'; - end_addr++; - stop = strchr(end_addr, ' '); - if (!stop) { - ksft_print_msg("cannot parse /proc/self/maps\n"); - goto out; - } - - sscanf(line, "%lx", &start); - sscanf(end_addr, "%lx", &end); if (start <= addr && end > addr) { area->start = start; diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index 2f8b991f78..1b03bcfaef 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -23,6 +23,7 @@ #define VALIDATION_NO_THRESHOLD 0 /* Verify the entire region */ #define MIN(X, Y) ((X) < (Y) ? (X) : (Y)) +#define MAX(X, Y) ((X) > (Y) ? 
(X) : (Y)) #define SIZE_MB(m) ((size_t)m * (1024 * 1024)) #define SIZE_KB(k) ((size_t)k * 1024) @@ -69,6 +70,27 @@ enum { .expect_failure = should_fail \ } +/* compute square root using binary search */ +static unsigned long get_sqrt(unsigned long val) +{ + unsigned long low = 1; + + /* assuming rand_size is less than 1TB */ + unsigned long high = (1UL << 20); + + while (low <= high) { + unsigned long mid = low + (high - low) / 2; + unsigned long temp = mid * mid; + + if (temp == val) + return mid; + if (temp < val) + low = mid + 1; + high = mid - 1; + } + return low; +} + /* * Returns false if the requested remap region overlaps with an * existing mapping (e.g text, stack) else returns true. @@ -126,19 +148,21 @@ static unsigned long long get_mmap_min_addr(void) * Using /proc/self/maps, assert that the specified address range is contained * within a single mapping. */ -static bool is_range_mapped(FILE *maps_fp, void *start, void *end) +static bool is_range_mapped(FILE *maps_fp, unsigned long start, + unsigned long end) { char *line = NULL; size_t len = 0; bool success = false; + unsigned long first_val, second_val; rewind(maps_fp); while (getline(&line, &len, maps_fp) != -1) { - char *first = strtok(line, "- "); - void *first_val = (void *)strtol(first, NULL, 16); - char *second = strtok(NULL, "- "); - void *second_val = (void *) strtol(second, NULL, 16); + if (sscanf(line, "%lx-%lx", &first_val, &second_val) != 2) { + ksft_exit_fail_msg("cannot parse /proc/self/maps\n"); + break; + } if (first_val <= start && second_val >= end) { success = true; @@ -233,7 +257,8 @@ static void mremap_expand_merge(FILE *maps_fp, unsigned long page_size) goto out; } - success = is_range_mapped(maps_fp, start, start + 3 * page_size); + success = is_range_mapped(maps_fp, (unsigned long)start, + (unsigned long)(start + 3 * page_size)); munmap(start, 3 * page_size); out: @@ -272,7 +297,8 @@ static void mremap_expand_merge_offset(FILE *maps_fp, unsigned long page_size) goto out; } - 
success = is_range_mapped(maps_fp, start, start + 3 * page_size); + success = is_range_mapped(maps_fp, (unsigned long)start, + (unsigned long)(start + 3 * page_size)); munmap(start, 3 * page_size); out: @@ -296,7 +322,7 @@ out: * * |DDDDddddSSSSssss| */ -static void mremap_move_within_range(char pattern_seed) +static void mremap_move_within_range(unsigned int pattern_seed, char *rand_addr) { char *test_name = "mremap mremap move within range"; void *src, *dest; @@ -316,10 +342,7 @@ static void mremap_move_within_range(char pattern_seed) src = (void *)((unsigned long)src & ~(SIZE_MB(2) - 1)); /* Set byte pattern for source block. */ - srand(pattern_seed); - for (i = 0; i < SIZE_MB(2); i++) { - ((char *)src)[i] = (char) rand(); - } + memcpy(src, rand_addr, SIZE_MB(2)); dest = src - SIZE_MB(2); @@ -357,14 +380,14 @@ out: /* Returns the time taken for the remap on success else returns -1. */ static long long remap_region(struct config c, unsigned int threshold_mb, - char pattern_seed) + char *rand_addr) { void *addr, *src_addr, *dest_addr, *dest_preamble_addr; - int d; - unsigned long long t; + unsigned long long t, d; struct timespec t_start = {0, 0}, t_end = {0, 0}; long long start_ns, end_ns, align_mask, ret, offset; unsigned long long threshold; + unsigned long num_chunks; if (threshold_mb == VALIDATION_NO_THRESHOLD) threshold = c.region_size; @@ -378,9 +401,7 @@ static long long remap_region(struct config c, unsigned int threshold_mb, } /* Set byte pattern for source block. */ - srand(pattern_seed); - for (t = 0; t < threshold; t++) - memset((char *) src_addr + t, (char) rand(), 1); + memcpy(src_addr, rand_addr, threshold); /* Mask to zero out lower bits of address for alignment */ align_mask = ~(c.dest_alignment - 1); @@ -420,9 +441,7 @@ static long long remap_region(struct config c, unsigned int threshold_mb, } /* Set byte pattern for the dest preamble block. 
*/ - srand(pattern_seed); - for (d = 0; d < c.dest_preamble_size; d++) - memset((char *) dest_preamble_addr + d, (char) rand(), 1); + memcpy(dest_preamble_addr, rand_addr, c.dest_preamble_size); } clock_gettime(CLOCK_MONOTONIC, &t_start); @@ -436,15 +455,42 @@ static long long remap_region(struct config c, unsigned int threshold_mb, goto clean_up_dest_preamble; } - /* Verify byte pattern after remapping */ - srand(pattern_seed); - for (t = 0; t < threshold; t++) { - char c = (char) rand(); + /* + * Verify byte pattern after remapping. Employ an algorithm with a + * square root time complexity in threshold: divide the range into + * chunks, if memcmp() returns non-zero, only then perform an + * iteration in that chunk to find the mismatch index. + */ + num_chunks = get_sqrt(threshold); + for (unsigned long i = 0; i < num_chunks; ++i) { + size_t chunk_size = threshold / num_chunks; + unsigned long shift = i * chunk_size; + + if (!memcmp(dest_addr + shift, rand_addr + shift, chunk_size)) + continue; + + /* brute force iteration only over mismatch segment */ + for (t = shift; t < shift + chunk_size; ++t) { + if (((char *) dest_addr)[t] != rand_addr[t]) { + ksft_print_msg("Data after remap doesn't match at offset %llu\n", + t); + ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[t] & 0xff, + ((char *) dest_addr)[t] & 0xff); + ret = -1; + goto clean_up_dest; + } + } + } - if (((char *) dest_addr)[t] != c) { + /* + * if threshold is not divisible by num_chunks, then check the + * last chunk + */ + for (t = num_chunks * (threshold / num_chunks); t < threshold; ++t) { + if (((char *) dest_addr)[t] != rand_addr[t]) { ksft_print_msg("Data after remap doesn't match at offset %llu\n", - t); - ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff, + t); + ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[t] & 0xff, ((char *) dest_addr)[t] & 0xff); ret = -1; goto clean_up_dest; @@ -452,22 +498,44 @@ static long long remap_region(struct config c, unsigned int 
threshold_mb, } /* Verify the dest preamble byte pattern after remapping */ - if (c.dest_preamble_size) { - srand(pattern_seed); - for (d = 0; d < c.dest_preamble_size; d++) { - char c = (char) rand(); - - if (((char *) dest_preamble_addr)[d] != c) { - ksft_print_msg("Preamble data after remap doesn't match at offset %d\n", - d); - ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff, - ((char *) dest_preamble_addr)[d] & 0xff); + if (!c.dest_preamble_size) + goto no_preamble; + + num_chunks = get_sqrt(c.dest_preamble_size); + + for (unsigned long i = 0; i < num_chunks; ++i) { + size_t chunk_size = c.dest_preamble_size / num_chunks; + unsigned long shift = i * chunk_size; + + if (!memcmp(dest_preamble_addr + shift, rand_addr + shift, + chunk_size)) + continue; + + /* brute force iteration only over mismatched segment */ + for (d = shift; d < shift + chunk_size; ++d) { + if (((char *) dest_preamble_addr)[d] != rand_addr[d]) { + ksft_print_msg("Preamble data after remap doesn't match at offset %llu\n", + d); + ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[d] & 0xff, + ((char *) dest_preamble_addr)[d] & 0xff); ret = -1; goto clean_up_dest; } } } + for (d = num_chunks * (c.dest_preamble_size / num_chunks); d < c.dest_preamble_size; ++d) { + if (((char *) dest_preamble_addr)[d] != rand_addr[d]) { + ksft_print_msg("Preamble data after remap doesn't match at offset %llu\n", + d); + ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[d] & 0xff, + ((char *) dest_preamble_addr)[d] & 0xff); + ret = -1; + goto clean_up_dest; + } + } + +no_preamble: start_ns = t_start.tv_sec * NS_PER_SEC + t_start.tv_nsec; end_ns = t_end.tv_sec * NS_PER_SEC + t_end.tv_nsec; ret = end_ns - start_ns; @@ -494,7 +562,8 @@ out: * the beginning of the mapping just because the aligned * down address landed on a mapping that maybe does not exist. 
*/ -static void mremap_move_1mb_from_start(char pattern_seed) +static void mremap_move_1mb_from_start(unsigned int pattern_seed, + char *rand_addr) { char *test_name = "mremap move 1mb from start at 1MB+256KB aligned src"; void *src = NULL, *dest = NULL; @@ -520,10 +589,7 @@ static void mremap_move_1mb_from_start(char pattern_seed) } /* Set byte pattern for source block. */ - srand(pattern_seed); - for (i = 0; i < SIZE_MB(2); i++) { - ((char *)src)[i] = (char) rand(); - } + memcpy(src, rand_addr, SIZE_MB(2)); /* * Unmap the beginning of dest so that the aligned address @@ -568,10 +634,10 @@ out: static void run_mremap_test_case(struct test test_case, int *failures, unsigned int threshold_mb, - unsigned int pattern_seed) + unsigned int pattern_seed, char *rand_addr) { long long remap_time = remap_region(test_case.config, threshold_mb, - pattern_seed); + rand_addr); if (remap_time < 0) { if (test_case.expect_failure) @@ -642,7 +708,15 @@ int main(int argc, char **argv) int failures = 0; int i, run_perf_tests; unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD; + + /* hard-coded test configs */ + size_t max_test_variable_region_size = _2GB; + size_t max_test_constant_region_size = _2MB; + size_t dest_preamble_size = 10 * _4MB; + unsigned int pattern_seed; + char *rand_addr; + size_t rand_size; int num_expand_tests = 2; int num_misc_tests = 2; struct test test_cases[MAX_TEST] = {}; @@ -659,6 +733,31 @@ int main(int argc, char **argv) ksft_print_msg("Test configs:\n\tthreshold_mb=%u\n\tpattern_seed=%u\n\n", threshold_mb, pattern_seed); + /* + * set preallocated random array according to test configs; see the + * functions for the logic of setting the size + */ + if (!threshold_mb) + rand_size = MAX(max_test_variable_region_size, + max_test_constant_region_size); + else + rand_size = MAX(MIN(threshold_mb * _1MB, + max_test_variable_region_size), + max_test_constant_region_size); + rand_size = MAX(dest_preamble_size, rand_size); + + rand_addr = (char *)mmap(NULL, 
rand_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (rand_addr == MAP_FAILED) { + perror("mmap"); + ksft_exit_fail_msg("cannot mmap rand_addr\n"); + } + + /* fill stream of random bytes */ + srand(pattern_seed); + for (unsigned long i = 0; i < rand_size; ++i) + rand_addr[i] = (char) rand(); + page_size = sysconf(_SC_PAGESIZE); /* Expected mremap failures */ @@ -730,13 +829,13 @@ int main(int argc, char **argv) for (i = 0; i < ARRAY_SIZE(test_cases); i++) run_mremap_test_case(test_cases[i], &failures, threshold_mb, - pattern_seed); + pattern_seed, rand_addr); maps_fp = fopen("/proc/self/maps", "r"); if (maps_fp == NULL) { - ksft_print_msg("Failed to read /proc/self/maps: %s\n", strerror(errno)); - exit(KSFT_FAIL); + munmap(rand_addr, rand_size); + ksft_exit_fail_msg("Failed to read /proc/self/maps: %s\n", strerror(errno)); } mremap_expand_merge(maps_fp, page_size); @@ -744,17 +843,20 @@ int main(int argc, char **argv) fclose(maps_fp); - mremap_move_within_range(pattern_seed); - mremap_move_1mb_from_start(pattern_seed); + mremap_move_within_range(pattern_seed, rand_addr); + mremap_move_1mb_from_start(pattern_seed, rand_addr); if (run_perf_tests) { ksft_print_msg("\n%s\n", "mremap HAVE_MOVE_PMD/PUD optimization time comparison for 1GB region:"); for (i = 0; i < ARRAY_SIZE(perf_test_cases); i++) run_mremap_test_case(perf_test_cases[i], &failures, - threshold_mb, pattern_seed); + threshold_mb, pattern_seed, + rand_addr); } + munmap(rand_addr, rand_size); + if (failures > 0) ksft_exit_fail(); else diff --git a/tools/testing/selftests/mm/mseal_test.c b/tools/testing/selftests/mm/mseal_test.c new file mode 100644 index 0000000000..41998cf1dc --- /dev/null +++ b/tools/testing/selftests/mm/mseal_test.c @@ -0,0 +1,1894 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include "../kselftest.h" +#include +#include +#include +#include +#include +#include 
+#include +#include + +/* + * need those definition for manually build using gcc. + * gcc -I ../../../../usr/include -DDEBUG -O3 -DDEBUG -O3 mseal_test.c -o mseal_test + */ +#ifndef PKEY_DISABLE_ACCESS +# define PKEY_DISABLE_ACCESS 0x1 +#endif + +#ifndef PKEY_DISABLE_WRITE +# define PKEY_DISABLE_WRITE 0x2 +#endif + +#ifndef PKEY_BITS_PER_PKEY +#define PKEY_BITS_PER_PKEY 2 +#endif + +#ifndef PKEY_MASK +#define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) +#endif + +#define FAIL_TEST_IF_FALSE(c) do {\ + if (!(c)) {\ + ksft_test_result_fail("%s, line:%d\n", __func__, __LINE__);\ + goto test_end;\ + } \ + } \ + while (0) + +#define SKIP_TEST_IF_FALSE(c) do {\ + if (!(c)) {\ + ksft_test_result_skip("%s, line:%d\n", __func__, __LINE__);\ + goto test_end;\ + } \ + } \ + while (0) + + +#define TEST_END_CHECK() {\ + ksft_test_result_pass("%s\n", __func__);\ + return;\ +test_end:\ + return;\ +} + +#ifndef u64 +#define u64 unsigned long long +#endif + +static unsigned long get_vma_size(void *addr, int *prot) +{ + FILE *maps; + char line[256]; + int size = 0; + uintptr_t addr_start, addr_end; + char protstr[5]; + *prot = 0; + + maps = fopen("/proc/self/maps", "r"); + if (!maps) + return 0; + + while (fgets(line, sizeof(line), maps)) { + if (sscanf(line, "%lx-%lx %4s", &addr_start, &addr_end, protstr) == 3) { + if (addr_start == (uintptr_t) addr) { + size = addr_end - addr_start; + if (protstr[0] == 'r') + *prot |= 0x4; + if (protstr[1] == 'w') + *prot |= 0x2; + if (protstr[2] == 'x') + *prot |= 0x1; + break; + } + } + } + fclose(maps); + return size; +} + +/* + * define sys_xyx to call syscall directly. 
+ */ +static int sys_mseal(void *start, size_t len) +{ + int sret; + + errno = 0; + sret = syscall(__NR_mseal, start, len, 0); + return sret; +} + +static int sys_mprotect(void *ptr, size_t size, unsigned long prot) +{ + int sret; + + errno = 0; + sret = syscall(__NR_mprotect, ptr, size, prot); + return sret; +} + +static int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, + unsigned long pkey) +{ + int sret; + + errno = 0; + sret = syscall(__NR_pkey_mprotect, ptr, size, orig_prot, pkey); + return sret; +} + +static void *sys_mmap(void *addr, unsigned long len, unsigned long prot, + unsigned long flags, unsigned long fd, unsigned long offset) +{ + void *sret; + + errno = 0; + sret = (void *) syscall(__NR_mmap, addr, len, prot, + flags, fd, offset); + return sret; +} + +static int sys_munmap(void *ptr, size_t size) +{ + int sret; + + errno = 0; + sret = syscall(__NR_munmap, ptr, size); + return sret; +} + +static int sys_madvise(void *start, size_t len, int types) +{ + int sret; + + errno = 0; + sret = syscall(__NR_madvise, start, len, types); + return sret; +} + +static int sys_pkey_alloc(unsigned long flags, unsigned long init_val) +{ + int ret = syscall(__NR_pkey_alloc, flags, init_val); + + return ret; +} + +static unsigned int __read_pkey_reg(void) +{ + unsigned int pkey_reg = 0; +#if defined(__i386__) || defined(__x86_64__) /* arch */ + unsigned int eax, edx; + unsigned int ecx = 0; + + asm volatile(".byte 0x0f,0x01,0xee\n\t" + : "=a" (eax), "=d" (edx) + : "c" (ecx)); + pkey_reg = eax; +#endif + return pkey_reg; +} + +static void __write_pkey_reg(u64 pkey_reg) +{ +#if defined(__i386__) || defined(__x86_64__) /* arch */ + unsigned int eax = pkey_reg; + unsigned int ecx = 0; + unsigned int edx = 0; + + asm volatile(".byte 0x0f,0x01,0xef\n\t" + : : "a" (eax), "c" (ecx), "d" (edx)); +#endif +} + +static unsigned long pkey_bit_position(int pkey) +{ + return pkey * PKEY_BITS_PER_PKEY; +} + +static u64 set_pkey_bits(u64 reg, int pkey, u64 flags) +{ 
+ unsigned long shift = pkey_bit_position(pkey); + + /* mask out bits from pkey in old value */ + reg &= ~((u64)PKEY_MASK << shift); + /* OR in new bits for pkey */ + reg |= (flags & PKEY_MASK) << shift; + return reg; +} + +static void set_pkey(int pkey, unsigned long pkey_value) +{ + u64 new_pkey_reg; + + new_pkey_reg = set_pkey_bits(__read_pkey_reg(), pkey, pkey_value); + __write_pkey_reg(new_pkey_reg); +} + +static void setup_single_address(int size, void **ptrOut) +{ + void *ptr; + + ptr = sys_mmap(NULL, size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + *ptrOut = ptr; +} + +static void setup_single_address_rw(int size, void **ptrOut) +{ + void *ptr; + unsigned long mapflags = MAP_ANONYMOUS | MAP_PRIVATE; + + ptr = sys_mmap(NULL, size, PROT_READ | PROT_WRITE, mapflags, -1, 0); + *ptrOut = ptr; +} + +static int clean_single_address(void *ptr, int size) +{ + int ret; + ret = munmap(ptr, size); + return ret; +} + +static int seal_single_address(void *ptr, int size) +{ + int ret; + ret = sys_mseal(ptr, size); + return ret; +} + +bool seal_support(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + + ptr = sys_mmap(NULL, page_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (ptr == (void *) -1) + return false; + + ret = sys_mseal(ptr, page_size); + if (ret < 0) + return false; + + return true; +} + +bool pkey_supported(void) +{ +#if defined(__i386__) || defined(__x86_64__) /* arch */ + int pkey = sys_pkey_alloc(0, 0); + + if (pkey > 0) + return true; +#endif + return false; +} + +static void test_seal_addseal(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_unmapped_start(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * 
page_size; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* munmap 2 pages from ptr. */ + ret = sys_munmap(ptr, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + /* mprotect will fail because 2 pages from ptr are unmapped. */ + ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(ret < 0); + + /* mseal will fail because 2 pages from ptr are unmapped. */ + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(ret < 0); + + ret = sys_mseal(ptr + 2 * page_size, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_unmapped_middle(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* munmap 2 pages from ptr + page. */ + ret = sys_munmap(ptr + page_size, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + /* mprotect will fail, since middle 2 pages are unmapped. */ + ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(ret < 0); + + /* mseal will fail as well. */ + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(ret < 0); + + /* we still can add seal to the first page and last page*/ + ret = sys_mseal(ptr, page_size); + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_mseal(ptr + 3 * page_size, page_size); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_unmapped_end(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* unmap last 2 pages. */ + ret = sys_munmap(ptr + 2 * page_size, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + /* mprotect will fail since last 2 pages are unmapped. */ + ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(ret < 0); + + /* mseal will fail as well. 
*/ + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(ret < 0); + + /* The first 2 pages is not sealed, and can add seals */ + ret = sys_mseal(ptr, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_multiple_vmas(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* use mprotect to split the vma into 3. */ + ret = sys_mprotect(ptr + page_size, 2 * page_size, + PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* mprotect will get applied to all 4 pages - 3 VMAs. */ + ret = sys_mprotect(ptr, size, PROT_READ); + FAIL_TEST_IF_FALSE(!ret); + + /* use mprotect to split the vma into 3. */ + ret = sys_mprotect(ptr + page_size, 2 * page_size, + PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* mseal get applied to all 4 pages - 3 VMAs. */ + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_split_start(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* use mprotect to split at middle */ + ret = sys_mprotect(ptr, 2 * page_size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* seal the first page, this will split the VMA */ + ret = sys_mseal(ptr, page_size); + FAIL_TEST_IF_FALSE(!ret); + + /* add seal to the remain 3 pages */ + ret = sys_mseal(ptr + page_size, 3 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_split_end(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* use mprotect to split at middle */ + ret = sys_mprotect(ptr, 2 * page_size, PROT_READ | PROT_WRITE); + 
FAIL_TEST_IF_FALSE(!ret); + + /* seal the last page */ + ret = sys_mseal(ptr + 3 * page_size, page_size); + FAIL_TEST_IF_FALSE(!ret); + + /* Adding seals to the first 3 pages */ + ret = sys_mseal(ptr, 3 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_invalid_input(void) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(8 * page_size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + ret = clean_single_address(ptr + 4 * page_size, 4 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + /* invalid flag */ + ret = syscall(__NR_mseal, ptr, size, 0x20); + FAIL_TEST_IF_FALSE(ret < 0); + + /* unaligned address */ + ret = sys_mseal(ptr + 1, 2 * page_size); + FAIL_TEST_IF_FALSE(ret < 0); + + /* length too big */ + ret = sys_mseal(ptr, 5 * page_size); + FAIL_TEST_IF_FALSE(ret < 0); + + /* length overflow */ + ret = sys_mseal(ptr, UINT64_MAX/page_size); + FAIL_TEST_IF_FALSE(ret < 0); + + /* start is not in a valid VMA */ + ret = sys_mseal(ptr - page_size, 5 * page_size); + FAIL_TEST_IF_FALSE(ret < 0); + + TEST_END_CHECK(); +} + +static void test_seal_zero_length(void) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + ret = sys_mprotect(ptr, 0, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* seal 0 length will be OK, same as mprotect */ + ret = sys_mseal(ptr, 0); + FAIL_TEST_IF_FALSE(!ret); + + /* verify the 4 pages are not sealed by previous call. */ + ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_zero_address(void) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + int prot; + + /* use mmap to change protection. 
*/ + ptr = sys_mmap(0, size, PROT_NONE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + FAIL_TEST_IF_FALSE(ptr == 0); + + size = get_vma_size(ptr, &prot); + FAIL_TEST_IF_FALSE(size == 4 * page_size); + + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + + /* verify the 4 pages are sealed by previous call. */ + ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(ret); + + TEST_END_CHECK(); +} + +static void test_seal_twice(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + + /* apply the same seal will be OK. idempotent. */ + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_mprotect(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = seal_single_address(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_start_mprotect(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = seal_single_address(ptr, page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* the first page is sealed. */ + ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + /* pages after the first page is not sealed. 
*/ + ret = sys_mprotect(ptr + page_size, page_size * 3, + PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_end_mprotect(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = seal_single_address(ptr + page_size, 3 * page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* first page is not sealed */ + ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* last 3 page are sealed */ + ret = sys_mprotect(ptr + page_size, page_size * 3, + PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_mprotect_unalign_len(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = seal_single_address(ptr, page_size * 2 - 1); + FAIL_TEST_IF_FALSE(!ret); + } + + /* 2 pages are sealed. */ + ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_mprotect(ptr + page_size * 2, page_size, + PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_mprotect_unalign_len_variant_2(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + if (seal) { + ret = seal_single_address(ptr, page_size * 2 + 1); + FAIL_TEST_IF_FALSE(!ret); + } + + /* 3 pages are sealed. 
*/ + ret = sys_mprotect(ptr, page_size * 3, PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_mprotect(ptr + page_size * 3, page_size, + PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_mprotect_two_vma(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* use mprotect to split */ + ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + if (seal) { + ret = seal_single_address(ptr, page_size * 4); + FAIL_TEST_IF_FALSE(!ret); + } + + ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_mprotect(ptr + page_size * 2, page_size * 2, + PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_mprotect_two_vma_with_split(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* use mprotect to split as two vma. */ + ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* mseal can apply across 2 vma, also split them. */ + if (seal) { + ret = seal_single_address(ptr + page_size, page_size * 2); + FAIL_TEST_IF_FALSE(!ret); + } + + /* the first page is not sealed. */ + ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* the second page is sealed. */ + ret = sys_mprotect(ptr + page_size, page_size, PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + /* the third page is sealed. 
*/ + ret = sys_mprotect(ptr + 2 * page_size, page_size, + PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + /* the fouth page is not sealed. */ + ret = sys_mprotect(ptr + 3 * page_size, page_size, + PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_mprotect_partial_mprotect(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* seal one page. */ + if (seal) { + ret = seal_single_address(ptr, page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* mprotect first 2 page will fail, since the first page are sealed. */ + ret = sys_mprotect(ptr, 2 * page_size, PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_mprotect_two_vma_with_gap(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* use mprotect to split. */ + ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* use mprotect to split. */ + ret = sys_mprotect(ptr + 3 * page_size, page_size, + PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* use munmap to free two pages in the middle */ + ret = sys_munmap(ptr + page_size, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + /* mprotect will fail, because there is a gap in the address. */ + /* notes, internally mprotect still updated the first page. */ + ret = sys_mprotect(ptr, 4 * page_size, PROT_READ); + FAIL_TEST_IF_FALSE(ret < 0); + + /* mseal will fail as well. */ + ret = sys_mseal(ptr, 4 * page_size); + FAIL_TEST_IF_FALSE(ret < 0); + + /* the first page is not sealed. 
*/ + ret = sys_mprotect(ptr, page_size, PROT_READ); + FAIL_TEST_IF_FALSE(ret == 0); + + /* the last page is not sealed. */ + ret = sys_mprotect(ptr + 3 * page_size, page_size, PROT_READ); + FAIL_TEST_IF_FALSE(ret == 0); + + TEST_END_CHECK(); +} + +static void test_seal_mprotect_split(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* use mprotect to split. */ + ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* seal all 4 pages. */ + if (seal) { + ret = sys_mseal(ptr, 4 * page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* mprotect is sealed. */ + ret = sys_mprotect(ptr, 2 * page_size, PROT_READ); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + + ret = sys_mprotect(ptr + 2 * page_size, 2 * page_size, PROT_READ); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_mprotect_merge(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* use mprotect to split one page. */ + ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* seal first two pages. */ + if (seal) { + ret = sys_mseal(ptr, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* 2 pages are sealed. */ + ret = sys_mprotect(ptr, 2 * page_size, PROT_READ); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + /* last 2 pages are not sealed. 
*/ + ret = sys_mprotect(ptr + 2 * page_size, 2 * page_size, PROT_READ); + FAIL_TEST_IF_FALSE(ret == 0); + + TEST_END_CHECK(); +} + +static void test_seal_munmap(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* 4 pages are sealed. */ + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +/* + * allocate 4 pages, + * use mprotect to split it as two VMAs + * seal the whole range + * munmap will fail on both + */ +static void test_seal_munmap_two_vma(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* use mprotect to split */ + ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + ret = sys_munmap(ptr, page_size * 2); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_munmap(ptr + page_size, page_size * 2); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +/* + * allocate a VMA with 4 pages. + * munmap the middle 2 pages. + * seal the whole 4 pages, will fail. + * munmap the first page will be OK. + * munmap the last page will be OK. 
+ */ +static void test_seal_munmap_vma_with_gap(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + ret = sys_munmap(ptr + page_size, page_size * 2); + FAIL_TEST_IF_FALSE(!ret); + + if (seal) { + /* can't have gap in the middle. */ + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(ret < 0); + } + + ret = sys_munmap(ptr, page_size); + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_munmap(ptr + page_size * 2, page_size); + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_munmap(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_munmap_start_freed(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + int prot; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* unmap the first page. */ + ret = sys_munmap(ptr, page_size); + FAIL_TEST_IF_FALSE(!ret); + + /* seal the last 3 pages. */ + if (seal) { + ret = sys_mseal(ptr + page_size, 3 * page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* unmap from the first page. */ + ret = sys_munmap(ptr, size); + if (seal) { + FAIL_TEST_IF_FALSE(ret < 0); + + size = get_vma_size(ptr + page_size, &prot); + FAIL_TEST_IF_FALSE(size == page_size * 3); + } else { + /* note: this will be OK, even the first page is */ + /* already unmapped. */ + FAIL_TEST_IF_FALSE(!ret); + + size = get_vma_size(ptr + page_size, &prot); + FAIL_TEST_IF_FALSE(size == 0); + } + + TEST_END_CHECK(); +} + +static void test_munmap_end_freed(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* unmap last page. */ + ret = sys_munmap(ptr + page_size * 3, page_size); + FAIL_TEST_IF_FALSE(!ret); + + /* seal the first 3 pages. 
*/ + if (seal) { + ret = sys_mseal(ptr, 3 * page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* unmap all pages. */ + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_munmap_middle_freed(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + int prot; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* unmap 2 pages in the middle. */ + ret = sys_munmap(ptr + page_size, page_size * 2); + FAIL_TEST_IF_FALSE(!ret); + + /* seal the first page. */ + if (seal) { + ret = sys_mseal(ptr, page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* munmap all 4 pages. */ + ret = sys_munmap(ptr, size); + if (seal) { + FAIL_TEST_IF_FALSE(ret < 0); + + size = get_vma_size(ptr, &prot); + FAIL_TEST_IF_FALSE(size == page_size); + + size = get_vma_size(ptr + page_size * 3, &prot); + FAIL_TEST_IF_FALSE(size == page_size); + } else { + FAIL_TEST_IF_FALSE(!ret); + + size = get_vma_size(ptr, &prot); + FAIL_TEST_IF_FALSE(size == 0); + + size = get_vma_size(ptr + page_size * 3, &prot); + FAIL_TEST_IF_FALSE(size == 0); + } + + TEST_END_CHECK(); +} + +static void test_seal_mremap_shrink(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* shrink from 4 pages to 2 pages. 
*/ + ret2 = mremap(ptr, size, 2 * page_size, 0, 0); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else { + FAIL_TEST_IF_FALSE(ret2 != MAP_FAILED); + + } + + TEST_END_CHECK(); +} + +static void test_seal_mremap_expand(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + /* ummap last 2 pages. */ + ret = sys_munmap(ptr + 2 * page_size, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + if (seal) { + ret = sys_mseal(ptr, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* expand from 2 page to 4 pages. */ + ret2 = mremap(ptr, 2 * page_size, 4 * page_size, 0, 0); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else { + FAIL_TEST_IF_FALSE(ret2 == ptr); + + } + + TEST_END_CHECK(); +} + +static void test_seal_mremap_move(bool seal) +{ + void *ptr, *newPtr; + unsigned long page_size = getpagesize(); + unsigned long size = page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + setup_single_address(size, &newPtr); + FAIL_TEST_IF_FALSE(newPtr != (void *)-1); + ret = clean_single_address(newPtr, size); + FAIL_TEST_IF_FALSE(!ret); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* move from ptr to fixed address. 
*/ + ret2 = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, newPtr); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else { + FAIL_TEST_IF_FALSE(ret2 != MAP_FAILED); + + } + + TEST_END_CHECK(); +} + +static void test_seal_mmap_overwrite_prot(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* use mmap to change protection. */ + ret2 = sys_mmap(ptr, size, PROT_NONE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else + FAIL_TEST_IF_FALSE(ret2 == ptr); + + TEST_END_CHECK(); +} + +static void test_seal_mmap_expand(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 12 * page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + /* ummap last 4 pages. */ + ret = sys_munmap(ptr + 8 * page_size, 4 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + if (seal) { + ret = sys_mseal(ptr, 8 * page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* use mmap to expand. */ + ret2 = sys_mmap(ptr, size, PROT_READ, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else + FAIL_TEST_IF_FALSE(ret2 == ptr); + + TEST_END_CHECK(); +} + +static void test_seal_mmap_shrink(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 12 * page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* use mmap to shrink. 
*/ + ret2 = sys_mmap(ptr, 8 * page_size, PROT_READ, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else + FAIL_TEST_IF_FALSE(ret2 == ptr); + + TEST_END_CHECK(); +} + +static void test_seal_mremap_shrink_fixed(bool seal) +{ + void *ptr; + void *newAddr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + setup_single_address(size, &newAddr); + FAIL_TEST_IF_FALSE(newAddr != (void *)-1); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* mremap to move and shrink to fixed address */ + ret2 = mremap(ptr, size, 2 * page_size, MREMAP_MAYMOVE | MREMAP_FIXED, + newAddr); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else + FAIL_TEST_IF_FALSE(ret2 == newAddr); + + TEST_END_CHECK(); +} + +static void test_seal_mremap_expand_fixed(bool seal) +{ + void *ptr; + void *newAddr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + void *ret2; + + setup_single_address(page_size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + setup_single_address(size, &newAddr); + FAIL_TEST_IF_FALSE(newAddr != (void *)-1); + + if (seal) { + ret = sys_mseal(newAddr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* mremap to move and expand to fixed address */ + ret2 = mremap(ptr, page_size, size, MREMAP_MAYMOVE | MREMAP_FIXED, + newAddr); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else + FAIL_TEST_IF_FALSE(ret2 == newAddr); + + TEST_END_CHECK(); +} + +static void test_seal_mremap_move_fixed(bool seal) +{ + void *ptr; + void *newAddr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + 
FAIL_TEST_IF_FALSE(ptr != (void *)-1); + setup_single_address(size, &newAddr); + FAIL_TEST_IF_FALSE(newAddr != (void *)-1); + + if (seal) { + ret = sys_mseal(newAddr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* mremap to move to fixed address */ + ret2 = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, newAddr); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else + FAIL_TEST_IF_FALSE(ret2 == newAddr); + + TEST_END_CHECK(); +} + +static void test_seal_mremap_move_fixed_zero(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* + * MREMAP_FIXED can move the mapping to zero address + */ + ret2 = mremap(ptr, size, 2 * page_size, MREMAP_MAYMOVE | MREMAP_FIXED, + 0); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else { + FAIL_TEST_IF_FALSE(ret2 == 0); + + } + + TEST_END_CHECK(); +} + +static void test_seal_mremap_move_dontunmap(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* mremap to move, and don't unmap src addr. 
*/ + ret2 = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, 0); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else { + FAIL_TEST_IF_FALSE(ret2 != MAP_FAILED); + + } + + TEST_END_CHECK(); +} + +static void test_seal_mremap_move_dontunmap_anyaddr(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* + * The 0xdeaddead should not have effect on dest addr + * when MREMAP_DONTUNMAP is set. + */ + ret2 = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, + 0xdeaddead); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else { + FAIL_TEST_IF_FALSE(ret2 != MAP_FAILED); + FAIL_TEST_IF_FALSE((long)ret2 != 0xdeaddead); + + } + + TEST_END_CHECK(); +} + + +static void test_seal_merge_and_split(void) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size; + int ret; + int prot; + + /* (24 RO) */ + setup_single_address(24 * page_size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* use mprotect(NONE) to set out boundary */ + /* (1 NONE) (22 RO) (1 NONE) */ + ret = sys_mprotect(ptr, page_size, PROT_NONE); + FAIL_TEST_IF_FALSE(!ret); + ret = sys_mprotect(ptr + 23 * page_size, page_size, PROT_NONE); + FAIL_TEST_IF_FALSE(!ret); + size = get_vma_size(ptr + page_size, &prot); + FAIL_TEST_IF_FALSE(size == 22 * page_size); + FAIL_TEST_IF_FALSE(prot == 4); + + /* use mseal to split from beginning */ + /* (1 NONE) (1 RO_SEAL) (21 RO) (1 NONE) */ + ret = sys_mseal(ptr + page_size, page_size); + FAIL_TEST_IF_FALSE(!ret); + size = get_vma_size(ptr + page_size, &prot); + FAIL_TEST_IF_FALSE(size == page_size); + FAIL_TEST_IF_FALSE(prot == 0x4); + size = get_vma_size(ptr + 2 * page_size, &prot); + 
FAIL_TEST_IF_FALSE(size == 21 * page_size); + FAIL_TEST_IF_FALSE(prot == 0x4); + + /* use mseal to split from the end. */ + /* (1 NONE) (1 RO_SEAL) (20 RO) (1 RO_SEAL) (1 NONE) */ + ret = sys_mseal(ptr + 22 * page_size, page_size); + FAIL_TEST_IF_FALSE(!ret); + size = get_vma_size(ptr + 22 * page_size, &prot); + FAIL_TEST_IF_FALSE(size == page_size); + FAIL_TEST_IF_FALSE(prot == 0x4); + size = get_vma_size(ptr + 2 * page_size, &prot); + FAIL_TEST_IF_FALSE(size == 20 * page_size); + FAIL_TEST_IF_FALSE(prot == 0x4); + + /* merge with prev. */ + /* (1 NONE) (2 RO_SEAL) (19 RO) (1 RO_SEAL) (1 NONE) */ + ret = sys_mseal(ptr + 2 * page_size, page_size); + FAIL_TEST_IF_FALSE(!ret); + size = get_vma_size(ptr + page_size, &prot); + FAIL_TEST_IF_FALSE(size == 2 * page_size); + FAIL_TEST_IF_FALSE(prot == 0x4); + + /* merge with after. */ + /* (1 NONE) (2 RO_SEAL) (18 RO) (2 RO_SEALS) (1 NONE) */ + ret = sys_mseal(ptr + 21 * page_size, page_size); + FAIL_TEST_IF_FALSE(!ret); + size = get_vma_size(ptr + 21 * page_size, &prot); + FAIL_TEST_IF_FALSE(size == 2 * page_size); + FAIL_TEST_IF_FALSE(prot == 0x4); + + /* split and merge from prev */ + /* (1 NONE) (3 RO_SEAL) (17 RO) (2 RO_SEALS) (1 NONE) */ + ret = sys_mseal(ptr + 2 * page_size, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + size = get_vma_size(ptr + 1 * page_size, &prot); + FAIL_TEST_IF_FALSE(size == 3 * page_size); + FAIL_TEST_IF_FALSE(prot == 0x4); + ret = sys_munmap(ptr + page_size, page_size); + FAIL_TEST_IF_FALSE(ret < 0); + ret = sys_mprotect(ptr + 2 * page_size, page_size, PROT_NONE); + FAIL_TEST_IF_FALSE(ret < 0); + + /* split and merge from next */ + /* (1 NONE) (3 RO_SEAL) (16 RO) (3 RO_SEALS) (1 NONE) */ + ret = sys_mseal(ptr + 20 * page_size, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + FAIL_TEST_IF_FALSE(prot == 0x4); + size = get_vma_size(ptr + 20 * page_size, &prot); + FAIL_TEST_IF_FALSE(size == 3 * page_size); + FAIL_TEST_IF_FALSE(prot == 0x4); + + /* merge from middle of prev and middle of next. 
*/ + /* (1 NONE) (22 RO_SEAL) (1 NONE) */ + ret = sys_mseal(ptr + 2 * page_size, 20 * page_size); + FAIL_TEST_IF_FALSE(!ret); + size = get_vma_size(ptr + page_size, &prot); + FAIL_TEST_IF_FALSE(size == 22 * page_size); + FAIL_TEST_IF_FALSE(prot == 0x4); + + TEST_END_CHECK(); +} + +static void test_seal_discard_ro_anon_on_rw(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address_rw(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* sealing doesn't take effect on RW memory. */ + ret = sys_madvise(ptr, size, MADV_DONTNEED); + FAIL_TEST_IF_FALSE(!ret); + + /* base seal still apply. */ + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_discard_ro_anon_on_pkey(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + int pkey; + + SKIP_TEST_IF_FALSE(pkey_supported()); + + setup_single_address_rw(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + pkey = sys_pkey_alloc(0, 0); + FAIL_TEST_IF_FALSE(pkey > 0); + + ret = sys_mprotect_pkey((void *)ptr, size, PROT_READ | PROT_WRITE, pkey); + FAIL_TEST_IF_FALSE(!ret); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* sealing doesn't take effect if PKRU allow write. */ + set_pkey(pkey, 0); + ret = sys_madvise(ptr, size, MADV_DONTNEED); + FAIL_TEST_IF_FALSE(!ret); + + /* sealing will take effect if PKRU deny write. */ + set_pkey(pkey, PKEY_DISABLE_WRITE); + ret = sys_madvise(ptr, size, MADV_DONTNEED); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + /* base seal still apply. 
*/ + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_discard_ro_anon_on_filebacked(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + int fd; + unsigned long mapflags = MAP_PRIVATE; + + fd = memfd_create("test", 0); + FAIL_TEST_IF_FALSE(fd > 0); + + ret = fallocate(fd, 0, 0, size); + FAIL_TEST_IF_FALSE(!ret); + + ptr = sys_mmap(NULL, size, PROT_READ, mapflags, fd, 0); + FAIL_TEST_IF_FALSE(ptr != MAP_FAILED); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* sealing doesn't apply for file backed mapping. */ + ret = sys_madvise(ptr, size, MADV_DONTNEED); + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + close(fd); + + TEST_END_CHECK(); +} + +static void test_seal_discard_ro_anon_on_shared(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + unsigned long mapflags = MAP_ANONYMOUS | MAP_SHARED; + + ptr = sys_mmap(NULL, size, PROT_READ, mapflags, -1, 0); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* sealing doesn't apply for shared mapping. 
*/ + ret = sys_madvise(ptr, size, MADV_DONTNEED); + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_discard_ro_anon(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = seal_single_address(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + ret = sys_madvise(ptr, size, MADV_DONTNEED); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +int main(int argc, char **argv) +{ + bool test_seal = seal_support(); + + ksft_print_header(); + + if (!test_seal) + ksft_exit_skip("sealing not supported, check CONFIG_64BIT\n"); + + if (!pkey_supported()) + ksft_print_msg("PKEY not supported\n"); + + ksft_set_plan(80); + + test_seal_addseal(); + test_seal_unmapped_start(); + test_seal_unmapped_middle(); + test_seal_unmapped_end(); + test_seal_multiple_vmas(); + test_seal_split_start(); + test_seal_split_end(); + test_seal_invalid_input(); + test_seal_zero_length(); + test_seal_twice(); + + test_seal_mprotect(false); + test_seal_mprotect(true); + + test_seal_start_mprotect(false); + test_seal_start_mprotect(true); + + test_seal_end_mprotect(false); + test_seal_end_mprotect(true); + + test_seal_mprotect_unalign_len(false); + test_seal_mprotect_unalign_len(true); + + test_seal_mprotect_unalign_len_variant_2(false); + test_seal_mprotect_unalign_len_variant_2(true); + + test_seal_mprotect_two_vma(false); + test_seal_mprotect_two_vma(true); + + test_seal_mprotect_two_vma_with_split(false); + test_seal_mprotect_two_vma_with_split(true); + + test_seal_mprotect_partial_mprotect(false); + test_seal_mprotect_partial_mprotect(true); 
+ + test_seal_mprotect_two_vma_with_gap(false); + test_seal_mprotect_two_vma_with_gap(true); + + test_seal_mprotect_merge(false); + test_seal_mprotect_merge(true); + + test_seal_mprotect_split(false); + test_seal_mprotect_split(true); + + test_seal_munmap(false); + test_seal_munmap(true); + test_seal_munmap_two_vma(false); + test_seal_munmap_two_vma(true); + test_seal_munmap_vma_with_gap(false); + test_seal_munmap_vma_with_gap(true); + + test_munmap_start_freed(false); + test_munmap_start_freed(true); + test_munmap_middle_freed(false); + test_munmap_middle_freed(true); + test_munmap_end_freed(false); + test_munmap_end_freed(true); + + test_seal_mremap_shrink(false); + test_seal_mremap_shrink(true); + test_seal_mremap_expand(false); + test_seal_mremap_expand(true); + test_seal_mremap_move(false); + test_seal_mremap_move(true); + + test_seal_mremap_shrink_fixed(false); + test_seal_mremap_shrink_fixed(true); + test_seal_mremap_expand_fixed(false); + test_seal_mremap_expand_fixed(true); + test_seal_mremap_move_fixed(false); + test_seal_mremap_move_fixed(true); + test_seal_mremap_move_dontunmap(false); + test_seal_mremap_move_dontunmap(true); + test_seal_mremap_move_fixed_zero(false); + test_seal_mremap_move_fixed_zero(true); + test_seal_mremap_move_dontunmap_anyaddr(false); + test_seal_mremap_move_dontunmap_anyaddr(true); + test_seal_discard_ro_anon(false); + test_seal_discard_ro_anon(true); + test_seal_discard_ro_anon_on_rw(false); + test_seal_discard_ro_anon_on_rw(true); + test_seal_discard_ro_anon_on_shared(false); + test_seal_discard_ro_anon_on_shared(true); + test_seal_discard_ro_anon_on_filebacked(false); + test_seal_discard_ro_anon_on_filebacked(true); + test_seal_mmap_overwrite_prot(false); + test_seal_mmap_overwrite_prot(true); + test_seal_mmap_expand(false); + test_seal_mmap_expand(true); + test_seal_mmap_shrink(false); + test_seal_mmap_shrink(true); + + test_seal_merge_and_split(); + test_seal_zero_address(); + + test_seal_discard_ro_anon_on_pkey(false); + 
test_seal_discard_ro_anon_on_pkey(true); + + ksft_finished(); +} diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 4bdb3a0c7a..3157204b90 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -152,9 +152,13 @@ done < /proc/meminfo # both of these requirements into account and attempt to increase # number of huge pages available. nr_cpus=$(nproc) -hpgsize_MB=$((hpgsize_KB / 1024)) -half_ufd_size_MB=$((((nr_cpus * hpgsize_MB + 127) / 128) * 128)) -needmem_KB=$((half_ufd_size_MB * 2 * 1024)) +uffd_min_KB=$((hpgsize_KB * nr_cpus * 2)) +hugetlb_min_KB=$((256 * 1024)) +if [[ $uffd_min_KB -gt $hugetlb_min_KB ]]; then + needmem_KB=$uffd_min_KB +else + needmem_KB=$hugetlb_min_KB +fi # set proper nr_hugepages if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then @@ -294,7 +298,8 @@ CATEGORY="userfaultfd" run_test ./uffd-unit-tests uffd_stress_bin=./uffd-stress CATEGORY="userfaultfd" run_test ${uffd_stress_bin} anon 20 16 # Hugetlb tests require source and destination huge pages. Pass in half -# the size ($half_ufd_size_MB), which is used for *each*. +# the size of the free pages we have, which is used for *each*. 
+half_ufd_size_MB=$((freepgs / 2)) CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb "$half_ufd_size_MB" 32 CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb-private "$half_ufd_size_MB" 32 CATEGORY="userfaultfd" run_test ${uffd_stress_bin} shmem 20 16 diff --git a/tools/testing/selftests/mm/seal_elf.c b/tools/testing/selftests/mm/seal_elf.c new file mode 100644 index 0000000000..f2babec79b --- /dev/null +++ b/tools/testing/selftests/mm/seal_elf.c @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include "../kselftest.h" +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * need those definition for manually build using gcc. + * gcc -I ../../../../usr/include -DDEBUG -O3 -DDEBUG -O3 seal_elf.c -o seal_elf + */ +#define FAIL_TEST_IF_FALSE(c) do {\ + if (!(c)) {\ + ksft_test_result_fail("%s, line:%d\n", __func__, __LINE__);\ + goto test_end;\ + } \ + } \ + while (0) + +#define SKIP_TEST_IF_FALSE(c) do {\ + if (!(c)) {\ + ksft_test_result_skip("%s, line:%d\n", __func__, __LINE__);\ + goto test_end;\ + } \ + } \ + while (0) + + +#define TEST_END_CHECK() {\ + ksft_test_result_pass("%s\n", __func__);\ + return;\ +test_end:\ + return;\ +} + +#ifndef u64 +#define u64 unsigned long long +#endif + +/* + * define sys_xyx to call syscall directly. 
+ */ +static int sys_mseal(void *start, size_t len) +{ + int sret; + + errno = 0; + sret = syscall(__NR_mseal, start, len, 0); + return sret; +} + +static void *sys_mmap(void *addr, unsigned long len, unsigned long prot, + unsigned long flags, unsigned long fd, unsigned long offset) +{ + void *sret; + + errno = 0; + sret = (void *) syscall(__NR_mmap, addr, len, prot, + flags, fd, offset); + return sret; +} + +static inline int sys_mprotect(void *ptr, size_t size, unsigned long prot) +{ + int sret; + + errno = 0; + sret = syscall(__NR_mprotect, ptr, size, prot); + return sret; +} + +static bool seal_support(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + + ptr = sys_mmap(NULL, page_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (ptr == (void *) -1) + return false; + + ret = sys_mseal(ptr, page_size); + if (ret < 0) + return false; + + return true; +} + +const char somestr[4096] = {"READONLY"}; + +static void test_seal_elf(void) +{ + int ret; + FILE *maps; + char line[512]; + uintptr_t addr_start, addr_end; + char prot[5]; + char filename[256]; + unsigned long page_size = getpagesize(); + unsigned long long ptr = (unsigned long long) somestr; + char *somestr2 = (char *)somestr; + + /* + * Modify the protection of readonly somestr + */ + if (((unsigned long long)ptr % page_size) != 0) + ptr = (unsigned long long)ptr & ~(page_size - 1); + + ksft_print_msg("somestr = %s\n", somestr); + ksft_print_msg("change protection to rw\n"); + ret = sys_mprotect((void *)ptr, page_size, PROT_READ|PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + *somestr2 = 'A'; + ksft_print_msg("somestr is modified to: %s\n", somestr); + ret = sys_mprotect((void *)ptr, page_size, PROT_READ); + FAIL_TEST_IF_FALSE(!ret); + + maps = fopen("/proc/self/maps", "r"); + FAIL_TEST_IF_FALSE(maps); + + /* + * apply sealing to elf binary + */ + while (fgets(line, sizeof(line), maps)) { + if (sscanf(line, "%lx-%lx %4s %*x %*x:%*x %*u %255[^\n]", + &addr_start, &addr_end, prot, 
filename) == 4) { + if (strlen(filename)) { + /* + * seal the mapping if read only. + */ + if (strstr(prot, "r-")) { + ret = sys_mseal((void *)addr_start, addr_end - addr_start); + FAIL_TEST_IF_FALSE(!ret); + ksft_print_msg("sealed: %lx-%lx %s %s\n", + addr_start, addr_end, prot, filename); + if ((uintptr_t) somestr >= addr_start && + (uintptr_t) somestr <= addr_end) + ksft_print_msg("mapping for somestr found\n"); + } + } + } + } + fclose(maps); + + ret = sys_mprotect((void *)ptr, page_size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(ret < 0); + ksft_print_msg("somestr is sealed, mprotect is rejected\n"); + + TEST_END_CHECK(); +} + +int main(int argc, char **argv) +{ + bool test_seal = seal_support(); + + ksft_print_header(); + ksft_print_msg("pid=%d\n", getpid()); + + if (!test_seal) + ksft_exit_skip("sealing not supported, check CONFIG_64BIT\n"); + + ksft_set_plan(1); + + test_seal_elf(); + + ksft_finished(); +} diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c index d9dbf87974..bdfa5d085f 100644 --- a/tools/testing/selftests/mm/soft-dirty.c +++ b/tools/testing/selftests/mm/soft-dirty.c @@ -209,5 +209,5 @@ int main(int argc, char **argv) close(pagemap_fd); - ksft_exit_pass(); + ksft_finished(); } diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c index 7bcf8d4825..4e4c1e3112 100644 --- a/tools/testing/selftests/mm/virtual_address_range.c +++ b/tools/testing/selftests/mm/virtual_address_range.c @@ -12,6 +12,8 @@ #include #include #include +#include + #include "../kselftest.h" /* @@ -85,7 +87,7 @@ static int validate_lower_address_hint(void) char *ptr; ptr = mmap((void *) (1UL << 45), MAP_CHUNK_SIZE, PROT_READ | - PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (ptr == MAP_FAILED) return 0; @@ -93,6 +95,66 @@ static int validate_lower_address_hint(void) return 1; } +static int 
validate_complete_va_space(void) +{ + unsigned long start_addr, end_addr, prev_end_addr; + char line[400]; + char prot[6]; + FILE *file; + int fd; + + fd = open("va_dump", O_CREAT | O_WRONLY, 0600); + unlink("va_dump"); + if (fd < 0) { + ksft_test_result_skip("cannot create or open dump file\n"); + ksft_finished(); + } + + file = fopen("/proc/self/maps", "r"); + if (file == NULL) + ksft_exit_fail_msg("cannot open /proc/self/maps\n"); + + prev_end_addr = 0; + while (fgets(line, sizeof(line), file)) { + unsigned long hop; + + if (sscanf(line, "%lx-%lx %s[rwxp-]", + &start_addr, &end_addr, prot) != 3) + ksft_exit_fail_msg("cannot parse /proc/self/maps\n"); + + /* end of userspace mappings; ignore vsyscall mapping */ + if (start_addr & (1UL << 63)) + return 0; + + /* /proc/self/maps must have gaps less than MAP_CHUNK_SIZE */ + if (start_addr - prev_end_addr >= MAP_CHUNK_SIZE) + return 1; + + prev_end_addr = end_addr; + + if (prot[0] != 'r') + continue; + + /* + * Confirm whether MAP_CHUNK_SIZE chunk can be found or not. + * If write succeeds, no need to check MAP_CHUNK_SIZE - 1 + * addresses after that. If the address was not held by this + * process, write would fail with errno set to EFAULT. + * Anyways, if write returns anything apart from 1, exit the + * program since that would mean a bug in /proc/self/maps. 
+ */ + hop = 0; + while (start_addr + hop < end_addr) { + if (write(fd, (void *)(start_addr + hop), 1) != 1) + return 1; + lseek(fd, 0, SEEK_SET); + + hop += MAP_CHUNK_SIZE; + } + } + return 0; +} + int main(int argc, char *argv[]) { char *ptr[NR_CHUNKS_LOW]; @@ -105,13 +167,11 @@ int main(int argc, char *argv[]) for (i = 0; i < NR_CHUNKS_LOW; i++) { ptr[i] = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (ptr[i] == MAP_FAILED) { - if (validate_lower_address_hint()) { - ksft_test_result_skip("Memory constraint not fulfilled\n"); - ksft_finished(); - } + if (validate_lower_address_hint()) + ksft_exit_fail_msg("mmap unexpectedly succeeded with hint\n"); break; } @@ -127,7 +187,7 @@ int main(int argc, char *argv[]) for (i = 0; i < NR_CHUNKS_HIGH; i++) { hint = hind_addr(); hptr[i] = mmap(hint, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (hptr[i] == MAP_FAILED) break; @@ -135,6 +195,10 @@ int main(int argc, char *argv[]) validate_addr(hptr[i], 1); } hchunks = i; + if (validate_complete_va_space()) { + ksft_test_result_fail("BUG in mmap() or /proc/self/maps\n"); + ksft_finished(); + } for (i = 0; i < lchunks; i++) munmap(ptr[i], MAP_CHUNK_SIZE); diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 2f9d378ede..666ab7d939 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -2,9 +2,9 @@ bind_bhash bind_timewait bind_wildcard -csum cmsg_sender diag_uid +epoll_busy_poll fin_ack_lat gro hwtstamp_config @@ -31,6 +31,7 @@ reuseport_dualstack rxtimestamp sctp_hello scm_pidfd +scm_rights sk_bind_sendto_listen sk_connect_zero_addr socket @@ -42,7 +43,6 @@ tap tcp_fastopen_backup_key tcp_inq tcp_mmap -test_unix_oob timestamping tls toeplitz diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile 
index 7b6918d5f4..d9393569d0 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -20,7 +20,6 @@ TEST_PROGS += reuseaddr_ports_exhausted.sh TEST_PROGS += txtimestamp.sh TEST_PROGS += vrf-xfrm-tests.sh TEST_PROGS += rxtimestamp.sh -TEST_PROGS += devlink_port_split.py TEST_PROGS += drop_monitor_tests.sh TEST_PROGS += vrf_route_leaking.sh TEST_PROGS += bareudp.sh @@ -35,6 +34,7 @@ TEST_PROGS += gre_gso.sh TEST_PROGS += cmsg_so_mark.sh TEST_PROGS += cmsg_time.sh cmsg_ipv6.sh TEST_PROGS += netns-name.sh +TEST_PROGS += nl_netdev.py TEST_PROGS += srv6_end_dt46_l3vpn_test.sh TEST_PROGS += srv6_end_dt4_l3vpn_test.sh TEST_PROGS += srv6_end_dt6_l3vpn_test.sh @@ -43,6 +43,8 @@ TEST_PROGS += srv6_hl2encap_red_l2vpn_test.sh TEST_PROGS += srv6_end_next_csid_l3vpn_test.sh TEST_PROGS += srv6_end_x_next_csid_l3vpn_test.sh TEST_PROGS += srv6_end_flavors_test.sh +TEST_PROGS += srv6_end_dx4_netfilter_test.sh +TEST_PROGS += srv6_end_dx6_netfilter_test.sh TEST_PROGS += vrf_strict_mode_test.sh TEST_PROGS += arp_ndisc_evict_nocarrier.sh TEST_PROGS += ndisc_unsolicited_na_test.sh @@ -67,7 +69,7 @@ TEST_GEN_FILES += ipsec TEST_GEN_FILES += ioam6_parser TEST_GEN_FILES += gro TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa -TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls tun tap +TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls tun tap epoll_busy_poll TEST_GEN_FILES += toeplitz TEST_GEN_FILES += cmsg_sender TEST_GEN_FILES += stress_reuseport_listen @@ -81,9 +83,6 @@ TEST_PROGS += test_ingress_egress_chaining.sh TEST_GEN_PROGS += so_incoming_cpu TEST_PROGS += sctp_vrf.sh TEST_GEN_FILES += sctp_hello -TEST_GEN_FILES += csum -TEST_GEN_FILES += nat6to4.o -TEST_GEN_FILES += xdp_dummy.o TEST_GEN_FILES += ip_local_port_range TEST_GEN_FILES += bind_wildcard TEST_PROGS += test_vxlan_mdb.sh @@ -93,63 +92,22 @@ TEST_PROGS += test_bridge_backup_port.sh TEST_PROGS += fdb_flush.sh TEST_PROGS += fq_band_pktlimit.sh 
TEST_PROGS += vlan_hw_filter.sh +TEST_PROGS += bpf_offload.py TEST_FILES := settings TEST_FILES += in_netns.sh lib.sh net_helper.sh setup_loopback.sh setup_veth.sh +TEST_GEN_FILES += $(patsubst %.c,%.o,$(wildcard *.bpf.c)) + TEST_INCLUDES := forwarding/lib.sh include ../lib.mk +$(OUTPUT)/epoll_busy_poll: LDLIBS += -lcap $(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma $(OUTPUT)/tcp_mmap: LDLIBS += -lpthread -lcrypto $(OUTPUT)/tcp_inq: LDLIBS += -lpthread $(OUTPUT)/bind_bhash: LDLIBS += -lpthread $(OUTPUT)/io_uring_zerocopy_tx: CFLAGS += -I../../../include/ -# Rules to generate bpf objs -CLANG ?= clang -SCRATCH_DIR := $(OUTPUT)/tools -BUILD_DIR := $(SCRATCH_DIR)/build -BPFDIR := $(abspath ../../../lib/bpf) -APIDIR := $(abspath ../../../include/uapi) - -CCINCLUDE += -I../bpf -CCINCLUDE += -I../../../../usr/include/ -CCINCLUDE += -I$(SCRATCH_DIR)/include - -BPFOBJ := $(BUILD_DIR)/libbpf/libbpf.a - -MAKE_DIRS := $(BUILD_DIR)/libbpf -$(MAKE_DIRS): - mkdir -p $@ - -# Get Clang's default includes on this system, as opposed to those seen by -# '--target=bpf'. This fixes "missing" files on some architectures/distros, -# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc. -# -# Use '-idirafter': Don't interfere with include mechanics except where the -# build would have failed anyways. 
-define get_sys_includes -$(shell $(1) $(2) -v -E - &1 \ - | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ -$(shell $(1) $(2) -dM -E - +#include +#include + +#include +#include +#include +#include +#include + +#include "../../kselftest_harness.h" + +#define BUF_SZ 32 + +FIXTURE(msg_oob) +{ + int fd[4]; /* 0: AF_UNIX sender + * 1: AF_UNIX receiver + * 2: TCP sender + * 3: TCP receiver + */ + int signal_fd; + int epoll_fd[2]; /* 0: AF_UNIX receiver + * 1: TCP receiver + */ + bool tcp_compliant; +}; + +FIXTURE_VARIANT(msg_oob) +{ + bool peek; +}; + +FIXTURE_VARIANT_ADD(msg_oob, no_peek) +{ + .peek = false, +}; + +FIXTURE_VARIANT_ADD(msg_oob, peek) +{ + .peek = true +}; + +static void create_unix_socketpair(struct __test_metadata *_metadata, + FIXTURE_DATA(msg_oob) *self) +{ + int ret; + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0, self->fd); + ASSERT_EQ(ret, 0); +} + +static void create_tcp_socketpair(struct __test_metadata *_metadata, + FIXTURE_DATA(msg_oob) *self) +{ + struct sockaddr_in addr; + socklen_t addrlen; + int listen_fd; + int ret; + + listen_fd = socket(AF_INET, SOCK_STREAM, 0); + ASSERT_GE(listen_fd, 0); + + ret = listen(listen_fd, -1); + ASSERT_EQ(ret, 0); + + addrlen = sizeof(addr); + ret = getsockname(listen_fd, (struct sockaddr *)&addr, &addrlen); + ASSERT_EQ(ret, 0); + + self->fd[2] = socket(AF_INET, SOCK_STREAM, 0); + ASSERT_GE(self->fd[2], 0); + + ret = connect(self->fd[2], (struct sockaddr *)&addr, addrlen); + ASSERT_EQ(ret, 0); + + self->fd[3] = accept(listen_fd, (struct sockaddr *)&addr, &addrlen); + ASSERT_GE(self->fd[3], 0); + + ret = fcntl(self->fd[3], F_SETFL, O_NONBLOCK); + ASSERT_EQ(ret, 0); +} + +static void setup_sigurg(struct __test_metadata *_metadata, + FIXTURE_DATA(msg_oob) *self) +{ + struct signalfd_siginfo siginfo; + int pid = getpid(); + sigset_t mask; + int i, ret; + + for (i = 0; i < 2; i++) { + ret = ioctl(self->fd[i * 2 + 1], FIOSETOWN, &pid); + ASSERT_EQ(ret, 0); 
+ } + + ret = sigemptyset(&mask); + ASSERT_EQ(ret, 0); + + ret = sigaddset(&mask, SIGURG); + ASSERT_EQ(ret, 0); + + ret = sigprocmask(SIG_BLOCK, &mask, NULL); + ASSERT_EQ(ret, 0); + + self->signal_fd = signalfd(-1, &mask, SFD_NONBLOCK); + ASSERT_GE(self->signal_fd, 0); + + ret = read(self->signal_fd, &siginfo, sizeof(siginfo)); + ASSERT_EQ(ret, -1); +} + +static void setup_epollpri(struct __test_metadata *_metadata, + FIXTURE_DATA(msg_oob) *self) +{ + struct epoll_event event = { + .events = EPOLLPRI, + }; + int i; + + for (i = 0; i < 2; i++) { + int ret; + + self->epoll_fd[i] = epoll_create1(0); + ASSERT_GE(self->epoll_fd[i], 0); + + ret = epoll_ctl(self->epoll_fd[i], EPOLL_CTL_ADD, self->fd[i * 2 + 1], &event); + ASSERT_EQ(ret, 0); + } +} + +static void close_sockets(FIXTURE_DATA(msg_oob) *self) +{ + int i; + + for (i = 0; i < 4; i++) + close(self->fd[i]); +} + +FIXTURE_SETUP(msg_oob) +{ + create_unix_socketpair(_metadata, self); + create_tcp_socketpair(_metadata, self); + + setup_sigurg(_metadata, self); + setup_epollpri(_metadata, self); + + self->tcp_compliant = true; +} + +FIXTURE_TEARDOWN(msg_oob) +{ + close_sockets(self); +} + +static void __epollpair(struct __test_metadata *_metadata, + FIXTURE_DATA(msg_oob) *self, + bool oob_remaining) +{ + struct epoll_event event[2] = {}; + int i, ret[2]; + + for (i = 0; i < 2; i++) + ret[i] = epoll_wait(self->epoll_fd[i], &event[i], 1, 0); + + ASSERT_EQ(ret[0], oob_remaining); + + if (self->tcp_compliant) + ASSERT_EQ(ret[0], ret[1]); + + if (oob_remaining) { + ASSERT_EQ(event[0].events, EPOLLPRI); + + if (self->tcp_compliant) + ASSERT_EQ(event[0].events, event[1].events); + } +} + +static void __sendpair(struct __test_metadata *_metadata, + FIXTURE_DATA(msg_oob) *self, + const void *buf, size_t len, int flags) +{ + int i, ret[2]; + + for (i = 0; i < 2; i++) { + struct signalfd_siginfo siginfo = {}; + int bytes; + + ret[i] = send(self->fd[i * 2], buf, len, flags); + + bytes = read(self->signal_fd, &siginfo, 
sizeof(siginfo)); + + if (flags & MSG_OOB) { + ASSERT_EQ(bytes, sizeof(siginfo)); + ASSERT_EQ(siginfo.ssi_signo, SIGURG); + + bytes = read(self->signal_fd, &siginfo, sizeof(siginfo)); + } + + ASSERT_EQ(bytes, -1); + } + + ASSERT_EQ(ret[0], len); + ASSERT_EQ(ret[0], ret[1]); +} + +static void __recvpair(struct __test_metadata *_metadata, + FIXTURE_DATA(msg_oob) *self, + const void *expected_buf, int expected_len, + int buf_len, int flags) +{ + int i, ret[2], recv_errno[2], expected_errno = 0; + char recv_buf[2][BUF_SZ] = {}; + bool printed = false; + + ASSERT_GE(BUF_SZ, buf_len); + + errno = 0; + + for (i = 0; i < 2; i++) { + ret[i] = recv(self->fd[i * 2 + 1], recv_buf[i], buf_len, flags); + recv_errno[i] = errno; + } + + if (expected_len < 0) { + expected_errno = -expected_len; + expected_len = -1; + } + + if (ret[0] != expected_len || recv_errno[0] != expected_errno) { + TH_LOG("AF_UNIX :%s", ret[0] < 0 ? strerror(recv_errno[0]) : recv_buf[0]); + TH_LOG("Expected:%s", expected_errno ? strerror(expected_errno) : expected_buf); + + ASSERT_EQ(ret[0], expected_len); + ASSERT_EQ(recv_errno[0], expected_errno); + } + + if (ret[0] != ret[1] || recv_errno[0] != recv_errno[1]) { + TH_LOG("AF_UNIX :%s", ret[0] < 0 ? strerror(recv_errno[0]) : recv_buf[0]); + TH_LOG("TCP :%s", ret[1] < 0 ? strerror(recv_errno[1]) : recv_buf[1]); + + printed = true; + + if (self->tcp_compliant) { + ASSERT_EQ(ret[0], ret[1]); + ASSERT_EQ(recv_errno[0], recv_errno[1]); + } + } + + if (expected_len >= 0) { + int cmp; + + cmp = strncmp(expected_buf, recv_buf[0], expected_len); + if (cmp) { + TH_LOG("AF_UNIX :%s", ret[0] < 0 ? strerror(recv_errno[0]) : recv_buf[0]); + TH_LOG("Expected:%s", expected_errno ? strerror(expected_errno) : expected_buf); + + ASSERT_EQ(cmp, 0); + } + + cmp = strncmp(recv_buf[0], recv_buf[1], expected_len); + if (cmp) { + if (!printed) { + TH_LOG("AF_UNIX :%s", ret[0] < 0 ? strerror(recv_errno[0]) : recv_buf[0]); + TH_LOG("TCP :%s", ret[1] < 0 ? 
strerror(recv_errno[1]) : recv_buf[1]); + } + + if (self->tcp_compliant) + ASSERT_EQ(cmp, 0); + } + } +} + +static void __setinlinepair(struct __test_metadata *_metadata, + FIXTURE_DATA(msg_oob) *self) +{ + int i, oob_inline = 1; + + for (i = 0; i < 2; i++) { + int ret; + + ret = setsockopt(self->fd[i * 2 + 1], SOL_SOCKET, SO_OOBINLINE, + &oob_inline, sizeof(oob_inline)); + ASSERT_EQ(ret, 0); + } +} + +static void __siocatmarkpair(struct __test_metadata *_metadata, + FIXTURE_DATA(msg_oob) *self, + bool oob_head) +{ + int answ[2] = {}; + int i; + + for (i = 0; i < 2; i++) { + int ret; + + ret = ioctl(self->fd[i * 2 + 1], SIOCATMARK, &answ[i]); + ASSERT_EQ(ret, 0); + } + + ASSERT_EQ(answ[0], oob_head); + + if (self->tcp_compliant) + ASSERT_EQ(answ[0], answ[1]); +} + +#define sendpair(buf, len, flags) \ + __sendpair(_metadata, self, buf, len, flags) + +#define recvpair(expected_buf, expected_len, buf_len, flags) \ + do { \ + if (variant->peek) \ + __recvpair(_metadata, self, \ + expected_buf, expected_len, \ + buf_len, (flags) | MSG_PEEK); \ + __recvpair(_metadata, self, \ + expected_buf, expected_len, buf_len, flags); \ + } while (0) + +#define epollpair(oob_remaining) \ + __epollpair(_metadata, self, oob_remaining) + +#define siocatmarkpair(oob_head) \ + __siocatmarkpair(_metadata, self, oob_head) + +#define setinlinepair() \ + __setinlinepair(_metadata, self) + +#define tcp_incompliant \ + for (self->tcp_compliant = false; \ + self->tcp_compliant == false; \ + self->tcp_compliant = true) + +TEST_F(msg_oob, non_oob) +{ + sendpair("x", 1, 0); + epollpair(false); + siocatmarkpair(false); + + recvpair("", -EINVAL, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(false); +} + +TEST_F(msg_oob, oob) +{ + sendpair("x", 1, MSG_OOB); + epollpair(true); + siocatmarkpair(true); + + recvpair("x", 1, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(true); +} + +TEST_F(msg_oob, oob_drop) +{ + sendpair("x", 1, MSG_OOB); + epollpair(true); + siocatmarkpair(true); + + recvpair("", 
-EAGAIN, 1, 0); /* Drop OOB. */ + epollpair(false); + siocatmarkpair(false); + + recvpair("", -EINVAL, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(false); +} + +TEST_F(msg_oob, oob_ahead) +{ + sendpair("hello", 5, MSG_OOB); + epollpair(true); + siocatmarkpair(false); + + recvpair("o", 1, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(false); + + recvpair("hell", 4, 4, 0); + epollpair(false); + siocatmarkpair(true); +} + +TEST_F(msg_oob, oob_break) +{ + sendpair("hello", 5, MSG_OOB); + epollpair(true); + siocatmarkpair(false); + + recvpair("hell", 4, 5, 0); /* Break at OOB even with enough buffer. */ + epollpair(true); + siocatmarkpair(true); + + recvpair("o", 1, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(true); + + recvpair("", -EAGAIN, 1, 0); + siocatmarkpair(false); +} + +TEST_F(msg_oob, oob_ahead_break) +{ + sendpair("hello", 5, MSG_OOB); + epollpair(true); + siocatmarkpair(false); + + sendpair("world", 5, 0); + epollpair(true); + siocatmarkpair(false); + + recvpair("o", 1, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(false); + + recvpair("hell", 4, 9, 0); /* Break at OOB even after it's recv()ed. */ + epollpair(false); + siocatmarkpair(true); + + recvpair("world", 5, 5, 0); + epollpair(false); + siocatmarkpair(false); +} + +TEST_F(msg_oob, oob_break_drop) +{ + sendpair("hello", 5, MSG_OOB); + epollpair(true); + siocatmarkpair(false); + + sendpair("world", 5, 0); + epollpair(true); + siocatmarkpair(false); + + recvpair("hell", 4, 10, 0); /* Break at OOB even with enough buffer. */ + epollpair(true); + siocatmarkpair(true); + + recvpair("world", 5, 10, 0); /* Drop OOB and recv() the next skb. 
*/ + epollpair(false); + siocatmarkpair(false); + + recvpair("", -EINVAL, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(false); +} + +TEST_F(msg_oob, ex_oob_break) +{ + sendpair("hello", 5, MSG_OOB); + epollpair(true); + siocatmarkpair(false); + + sendpair("wor", 3, MSG_OOB); + epollpair(true); + siocatmarkpair(false); + + sendpair("ld", 2, 0); + epollpair(true); + siocatmarkpair(false); + + recvpair("hellowo", 7, 10, 0); /* Break at OOB but not at ex-OOB. */ + epollpair(true); + siocatmarkpair(true); + + recvpair("r", 1, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(true); + + recvpair("ld", 2, 2, 0); + epollpair(false); + siocatmarkpair(false); +} + +TEST_F(msg_oob, ex_oob_drop) +{ + sendpair("x", 1, MSG_OOB); + epollpair(true); + siocatmarkpair(true); + + sendpair("y", 1, MSG_OOB); /* TCP drops "x" at this moment. */ + epollpair(true); + + tcp_incompliant { + siocatmarkpair(false); + + recvpair("x", 1, 1, 0); /* TCP drops "y" by passing through it. */ + epollpair(true); + siocatmarkpair(true); + + recvpair("y", 1, 1, MSG_OOB); /* TCP returns -EINVAL. */ + epollpair(false); + siocatmarkpair(true); + } +} + +TEST_F(msg_oob, ex_oob_drop_2) +{ + sendpair("x", 1, MSG_OOB); + epollpair(true); + siocatmarkpair(true); + + sendpair("y", 1, MSG_OOB); /* TCP drops "x" at this moment. */ + epollpair(true); + + tcp_incompliant { + siocatmarkpair(false); + } + + recvpair("y", 1, 1, MSG_OOB); + epollpair(false); + + tcp_incompliant { + siocatmarkpair(false); + + recvpair("x", 1, 1, 0); /* TCP returns -EAGAIN. 
*/ + epollpair(false); + siocatmarkpair(true); + } +} + +TEST_F(msg_oob, ex_oob_ahead_break) +{ + sendpair("hello", 5, MSG_OOB); + epollpair(true); + siocatmarkpair(false); + + sendpair("wor", 3, MSG_OOB); + epollpair(true); + siocatmarkpair(false); + + recvpair("r", 1, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(false); + + sendpair("ld", 2, MSG_OOB); + epollpair(true); + siocatmarkpair(false); + + tcp_incompliant { + recvpair("hellowol", 8, 10, 0); /* TCP recv()s "helloworl", why "r" ?? */ + } + + epollpair(true); + siocatmarkpair(true); + + recvpair("d", 1, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(true); +} + +TEST_F(msg_oob, ex_oob_siocatmark) +{ + sendpair("hello", 5, MSG_OOB); + epollpair(true); + siocatmarkpair(false); + + recvpair("o", 1, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(false); + + sendpair("world", 5, MSG_OOB); + epollpair(true); + siocatmarkpair(false); + + recvpair("hell", 4, 4, 0); /* Intentionally stop at ex-OOB. */ + epollpair(true); + siocatmarkpair(false); +} + +TEST_F(msg_oob, inline_oob) +{ + setinlinepair(); + + sendpair("x", 1, MSG_OOB); + epollpair(true); + siocatmarkpair(true); + + recvpair("", -EINVAL, 1, MSG_OOB); + epollpair(true); + siocatmarkpair(true); + + recvpair("x", 1, 1, 0); + epollpair(false); + siocatmarkpair(false); +} + +TEST_F(msg_oob, inline_oob_break) +{ + setinlinepair(); + + sendpair("hello", 5, MSG_OOB); + epollpair(true); + siocatmarkpair(false); + + recvpair("", -EINVAL, 1, MSG_OOB); + epollpair(true); + siocatmarkpair(false); + + recvpair("hell", 4, 5, 0); /* Break at OOB but not at ex-OOB. 
*/ + epollpair(true); + siocatmarkpair(true); + + recvpair("o", 1, 1, 0); + epollpair(false); + siocatmarkpair(false); +} + +TEST_F(msg_oob, inline_oob_ahead_break) +{ + sendpair("hello", 5, MSG_OOB); + epollpair(true); + siocatmarkpair(false); + + sendpair("world", 5, 0); + epollpair(true); + siocatmarkpair(false); + + recvpair("o", 1, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(false); + + setinlinepair(); + + recvpair("hell", 4, 9, 0); /* Break at OOB even with enough buffer. */ + epollpair(false); + siocatmarkpair(true); + + tcp_incompliant { + recvpair("world", 5, 6, 0); /* TCP recv()s "oworld", ... "o" ??? */ + } + + epollpair(false); + siocatmarkpair(false); +} + +TEST_F(msg_oob, inline_ex_oob_break) +{ + sendpair("hello", 5, MSG_OOB); + epollpair(true); + siocatmarkpair(false); + + sendpair("wor", 3, MSG_OOB); + epollpair(true); + siocatmarkpair(false); + + sendpair("ld", 2, 0); + epollpair(true); + siocatmarkpair(false); + + setinlinepair(); + + recvpair("hellowo", 7, 10, 0); /* Break at OOB but not at ex-OOB. */ + epollpair(true); + siocatmarkpair(true); + + recvpair("rld", 3, 3, 0); + epollpair(false); + siocatmarkpair(false); +} + +TEST_F(msg_oob, inline_ex_oob_no_drop) +{ + sendpair("x", 1, MSG_OOB); + epollpair(true); + siocatmarkpair(true); + + setinlinepair(); + + sendpair("y", 1, MSG_OOB); /* TCP does NOT drops "x" at this moment. */ + epollpair(true); + siocatmarkpair(false); + + recvpair("x", 1, 1, 0); + epollpair(true); + siocatmarkpair(true); + + recvpair("y", 1, 1, 0); + epollpair(false); + siocatmarkpair(false); +} + +TEST_F(msg_oob, inline_ex_oob_drop) +{ + sendpair("x", 1, MSG_OOB); + epollpair(true); + siocatmarkpair(true); + + sendpair("y", 1, MSG_OOB); /* TCP drops "x" at this moment. */ + epollpair(true); + + setinlinepair(); + + tcp_incompliant { + siocatmarkpair(false); + + recvpair("x", 1, 1, 0); /* TCP recv()s "y". */ + epollpair(true); + siocatmarkpair(true); + + recvpair("y", 1, 1, 0); /* TCP returns -EAGAIN. 
*/ + epollpair(false); + siocatmarkpair(false); + } +} + +TEST_F(msg_oob, inline_ex_oob_siocatmark) +{ + sendpair("hello", 5, MSG_OOB); + epollpair(true); + siocatmarkpair(false); + + recvpair("o", 1, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(false); + + setinlinepair(); + + sendpair("world", 5, MSG_OOB); + epollpair(true); + siocatmarkpair(false); + + recvpair("hell", 4, 4, 0); /* Intentionally stop at ex-OOB. */ + epollpair(true); + siocatmarkpair(false); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/af_unix/scm_rights.c b/tools/testing/selftests/net/af_unix/scm_rights.c new file mode 100644 index 0000000000..d663362565 --- /dev/null +++ b/tools/testing/selftests/net/af_unix/scm_rights.c @@ -0,0 +1,307 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright Amazon.com Inc. or its affiliates. */ +#define _GNU_SOURCE +#include + +#include +#include +#include +#include +#include +#include + +#include "../../kselftest_harness.h" + +FIXTURE(scm_rights) +{ + int fd[32]; +}; + +FIXTURE_VARIANT(scm_rights) +{ + char name[32]; + int type; + int flags; + bool test_listener; +}; + +FIXTURE_VARIANT_ADD(scm_rights, dgram) +{ + .name = "UNIX ", + .type = SOCK_DGRAM, + .flags = 0, + .test_listener = false, +}; + +FIXTURE_VARIANT_ADD(scm_rights, stream) +{ + .name = "UNIX-STREAM ", + .type = SOCK_STREAM, + .flags = 0, + .test_listener = false, +}; + +FIXTURE_VARIANT_ADD(scm_rights, stream_oob) +{ + .name = "UNIX-STREAM ", + .type = SOCK_STREAM, + .flags = MSG_OOB, + .test_listener = false, +}; + +FIXTURE_VARIANT_ADD(scm_rights, stream_listener) +{ + .name = "UNIX-STREAM ", + .type = SOCK_STREAM, + .flags = 0, + .test_listener = true, +}; + +FIXTURE_VARIANT_ADD(scm_rights, stream_listener_oob) +{ + .name = "UNIX-STREAM ", + .type = SOCK_STREAM, + .flags = MSG_OOB, + .test_listener = true, +}; + +static int count_sockets(struct __test_metadata *_metadata, + const FIXTURE_VARIANT(scm_rights) *variant) +{ + int sockets = -1, len, ret; + char *line = NULL; + 
size_t unused; + FILE *f; + + f = fopen("/proc/net/protocols", "r"); + ASSERT_NE(NULL, f); + + len = strlen(variant->name); + + while (getline(&line, &unused, f) != -1) { + int unused2; + + if (strncmp(line, variant->name, len)) + continue; + + ret = sscanf(line + len, "%d %d", &unused2, &sockets); + ASSERT_EQ(2, ret); + + break; + } + + free(line); + + ret = fclose(f); + ASSERT_EQ(0, ret); + + return sockets; +} + +FIXTURE_SETUP(scm_rights) +{ + int ret; + + ret = unshare(CLONE_NEWNET); + ASSERT_EQ(0, ret); + + ret = count_sockets(_metadata, variant); + ASSERT_EQ(0, ret); +} + +FIXTURE_TEARDOWN(scm_rights) +{ + int ret; + + sleep(1); + + ret = count_sockets(_metadata, variant); + ASSERT_EQ(0, ret); +} + +static void create_listeners(struct __test_metadata *_metadata, + FIXTURE_DATA(scm_rights) *self, + int n) +{ + struct sockaddr_un addr = { + .sun_family = AF_UNIX, + }; + socklen_t addrlen; + int i, ret; + + for (i = 0; i < n * 2; i += 2) { + self->fd[i] = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_LE(0, self->fd[i]); + + addrlen = sizeof(addr.sun_family); + ret = bind(self->fd[i], (struct sockaddr *)&addr, addrlen); + ASSERT_EQ(0, ret); + + ret = listen(self->fd[i], -1); + ASSERT_EQ(0, ret); + + addrlen = sizeof(addr); + ret = getsockname(self->fd[i], (struct sockaddr *)&addr, &addrlen); + ASSERT_EQ(0, ret); + + self->fd[i + 1] = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_LE(0, self->fd[i + 1]); + + ret = connect(self->fd[i + 1], (struct sockaddr *)&addr, addrlen); + ASSERT_EQ(0, ret); + } +} + +static void create_socketpairs(struct __test_metadata *_metadata, + FIXTURE_DATA(scm_rights) *self, + const FIXTURE_VARIANT(scm_rights) *variant, + int n) +{ + int i, ret; + + ASSERT_GE(sizeof(self->fd) / sizeof(int), n); + + for (i = 0; i < n * 2; i += 2) { + ret = socketpair(AF_UNIX, variant->type, 0, self->fd + i); + ASSERT_EQ(0, ret); + } +} + +static void __create_sockets(struct __test_metadata *_metadata, + FIXTURE_DATA(scm_rights) *self, + const 
FIXTURE_VARIANT(scm_rights) *variant, + int n) +{ + ASSERT_LE(n * 2, sizeof(self->fd) / sizeof(self->fd[0])); + + if (variant->test_listener) + create_listeners(_metadata, self, n); + else + create_socketpairs(_metadata, self, variant, n); +} + +static void __close_sockets(struct __test_metadata *_metadata, + FIXTURE_DATA(scm_rights) *self, + int n) +{ + int i, ret; + + ASSERT_GE(sizeof(self->fd) / sizeof(int), n); + + for (i = 0; i < n * 2; i++) { + ret = close(self->fd[i]); + ASSERT_EQ(0, ret); + } +} + +void __send_fd(struct __test_metadata *_metadata, + const FIXTURE_DATA(scm_rights) *self, + const FIXTURE_VARIANT(scm_rights) *variant, + int inflight, int receiver) +{ +#define MSG "x" +#define MSGLEN 1 + struct { + struct cmsghdr cmsghdr; + int fd[2]; + } cmsg = { + .cmsghdr = { + .cmsg_len = CMSG_LEN(sizeof(cmsg.fd)), + .cmsg_level = SOL_SOCKET, + .cmsg_type = SCM_RIGHTS, + }, + .fd = { + self->fd[inflight * 2], + self->fd[inflight * 2], + }, + }; + struct iovec iov = { + .iov_base = MSG, + .iov_len = MSGLEN, + }; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = &cmsg, + .msg_controllen = CMSG_SPACE(sizeof(cmsg.fd)), + }; + int ret; + + ret = sendmsg(self->fd[receiver * 2 + 1], &msg, variant->flags); + ASSERT_EQ(MSGLEN, ret); +} + +#define create_sockets(n) \ + __create_sockets(_metadata, self, variant, n) +#define close_sockets(n) \ + __close_sockets(_metadata, self, n) +#define send_fd(inflight, receiver) \ + __send_fd(_metadata, self, variant, inflight, receiver) + +TEST_F(scm_rights, self_ref) +{ + create_sockets(2); + + send_fd(0, 0); + + send_fd(1, 1); + + close_sockets(2); +} + +TEST_F(scm_rights, triangle) +{ + create_sockets(6); + + send_fd(0, 1); + send_fd(1, 2); + send_fd(2, 0); + + send_fd(3, 4); + send_fd(4, 5); + send_fd(5, 3); + + close_sockets(6); +} + +TEST_F(scm_rights, cross_edge) +{ + create_sockets(8); + + send_fd(0, 1); + send_fd(1, 2); + send_fd(2, 0); + send_fd(1, 3); 
+ send_fd(3, 2); + + send_fd(4, 5); + send_fd(5, 6); + send_fd(6, 4); + send_fd(5, 7); + send_fd(7, 6); + + close_sockets(8); +} + +TEST_F(scm_rights, backtrack_from_scc) +{ + create_sockets(10); + + send_fd(0, 1); + send_fd(0, 4); + send_fd(1, 2); + send_fd(2, 3); + send_fd(3, 1); + + send_fd(5, 6); + send_fd(5, 9); + send_fd(6, 7); + send_fd(7, 8); + send_fd(8, 6); + + close_sockets(10); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/af_unix/test_unix_oob.c b/tools/testing/selftests/net/af_unix/test_unix_oob.c deleted file mode 100644 index a7c51889ac..0000000000 --- a/tools/testing/selftests/net/af_unix/test_unix_oob.c +++ /dev/null @@ -1,436 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static int pipefd[2]; -static int signal_recvd; -static pid_t producer_id; -static char sock_name[32]; - -static void sig_hand(int sn, siginfo_t *si, void *p) -{ - signal_recvd = sn; -} - -static int set_sig_handler(int signal) -{ - struct sigaction sa; - - sa.sa_sigaction = sig_hand; - sigemptyset(&sa.sa_mask); - sa.sa_flags = SA_SIGINFO | SA_RESTART; - - return sigaction(signal, &sa, NULL); -} - -static void set_filemode(int fd, int set) -{ - int flags = fcntl(fd, F_GETFL, 0); - - if (set) - flags &= ~O_NONBLOCK; - else - flags |= O_NONBLOCK; - fcntl(fd, F_SETFL, flags); -} - -static void signal_producer(int fd) -{ - char cmd; - - cmd = 'S'; - write(fd, &cmd, sizeof(cmd)); -} - -static void wait_for_signal(int fd) -{ - char buf[5]; - - read(fd, buf, 5); -} - -static void die(int status) -{ - fflush(NULL); - unlink(sock_name); - kill(producer_id, SIGTERM); - exit(status); -} - -int is_sioctatmark(int fd) -{ - int ans = -1; - - if (ioctl(fd, SIOCATMARK, &ans, sizeof(ans)) < 0) { -#ifdef DEBUG - perror("SIOCATMARK Failed"); -#endif - } - return ans; -} - -void read_oob(int fd, char *c) -{ - - *c = ' '; - if (recv(fd, c, 
sizeof(*c), MSG_OOB) < 0) { -#ifdef DEBUG - perror("Reading MSG_OOB Failed"); -#endif - } -} - -int read_data(int pfd, char *buf, int size) -{ - int len = 0; - - memset(buf, size, '0'); - len = read(pfd, buf, size); -#ifdef DEBUG - if (len < 0) - perror("read failed"); -#endif - return len; -} - -static void wait_for_data(int pfd, int event) -{ - struct pollfd pfds[1]; - - pfds[0].fd = pfd; - pfds[0].events = event; - poll(pfds, 1, -1); -} - -void producer(struct sockaddr_un *consumer_addr) -{ - int cfd; - char buf[64]; - int i; - - memset(buf, 'x', sizeof(buf)); - cfd = socket(AF_UNIX, SOCK_STREAM, 0); - - wait_for_signal(pipefd[0]); - if (connect(cfd, (struct sockaddr *)consumer_addr, - sizeof(*consumer_addr)) != 0) { - perror("Connect failed"); - kill(0, SIGTERM); - exit(1); - } - - for (i = 0; i < 2; i++) { - /* Test 1: Test for SIGURG and OOB */ - wait_for_signal(pipefd[0]); - memset(buf, 'x', sizeof(buf)); - buf[63] = '@'; - send(cfd, buf, sizeof(buf), MSG_OOB); - - wait_for_signal(pipefd[0]); - - /* Test 2: Test for OOB being overwitten */ - memset(buf, 'x', sizeof(buf)); - buf[63] = '%'; - send(cfd, buf, sizeof(buf), MSG_OOB); - - memset(buf, 'x', sizeof(buf)); - buf[63] = '#'; - send(cfd, buf, sizeof(buf), MSG_OOB); - - wait_for_signal(pipefd[0]); - - /* Test 3: Test for SIOCATMARK */ - memset(buf, 'x', sizeof(buf)); - buf[63] = '@'; - send(cfd, buf, sizeof(buf), MSG_OOB); - - memset(buf, 'x', sizeof(buf)); - buf[63] = '%'; - send(cfd, buf, sizeof(buf), MSG_OOB); - - memset(buf, 'x', sizeof(buf)); - send(cfd, buf, sizeof(buf), 0); - - wait_for_signal(pipefd[0]); - - /* Test 4: Test for 1byte OOB msg */ - memset(buf, 'x', sizeof(buf)); - buf[0] = '@'; - send(cfd, buf, 1, MSG_OOB); - } -} - -int -main(int argc, char **argv) -{ - int lfd, pfd; - struct sockaddr_un consumer_addr, paddr; - socklen_t len = sizeof(consumer_addr); - char buf[1024]; - int on = 0; - char oob; - int atmark; - - lfd = socket(AF_UNIX, SOCK_STREAM, 0); - memset(&consumer_addr, 0, 
sizeof(consumer_addr)); - consumer_addr.sun_family = AF_UNIX; - sprintf(sock_name, "unix_oob_%d", getpid()); - unlink(sock_name); - strcpy(consumer_addr.sun_path, sock_name); - - if ((bind(lfd, (struct sockaddr *)&consumer_addr, - sizeof(consumer_addr))) != 0) { - perror("socket bind failed"); - exit(1); - } - - pipe(pipefd); - - listen(lfd, 1); - - producer_id = fork(); - if (producer_id == 0) { - producer(&consumer_addr); - exit(0); - } - - set_sig_handler(SIGURG); - signal_producer(pipefd[1]); - - pfd = accept(lfd, (struct sockaddr *) &paddr, &len); - fcntl(pfd, F_SETOWN, getpid()); - - signal_recvd = 0; - signal_producer(pipefd[1]); - - /* Test 1: - * veriyf that SIGURG is - * delivered, 63 bytes are - * read, oob is '@', and POLLPRI works. - */ - wait_for_data(pfd, POLLPRI); - read_oob(pfd, &oob); - len = read_data(pfd, buf, 1024); - if (!signal_recvd || len != 63 || oob != '@') { - fprintf(stderr, "Test 1 failed sigurg %d len %d %c\n", - signal_recvd, len, oob); - die(1); - } - - signal_recvd = 0; - signal_producer(pipefd[1]); - - /* Test 2: - * Verify that the first OOB is over written by - * the 2nd one and the first OOB is returned as - * part of the read, and sigurg is received. - */ - wait_for_data(pfd, POLLIN | POLLPRI); - len = 0; - while (len < 70) - len = recv(pfd, buf, 1024, MSG_PEEK); - len = read_data(pfd, buf, 1024); - read_oob(pfd, &oob); - if (!signal_recvd || len != 127 || oob != '#') { - fprintf(stderr, "Test 2 failed, sigurg %d len %d OOB %c\n", - signal_recvd, len, oob); - die(1); - } - - signal_recvd = 0; - signal_producer(pipefd[1]); - - /* Test 3: - * verify that 2nd oob over writes - * the first one and read breaks at - * oob boundary returning 127 bytes - * and sigurg is received and atmark - * is set. - * oob is '%' and second read returns - * 64 bytes. 
- */ - len = 0; - wait_for_data(pfd, POLLIN | POLLPRI); - while (len < 150) - len = recv(pfd, buf, 1024, MSG_PEEK); - len = read_data(pfd, buf, 1024); - atmark = is_sioctatmark(pfd); - read_oob(pfd, &oob); - - if (!signal_recvd || len != 127 || oob != '%' || atmark != 1) { - fprintf(stderr, - "Test 3 failed, sigurg %d len %d OOB %c atmark %d\n", - signal_recvd, len, oob, atmark); - die(1); - } - - signal_recvd = 0; - - len = read_data(pfd, buf, 1024); - if (len != 64) { - fprintf(stderr, "Test 3.1 failed, sigurg %d len %d OOB %c\n", - signal_recvd, len, oob); - die(1); - } - - signal_recvd = 0; - signal_producer(pipefd[1]); - - /* Test 4: - * verify that a single byte - * oob message is delivered. - * set non blocking mode and - * check proper error is - * returned and sigurg is - * received and correct - * oob is read. - */ - - set_filemode(pfd, 0); - - wait_for_data(pfd, POLLIN | POLLPRI); - len = read_data(pfd, buf, 1024); - if ((len == -1) && (errno == 11)) - len = 0; - - read_oob(pfd, &oob); - - if (!signal_recvd || len != 0 || oob != '@') { - fprintf(stderr, "Test 4 failed, sigurg %d len %d OOB %c\n", - signal_recvd, len, oob); - die(1); - } - - set_filemode(pfd, 1); - - /* Inline Testing */ - - on = 1; - if (setsockopt(pfd, SOL_SOCKET, SO_OOBINLINE, &on, sizeof(on))) { - perror("SO_OOBINLINE"); - die(1); - } - - signal_recvd = 0; - signal_producer(pipefd[1]); - - /* Test 1 -- Inline: - * Check that SIGURG is - * delivered and 63 bytes are - * read and oob is '@' - */ - - wait_for_data(pfd, POLLIN | POLLPRI); - len = read_data(pfd, buf, 1024); - - if (!signal_recvd || len != 63) { - fprintf(stderr, "Test 1 Inline failed, sigurg %d len %d\n", - signal_recvd, len); - die(1); - } - - len = read_data(pfd, buf, 1024); - - if (len != 1) { - fprintf(stderr, - "Test 1.1 Inline failed, sigurg %d len %d oob %c\n", - signal_recvd, len, oob); - die(1); - } - - signal_recvd = 0; - signal_producer(pipefd[1]); - - /* Test 2 -- Inline: - * Verify that the first OOB is over 
written by - * the 2nd one and read breaks correctly on - * 2nd OOB boundary with the first OOB returned as - * part of the read, and sigurg is delivered and - * siocatmark returns true. - * next read returns one byte, the oob byte - * and siocatmark returns false. - */ - len = 0; - wait_for_data(pfd, POLLIN | POLLPRI); - while (len < 70) - len = recv(pfd, buf, 1024, MSG_PEEK); - len = read_data(pfd, buf, 1024); - atmark = is_sioctatmark(pfd); - if (len != 127 || atmark != 1 || !signal_recvd) { - fprintf(stderr, "Test 2 Inline failed, len %d atmark %d\n", - len, atmark); - die(1); - } - - len = read_data(pfd, buf, 1024); - atmark = is_sioctatmark(pfd); - if (len != 1 || buf[0] != '#' || atmark == 1) { - fprintf(stderr, "Test 2.1 Inline failed, len %d data %c atmark %d\n", - len, buf[0], atmark); - die(1); - } - - signal_recvd = 0; - signal_producer(pipefd[1]); - - /* Test 3 -- Inline: - * verify that 2nd oob over writes - * the first one and read breaks at - * oob boundary returning 127 bytes - * and sigurg is received and siocatmark - * is true after the read. - * subsequent read returns 65 bytes - * because of oob which should be '%'. 
- */ - len = 0; - wait_for_data(pfd, POLLIN | POLLPRI); - while (len < 126) - len = recv(pfd, buf, 1024, MSG_PEEK); - len = read_data(pfd, buf, 1024); - atmark = is_sioctatmark(pfd); - if (!signal_recvd || len != 127 || !atmark) { - fprintf(stderr, - "Test 3 Inline failed, sigurg %d len %d data %c\n", - signal_recvd, len, buf[0]); - die(1); - } - - len = read_data(pfd, buf, 1024); - atmark = is_sioctatmark(pfd); - if (len != 65 || buf[0] != '%' || atmark != 0) { - fprintf(stderr, - "Test 3.1 Inline failed, len %d oob %c atmark %d\n", - len, buf[0], atmark); - die(1); - } - - signal_recvd = 0; - signal_producer(pipefd[1]); - - /* Test 4 -- Inline: - * verify that a single - * byte oob message is delivered - * and read returns one byte, the oob - * byte and sigurg is received - */ - wait_for_data(pfd, POLLIN | POLLPRI); - len = read_data(pfd, buf, 1024); - if (!signal_recvd || len != 1 || buf[0] != '@') { - fprintf(stderr, - "Test 4 Inline failed, signal %d len %d data %c\n", - signal_recvd, len, buf[0]); - die(1); - } - die(0); -} diff --git a/tools/testing/selftests/net/bpf.mk b/tools/testing/selftests/net/bpf.mk new file mode 100644 index 0000000000..a4f6755dd8 --- /dev/null +++ b/tools/testing/selftests/net/bpf.mk @@ -0,0 +1,53 @@ +# SPDX-License-Identifier: GPL-2.0 +# Rules to generate bpf objs +CLANG ?= clang +SCRATCH_DIR := $(OUTPUT)/tools +BUILD_DIR := $(SCRATCH_DIR)/build +BPFDIR := $(top_srcdir)/tools/lib/bpf +APIDIR := $(top_srcdir)/tools/include/uapi + +CCINCLUDE += -I$(selfdir)/bpf +CCINCLUDE += -I$(top_srcdir)/usr/include/ +CCINCLUDE += -I$(SCRATCH_DIR)/include + +BPFOBJ := $(BUILD_DIR)/libbpf/libbpf.a + +MAKE_DIRS := $(BUILD_DIR)/libbpf +$(MAKE_DIRS): + $(call msg,MKDIR,,$@) + $(Q)mkdir -p $@ + +# Get Clang's default includes on this system, as opposed to those seen by +# '--target=bpf'. This fixes "missing" files on some architectures/distros, +# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc. 
+# +# Use '-idirafter': Don't interfere with include mechanics except where the +# build would have failed anyways. +define get_sys_includes +$(shell $(1) $(2) -v -E - &1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ +$(shell $(1) $(2) -dM -E - 0 and stderr[-1] == "\n": + stderr = stderr[:-1] + raise Exception("Command failed: %s\n%s" % (proc.args, stderr)) + + if include_stderr: + return proc.returncode, stdout, stderr + else: + return proc.returncode, stdout + +def rm(f): + cmd("rm -f %s" % (f)) + if f in files: + files.remove(f) + +def tool(name, args, flags, JSON=True, ns="", fail=True, include_stderr=False): + params = "" + if JSON: + params += "%s " % (flags["json"]) + + if ns: + ns = "ip netns exec %s " % (ns) + elif ns is None: + ns = "" + + if include_stderr: + ret, stdout, stderr = cmd(ns + name + " " + params + args, + fail=fail, include_stderr=True) + else: + ret, stdout = cmd(ns + name + " " + params + args, + fail=fail, include_stderr=False) + + if JSON and len(stdout.strip()) != 0: + out = json.loads(stdout) + else: + out = stdout + + if include_stderr: + return ret, out, stderr + else: + return ret, out + +def bpftool(args, JSON=True, ns="", fail=True, include_stderr=False): + return tool("bpftool", args, {"json":"-p"}, JSON=JSON, ns=ns, + fail=fail, include_stderr=include_stderr) + +def bpftool_prog_list(expected=None, ns="", exclude_orphaned=True): + _, progs = bpftool("prog show", JSON=True, ns=ns, fail=True) + # Remove the base progs + for p in base_progs: + if p in progs: + progs.remove(p) + if exclude_orphaned: + progs = [ p for p in progs if not p['orphaned'] ] + if expected is not None: + if len(progs) != expected: + fail(True, "%d BPF programs loaded, expected %d" % + (len(progs), expected)) + return progs + +def bpftool_map_list(expected=None, ns=""): + _, maps = bpftool("map show", JSON=True, ns=ns, fail=True) + # Remove the base maps + maps = [m for m in maps if m not in base_maps and 
m.get('name') and m.get('name') not in base_map_names] + if expected is not None: + if len(maps) != expected: + fail(True, "%d BPF maps loaded, expected %d" % + (len(maps), expected)) + return maps + +def bpftool_prog_list_wait(expected=0, n_retry=20): + for i in range(n_retry): + nprogs = len(bpftool_prog_list()) + if nprogs == expected: + return + time.sleep(0.05) + raise Exception("Time out waiting for program counts to stabilize want %d, have %d" % (expected, nprogs)) + +def bpftool_map_list_wait(expected=0, n_retry=20, ns=""): + for i in range(n_retry): + maps = bpftool_map_list(ns=ns) + if len(maps) == expected: + return maps + time.sleep(0.05) + raise Exception("Time out waiting for map counts to stabilize want %d, have %d" % (expected, nmaps)) + +def bpftool_prog_load(sample, file_name, maps=[], prog_type="xdp", dev=None, + fail=True, include_stderr=False): + args = "prog load %s %s" % (os.path.join(bpf_test_dir, sample), file_name) + if prog_type is not None: + args += " type " + prog_type + if dev is not None: + args += " dev " + dev + if len(maps): + args += " map " + " map ".join(maps) + + res = bpftool(args, fail=fail, include_stderr=include_stderr) + if res[0] == 0: + files.append(file_name) + return res + +def ip(args, force=False, JSON=True, ns="", fail=True, include_stderr=False): + if force: + args = "-force " + args + return tool("ip", args, {"json":"-j"}, JSON=JSON, ns=ns, + fail=fail, include_stderr=include_stderr) + +def tc(args, JSON=True, ns="", fail=True, include_stderr=False): + return tool("tc", args, {"json":"-p"}, JSON=JSON, ns=ns, + fail=fail, include_stderr=include_stderr) + +def ethtool(dev, opt, args, fail=True): + return cmd("ethtool %s %s %s" % (opt, dev["ifname"], args), fail=fail) + +def bpf_obj(name, sec="xdp", path=bpf_test_dir,): + return "obj %s sec %s" % (os.path.join(path, name), sec) + +def bpf_pinned(name): + return "pinned %s" % (name) + +def bpf_bytecode(bytecode): + return "bytecode \"%s\"" % (bytecode) + +def 
mknetns(n_retry=10): + for i in range(n_retry): + name = ''.join([random.choice(string.ascii_letters) for i in range(8)]) + ret, _ = ip("netns add %s" % (name), fail=False) + if ret == 0: + netns.append(name) + return name + return None + +def int2str(fmt, val): + ret = [] + for b in struct.pack(fmt, val): + ret.append(int(b)) + return " ".join(map(lambda x: str(x), ret)) + +def str2int(strtab): + inttab = [] + for i in strtab: + inttab.append(int(i, 16)) + ba = bytearray(inttab) + if len(strtab) == 4: + fmt = "I" + elif len(strtab) == 8: + fmt = "Q" + else: + raise Exception("String array of len %d can't be unpacked to an int" % + (len(strtab))) + return struct.unpack(fmt, ba)[0] + +class DebugfsDir: + """ + Class for accessing DebugFS directories as a dictionary. + """ + + def __init__(self, path): + self.path = path + self._dict = self._debugfs_dir_read(path) + + def __len__(self): + return len(self._dict.keys()) + + def __getitem__(self, key): + if type(key) is int: + key = list(self._dict.keys())[key] + return self._dict[key] + + def __setitem__(self, key, value): + log("DebugFS set %s = %s" % (key, value), "") + log_level_inc() + + cmd("echo '%s' > %s/%s" % (value, self.path, key)) + log_level_dec() + + _, out = cmd('cat %s/%s' % (self.path, key)) + self._dict[key] = out.strip() + + def _debugfs_dir_read(self, path): + dfs = {} + + log("DebugFS state for %s" % (path), "") + log_level_inc(add=2) + + _, out = cmd('ls ' + path) + for f in out.split(): + if f == "ports": + continue + + p = os.path.join(path, f) + if not os.stat(p).st_mode & stat.S_IRUSR: + continue + + if os.path.isfile(p): + # We need to init trap_flow_action_cookie before read it + if f == "trap_flow_action_cookie": + cmd('echo deadbeef > %s/%s' % (path, f)) + _, out = cmd('cat %s/%s' % (path, f)) + dfs[f] = out.strip() + elif os.path.isdir(p): + dfs[f] = DebugfsDir(p) + else: + raise Exception("%s is neither file nor directory" % (p)) + + log_level_dec() + log("DebugFS state", dfs) + 
log_level_dec() + + return dfs + +class BpfNetdevSimDev(NetdevSimDev): + """ + Class for netdevsim bus device and its attributes. + """ + def __init__(self, port_count=1, ns=None): + super().__init__(port_count, ns=ns) + devs.append(self) + + def _make_port(self, port_index, ifname): + return BpfNetdevSim(self, port_index, ifname, self.ns) + + def dfs_num_bound_progs(self): + path = os.path.join(self.dfs_dir, "bpf_bound_progs") + _, progs = cmd('ls %s' % (path)) + return len(progs.split()) + + def dfs_get_bound_progs(self, expected): + progs = DebugfsDir(os.path.join(self.dfs_dir, "bpf_bound_progs")) + if expected is not None: + if len(progs) != expected: + fail(True, "%d BPF programs bound, expected %d" % + (len(progs), expected)) + return progs + + def remove(self): + super().remove() + devs.remove(self) + + +class BpfNetdevSim(NetdevSim): + """ + Class for netdevsim netdevice and its attributes. + """ + + def __init__(self, nsimdev, port_index, ifname, ns=None): + super().__init__(nsimdev, port_index, ifname, ns=ns) + + self.dfs_dir = "%s/ports/%u/" % (nsimdev.dfs_dir, port_index) + self.dfs_refresh() + + def __getitem__(self, key): + return self.dev[key] + + def remove(self): + self.nsimdev.remove_nsim(self) + + def dfs_refresh(self): + self.dfs = DebugfsDir(self.dfs_dir) + return self.dfs + + def dfs_read(self, f): + path = os.path.join(self.dfs_dir, f) + _, data = cmd('cat %s' % (path)) + return data.strip() + + def wait_for_flush(self, bound=0, total=0, n_retry=20): + for i in range(n_retry): + nbound = self.nsimdev.dfs_num_bound_progs() + nprogs = len(bpftool_prog_list()) + if nbound == bound and nprogs == total: + return + time.sleep(0.05) + raise Exception("Time out waiting for program counts to stabilize want %d/%d, have %d bound, %d loaded" % (bound, total, nbound, nprogs)) + + def set_ns(self, ns): + name = ns if ns else "1" + ip("link set dev %s netns %s" % (self.dev["ifname"], name), ns=self.ns) + self.ns = ns + + def set_mtu(self, mtu, fail=True): + 
return ip("link set dev %s mtu %d" % (self.dev["ifname"], mtu), + fail=fail) + + def set_xdp(self, bpf, mode, force=False, JSON=True, verbose=False, + fail=True, include_stderr=False): + if verbose: + bpf += " verbose" + return ip("link set dev %s xdp%s %s" % (self.dev["ifname"], mode, bpf), + force=force, JSON=JSON, + fail=fail, include_stderr=include_stderr) + + def unset_xdp(self, mode, force=False, JSON=True, + fail=True, include_stderr=False): + return ip("link set dev %s xdp%s off" % (self.dev["ifname"], mode), + force=force, JSON=JSON, + fail=fail, include_stderr=include_stderr) + + def ip_link_show(self, xdp): + _, link = ip("link show dev %s" % (self['ifname'])) + if len(link) > 1: + raise Exception("Multiple objects on ip link show") + if len(link) < 1: + return {} + fail(xdp != "xdp" in link, + "XDP program not reporting in iplink (reported %s, expected %s)" % + ("xdp" in link, xdp)) + return link[0] + + def tc_add_ingress(self): + tc("qdisc add dev %s ingress" % (self['ifname'])) + + def tc_del_ingress(self): + tc("qdisc del dev %s ingress" % (self['ifname'])) + + def tc_flush_filters(self, bound=0, total=0): + self.tc_del_ingress() + self.tc_add_ingress() + self.wait_for_flush(bound=bound, total=total) + + def tc_show_ingress(self, expected=None): + # No JSON support, oh well... 
+ flags = ["skip_sw", "skip_hw", "in_hw"] + named = ["protocol", "pref", "chain", "handle", "id", "tag"] + + args = "-s filter show dev %s ingress" % (self['ifname']) + _, out = tc(args, JSON=False) + + filters = [] + lines = out.split('\n') + for line in lines: + words = line.split() + if "handle" not in words: + continue + fltr = {} + for flag in flags: + fltr[flag] = flag in words + for name in named: + try: + idx = words.index(name) + fltr[name] = words[idx + 1] + except ValueError: + pass + filters.append(fltr) + + if expected is not None: + fail(len(filters) != expected, + "%d ingress filters loaded, expected %d" % + (len(filters), expected)) + return filters + + def cls_filter_op(self, op, qdisc="ingress", prio=None, handle=None, + chain=None, cls="", params="", + fail=True, include_stderr=False): + spec = "" + if prio is not None: + spec += " prio %d" % (prio) + if handle: + spec += " handle %s" % (handle) + if chain is not None: + spec += " chain %d" % (chain) + + return tc("filter {op} dev {dev} {qdisc} {spec} {cls} {params}"\ + .format(op=op, dev=self['ifname'], qdisc=qdisc, spec=spec, + cls=cls, params=params), + fail=fail, include_stderr=include_stderr) + + def cls_bpf_add_filter(self, bpf, op="add", prio=None, handle=None, + chain=None, da=False, verbose=False, + skip_sw=False, skip_hw=False, + fail=True, include_stderr=False): + cls = "bpf " + bpf + + params = "" + if da: + params += " da" + if verbose: + params += " verbose" + if skip_sw: + params += " skip_sw" + if skip_hw: + params += " skip_hw" + + return self.cls_filter_op(op=op, prio=prio, handle=handle, cls=cls, + chain=chain, params=params, + fail=fail, include_stderr=include_stderr) + + def set_ethtool_tc_offloads(self, enable, fail=True): + args = "hw-tc-offload %s" % ("on" if enable else "off") + return ethtool(self, "-K", args, fail=fail) + +################################################################################ +def clean_up(): + global files, netns, devs + + for dev in devs: + 
dev.remove() + for f in files: + cmd("rm -f %s" % (f)) + for ns in netns: + cmd("ip netns delete %s" % (ns)) + files = [] + netns = [] + +def pin_prog(file_name, idx=0): + progs = bpftool_prog_list(expected=(idx + 1)) + prog = progs[idx] + bpftool("prog pin id %d %s" % (prog["id"], file_name)) + files.append(file_name) + + return file_name, bpf_pinned(file_name) + +def pin_map(file_name, idx=0, expected=1): + maps = bpftool_map_list_wait(expected=expected) + m = maps[idx] + bpftool("map pin id %d %s" % (m["id"], file_name)) + files.append(file_name) + + return file_name, bpf_pinned(file_name) + +def check_dev_info_removed(prog_file=None, map_file=None): + bpftool_prog_list(expected=0) + bpftool_prog_list(expected=1, exclude_orphaned=False) + ret, err = bpftool("prog show pin %s" % (prog_file), fail=False) + fail(ret != 0, "failed to show prog with removed device") + + bpftool_map_list_wait(expected=0) + ret, err = bpftool("map show pin %s" % (map_file), fail=False) + fail(ret == 0, "Showing map with removed device did not fail") + fail(err["error"].find("No such device") == -1, + "Showing map with removed device expected ENODEV, error is %s" % + (err["error"])) + +def check_dev_info(other_ns, ns, prog_file=None, map_file=None, removed=False): + progs = bpftool_prog_list(expected=1, ns=ns) + prog = progs[0] + + fail("dev" not in prog.keys(), "Device parameters not reported") + dev = prog["dev"] + fail("ifindex" not in dev.keys(), "Device parameters not reported") + fail("ns_dev" not in dev.keys(), "Device parameters not reported") + fail("ns_inode" not in dev.keys(), "Device parameters not reported") + + if not other_ns: + fail("ifname" not in dev.keys(), "Ifname not reported") + fail(dev["ifname"] != sim["ifname"], + "Ifname incorrect %s vs %s" % (dev["ifname"], sim["ifname"])) + else: + fail("ifname" in dev.keys(), "Ifname is reported for other ns") + + maps = bpftool_map_list_wait(expected=2, ns=ns) + for m in maps: + fail("dev" not in m.keys(), "Device 
parameters not reported") + fail(dev != m["dev"], "Map's device different than program's") + +def check_extack(output, reference, args): + if skip_extack: + return + lines = output.split("\n") + comp = len(lines) >= 2 and lines[1] == 'Error: ' + reference + fail(not comp, "Missing or incorrect netlink extack message") + +def check_extack_nsim(output, reference, args): + check_extack(output, "netdevsim: " + reference, args) + +def check_no_extack(res, needle): + fail((res[1] + res[2]).count(needle) or (res[1] + res[2]).count("Warning:"), + "Found '%s' in command output, leaky extack?" % (needle)) + +def check_verifier_log(output, reference): + lines = output.split("\n") + for l in reversed(lines): + if l == reference: + return + fail(True, "Missing or incorrect message from netdevsim in verifier log") + +def check_multi_basic(two_xdps): + fail(two_xdps["mode"] != 4, "Bad mode reported with multiple programs") + fail("prog" in two_xdps, "Base program reported in multi program mode") + fail(len(two_xdps["attached"]) != 2, + "Wrong attached program count with two programs") + fail(two_xdps["attached"][0]["prog"]["id"] == + two_xdps["attached"][1]["prog"]["id"], + "Offloaded and other programs have the same id") + +def test_spurios_extack(sim, obj, skip_hw, needle): + res = sim.cls_bpf_add_filter(obj, prio=1, handle=1, skip_hw=skip_hw, + include_stderr=True) + check_no_extack(res, needle) + res = sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1, + skip_hw=skip_hw, include_stderr=True) + check_no_extack(res, needle) + res = sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf", + include_stderr=True) + check_no_extack(res, needle) + +def test_multi_prog(simdev, sim, obj, modename, modeid): + start_test("Test multi-attachment XDP - %s + offload..." 
% + (modename or "default", )) + sim.set_xdp(obj, "offload") + xdp = sim.ip_link_show(xdp=True)["xdp"] + offloaded = sim.dfs_read("bpf_offloaded_id") + fail("prog" not in xdp, "Base program not reported in single program mode") + fail(len(xdp["attached"]) != 1, + "Wrong attached program count with one program") + + sim.set_xdp(obj, modename) + two_xdps = sim.ip_link_show(xdp=True)["xdp"] + + fail(xdp["attached"][0] not in two_xdps["attached"], + "Offload program not reported after other activated") + check_multi_basic(two_xdps) + + offloaded2 = sim.dfs_read("bpf_offloaded_id") + fail(offloaded != offloaded2, + "Offload ID changed after loading other program") + + start_test("Test multi-attachment XDP - replace...") + ret, _, err = sim.set_xdp(obj, "offload", fail=False, include_stderr=True) + fail(ret == 0, "Replaced one of programs without -force") + check_extack(err, "XDP program already attached.", args) + + start_test("Test multi-attachment XDP - remove without mode...") + ret, _, err = sim.unset_xdp("", force=True, + fail=False, include_stderr=True) + fail(ret == 0, "Removed program without a mode flag") + check_extack(err, "More than one program loaded, unset mode is ambiguous.", args) + + sim.unset_xdp("offload") + xdp = sim.ip_link_show(xdp=True)["xdp"] + offloaded = sim.dfs_read("bpf_offloaded_id") + + fail(xdp["mode"] != modeid, "Bad mode reported after multiple programs") + fail("prog" not in xdp, + "Base program not reported after multi program mode") + fail(xdp["attached"][0] not in two_xdps["attached"], + "Offload program not reported after other activated") + fail(len(xdp["attached"]) != 1, + "Wrong attached program count with remaining programs") + fail(offloaded != "0", "Offload ID reported with only other program left") + + start_test("Test multi-attachment XDP - reattach...") + sim.set_xdp(obj, "offload") + two_xdps = sim.ip_link_show(xdp=True)["xdp"] + + fail(xdp["attached"][0] not in two_xdps["attached"], + "Other program not reported after 
offload activated") + check_multi_basic(two_xdps) + + start_test("Test multi-attachment XDP - device remove...") + simdev.remove() + + simdev = BpfNetdevSimDev() + sim, = simdev.nsims + sim.set_ethtool_tc_offloads(True) + return [simdev, sim] + +# Parse command line +parser = argparse.ArgumentParser() +parser.add_argument("--log", help="output verbose log to given file") +args = parser.parse_args() +if args.log: + logfile = open(args.log, 'w+') + logfile.write("# -*-Org-*-") + +log("Prepare...", "", level=1) +log_level_inc() + +# Check permissions +skip(os.getuid() != 0, "test must be run as root") + +# Check tools +ret, progs = bpftool("prog", fail=False) +skip(ret != 0, "bpftool not installed") +base_progs = progs +_, base_maps = bpftool("map") +base_map_names = [ + 'pid_iter.rodata', # created on each bpftool invocation + 'libbpf_det_bind', # created on each bpftool invocation +] + +# Check netdevsim +if not os.path.isdir("/sys/bus/netdevsim/"): + ret, out = cmd("modprobe netdevsim", fail=False) + skip(ret != 0, "netdevsim module could not be loaded") + +# Check debugfs +_, out = cmd("mount") +if out.find("/sys/kernel/debug type debugfs") == -1: + cmd("mount -t debugfs none /sys/kernel/debug") + +# Check samples are compiled +samples = ["sample_ret0.bpf.o", "sample_map_ret0.bpf.o"] +for s in samples: + ret, out = cmd("ls %s/%s" % (bpf_test_dir, s), fail=False) + skip(ret != 0, "sample %s/%s not found, please compile it" % + (bpf_test_dir, s)) + +# Check if iproute2 is built with libmnl (needed by extack support) +_, _, err = cmd("tc qdisc delete dev lo handle 0", + fail=False, include_stderr=True) +if err.find("Error: Failed to find qdisc with specified handle.") == -1: + print("Warning: no extack message in iproute2 output, libmnl missing?") + log("Warning: no extack message in iproute2 output, libmnl missing?", "") + skip_extack = True + +# Check if net namespaces seem to work +ns = mknetns() +skip(ns is None, "Could not create a net namespace") +cmd("ip netns 
delete %s" % (ns)) +netns = [] + +try: + obj = bpf_obj("sample_ret0.bpf.o") + bytecode = bpf_bytecode("1,6 0 0 4294967295,") + + start_test("Test destruction of generic XDP...") + simdev = BpfNetdevSimDev() + sim, = simdev.nsims + sim.set_xdp(obj, "generic") + simdev.remove() + bpftool_prog_list_wait(expected=0) + + simdev = BpfNetdevSimDev() + sim, = simdev.nsims + sim.tc_add_ingress() + + start_test("Test TC non-offloaded...") + ret, _ = sim.cls_bpf_add_filter(obj, skip_hw=True, fail=False) + fail(ret != 0, "Software TC filter did not load") + + start_test("Test TC non-offloaded isn't getting bound...") + ret, _ = sim.cls_bpf_add_filter(obj, fail=False) + fail(ret != 0, "Software TC filter did not load") + simdev.dfs_get_bound_progs(expected=0) + + sim.tc_flush_filters() + + start_test("Test TC offloads are off by default...") + ret, _, err = sim.cls_bpf_add_filter(obj, skip_sw=True, + fail=False, include_stderr=True) + fail(ret == 0, "TC filter loaded without enabling TC offloads") + check_extack(err, "TC offload is disabled on net device.", args) + sim.wait_for_flush() + + sim.set_ethtool_tc_offloads(True) + sim.dfs["bpf_tc_non_bound_accept"] = "Y" + + start_test("Test TC offload by default...") + ret, _ = sim.cls_bpf_add_filter(obj, fail=False) + fail(ret != 0, "Software TC filter did not load") + simdev.dfs_get_bound_progs(expected=0) + ingress = sim.tc_show_ingress(expected=1) + fltr = ingress[0] + fail(not fltr["in_hw"], "Filter not offloaded by default") + + sim.tc_flush_filters() + + start_test("Test TC cBPF bytcode tries offload by default...") + ret, _ = sim.cls_bpf_add_filter(bytecode, fail=False) + fail(ret != 0, "Software TC filter did not load") + simdev.dfs_get_bound_progs(expected=0) + ingress = sim.tc_show_ingress(expected=1) + fltr = ingress[0] + fail(not fltr["in_hw"], "Bytecode not offloaded by default") + + sim.tc_flush_filters() + sim.dfs["bpf_tc_non_bound_accept"] = "N" + + start_test("Test TC cBPF unbound bytecode doesn't offload...") + 
ret, _, err = sim.cls_bpf_add_filter(bytecode, skip_sw=True, + fail=False, include_stderr=True) + fail(ret == 0, "TC bytecode loaded for offload") + check_extack_nsim(err, "netdevsim configured to reject unbound programs.", + args) + sim.wait_for_flush() + + start_test("Test non-0 chain offload...") + ret, _, err = sim.cls_bpf_add_filter(obj, chain=1, prio=1, handle=1, + skip_sw=True, + fail=False, include_stderr=True) + fail(ret == 0, "Offloaded a filter to chain other than 0") + check_extack(err, "Driver supports only offload of chain 0.", args) + sim.tc_flush_filters() + + start_test("Test TC replace...") + sim.cls_bpf_add_filter(obj, prio=1, handle=1) + sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1) + sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf") + + sim.cls_bpf_add_filter(obj, prio=1, handle=1, skip_sw=True) + sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1, skip_sw=True) + sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf") + + sim.cls_bpf_add_filter(obj, prio=1, handle=1, skip_hw=True) + sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1, skip_hw=True) + sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf") + + start_test("Test TC replace bad flags...") + for i in range(3): + for j in range(3): + ret, _ = sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1, + skip_sw=(j == 1), skip_hw=(j == 2), + fail=False) + fail(bool(ret) != bool(j), + "Software TC incorrect load in replace test, iteration %d" % + (j)) + sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf") + + start_test("Test spurious extack from the driver...") + test_spurios_extack(sim, obj, False, "netdevsim") + test_spurios_extack(sim, obj, True, "netdevsim") + + sim.set_ethtool_tc_offloads(False) + + test_spurios_extack(sim, obj, False, "TC offload is disabled") + test_spurios_extack(sim, obj, True, "TC offload is disabled") + + sim.set_ethtool_tc_offloads(True) + + sim.tc_flush_filters() + + start_test("Test TC offloads 
failure...") + sim.dfs["dev/bpf_bind_verifier_accept"] = 0 + ret, _, err = sim.cls_bpf_add_filter(obj, verbose=True, skip_sw=True, + fail=False, include_stderr=True) + fail(ret == 0, "TC filter did not reject with TC offloads enabled") + check_verifier_log(err, "[netdevsim] Hello from netdevsim!") + sim.dfs["dev/bpf_bind_verifier_accept"] = 1 + + start_test("Test TC offloads work...") + ret, _, err = sim.cls_bpf_add_filter(obj, verbose=True, skip_sw=True, + fail=False, include_stderr=True) + fail(ret != 0, "TC filter did not load with TC offloads enabled") + + start_test("Test TC offload basics...") + dfs = simdev.dfs_get_bound_progs(expected=1) + progs = bpftool_prog_list(expected=1) + ingress = sim.tc_show_ingress(expected=1) + + dprog = dfs[0] + prog = progs[0] + fltr = ingress[0] + fail(fltr["skip_hw"], "TC does reports 'skip_hw' on offloaded filter") + fail(not fltr["in_hw"], "TC does not report 'in_hw' for offloaded filter") + fail(not fltr["skip_sw"], "TC does not report 'skip_sw' back") + + start_test("Test TC offload is device-bound...") + fail(str(prog["id"]) != fltr["id"], "Program IDs don't match") + fail(prog["tag"] != fltr["tag"], "Program tags don't match") + fail(fltr["id"] != dprog["id"], "Program IDs don't match") + fail(dprog["state"] != "xlated", "Offloaded program state not translated") + fail(dprog["loaded"] != "Y", "Offloaded program is not loaded") + + start_test("Test disabling TC offloads is rejected while filters installed...") + ret, _ = sim.set_ethtool_tc_offloads(False, fail=False) + fail(ret == 0, "Driver should refuse to disable TC offloads with filters installed...") + sim.set_ethtool_tc_offloads(True) + + start_test("Test qdisc removal frees things...") + sim.tc_flush_filters() + sim.tc_show_ingress(expected=0) + + start_test("Test disabling TC offloads is OK without filters...") + ret, _ = sim.set_ethtool_tc_offloads(False, fail=False) + fail(ret != 0, + "Driver refused to disable TC offloads without filters installed...") + + 
sim.set_ethtool_tc_offloads(True) + + start_test("Test destroying device gets rid of TC filters...") + sim.cls_bpf_add_filter(obj, skip_sw=True) + simdev.remove() + bpftool_prog_list_wait(expected=0) + + simdev = BpfNetdevSimDev() + sim, = simdev.nsims + sim.set_ethtool_tc_offloads(True) + + start_test("Test destroying device gets rid of XDP...") + sim.set_xdp(obj, "offload") + simdev.remove() + bpftool_prog_list_wait(expected=0) + + simdev = BpfNetdevSimDev() + sim, = simdev.nsims + sim.set_ethtool_tc_offloads(True) + + start_test("Test XDP prog reporting...") + sim.set_xdp(obj, "drv") + ipl = sim.ip_link_show(xdp=True) + progs = bpftool_prog_list(expected=1) + fail(ipl["xdp"]["prog"]["id"] != progs[0]["id"], + "Loaded program has wrong ID") + + start_test("Test XDP prog replace without force...") + ret, _ = sim.set_xdp(obj, "drv", fail=False) + fail(ret == 0, "Replaced XDP program without -force") + sim.wait_for_flush(total=1) + + start_test("Test XDP prog replace with force...") + ret, _ = sim.set_xdp(obj, "drv", force=True, fail=False) + fail(ret != 0, "Could not replace XDP program with -force") + bpftool_prog_list_wait(expected=1) + ipl = sim.ip_link_show(xdp=True) + progs = bpftool_prog_list(expected=1) + fail(ipl["xdp"]["prog"]["id"] != progs[0]["id"], + "Loaded program has wrong ID") + fail("dev" in progs[0].keys(), + "Device parameters reported for non-offloaded program") + + start_test("Test XDP prog replace with bad flags...") + ret, _, err = sim.set_xdp(obj, "generic", force=True, + fail=False, include_stderr=True) + fail(ret == 0, "Replaced XDP program with a program in different mode") + check_extack(err, + "Native and generic XDP can't be active at the same time.", + args) + + start_test("Test MTU restrictions...") + ret, _ = sim.set_mtu(9000, fail=False) + fail(ret == 0, + "Driver should refuse to increase MTU to 9000 with XDP loaded...") + sim.unset_xdp("drv") + bpftool_prog_list_wait(expected=0) + sim.set_mtu(9000) + ret, _, err = 
sim.set_xdp(obj, "drv", fail=False, include_stderr=True) + fail(ret == 0, "Driver should refuse to load program with MTU of 9000...") + check_extack_nsim(err, "MTU too large w/ XDP enabled.", args) + sim.set_mtu(1500) + + sim.wait_for_flush() + start_test("Test non-offload XDP attaching to HW...") + bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/nooffload") + nooffload = bpf_pinned("/sys/fs/bpf/nooffload") + ret, _, err = sim.set_xdp(nooffload, "offload", + fail=False, include_stderr=True) + fail(ret == 0, "attached non-offloaded XDP program to HW") + check_extack_nsim(err, "xdpoffload of non-bound program.", args) + rm("/sys/fs/bpf/nooffload") + + start_test("Test offload XDP attaching to drv...") + bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/offload", + dev=sim['ifname']) + offload = bpf_pinned("/sys/fs/bpf/offload") + ret, _, err = sim.set_xdp(offload, "drv", fail=False, include_stderr=True) + fail(ret == 0, "attached offloaded XDP program to drv") + check_extack(err, "Using offloaded program without HW_MODE flag is not supported.", args) + rm("/sys/fs/bpf/offload") + sim.wait_for_flush() + + start_test("Test XDP load failure...") + sim.dfs["dev/bpf_bind_verifier_accept"] = 0 + ret, _, err = bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/offload", + dev=sim['ifname'], fail=False, include_stderr=True) + fail(ret == 0, "verifier should fail on load") + check_verifier_log(err, "[netdevsim] Hello from netdevsim!") + sim.dfs["dev/bpf_bind_verifier_accept"] = 1 + sim.wait_for_flush() + + start_test("Test XDP offload...") + _, _, err = sim.set_xdp(obj, "offload", verbose=True, include_stderr=True) + ipl = sim.ip_link_show(xdp=True) + link_xdp = ipl["xdp"]["prog"] + progs = bpftool_prog_list(expected=1) + prog = progs[0] + fail(link_xdp["id"] != prog["id"], "Loaded program has wrong ID") + + start_test("Test XDP offload is device bound...") + dfs = simdev.dfs_get_bound_progs(expected=1) + dprog = dfs[0] + + fail(prog["id"] != link_xdp["id"], "Program 
IDs don't match") + fail(prog["tag"] != link_xdp["tag"], "Program tags don't match") + fail(str(link_xdp["id"]) != dprog["id"], "Program IDs don't match") + fail(dprog["state"] != "xlated", "Offloaded program state not translated") + fail(dprog["loaded"] != "Y", "Offloaded program is not loaded") + + start_test("Test removing XDP program many times...") + sim.unset_xdp("offload") + sim.unset_xdp("offload") + sim.unset_xdp("drv") + sim.unset_xdp("drv") + sim.unset_xdp("") + sim.unset_xdp("") + bpftool_prog_list_wait(expected=0) + + start_test("Test attempt to use a program for a wrong device...") + simdev2 = BpfNetdevSimDev() + sim2, = simdev2.nsims + sim2.set_xdp(obj, "offload") + pin_file, pinned = pin_prog("/sys/fs/bpf/tmp") + + ret, _, err = sim.set_xdp(pinned, "offload", + fail=False, include_stderr=True) + fail(ret == 0, "Pinned program loaded for a different device accepted") + check_extack(err, "Program bound to different device.", args) + simdev2.remove() + ret, _, err = sim.set_xdp(pinned, "offload", + fail=False, include_stderr=True) + fail(ret == 0, "Pinned program loaded for a removed device accepted") + check_extack(err, "Program bound to different device.", args) + rm(pin_file) + bpftool_prog_list_wait(expected=0) + + simdev, sim = test_multi_prog(simdev, sim, obj, "", 1) + simdev, sim = test_multi_prog(simdev, sim, obj, "drv", 1) + simdev, sim = test_multi_prog(simdev, sim, obj, "generic", 2) + + start_test("Test mixing of TC and XDP...") + sim.tc_add_ingress() + sim.set_xdp(obj, "offload") + ret, _, err = sim.cls_bpf_add_filter(obj, skip_sw=True, + fail=False, include_stderr=True) + fail(ret == 0, "Loading TC when XDP active should fail") + check_extack_nsim(err, "driver and netdev offload states mismatch.", args) + sim.unset_xdp("offload") + sim.wait_for_flush() + + sim.cls_bpf_add_filter(obj, skip_sw=True) + ret, _, err = sim.set_xdp(obj, "offload", fail=False, include_stderr=True) + fail(ret == 0, "Loading XDP when TC active should fail") + 
check_extack_nsim(err, "TC program is already loaded.", args) + + start_test("Test binding TC from pinned...") + pin_file, pinned = pin_prog("/sys/fs/bpf/tmp") + sim.tc_flush_filters(bound=1, total=1) + sim.cls_bpf_add_filter(pinned, da=True, skip_sw=True) + sim.tc_flush_filters(bound=1, total=1) + + start_test("Test binding XDP from pinned...") + sim.set_xdp(obj, "offload") + pin_file, pinned = pin_prog("/sys/fs/bpf/tmp2", idx=1) + + sim.set_xdp(pinned, "offload", force=True) + sim.unset_xdp("offload") + sim.set_xdp(pinned, "offload", force=True) + sim.unset_xdp("offload") + + start_test("Test offload of wrong type fails...") + ret, _ = sim.cls_bpf_add_filter(pinned, da=True, skip_sw=True, fail=False) + fail(ret == 0, "Managed to attach XDP program to TC") + + start_test("Test asking for TC offload of two filters...") + sim.cls_bpf_add_filter(obj, da=True, skip_sw=True) + ret, _, err = sim.cls_bpf_add_filter(obj, da=True, skip_sw=True, + fail=False, include_stderr=True) + fail(ret == 0, "Managed to offload two TC filters at the same time") + check_extack_nsim(err, "driver and netdev offload states mismatch.", args) + + sim.tc_flush_filters(bound=2, total=2) + + start_test("Test if netdev removal waits for translation...") + delay_msec = 500 + sim.dfs["dev/bpf_bind_verifier_delay"] = delay_msec + start = time.time() + cmd_line = "tc filter add dev %s ingress bpf %s da skip_sw" % \ + (sim['ifname'], obj) + tc_proc = cmd(cmd_line, background=True, fail=False) + # Wait for the verifier to start + while simdev.dfs_num_bound_progs() <= 2: + pass + simdev.remove() + end = time.time() + ret, _ = cmd_result(tc_proc, fail=False) + time_diff = end - start + log("Time", "start:\t%s\nend:\t%s\ndiff:\t%s" % (start, end, time_diff)) + + fail(ret == 0, "Managed to load TC filter on a unregistering device") + delay_sec = delay_msec * 0.001 + fail(time_diff < delay_sec, "Removal process took %s, expected %s" % + (time_diff, delay_sec)) + + # Remove all pinned files and 
reinstantiate the netdev + clean_up() + bpftool_prog_list_wait(expected=0) + + simdev = BpfNetdevSimDev() + sim, = simdev.nsims + map_obj = bpf_obj("sample_map_ret0.bpf.o") + start_test("Test loading program with maps...") + sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON + + start_test("Test bpftool bound info reporting (own ns)...") + check_dev_info(False, "") + + start_test("Test bpftool bound info reporting (other ns)...") + ns = mknetns() + sim.set_ns(ns) + check_dev_info(True, "") + + start_test("Test bpftool bound info reporting (remote ns)...") + check_dev_info(False, ns) + + start_test("Test bpftool bound info reporting (back to own ns)...") + sim.set_ns("") + check_dev_info(False, "") + + prog_file, _ = pin_prog("/sys/fs/bpf/tmp_prog") + map_file, _ = pin_map("/sys/fs/bpf/tmp_map", idx=1, expected=2) + simdev.remove() + + start_test("Test bpftool bound info reporting (removed dev)...") + check_dev_info_removed(prog_file=prog_file, map_file=map_file) + + # Remove all pinned files and reinstantiate the netdev + clean_up() + bpftool_prog_list_wait(expected=0) + + simdev = BpfNetdevSimDev() + sim, = simdev.nsims + + start_test("Test map update (no flags)...") + sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON + maps = bpftool_map_list_wait(expected=2) + array = maps[0] if maps[0]["type"] == "array" else maps[1] + htab = maps[0] if maps[0]["type"] == "hash" else maps[1] + for m in maps: + for i in range(2): + bpftool("map update id %d key %s value %s" % + (m["id"], int2str("I", i), int2str("Q", i * 3))) + + for m in maps: + ret, _ = bpftool("map update id %d key %s value %s" % + (m["id"], int2str("I", 3), int2str("Q", 3 * 3)), + fail=False) + fail(ret == 0, "added too many entries") + + start_test("Test map update (exists)...") + for m in maps: + for i in range(2): + bpftool("map update id %d key %s value %s exist" % + (m["id"], int2str("I", i), int2str("Q", i * 3))) + + for m in maps: + ret, err = bpftool("map 
update id %d key %s value %s exist" % + (m["id"], int2str("I", 3), int2str("Q", 3 * 3)), + fail=False) + fail(ret == 0, "updated non-existing key") + fail(err["error"].find("No such file or directory") == -1, + "expected ENOENT, error is '%s'" % (err["error"])) + + start_test("Test map update (noexist)...") + for m in maps: + for i in range(2): + ret, err = bpftool("map update id %d key %s value %s noexist" % + (m["id"], int2str("I", i), int2str("Q", i * 3)), + fail=False) + fail(ret == 0, "updated existing key") + fail(err["error"].find("File exists") == -1, + "expected EEXIST, error is '%s'" % (err["error"])) + + start_test("Test map dump...") + for m in maps: + _, entries = bpftool("map dump id %d" % (m["id"])) + for i in range(2): + key = str2int(entries[i]["key"]) + fail(key != i, "expected key %d, got %d" % (key, i)) + val = str2int(entries[i]["value"]) + fail(val != i * 3, "expected value %d, got %d" % (val, i * 3)) + + start_test("Test map getnext...") + for m in maps: + _, entry = bpftool("map getnext id %d" % (m["id"])) + key = str2int(entry["next_key"]) + fail(key != 0, "next key %d, expected %d" % (key, 0)) + _, entry = bpftool("map getnext id %d key %s" % + (m["id"], int2str("I", 0))) + key = str2int(entry["next_key"]) + fail(key != 1, "next key %d, expected %d" % (key, 1)) + ret, err = bpftool("map getnext id %d key %s" % + (m["id"], int2str("I", 1)), fail=False) + fail(ret == 0, "got next key past the end of map") + fail(err["error"].find("No such file or directory") == -1, + "expected ENOENT, error is '%s'" % (err["error"])) + + start_test("Test map delete (htab)...") + for i in range(2): + bpftool("map delete id %d key %s" % (htab["id"], int2str("I", i))) + + start_test("Test map delete (array)...") + for i in range(2): + ret, err = bpftool("map delete id %d key %s" % + (htab["id"], int2str("I", i)), fail=False) + fail(ret == 0, "removed entry from an array") + fail(err["error"].find("No such file or directory") == -1, + "expected ENOENT, error is 
'%s'" % (err["error"])) + + start_test("Test map remove...") + sim.unset_xdp("offload") + bpftool_map_list_wait(expected=0) + simdev.remove() + + simdev = BpfNetdevSimDev() + sim, = simdev.nsims + sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON + simdev.remove() + bpftool_map_list_wait(expected=0) + + start_test("Test map creation fail path...") + simdev = BpfNetdevSimDev() + sim, = simdev.nsims + sim.dfs["bpf_map_accept"] = "N" + ret, _ = sim.set_xdp(map_obj, "offload", JSON=False, fail=False) + fail(ret == 0, + "netdevsim didn't refuse to create a map with offload disabled") + + simdev.remove() + + start_test("Test multi-dev ASIC program reuse...") + simdevA = BpfNetdevSimDev() + simA, = simdevA.nsims + simdevB = BpfNetdevSimDev(3) + simB1, simB2, simB3 = simdevB.nsims + sims = (simA, simB1, simB2, simB3) + simB = (simB1, simB2, simB3) + + bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimA", + dev=simA['ifname']) + progA = bpf_pinned("/sys/fs/bpf/nsimA") + bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimB", + dev=simB1['ifname']) + progB = bpf_pinned("/sys/fs/bpf/nsimB") + + simA.set_xdp(progA, "offload", JSON=False) + for d in simdevB.nsims: + d.set_xdp(progB, "offload", JSON=False) + + start_test("Test multi-dev ASIC cross-dev replace...") + ret, _ = simA.set_xdp(progB, "offload", force=True, JSON=False, fail=False) + fail(ret == 0, "cross-ASIC program allowed") + for d in simdevB.nsims: + ret, _ = d.set_xdp(progA, "offload", force=True, JSON=False, fail=False) + fail(ret == 0, "cross-ASIC program allowed") + + start_test("Test multi-dev ASIC cross-dev install...") + for d in sims: + d.unset_xdp("offload") + + ret, _, err = simA.set_xdp(progB, "offload", force=True, JSON=False, + fail=False, include_stderr=True) + fail(ret == 0, "cross-ASIC program allowed") + check_extack(err, "Program bound to different device.", args) + for d in simdevB.nsims: + ret, _, err = d.set_xdp(progA, "offload", force=True, JSON=False, + 
fail=False, include_stderr=True) + fail(ret == 0, "cross-ASIC program allowed") + check_extack(err, "Program bound to different device.", args) + + start_test("Test multi-dev ASIC cross-dev map reuse...") + + mapA = bpftool("prog show %s" % (progA))[1]["map_ids"][0] + mapB = bpftool("prog show %s" % (progB))[1]["map_ids"][0] + + ret, _ = bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimB_", + dev=simB3['ifname'], + maps=["idx 0 id %d" % (mapB)], + fail=False) + fail(ret != 0, "couldn't reuse a map on the same ASIC") + rm("/sys/fs/bpf/nsimB_") + + ret, _, err = bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimA_", + dev=simA['ifname'], + maps=["idx 0 id %d" % (mapB)], + fail=False, include_stderr=True) + fail(ret == 0, "could reuse a map on a different ASIC") + fail(err.count("offload device mismatch between prog and map") == 0, + "error message missing for cross-ASIC map") + + ret, _, err = bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimB_", + dev=simB1['ifname'], + maps=["idx 0 id %d" % (mapA)], + fail=False, include_stderr=True) + fail(ret == 0, "could reuse a map on a different ASIC") + fail(err.count("offload device mismatch between prog and map") == 0, + "error message missing for cross-ASIC map") + + start_test("Test multi-dev ASIC cross-dev destruction...") + bpftool_prog_list_wait(expected=2) + + simdevA.remove() + bpftool_prog_list_wait(expected=1) + + ifnameB = bpftool("prog show %s" % (progB))[1]["dev"]["ifname"] + fail(ifnameB != simB1['ifname'], "program not bound to original device") + simB1.remove() + bpftool_prog_list_wait(expected=1) + + start_test("Test multi-dev ASIC cross-dev destruction - move...") + ifnameB = bpftool("prog show %s" % (progB))[1]["dev"]["ifname"] + fail(ifnameB not in (simB2['ifname'], simB3['ifname']), + "program not bound to remaining devices") + + simB2.remove() + ifnameB = bpftool("prog show %s" % (progB))[1]["dev"]["ifname"] + fail(ifnameB != simB3['ifname'], "program not bound to 
remaining device") + + simB3.remove() + simdevB.remove() + bpftool_prog_list_wait(expected=0) + + start_test("Test multi-dev ASIC cross-dev destruction - orphaned...") + ret, out = bpftool("prog show %s" % (progB), fail=False) + fail(ret != 0, "couldn't get information about orphaned program") + + print("%s: OK" % (os.path.basename(__file__))) + +finally: + log("Clean up...", "", level=1) + log_level_inc() + clean_up() diff --git a/tools/testing/selftests/net/cmsg_sender.c b/tools/testing/selftests/net/cmsg_sender.c index 161db24e3c..876c2db02a 100644 --- a/tools/testing/selftests/net/cmsg_sender.c +++ b/tools/testing/selftests/net/cmsg_sender.c @@ -260,15 +260,8 @@ cs_write_cmsg(int fd, struct msghdr *msg, char *cbuf, size_t cbuf_sz) SOL_IPV6, IPV6_HOPLIMIT, &opt.v6.hlimit); if (opt.txtime.ena) { - struct sock_txtime so_txtime = { - .clockid = CLOCK_MONOTONIC, - }; __u64 txtime; - if (setsockopt(fd, SOL_SOCKET, SO_TXTIME, - &so_txtime, sizeof(so_txtime))) - error(ERN_SOCKOPT, errno, "setsockopt TXTIME"); - txtime = time_start_mono.tv_sec * (1000ULL * 1000 * 1000) + time_start_mono.tv_nsec + opt.txtime.delay * 1000; @@ -284,13 +277,6 @@ cs_write_cmsg(int fd, struct msghdr *msg, char *cbuf, size_t cbuf_sz) memcpy(CMSG_DATA(cmsg), &txtime, sizeof(txtime)); } if (opt.ts.ena) { - __u32 val = SOF_TIMESTAMPING_SOFTWARE | - SOF_TIMESTAMPING_OPT_TSONLY; - - if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, - &val, sizeof(val))) - error(ERN_SOCKOPT, errno, "setsockopt TIMESTAMPING"); - cmsg = (struct cmsghdr *)(cbuf + cmsg_len); cmsg_len += CMSG_SPACE(sizeof(__u32)); if (cbuf_sz < cmsg_len) @@ -426,6 +412,24 @@ static void ca_set_sockopts(int fd) setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &opt.sockopt.priority, sizeof(opt.sockopt.priority))) error(ERN_SOCKOPT, errno, "setsockopt SO_PRIORITY"); + + if (opt.txtime.ena) { + struct sock_txtime so_txtime = { + .clockid = CLOCK_MONOTONIC, + }; + + if (setsockopt(fd, SOL_SOCKET, SO_TXTIME, + &so_txtime, sizeof(so_txtime))) + 
error(ERN_SOCKOPT, errno, "setsockopt TXTIME"); + } + if (opt.ts.ena) { + __u32 val = SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_OPT_TSONLY; + + if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, + &val, sizeof(val))) + error(ERN_SOCKOPT, errno, "setsockopt TIMESTAMPING"); + } } int main(int argc, char *argv[]) diff --git a/tools/testing/selftests/net/cmsg_time.sh b/tools/testing/selftests/net/cmsg_time.sh index af85267ad1..1d7e756644 100755 --- a/tools/testing/selftests/net/cmsg_time.sh +++ b/tools/testing/selftests/net/cmsg_time.sh @@ -66,10 +66,13 @@ for i in "-4 $TGT4" "-6 $TGT6"; do awk '/SND/ { if ($3 > 1000) print "OK"; }') check_result $? "$ts" "OK" "$prot - TXTIME abs" - ts=$(ip netns exec $NS ./cmsg_sender -p $p $i 1234 -t -d 1000 | + [ "$KSFT_MACHINE_SLOW" = yes ] && delay=8000 || delay=1000 + + ts=$(ip netns exec $NS ./cmsg_sender -p $p $i 1234 -t -d $delay | awk '/SND/ {snd=$3} /SCHED/ {sch=$3} - END { if (snd - sch > 500) print "OK"; }') + END { if (snd - sch > '$((delay/2))') print "OK"; + else print snd, "-", sch, "<", '$((delay/2))'; }') check_result $? "$ts" "OK" "$prot - TXTIME rel" done done diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index 04de7a6ba6..d4891f7a2b 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -101,3 +101,5 @@ CONFIG_NETFILTER_XT_MATCH_POLICY=m CONFIG_CRYPTO_ARIA=y CONFIG_XFRM_INTERFACE=m CONFIG_XFRM_USER=m +CONFIG_IP_NF_MATCH_RPFILTER=m +CONFIG_IP6_NF_MATCH_RPFILTER=m diff --git a/tools/testing/selftests/net/csum.c b/tools/testing/selftests/net/csum.c deleted file mode 100644 index 90eb06fefa..0000000000 --- a/tools/testing/selftests/net/csum.c +++ /dev/null @@ -1,988 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -/* Test hardware checksum offload: Rx + Tx, IPv4 + IPv6, TCP + UDP. - * - * The test runs on two machines to exercise the NIC. For this reason it - * is not integrated in kselftests. 
- * - * CMD=$((./csum -[46] -[tu] -S $SADDR -D $DADDR -[RT] -r 1 $EXTRA_ARGS)) - * - * Rx: - * - * The sender sends packets with a known checksum field using PF_INET(6) - * SOCK_RAW sockets. - * - * good packet: $CMD [-t] - * bad packet: $CMD [-t] -E - * - * The receiver reads UDP packets with a UDP socket. This is not an - * option for TCP packets ('-t'). Optionally insert an iptables filter - * to avoid these entering the real protocol stack. - * - * The receiver also reads all packets with a PF_PACKET socket, to - * observe whether both good and bad packets arrive on the host. And to - * read the optional TP_STATUS_CSUM_VALID bit. This requires setting - * option PACKET_AUXDATA, and works only for CHECKSUM_UNNECESSARY. - * - * Tx: - * - * The sender needs to build CHECKSUM_PARTIAL packets to exercise tx - * checksum offload. - * - * The sender can sends packets with a UDP socket. - * - * Optionally crafts a packet that sums up to zero to verify that the - * device writes negative zero 0xFFFF in this case to distinguish from - * 0x0000 (checksum disabled), as required by RFC 768. Hit this case - * by choosing a specific source port. - * - * good packet: $CMD -U - * zero csum: $CMD -U -Z - * - * The sender can also build packets with PF_PACKET with PACKET_VNET_HDR, - * to cover more protocols. PF_PACKET requires passing src and dst mac - * addresses. - * - * good packet: $CMD -s $smac -d $dmac -p [-t] - * - * Argument '-z' sends UDP packets with a 0x000 checksum disabled field, - * to verify that the NIC passes these packets unmodified. - * - * Argument '-e' adds a transport mode encapsulation header between - * network and transport header. This will fail for devices that parse - * headers. Should work on devices that implement protocol agnostic tx - * checksum offload (NETIF_F_HW_CSUM). - * - * Argument '-r $SEED' optionally randomizes header, payload and length - * to increase coverage between packets sent. 
SEED 1 further chooses a - * different seed for each run (and logs this for reproducibility). It - * is advised to enable this for extra coverage in continuous testing. - */ - -#define _GNU_SOURCE - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "kselftest.h" - -static bool cfg_bad_csum; -static int cfg_family = PF_INET6; -static int cfg_num_pkt = 4; -static bool cfg_do_rx = true; -static bool cfg_do_tx = true; -static bool cfg_encap; -static char *cfg_ifname = "eth0"; -static char *cfg_mac_dst; -static char *cfg_mac_src; -static int cfg_proto = IPPROTO_UDP; -static int cfg_payload_char = 'a'; -static int cfg_payload_len = 100; -static uint16_t cfg_port_dst = 34000; -static uint16_t cfg_port_src = 33000; -static uint16_t cfg_port_src_encap = 33001; -static unsigned int cfg_random_seed; -static int cfg_rcvbuf = 1 << 22; /* be able to queue large cfg_num_pkt */ -static bool cfg_send_pfpacket; -static bool cfg_send_udp; -static int cfg_timeout_ms = 2000; -static bool cfg_zero_disable; /* skip checksum: set to zero (udp only) */ -static bool cfg_zero_sum; /* create packet that adds up to zero */ - -static struct sockaddr_in cfg_daddr4 = {.sin_family = AF_INET}; -static struct sockaddr_in cfg_saddr4 = {.sin_family = AF_INET}; -static struct sockaddr_in6 cfg_daddr6 = {.sin6_family = AF_INET6}; -static struct sockaddr_in6 cfg_saddr6 = {.sin6_family = AF_INET6}; - -#define ENC_HEADER_LEN (sizeof(struct udphdr) + sizeof(struct udp_encap_hdr)) -#define MAX_HEADER_LEN (sizeof(struct ipv6hdr) + ENC_HEADER_LEN + sizeof(struct tcphdr)) -#define MAX_PAYLOAD_LEN 1024 - -/* Trivial demo encap. 
Stand-in for transport layer protocols like ESP or PSP */ -struct udp_encap_hdr { - uint8_t nexthdr; - uint8_t padding[3]; -}; - -/* Ipaddrs, for pseudo csum. Global var is ugly, pass through funcs was worse */ -static void *iph_addr_p; - -static unsigned long gettimeofday_ms(void) -{ - struct timeval tv; - - gettimeofday(&tv, NULL); - return (tv.tv_sec * 1000UL) + (tv.tv_usec / 1000UL); -} - -static uint32_t checksum_nofold(char *data, size_t len, uint32_t sum) -{ - uint16_t *words = (uint16_t *)data; - int i; - - for (i = 0; i < len / 2; i++) - sum += words[i]; - - if (len & 1) - sum += ((unsigned char *)data)[len - 1]; - - return sum; -} - -static uint16_t checksum_fold(void *data, size_t len, uint32_t sum) -{ - sum = checksum_nofold(data, len, sum); - - while (sum > 0xFFFF) - sum = (sum & 0xFFFF) + (sum >> 16); - - return ~sum; -} - -static uint16_t checksum(void *th, uint16_t proto, size_t len) -{ - uint32_t sum; - int alen; - - alen = cfg_family == PF_INET6 ? 32 : 8; - - sum = checksum_nofold(iph_addr_p, alen, 0); - sum += htons(proto); - sum += htons(len); - - /* With CHECKSUM_PARTIAL kernel expects non-inverted pseudo csum */ - if (cfg_do_tx && cfg_send_pfpacket) - return ~checksum_fold(NULL, 0, sum); - else - return checksum_fold(th, len, sum); -} - -static void *build_packet_ipv4(void *_iph, uint8_t proto, unsigned int len) -{ - struct iphdr *iph = _iph; - - memset(iph, 0, sizeof(*iph)); - - iph->version = 4; - iph->ihl = 5; - iph->ttl = 8; - iph->protocol = proto; - iph->saddr = cfg_saddr4.sin_addr.s_addr; - iph->daddr = cfg_daddr4.sin_addr.s_addr; - iph->tot_len = htons(sizeof(*iph) + len); - iph->check = checksum_fold(iph, sizeof(*iph), 0); - - iph_addr_p = &iph->saddr; - - return iph + 1; -} - -static void *build_packet_ipv6(void *_ip6h, uint8_t proto, unsigned int len) -{ - struct ipv6hdr *ip6h = _ip6h; - - memset(ip6h, 0, sizeof(*ip6h)); - - ip6h->version = 6; - ip6h->payload_len = htons(len); - ip6h->nexthdr = proto; - ip6h->hop_limit = 64; - 
ip6h->saddr = cfg_saddr6.sin6_addr; - ip6h->daddr = cfg_daddr6.sin6_addr; - - iph_addr_p = &ip6h->saddr; - - return ip6h + 1; -} - -static void *build_packet_udp(void *_uh) -{ - struct udphdr *uh = _uh; - - uh->source = htons(cfg_port_src); - uh->dest = htons(cfg_port_dst); - uh->len = htons(sizeof(*uh) + cfg_payload_len); - uh->check = 0; - - /* choose source port so that uh->check adds up to zero */ - if (cfg_zero_sum) { - uh->source = 0; - uh->source = checksum(uh, IPPROTO_UDP, sizeof(*uh) + cfg_payload_len); - - fprintf(stderr, "tx: changing sport: %hu -> %hu\n", - cfg_port_src, ntohs(uh->source)); - cfg_port_src = ntohs(uh->source); - } - - if (cfg_zero_disable) - uh->check = 0; - else - uh->check = checksum(uh, IPPROTO_UDP, sizeof(*uh) + cfg_payload_len); - - if (cfg_bad_csum) - uh->check = ~uh->check; - - fprintf(stderr, "tx: sending checksum: 0x%x\n", uh->check); - return uh + 1; -} - -static void *build_packet_tcp(void *_th) -{ - struct tcphdr *th = _th; - - th->source = htons(cfg_port_src); - th->dest = htons(cfg_port_dst); - th->doff = 5; - th->check = 0; - - th->check = checksum(th, IPPROTO_TCP, sizeof(*th) + cfg_payload_len); - - if (cfg_bad_csum) - th->check = ~th->check; - - fprintf(stderr, "tx: sending checksum: 0x%x\n", th->check); - return th + 1; -} - -static char *build_packet_udp_encap(void *_uh) -{ - struct udphdr *uh = _uh; - struct udp_encap_hdr *eh = _uh + sizeof(*uh); - - /* outer dst == inner dst, to simplify BPF filter - * outer src != inner src, to demultiplex on recv - */ - uh->dest = htons(cfg_port_dst); - uh->source = htons(cfg_port_src_encap); - uh->check = 0; - uh->len = htons(sizeof(*uh) + - sizeof(*eh) + - sizeof(struct tcphdr) + - cfg_payload_len); - - eh->nexthdr = IPPROTO_TCP; - - return build_packet_tcp(eh + 1); -} - -static char *build_packet(char *buf, int max_len, int *len) -{ - uint8_t proto; - char *off; - int tlen; - - if (cfg_random_seed) { - int *buf32 = (void *)buf; - int i; - - for (i = 0; i < (max_len / 
sizeof(int)); i++) - buf32[i] = rand(); - } else { - memset(buf, cfg_payload_char, max_len); - } - - if (cfg_proto == IPPROTO_UDP) - tlen = sizeof(struct udphdr) + cfg_payload_len; - else - tlen = sizeof(struct tcphdr) + cfg_payload_len; - - if (cfg_encap) { - proto = IPPROTO_UDP; - tlen += ENC_HEADER_LEN; - } else { - proto = cfg_proto; - } - - if (cfg_family == PF_INET) - off = build_packet_ipv4(buf, proto, tlen); - else - off = build_packet_ipv6(buf, proto, tlen); - - if (cfg_encap) - off = build_packet_udp_encap(off); - else if (cfg_proto == IPPROTO_UDP) - off = build_packet_udp(off); - else - off = build_packet_tcp(off); - - /* only pass the payload, but still compute headers for cfg_zero_sum */ - if (cfg_send_udp) { - *len = cfg_payload_len; - return off; - } - - *len = off - buf + cfg_payload_len; - return buf; -} - -static int open_inet(int ipproto, int protocol) -{ - int fd; - - fd = socket(cfg_family, ipproto, protocol); - if (fd == -1) - error(1, errno, "socket inet"); - - if (cfg_family == PF_INET6) { - /* may have been updated by cfg_zero_sum */ - cfg_saddr6.sin6_port = htons(cfg_port_src); - - if (bind(fd, (void *)&cfg_saddr6, sizeof(cfg_saddr6))) - error(1, errno, "bind dgram 6"); - if (connect(fd, (void *)&cfg_daddr6, sizeof(cfg_daddr6))) - error(1, errno, "connect dgram 6"); - } else { - /* may have been updated by cfg_zero_sum */ - cfg_saddr4.sin_port = htons(cfg_port_src); - - if (bind(fd, (void *)&cfg_saddr4, sizeof(cfg_saddr4))) - error(1, errno, "bind dgram 4"); - if (connect(fd, (void *)&cfg_daddr4, sizeof(cfg_daddr4))) - error(1, errno, "connect dgram 4"); - } - - return fd; -} - -static int open_packet(void) -{ - int fd, one = 1; - - fd = socket(PF_PACKET, SOCK_RAW, 0); - if (fd == -1) - error(1, errno, "socket packet"); - - if (setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &one, sizeof(one))) - error(1, errno, "setsockopt packet_vnet_ndr"); - - return fd; -} - -static void send_inet(int fd, const char *buf, int len) -{ - int ret; - - ret = 
write(fd, buf, len); - if (ret == -1) - error(1, errno, "write"); - if (ret != len) - error(1, 0, "write: %d", ret); -} - -static void eth_str_to_addr(const char *str, unsigned char *eth) -{ - if (sscanf(str, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", - ð[0], ð[1], ð[2], ð[3], ð[4], ð[5]) != 6) - error(1, 0, "cannot parse mac addr %s", str); -} - -static void send_packet(int fd, const char *buf, int len) -{ - struct virtio_net_hdr vh = {0}; - struct sockaddr_ll addr = {0}; - struct msghdr msg = {0}; - struct ethhdr eth; - struct iovec iov[3]; - int ret; - - addr.sll_family = AF_PACKET; - addr.sll_halen = ETH_ALEN; - addr.sll_ifindex = if_nametoindex(cfg_ifname); - if (!addr.sll_ifindex) - error(1, errno, "if_nametoindex %s", cfg_ifname); - - vh.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - if (cfg_family == PF_INET6) { - vh.csum_start = sizeof(struct ethhdr) + sizeof(struct ipv6hdr); - addr.sll_protocol = htons(ETH_P_IPV6); - } else { - vh.csum_start = sizeof(struct ethhdr) + sizeof(struct iphdr); - addr.sll_protocol = htons(ETH_P_IP); - } - - if (cfg_encap) - vh.csum_start += ENC_HEADER_LEN; - - if (cfg_proto == IPPROTO_TCP) { - vh.csum_offset = __builtin_offsetof(struct tcphdr, check); - vh.hdr_len = vh.csum_start + sizeof(struct tcphdr); - } else { - vh.csum_offset = __builtin_offsetof(struct udphdr, check); - vh.hdr_len = vh.csum_start + sizeof(struct udphdr); - } - - eth_str_to_addr(cfg_mac_src, eth.h_source); - eth_str_to_addr(cfg_mac_dst, eth.h_dest); - eth.h_proto = addr.sll_protocol; - - iov[0].iov_base = &vh; - iov[0].iov_len = sizeof(vh); - - iov[1].iov_base = ð - iov[1].iov_len = sizeof(eth); - - iov[2].iov_base = (void *)buf; - iov[2].iov_len = len; - - msg.msg_iov = iov; - msg.msg_iovlen = ARRAY_SIZE(iov); - - msg.msg_name = &addr; - msg.msg_namelen = sizeof(addr); - - ret = sendmsg(fd, &msg, 0); - if (ret == -1) - error(1, errno, "sendmsg packet"); - if (ret != sizeof(vh) + sizeof(eth) + len) - error(1, errno, "sendmsg packet: %u", ret); -} - -static int 
recv_prepare_udp(void) -{ - int fd; - - fd = socket(cfg_family, SOCK_DGRAM, 0); - if (fd == -1) - error(1, errno, "socket r"); - - if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, - &cfg_rcvbuf, sizeof(cfg_rcvbuf))) - error(1, errno, "setsockopt SO_RCVBUF r"); - - if (cfg_family == PF_INET6) { - if (bind(fd, (void *)&cfg_daddr6, sizeof(cfg_daddr6))) - error(1, errno, "bind r"); - } else { - if (bind(fd, (void *)&cfg_daddr4, sizeof(cfg_daddr4))) - error(1, errno, "bind r"); - } - - return fd; -} - -/* Filter out all traffic that is not cfg_proto with our destination port. - * - * Otherwise background noise may cause PF_PACKET receive queue overflow, - * dropping the expected packets and failing the test. - */ -static void __recv_prepare_packet_filter(int fd, int off_nexthdr, int off_dport) -{ - struct sock_filter filter[] = { - BPF_STMT(BPF_LD + BPF_B + BPF_ABS, SKF_AD_OFF + SKF_AD_PKTTYPE), - BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, PACKET_HOST, 0, 4), - BPF_STMT(BPF_LD + BPF_B + BPF_ABS, off_nexthdr), - BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, cfg_encap ? 
IPPROTO_UDP : cfg_proto, 0, 2), - BPF_STMT(BPF_LD + BPF_H + BPF_ABS, off_dport), - BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, cfg_port_dst, 1, 0), - BPF_STMT(BPF_RET + BPF_K, 0), - BPF_STMT(BPF_RET + BPF_K, 0xFFFF), - }; - struct sock_fprog prog = {}; - - prog.filter = filter; - prog.len = ARRAY_SIZE(filter); - if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog))) - error(1, errno, "setsockopt filter"); -} - -static void recv_prepare_packet_filter(int fd) -{ - const int off_dport = offsetof(struct tcphdr, dest); /* same for udp */ - - if (cfg_family == AF_INET) - __recv_prepare_packet_filter(fd, offsetof(struct iphdr, protocol), - sizeof(struct iphdr) + off_dport); - else - __recv_prepare_packet_filter(fd, offsetof(struct ipv6hdr, nexthdr), - sizeof(struct ipv6hdr) + off_dport); -} - -static void recv_prepare_packet_bind(int fd) -{ - struct sockaddr_ll laddr = {0}; - - laddr.sll_family = AF_PACKET; - - if (cfg_family == PF_INET) - laddr.sll_protocol = htons(ETH_P_IP); - else - laddr.sll_protocol = htons(ETH_P_IPV6); - - laddr.sll_ifindex = if_nametoindex(cfg_ifname); - if (!laddr.sll_ifindex) - error(1, 0, "if_nametoindex %s", cfg_ifname); - - if (bind(fd, (void *)&laddr, sizeof(laddr))) - error(1, errno, "bind pf_packet"); -} - -static int recv_prepare_packet(void) -{ - int fd, one = 1; - - fd = socket(PF_PACKET, SOCK_DGRAM, 0); - if (fd == -1) - error(1, errno, "socket p"); - - if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, - &cfg_rcvbuf, sizeof(cfg_rcvbuf))) - error(1, errno, "setsockopt SO_RCVBUF p"); - - /* enable auxdata to recv checksum status (valid vs unknown) */ - if (setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one))) - error(1, errno, "setsockopt auxdata"); - - /* install filter to restrict packet flow to match */ - recv_prepare_packet_filter(fd); - - /* bind to address family to start packet flow */ - recv_prepare_packet_bind(fd); - - return fd; -} - -static int recv_udp(int fd) -{ - static char buf[MAX_PAYLOAD_LEN]; - int ret, count = 
0; - - while (1) { - ret = recv(fd, buf, sizeof(buf), MSG_DONTWAIT); - if (ret == -1 && errno == EAGAIN) - break; - if (ret == -1) - error(1, errno, "recv r"); - - fprintf(stderr, "rx: udp: len=%u\n", ret); - count++; - } - - return count; -} - -static int recv_verify_csum(void *th, int len, uint16_t sport, uint16_t csum_field) -{ - uint16_t csum; - - csum = checksum(th, cfg_proto, len); - - fprintf(stderr, "rx: pkt: sport=%hu len=%u csum=0x%hx verify=0x%hx\n", - sport, len, csum_field, csum); - - /* csum must be zero unless cfg_bad_csum indicates bad csum */ - if (csum && !cfg_bad_csum) { - fprintf(stderr, "pkt: bad csum\n"); - return 1; - } else if (cfg_bad_csum && !csum) { - fprintf(stderr, "pkt: good csum, while bad expected\n"); - return 1; - } - - if (cfg_zero_sum && csum_field != 0xFFFF) { - fprintf(stderr, "pkt: zero csum: field should be 0xFFFF, is 0x%hx\n", csum_field); - return 1; - } - - return 0; -} - -static int recv_verify_packet_tcp(void *th, int len) -{ - struct tcphdr *tcph = th; - - if (len < sizeof(*tcph) || tcph->dest != htons(cfg_port_dst)) - return -1; - - return recv_verify_csum(th, len, ntohs(tcph->source), tcph->check); -} - -static int recv_verify_packet_udp_encap(void *th, int len) -{ - struct udp_encap_hdr *eh = th; - - if (len < sizeof(*eh) || eh->nexthdr != IPPROTO_TCP) - return -1; - - return recv_verify_packet_tcp(eh + 1, len - sizeof(*eh)); -} - -static int recv_verify_packet_udp(void *th, int len) -{ - struct udphdr *udph = th; - - if (len < sizeof(*udph)) - return -1; - - if (udph->dest != htons(cfg_port_dst)) - return -1; - - if (udph->source == htons(cfg_port_src_encap)) - return recv_verify_packet_udp_encap(udph + 1, - len - sizeof(*udph)); - - return recv_verify_csum(th, len, ntohs(udph->source), udph->check); -} - -static int recv_verify_packet_ipv4(void *nh, int len) -{ - struct iphdr *iph = nh; - uint16_t proto = cfg_encap ? 
IPPROTO_UDP : cfg_proto; - - if (len < sizeof(*iph) || iph->protocol != proto) - return -1; - - iph_addr_p = &iph->saddr; - if (proto == IPPROTO_TCP) - return recv_verify_packet_tcp(iph + 1, len - sizeof(*iph)); - else - return recv_verify_packet_udp(iph + 1, len - sizeof(*iph)); -} - -static int recv_verify_packet_ipv6(void *nh, int len) -{ - struct ipv6hdr *ip6h = nh; - uint16_t proto = cfg_encap ? IPPROTO_UDP : cfg_proto; - - if (len < sizeof(*ip6h) || ip6h->nexthdr != proto) - return -1; - - iph_addr_p = &ip6h->saddr; - - if (proto == IPPROTO_TCP) - return recv_verify_packet_tcp(ip6h + 1, len - sizeof(*ip6h)); - else - return recv_verify_packet_udp(ip6h + 1, len - sizeof(*ip6h)); -} - -/* return whether auxdata includes TP_STATUS_CSUM_VALID */ -static bool recv_verify_packet_csum(struct msghdr *msg) -{ - struct tpacket_auxdata *aux = NULL; - struct cmsghdr *cm; - - if (msg->msg_flags & MSG_CTRUNC) - error(1, 0, "cmsg: truncated"); - - for (cm = CMSG_FIRSTHDR(msg); cm; cm = CMSG_NXTHDR(msg, cm)) { - if (cm->cmsg_level != SOL_PACKET || - cm->cmsg_type != PACKET_AUXDATA) - error(1, 0, "cmsg: level=%d type=%d\n", - cm->cmsg_level, cm->cmsg_type); - - if (cm->cmsg_len != CMSG_LEN(sizeof(struct tpacket_auxdata))) - error(1, 0, "cmsg: len=%lu expected=%lu", - cm->cmsg_len, CMSG_LEN(sizeof(struct tpacket_auxdata))); - - aux = (void *)CMSG_DATA(cm); - } - - if (!aux) - error(1, 0, "cmsg: no auxdata"); - - return aux->tp_status & TP_STATUS_CSUM_VALID; -} - -static int recv_packet(int fd) -{ - static char _buf[MAX_HEADER_LEN + MAX_PAYLOAD_LEN]; - unsigned long total = 0, bad_csums = 0, bad_validations = 0; - char ctrl[CMSG_SPACE(sizeof(struct tpacket_auxdata))]; - struct pkt *buf = (void *)_buf; - struct msghdr msg = {0}; - struct iovec iov; - int len, ret; - - iov.iov_base = _buf; - iov.iov_len = sizeof(_buf); - - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - - msg.msg_control = ctrl; - msg.msg_controllen = sizeof(ctrl); - - while (1) { - msg.msg_flags = 0; - - len = 
recvmsg(fd, &msg, MSG_DONTWAIT); - if (len == -1 && errno == EAGAIN) - break; - if (len == -1) - error(1, errno, "recv p"); - - if (cfg_family == PF_INET6) - ret = recv_verify_packet_ipv6(buf, len); - else - ret = recv_verify_packet_ipv4(buf, len); - - if (ret == -1 /* skip: non-matching */) - continue; - - total++; - if (ret == 1) - bad_csums++; - - /* Fail if kernel returns valid for known bad csum. - * Do not fail if kernel does not validate a good csum: - * Absence of validation does not imply invalid. - */ - if (recv_verify_packet_csum(&msg) && cfg_bad_csum) { - fprintf(stderr, "cmsg: expected bad csum, pf_packet returns valid\n"); - bad_validations++; - } - } - - if (bad_csums || bad_validations) - error(1, 0, "rx: errors at pf_packet: total=%lu bad_csums=%lu bad_valids=%lu\n", - total, bad_csums, bad_validations); - - return total; -} - -static void parse_args(int argc, char *const argv[]) -{ - const char *daddr = NULL, *saddr = NULL; - int c; - - while ((c = getopt(argc, argv, "46d:D:eEi:l:L:n:r:PRs:S:tTuUzZ")) != -1) { - switch (c) { - case '4': - cfg_family = PF_INET; - break; - case '6': - cfg_family = PF_INET6; - break; - case 'd': - cfg_mac_dst = optarg; - break; - case 'D': - daddr = optarg; - break; - case 'e': - cfg_encap = true; - break; - case 'E': - cfg_bad_csum = true; - break; - case 'i': - cfg_ifname = optarg; - break; - case 'l': - cfg_payload_len = strtol(optarg, NULL, 0); - break; - case 'L': - cfg_timeout_ms = strtol(optarg, NULL, 0) * 1000; - break; - case 'n': - cfg_num_pkt = strtol(optarg, NULL, 0); - break; - case 'r': - cfg_random_seed = strtol(optarg, NULL, 0); - break; - case 'P': - cfg_send_pfpacket = true; - break; - case 'R': - /* only Rx: used with two machine tests */ - cfg_do_tx = false; - break; - case 's': - cfg_mac_src = optarg; - break; - case 'S': - saddr = optarg; - break; - case 't': - cfg_proto = IPPROTO_TCP; - break; - case 'T': - /* only Tx: used with two machine tests */ - cfg_do_rx = false; - break; - case 'u': - 
cfg_proto = IPPROTO_UDP; - break; - case 'U': - /* send using real udp socket, - * to exercise tx checksum offload - */ - cfg_send_udp = true; - break; - case 'z': - cfg_zero_disable = true; - break; - case 'Z': - cfg_zero_sum = true; - break; - default: - error(1, 0, "unknown arg %c", c); - } - } - - if (!daddr || !saddr) - error(1, 0, "Must pass -D and -S "); - - if (cfg_do_tx && cfg_send_pfpacket && (!cfg_mac_src || !cfg_mac_dst)) - error(1, 0, "Transmit with pf_packet requires mac addresses"); - - if (cfg_payload_len > MAX_PAYLOAD_LEN) - error(1, 0, "Payload length exceeds max"); - - if (cfg_proto != IPPROTO_UDP && (cfg_zero_sum || cfg_zero_disable)) - error(1, 0, "Only UDP supports zero csum"); - - if (cfg_zero_sum && !cfg_send_udp) - error(1, 0, "Zero checksum conversion requires -U for tx csum offload"); - if (cfg_zero_sum && cfg_bad_csum) - error(1, 0, "Cannot combine zero checksum conversion and invalid checksum"); - if (cfg_zero_sum && cfg_random_seed) - error(1, 0, "Cannot combine zero checksum conversion with randomization"); - - if (cfg_family == PF_INET6) { - cfg_saddr6.sin6_port = htons(cfg_port_src); - cfg_daddr6.sin6_port = htons(cfg_port_dst); - - if (inet_pton(cfg_family, daddr, &cfg_daddr6.sin6_addr) != 1) - error(1, errno, "Cannot parse ipv6 -D"); - if (inet_pton(cfg_family, saddr, &cfg_saddr6.sin6_addr) != 1) - error(1, errno, "Cannot parse ipv6 -S"); - } else { - cfg_saddr4.sin_port = htons(cfg_port_src); - cfg_daddr4.sin_port = htons(cfg_port_dst); - - if (inet_pton(cfg_family, daddr, &cfg_daddr4.sin_addr) != 1) - error(1, errno, "Cannot parse ipv4 -D"); - if (inet_pton(cfg_family, saddr, &cfg_saddr4.sin_addr) != 1) - error(1, errno, "Cannot parse ipv4 -S"); - } - - if (cfg_do_tx && cfg_random_seed) { - /* special case: time-based seed */ - if (cfg_random_seed == 1) - cfg_random_seed = (unsigned int)gettimeofday_ms(); - srand(cfg_random_seed); - fprintf(stderr, "randomization seed: %u\n", cfg_random_seed); - } -} - -static void do_tx(void) 
-{ - static char _buf[MAX_HEADER_LEN + MAX_PAYLOAD_LEN]; - char *buf; - int fd, len, i; - - buf = build_packet(_buf, sizeof(_buf), &len); - - if (cfg_send_pfpacket) - fd = open_packet(); - else if (cfg_send_udp) - fd = open_inet(SOCK_DGRAM, 0); - else - fd = open_inet(SOCK_RAW, IPPROTO_RAW); - - for (i = 0; i < cfg_num_pkt; i++) { - if (cfg_send_pfpacket) - send_packet(fd, buf, len); - else - send_inet(fd, buf, len); - - /* randomize each packet individually to increase coverage */ - if (cfg_random_seed) { - cfg_payload_len = rand() % MAX_PAYLOAD_LEN; - buf = build_packet(_buf, sizeof(_buf), &len); - } - } - - if (close(fd)) - error(1, errno, "close tx"); -} - -static void do_rx(int fdp, int fdr) -{ - unsigned long count_udp = 0, count_pkt = 0; - long tleft, tstop; - struct pollfd pfd; - - tstop = gettimeofday_ms() + cfg_timeout_ms; - tleft = cfg_timeout_ms; - - do { - pfd.events = POLLIN; - pfd.fd = fdp; - if (poll(&pfd, 1, tleft) == -1) - error(1, errno, "poll"); - - if (pfd.revents & POLLIN) - count_pkt += recv_packet(fdp); - - if (cfg_proto == IPPROTO_UDP) - count_udp += recv_udp(fdr); - - tleft = tstop - gettimeofday_ms(); - } while (tleft > 0); - - if (close(fdr)) - error(1, errno, "close r"); - if (close(fdp)) - error(1, errno, "close p"); - - if (count_pkt < cfg_num_pkt) - error(1, 0, "rx: missing packets at pf_packet: %lu < %u", - count_pkt, cfg_num_pkt); - - if (cfg_proto == IPPROTO_UDP) { - if (cfg_bad_csum && count_udp) - error(1, 0, "rx: unexpected packets at udp"); - if (!cfg_bad_csum && !count_udp) - error(1, 0, "rx: missing packets at udp"); - } -} - -int main(int argc, char *const argv[]) -{ - int fdp = -1, fdr = -1; /* -1 to silence -Wmaybe-uninitialized */ - - parse_args(argc, argv); - - /* open receive sockets before transmitting */ - if (cfg_do_rx) { - fdp = recv_prepare_packet(); - fdr = recv_prepare_udp(); - } - - if (cfg_do_tx) - do_tx(); - - if (cfg_do_rx) - do_rx(fdp, fdr); - - fprintf(stderr, "OK\n"); - return 0; -} diff --git 
a/tools/testing/selftests/net/devlink_port_split.py b/tools/testing/selftests/net/devlink_port_split.py deleted file mode 100755 index 2d84c7a0be..0000000000 --- a/tools/testing/selftests/net/devlink_port_split.py +++ /dev/null @@ -1,309 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-License-Identifier: GPL-2.0 - -from subprocess import PIPE, Popen -import json -import time -import argparse -import collections -import sys - -# -# Test port split configuration using devlink-port lanes attribute. -# The test is skipped in case the attribute is not available. -# -# First, check that all the ports with 1 lane fail to split. -# Second, check that all the ports with more than 1 lane can be split -# to all valid configurations (e.g., split to 2, split to 4 etc.) -# - - -# Kselftest framework requirement - SKIP code is 4 -KSFT_SKIP=4 -Port = collections.namedtuple('Port', 'bus_info name') - - -def run_command(cmd, should_fail=False): - """ - Run a command in subprocess. - Return: Tuple of (stdout, stderr). - """ - - p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True) - stdout, stderr = p.communicate() - stdout, stderr = stdout.decode(), stderr.decode() - - if stderr != "" and not should_fail: - print("Error sending command: %s" % cmd) - print(stdout) - print(stderr) - return stdout, stderr - - -class devlink_ports(object): - """ - Class that holds information on the devlink ports, required to the tests; - if_names: A list of interfaces in the devlink ports. - """ - - def get_if_names(dev): - """ - Get a list of physical devlink ports. - Return: Array of tuples (bus_info/port, if_name). 
- """ - - arr = [] - - cmd = "devlink -j port show" - stdout, stderr = run_command(cmd) - assert stderr == "" - ports = json.loads(stdout)['port'] - - validate_devlink_output(ports, 'flavour') - - for port in ports: - if dev in port: - if ports[port]['flavour'] == 'physical': - arr.append(Port(bus_info=port, name=ports[port]['netdev'])) - - return arr - - def __init__(self, dev): - self.if_names = devlink_ports.get_if_names(dev) - - -def get_max_lanes(port): - """ - Get the $port's maximum number of lanes. - Return: number of lanes, e.g. 1, 2, 4 and 8. - """ - - cmd = "devlink -j port show %s" % port - stdout, stderr = run_command(cmd) - assert stderr == "" - values = list(json.loads(stdout)['port'].values())[0] - - if 'lanes' in values: - lanes = values['lanes'] - else: - lanes = 0 - return lanes - - -def get_split_ability(port): - """ - Get the $port split ability. - Return: split ability, true or false. - """ - - cmd = "devlink -j port show %s" % port.name - stdout, stderr = run_command(cmd) - assert stderr == "" - values = list(json.loads(stdout)['port'].values())[0] - - return values['splittable'] - - -def split(k, port, should_fail=False): - """ - Split $port into $k ports. - If should_fail == True, the split should fail. Otherwise, should pass. - Return: Array of sub ports after splitting. - If the $port wasn't split, the array will be empty. - """ - - cmd = "devlink port split %s count %s" % (port.bus_info, k) - stdout, stderr = run_command(cmd, should_fail=should_fail) - - if should_fail: - if not test(stderr != "", "%s is unsplittable" % port.name): - print("split an unsplittable port %s" % port.name) - return create_split_group(port, k) - else: - if stderr == "": - return create_split_group(port, k) - print("didn't split a splittable port %s" % port.name) - - return [] - - -def unsplit(port): - """ - Unsplit $port. 
- """ - - cmd = "devlink port unsplit %s" % port - stdout, stderr = run_command(cmd) - test(stderr == "", "Unsplit port %s" % port) - - -def exists(port, dev): - """ - Check if $port exists in the devlink ports. - Return: True is so, False otherwise. - """ - - return any(dev_port.name == port - for dev_port in devlink_ports.get_if_names(dev)) - - -def exists_and_lanes(ports, lanes, dev): - """ - Check if every port in the list $ports exists in the devlink ports and has - $lanes number of lanes after splitting. - Return: True if both are True, False otherwise. - """ - - for port in ports: - max_lanes = get_max_lanes(port) - if not exists(port, dev): - print("port %s doesn't exist in devlink ports" % port) - return False - if max_lanes != lanes: - print("port %s has %d lanes, but %s were expected" - % (port, lanes, max_lanes)) - return False - return True - - -def test(cond, msg): - """ - Check $cond and print a message accordingly. - Return: True is pass, False otherwise. - """ - - if cond: - print("TEST: %-60s [ OK ]" % msg) - else: - print("TEST: %-60s [FAIL]" % msg) - - return cond - - -def create_split_group(port, k): - """ - Create the split group for $port. - Return: Array with $k elements, which are the split port group. - """ - - return list(port.name + "s" + str(i) for i in range(k)) - - -def split_unsplittable_port(port, k): - """ - Test that splitting of unsplittable port fails. - """ - - # split to max - new_split_group = split(k, port, should_fail=True) - - if new_split_group != []: - unsplit(port.bus_info) - - -def split_splittable_port(port, k, lanes, dev): - """ - Test that splitting of splittable port passes correctly. - """ - - new_split_group = split(k, port) - - # Once the split command ends, it takes some time to the sub ifaces' - # to get their names. Use udevadm to continue only when all current udev - # events are handled. 
- cmd = "udevadm settle" - stdout, stderr = run_command(cmd) - assert stderr == "" - - if new_split_group != []: - test(exists_and_lanes(new_split_group, lanes/k, dev), - "split port %s into %s" % (port.name, k)) - - unsplit(port.bus_info) - - -def validate_devlink_output(devlink_data, target_property=None): - """ - Determine if test should be skipped by checking: - 1. devlink_data contains values - 2. The target_property exist in devlink_data - """ - skip_reason = None - if any(devlink_data.values()): - if target_property: - skip_reason = "{} not found in devlink output, test skipped".format(target_property) - for key in devlink_data: - if target_property in devlink_data[key]: - skip_reason = None - else: - skip_reason = 'devlink output is empty, test skipped' - - if skip_reason: - print(skip_reason) - sys.exit(KSFT_SKIP) - - -def make_parser(): - parser = argparse.ArgumentParser(description='A test for port splitting.') - parser.add_argument('--dev', - help='The devlink handle of the device under test. 
' + - 'The default is the first registered devlink ' + - 'handle.') - - return parser - - -def main(cmdline=None): - parser = make_parser() - args = parser.parse_args(cmdline) - - dev = args.dev - if not dev: - cmd = "devlink -j dev show" - stdout, stderr = run_command(cmd) - assert stderr == "" - - validate_devlink_output(json.loads(stdout)) - devs = json.loads(stdout)['dev'] - dev = list(devs.keys())[0] - - cmd = "devlink dev show %s" % dev - stdout, stderr = run_command(cmd) - if stderr != "": - print("devlink device %s can not be found" % dev) - sys.exit(1) - - ports = devlink_ports(dev) - - found_max_lanes = False - for port in ports.if_names: - max_lanes = get_max_lanes(port.name) - - # If max lanes is 0, do not test port splitting at all - if max_lanes == 0: - continue - - # If 1 lane, shouldn't be able to split - elif max_lanes == 1: - test(not get_split_ability(port), - "%s should not be able to split" % port.name) - split_unsplittable_port(port, max_lanes) - - # Else, splitting should pass and all the split ports should exist. - else: - lane = max_lanes - test(get_split_ability(port), - "%s should be able to split" % port.name) - while lane > 1: - split_splittable_port(port, lane, max_lanes, dev) - - lane //= 2 - found_max_lanes = True - - if not found_max_lanes: - print(f"Test not started, no port of device {dev} reports max_lanes") - sys.exit(KSFT_SKIP) - - -if __name__ == "__main__": - main() diff --git a/tools/testing/selftests/net/epoll_busy_poll.c b/tools/testing/selftests/net/epoll_busy_poll.c new file mode 100644 index 0000000000..16e457c2f8 --- /dev/null +++ b/tools/testing/selftests/net/epoll_busy_poll.c @@ -0,0 +1,320 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* Basic per-epoll context busy poll test. 
+ * + * Only tests the ioctls, but should be expanded to test two connected hosts in + * the future + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "../kselftest_harness.h" + +/* if the headers haven't been updated, we need to define some things */ +#if !defined(EPOLL_IOC_TYPE) +struct epoll_params { + uint32_t busy_poll_usecs; + uint16_t busy_poll_budget; + uint8_t prefer_busy_poll; + + /* pad the struct to a multiple of 64bits */ + uint8_t __pad; +}; + +#define EPOLL_IOC_TYPE 0x8A +#define EPIOCSPARAMS _IOW(EPOLL_IOC_TYPE, 0x01, struct epoll_params) +#define EPIOCGPARAMS _IOR(EPOLL_IOC_TYPE, 0x02, struct epoll_params) +#endif + +FIXTURE(invalid_fd) +{ + int invalid_fd; + struct epoll_params params; +}; + +FIXTURE_SETUP(invalid_fd) +{ + int ret; + + ret = socket(AF_UNIX, SOCK_DGRAM, 0); + EXPECT_NE(-1, ret) + TH_LOG("error creating unix socket"); + + self->invalid_fd = ret; +} + +FIXTURE_TEARDOWN(invalid_fd) +{ + int ret; + + ret = close(self->invalid_fd); + EXPECT_EQ(0, ret); +} + +TEST_F(invalid_fd, test_invalid_fd) +{ + int ret; + + ret = ioctl(self->invalid_fd, EPIOCGPARAMS, &self->params); + + EXPECT_EQ(-1, ret) + TH_LOG("EPIOCGPARAMS on invalid epoll FD should error"); + + EXPECT_EQ(ENOTTY, errno) + TH_LOG("EPIOCGPARAMS on invalid epoll FD should set errno to ENOTTY"); + + memset(&self->params, 0, sizeof(struct epoll_params)); + + ret = ioctl(self->invalid_fd, EPIOCSPARAMS, &self->params); + + EXPECT_EQ(-1, ret) + TH_LOG("EPIOCSPARAMS on invalid epoll FD should error"); + + EXPECT_EQ(ENOTTY, errno) + TH_LOG("EPIOCSPARAMS on invalid epoll FD should set errno to ENOTTY"); +} + +FIXTURE(epoll_busy_poll) +{ + int fd; + struct epoll_params params; + struct epoll_params *invalid_params; + cap_t caps; +}; + +FIXTURE_SETUP(epoll_busy_poll) +{ + int ret; + + ret = epoll_create1(0); + EXPECT_NE(-1, ret) + TH_LOG("epoll_create1 failed?"); + + self->fd = 
ret; + + self->caps = cap_get_proc(); + EXPECT_NE(NULL, self->caps); +} + +FIXTURE_TEARDOWN(epoll_busy_poll) +{ + int ret; + + ret = close(self->fd); + EXPECT_EQ(0, ret); + + ret = cap_free(self->caps); + EXPECT_NE(-1, ret) + TH_LOG("unable to free capabilities"); +} + +TEST_F(epoll_busy_poll, test_get_params) +{ + /* begin by getting the epoll params from the kernel + * + * the default should be default and all fields should be zero'd by the + * kernel, so set params fields to garbage to test this. + */ + int ret = 0; + + self->params.busy_poll_usecs = 0xff; + self->params.busy_poll_budget = 0xff; + self->params.prefer_busy_poll = 1; + self->params.__pad = 0xf; + + ret = ioctl(self->fd, EPIOCGPARAMS, &self->params); + EXPECT_EQ(0, ret) + TH_LOG("ioctl EPIOCGPARAMS should succeed"); + + EXPECT_EQ(0, self->params.busy_poll_usecs) + TH_LOG("EPIOCGPARAMS busy_poll_usecs should have been 0"); + + EXPECT_EQ(0, self->params.busy_poll_budget) + TH_LOG("EPIOCGPARAMS busy_poll_budget should have been 0"); + + EXPECT_EQ(0, self->params.prefer_busy_poll) + TH_LOG("EPIOCGPARAMS prefer_busy_poll should have been 0"); + + EXPECT_EQ(0, self->params.__pad) + TH_LOG("EPIOCGPARAMS __pad should have been 0"); + + self->invalid_params = (struct epoll_params *)0xdeadbeef; + ret = ioctl(self->fd, EPIOCGPARAMS, self->invalid_params); + + EXPECT_EQ(-1, ret) + TH_LOG("EPIOCGPARAMS should error with invalid params"); + + EXPECT_EQ(EFAULT, errno) + TH_LOG("EPIOCGPARAMS with invalid params should set errno to EFAULT"); +} + +TEST_F(epoll_busy_poll, test_set_invalid) +{ + int ret; + + memset(&self->params, 0, sizeof(struct epoll_params)); + + self->params.__pad = 1; + + ret = ioctl(self->fd, EPIOCSPARAMS, &self->params); + + EXPECT_EQ(-1, ret) + TH_LOG("EPIOCSPARAMS non-zero __pad should error"); + + EXPECT_EQ(EINVAL, errno) + TH_LOG("EPIOCSPARAMS non-zero __pad errno should be EINVAL"); + + self->params.__pad = 0; + self->params.busy_poll_usecs = (uint32_t)INT_MAX + 1; + + ret = 
ioctl(self->fd, EPIOCSPARAMS, &self->params); + + EXPECT_EQ(-1, ret) + TH_LOG("EPIOCSPARAMS should error busy_poll_usecs > S32_MAX"); + + EXPECT_EQ(EINVAL, errno) + TH_LOG("EPIOCSPARAMS busy_poll_usecs > S32_MAX errno should be EINVAL"); + + self->params.__pad = 0; + self->params.busy_poll_usecs = 32; + self->params.prefer_busy_poll = 2; + + ret = ioctl(self->fd, EPIOCSPARAMS, &self->params); + + EXPECT_EQ(-1, ret) + TH_LOG("EPIOCSPARAMS should error prefer_busy_poll > 1"); + + EXPECT_EQ(EINVAL, errno) + TH_LOG("EPIOCSPARAMS prefer_busy_poll > 1 errno should be EINVAL"); + + self->params.__pad = 0; + self->params.busy_poll_usecs = 32; + self->params.prefer_busy_poll = 1; + + /* set budget well above kernel's NAPI_POLL_WEIGHT of 64 */ + self->params.busy_poll_budget = UINT16_MAX; + + /* test harness should run with CAP_NET_ADMIN, but let's make sure */ + cap_flag_value_t tmp; + + ret = cap_get_flag(self->caps, CAP_NET_ADMIN, CAP_EFFECTIVE, &tmp); + EXPECT_EQ(0, ret) + TH_LOG("unable to get CAP_NET_ADMIN cap flag"); + + EXPECT_EQ(CAP_SET, tmp) + TH_LOG("expecting CAP_NET_ADMIN to be set for the test harness"); + + /* at this point we know CAP_NET_ADMIN is available, so setting the + * params with a busy_poll_budget > NAPI_POLL_WEIGHT should succeed + */ + ret = ioctl(self->fd, EPIOCSPARAMS, &self->params); + + EXPECT_EQ(0, ret) + TH_LOG("EPIOCSPARAMS should allow busy_poll_budget > NAPI_POLL_WEIGHT"); + + /* remove CAP_NET_ADMIN from our effective set */ + cap_value_t net_admin[] = { CAP_NET_ADMIN }; + + ret = cap_set_flag(self->caps, CAP_EFFECTIVE, 1, net_admin, CAP_CLEAR); + EXPECT_EQ(0, ret) + TH_LOG("couldn't clear CAP_NET_ADMIN"); + + ret = cap_set_proc(self->caps); + EXPECT_EQ(0, ret) + TH_LOG("cap_set_proc should drop CAP_NET_ADMIN"); + + /* this is now expected to fail */ + ret = ioctl(self->fd, EPIOCSPARAMS, &self->params); + + EXPECT_EQ(-1, ret) + TH_LOG("EPIOCSPARAMS should error busy_poll_budget > NAPI_POLL_WEIGHT"); + + EXPECT_EQ(EPERM, errno) + 
TH_LOG("EPIOCSPARAMS errno should be EPERM busy_poll_budget > NAPI_POLL_WEIGHT"); + + /* restore CAP_NET_ADMIN to our effective set */ + ret = cap_set_flag(self->caps, CAP_EFFECTIVE, 1, net_admin, CAP_SET); + EXPECT_EQ(0, ret) + TH_LOG("couldn't restore CAP_NET_ADMIN"); + + ret = cap_set_proc(self->caps); + EXPECT_EQ(0, ret) + TH_LOG("cap_set_proc should set CAP_NET_ADMIN"); + + self->invalid_params = (struct epoll_params *)0xdeadbeef; + ret = ioctl(self->fd, EPIOCSPARAMS, self->invalid_params); + + EXPECT_EQ(-1, ret) + TH_LOG("EPIOCSPARAMS should error when epoll_params is invalid"); + + EXPECT_EQ(EFAULT, errno) + TH_LOG("EPIOCSPARAMS should set errno to EFAULT when epoll_params is invalid"); +} + +TEST_F(epoll_busy_poll, test_set_and_get_valid) +{ + int ret; + + memset(&self->params, 0, sizeof(struct epoll_params)); + + self->params.busy_poll_usecs = 25; + self->params.busy_poll_budget = 16; + self->params.prefer_busy_poll = 1; + + ret = ioctl(self->fd, EPIOCSPARAMS, &self->params); + + EXPECT_EQ(0, ret) + TH_LOG("EPIOCSPARAMS with valid params should not error"); + + /* check that the kernel returns the same values back */ + + memset(&self->params, 0, sizeof(struct epoll_params)); + + ret = ioctl(self->fd, EPIOCGPARAMS, &self->params); + + EXPECT_EQ(0, ret) + TH_LOG("EPIOCGPARAMS should not error"); + + EXPECT_EQ(25, self->params.busy_poll_usecs) + TH_LOG("params.busy_poll_usecs incorrect"); + + EXPECT_EQ(16, self->params.busy_poll_budget) + TH_LOG("params.busy_poll_budget incorrect"); + + EXPECT_EQ(1, self->params.prefer_busy_poll) + TH_LOG("params.prefer_busy_poll incorrect"); + + EXPECT_EQ(0, self->params.__pad) + TH_LOG("params.__pad was not 0"); +} + +TEST_F(epoll_busy_poll, test_invalid_ioctl) +{ + int invalid_ioctl = EPIOCGPARAMS + 10; + int ret; + + ret = ioctl(self->fd, invalid_ioctl, &self->params); + + EXPECT_EQ(-1, ret) + TH_LOG("invalid ioctl should return error"); + + EXPECT_EQ(EINVAL, errno) + TH_LOG("invalid ioctl should set errno to EINVAL"); +} 
+ +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh index 51157a5559..7c01f58a20 100755 --- a/tools/testing/selftests/net/fib_rule_tests.sh +++ b/tools/testing/selftests/net/fib_rule_tests.sh @@ -9,6 +9,7 @@ PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no} RTABLE=100 RTABLE_PEER=101 +RTABLE_VRF=102 GW_IP4=192.51.100.2 SRC_IP=192.51.100.3 GW_IP6=2001:db8:1::2 @@ -17,7 +18,14 @@ SRC_IP6=2001:db8:1::3 DEV_ADDR=192.51.100.1 DEV_ADDR6=2001:db8:1::1 DEV=dummy0 -TESTS="fib_rule6 fib_rule4 fib_rule6_connect fib_rule4_connect" +TESTS=" + fib_rule6 + fib_rule4 + fib_rule6_connect + fib_rule4_connect + fib_rule6_vrf + fib_rule4_vrf +" SELFTEST_PATH="" @@ -27,13 +35,18 @@ log_test() local expected=$2 local msg="$3" + $IP rule show | grep -q l3mdev + if [ $? -eq 0 ]; then + msg="$msg (VRF)" + fi + if [ ${rc} -eq ${expected} ]; then nsuccess=$((nsuccess+1)) - printf "\n TEST: %-50s [ OK ]\n" "${msg}" + printf "\n TEST: %-60s [ OK ]\n" "${msg}" else ret=1 nfail=$((nfail+1)) - printf "\n TEST: %-50s [FAIL]\n" "${msg}" + printf "\n TEST: %-60s [FAIL]\n" "${msg}" if [ "${PAUSE_ON_FAIL}" = "yes" ]; then echo echo "hit enter to continue, 'q' to quit" @@ -130,6 +143,17 @@ cleanup_peer() ip netns del $peerns } +setup_vrf() +{ + $IP link add name vrf0 up type vrf table $RTABLE_VRF + $IP link set dev $DEV master vrf0 +} + +cleanup_vrf() +{ + $IP link del dev vrf0 +} + fib_check_iproute_support() { ip rule help 2>&1 | grep -q $1 @@ -248,6 +272,13 @@ fib_rule6_test() fi } +fib_rule6_vrf_test() +{ + setup_vrf + fib_rule6_test + cleanup_vrf +} + # Verify that the IPV6_TCLASS option of UDPv6 and TCPv6 sockets is properly # taken into account when connecting the socket and when sending packets. 
fib_rule6_connect_test() @@ -385,6 +416,13 @@ fib_rule4_test() fi } +fib_rule4_vrf_test() +{ + setup_vrf + fib_rule4_test + cleanup_vrf +} + # Verify that the IP_TOS option of UDPv4 and TCPv4 sockets is properly taken # into account when connecting the socket and when sending packets. fib_rule4_connect_test() @@ -467,6 +505,8 @@ do fib_rule4_test|fib_rule4) fib_rule4_test;; fib_rule6_connect_test|fib_rule6_connect) fib_rule6_connect_test;; fib_rule4_connect_test|fib_rule4_connect) fib_rule4_connect_test;; + fib_rule6_vrf_test|fib_rule6_vrf) fib_rule6_vrf_test;; + fib_rule4_vrf_test|fib_rule4_vrf) fib_rule4_vrf_test;; help) echo "Test names: $TESTS"; exit 0;; diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh index 73895711cd..5f3c28fc86 100755 --- a/tools/testing/selftests/net/fib_tests.sh +++ b/tools/testing/selftests/net/fib_tests.sh @@ -1737,53 +1737,53 @@ ipv4_rt_dsfield() # DSCP 0x10 should match the specific route, no matter the ECN bits $IP route get fibmatch 172.16.102.1 dsfield 0x10 | \ - grep -q "via 172.16.103.2" + grep -q "172.16.102.0/24 tos 0x10 via 172.16.103.2" log_test $? 0 "IPv4 route with DSCP and ECN:Not-ECT" $IP route get fibmatch 172.16.102.1 dsfield 0x11 | \ - grep -q "via 172.16.103.2" + grep -q "172.16.102.0/24 tos 0x10 via 172.16.103.2" log_test $? 0 "IPv4 route with DSCP and ECN:ECT(1)" $IP route get fibmatch 172.16.102.1 dsfield 0x12 | \ - grep -q "via 172.16.103.2" + grep -q "172.16.102.0/24 tos 0x10 via 172.16.103.2" log_test $? 0 "IPv4 route with DSCP and ECN:ECT(0)" $IP route get fibmatch 172.16.102.1 dsfield 0x13 | \ - grep -q "via 172.16.103.2" + grep -q "172.16.102.0/24 tos 0x10 via 172.16.103.2" log_test $? 0 "IPv4 route with DSCP and ECN:CE" # Unknown DSCP should match the generic route, no matter the ECN bits $IP route get fibmatch 172.16.102.1 dsfield 0x14 | \ - grep -q "via 172.16.101.2" + grep -q "172.16.102.0/24 via 172.16.101.2" log_test $? 
0 "IPv4 route with unknown DSCP and ECN:Not-ECT" $IP route get fibmatch 172.16.102.1 dsfield 0x15 | \ - grep -q "via 172.16.101.2" + grep -q "172.16.102.0/24 via 172.16.101.2" log_test $? 0 "IPv4 route with unknown DSCP and ECN:ECT(1)" $IP route get fibmatch 172.16.102.1 dsfield 0x16 | \ - grep -q "via 172.16.101.2" + grep -q "172.16.102.0/24 via 172.16.101.2" log_test $? 0 "IPv4 route with unknown DSCP and ECN:ECT(0)" $IP route get fibmatch 172.16.102.1 dsfield 0x17 | \ - grep -q "via 172.16.101.2" + grep -q "172.16.102.0/24 via 172.16.101.2" log_test $? 0 "IPv4 route with unknown DSCP and ECN:CE" # Null DSCP should match the generic route, no matter the ECN bits $IP route get fibmatch 172.16.102.1 dsfield 0x00 | \ - grep -q "via 172.16.101.2" + grep -q "172.16.102.0/24 via 172.16.101.2" log_test $? 0 "IPv4 route with no DSCP and ECN:Not-ECT" $IP route get fibmatch 172.16.102.1 dsfield 0x01 | \ - grep -q "via 172.16.101.2" + grep -q "172.16.102.0/24 via 172.16.101.2" log_test $? 0 "IPv4 route with no DSCP and ECN:ECT(1)" $IP route get fibmatch 172.16.102.1 dsfield 0x02 | \ - grep -q "via 172.16.101.2" + grep -q "172.16.102.0/24 via 172.16.101.2" log_test $? 0 "IPv4 route with no DSCP and ECN:ECT(0)" $IP route get fibmatch 172.16.102.1 dsfield 0x03 | \ - grep -q "via 172.16.101.2" + grep -q "172.16.102.0/24 via 172.16.101.2" log_test $? 
0 "IPv4 route with no DSCP and ECN:CE" } diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile index 535865b3d1..fa7b59ff40 100644 --- a/tools/testing/selftests/net/forwarding/Makefile +++ b/tools/testing/selftests/net/forwarding/Makefile @@ -15,18 +15,12 @@ TEST_PROGS = bridge_fdb_learning_limit.sh \ bridge_vlan_unaware.sh \ custom_multipath_hash.sh \ dual_vxlan_bridge.sh \ - ethtool_extended_state.sh \ - ethtool_mm.sh \ - ethtool_rmon.sh \ - ethtool.sh \ gre_custom_multipath_hash.sh \ gre_inner_v4_multipath.sh \ gre_inner_v6_multipath.sh \ gre_multipath_nh_res.sh \ gre_multipath_nh.sh \ gre_multipath.sh \ - hw_stats_l3.sh \ - hw_stats_l3_gre.sh \ ip6_forward_instats_vrf.sh \ ip6gre_custom_multipath_hash.sh \ ip6gre_flat_key.sh \ @@ -43,8 +37,8 @@ TEST_PROGS = bridge_fdb_learning_limit.sh \ ipip_hier_gre_key.sh \ ipip_hier_gre_keys.sh \ ipip_hier_gre.sh \ + lib_sh_test.sh \ local_termination.sh \ - loopback.sh \ mirror_gre_bound.sh \ mirror_gre_bridge_1d.sh \ mirror_gre_bridge_1d_vlan.sh \ @@ -113,7 +107,6 @@ TEST_PROGS = bridge_fdb_learning_limit.sh \ vxlan_symmetric.sh TEST_FILES := devlink_lib.sh \ - ethtool_lib.sh \ fib_offload_lib.sh \ forwarding.config.sample \ ip6gre_lib.sh \ diff --git a/tools/testing/selftests/net/forwarding/README b/tools/testing/selftests/net/forwarding/README index b8a2af8fcf..7fdb6a9ca5 100644 --- a/tools/testing/selftests/net/forwarding/README +++ b/tools/testing/selftests/net/forwarding/README @@ -56,3 +56,36 @@ o Checks shall be added to lib.sh for any external dependencies. o Code shall be checked using ShellCheck [1] prior to submission. 1. https://www.shellcheck.net/ + +Customization +============= + +The forwarding selftests framework uses a number of variables that +influence its behavior and tools it invokes, and how it invokes them, in +various ways. A number of these variables can be overridden. 
The way these +overridable variables are specified is typically one of the following two +syntaxes: + + : "${VARIABLE:=default_value}" + VARIABLE=${VARIABLE:=default_value} + +Any of these variables can be overridden. Notably net/forwarding/lib.sh and +net/lib.sh contain a number of overridable variables. + +One way of overriding these variables is through the environment: + + PAUSE_ON_FAIL=yes ./some_test.sh + +The variable NETIFS is special. Since it is an array variable, there is no +way to pass it through the environment. Its value can instead be given as +consecutive arguments to the selftest: + + ./some_test.sh swp{1..8} + +A way to customize variables in a persistent fashion is to create a file +named forwarding.config in this directory. lib.sh sources the file if +present, so it can contain any shell code. Typically it will contain +assignments of variables whose value should be overridden. + +forwarding.config.sample is available in the directory as an example of +how forwarding.config might look. diff --git a/tools/testing/selftests/net/forwarding/bridge_fdb_learning_limit.sh b/tools/testing/selftests/net/forwarding/bridge_fdb_learning_limit.sh index 0760a34b71..a21b7085da 100755 --- a/tools/testing/selftests/net/forwarding/bridge_fdb_learning_limit.sh +++ b/tools/testing/selftests/net/forwarding/bridge_fdb_learning_limit.sh @@ -178,6 +178,22 @@ fdb_del() check_err $? "Failed to remove a FDB entry of type ${type}" } +check_fdb_n_learned_support() +{ + if ! ip link help bridge 2>&1 | grep -q "fdb_max_learned"; then + echo "SKIP: iproute2 too old, missing bridge max learned support" + exit $ksft_skip + fi + + ip link add dev br0 type bridge + local learned=$(fdb_get_n_learned) + ip link del dev br0 + if [ "$learned" == "null" ]; then + echo "SKIP: kernel too old; bridge fdb_n_learned feature not supported." 
+ exit $ksft_skip + fi +} + check_accounting_one_type() { local type=$1 is_counted=$2 overrides_learned=$3 @@ -274,6 +290,8 @@ check_limit() done } +check_fdb_n_learned_support + trap cleanup EXIT setup_prepare diff --git a/tools/testing/selftests/net/forwarding/devlink_lib.sh b/tools/testing/selftests/net/forwarding/devlink_lib.sh index f1de525cfa..62a05bca1e 100644 --- a/tools/testing/selftests/net/forwarding/devlink_lib.sh +++ b/tools/testing/selftests/net/forwarding/devlink_lib.sh @@ -122,6 +122,8 @@ devlink_reload() still_pending=$(devlink resource show "$DEVLINK_DEV" | \ grep -c "size_new") check_err $still_pending "Failed reload - There are still unset sizes" + + udevadm settle } declare -A DEVLINK_ORIG diff --git a/tools/testing/selftests/net/forwarding/ethtool.sh b/tools/testing/selftests/net/forwarding/ethtool.sh deleted file mode 100755 index aa2eafb7b2..0000000000 --- a/tools/testing/selftests/net/forwarding/ethtool.sh +++ /dev/null @@ -1,301 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -ALL_TESTS=" - same_speeds_autoneg_off - different_speeds_autoneg_off - combination_of_neg_on_and_off - advertise_subset_of_speeds - check_highest_speed_is_chosen - different_speeds_autoneg_on -" -NUM_NETIFS=2 -source lib.sh -source ethtool_lib.sh - -h1_create() -{ - simple_if_init $h1 192.0.2.1/24 -} - -h1_destroy() -{ - simple_if_fini $h1 192.0.2.1/24 -} - -h2_create() -{ - simple_if_init $h2 192.0.2.2/24 -} - -h2_destroy() -{ - simple_if_fini $h2 192.0.2.2/24 -} - -setup_prepare() -{ - h1=${NETIFS[p1]} - h2=${NETIFS[p2]} - - h1_create - h2_create -} - -cleanup() -{ - pre_cleanup - - h2_destroy - h1_destroy -} - -same_speeds_autoneg_off() -{ - # Check that when each of the reported speeds is forced, the links come - # up and are operational. 
- local -a speeds_arr=($(common_speeds_get $h1 $h2 0 0)) - - for speed in "${speeds_arr[@]}"; do - RET=0 - ethtool_set $h1 speed $speed autoneg off - ethtool_set $h2 speed $speed autoneg off - - setup_wait_dev_with_timeout $h1 - setup_wait_dev_with_timeout $h2 - ping_do $h1 192.0.2.2 - check_err $? "speed $speed autoneg off" - log_test "force of same speed autoneg off" - log_info "speed = $speed" - done - - ethtool -s $h2 autoneg on - ethtool -s $h1 autoneg on -} - -different_speeds_autoneg_off() -{ - # Test that when we force different speeds, links are not up and ping - # fails. - RET=0 - - local -a speeds_arr=($(different_speeds_get $h1 $h2 0 0)) - local speed1=${speeds_arr[0]} - local speed2=${speeds_arr[1]} - - ethtool_set $h1 speed $speed1 autoneg off - ethtool_set $h2 speed $speed2 autoneg off - - setup_wait_dev_with_timeout $h1 - setup_wait_dev_with_timeout $h2 - ping_do $h1 192.0.2.2 - check_fail $? "ping with different speeds" - - log_test "force of different speeds autoneg off" - - ethtool -s $h2 autoneg on - ethtool -s $h1 autoneg on -} - -combination_of_neg_on_and_off() -{ - # Test that when one device is forced to a speed supported by both - # endpoints and the other device is configured to autoneg on, the links - # are up and ping passes. - local -a speeds_arr=($(common_speeds_get $h1 $h2 0 1)) - - for speed in "${speeds_arr[@]}"; do - RET=0 - ethtool_set $h1 speed $speed autoneg off - - setup_wait_dev_with_timeout $h1 - setup_wait_dev_with_timeout $h2 - ping_do $h1 192.0.2.2 - check_err $? 
"h1-speed=$speed autoneg off, h2 autoneg on" - log_test "one side with autoneg off and another with autoneg on" - log_info "force speed = $speed" - done - - ethtool -s $h1 autoneg on -} - -hex_speed_value_get() -{ - local speed=$1; shift - - local shift_size=${speed_values[$speed]} - speed=$((0x1 << $"shift_size")) - printf "%#x" "$speed" -} - -subset_of_common_speeds_get() -{ - local dev1=$1; shift - local dev2=$1; shift - local adver=$1; shift - - local -a speeds_arr=($(common_speeds_get $dev1 $dev2 0 $adver)) - local speed_to_advertise=0 - local speed_to_remove=${speeds_arr[0]} - speed_to_remove+='base' - - local -a speeds_mode_arr=($(common_speeds_get $dev1 $dev2 1 $adver)) - - for speed in ${speeds_mode_arr[@]}; do - if [[ $speed != $speed_to_remove* ]]; then - speed=$(hex_speed_value_get $speed) - speed_to_advertise=$(($speed_to_advertise | \ - $speed)) - fi - - done - - # Convert to hex. - printf "%#x" "$speed_to_advertise" -} - -speed_to_advertise_get() -{ - # The function returns the hex number that is composed by OR-ing all - # the modes corresponding to the provided speed. - local speed_without_mode=$1; shift - local supported_speeds=("$@"); shift - local speed_to_advertise=0 - - speed_without_mode+='base' - - for speed in ${supported_speeds[@]}; do - if [[ $speed == $speed_without_mode* ]]; then - speed=$(hex_speed_value_get $speed) - speed_to_advertise=$(($speed_to_advertise | \ - $speed)) - fi - - done - - # Convert to hex. - printf "%#x" "$speed_to_advertise" -} - -advertise_subset_of_speeds() -{ - # Test that when one device advertises a subset of speeds and another - # advertises a specific speed (but all modes of this speed), the links - # are up and ping passes. 
- RET=0 - - local speed_1_to_advertise=$(subset_of_common_speeds_get $h1 $h2 1) - ethtool_set $h1 advertise $speed_1_to_advertise - - if [ $RET != 0 ]; then - log_test "advertise subset of speeds" - return - fi - - local -a speeds_arr_without_mode=($(common_speeds_get $h1 $h2 0 1)) - # Check only speeds that h1 advertised. Remove the first speed. - unset speeds_arr_without_mode[0] - local -a speeds_arr_with_mode=($(common_speeds_get $h1 $h2 1 1)) - - for speed_value in ${speeds_arr_without_mode[@]}; do - RET=0 - local speed_2_to_advertise=$(speed_to_advertise_get $speed_value \ - "${speeds_arr_with_mode[@]}") - ethtool_set $h2 advertise $speed_2_to_advertise - - setup_wait_dev_with_timeout $h1 - setup_wait_dev_with_timeout $h2 - ping_do $h1 192.0.2.2 - check_err $? "h1=$speed_1_to_advertise, h2=$speed_2_to_advertise ($speed_value)" - - log_test "advertise subset of speeds" - log_info "h1=$speed_1_to_advertise, h2=$speed_2_to_advertise" - done - - ethtool -s $h2 autoneg on - ethtool -s $h1 autoneg on -} - -check_highest_speed_is_chosen() -{ - # Test that when one device advertises a subset of speeds, the other - # chooses the highest speed. This test checks configuration without - # traffic. - RET=0 - - local max_speed - local chosen_speed - local speed_to_advertise=$(subset_of_common_speeds_get $h1 $h2 1) - - ethtool_set $h1 advertise $speed_to_advertise - - if [ $RET != 0 ]; then - log_test "check highest speed" - return - fi - - local -a speeds_arr=($(common_speeds_get $h1 $h2 0 1)) - - max_speed=${speeds_arr[0]} - for current in ${speeds_arr[@]}; do - if [[ $current -gt $max_speed ]]; then - max_speed=$current - fi - done - - setup_wait_dev_with_timeout $h1 - setup_wait_dev_with_timeout $h2 - chosen_speed=$(ethtool $h1 | grep 'Speed:') - chosen_speed=${chosen_speed%"Mb/s"*} - chosen_speed=${chosen_speed#*"Speed: "} - ((chosen_speed == max_speed)) - check_err $? 
"h1 advertise $speed_to_advertise, h2 sync to speed $chosen_speed" - - log_test "check highest speed" - - ethtool -s $h2 autoneg on - ethtool -s $h1 autoneg on -} - -different_speeds_autoneg_on() -{ - # Test that when we configure links to advertise different speeds, - # links are not up and ping fails. - RET=0 - - local -a speeds=($(different_speeds_get $h1 $h2 1 1)) - local speed1=${speeds[0]} - local speed2=${speeds[1]} - - speed1=$(hex_speed_value_get $speed1) - speed2=$(hex_speed_value_get $speed2) - - ethtool_set $h1 advertise $speed1 - ethtool_set $h2 advertise $speed2 - - if (($RET)); then - setup_wait_dev_with_timeout $h1 - setup_wait_dev_with_timeout $h2 - ping_do $h1 192.0.2.2 - check_fail $? "ping with different speeds autoneg on" - fi - - log_test "advertise different speeds autoneg on" - - ethtool -s $h2 autoneg on - ethtool -s $h1 autoneg on -} - -skip_on_veth - -trap cleanup EXIT - -setup_prepare -setup_wait - -declare -gA speed_values -eval "speed_values=($(speeds_arr_get))" - -tests_run - -exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/ethtool_extended_state.sh b/tools/testing/selftests/net/forwarding/ethtool_extended_state.sh deleted file mode 100755 index 17f89c3b7c..0000000000 --- a/tools/testing/selftests/net/forwarding/ethtool_extended_state.sh +++ /dev/null @@ -1,117 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -ALL_TESTS=" - autoneg - autoneg_force_mode - no_cable -" - -NUM_NETIFS=2 -source lib.sh -source ethtool_lib.sh - -TIMEOUT=$((WAIT_TIMEOUT * 1000)) # ms - -setup_prepare() -{ - swp1=${NETIFS[p1]} - swp2=${NETIFS[p2]} - swp3=$NETIF_NO_CABLE -} - -ethtool_ext_state() -{ - local dev=$1; shift - local expected_ext_state=$1; shift - local expected_ext_substate=${1:-""}; shift - - local ext_state=$(ethtool $dev | grep "Link detected" \ - | cut -d "(" -f2 | cut -d ")" -f1) - local ext_substate=$(echo $ext_state | cut -sd "," -f2 \ - | sed -e 's/^[[:space:]]*//') - ext_state=$(echo $ext_state | cut -d "," 
-f1) - - if [[ $ext_state != $expected_ext_state ]]; then - echo "Expected \"$expected_ext_state\", got \"$ext_state\"" - return 1 - fi - if [[ $ext_substate != $expected_ext_substate ]]; then - echo "Expected \"$expected_ext_substate\", got \"$ext_substate\"" - return 1 - fi -} - -autoneg() -{ - local msg - - RET=0 - - ip link set dev $swp1 up - - msg=$(busywait $TIMEOUT ethtool_ext_state $swp1 \ - "Autoneg" "No partner detected") - check_err $? "$msg" - - log_test "Autoneg, No partner detected" - - ip link set dev $swp1 down -} - -autoneg_force_mode() -{ - local msg - - RET=0 - - ip link set dev $swp1 up - ip link set dev $swp2 up - - local -a speeds_arr=($(different_speeds_get $swp1 $swp2 0 0)) - local speed1=${speeds_arr[0]} - local speed2=${speeds_arr[1]} - - ethtool_set $swp1 speed $speed1 autoneg off - ethtool_set $swp2 speed $speed2 autoneg off - - msg=$(busywait $TIMEOUT ethtool_ext_state $swp1 \ - "Autoneg" "No partner detected during force mode") - check_err $? "$msg" - - msg=$(busywait $TIMEOUT ethtool_ext_state $swp2 \ - "Autoneg" "No partner detected during force mode") - check_err $? "$msg" - - log_test "Autoneg, No partner detected during force mode" - - ethtool -s $swp2 autoneg on - ethtool -s $swp1 autoneg on - - ip link set dev $swp2 down - ip link set dev $swp1 down -} - -no_cable() -{ - local msg - - RET=0 - - ip link set dev $swp3 up - - msg=$(busywait $TIMEOUT ethtool_ext_state $swp3 "No cable") - check_err $? 
"$msg" - - log_test "No cable" - - ip link set dev $swp3 down -} - -skip_on_veth - -setup_prepare - -tests_run - -exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/ethtool_lib.sh b/tools/testing/selftests/net/forwarding/ethtool_lib.sh deleted file mode 100644 index b9bfb45085..0000000000 --- a/tools/testing/selftests/net/forwarding/ethtool_lib.sh +++ /dev/null @@ -1,120 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -speeds_arr_get() -{ - cmd='/ETHTOOL_LINK_MODE_[^[:space:]]*_BIT[[:space:]]+=[[:space:]]+/ \ - {sub(/,$/, "") \ - sub(/ETHTOOL_LINK_MODE_/,"") \ - sub(/_BIT/,"") \ - sub(/_Full/,"/Full") \ - sub(/_Half/,"/Half");\ - print "["$1"]="$3}' - - awk "${cmd}" /usr/include/linux/ethtool.h -} - -ethtool_set() -{ - local cmd="$@" - local out=$(ethtool -s $cmd 2>&1 | wc -l) - - check_err $out "error in configuration. $cmd" -} - -dev_linkmodes_params_get() -{ - local dev=$1; shift - local adver=$1; shift - local -a linkmodes_params - local param_count - local arr - - if (($adver)); then - mode="Advertised link modes" - else - mode="Supported link modes" - fi - - local -a dev_linkmodes=($(dev_speeds_get $dev 1 $adver)) - for ((i=0; i<${#dev_linkmodes[@]}; i++)); do - linkmodes_params[$i]=$(echo -e "${dev_linkmodes[$i]}" | \ - # Replaces all non numbers with spaces - sed -e 's/[^0-9]/ /g' | \ - # Squeeze spaces in sequence to 1 space - tr -s ' ') - # Count how many numbers were found in the linkmode - param_count=$(echo "${linkmodes_params[$i]}" | wc -w) - if [[ $param_count -eq 1 ]]; then - linkmodes_params[$i]="${linkmodes_params[$i]} 1" - elif [[ $param_count -ge 3 ]]; then - arr=(${linkmodes_params[$i]}) - # Take only first two params - linkmodes_params[$i]=$(echo "${arr[@]:0:2}") - fi - done - echo ${linkmodes_params[@]} -} - -dev_speeds_get() -{ - local dev=$1; shift - local with_mode=$1; shift - local adver=$1; shift - local speeds_str - - if (($adver)); then - mode="Advertised link modes" - else - mode="Supported link modes" - 
fi - - speeds_str=$(ethtool "$dev" | \ - # Snip everything before the link modes section. - sed -n '/'"$mode"':/,$p' | \ - # Quit processing the rest at the start of the next section. - # When checking, skip the header of this section (hence the 2,). - sed -n '2,${/^[\t][^ \t]/q};p' | \ - # Drop the section header of the current section. - cut -d':' -f2) - - local -a speeds_arr=($speeds_str) - if [[ $with_mode -eq 0 ]]; then - for ((i=0; i<${#speeds_arr[@]}; i++)); do - speeds_arr[$i]=${speeds_arr[$i]%base*} - done - fi - echo ${speeds_arr[@]} -} - -common_speeds_get() -{ - dev1=$1; shift - dev2=$1; shift - with_mode=$1; shift - adver=$1; shift - - local -a dev1_speeds=($(dev_speeds_get $dev1 $with_mode $adver)) - local -a dev2_speeds=($(dev_speeds_get $dev2 $with_mode $adver)) - - comm -12 \ - <(printf '%s\n' "${dev1_speeds[@]}" | sort -u) \ - <(printf '%s\n' "${dev2_speeds[@]}" | sort -u) -} - -different_speeds_get() -{ - local dev1=$1; shift - local dev2=$1; shift - local with_mode=$1; shift - local adver=$1; shift - - local -a speeds_arr - - speeds_arr=($(common_speeds_get $dev1 $dev2 $with_mode $adver)) - if [[ ${#speeds_arr[@]} < 2 ]]; then - check_err 1 "cannot check different speeds. 
There are not enough speeds" - fi - - echo ${speeds_arr[0]} ${speeds_arr[1]} -} diff --git a/tools/testing/selftests/net/forwarding/ethtool_mm.sh b/tools/testing/selftests/net/forwarding/ethtool_mm.sh deleted file mode 100755 index 50d5bfb17e..0000000000 --- a/tools/testing/selftests/net/forwarding/ethtool_mm.sh +++ /dev/null @@ -1,340 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -ALL_TESTS=" - manual_with_verification_h1_to_h2 - manual_with_verification_h2_to_h1 - manual_without_verification_h1_to_h2 - manual_without_verification_h2_to_h1 - manual_failed_verification_h1_to_h2 - manual_failed_verification_h2_to_h1 - lldp -" - -NUM_NETIFS=2 -REQUIRE_MZ=no -PREEMPTIBLE_PRIO=0 -source lib.sh - -traffic_test() -{ - local if=$1; shift - local src=$1; shift - local num_pkts=10000 - local before= - local after= - local delta= - - if [ ${has_pmac_stats[$if]} = false ]; then - src="aggregate" - fi - - before=$(ethtool_std_stats_get $if "eth-mac" "FramesTransmittedOK" $src) - - $MZ $if -q -c $num_pkts -p 64 -b bcast -t ip -R $PREEMPTIBLE_PRIO - - after=$(ethtool_std_stats_get $if "eth-mac" "FramesTransmittedOK" $src) - - delta=$((after - before)) - - # Allow an extra 1% tolerance for random packets sent by the stack - [ $delta -ge $num_pkts ] && [ $delta -le $((num_pkts + 100)) ] -} - -manual_with_verification() -{ - local tx=$1; shift - local rx=$1; shift - - RET=0 - - # It isn't completely clear from IEEE 802.3-2018 Figure 99-5: Transmit - # Processing state diagram whether the "send_r" variable (send response - # to verification frame) should be taken into consideration while the - # MAC Merge TX direction is disabled. That being said, at least the - # NXP ENETC does not, and requires tx-enabled on in order to respond to - # the link partner's verification frames. 
- ethtool --set-mm $rx tx-enabled on - ethtool --set-mm $tx verify-enabled on tx-enabled on - - # Wait for verification to finish - sleep 1 - - ethtool --json --show-mm $tx | jq -r '.[]."verify-status"' | \ - grep -q 'SUCCEEDED' - check_err "$?" "Verification did not succeed" - - ethtool --json --show-mm $tx | jq -r '.[]."tx-active"' | grep -q 'true' - check_err "$?" "pMAC TX is not active" - - traffic_test $tx "pmac" - check_err "$?" "Traffic did not get sent through $tx's pMAC" - - ethtool --set-mm $tx verify-enabled off tx-enabled off - ethtool --set-mm $rx tx-enabled off - - log_test "Manual configuration with verification: $tx to $rx" -} - -manual_with_verification_h1_to_h2() -{ - manual_with_verification $h1 $h2 -} - -manual_with_verification_h2_to_h1() -{ - manual_with_verification $h2 $h1 -} - -manual_without_verification() -{ - local tx=$1; shift - local rx=$1; shift - - RET=0 - - ethtool --set-mm $tx verify-enabled off tx-enabled on - - ethtool --json --show-mm $tx | jq -r '.[]."verify-status"' | \ - grep -q 'DISABLED' - check_err "$?" "Verification is not disabled" - - ethtool --json --show-mm $tx | jq -r '.[]."tx-active"' | grep -q 'true' - check_err "$?" "pMAC TX is not active" - - traffic_test $tx "pmac" - check_err "$?" "Traffic did not get sent through $tx's pMAC" - - ethtool --set-mm $tx verify-enabled off tx-enabled off - - log_test "Manual configuration without verification: $tx to $rx" -} - -manual_without_verification_h1_to_h2() -{ - manual_without_verification $h1 $h2 -} - -manual_without_verification_h2_to_h1() -{ - manual_without_verification $h2 $h1 -} - -manual_failed_verification() -{ - local tx=$1; shift - local rx=$1; shift - - RET=0 - - ethtool --set-mm $rx pmac-enabled off - ethtool --set-mm $tx verify-enabled on tx-enabled on - - # Wait for verification to time out - sleep 1 - - ethtool --json --show-mm $tx | jq -r '.[]."verify-status"' | \ - grep -q 'SUCCEEDED' - check_fail "$?" 
"Verification succeeded when it shouldn't have" - - ethtool --json --show-mm $tx | jq -r '.[]."tx-active"' | grep -q 'true' - check_fail "$?" "pMAC TX is active when it shouldn't have" - - traffic_test $tx "emac" - check_err "$?" "Traffic did not get sent through $tx's eMAC" - - ethtool --set-mm $tx verify-enabled off tx-enabled off - ethtool --set-mm $rx pmac-enabled on - - log_test "Manual configuration with failed verification: $tx to $rx" -} - -manual_failed_verification_h1_to_h2() -{ - manual_failed_verification $h1 $h2 -} - -manual_failed_verification_h2_to_h1() -{ - manual_failed_verification $h2 $h1 -} - -smallest_supported_add_frag_size() -{ - local iface=$1 - local rx_min_frag_size= - - rx_min_frag_size=$(ethtool --json --show-mm $iface | \ - jq '.[]."rx-min-frag-size"') - - if [ $rx_min_frag_size -le 60 ]; then - echo 0 - elif [ $rx_min_frag_size -le 124 ]; then - echo 1 - elif [ $rx_min_frag_size -le 188 ]; then - echo 2 - elif [ $rx_min_frag_size -le 252 ]; then - echo 3 - else - echo "$iface: RX min frag size $rx_min_frag_size cannot be advertised over LLDP" - exit 1 - fi -} - -expected_add_frag_size() -{ - local iface=$1 - local requested=$2 - local min=$(smallest_supported_add_frag_size $iface) - - [ $requested -le $min ] && echo $min || echo $requested -} - -lldp_change_add_frag_size() -{ - local add_frag_size=$1 - local pattern= - - lldptool -T -i $h1 -V addEthCaps addFragSize=$add_frag_size >/dev/null - # Wait for TLVs to be received - sleep 2 - pattern=$(printf "Additional fragment size: %d" \ - $(expected_add_frag_size $h1 $add_frag_size)) - lldptool -i $h2 -t -n -V addEthCaps | grep -q "$pattern" -} - -lldp() -{ - RET=0 - - systemctl start lldpad - - # Configure the interfaces to receive and transmit LLDPDUs - lldptool -L -i $h1 adminStatus=rxtx >/dev/null - lldptool -L -i $h2 adminStatus=rxtx >/dev/null - - # Enable the transmission of Additional Ethernet Capabilities TLV - lldptool -T -i $h1 -V addEthCaps enableTx=yes >/dev/null - lldptool 
-T -i $h2 -V addEthCaps enableTx=yes >/dev/null - - # Wait for TLVs to be received - sleep 2 - - lldptool -i $h1 -t -n -V addEthCaps | \ - grep -q "Preemption capability active" - check_err "$?" "$h1 pMAC TX is not active" - - lldptool -i $h2 -t -n -V addEthCaps | \ - grep -q "Preemption capability active" - check_err "$?" "$h2 pMAC TX is not active" - - lldp_change_add_frag_size 3 - check_err "$?" "addFragSize 3" - - lldp_change_add_frag_size 2 - check_err "$?" "addFragSize 2" - - lldp_change_add_frag_size 1 - check_err "$?" "addFragSize 1" - - lldp_change_add_frag_size 0 - check_err "$?" "addFragSize 0" - - traffic_test $h1 "pmac" - check_err "$?" "Traffic did not get sent through $h1's pMAC" - - traffic_test $h2 "pmac" - check_err "$?" "Traffic did not get sent through $h2's pMAC" - - systemctl stop lldpad - - log_test "LLDP" -} - -h1_create() -{ - ip link set dev $h1 up - - tc qdisc add dev $h1 root mqprio num_tc 4 map 0 1 2 3 \ - queues 1@0 1@1 1@2 1@3 \ - fp P E E E \ - hw 1 - - ethtool --set-mm $h1 pmac-enabled on tx-enabled off verify-enabled off -} - -h2_create() -{ - ip link set dev $h2 up - - ethtool --set-mm $h2 pmac-enabled on tx-enabled off verify-enabled off - - tc qdisc add dev $h2 root mqprio num_tc 4 map 0 1 2 3 \ - queues 1@0 1@1 1@2 1@3 \ - fp P E E E \ - hw 1 -} - -h1_destroy() -{ - ethtool --set-mm $h1 pmac-enabled off tx-enabled off verify-enabled off - - tc qdisc del dev $h1 root - - ip link set dev $h1 down -} - -h2_destroy() -{ - tc qdisc del dev $h2 root - - ethtool --set-mm $h2 pmac-enabled off tx-enabled off verify-enabled off - - ip link set dev $h2 down -} - -setup_prepare() -{ - h1=${NETIFS[p1]} - h2=${NETIFS[p2]} - - h1_create - h2_create -} - -cleanup() -{ - pre_cleanup - - h2_destroy - h1_destroy -} - -check_ethtool_mm_support -check_tc_fp_support -require_command lldptool -bail_on_lldpad "autoconfigure the MAC Merge layer" "configure it manually" - -for netif in ${NETIFS[@]}; do - ethtool --show-mm $netif 2>&1 &> /dev/null - if 
[[ $? -ne 0 ]]; then - echo "SKIP: $netif does not support MAC Merge" - exit $ksft_skip - fi - - if check_ethtool_pmac_std_stats_support $netif eth-mac; then - has_pmac_stats[$netif]=true - else - has_pmac_stats[$netif]=false - echo "$netif does not report pMAC statistics, falling back to aggregate" - fi -done - -trap cleanup EXIT - -setup_prepare -setup_wait - -tests_run - -exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/ethtool_rmon.sh b/tools/testing/selftests/net/forwarding/ethtool_rmon.sh deleted file mode 100755 index e78776db85..0000000000 --- a/tools/testing/selftests/net/forwarding/ethtool_rmon.sh +++ /dev/null @@ -1,143 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -ALL_TESTS=" - rmon_rx_histogram - rmon_tx_histogram -" - -NUM_NETIFS=2 -source lib.sh - -ETH_FCS_LEN=4 -ETH_HLEN=$((6+6+2)) - -declare -A netif_mtu - -ensure_mtu() -{ - local iface=$1; shift - local len=$1; shift - local current=$(ip -j link show dev $iface | jq -r '.[0].mtu') - local required=$((len - ETH_HLEN - ETH_FCS_LEN)) - - if [ $current -lt $required ]; then - ip link set dev $iface mtu $required || return 1 - fi -} - -bucket_test() -{ - local iface=$1; shift - local neigh=$1; shift - local set=$1; shift - local bucket=$1; shift - local len=$1; shift - local num_rx=10000 - local num_tx=20000 - local expected= - local before= - local after= - local delta= - - # Mausezahn does not include FCS bytes in its length - but the - # histogram counters do - len=$((len - ETH_FCS_LEN)) - - before=$(ethtool --json -S $iface --groups rmon | \ - jq -r ".[0].rmon[\"${set}-pktsNtoM\"][$bucket].val") - - # Send 10k one way and 20k in the other, to detect counters - # mapped to the wrong direction - $MZ $neigh -q -c $num_rx -p $len -a own -b bcast -d 10us - $MZ $iface -q -c $num_tx -p $len -a own -b bcast -d 10us - - after=$(ethtool --json -S $iface --groups rmon | \ - jq -r ".[0].rmon[\"${set}-pktsNtoM\"][$bucket].val") - - delta=$((after - before)) - - expected=$([ 
$set = rx ] && echo $num_rx || echo $num_tx) - - # Allow some extra tolerance for other packets sent by the stack - [ $delta -ge $expected ] && [ $delta -le $((expected + 100)) ] -} - -rmon_histogram() -{ - local iface=$1; shift - local neigh=$1; shift - local set=$1; shift - local nbuckets=0 - local step= - - RET=0 - - while read -r -a bucket; do - step="$set-pkts${bucket[0]}to${bucket[1]} on $iface" - - for if in $iface $neigh; do - if ! ensure_mtu $if ${bucket[0]}; then - log_test_xfail "$if does not support the required MTU for $step" - return - fi - done - - if ! bucket_test $iface $neigh $set $nbuckets ${bucket[0]}; then - check_err 1 "$step failed" - return 1 - fi - log_test "$step" - nbuckets=$((nbuckets + 1)) - done < <(ethtool --json -S $iface --groups rmon | \ - jq -r ".[0].rmon[\"${set}-pktsNtoM\"][]|[.low, .high]|@tsv" 2>/dev/null) - - if [ $nbuckets -eq 0 ]; then - log_test_xfail "$iface does not support $set histogram counters" - return - fi -} - -rmon_rx_histogram() -{ - rmon_histogram $h1 $h2 rx - rmon_histogram $h2 $h1 rx -} - -rmon_tx_histogram() -{ - rmon_histogram $h1 $h2 tx - rmon_histogram $h2 $h1 tx -} - -setup_prepare() -{ - h1=${NETIFS[p1]} - h2=${NETIFS[p2]} - - for iface in $h1 $h2; do - netif_mtu[$iface]=$(ip -j link show dev $iface | jq -r '.[0].mtu') - ip link set dev $iface up - done -} - -cleanup() -{ - pre_cleanup - - for iface in $h2 $h1; do - ip link set dev $iface \ - mtu ${netif_mtu[$iface]} \ - down - done -} - -check_ethtool_counter_group_support -trap cleanup EXIT - -setup_prepare -setup_wait - -tests_run - -exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/forwarding.config.sample b/tools/testing/selftests/net/forwarding/forwarding.config.sample index 1fc4f0242f..f1ca95e79a 100644 --- a/tools/testing/selftests/net/forwarding/forwarding.config.sample +++ b/tools/testing/selftests/net/forwarding/forwarding.config.sample @@ -3,51 +3,28 @@ 
############################################################################## # Topology description. p1 looped back to p2, p3 to p4 and so on. -declare -A NETIFS -NETIFS[p1]=veth0 -NETIFS[p2]=veth1 -NETIFS[p3]=veth2 -NETIFS[p4]=veth3 -NETIFS[p5]=veth4 -NETIFS[p6]=veth5 -NETIFS[p7]=veth6 -NETIFS[p8]=veth7 -NETIFS[p9]=veth8 -NETIFS[p10]=veth9 +NETIFS=( + [p1]=veth0 + [p2]=veth1 + [p3]=veth2 + [p4]=veth3 + [p5]=veth4 + [p6]=veth5 + [p7]=veth6 + [p8]=veth7 + [p9]=veth8 + [p10]=veth9 +) # Port that does not have a cable connected. NETIF_NO_CABLE=eth8 ############################################################################## -# Defines +# In addition to the topology-related variables, it is also possible to override +# in this file other variables that net/lib.sh, net/forwarding/lib.sh or other +# libraries or selftests use. E.g.: -# IPv4 ping utility name -PING=ping -# IPv6 ping utility name. Some distributions use 'ping' for IPv6. PING6=ping6 -# Packet generator. Some distributions use 'mz'. MZ=mausezahn -# mausezahn delay between transmissions in microseconds. -MZ_DELAY=0 -# Time to wait after interfaces participating in the test are all UP WAIT_TIME=5 -# Whether to pause on failure or not. -PAUSE_ON_FAIL=no -# Whether to pause on cleanup or not. -PAUSE_ON_CLEANUP=no -# Type of network interface to create -NETIF_TYPE=veth -# Whether to create virtual interfaces (veth) or not -NETIF_CREATE=yes -# Timeout (in seconds) before ping exits regardless of how many packets have -# been sent or received -PING_TIMEOUT=5 -# Minimum ageing_time (in centiseconds) supported by hardware -LOW_AGEING_TIME=1000 -# Flag for tc match, supposed to be skip_sw/skip_hw which means do not process -# filter by software/hardware -TC_FLAG=skip_hw -# IPv6 traceroute utility name. 
-TROUTE6=traceroute6 - diff --git a/tools/testing/selftests/net/forwarding/hw_stats_l3.sh b/tools/testing/selftests/net/forwarding/hw_stats_l3.sh deleted file mode 100755 index 48584a5138..0000000000 --- a/tools/testing/selftests/net/forwarding/hw_stats_l3.sh +++ /dev/null @@ -1,340 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -# +--------------------+ +----------------------+ -# | H1 | | H2 | -# | | | | -# | $h1.200 + | | + $h2.200 | -# | 192.0.2.1/28 | | | | 192.0.2.18/28 | -# | 2001:db8:1::1/64 | | | | 2001:db8:2::1/64 | -# | | | | | | -# | $h1 + | | + $h2 | -# | | | | | | -# +------------------|-+ +-|--------------------+ -# | | -# +------------------|-------------------------|--------------------+ -# | SW | | | -# | | | | -# | $rp1 + + $rp2 | -# | | | | -# | $rp1.200 + + $rp2.200 | -# | 192.0.2.2/28 192.0.2.17/28 | -# | 2001:db8:1::2/64 2001:db8:2::2/64 | -# | | -# +-----------------------------------------------------------------+ - -ALL_TESTS=" - ping_ipv4 - ping_ipv6 - test_stats_rx_ipv4 - test_stats_tx_ipv4 - test_stats_rx_ipv6 - test_stats_tx_ipv6 - respin_enablement - test_stats_rx_ipv4 - test_stats_tx_ipv4 - test_stats_rx_ipv6 - test_stats_tx_ipv6 - reapply_config - ping_ipv4 - ping_ipv6 - test_stats_rx_ipv4 - test_stats_tx_ipv4 - test_stats_rx_ipv6 - test_stats_tx_ipv6 - test_stats_report_rx - test_stats_report_tx - test_destroy_enabled - test_double_enable -" -NUM_NETIFS=4 -source lib.sh - -h1_create() -{ - simple_if_init $h1 - vlan_create $h1 200 v$h1 192.0.2.1/28 2001:db8:1::1/64 - ip route add 192.0.2.16/28 vrf v$h1 nexthop via 192.0.2.2 - ip -6 route add 2001:db8:2::/64 vrf v$h1 nexthop via 2001:db8:1::2 -} - -h1_destroy() -{ - ip -6 route del 2001:db8:2::/64 vrf v$h1 nexthop via 2001:db8:1::2 - ip route del 192.0.2.16/28 vrf v$h1 nexthop via 192.0.2.2 - vlan_destroy $h1 200 - simple_if_fini $h1 -} - -h2_create() -{ - simple_if_init $h2 - vlan_create $h2 200 v$h2 192.0.2.18/28 2001:db8:2::1/64 - ip route add 192.0.2.0/28 vrf v$h2 
nexthop via 192.0.2.17 - ip -6 route add 2001:db8:1::/64 vrf v$h2 nexthop via 2001:db8:2::2 -} - -h2_destroy() -{ - ip -6 route del 2001:db8:1::/64 vrf v$h2 nexthop via 2001:db8:2::2 - ip route del 192.0.2.0/28 vrf v$h2 nexthop via 192.0.2.17 - vlan_destroy $h2 200 - simple_if_fini $h2 -} - -router_rp1_200_create() -{ - ip link add name $rp1.200 link $rp1 type vlan id 200 - ip link set dev $rp1.200 addrgenmode eui64 - ip link set dev $rp1.200 up - ip address add dev $rp1.200 192.0.2.2/28 - ip address add dev $rp1.200 2001:db8:1::2/64 - ip stats set dev $rp1.200 l3_stats on -} - -router_rp1_200_destroy() -{ - ip stats set dev $rp1.200 l3_stats off - ip address del dev $rp1.200 2001:db8:1::2/64 - ip address del dev $rp1.200 192.0.2.2/28 - ip link del dev $rp1.200 -} - -router_create() -{ - ip link set dev $rp1 up - router_rp1_200_create - - ip link set dev $rp2 up - vlan_create $rp2 200 "" 192.0.2.17/28 2001:db8:2::2/64 -} - -router_destroy() -{ - vlan_destroy $rp2 200 - ip link set dev $rp2 down - - router_rp1_200_destroy - ip link set dev $rp1 down -} - -setup_prepare() -{ - h1=${NETIFS[p1]} - rp1=${NETIFS[p2]} - - rp2=${NETIFS[p3]} - h2=${NETIFS[p4]} - - rp1mac=$(mac_get $rp1) - rp2mac=$(mac_get $rp2) - - vrf_prepare - - h1_create - h2_create - - router_create - - forwarding_enable -} - -cleanup() -{ - pre_cleanup - - forwarding_restore - - router_destroy - - h2_destroy - h1_destroy - - vrf_cleanup -} - -ping_ipv4() -{ - ping_test $h1.200 192.0.2.18 " IPv4" -} - -ping_ipv6() -{ - ping_test $h1.200 2001:db8:2::1 " IPv6" -} - -send_packets_rx_ipv4() -{ - # Send 21 packets instead of 20, because the first one might trap and go - # through the SW datapath, which might not bump the HW counter. 
- $MZ $h1.200 -c 21 -d 20msec -p 100 \ - -a own -b $rp1mac -A 192.0.2.1 -B 192.0.2.18 \ - -q -t udp sp=54321,dp=12345 -} - -send_packets_rx_ipv6() -{ - $MZ $h1.200 -6 -c 21 -d 20msec -p 100 \ - -a own -b $rp1mac -A 2001:db8:1::1 -B 2001:db8:2::1 \ - -q -t udp sp=54321,dp=12345 -} - -send_packets_tx_ipv4() -{ - $MZ $h2.200 -c 21 -d 20msec -p 100 \ - -a own -b $rp2mac -A 192.0.2.18 -B 192.0.2.1 \ - -q -t udp sp=54321,dp=12345 -} - -send_packets_tx_ipv6() -{ - $MZ $h2.200 -6 -c 21 -d 20msec -p 100 \ - -a own -b $rp2mac -A 2001:db8:2::1 -B 2001:db8:1::1 \ - -q -t udp sp=54321,dp=12345 -} - -___test_stats() -{ - local dir=$1; shift - local prot=$1; shift - - local a - local b - - a=$(hw_stats_get l3_stats $rp1.200 ${dir} packets) - send_packets_${dir}_${prot} - "$@" - b=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= $a + 20" \ - hw_stats_get l3_stats $rp1.200 ${dir} packets) - check_err $? "Traffic not reflected in the counter: $a -> $b" -} - -__test_stats() -{ - local dir=$1; shift - local prot=$1; shift - - RET=0 - ___test_stats "$dir" "$prot" - log_test "Test $dir packets: $prot" -} - -test_stats_rx_ipv4() -{ - __test_stats rx ipv4 -} - -test_stats_tx_ipv4() -{ - __test_stats tx ipv4 -} - -test_stats_rx_ipv6() -{ - __test_stats rx ipv6 -} - -test_stats_tx_ipv6() -{ - __test_stats tx ipv6 -} - -# Make sure everything works well even after stats have been disabled and -# reenabled on the same device without touching the L3 configuration. -respin_enablement() -{ - log_info "Turning stats off and on again" - ip stats set dev $rp1.200 l3_stats off - ip stats set dev $rp1.200 l3_stats on -} - -# For the initial run, l3_stats is enabled on a completely set up netdevice. Now -# do it the other way around: enabling the L3 stats on an L2 netdevice, and only -# then apply the L3 configuration. 
-reapply_config() -{ - log_info "Reapplying configuration" - - router_rp1_200_destroy - - ip link add name $rp1.200 link $rp1 type vlan id 200 - ip link set dev $rp1.200 addrgenmode none - ip stats set dev $rp1.200 l3_stats on - ip link set dev $rp1.200 addrgenmode eui64 - ip link set dev $rp1.200 up - ip address add dev $rp1.200 192.0.2.2/28 - ip address add dev $rp1.200 2001:db8:1::2/64 -} - -__test_stats_report() -{ - local dir=$1; shift - local prot=$1; shift - - local a - local b - - RET=0 - - a=$(hw_stats_get l3_stats $rp1.200 ${dir} packets) - send_packets_${dir}_${prot} - ip address flush dev $rp1.200 - b=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= $a + 20" \ - hw_stats_get l3_stats $rp1.200 ${dir} packets) - check_err $? "Traffic not reflected in the counter: $a -> $b" - log_test "Test ${dir} packets: stats pushed on loss of L3" - - ip stats set dev $rp1.200 l3_stats off - ip link del dev $rp1.200 - router_rp1_200_create -} - -test_stats_report_rx() -{ - __test_stats_report rx ipv4 -} - -test_stats_report_tx() -{ - __test_stats_report tx ipv4 -} - -test_destroy_enabled() -{ - RET=0 - - ip link del dev $rp1.200 - router_rp1_200_create - - log_test "Destroy l3_stats-enabled netdev" -} - -test_double_enable() -{ - RET=0 - ___test_stats rx ipv4 \ - ip stats set dev $rp1.200 l3_stats on - log_test "Test stat retention across a spurious enablement" -} - -trap cleanup EXIT - -setup_prepare -setup_wait - -used=$(ip -j stats show dev $rp1.200 group offload subgroup hw_stats_info | - jq '.[].info.l3_stats.used') -kind=$(ip -j -d link show dev $rp1 | - jq -r '.[].linkinfo.info_kind') -if [[ $used != true ]]; then - if [[ $kind == veth ]]; then - log_test_skip "l3_stats not offloaded on veth interface" - EXIT_STATUS=$ksft_skip - else - RET=1 log_test "l3_stats not offloaded" - fi -else - tests_run -fi - -exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/hw_stats_l3_gre.sh b/tools/testing/selftests/net/forwarding/hw_stats_l3_gre.sh deleted 
file mode 100755 index 7594bbb490..0000000000 --- a/tools/testing/selftests/net/forwarding/hw_stats_l3_gre.sh +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -# Test L3 stats on IP-in-IP GRE tunnel without key. - -# This test uses flat topology for IP tunneling tests. See ipip_lib.sh for more -# details. - -ALL_TESTS=" - ping_ipv4 - test_stats_rx - test_stats_tx -" -NUM_NETIFS=6 -source lib.sh -source ipip_lib.sh - -setup_prepare() -{ - h1=${NETIFS[p1]} - ol1=${NETIFS[p2]} - - ul1=${NETIFS[p3]} - ul2=${NETIFS[p4]} - - ol2=${NETIFS[p5]} - h2=${NETIFS[p6]} - - ol1mac=$(mac_get $ol1) - - forwarding_enable - vrf_prepare - h1_create - h2_create - sw1_flat_create gre $ol1 $ul1 - sw2_flat_create gre $ol2 $ul2 - ip stats set dev g1a l3_stats on - ip stats set dev g2a l3_stats on -} - -cleanup() -{ - pre_cleanup - - ip stats set dev g1a l3_stats off - ip stats set dev g2a l3_stats off - - sw2_flat_destroy $ol2 $ul2 - sw1_flat_destroy $ol1 $ul1 - h2_destroy - h1_destroy - - vrf_cleanup - forwarding_restore -} - -ping_ipv4() -{ - RET=0 - - ping_test $h1 192.0.2.18 " gre flat" -} - -send_packets_ipv4() -{ - # Send 21 packets instead of 20, because the first one might trap and go - # through the SW datapath, which might not bump the HW counter. - $MZ $h1 -c 21 -d 20msec -p 100 \ - -a own -b $ol1mac -A 192.0.2.1 -B 192.0.2.18 \ - -q -t udp sp=54321,dp=12345 -} - -test_stats() -{ - local dev=$1; shift - local dir=$1; shift - - local a - local b - - RET=0 - - a=$(hw_stats_get l3_stats $dev $dir packets) - send_packets_ipv4 - b=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= $a + 20" \ - hw_stats_get l3_stats $dev $dir packets) - check_err $? 
"Traffic not reflected in the counter: $a -> $b" - - log_test "Test $dir packets: $prot" -} - -test_stats_tx() -{ - test_stats g1a tx -} - -test_stats_rx() -{ - test_stats g2a rx -} - -skip_on_veth - -trap cleanup EXIT - -setup_prepare -setup_wait - -tests_run - -exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/ipip_lib.sh b/tools/testing/selftests/net/forwarding/ipip_lib.sh index 30f36a57ba..01e62c4ac9 100644 --- a/tools/testing/selftests/net/forwarding/ipip_lib.sh +++ b/tools/testing/selftests/net/forwarding/ipip_lib.sh @@ -141,7 +141,6 @@ # | $h2 + | # | 192.0.2.18/28 | # +---------------------------+ -source lib.sh h1_create() { diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index e78f11140e..eabbdf00d8 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -1,34 +1,125 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +############################################################################## +# Topology description. p1 looped back to p2, p3 to p4 and so on. + +declare -A NETIFS=( + [p1]=veth0 + [p2]=veth1 + [p3]=veth2 + [p4]=veth3 + [p5]=veth4 + [p6]=veth5 + [p7]=veth6 + [p8]=veth7 + [p9]=veth8 + [p10]=veth9 +) + +# Port that does not have a cable connected. +: "${NETIF_NO_CABLE:=eth8}" + ############################################################################## # Defines -# Can be overridden by the configuration file. 
-PING=${PING:=ping} -PING6=${PING6:=ping6} -MZ=${MZ:=mausezahn} -MZ_DELAY=${MZ_DELAY:=0} -ARPING=${ARPING:=arping} -TEAMD=${TEAMD:=teamd} -WAIT_TIME=${WAIT_TIME:=5} -PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no} -PAUSE_ON_CLEANUP=${PAUSE_ON_CLEANUP:=no} -NETIF_TYPE=${NETIF_TYPE:=veth} -NETIF_CREATE=${NETIF_CREATE:=yes} -MCD=${MCD:=smcrouted} -MC_CLI=${MC_CLI:=smcroutectl} -PING_COUNT=${PING_COUNT:=10} -PING_TIMEOUT=${PING_TIMEOUT:=5} -WAIT_TIMEOUT=${WAIT_TIMEOUT:=20} -INTERFACE_TIMEOUT=${INTERFACE_TIMEOUT:=600} -LOW_AGEING_TIME=${LOW_AGEING_TIME:=1000} -REQUIRE_JQ=${REQUIRE_JQ:=yes} -REQUIRE_MZ=${REQUIRE_MZ:=yes} -REQUIRE_MTOOLS=${REQUIRE_MTOOLS:=no} -STABLE_MAC_ADDRS=${STABLE_MAC_ADDRS:=no} -TCPDUMP_EXTRA_FLAGS=${TCPDUMP_EXTRA_FLAGS:=} -TROUTE6=${TROUTE6:=traceroute6} +# Networking utilities. +: "${PING:=ping}" +: "${PING6:=ping6}" # Some distros just use ping. +: "${ARPING:=arping}" +: "${TROUTE6:=traceroute6}" + +# Packet generator. +: "${MZ:=mausezahn}" # Some distributions use 'mz'. +: "${MZ_DELAY:=0}" + +# Host configuration tools. +: "${TEAMD:=teamd}" +: "${MCD:=smcrouted}" +: "${MC_CLI:=smcroutectl}" + +# Constants for netdevice bring-up: +# Default time in seconds to wait for an interface to come up before giving up +# and bailing out. Used during initial setup. +: "${INTERFACE_TIMEOUT:=600}" +# Like INTERFACE_TIMEOUT, but default for ad-hoc waiting in testing scripts. +: "${WAIT_TIMEOUT:=20}" +# Time to wait after interfaces participating in the test are all UP. +: "${WAIT_TIME:=5}" + +# Whether to pause on, respectively, after a failure and before cleanup. +: "${PAUSE_ON_FAIL:=no}" +: "${PAUSE_ON_CLEANUP:=no}" + +# Whether to create virtual interfaces, and what netdevice type they should be. +: "${NETIF_CREATE:=yes}" +: "${NETIF_TYPE:=veth}" + +# Constants for ping tests: +# How many packets should be sent. 
+: "${PING_COUNT:=10}" +# Timeout (in seconds) before ping exits regardless of how many packets have +# been sent or received +: "${PING_TIMEOUT:=5}" + +# Minimum ageing_time (in centiseconds) supported by hardware +: "${LOW_AGEING_TIME:=1000}" + +# Whether to check for availability of certain tools. +: "${REQUIRE_JQ:=yes}" +: "${REQUIRE_MZ:=yes}" +: "${REQUIRE_MTOOLS:=no}" + +# Whether to override MAC addresses on interfaces participating in the test. +: "${STABLE_MAC_ADDRS:=no}" + +# Flags for tcpdump +: "${TCPDUMP_EXTRA_FLAGS:=}" + +# Flags for TC filters. +: "${TC_FLAG:=skip_hw}" + +# Whether the machine is "slow" -- i.e. might be incapable of running tests +# involving heavy traffic. This might be the case on a debug kernel, a VM, or +# e.g. a low-power board. +: "${KSFT_MACHINE_SLOW:=no}" + +############################################################################## +# Find netifs by test-specified driver name + +driver_name_get() +{ + local dev=$1; shift + local driver_path="/sys/class/net/$dev/device/driver" + + if [[ -L $driver_path ]]; then + basename `realpath $driver_path` + fi +} + +netif_find_driver() +{ + local ifnames=`ip -j link show | jq -r ".[].ifname"` + local count=0 + + for ifname in $ifnames + do + local driver_name=`driver_name_get $ifname` + if [[ ! 
-z $driver_name && $driver_name == $NETIF_FIND_DRIVER ]]; then + count=$((count + 1)) + NETIFS[p$count]="$ifname" + fi + done +} + +# Whether to find netdevice according to the driver speficied by the importer +: "${NETIF_FIND_DRIVER:=}" + +if [[ $NETIF_FIND_DRIVER ]]; then + unset NETIFS + declare -A NETIFS + netif_find_driver +fi net_forwarding_dir=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") @@ -179,22 +270,23 @@ check_port_mab_support() fi } -skip_on_veth() +if [[ "$(id -u)" -ne 0 ]]; then + echo "SKIP: need root privileges" + exit $ksft_skip +fi + +check_driver() { - local kind=$(ip -j -d link show dev ${NETIFS[p1]} | - jq -r '.[].linkinfo.info_kind') + local dev=$1; shift + local expected=$1; shift + local driver_name=`driver_name_get $dev` - if [[ $kind == veth ]]; then - echo "SKIP: Test cannot be run with veth pairs" + if [[ $driver_name != $expected ]]; then + echo "SKIP: expected driver $expected for $dev, got $driver_name instead" exit $ksft_skip fi } -if [[ "$(id -u)" -ne 0 ]]; then - echo "SKIP: need root privileges" - exit $ksft_skip -fi - if [[ "$CHECK_TC" = "yes" ]]; then check_tc_version fi @@ -209,6 +301,21 @@ require_command() fi } +# IPv6 support was added in v3.0 +check_mtools_version() +{ + local version="$(msend -v)" + local major + + version=${version##msend version } + major=$(echo $version | cut -d. -f1) + + if [ $major -lt 3 ]; then + echo "SKIP: expected mtools version 3.0, got $version" + exit $ksft_skip + fi +} + if [[ "$REQUIRE_JQ" = "yes" ]]; then require_command jq fi @@ -216,15 +323,10 @@ if [[ "$REQUIRE_MZ" = "yes" ]]; then require_command $MZ fi if [[ "$REQUIRE_MTOOLS" = "yes" ]]; then - # https://github.com/vladimiroltean/mtools/ - # patched for IPv6 support + # https://github.com/troglobit/mtools require_command msend require_command mreceive -fi - -if [[ ! 
-v NUM_NETIFS ]]; then - echo "SKIP: importer does not define \"NUM_NETIFS\"" - exit $ksft_skip + check_mtools_version fi ############################################################################## @@ -245,6 +347,23 @@ done ############################################################################## # Network interfaces configuration +if [[ ! -v NUM_NETIFS ]]; then + echo "SKIP: importer does not define \"NUM_NETIFS\"" + exit $ksft_skip +fi + +if (( NUM_NETIFS > ${#NETIFS[@]} )); then + echo "SKIP: Importer requires $NUM_NETIFS NETIFS, but only ${#NETIFS[@]} are defined (${NETIFS[@]})" + exit $ksft_skip +fi + +for i in $(seq ${#NETIFS[@]}); do + if [[ ! ${NETIFS[p$i]} ]]; then + echo "SKIP: NETIFS[p$i] not given" + exit $ksft_skip + fi +done + create_netif_veth() { local i @@ -343,13 +462,20 @@ ret_set_ksft_status() fi } +# Whether FAILs should be interpreted as XFAILs. Internal. +FAIL_TO_XFAIL= + check_err() { local err=$1 local msg=$2 if ((err)); then - ret_set_ksft_status $ksft_fail "$msg" + if [[ $FAIL_TO_XFAIL = yes ]]; then + ret_set_ksft_status $ksft_xfail "$msg" + else + ret_set_ksft_status $ksft_fail "$msg" + fi fi } @@ -374,6 +500,29 @@ check_err_fail() fi } +xfail_on_slow() +{ + if [[ $KSFT_MACHINE_SLOW = yes ]]; then + FAIL_TO_XFAIL=yes "$@" + else + "$@" + fi +} + +xfail_on_veth() +{ + local dev=$1; shift + local kind + + kind=$(ip -j -d link show dev $dev | + jq -r '.[].linkinfo.info_kind') + if [[ $kind = veth ]]; then + FAIL_TO_XFAIL=yes "$@" + else + "$@" + fi +} + log_test_result() { local test_name=$1; shift @@ -569,6 +718,19 @@ setup_wait() sleep $WAIT_TIME } +wait_for_dev() +{ + local dev=$1; shift + local timeout=${1:-$WAIT_TIMEOUT}; shift + + slowwait $timeout ip link show dev $dev &> /dev/null + if (( $? )); then + check_err 1 + log_test wait_for_dev "Interface $dev did not appear." 
+ exit $EXIT_STATUS + fi +} + cmd_jq() { local cmd=$1 @@ -1995,6 +2157,8 @@ bail_on_lldpad() { local reason1="$1"; shift local reason2="$1"; shift + local caller=${FUNCNAME[1]} + local src=${BASH_SOURCE[1]} if systemctl is-active --quiet lldpad; then @@ -2015,7 +2179,8 @@ bail_on_lldpad() an environment variable ALLOW_LLDPAD to a non-empty string. EOF - exit 1 + log_test_skip $src:$caller + exit $EXIT_STATUS else return fi diff --git a/tools/testing/selftests/net/forwarding/lib_sh_test.sh b/tools/testing/selftests/net/forwarding/lib_sh_test.sh new file mode 100755 index 0000000000..ff2accccaf --- /dev/null +++ b/tools/testing/selftests/net/forwarding/lib_sh_test.sh @@ -0,0 +1,208 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This tests the operation of lib.sh itself. + +ALL_TESTS=" + test_ret + test_exit_status +" +NUM_NETIFS=0 +source lib.sh + +# Simulated checks. + +do_test() +{ + local msg=$1; shift + + "$@" + check_err $? "$msg" +} + +tpass() +{ + do_test "tpass" true +} + +tfail() +{ + do_test "tfail" false +} + +txfail() +{ + FAIL_TO_XFAIL=yes do_test "txfail" false +} + +# Simulated tests. + +pass() +{ + RET=0 + do_test "true" true + log_test "true" +} + +fail() +{ + RET=0 + do_test "false" false + log_test "false" +} + +xfail() +{ + RET=0 + FAIL_TO_XFAIL=yes do_test "xfalse" false + log_test "xfalse" +} + +skip() +{ + RET=0 + log_test_skip "skip" +} + +slow_xfail() +{ + RET=0 + xfail_on_slow do_test "slow_false" false + log_test "slow_false" +} + +# lib.sh tests. + +ret_tests_run() +{ + local t + + RET=0 + retmsg= + for t in "$@"; do + $t + done + echo "$retmsg" + return $RET +} + +ret_subtest() +{ + local expect_ret=$1; shift + local expect_retmsg=$1; shift + local -a tests=( "$@" ) + + local status_names=(pass fail xfail xpass skip) + local ret + local out + + RET=0 + + # Run this in a subshell, so that our environment is intact. + out=$(ret_tests_run "${tests[@]}") + ret=$? + + (( ret == expect_ret )) + check_err $? 
"RET=$ret expected $expect_ret" + + [[ $out == $expect_retmsg ]] + check_err $? "retmsg=$out expected $expect_retmsg" + + log_test "RET $(echo ${tests[@]}) -> ${status_names[$ret]}" +} + +test_ret() +{ + ret_subtest $ksft_pass "" + + ret_subtest $ksft_pass "" tpass + ret_subtest $ksft_fail "tfail" tfail + ret_subtest $ksft_xfail "txfail" txfail + + ret_subtest $ksft_pass "" tpass tpass + ret_subtest $ksft_fail "tfail" tpass tfail + ret_subtest $ksft_xfail "txfail" tpass txfail + + ret_subtest $ksft_fail "tfail" tfail tpass + ret_subtest $ksft_xfail "txfail" txfail tpass + + ret_subtest $ksft_fail "tfail" tfail tfail + ret_subtest $ksft_fail "tfail" tfail txfail + + ret_subtest $ksft_fail "tfail" txfail tfail + + ret_subtest $ksft_xfail "txfail" txfail txfail +} + +exit_status_tests_run() +{ + EXIT_STATUS=0 + tests_run > /dev/null + return $EXIT_STATUS +} + +exit_status_subtest() +{ + local expect_exit_status=$1; shift + local tests=$1; shift + local what=$1; shift + + local status_names=(pass fail xfail xpass skip) + local exit_status + local out + + RET=0 + + # Run this in a subshell, so that our environment is intact. + out=$(TESTS="$tests" exit_status_tests_run) + exit_status=$? + + (( exit_status == expect_exit_status )) + check_err $? 
"EXIT_STATUS=$exit_status, expected $expect_exit_status" + + log_test "EXIT_STATUS $tests$what -> ${status_names[$exit_status]}" +} + +test_exit_status() +{ + exit_status_subtest $ksft_pass ":" + + exit_status_subtest $ksft_pass "pass" + exit_status_subtest $ksft_fail "fail" + exit_status_subtest $ksft_pass "xfail" + exit_status_subtest $ksft_skip "skip" + + exit_status_subtest $ksft_pass "pass pass" + exit_status_subtest $ksft_fail "pass fail" + exit_status_subtest $ksft_pass "pass xfail" + exit_status_subtest $ksft_skip "pass skip" + + exit_status_subtest $ksft_fail "fail pass" + exit_status_subtest $ksft_pass "xfail pass" + exit_status_subtest $ksft_skip "skip pass" + + exit_status_subtest $ksft_fail "fail fail" + exit_status_subtest $ksft_fail "fail xfail" + exit_status_subtest $ksft_fail "fail skip" + + exit_status_subtest $ksft_fail "xfail fail" + exit_status_subtest $ksft_fail "skip fail" + + exit_status_subtest $ksft_pass "xfail xfail" + exit_status_subtest $ksft_skip "xfail skip" + exit_status_subtest $ksft_skip "skip xfail" + + exit_status_subtest $ksft_skip "skip skip" + + KSFT_MACHINE_SLOW=yes \ + exit_status_subtest $ksft_pass "slow_xfail" ": slow" + + KSFT_MACHINE_SLOW=no \ + exit_status_subtest $ksft_fail "slow_xfail" ": fast" +} + +trap pre_cleanup EXIT + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index c5b0cbc85b..4b364cdf3e 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -155,25 +155,30 @@ run_test() "$smac > $MACVLAN_ADDR, ethertype IPv4 (0x0800)" \ true - check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address" \ - "$smac > $UNKNOWN_UC_ADDR1, ethertype IPv4 (0x0800)" \ - false + xfail_on_veth $h1 \ + check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address" \ + "$smac > $UNKNOWN_UC_ADDR1, ethertype IPv4 (0x0800)" \ + false 
check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address, promisc" \ "$smac > $UNKNOWN_UC_ADDR2, ethertype IPv4 (0x0800)" \ true - check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address, allmulti" \ - "$smac > $UNKNOWN_UC_ADDR3, ethertype IPv4 (0x0800)" \ - false + xfail_on_veth $h1 \ + check_rcv $rcv_if_name \ + "Unicast IPv4 to unknown MAC address, allmulti" \ + "$smac > $UNKNOWN_UC_ADDR3, ethertype IPv4 (0x0800)" \ + false check_rcv $rcv_if_name "Multicast IPv4 to joined group" \ "$smac > $JOINED_MACV4_MC_ADDR, ethertype IPv4 (0x0800)" \ true - check_rcv $rcv_if_name "Multicast IPv4 to unknown group" \ - "$smac > $UNKNOWN_MACV4_MC_ADDR1, ethertype IPv4 (0x0800)" \ - false + xfail_on_veth $h1 \ + check_rcv $rcv_if_name \ + "Multicast IPv4 to unknown group" \ + "$smac > $UNKNOWN_MACV4_MC_ADDR1, ethertype IPv4 (0x0800)" \ + false check_rcv $rcv_if_name "Multicast IPv4 to unknown group, promisc" \ "$smac > $UNKNOWN_MACV4_MC_ADDR2, ethertype IPv4 (0x0800)" \ @@ -187,9 +192,10 @@ run_test() "$smac > $JOINED_MACV6_MC_ADDR, ethertype IPv6 (0x86dd)" \ true - check_rcv $rcv_if_name "Multicast IPv6 to unknown group" \ - "$smac > $UNKNOWN_MACV6_MC_ADDR1, ethertype IPv6 (0x86dd)" \ - false + xfail_on_veth $h1 \ + check_rcv $rcv_if_name "Multicast IPv6 to unknown group" \ + "$smac > $UNKNOWN_MACV6_MC_ADDR1, ethertype IPv6 (0x86dd)" \ + false check_rcv $rcv_if_name "Multicast IPv6 to unknown group, promisc" \ "$smac > $UNKNOWN_MACV6_MC_ADDR2, ethertype IPv6 (0x86dd)" \ diff --git a/tools/testing/selftests/net/forwarding/loopback.sh b/tools/testing/selftests/net/forwarding/loopback.sh deleted file mode 100755 index 8f4057310b..0000000000 --- a/tools/testing/selftests/net/forwarding/loopback.sh +++ /dev/null @@ -1,102 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -# Kselftest framework requirement - SKIP code is 4. 
-ksft_skip=4 - -ALL_TESTS="loopback_test" -NUM_NETIFS=2 -source tc_common.sh -source lib.sh - -h1_create() -{ - simple_if_init $h1 192.0.2.1/24 - tc qdisc add dev $h1 clsact -} - -h1_destroy() -{ - tc qdisc del dev $h1 clsact - simple_if_fini $h1 192.0.2.1/24 -} - -h2_create() -{ - simple_if_init $h2 -} - -h2_destroy() -{ - simple_if_fini $h2 -} - -loopback_test() -{ - RET=0 - - tc filter add dev $h1 ingress protocol arp pref 1 handle 101 flower \ - skip_hw arp_op reply arp_tip 192.0.2.1 action drop - - $MZ $h1 -c 1 -t arp -q - - tc_check_packets "dev $h1 ingress" 101 1 - check_fail $? "Matched on a filter without loopback setup" - - ethtool -K $h1 loopback on - check_err $? "Failed to enable loopback" - - setup_wait_dev $h1 - - $MZ $h1 -c 1 -t arp -q - - tc_check_packets "dev $h1 ingress" 101 1 - check_err $? "Did not match on filter with loopback" - - ethtool -K $h1 loopback off - check_err $? "Failed to disable loopback" - - $MZ $h1 -c 1 -t arp -q - - tc_check_packets "dev $h1 ingress" 101 2 - check_fail $? 
"Matched on a filter after loopback was removed" - - tc filter del dev $h1 ingress protocol arp pref 1 handle 101 flower - - log_test "loopback" -} - -setup_prepare() -{ - h1=${NETIFS[p1]} - h2=${NETIFS[p2]} - - vrf_prepare - - h1_create - h2_create - - if ethtool -k $h1 | grep loopback | grep -q fixed; then - log_test "SKIP: dev $h1 does not support loopback feature" - exit $ksft_skip - fi -} - -cleanup() -{ - pre_cleanup - - h2_destroy - h1_destroy - - vrf_cleanup -} - -trap cleanup EXIT - -setup_prepare -setup_wait - -tests_run - -exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh index 3f0f5dc955..2ba44247c6 100755 --- a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh @@ -1,6 +1,41 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +# +-------------------------+ +# | H1 | +# | $h1 + | +# | 192.0.2.2/24 | | +# | 2001:db8:1::2/64 | | +# +-------------------|-----+ +# | +# +-------------------|----------------------+ +# | | R1 | +# | $rp11 + | +# | 192.0.2.1/24 | +# | 2001:db8:1::1/64 | +# | | +# | + $rp12 + $rp13 | +# | | 169.254.2.12/24 | 169.254.3.13/24 | +# | | fe80:2::12/64 | fe80:3::13/64 | +# +--|--------------------|------------------+ +# | | +# +--|--------------------|------------------+ +# | + $rp22 + $rp23 | +# | 169.254.2.22/24 169.254.3.23/24 | +# | fe80:2::22/64 fe80:3::23/64 | +# | | +# | $rp21 + | +# | 198.51.100.1/24 | | +# | 2001:db8:2::1/64 | R2 | +# +-------------------|----------------------+ +# | +# +-------------------|-----+ +# | | | +# | $h2 + | +# | 198.51.100.2/24 | +# | 2001:db8:2::2/64 H2 | +# +-------------------------+ + ALL_TESTS=" ping_ipv4 ping_ipv6 diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh index b2d2c6cecc..2903294d8b 100644 --- 
a/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh @@ -56,21 +56,12 @@ nh_stats_test_dispatch_swhw() local group_id=$1; shift local mz="$@" - local used - nh_stats_do_test "$what" "$nh1_id" "$nh2_id" "$group_id" \ nh_stats_get "${mz[@]}" - used=$(ip -s -j -d nexthop show id $group_id | - jq '.[].hw_stats.used') - kind=$(ip -j -d link show dev $rp11 | - jq -r '.[].linkinfo.info_kind') - if [[ $used == true ]]; then + xfail_on_veth $rp11 \ nh_stats_do_test "HW $what" "$nh1_id" "$nh2_id" "$group_id" \ nh_stats_get_hw "${mz[@]}" - elif [[ $kind == veth ]]; then - log_test_xfail "HW stats not offloaded on veth topology" - fi } nh_stats_test_dispatch() @@ -83,7 +74,6 @@ nh_stats_test_dispatch() local mz="$@" local enabled - local kind if ! ip nexthop help 2>&1 | grep -q hw_stats; then log_test_skip "NH stats test: ip doesn't support HW stats" diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh index 4b483d24ad..cd9e346436 100755 --- a/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh @@ -1,6 +1,41 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +# +-------------------------+ +# | H1 | +# | $h1 + | +# | 192.0.2.2/24 | | +# | 2001:db8:1::2/64 | | +# +-------------------|-----+ +# | +# +-------------------|----------------------+ +# | | R1 | +# | $rp11 + | +# | 192.0.2.1/24 | +# | 2001:db8:1::1/64 | +# | | +# | + $rp12 + $rp13 | +# | | 169.254.2.12/24 | 169.254.3.13/24 | +# | | fe80:2::12/64 | fe80:3::13/64 | +# +--|--------------------|------------------+ +# | | +# +--|--------------------|------------------+ +# | + $rp22 + $rp23 | +# | 169.254.2.22/24 169.254.3.23/24 | +# | fe80:2::22/64 fe80:3::23/64 | +# | | +# | $rp21 + | +# | 198.51.100.1/24 | | +# | 2001:db8:2::1/64 | R2 | +# +-------------------|----------------------+ 
+# | +# +-------------------|-----+ +# | | | +# | $h2 + | +# | 198.51.100.2/24 | +# | 2001:db8:2::2/64 H2 | +# +-------------------------+ + ALL_TESTS=" ping_ipv4 ping_ipv6 diff --git a/tools/testing/selftests/net/forwarding/router_nh.sh b/tools/testing/selftests/net/forwarding/router_nh.sh index f3a53738bd..92904b01ea 100755 --- a/tools/testing/selftests/net/forwarding/router_nh.sh +++ b/tools/testing/selftests/net/forwarding/router_nh.sh @@ -1,6 +1,20 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +# +-------------------------+ +-------------------------+ +# | H1 | | H2 | +# | $h1 + | | $h2 + | +# | 192.0.2.2/24 | | | 198.51.100.2/24 | | +# | 2001:db8:1::2/64 | | | 2001:db8:2::2/64 | | +# +-------------------|-----+ +-------------------|-----+ +# | | +# +-------------------|----------------------------|-----+ +# | R1 | | | +# | $rp1 + $rp2 + | +# | 192.0.2.1/24 198.51.100.1/24 | +# | 2001:db8:1::1/64 2001:db8:2::1/64 | +# +------------------------------------------------------+ + ALL_TESTS=" ping_ipv4 ping_ipv6 diff --git a/tools/testing/selftests/net/forwarding/sch_ets_tests.sh b/tools/testing/selftests/net/forwarding/sch_ets_tests.sh index cdf689e994..f9d26a7911 100644 --- a/tools/testing/selftests/net/forwarding/sch_ets_tests.sh +++ b/tools/testing/selftests/net/forwarding/sch_ets_tests.sh @@ -199,25 +199,28 @@ ets_set_dwrr_two_bands() ets_test_strict() { ets_set_strict - ets_dwrr_test_01 - ets_dwrr_test_12 + xfail_on_slow ets_dwrr_test_01 + xfail_on_slow ets_dwrr_test_12 } ets_test_mixed() { ets_set_mixed - ets_dwrr_test_01 - ets_dwrr_test_12 + xfail_on_slow ets_dwrr_test_01 + xfail_on_slow ets_dwrr_test_12 } ets_test_dwrr() { ets_set_dwrr_uniform - ets_dwrr_test_012 + xfail_on_slow ets_dwrr_test_012 + ets_set_dwrr_varying - ets_dwrr_test_012 + xfail_on_slow ets_dwrr_test_012 + ets_change_quantum - ets_dwrr_test_012 + xfail_on_slow ets_dwrr_test_012 + ets_set_dwrr_two_bands - ets_dwrr_test_01 + xfail_on_slow ets_dwrr_test_01 } diff --git 
a/tools/testing/selftests/net/forwarding/sch_red.sh b/tools/testing/selftests/net/forwarding/sch_red.sh index 81f31179ac..17f2864456 100755 --- a/tools/testing/selftests/net/forwarding/sch_red.sh +++ b/tools/testing/selftests/net/forwarding/sch_red.sh @@ -451,35 +451,35 @@ uninstall_qdisc() ecn_test() { install_qdisc ecn - do_ecn_test $BACKLOG + xfail_on_slow do_ecn_test $BACKLOG uninstall_qdisc } ecn_nodrop_test() { install_qdisc ecn nodrop - do_ecn_nodrop_test $BACKLOG + xfail_on_slow do_ecn_nodrop_test $BACKLOG uninstall_qdisc } red_test() { install_qdisc - do_red_test $BACKLOG + xfail_on_slow do_red_test $BACKLOG uninstall_qdisc } red_qevent_test() { install_qdisc qevent early_drop block 10 - do_red_qevent_test $BACKLOG + xfail_on_slow do_red_qevent_test $BACKLOG uninstall_qdisc } ecn_qevent_test() { install_qdisc ecn qevent mark block 10 - do_ecn_qevent_test $BACKLOG + xfail_on_slow do_ecn_qevent_test $BACKLOG uninstall_qdisc } diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_core.sh b/tools/testing/selftests/net/forwarding/sch_tbf_core.sh index d1f26cb7cd..9cd884d4a5 100644 --- a/tools/testing/selftests/net/forwarding/sch_tbf_core.sh +++ b/tools/testing/selftests/net/forwarding/sch_tbf_core.sh @@ -227,7 +227,7 @@ do_tbf_test() local nr=$(rate $t2 $t3 10) local nr_pct=$((100 * (nr - er) / er)) ((-5 <= nr_pct && nr_pct <= 5)) - check_err $? "Expected rate $(humanize $er), got $(humanize $nr), which is $nr_pct% off. Required accuracy is +-5%." + xfail_on_slow check_err $? "Expected rate $(humanize $er), got $(humanize $nr), which is $nr_pct% off. Required accuracy is +-5%." 
log_test "TC $((vlan - 10)): TBF rate ${mbit}Mbit" } diff --git a/tools/testing/selftests/net/forwarding/tc_common.sh b/tools/testing/selftests/net/forwarding/tc_common.sh index bce8bb8d2b..2e3326edfa 100644 --- a/tools/testing/selftests/net/forwarding/tc_common.sh +++ b/tools/testing/selftests/net/forwarding/tc_common.sh @@ -4,7 +4,7 @@ CHECK_TC="yes" # Can be overridden by the configuration file. See lib.sh -TC_HIT_TIMEOUT=${TC_HIT_TIMEOUT:=1000} # ms +: "${TC_HIT_TIMEOUT:=1000}" # ms tc_check_packets() { diff --git a/tools/testing/selftests/net/forwarding/tc_tunnel_key.sh b/tools/testing/selftests/net/forwarding/tc_tunnel_key.sh index 5a5dd90348..79775b10b9 100755 --- a/tools/testing/selftests/net/forwarding/tc_tunnel_key.sh +++ b/tools/testing/selftests/net/forwarding/tc_tunnel_key.sh @@ -1,7 +1,5 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 ALL_TESTS="tunnel_key_nofrag_test" diff --git a/tools/testing/selftests/net/gro.c b/tools/testing/selftests/net/gro.c index 6038b96ece..b2184847e3 100644 --- a/tools/testing/selftests/net/gro.c +++ b/tools/testing/selftests/net/gro.c @@ -93,6 +93,7 @@ static bool tx_socket = true; static int tcp_offset = -1; static int total_hdr_len = -1; static int ethhdr_proto = -1; +static const int num_flush_id_cases = 6; static void vlog(const char *fmt, ...) 
{ @@ -620,6 +621,113 @@ static void add_ipv6_exthdr(void *buf, void *optpkt, __u8 exthdr_type, char *ext iph->payload_len = htons(ntohs(iph->payload_len) + MIN_EXTHDR_SIZE); } +static void fix_ip4_checksum(struct iphdr *iph) +{ + iph->check = 0; + iph->check = checksum_fold(iph, sizeof(struct iphdr), 0); +} + +static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) +{ + static char buf1[MAX_HDR_LEN + PAYLOAD_LEN]; + static char buf2[MAX_HDR_LEN + PAYLOAD_LEN]; + static char buf3[MAX_HDR_LEN + PAYLOAD_LEN]; + bool send_three = false; + struct iphdr *iph1; + struct iphdr *iph2; + struct iphdr *iph3; + + iph1 = (struct iphdr *)(buf1 + ETH_HLEN); + iph2 = (struct iphdr *)(buf2 + ETH_HLEN); + iph3 = (struct iphdr *)(buf3 + ETH_HLEN); + + create_packet(buf1, 0, 0, PAYLOAD_LEN, 0); + create_packet(buf2, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0); + create_packet(buf3, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0); + + switch (tcase) { + case 0: /* DF=1, Incrementing - should coalesce */ + iph1->frag_off |= htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off |= htons(IP_DF); + iph2->id = htons(9); + break; + + case 1: /* DF=1, Fixed - should coalesce */ + iph1->frag_off |= htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off |= htons(IP_DF); + iph2->id = htons(8); + break; + + case 2: /* DF=0, Incrementing - should coalesce */ + iph1->frag_off &= ~htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off &= ~htons(IP_DF); + iph2->id = htons(9); + break; + + case 3: /* DF=0, Fixed - should not coalesce */ + iph1->frag_off &= ~htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off &= ~htons(IP_DF); + iph2->id = htons(8); + break; + + case 4: /* DF=1, two packets incrementing, and one fixed - should + * coalesce only the first two packets + */ + iph1->frag_off |= htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off |= htons(IP_DF); + iph2->id = htons(9); + + iph3->frag_off |= htons(IP_DF); + iph3->id = htons(9); + send_three = true; + break; + + case 5: /* DF=1, two 
packets fixed, and one incrementing - should + * coalesce only the first two packets + */ + iph1->frag_off |= htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off |= htons(IP_DF); + iph2->id = htons(8); + + iph3->frag_off |= htons(IP_DF); + iph3->id = htons(9); + send_three = true; + break; + } + + fix_ip4_checksum(iph1); + fix_ip4_checksum(iph2); + write_packet(fd, buf1, total_hdr_len + PAYLOAD_LEN, daddr); + write_packet(fd, buf2, total_hdr_len + PAYLOAD_LEN, daddr); + + if (send_three) { + fix_ip4_checksum(iph3); + write_packet(fd, buf3, total_hdr_len + PAYLOAD_LEN, daddr); + } +} + +static void test_flush_id(int fd, struct sockaddr_ll *daddr, char *fin_pkt) +{ + for (int i = 0; i < num_flush_id_cases; i++) { + sleep(1); + send_flush_id_case(fd, daddr, i); + sleep(1); + write_packet(fd, fin_pkt, total_hdr_len, daddr); + } +} + static void send_ipv6_exthdr(int fd, struct sockaddr_ll *daddr, char *ext_data1, char *ext_data2) { static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; @@ -938,6 +1046,8 @@ static void gro_sender(void) send_fragment4(txfd, &daddr); sleep(1); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + test_flush_id(txfd, &daddr, fin_pkt); } else if (proto == PF_INET6) { sleep(1); send_fragment6(txfd, &daddr); @@ -1064,6 +1174,34 @@ static void gro_receiver(void) printf("fragmented ip4 doesn't coalesce: "); check_recv_pkts(rxfd, correct_payload, 2); + + /* is_atomic checks */ + printf("DF=1, Incrementing - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + + printf("DF=1, Fixed - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + + printf("DF=0, Incrementing - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + + printf("DF=0, Fixed - should not coalesce: "); + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 2); + + 
printf("DF=1, 2 Incrementing and one fixed - should coalesce only first 2 packets: "); + correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 2); + + printf("DF=1, 2 Fixed and one incrementing - should coalesce only first 2 packets: "); + correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 2); } else if (proto == PF_INET6) { /* GRO doesn't check for ipv6 hop limit when flushing. * Hence no corresponding test to the ipv4 case. diff --git a/tools/testing/selftests/net/hsr/Makefile b/tools/testing/selftests/net/hsr/Makefile index 92c1d9d080..884cd2cc06 100644 --- a/tools/testing/selftests/net/hsr/Makefile +++ b/tools/testing/selftests/net/hsr/Makefile @@ -2,6 +2,7 @@ top_srcdir = ../../../../.. -TEST_PROGS := hsr_ping.sh +TEST_PROGS := hsr_ping.sh hsr_redbox.sh +TEST_FILES += hsr_common.sh include ../../lib.mk diff --git a/tools/testing/selftests/net/hsr/config b/tools/testing/selftests/net/hsr/config index 22061204fb..241542441c 100644 --- a/tools/testing/selftests/net/hsr/config +++ b/tools/testing/selftests/net/hsr/config @@ -2,3 +2,4 @@ CONFIG_IPV6=y CONFIG_NET_SCH_NETEM=m CONFIG_HSR=y CONFIG_VETH=y +CONFIG_BRIDGE=y diff --git a/tools/testing/selftests/net/hsr/hsr_common.sh b/tools/testing/selftests/net/hsr/hsr_common.sh new file mode 100644 index 0000000000..8e97b1f2e7 --- /dev/null +++ b/tools/testing/selftests/net/hsr/hsr_common.sh @@ -0,0 +1,84 @@ +# SPDX-License-Identifier: GPL-2.0 +# Common code for HSR testing scripts + +source ../lib.sh +ret=0 +ksft_skip=4 + +# $1: IP address +is_v6() +{ + [ -z "${1##*:*}" ] +} + +do_ping() +{ + local netns="$1" + local connect_addr="$2" + local ping_args="-q -c 2" + + if is_v6 "${connect_addr}"; then + $ipv6 || return 0 + ping_args="${ping_args} -6" + fi + + ip netns exec ${netns} ping ${ping_args} $connect_addr >/dev/null + if [ $? 
-ne 0 ] ; then + echo "$netns -> $connect_addr connectivity [ FAIL ]" 1>&2 + ret=1 + return 1 + fi + + return 0 +} + +do_ping_long() +{ + local netns="$1" + local connect_addr="$2" + local ping_args="-q -c 10" + + if is_v6 "${connect_addr}"; then + $ipv6 || return 0 + ping_args="${ping_args} -6" + fi + + OUT="$(LANG=C ip netns exec ${netns} ping ${ping_args} $connect_addr | grep received)" + if [ $? -ne 0 ] ; then + echo "$netns -> $connect_addr ping [ FAIL ]" 1>&2 + ret=1 + return 1 + fi + + VAL="$(echo $OUT | cut -d' ' -f1-8)" + SED_VAL="$(echo ${VAL} | sed -r -e 's/([0-9]{2}).*([0-9]{2}).*[[:space:]]([0-9]+%).*/\1 transmitted \2 received \3 loss/')" + if [ "${SED_VAL}" != "10 transmitted 10 received 0% loss" ] + then + echo "$netns -> $connect_addr ping TEST [ FAIL ]" + echo "Expect to send and receive 10 packets and no duplicates." + echo "Full message: ${OUT}." + ret=1 + return 1 + fi + + return 0 +} + +stop_if_error() +{ + local msg="$1" + + if [ ${ret} -ne 0 ]; then + echo "FAIL: ${msg}" 1>&2 + exit ${ret} + fi +} + +check_prerequisites() +{ + ip -Version > /dev/null 2>&1 + if [ $? 
-ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip + fi +} diff --git a/tools/testing/selftests/net/hsr/hsr_ping.sh b/tools/testing/selftests/net/hsr/hsr_ping.sh index 1c6457e546..3684b813b0 100755 --- a/tools/testing/selftests/net/hsr/hsr_ping.sh +++ b/tools/testing/selftests/net/hsr/hsr_ping.sh @@ -1,10 +1,10 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -ret=0 -ksft_skip=4 ipv6=true +source ./hsr_common.sh + optstring="h4" usage() { echo "Usage: $0 [OPTION]" @@ -27,88 +27,6 @@ while getopts "$optstring" option;do esac done -sec=$(date +%s) -rndh=$(printf %x $sec)-$(mktemp -u XXXXXX) -ns1="ns1-$rndh" -ns2="ns2-$rndh" -ns3="ns3-$rndh" - -cleanup() -{ - local netns - for netns in "$ns1" "$ns2" "$ns3" ;do - ip netns del $netns - done -} - -# $1: IP address -is_v6() -{ - [ -z "${1##*:*}" ] -} - -do_ping() -{ - local netns="$1" - local connect_addr="$2" - local ping_args="-q -c 2" - - if is_v6 "${connect_addr}"; then - $ipv6 || return 0 - ping_args="${ping_args} -6" - fi - - ip netns exec ${netns} ping ${ping_args} $connect_addr >/dev/null - if [ $? -ne 0 ] ; then - echo "$netns -> $connect_addr connectivity [ FAIL ]" 1>&2 - ret=1 - return 1 - fi - - return 0 -} - -do_ping_long() -{ - local netns="$1" - local connect_addr="$2" - local ping_args="-q -c 10" - - if is_v6 "${connect_addr}"; then - $ipv6 || return 0 - ping_args="${ping_args} -6" - fi - - OUT="$(LANG=C ip netns exec ${netns} ping ${ping_args} $connect_addr | grep received)" - if [ $? -ne 0 ] ; then - echo "$netns -> $connect_addr ping [ FAIL ]" 1>&2 - ret=1 - return 1 - fi - - VAL="$(echo $OUT | cut -d' ' -f1-8)" - if [ "$VAL" != "10 packets transmitted, 10 received, 0% packet loss," ] - then - echo "$netns -> $connect_addr ping TEST [ FAIL ]" - echo "Expect to send and receive 10 packets and no duplicates." - echo "Full message: ${OUT}." 
- ret=1 - return 1 - fi - - return 0 -} - -stop_if_error() -{ - local msg="$1" - - if [ ${ret} -ne 0 ]; then - echo "FAIL: ${msg}" 1>&2 - exit ${ret} - fi -} - do_complete_ping_test() { echo "INFO: Initial validation ping." @@ -248,27 +166,15 @@ setup_hsr_interfaces() ip -net "$ns3" link set hsr3 up } -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi +check_prerequisites +setup_ns ns1 ns2 ns3 -trap cleanup EXIT - -for i in "$ns1" "$ns2" "$ns3" ;do - ip netns add $i || exit $ksft_skip - ip -net $i link set lo up -done +trap cleanup_all_ns EXIT setup_hsr_interfaces 0 do_complete_ping_test -cleanup -for i in "$ns1" "$ns2" "$ns3" ;do - ip netns add $i || exit $ksft_skip - ip -net $i link set lo up -done +setup_ns ns1 ns2 ns3 setup_hsr_interfaces 1 do_complete_ping_test diff --git a/tools/testing/selftests/net/hsr/hsr_redbox.sh b/tools/testing/selftests/net/hsr/hsr_redbox.sh new file mode 100755 index 0000000000..1f36785347 --- /dev/null +++ b/tools/testing/selftests/net/hsr/hsr_redbox.sh @@ -0,0 +1,121 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ipv6=false + +source ./hsr_common.sh + +do_complete_ping_test() +{ + echo "INFO: Initial validation ping (HSR-SAN/RedBox)." + # Each node has to be able to reach each one. + do_ping "${ns1}" 100.64.0.2 + do_ping "${ns2}" 100.64.0.1 + # Ping between SANs (test bridge) + do_ping "${ns4}" 100.64.0.51 + do_ping "${ns5}" 100.64.0.41 + # Ping from SANs to hsr1 (via hsr2) (and opposite) + do_ping "${ns3}" 100.64.0.1 + do_ping "${ns1}" 100.64.0.3 + do_ping "${ns1}" 100.64.0.41 + do_ping "${ns4}" 100.64.0.1 + do_ping "${ns1}" 100.64.0.51 + do_ping "${ns5}" 100.64.0.1 + stop_if_error "Initial validation failed." + + # Wait for MGNT HSR frames being received and nodes being + # merged. + sleep 5 + + echo "INFO: Longer ping test (HSR-SAN/RedBox)." 
+ # Ping from SAN to hsr1 (via hsr2) + do_ping_long "${ns3}" 100.64.0.1 + # Ping from hsr1 (via hsr2) to SANs (and opposite) + do_ping_long "${ns1}" 100.64.0.3 + do_ping_long "${ns1}" 100.64.0.41 + do_ping_long "${ns4}" 100.64.0.1 + do_ping_long "${ns1}" 100.64.0.51 + do_ping_long "${ns5}" 100.64.0.1 + stop_if_error "Longer ping test failed." + + echo "INFO: All good." +} + +setup_hsr_interfaces() +{ + local HSRv="$1" + + echo "INFO: preparing interfaces for HSRv${HSRv} (HSR-SAN/RedBox)." +# +# IPv4 addresses (100.64.X.Y/24), and [X.Y] is presented on below diagram: +# +# +# |NS1 | |NS4 | +# | [0.1] | | | +# | /-- hsr1 --\ | | [0.41] | +# | ns1eth1 ns1eth2 | | ns4eth1 (SAN) | +# |------------------------| |-------------------| +# | | | +# | | | +# | | | +# |------------------------| |-------------------------------| +# | ns2eth1 ns2eth2 | | ns3eth2 | +# | \-- hsr2 --/ | | / | +# | [0.2] \ | | / | |------------| +# | ns2eth3 |---| ns3eth1 -- ns3br1 -- ns3eth3--|--| ns5eth1 | +# | (interlink)| | [0.3] [0.11] | | [0.51] | +# |NS2 (RedBOX) | |NS3 (BR) | | NS5 (SAN) | +# +# + # Check if iproute2 supports adding interlink port to hsrX device + ip link help hsr | grep -q INTERLINK + [ $? 
-ne 0 ] && { echo "iproute2: HSR interlink interface not supported!"; exit 0; } + + # Create interfaces for name spaces + ip link add ns1eth1 netns "${ns1}" type veth peer name ns2eth1 netns "${ns2}" + ip link add ns1eth2 netns "${ns1}" type veth peer name ns2eth2 netns "${ns2}" + ip link add ns2eth3 netns "${ns2}" type veth peer name ns3eth1 netns "${ns3}" + ip link add ns3eth2 netns "${ns3}" type veth peer name ns4eth1 netns "${ns4}" + ip link add ns3eth3 netns "${ns3}" type veth peer name ns5eth1 netns "${ns5}" + + sleep 1 + + ip -n "${ns1}" link set ns1eth1 up + ip -n "${ns1}" link set ns1eth2 up + + ip -n "${ns2}" link set ns2eth1 up + ip -n "${ns2}" link set ns2eth2 up + ip -n "${ns2}" link set ns2eth3 up + + ip -n "${ns3}" link add name ns3br1 type bridge + ip -n "${ns3}" link set ns3br1 up + ip -n "${ns3}" link set ns3eth1 master ns3br1 up + ip -n "${ns3}" link set ns3eth2 master ns3br1 up + ip -n "${ns3}" link set ns3eth3 master ns3br1 up + + ip -n "${ns4}" link set ns4eth1 up + ip -n "${ns5}" link set ns5eth1 up + + ip -net "${ns1}" link add name hsr1 type hsr slave1 ns1eth1 slave2 ns1eth2 supervision 45 version ${HSRv} proto 0 + ip -net "${ns2}" link add name hsr2 type hsr slave1 ns2eth1 slave2 ns2eth2 interlink ns2eth3 supervision 45 version ${HSRv} proto 0 + + ip -n "${ns1}" addr add 100.64.0.1/24 dev hsr1 + ip -n "${ns2}" addr add 100.64.0.2/24 dev hsr2 + ip -n "${ns3}" addr add 100.64.0.11/24 dev ns3br1 + ip -n "${ns3}" addr add 100.64.0.3/24 dev ns3eth1 + ip -n "${ns4}" addr add 100.64.0.41/24 dev ns4eth1 + ip -n "${ns5}" addr add 100.64.0.51/24 dev ns5eth1 + + ip -n "${ns1}" link set hsr1 up + ip -n "${ns2}" link set hsr2 up +} + +check_prerequisites +setup_ns ns1 ns2 ns3 ns4 ns5 + +trap cleanup_all_ns EXIT + +setup_hsr_interfaces 1 +do_complete_ping_test + +exit $ret diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index 16372ca167..9155c914c0 100644 --- a/tools/testing/selftests/net/lib.sh +++ 
b/tools/testing/selftests/net/lib.sh @@ -4,11 +4,16 @@ ############################################################################## # Defines -WAIT_TIMEOUT=${WAIT_TIMEOUT:=20} +: "${WAIT_TIMEOUT:=20}" + BUSYWAIT_TIMEOUT=$((WAIT_TIMEOUT * 1000)) # ms -# Kselftest framework requirement - SKIP code is 4. +# Kselftest framework constants. +ksft_pass=0 +ksft_fail=1 +ksft_xfail=2 ksft_skip=4 + # namespace list created by setup_ns NS_LIST=() diff --git a/tools/testing/selftests/net/lib/.gitignore b/tools/testing/selftests/net/lib/.gitignore new file mode 100644 index 0000000000..1ebc6187f4 --- /dev/null +++ b/tools/testing/selftests/net/lib/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +csum diff --git a/tools/testing/selftests/net/lib/Makefile b/tools/testing/selftests/net/lib/Makefile new file mode 100644 index 0000000000..82c3264b11 --- /dev/null +++ b/tools/testing/selftests/net/lib/Makefile @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0 + +CFLAGS = -Wall -Wl,--no-as-needed -O2 -g +CFLAGS += -I../../../../../usr/include/ $(KHDR_INCLUDES) +# Additional include paths needed by kselftest.h +CFLAGS += -I../../ + +TEST_FILES := ../../../../../Documentation/netlink/specs +TEST_FILES += ../../../../net/ynl + +TEST_GEN_FILES += csum + +TEST_INCLUDES := $(wildcard py/*.py) + +include ../../lib.mk diff --git a/tools/testing/selftests/net/lib/csum.c b/tools/testing/selftests/net/lib/csum.c new file mode 100644 index 0000000000..b9f3fc3c34 --- /dev/null +++ b/tools/testing/selftests/net/lib/csum.c @@ -0,0 +1,1000 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* Test hardware checksum offload: Rx + Tx, IPv4 + IPv6, TCP + UDP. + * + * The test runs on two machines to exercise the NIC. For this reason it + * is not integrated in kselftests. + * + * CMD=$((./csum -[46] -[tu] -S $SADDR -D $DADDR -[RT] -r 1 $EXTRA_ARGS)) + * + * Rx: + * + * The sender sends packets with a known checksum field using PF_INET(6) + * SOCK_RAW sockets. 
+ * + * good packet: $CMD [-t] + * bad packet: $CMD [-t] -E + * + * The receiver reads UDP packets with a UDP socket. This is not an + * option for TCP packets ('-t'). Optionally insert an iptables filter + * to avoid these entering the real protocol stack. + * + * The receiver also reads all packets with a PF_PACKET socket, to + * observe whether both good and bad packets arrive on the host. And to + * read the optional TP_STATUS_CSUM_VALID bit. This requires setting + * option PACKET_AUXDATA, and works only for CHECKSUM_UNNECESSARY. + * + * Tx: + * + * The sender needs to build CHECKSUM_PARTIAL packets to exercise tx + * checksum offload. + * + * The sender can sends packets with a UDP socket. + * + * Optionally crafts a packet that sums up to zero to verify that the + * device writes negative zero 0xFFFF in this case to distinguish from + * 0x0000 (checksum disabled), as required by RFC 768. Hit this case + * by choosing a specific source port. + * + * good packet: $CMD -U + * zero csum: $CMD -U -Z + * + * The sender can also build packets with PF_PACKET with PACKET_VNET_HDR, + * to cover more protocols. PF_PACKET requires passing src and dst mac + * addresses. + * + * good packet: $CMD -s $smac -d $dmac -p [-t] + * + * Argument '-z' sends UDP packets with a 0x000 checksum disabled field, + * to verify that the NIC passes these packets unmodified. + * + * Argument '-e' adds a transport mode encapsulation header between + * network and transport header. This will fail for devices that parse + * headers. Should work on devices that implement protocol agnostic tx + * checksum offload (NETIF_F_HW_CSUM). + * + * Argument '-r $SEED' optionally randomizes header, payload and length + * to increase coverage between packets sent. SEED 1 further chooses a + * different seed for each run (and logs this for reproducibility). It + * is advised to enable this for extra coverage in continuous testing. 
+ */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kselftest.h" + +static bool cfg_bad_csum; +static int cfg_family = PF_INET6; +static int cfg_num_pkt = 4; +static bool cfg_do_rx = true; +static bool cfg_do_tx = true; +static bool cfg_encap; +static char *cfg_ifname = "eth0"; +static char *cfg_mac_dst; +static char *cfg_mac_src; +static int cfg_proto = IPPROTO_UDP; +static int cfg_payload_char = 'a'; +static int cfg_payload_len = 100; +static uint16_t cfg_port_dst = 34000; +static uint16_t cfg_port_src = 33000; +static uint16_t cfg_port_src_encap = 33001; +static unsigned int cfg_random_seed; +static int cfg_rcvbuf = 1 << 22; /* be able to queue large cfg_num_pkt */ +static bool cfg_send_pfpacket; +static bool cfg_send_udp; +static int cfg_timeout_ms = 2000; +static bool cfg_zero_disable; /* skip checksum: set to zero (udp only) */ +static bool cfg_zero_sum; /* create packet that adds up to zero */ + +static struct sockaddr_in cfg_daddr4 = {.sin_family = AF_INET}; +static struct sockaddr_in cfg_saddr4 = {.sin_family = AF_INET}; +static struct sockaddr_in6 cfg_daddr6 = {.sin6_family = AF_INET6}; +static struct sockaddr_in6 cfg_saddr6 = {.sin6_family = AF_INET6}; + +#define ENC_HEADER_LEN (sizeof(struct udphdr) + sizeof(struct udp_encap_hdr)) +#define MAX_HEADER_LEN (sizeof(struct ipv6hdr) + ENC_HEADER_LEN + sizeof(struct tcphdr)) +#define MAX_PAYLOAD_LEN 1024 + +/* Trivial demo encap. Stand-in for transport layer protocols like ESP or PSP */ +struct udp_encap_hdr { + uint8_t nexthdr; + uint8_t padding[3]; +}; + +/* Ipaddrs, for pseudo csum. 
Global var is ugly, pass through funcs was worse */ +static void *iph_addr_p; + +static unsigned long gettimeofday_ms(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return (tv.tv_sec * 1000UL) + (tv.tv_usec / 1000UL); +} + +static uint32_t checksum_nofold(char *data, size_t len, uint32_t sum) +{ + uint16_t *words = (uint16_t *)data; + int i; + + for (i = 0; i < len / 2; i++) + sum += words[i]; + + if (len & 1) + sum += ((unsigned char *)data)[len - 1]; + + return sum; +} + +static uint16_t checksum_fold(void *data, size_t len, uint32_t sum) +{ + sum = checksum_nofold(data, len, sum); + + while (sum > 0xFFFF) + sum = (sum & 0xFFFF) + (sum >> 16); + + return ~sum; +} + +static uint16_t checksum(void *th, uint16_t proto, size_t len) +{ + uint32_t sum; + int alen; + + alen = cfg_family == PF_INET6 ? 32 : 8; + + sum = checksum_nofold(iph_addr_p, alen, 0); + sum += htons(proto); + sum += htons(len); + + /* With CHECKSUM_PARTIAL kernel expects non-inverted pseudo csum */ + if (cfg_do_tx && cfg_send_pfpacket) + return ~checksum_fold(NULL, 0, sum); + else + return checksum_fold(th, len, sum); +} + +static void *build_packet_ipv4(void *_iph, uint8_t proto, unsigned int len) +{ + struct iphdr *iph = _iph; + + memset(iph, 0, sizeof(*iph)); + + iph->version = 4; + iph->ihl = 5; + iph->ttl = 8; + iph->protocol = proto; + iph->saddr = cfg_saddr4.sin_addr.s_addr; + iph->daddr = cfg_daddr4.sin_addr.s_addr; + iph->tot_len = htons(sizeof(*iph) + len); + iph->check = checksum_fold(iph, sizeof(*iph), 0); + + iph_addr_p = &iph->saddr; + + return iph + 1; +} + +static void *build_packet_ipv6(void *_ip6h, uint8_t proto, unsigned int len) +{ + struct ipv6hdr *ip6h = _ip6h; + + memset(ip6h, 0, sizeof(*ip6h)); + + ip6h->version = 6; + ip6h->payload_len = htons(len); + ip6h->nexthdr = proto; + ip6h->hop_limit = 64; + ip6h->saddr = cfg_saddr6.sin6_addr; + ip6h->daddr = cfg_daddr6.sin6_addr; + + iph_addr_p = &ip6h->saddr; + + return ip6h + 1; +} + +static void 
*build_packet_udp(void *_uh) +{ + struct udphdr *uh = _uh; + + uh->source = htons(cfg_port_src); + uh->dest = htons(cfg_port_dst); + uh->len = htons(sizeof(*uh) + cfg_payload_len); + uh->check = 0; + + /* choose source port so that uh->check adds up to zero */ + if (cfg_zero_sum) { + uh->source = 0; + uh->source = checksum(uh, IPPROTO_UDP, sizeof(*uh) + cfg_payload_len); + + fprintf(stderr, "tx: changing sport: %hu -> %hu\n", + cfg_port_src, ntohs(uh->source)); + cfg_port_src = ntohs(uh->source); + } + + if (cfg_zero_disable) + uh->check = 0; + else + uh->check = checksum(uh, IPPROTO_UDP, sizeof(*uh) + cfg_payload_len); + + if (cfg_bad_csum) + uh->check = ~uh->check; + + fprintf(stderr, "tx: sending checksum: 0x%x\n", uh->check); + return uh + 1; +} + +static void *build_packet_tcp(void *_th) +{ + struct tcphdr *th = _th; + + th->source = htons(cfg_port_src); + th->dest = htons(cfg_port_dst); + th->doff = 5; + th->check = 0; + + th->check = checksum(th, IPPROTO_TCP, sizeof(*th) + cfg_payload_len); + + if (cfg_bad_csum) + th->check = ~th->check; + + fprintf(stderr, "tx: sending checksum: 0x%x\n", th->check); + return th + 1; +} + +static char *build_packet_udp_encap(void *_uh) +{ + struct udphdr *uh = _uh; + struct udp_encap_hdr *eh = _uh + sizeof(*uh); + + /* outer dst == inner dst, to simplify BPF filter + * outer src != inner src, to demultiplex on recv + */ + uh->dest = htons(cfg_port_dst); + uh->source = htons(cfg_port_src_encap); + uh->check = 0; + uh->len = htons(sizeof(*uh) + + sizeof(*eh) + + sizeof(struct tcphdr) + + cfg_payload_len); + + eh->nexthdr = IPPROTO_TCP; + + return build_packet_tcp(eh + 1); +} + +static char *build_packet(char *buf, int max_len, int *len) +{ + uint8_t proto; + char *off; + int tlen; + + if (cfg_random_seed) { + int *buf32 = (void *)buf; + int i; + + for (i = 0; i < (max_len / sizeof(int)); i++) + buf32[i] = rand(); + } else { + memset(buf, cfg_payload_char, max_len); + } + + if (cfg_proto == IPPROTO_UDP) + tlen = sizeof(struct 
udphdr) + cfg_payload_len; + else + tlen = sizeof(struct tcphdr) + cfg_payload_len; + + if (cfg_encap) { + proto = IPPROTO_UDP; + tlen += ENC_HEADER_LEN; + } else { + proto = cfg_proto; + } + + if (cfg_family == PF_INET) + off = build_packet_ipv4(buf, proto, tlen); + else + off = build_packet_ipv6(buf, proto, tlen); + + if (cfg_encap) + off = build_packet_udp_encap(off); + else if (cfg_proto == IPPROTO_UDP) + off = build_packet_udp(off); + else + off = build_packet_tcp(off); + + /* only pass the payload, but still compute headers for cfg_zero_sum */ + if (cfg_send_udp) { + *len = cfg_payload_len; + return off; + } + + *len = off - buf + cfg_payload_len; + return buf; +} + +static int open_inet(int ipproto, int protocol) +{ + int fd; + + fd = socket(cfg_family, ipproto, protocol); + if (fd == -1) + error(1, errno, "socket inet"); + + if (cfg_family == PF_INET6) { + /* may have been updated by cfg_zero_sum */ + cfg_saddr6.sin6_port = htons(cfg_port_src); + + if (bind(fd, (void *)&cfg_saddr6, sizeof(cfg_saddr6))) + error(1, errno, "bind dgram 6"); + if (connect(fd, (void *)&cfg_daddr6, sizeof(cfg_daddr6))) + error(1, errno, "connect dgram 6"); + } else { + /* may have been updated by cfg_zero_sum */ + cfg_saddr4.sin_port = htons(cfg_port_src); + + if (bind(fd, (void *)&cfg_saddr4, sizeof(cfg_saddr4))) + error(1, errno, "bind dgram 4"); + if (connect(fd, (void *)&cfg_daddr4, sizeof(cfg_daddr4))) + error(1, errno, "connect dgram 4"); + } + + return fd; +} + +static int open_packet(void) +{ + int fd, one = 1; + + fd = socket(PF_PACKET, SOCK_RAW, 0); + if (fd == -1) + error(1, errno, "socket packet"); + + if (setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &one, sizeof(one))) + error(1, errno, "setsockopt packet_vnet_ndr"); + + return fd; +} + +static void send_inet(int fd, const char *buf, int len) +{ + int ret; + + ret = write(fd, buf, len); + if (ret == -1) + error(1, errno, "write"); + if (ret != len) + error(1, 0, "write: %d", ret); +} + +static void 
eth_str_to_addr(const char *str, unsigned char *eth) +{ + if (sscanf(str, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", + &eth[0], &eth[1], &eth[2], &eth[3], &eth[4], &eth[5]) != 6) + error(1, 0, "cannot parse mac addr %s", str); +} + +static void send_packet(int fd, const char *buf, int len) +{ + struct virtio_net_hdr vh = {0}; + struct sockaddr_ll addr = {0}; + struct msghdr msg = {0}; + struct ethhdr eth; + struct iovec iov[3]; + int ret; + + addr.sll_family = AF_PACKET; + addr.sll_halen = ETH_ALEN; + addr.sll_ifindex = if_nametoindex(cfg_ifname); + if (!addr.sll_ifindex) + error(1, errno, "if_nametoindex %s", cfg_ifname); + + vh.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; + if (cfg_family == PF_INET6) { + vh.csum_start = sizeof(struct ethhdr) + sizeof(struct ipv6hdr); + addr.sll_protocol = htons(ETH_P_IPV6); + } else { + vh.csum_start = sizeof(struct ethhdr) + sizeof(struct iphdr); + addr.sll_protocol = htons(ETH_P_IP); + } + + if (cfg_encap) + vh.csum_start += ENC_HEADER_LEN; + + if (cfg_proto == IPPROTO_TCP) { + vh.csum_offset = __builtin_offsetof(struct tcphdr, check); + vh.hdr_len = vh.csum_start + sizeof(struct tcphdr); + } else { + vh.csum_offset = __builtin_offsetof(struct udphdr, check); + vh.hdr_len = vh.csum_start + sizeof(struct udphdr); + } + + eth_str_to_addr(cfg_mac_src, eth.h_source); + eth_str_to_addr(cfg_mac_dst, eth.h_dest); + eth.h_proto = addr.sll_protocol; + + iov[0].iov_base = &vh; + iov[0].iov_len = sizeof(vh); + + iov[1].iov_base = &eth; + iov[1].iov_len = sizeof(eth); + + iov[2].iov_base = (void *)buf; + iov[2].iov_len = len; + + msg.msg_iov = iov; + msg.msg_iovlen = ARRAY_SIZE(iov); + + msg.msg_name = &addr; + msg.msg_namelen = sizeof(addr); + + ret = sendmsg(fd, &msg, 0); + if (ret == -1) + error(1, errno, "sendmsg packet"); + if (ret != sizeof(vh) + sizeof(eth) + len) + error(1, errno, "sendmsg packet: %u", ret); +} + +static int recv_prepare_udp(void) +{ + int fd; + + fd = socket(cfg_family, SOCK_DGRAM, 0); + if (fd == -1) + error(1, errno, "socket r"); + + if
(setsockopt(fd, SOL_SOCKET, SO_RCVBUF, + &cfg_rcvbuf, sizeof(cfg_rcvbuf))) + error(1, errno, "setsockopt SO_RCVBUF r"); + + if (cfg_family == PF_INET6) { + if (bind(fd, (void *)&cfg_daddr6, sizeof(cfg_daddr6))) + error(1, errno, "bind r"); + } else { + if (bind(fd, (void *)&cfg_daddr4, sizeof(cfg_daddr4))) + error(1, errno, "bind r"); + } + + return fd; +} + +/* Filter out all traffic that is not cfg_proto with our destination port. + * + * Otherwise background noise may cause PF_PACKET receive queue overflow, + * dropping the expected packets and failing the test. + */ +static void __recv_prepare_packet_filter(int fd, int off_nexthdr, int off_dport) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD + BPF_B + BPF_ABS, SKF_AD_OFF + SKF_AD_PKTTYPE), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, PACKET_HOST, 0, 4), + BPF_STMT(BPF_LD + BPF_B + BPF_ABS, off_nexthdr), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, cfg_encap ? IPPROTO_UDP : cfg_proto, 0, 2), + BPF_STMT(BPF_LD + BPF_H + BPF_ABS, off_dport), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, cfg_port_dst, 1, 0), + BPF_STMT(BPF_RET + BPF_K, 0), + BPF_STMT(BPF_RET + BPF_K, 0xFFFF), + }; + struct sock_fprog prog = {}; + + prog.filter = filter; + prog.len = ARRAY_SIZE(filter); + if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog))) + error(1, errno, "setsockopt filter"); +} + +static void recv_prepare_packet_filter(int fd) +{ + const int off_dport = offsetof(struct tcphdr, dest); /* same for udp */ + + if (cfg_family == AF_INET) + __recv_prepare_packet_filter(fd, offsetof(struct iphdr, protocol), + sizeof(struct iphdr) + off_dport); + else + __recv_prepare_packet_filter(fd, offsetof(struct ipv6hdr, nexthdr), + sizeof(struct ipv6hdr) + off_dport); +} + +static void recv_prepare_packet_bind(int fd) +{ + struct sockaddr_ll laddr = {0}; + + laddr.sll_family = AF_PACKET; + + if (cfg_family == PF_INET) + laddr.sll_protocol = htons(ETH_P_IP); + else + laddr.sll_protocol = htons(ETH_P_IPV6); + + laddr.sll_ifindex = 
if_nametoindex(cfg_ifname); + if (!laddr.sll_ifindex) + error(1, 0, "if_nametoindex %s", cfg_ifname); + + if (bind(fd, (void *)&laddr, sizeof(laddr))) + error(1, errno, "bind pf_packet"); +} + +static int recv_prepare_packet(void) +{ + int fd, one = 1; + + fd = socket(PF_PACKET, SOCK_DGRAM, 0); + if (fd == -1) + error(1, errno, "socket p"); + + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, + &cfg_rcvbuf, sizeof(cfg_rcvbuf))) + error(1, errno, "setsockopt SO_RCVBUF p"); + + /* enable auxdata to recv checksum status (valid vs unknown) */ + if (setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one))) + error(1, errno, "setsockopt auxdata"); + + /* install filter to restrict packet flow to match */ + recv_prepare_packet_filter(fd); + + /* bind to address family to start packet flow */ + recv_prepare_packet_bind(fd); + + return fd; +} + +static int recv_udp(int fd) +{ + static char buf[MAX_PAYLOAD_LEN]; + int ret, count = 0; + + while (1) { + ret = recv(fd, buf, sizeof(buf), MSG_DONTWAIT); + if (ret == -1 && errno == EAGAIN) + break; + if (ret == -1) + error(1, errno, "recv r"); + + fprintf(stderr, "rx: udp: len=%u\n", ret); + count++; + } + + return count; +} + +static int recv_verify_csum(void *th, int len, uint16_t sport, uint16_t csum_field) +{ + uint16_t csum; + + csum = checksum(th, cfg_proto, len); + + fprintf(stderr, "rx: pkt: sport=%hu len=%u csum=0x%hx verify=0x%hx\n", + sport, len, csum_field, csum); + + /* csum must be zero unless cfg_bad_csum indicates bad csum */ + if (csum && !cfg_bad_csum) { + fprintf(stderr, "pkt: bad csum\n"); + return 1; + } else if (cfg_bad_csum && !csum) { + fprintf(stderr, "pkt: good csum, while bad expected\n"); + return 1; + } + + if (cfg_zero_sum && csum_field != 0xFFFF) { + fprintf(stderr, "pkt: zero csum: field should be 0xFFFF, is 0x%hx\n", csum_field); + return 1; + } + + return 0; +} + +static int recv_verify_packet_tcp(void *th, int len) +{ + struct tcphdr *tcph = th; + + if (len < sizeof(*tcph) || tcph->dest != 
htons(cfg_port_dst)) + return -1; + + return recv_verify_csum(th, len, ntohs(tcph->source), tcph->check); +} + +static int recv_verify_packet_udp_encap(void *th, int len) +{ + struct udp_encap_hdr *eh = th; + + if (len < sizeof(*eh) || eh->nexthdr != IPPROTO_TCP) + return -1; + + return recv_verify_packet_tcp(eh + 1, len - sizeof(*eh)); +} + +static int recv_verify_packet_udp(void *th, int len) +{ + struct udphdr *udph = th; + + if (len < sizeof(*udph)) + return -1; + + if (udph->dest != htons(cfg_port_dst)) + return -1; + + if (udph->source == htons(cfg_port_src_encap)) + return recv_verify_packet_udp_encap(udph + 1, + len - sizeof(*udph)); + + return recv_verify_csum(th, len, ntohs(udph->source), udph->check); +} + +static int recv_verify_packet_ipv4(void *nh, int len) +{ + struct iphdr *iph = nh; + uint16_t proto = cfg_encap ? IPPROTO_UDP : cfg_proto; + + if (len < sizeof(*iph) || iph->protocol != proto) + return -1; + + iph_addr_p = &iph->saddr; + if (proto == IPPROTO_TCP) + return recv_verify_packet_tcp(iph + 1, len - sizeof(*iph)); + else + return recv_verify_packet_udp(iph + 1, len - sizeof(*iph)); +} + +static int recv_verify_packet_ipv6(void *nh, int len) +{ + struct ipv6hdr *ip6h = nh; + uint16_t proto = cfg_encap ? 
IPPROTO_UDP : cfg_proto; + + if (len < sizeof(*ip6h) || ip6h->nexthdr != proto) + return -1; + + iph_addr_p = &ip6h->saddr; + + if (proto == IPPROTO_TCP) + return recv_verify_packet_tcp(ip6h + 1, len - sizeof(*ip6h)); + else + return recv_verify_packet_udp(ip6h + 1, len - sizeof(*ip6h)); +} + +/* return whether auxdata includes TP_STATUS_CSUM_VALID */ +static uint32_t recv_get_packet_csum_status(struct msghdr *msg) +{ + struct tpacket_auxdata *aux = NULL; + struct cmsghdr *cm; + + if (msg->msg_flags & MSG_CTRUNC) + error(1, 0, "cmsg: truncated"); + + for (cm = CMSG_FIRSTHDR(msg); cm; cm = CMSG_NXTHDR(msg, cm)) { + if (cm->cmsg_level != SOL_PACKET || + cm->cmsg_type != PACKET_AUXDATA) + error(1, 0, "cmsg: level=%d type=%d\n", + cm->cmsg_level, cm->cmsg_type); + + if (cm->cmsg_len != CMSG_LEN(sizeof(struct tpacket_auxdata))) + error(1, 0, "cmsg: len=%lu expected=%lu", + cm->cmsg_len, CMSG_LEN(sizeof(struct tpacket_auxdata))); + + aux = (void *)CMSG_DATA(cm); + } + + if (!aux) + error(1, 0, "cmsg: no auxdata"); + + return aux->tp_status; +} + +static int recv_packet(int fd) +{ + static char _buf[MAX_HEADER_LEN + MAX_PAYLOAD_LEN]; + unsigned long total = 0, bad_csums = 0, bad_validations = 0; + char ctrl[CMSG_SPACE(sizeof(struct tpacket_auxdata))]; + struct pkt *buf = (void *)_buf; + struct msghdr msg = {0}; + uint32_t tp_status; + struct iovec iov; + int len, ret; + + iov.iov_base = _buf; + iov.iov_len = sizeof(_buf); + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + msg.msg_control = ctrl; + msg.msg_controllen = sizeof(ctrl); + + while (1) { + msg.msg_flags = 0; + + len = recvmsg(fd, &msg, MSG_DONTWAIT); + if (len == -1 && errno == EAGAIN) + break; + if (len == -1) + error(1, errno, "recv p"); + + tp_status = recv_get_packet_csum_status(&msg); + + /* GRO might coalesce randomized packets. Such GSO packets are + * then reinitialized for csum offload (CHECKSUM_PARTIAL), with + * a pseudo csum. Do not try to validate these checksums. 
+ */ + if (tp_status & TP_STATUS_CSUMNOTREADY) { + fprintf(stderr, "cmsg: GSO packet has partial csum: skip\n"); + continue; + } + + if (cfg_family == PF_INET6) + ret = recv_verify_packet_ipv6(buf, len); + else + ret = recv_verify_packet_ipv4(buf, len); + + if (ret == -1 /* skip: non-matching */) + continue; + + total++; + if (ret == 1) + bad_csums++; + + /* Fail if kernel returns valid for known bad csum. + * Do not fail if kernel does not validate a good csum: + * Absence of validation does not imply invalid. + */ + if (tp_status & TP_STATUS_CSUM_VALID && cfg_bad_csum) { + fprintf(stderr, "cmsg: expected bad csum, pf_packet returns valid\n"); + bad_validations++; + } + } + + if (bad_csums || bad_validations) + error(1, 0, "rx: errors at pf_packet: total=%lu bad_csums=%lu bad_valids=%lu\n", + total, bad_csums, bad_validations); + + return total; +} + +static void parse_args(int argc, char *const argv[]) +{ + const char *daddr = NULL, *saddr = NULL; + int c; + + while ((c = getopt(argc, argv, "46d:D:eEi:l:L:n:r:PRs:S:tTuUzZ")) != -1) { + switch (c) { + case '4': + cfg_family = PF_INET; + break; + case '6': + cfg_family = PF_INET6; + break; + case 'd': + cfg_mac_dst = optarg; + break; + case 'D': + daddr = optarg; + break; + case 'e': + cfg_encap = true; + break; + case 'E': + cfg_bad_csum = true; + break; + case 'i': + cfg_ifname = optarg; + break; + case 'l': + cfg_payload_len = strtol(optarg, NULL, 0); + break; + case 'L': + cfg_timeout_ms = strtol(optarg, NULL, 0) * 1000; + break; + case 'n': + cfg_num_pkt = strtol(optarg, NULL, 0); + break; + case 'r': + cfg_random_seed = strtol(optarg, NULL, 0); + break; + case 'P': + cfg_send_pfpacket = true; + break; + case 'R': + /* only Rx: used with two machine tests */ + cfg_do_tx = false; + break; + case 's': + cfg_mac_src = optarg; + break; + case 'S': + saddr = optarg; + break; + case 't': + cfg_proto = IPPROTO_TCP; + break; + case 'T': + /* only Tx: used with two machine tests */ + cfg_do_rx = false; + break; + case 
'u': + cfg_proto = IPPROTO_UDP; + break; + case 'U': + /* send using real udp socket, + * to exercise tx checksum offload + */ + cfg_send_udp = true; + break; + case 'z': + cfg_zero_disable = true; + break; + case 'Z': + cfg_zero_sum = true; + break; + default: + error(1, 0, "unknown arg %c", c); + } + } + + if (!daddr || !saddr) + error(1, 0, "Must pass -D and -S "); + + if (cfg_do_tx && cfg_send_pfpacket && (!cfg_mac_src || !cfg_mac_dst)) + error(1, 0, "Transmit with pf_packet requires mac addresses"); + + if (cfg_payload_len > MAX_PAYLOAD_LEN) + error(1, 0, "Payload length exceeds max"); + + if (cfg_proto != IPPROTO_UDP && (cfg_zero_sum || cfg_zero_disable)) + error(1, 0, "Only UDP supports zero csum"); + + if (cfg_zero_sum && !cfg_send_udp) + error(1, 0, "Zero checksum conversion requires -U for tx csum offload"); + if (cfg_zero_sum && cfg_bad_csum) + error(1, 0, "Cannot combine zero checksum conversion and invalid checksum"); + if (cfg_zero_sum && cfg_random_seed) + error(1, 0, "Cannot combine zero checksum conversion with randomization"); + + if (cfg_family == PF_INET6) { + cfg_saddr6.sin6_port = htons(cfg_port_src); + cfg_daddr6.sin6_port = htons(cfg_port_dst); + + if (inet_pton(cfg_family, daddr, &cfg_daddr6.sin6_addr) != 1) + error(1, errno, "Cannot parse ipv6 -D"); + if (inet_pton(cfg_family, saddr, &cfg_saddr6.sin6_addr) != 1) + error(1, errno, "Cannot parse ipv6 -S"); + } else { + cfg_saddr4.sin_port = htons(cfg_port_src); + cfg_daddr4.sin_port = htons(cfg_port_dst); + + if (inet_pton(cfg_family, daddr, &cfg_daddr4.sin_addr) != 1) + error(1, errno, "Cannot parse ipv4 -D"); + if (inet_pton(cfg_family, saddr, &cfg_saddr4.sin_addr) != 1) + error(1, errno, "Cannot parse ipv4 -S"); + } + + if (cfg_do_tx && cfg_random_seed) { + /* special case: time-based seed */ + if (cfg_random_seed == 1) + cfg_random_seed = (unsigned int)gettimeofday_ms(); + srand(cfg_random_seed); + fprintf(stderr, "randomization seed: %u\n", cfg_random_seed); + } +} + +static void 
do_tx(void) +{ + static char _buf[MAX_HEADER_LEN + MAX_PAYLOAD_LEN]; + char *buf; + int fd, len, i; + + buf = build_packet(_buf, sizeof(_buf), &len); + + if (cfg_send_pfpacket) + fd = open_packet(); + else if (cfg_send_udp) + fd = open_inet(SOCK_DGRAM, 0); + else + fd = open_inet(SOCK_RAW, IPPROTO_RAW); + + for (i = 0; i < cfg_num_pkt; i++) { + if (cfg_send_pfpacket) + send_packet(fd, buf, len); + else + send_inet(fd, buf, len); + + /* randomize each packet individually to increase coverage */ + if (cfg_random_seed) { + cfg_payload_len = rand() % MAX_PAYLOAD_LEN; + buf = build_packet(_buf, sizeof(_buf), &len); + } + } + + if (close(fd)) + error(1, errno, "close tx"); +} + +static void do_rx(int fdp, int fdr) +{ + unsigned long count_udp = 0, count_pkt = 0; + long tleft, tstop; + struct pollfd pfd; + + tstop = gettimeofday_ms() + cfg_timeout_ms; + tleft = cfg_timeout_ms; + + do { + pfd.events = POLLIN; + pfd.fd = fdp; + if (poll(&pfd, 1, tleft) == -1) + error(1, errno, "poll"); + + if (pfd.revents & POLLIN) + count_pkt += recv_packet(fdp); + + if (cfg_proto == IPPROTO_UDP) + count_udp += recv_udp(fdr); + + tleft = tstop - gettimeofday_ms(); + } while (tleft > 0); + + if (close(fdr)) + error(1, errno, "close r"); + if (close(fdp)) + error(1, errno, "close p"); + + if (count_pkt < cfg_num_pkt) + error(1, 0, "rx: missing packets at pf_packet: %lu < %u", + count_pkt, cfg_num_pkt); + + if (cfg_proto == IPPROTO_UDP) { + if (cfg_bad_csum && count_udp) + error(1, 0, "rx: unexpected packets at udp"); + if (!cfg_bad_csum && !count_udp) + error(1, 0, "rx: missing packets at udp"); + } +} + +int main(int argc, char *const argv[]) +{ + int fdp = -1, fdr = -1; /* -1 to silence -Wmaybe-uninitialized */ + + parse_args(argc, argv); + + /* open receive sockets before transmitting */ + if (cfg_do_rx) { + fdp = recv_prepare_packet(); + fdr = recv_prepare_udp(); + } + + if (cfg_do_tx) + do_tx(); + + if (cfg_do_rx) + do_rx(fdp, fdr); + + fprintf(stderr, "OK\n"); + return 0; +} diff --git 
a/tools/testing/selftests/net/lib/py/__init__.py b/tools/testing/selftests/net/lib/py/__init__.py new file mode 100644 index 0000000000..b6d498d125 --- /dev/null +++ b/tools/testing/selftests/net/lib/py/__init__.py @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 + +from .consts import KSRC +from .ksft import * +from .netns import NetNS +from .nsim import * +from .utils import * +from .ynl import NlError, YnlFamily, EthtoolFamily, NetdevFamily, RtnlFamily diff --git a/tools/testing/selftests/net/lib/py/consts.py b/tools/testing/selftests/net/lib/py/consts.py new file mode 100644 index 0000000000..f518ce79d8 --- /dev/null +++ b/tools/testing/selftests/net/lib/py/consts.py @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 + +import sys +from pathlib import Path + +KSFT_DIR = (Path(__file__).parent / "../../..").resolve() +KSRC = (Path(__file__).parent / "../../../../../..").resolve() + +KSFT_MAIN_NAME = Path(sys.argv[0]).with_suffix("").name diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py new file mode 100644 index 0000000000..4769b4eb1e --- /dev/null +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -0,0 +1,159 @@ +# SPDX-License-Identifier: GPL-2.0 + +import builtins +import inspect +import sys +import time +import traceback +from .consts import KSFT_MAIN_NAME + +KSFT_RESULT = None +KSFT_RESULT_ALL = True + + +class KsftFailEx(Exception): + pass + + +class KsftSkipEx(Exception): + pass + + +class KsftXfailEx(Exception): + pass + + +def ksft_pr(*objs, **kwargs): + print("#", *objs, **kwargs) + + +def _fail(*args): + global KSFT_RESULT + KSFT_RESULT = False + + frame = inspect.stack()[2] + ksft_pr("At " + frame.filename + " line " + str(frame.lineno) + ":") + ksft_pr(*args) + + +def ksft_eq(a, b, comment=""): + global KSFT_RESULT + if a != b: + _fail("Check failed", a, "!=", b, comment) + + +def ksft_true(a, comment=""): + if not a: + _fail("Check failed", a, "does not eval to True", comment) + + +def 
ksft_in(a, b, comment=""): + if a not in b: + _fail("Check failed", a, "not in", b, comment) + + +def ksft_ge(a, b, comment=""): + if a < b: + _fail("Check failed", a, "<", b, comment) + + +class ksft_raises: + def __init__(self, expected_type): + self.exception = None + self.expected_type = expected_type + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if exc_type is None: + _fail(f"Expected exception {str(self.expected_type.__name__)}, none raised") + elif self.expected_type != exc_type: + _fail(f"Expected exception {str(self.expected_type.__name__)}, raised {str(exc_type.__name__)}") + self.exception = exc_val + # Suppress the exception if its the expected one + return self.expected_type == exc_type + + +def ksft_busy_wait(cond, sleep=0.005, deadline=1, comment=""): + end = time.monotonic() + deadline + while True: + if cond(): + return + if time.monotonic() > end: + _fail("Waiting for condition timed out", comment) + return + time.sleep(sleep) + + +def ktap_result(ok, cnt=1, case="", comment=""): + global KSFT_RESULT_ALL + KSFT_RESULT_ALL = KSFT_RESULT_ALL and ok + + res = "" + if not ok: + res += "not " + res += "ok " + res += str(cnt) + " " + res += KSFT_MAIN_NAME + if case: + res += "." + str(case.__name__) + if comment: + res += " # " + comment + print(res) + + +def ksft_run(cases=None, globs=None, case_pfx=None, args=()): + cases = cases or [] + + if globs and case_pfx: + for key, value in globs.items(): + if not callable(value): + continue + for prefix in case_pfx: + if key.startswith(prefix): + cases.append(value) + break + + totals = {"pass": 0, "fail": 0, "skip": 0, "xfail": 0} + + print("KTAP version 1") + print("1.." 
+ str(len(cases))) + + global KSFT_RESULT + cnt = 0 + for case in cases: + KSFT_RESULT = True + cnt += 1 + try: + case(*args) + except KsftSkipEx as e: + ktap_result(True, cnt, case, comment="SKIP " + str(e)) + totals['skip'] += 1 + continue + except KsftXfailEx as e: + ktap_result(True, cnt, case, comment="XFAIL " + str(e)) + totals['xfail'] += 1 + continue + except Exception as e: + tb = traceback.format_exc() + for line in tb.strip().split('\n'): + ksft_pr("Exception|", line) + ktap_result(False, cnt, case) + totals['fail'] += 1 + continue + + ktap_result(KSFT_RESULT, cnt, case) + if KSFT_RESULT: + totals['pass'] += 1 + else: + totals['fail'] += 1 + + print( + f"# Totals: pass:{totals['pass']} fail:{totals['fail']} xfail:{totals['xfail']} xpass:0 skip:{totals['skip']} error:0" + ) + + +def ksft_exit(): + global KSFT_RESULT_ALL + sys.exit(0 if KSFT_RESULT_ALL else 1) diff --git a/tools/testing/selftests/net/lib/py/netns.py b/tools/testing/selftests/net/lib/py/netns.py new file mode 100644 index 0000000000..ecff85f907 --- /dev/null +++ b/tools/testing/selftests/net/lib/py/netns.py @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: GPL-2.0 + +from .utils import ip +import random +import string + + +class NetNS: + def __init__(self, name=None): + if name: + self.name = name + else: + self.name = ''.join(random.choice(string.ascii_lowercase) for _ in range(8)) + ip('netns add ' + self.name) + + def __del__(self): + if self.name: + ip('netns del ' + self.name) + self.name = None + + def __enter__(self): + return self + + def __exit__(self, ex_type, ex_value, ex_tb): + self.__del__() + + def __str__(self): + return self.name + + def __repr__(self): + return f"NetNS({self.name})" diff --git a/tools/testing/selftests/net/lib/py/nsim.py b/tools/testing/selftests/net/lib/py/nsim.py new file mode 100644 index 0000000000..f571a8b313 --- /dev/null +++ b/tools/testing/selftests/net/lib/py/nsim.py @@ -0,0 +1,134 @@ +# SPDX-License-Identifier: GPL-2.0 + +import json +import os +import 
random +import re +import time +from .utils import cmd, ip + + +class NetdevSim: + """ + Class for netdevsim netdevice and its attributes. + """ + + def __init__(self, nsimdev, port_index, ifname, ns=None): + # In case udev renamed the netdev to according to new schema, + # check if the name matches the port_index. + nsimnamere = re.compile(r"eni\d+np(\d+)") + match = nsimnamere.match(ifname) + if match and int(match.groups()[0]) != port_index + 1: + raise Exception("netdevice name mismatches the expected one") + + self.ifname = ifname + self.nsimdev = nsimdev + self.port_index = port_index + self.ns = ns + self.dfs_dir = "%s/ports/%u/" % (nsimdev.dfs_dir, port_index) + ret = ip("-j link show dev %s" % ifname, ns=ns) + self.dev = json.loads(ret.stdout)[0] + self.ifindex = self.dev["ifindex"] + + def dfs_write(self, path, val): + self.nsimdev.dfs_write(f'ports/{self.port_index}/' + path, val) + + +class NetdevSimDev: + """ + Class for netdevsim bus device and its attributes. + """ + @staticmethod + def ctrl_write(path, val): + fullpath = os.path.join("/sys/bus/netdevsim/", path) + with open(fullpath, "w") as f: + f.write(val) + + def dfs_write(self, path, val): + fullpath = os.path.join(f"/sys/kernel/debug/netdevsim/netdevsim{self.addr}/", path) + with open(fullpath, "w") as f: + f.write(val) + + def __init__(self, port_count=1, queue_count=1, ns=None): + # nsim will spawn in init_net, we'll set to actual ns once we switch it there + self.ns = None + + if not os.path.exists("/sys/bus/netdevsim"): + cmd("modprobe netdevsim") + + addr = random.randrange(1 << 15) + while True: + try: + self.ctrl_write("new_device", "%u %u %u" % (addr, port_count, queue_count)) + except OSError as e: + if e.errno == errno.ENOSPC: + addr = random.randrange(1 << 15) + continue + raise e + break + self.addr = addr + + # As probe of netdevsim device might happen from a workqueue, + # so wait here until all netdevs appear. 
+ self.wait_for_netdevs(port_count) + + if ns: + cmd(f"devlink dev reload netdevsim/netdevsim{addr} netns {ns.name}") + self.ns = ns + + cmd("udevadm settle", ns=self.ns) + ifnames = self.get_ifnames() + + self.dfs_dir = "/sys/kernel/debug/netdevsim/netdevsim%u/" % addr + + self.nsims = [] + for port_index in range(port_count): + self.nsims.append(self._make_port(port_index, ifnames[port_index])) + + self.removed = False + + def __enter__(self): + return self + + def __exit__(self, ex_type, ex_value, ex_tb): + """ + __exit__ gets called at the end of a "with" block. + """ + self.remove() + + def _make_port(self, port_index, ifname): + return NetdevSim(self, port_index, ifname, self.ns) + + def get_ifnames(self): + ifnames = [] + listdir = cmd(f"ls /sys/bus/netdevsim/devices/netdevsim{self.addr}/net/", + ns=self.ns).stdout.split() + for ifname in listdir: + ifnames.append(ifname) + ifnames.sort() + return ifnames + + def wait_for_netdevs(self, port_count): + timeout = 5 + timeout_start = time.time() + + while True: + try: + ifnames = self.get_ifnames() + except FileNotFoundError as e: + ifnames = [] + if len(ifnames) == port_count: + break + if time.time() < timeout_start + timeout: + continue + raise Exception("netdevices did not appear within timeout") + + def remove(self): + if not self.removed: + self.ctrl_write("del_device", "%u" % (self.addr, )) + self.removed = True + + def remove_nsim(self, nsim): + self.nsims.remove(nsim) + self.ctrl_write("devices/netdevsim%u/del_port" % (self.addr, ), + "%u" % (nsim.port_index, )) diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py new file mode 100644 index 0000000000..0540ea2492 --- /dev/null +++ b/tools/testing/selftests/net/lib/py/utils.py @@ -0,0 +1,102 @@ +# SPDX-License-Identifier: GPL-2.0 + +import json as _json +import random +import re +import subprocess +import time + + +class cmd: + def __init__(self, comm, shell=True, fail=True, ns=None, background=False, 
host=None, timeout=5): + if ns: + comm = f'ip netns exec {ns} ' + comm + + self.stdout = None + self.stderr = None + self.ret = None + + self.comm = comm + if host: + self.proc = host.cmd(comm) + else: + self.proc = subprocess.Popen(comm, shell=shell, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + if not background: + self.process(terminate=False, fail=fail, timeout=timeout) + + def process(self, terminate=True, fail=None, timeout=5): + if fail is None: + fail = not terminate + + if terminate: + self.proc.terminate() + stdout, stderr = self.proc.communicate(timeout) + self.stdout = stdout.decode("utf-8") + self.stderr = stderr.decode("utf-8") + self.proc.stdout.close() + self.proc.stderr.close() + self.ret = self.proc.returncode + + if self.proc.returncode != 0 and fail: + if len(stderr) > 0 and stderr[-1] == "\n": + stderr = stderr[:-1] + raise Exception("Command failed: %s\nSTDOUT: %s\nSTDERR: %s" % + (self.proc.args, stdout, stderr)) + + +class bkg(cmd): + def __init__(self, comm, shell=True, fail=None, ns=None, host=None, + exit_wait=False): + super().__init__(comm, background=True, + shell=shell, fail=fail, ns=ns, host=host) + self.terminate = not exit_wait + self.check_fail = fail + + def __enter__(self): + return self + + def __exit__(self, ex_type, ex_value, ex_tb): + return self.process(terminate=self.terminate, fail=self.check_fail) + + +def tool(name, args, json=None, ns=None, host=None): + cmd_str = name + ' ' + if json: + cmd_str += '--json ' + cmd_str += args + cmd_obj = cmd(cmd_str, ns=ns, host=host) + if json: + return _json.loads(cmd_obj.stdout) + return cmd_obj + + +def ip(args, json=None, ns=None, host=None): + if ns: + args = f'-netns {ns} ' + args + return tool('ip', args, json=json, host=host) + + +def rand_port(): + """ + Get unprivileged port, for now just random, one day we may decide to check if used. 
+ """ + return random.randint(10000, 65535) + + +def wait_port_listen(port, proto="tcp", ns=None, host=None, sleep=0.005, deadline=5): + end = time.monotonic() + deadline + + pattern = f":{port:04X} .* " + if proto == "tcp": # for tcp protocol additionally check the socket state + pattern += "0A" + pattern = re.compile(pattern) + + while True: + data = cmd(f'cat /proc/net/{proto}*', ns=ns, host=host, shell=True).stdout + for row in data.split("\n"): + if pattern.search(row): + return + if time.monotonic() > end: + raise Exception("Waiting for port listen timed out") + time.sleep(sleep) diff --git a/tools/testing/selftests/net/lib/py/ynl.py b/tools/testing/selftests/net/lib/py/ynl.py new file mode 100644 index 0000000000..1ace58370c --- /dev/null +++ b/tools/testing/selftests/net/lib/py/ynl.py @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: GPL-2.0 + +import sys +from pathlib import Path +from .consts import KSRC, KSFT_DIR +from .ksft import ksft_pr, ktap_result + +# Resolve paths +try: + if (KSFT_DIR / "kselftest-list.txt").exists(): + # Running in "installed" selftests + tools_full_path = KSFT_DIR + SPEC_PATH = KSFT_DIR / "net/lib/specs" + + sys.path.append(tools_full_path.as_posix()) + from net.lib.ynl.lib import YnlFamily, NlError + else: + # Running in tree + tools_full_path = KSRC / "tools" + SPEC_PATH = KSRC / "Documentation/netlink/specs" + + sys.path.append(tools_full_path.as_posix()) + from net.ynl.lib import YnlFamily, NlError +except ModuleNotFoundError as e: + ksft_pr("Failed importing `ynl` library from kernel sources") + ksft_pr(str(e)) + ktap_result(True, comment="SKIP") + sys.exit(4) + +# +# Wrapper classes, loading the right specs +# Set schema='' to avoid jsonschema validation, it's slow +# +class EthtoolFamily(YnlFamily): + def __init__(self): + super().__init__((SPEC_PATH / Path('ethtool.yaml')).as_posix(), + schema='') + + +class RtnlFamily(YnlFamily): + def __init__(self): + super().__init__((SPEC_PATH / Path('rt_link.yaml')).as_posix(), + 
schema='') + + +class NetdevFamily(YnlFamily): + def __init__(self): + super().__init__((SPEC_PATH / Path('netdev.yaml')).as_posix(), + schema='') diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh index bc97ab33a0..776d43a692 100755 --- a/tools/testing/selftests/net/mptcp/diag.sh +++ b/tools/testing/selftests/net/mptcp/diag.sh @@ -200,6 +200,58 @@ chk_msk_cestab() "${expected}" "${msg}" "" } +msk_info_get_value() +{ + local port="${1}" + local info="${2}" + + ss -N "${ns}" -inHM dport "${port}" | \ + mptcp_lib_get_info_value "${info}" "${info}" +} + +chk_msk_info() +{ + local port="${1}" + local info="${2}" + local cnt="${3}" + local msg="....chk ${info}" + local delta_ms=250 # half what we waited before, just to be sure + local now + + now=$(msk_info_get_value "${port}" "${info}") + + mptcp_lib_print_title "${msg}" + if { [ -z "${cnt}" ] || [ -z "${now}" ]; } && + ! mptcp_lib_expect_all_features; then + mptcp_lib_pr_skip "Feature probably not supported" + mptcp_lib_result_skip "${msg}" + elif [ "$((cnt + delta_ms))" -lt "${now}" ]; then + mptcp_lib_pr_ok + mptcp_lib_result_pass "${msg}" + else + mptcp_lib_pr_fail "value of ${info} changed by $((now - cnt))ms," \ + "expected at least ${delta_ms}ms" + mptcp_lib_result_fail "${msg}" + ret=${KSFT_FAIL} + fi +} + +chk_last_time_info() +{ + local port="${1}" + local data_sent data_recv ack_recv + + data_sent=$(msk_info_get_value "${port}" "last_data_sent") + data_recv=$(msk_info_get_value "${port}" "last_data_recv") + ack_recv=$(msk_info_get_value "${port}" "last_ack_recv") + + sleep 0.5 # wait to check after if the timestamps difference + + chk_msk_info "${port}" "last_data_sent" "${data_sent}" + chk_msk_info "${port}" "last_data_recv" "${data_recv}" + chk_msk_info "${port}" "last_ack_recv" "${ack_recv}" +} + wait_connected() { local listener_ns="${1}" @@ -233,6 +285,7 @@ echo "b" | \ 127.0.0.1 >/dev/null & wait_connected $ns 10000 chk_msk_nr 2 "after MPC handshake " 
+chk_last_time_info 10000 chk_msk_remote_key_nr 2 "....chk remote_key" chk_msk_fallback_nr 0 "....chk no fallback" chk_msk_inuse 2 diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 4131f3263a..b77fb7065b 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -147,7 +147,7 @@ cleanup() mptcp_lib_check_mptcp mptcp_lib_check_kallsyms -mptcp_lib_check_tools ip +mptcp_lib_check_tools ip tc sin=$(mktemp) sout=$(mktemp) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 2290125f6d..108aeeb84e 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -31,7 +31,6 @@ timeout_poll=30 timeout_test=$((timeout_poll * 2 + 1)) capture=false checksum=false -ip_mptcp=0 check_invert=0 validate_checksum=false init=0 @@ -142,7 +141,7 @@ init() { mptcp_lib_check_mptcp mptcp_lib_check_kallsyms - mptcp_lib_check_tools ip ss "${iptables}" "${ip6tables}" + mptcp_lib_check_tools ip tc ss "${iptables}" "${ip6tables}" sin=$(mktemp) sout=$(mktemp) @@ -610,173 +609,65 @@ kill_events_pids() pm_nl_set_limits() { - local ns=$1 - local addrs=$2 - local subflows=$3 - - if [ $ip_mptcp -eq 1 ]; then - ip -n $ns mptcp limits set add_addr_accepted $addrs subflows $subflows - else - ip netns exec $ns ./pm_nl_ctl limits $addrs $subflows - fi + mptcp_lib_pm_nl_set_limits "${@}" } pm_nl_add_endpoint() { - local ns=$1 - local addr=$2 - local flags _flags - local port _port - local dev _dev - local id _id - local nr=2 - - local p - for p in "${@}" - do - if [ $p = "flags" ]; then - eval _flags=\$"$nr" - [ -n "$_flags" ]; flags="flags $_flags" - fi - if [ $p = "dev" ]; then - eval _dev=\$"$nr" - [ -n "$_dev" ]; dev="dev $_dev" - fi - if [ $p = "id" ]; then - eval _id=\$"$nr" - [ -n "$_id" ]; id="id $_id" - fi - if [ $p = "port" ]; then - eval 
_port=\$"$nr" - [ -n "$_port" ]; port="port $_port" - fi - - nr=$((nr + 1)) - done - - if [ $ip_mptcp -eq 1 ]; then - ip -n $ns mptcp endpoint add $addr ${_flags//","/" "} $dev $id $port - else - ip netns exec $ns ./pm_nl_ctl add $addr $flags $dev $id $port - fi + mptcp_lib_pm_nl_add_endpoint "${@}" } pm_nl_del_endpoint() { - local ns=$1 - local id=$2 - local addr=$3 - - if [ $ip_mptcp -eq 1 ]; then - [ $id -ne 0 ] && addr='' - ip -n $ns mptcp endpoint delete id $id $addr - else - ip netns exec $ns ./pm_nl_ctl del $id $addr - fi + mptcp_lib_pm_nl_del_endpoint "${@}" } pm_nl_flush_endpoint() { - local ns=$1 - - if [ $ip_mptcp -eq 1 ]; then - ip -n $ns mptcp endpoint flush - else - ip netns exec $ns ./pm_nl_ctl flush - fi + mptcp_lib_pm_nl_flush_endpoint "${@}" } pm_nl_show_endpoints() { - local ns=$1 - - if [ $ip_mptcp -eq 1 ]; then - ip -n $ns mptcp endpoint show - else - ip netns exec $ns ./pm_nl_ctl dump - fi + mptcp_lib_pm_nl_show_endpoints "${@}" } pm_nl_change_endpoint() { - local ns=$1 - local id=$2 - local flags=$3 - - if [ $ip_mptcp -eq 1 ]; then - ip -n $ns mptcp endpoint change id $id ${flags//","/" "} - else - ip netns exec $ns ./pm_nl_ctl set id $id flags $flags - fi + mptcp_lib_pm_nl_change_endpoint "${@}" } pm_nl_check_endpoint() { - local line expected_line local msg="$1" local ns=$2 local addr=$3 - local _flags="" - local flags - local _port - local port - local dev - local _id - local id + local flags dev id port print_check "${msg}" shift 3 while [ -n "$1" ]; do - if [ $1 = "flags" ]; then - _flags=$2 - [ -n "$_flags" ]; flags="flags $_flags" - shift - elif [ $1 = "dev" ]; then - [ -n "$2" ]; dev="dev $2" + case "${1}" in + "flags" | "dev" | "id" | "port") + eval "${1}"="${2}" shift - elif [ $1 = "id" ]; then - _id=$2 - [ -n "$_id" ]; id="id $_id" - shift - elif [ $1 = "port" ]; then - _port=$2 - [ -n "$_port" ]; port=" port $_port" - shift - fi + ;; + *) + ;; + esac shift done - if [ -z "$id" ]; then + if [ -z "${id}" ]; then test_fail "bad test 
- missing endpoint id" return fi - if [ $ip_mptcp -eq 1 ]; then - # get line and trim trailing whitespace - line=$(ip -n $ns mptcp endpoint show $id) - line="${line% }" - # the dump order is: address id flags port dev - [ -n "$addr" ] && expected_line="$addr" - expected_line+=" $id" - [ -n "$_flags" ] && expected_line+=" ${_flags//","/" "}" - [ -n "$dev" ] && expected_line+=" $dev" - [ -n "$port" ] && expected_line+=" $port" - else - line=$(ip netns exec $ns ./pm_nl_ctl get $_id) - # the dump order is: id flags dev address port - expected_line="$id" - [ -n "$flags" ] && expected_line+=" $flags" - [ -n "$dev" ] && expected_line+=" $dev" - [ -n "$addr" ] && expected_line+=" $addr" - [ -n "$_port" ] && expected_line+=" $_port" - fi - if [ "$line" = "$expected_line" ]; then - print_ok - else - fail_test "expected '$expected_line' found '$line'" - fi + check_output "mptcp_lib_pm_nl_get_endpoint ${ns} ${id}" \ + "$(mptcp_lib_pm_nl_format_endpoints \ + "${id},${addr},${flags//","/" "},${dev},${port}")" } pm_nl_set_endpoint() @@ -3711,7 +3602,7 @@ while getopts "${all_tests_args}cCih" opt; do checksum=true ;; i) - ip_mptcp=1 + mptcp_lib_set_ip_mptcp ;; h) usage diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index d529b4b37a..6ffa9b7a32 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -21,8 +21,10 @@ declare -rx MPTCP_LIB_AF_INET6=10 MPTCP_LIB_SUBTESTS=() MPTCP_LIB_SUBTESTS_DUPLICATED=0 +MPTCP_LIB_SUBTEST_FLAKY=0 MPTCP_LIB_TEST_COUNTER=0 MPTCP_LIB_TEST_FORMAT="%02u %-50s" +MPTCP_LIB_IP_MPTCP=0 # only if supported (or forced) and not disabled, see no-color.org if { [ -t 1 ] || [ "${SELFTESTS_MPTCP_LIB_COLOR_FORCE:-}" = "1" ]; } && @@ -40,6 +42,16 @@ else readonly MPTCP_LIB_COLOR_RESET= fi +# SELFTESTS_MPTCP_LIB_OVERRIDE_FLAKY env var can be set not to ignore errors +# from subtests marked as flaky +mptcp_lib_override_flaky() { + [ 
"${SELFTESTS_MPTCP_LIB_OVERRIDE_FLAKY:-}" = 1 ] +} + +mptcp_lib_subtest_is_flaky() { + [ "${MPTCP_LIB_SUBTEST_FLAKY}" = 1 ] && ! mptcp_lib_override_flaky +} + # $1: color, $2: text mptcp_lib_print_color() { echo -e "${MPTCP_LIB_START_PRINT:-}${*}${MPTCP_LIB_COLOR_RESET}" @@ -71,7 +83,16 @@ mptcp_lib_pr_skip() { } mptcp_lib_pr_fail() { - mptcp_lib_print_err "[FAIL]${1:+ ${*}}" + local title cmt + + if mptcp_lib_subtest_is_flaky; then + title="IGNO" + cmt=" (flaky)" + else + title="FAIL" + fi + + mptcp_lib_print_err "[${title}]${cmt}${1:+ ${*}}" } mptcp_lib_pr_info() { @@ -207,7 +228,13 @@ mptcp_lib_result_pass() { # $1: test name mptcp_lib_result_fail() { - __mptcp_lib_result_add "not ok" "${1}" + if mptcp_lib_subtest_is_flaky; then + # It might sound better to use 'not ok # TODO' or 'ok # SKIP', + # but some CIs don't understand 'TODO' and treat SKIP as errors. + __mptcp_lib_result_add "ok" "${1} # IGNORE Flaky" + else + __mptcp_lib_result_add "not ok" "${1}" + fi } # $1: test name @@ -384,6 +411,12 @@ mptcp_lib_check_tools() { exit ${KSFT_SKIP} fi ;; + "tc") + if ! tc -help &> /dev/null; then + mptcp_lib_pr_skip "Could not run test without tc tool" + exit ${KSFT_SKIP} + fi + ;; "ss") if ! 
ss -h | grep -q MPTCP; then mptcp_lib_pr_skip "ss tool does not support MPTCP" @@ -505,3 +538,131 @@ mptcp_lib_verify_listener_events() { mptcp_lib_check_expected "type" "family" "saddr" "sport" || rc="${?}" return "${rc}" } + +mptcp_lib_set_ip_mptcp() { + MPTCP_LIB_IP_MPTCP=1 +} + +mptcp_lib_is_ip_mptcp() { + [ "${MPTCP_LIB_IP_MPTCP}" = "1" ] +} + +# format: ,,, +mptcp_lib_pm_nl_format_endpoints() { + local entry id ip flags dev port + + for entry in "${@}"; do + IFS=, read -r id ip flags dev port <<< "${entry}" + if mptcp_lib_is_ip_mptcp; then + echo -n "${ip}" + [ -n "${port}" ] && echo -n " port ${port}" + echo -n " id ${id}" + [ -n "${flags}" ] && echo -n " ${flags}" + [ -n "${dev}" ] && echo -n " dev ${dev}" + echo " " # always a space at the end + else + echo -n "id ${id}" + echo -n " flags ${flags//" "/","}" + [ -n "${dev}" ] && echo -n " dev ${dev}" + echo -n " ${ip}" + [ -n "${port}" ] && echo -n " ${port}" + echo "" + fi + done +} + +mptcp_lib_pm_nl_get_endpoint() { + local ns=${1} + local id=${2} + + if mptcp_lib_is_ip_mptcp; then + ip -n "${ns}" mptcp endpoint show id "${id}" + else + ip netns exec "${ns}" ./pm_nl_ctl get "${id}" + fi +} + +mptcp_lib_pm_nl_set_limits() { + local ns=${1} + local addrs=${2} + local subflows=${3} + + if mptcp_lib_is_ip_mptcp; then + ip -n "${ns}" mptcp limits set add_addr_accepted "${addrs}" subflows "${subflows}" + else + ip netns exec "${ns}" ./pm_nl_ctl limits "${addrs}" "${subflows}" + fi +} + +mptcp_lib_pm_nl_add_endpoint() { + local ns=${1} + local addr=${2} + local flags dev id port + local nr=2 + + local p + for p in "${@}"; do + case "${p}" in + "flags" | "dev" | "id" | "port") + eval "${p}"=\$"${nr}" + ;; + esac + + nr=$((nr + 1)) + done + + if mptcp_lib_is_ip_mptcp; then + # shellcheck disable=SC2086 # blanks in flags, no double quote + ip -n "${ns}" mptcp endpoint add "${addr}" ${flags//","/" "} \ + ${dev:+dev "${dev}"} ${id:+id "${id}"} ${port:+port "${port}"} + else + ip netns exec "${ns}" ./pm_nl_ctl add 
"${addr}" ${flags:+flags "${flags}"} \ + ${dev:+dev "${dev}"} ${id:+id "${id}"} ${port:+port "${port}"} + fi +} + +mptcp_lib_pm_nl_del_endpoint() { + local ns=${1} + local id=${2} + local addr=${3} + + if mptcp_lib_is_ip_mptcp; then + [ "${id}" -ne 0 ] && addr='' + ip -n "${ns}" mptcp endpoint delete id "${id}" ${addr:+"${addr}"} + else + ip netns exec "${ns}" ./pm_nl_ctl del "${id}" "${addr}" + fi +} + +mptcp_lib_pm_nl_flush_endpoint() { + local ns=${1} + + if mptcp_lib_is_ip_mptcp; then + ip -n "${ns}" mptcp endpoint flush + else + ip netns exec "${ns}" ./pm_nl_ctl flush + fi +} + +mptcp_lib_pm_nl_show_endpoints() { + local ns=${1} + + if mptcp_lib_is_ip_mptcp; then + ip -n "${ns}" mptcp endpoint show + else + ip netns exec "${ns}" ./pm_nl_ctl dump + fi +} + +mptcp_lib_pm_nl_change_endpoint() { + local ns=${1} + local id=${2} + local flags=${3} + + if mptcp_lib_is_ip_mptcp; then + # shellcheck disable=SC2086 # blanks in flags, no double quote + ip -n "${ns}" mptcp endpoint change id "${id}" ${flags//","/" "} + else + ip netns exec "${ns}" ./pm_nl_ctl set id "${id}" flags "${flags}" + fi +} diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh index e2d70c1878..68899a303a 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh @@ -22,6 +22,28 @@ ns1="" ns2="" ns_sbox="" +usage() { + echo "Usage: $0 [ -i ] [ -h ]" + echo -e "\t-i: use 'ip mptcp' instead of 'pm_nl_ctl'" + echo -e "\t-h: help" +} + +while getopts "hi" option;do + case "$option" in + "h") + usage "$0" + exit ${KSFT_PASS} + ;; + "i") + mptcp_lib_set_ip_mptcp + ;; + "?") + usage "$0" + exit ${KSFT_FAIL} + ;; + esac +done + add_mark_rules() { local ns=$1 @@ -58,15 +80,15 @@ init() # let $ns2 reach any $ns1 address from any interface ip -net "$ns2" route add default via 10.0.$i.1 dev ns2eth$i metric 10$i - ip netns exec $ns1 ./pm_nl_ctl add 10.0.$i.1 flags signal - ip netns exec 
$ns1 ./pm_nl_ctl add dead:beef:$i::1 flags signal + mptcp_lib_pm_nl_add_endpoint "${ns1}" "10.0.${i}.1" flags signal + mptcp_lib_pm_nl_add_endpoint "${ns1}" "dead:beef:${i}::1" flags signal - ip netns exec $ns2 ./pm_nl_ctl add 10.0.$i.2 flags signal - ip netns exec $ns2 ./pm_nl_ctl add dead:beef:$i::2 flags signal + mptcp_lib_pm_nl_add_endpoint "${ns2}" "10.0.${i}.2" flags signal + mptcp_lib_pm_nl_add_endpoint "${ns2}" "dead:beef:${i}::2" flags signal done - ip netns exec $ns1 ./pm_nl_ctl limits 8 8 - ip netns exec $ns2 ./pm_nl_ctl limits 8 8 + mptcp_lib_pm_nl_set_limits "${ns1}" 8 8 + mptcp_lib_pm_nl_set_limits "${ns2}" 8 8 add_mark_rules $ns1 1 add_mark_rules $ns2 2 diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index 6ab8c5d363..2757378b1b 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -1,28 +1,28 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -# Double quotes to prevent globbing and word splitting is recommended in new -# code but we accept it, especially because there were too many before having -# address all other issues detected by shellcheck. -#shellcheck disable=SC2086 - . 
"$(dirname "${0}")/mptcp_lib.sh" ret=0 usage() { - echo "Usage: $0 [ -h ]" + echo "Usage: $0 [ -i ] [ -h ]" + echo -e "\t-i: use 'ip mptcp' instead of 'pm_nl_ctl'" + echo -e "\t-h: help" } -optstring=h +optstring=hi while getopts "$optstring" option;do case "$option" in "h") - usage $0 + usage "$0" exit ${KSFT_PASS} ;; + "i") + mptcp_lib_set_ip_mptcp + ;; "?") - usage $0 + usage "$0" exit ${KSFT_FAIL} ;; esac @@ -35,7 +35,7 @@ err=$(mktemp) #shellcheck disable=SC2317 cleanup() { - rm -f $err + rm -f "${err}" mptcp_lib_ns_exit "${ns1}" } @@ -46,6 +46,76 @@ trap cleanup EXIT mptcp_lib_ns_init ns1 +format_limits() { + local accept="${1}" + local subflows="${2}" + + if mptcp_lib_is_ip_mptcp; then + # with a space at the end + printf "add_addr_accepted %d subflows %d \n" "${accept}" "${subflows}" + else + printf "accept %d\nsubflows %d\n" "${accept}" "${subflows}" + fi +} + +get_limits() { + if mptcp_lib_is_ip_mptcp; then + ip -n "${ns1}" mptcp limits + else + ip netns exec "${ns1}" ./pm_nl_ctl limits + fi +} + +format_endpoints() { + mptcp_lib_pm_nl_format_endpoints "${@}" +} + +get_endpoint() { + # shellcheck disable=SC2317 # invoked indirectly + mptcp_lib_pm_nl_get_endpoint "${ns1}" "${@}" +} + +change_address() { + local addr=${1} + local flags=${2} + + if mptcp_lib_is_ip_mptcp; then + ip -n "${ns1}" mptcp endpoint change "${addr}" "${flags}" + else + ip netns exec "${ns1}" ./pm_nl_ctl set "${addr}" flags "${flags}" + fi +} + +set_limits() +{ + mptcp_lib_pm_nl_set_limits "${ns1}" "${@}" +} + +add_endpoint() +{ + mptcp_lib_pm_nl_add_endpoint "${ns1}" "${@}" +} + +del_endpoint() +{ + mptcp_lib_pm_nl_del_endpoint "${ns1}" "${@}" +} + +flush_endpoint() +{ + mptcp_lib_pm_nl_flush_endpoint "${ns1}" +} + +show_endpoints() +{ + mptcp_lib_pm_nl_show_endpoints "${ns1}" +} + +change_endpoint() +{ + mptcp_lib_pm_nl_change_endpoint "${ns1}" "${@}" +} + check() { local cmd="$1" @@ -67,125 +137,126 @@ check() fi } -check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "defaults addr 
list" +check "show_endpoints" "" "defaults addr list" -default_limits="$(ip netns exec $ns1 ./pm_nl_ctl limits)" +default_limits="$(get_limits)" if mptcp_lib_expect_all_features; then - check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0 -subflows 2" "defaults limits" + check "get_limits" "$(format_limits 0 2)" "defaults limits" fi -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.2 flags subflow dev lo -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 flags signal,backup -check "ip netns exec $ns1 ./pm_nl_ctl get 1" "id 1 flags 10.0.1.1" "simple add/get addr" +add_endpoint 10.0.1.1 +add_endpoint 10.0.1.2 flags subflow dev lo +add_endpoint 10.0.1.3 flags signal,backup +check "get_endpoint 1" "$(format_endpoints "1,10.0.1.1")" "simple add/get addr" -check "ip netns exec $ns1 ./pm_nl_ctl dump" \ -"id 1 flags 10.0.1.1 -id 2 flags subflow dev lo 10.0.1.2 -id 3 flags signal,backup 10.0.1.3" "dump addrs" +check "show_endpoints" \ + "$(format_endpoints "1,10.0.1.1" \ + "2,10.0.1.2,subflow,lo" \ + "3,10.0.1.3,signal backup")" "dump addrs" -ip netns exec $ns1 ./pm_nl_ctl del 2 -check "ip netns exec $ns1 ./pm_nl_ctl get 2" "" "simple del addr" -check "ip netns exec $ns1 ./pm_nl_ctl dump" \ -"id 1 flags 10.0.1.1 -id 3 flags signal,backup 10.0.1.3" "dump addrs after del" +del_endpoint 2 +check "get_endpoint 2" "" "simple del addr" +check "show_endpoints" \ + "$(format_endpoints "1,10.0.1.1" \ + "3,10.0.1.3,signal backup")" "dump addrs after del" -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 2>/dev/null -check "ip netns exec $ns1 ./pm_nl_ctl get 4" "" "duplicate addr" +add_endpoint 10.0.1.3 2>/dev/null +check "get_endpoint 4" "" "duplicate addr" -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.4 flags signal -check "ip netns exec $ns1 ./pm_nl_ctl get 4" "id 4 flags signal 10.0.1.4" "id addr increment" +add_endpoint 10.0.1.4 flags signal +check "get_endpoint 4" "$(format_endpoints "4,10.0.1.4,signal")" "id addr increment" for i in $(seq 5 9); do - 
ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.$i flags signal >/dev/null 2>&1 + add_endpoint "10.0.1.${i}" flags signal >/dev/null 2>&1 done -check "ip netns exec $ns1 ./pm_nl_ctl get 9" "id 9 flags signal 10.0.1.9" "hard addr limit" -check "ip netns exec $ns1 ./pm_nl_ctl get 10" "" "above hard addr limit" +check "get_endpoint 9" "$(format_endpoints "9,10.0.1.9,signal")" "hard addr limit" +check "get_endpoint 10" "" "above hard addr limit" -ip netns exec $ns1 ./pm_nl_ctl del 9 +del_endpoint 9 for i in $(seq 10 255); do - ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.9 id $i - ip netns exec $ns1 ./pm_nl_ctl del $i + add_endpoint 10.0.0.9 id "${i}" + del_endpoint "${i}" done -check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags 10.0.1.1 -id 3 flags signal,backup 10.0.1.3 -id 4 flags signal 10.0.1.4 -id 5 flags signal 10.0.1.5 -id 6 flags signal 10.0.1.6 -id 7 flags signal 10.0.1.7 -id 8 flags signal 10.0.1.8" "id limit" - -ip netns exec $ns1 ./pm_nl_ctl flush -check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "flush addrs" - -ip netns exec $ns1 ./pm_nl_ctl limits 9 1 2>/dev/null -check "ip netns exec $ns1 ./pm_nl_ctl limits" "$default_limits" "rcv addrs above hard limit" - -ip netns exec $ns1 ./pm_nl_ctl limits 1 9 2>/dev/null -check "ip netns exec $ns1 ./pm_nl_ctl limits" "$default_limits" "subflows above hard limit" - -ip netns exec $ns1 ./pm_nl_ctl limits 8 8 -check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 8 -subflows 8" "set limits" - -ip netns exec $ns1 ./pm_nl_ctl flush -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.2 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 id 100 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.4 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.5 id 254 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.6 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.7 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.8 -check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags 10.0.1.1 -id 2 flags 10.0.1.2 -id 3 flags 10.0.1.7 -id 4 flags 
10.0.1.8 -id 100 flags 10.0.1.3 -id 101 flags 10.0.1.4 -id 254 flags 10.0.1.5 -id 255 flags 10.0.1.6" "set ids" - -ip netns exec $ns1 ./pm_nl_ctl flush -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.1 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.2 id 254 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.3 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.4 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.5 id 253 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.6 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.7 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.8 -check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags 10.0.0.1 -id 2 flags 10.0.0.4 -id 3 flags 10.0.0.6 -id 4 flags 10.0.0.7 -id 5 flags 10.0.0.8 -id 253 flags 10.0.0.5 -id 254 flags 10.0.0.2 -id 255 flags 10.0.0.3" "wrap-around ids" - -ip netns exec $ns1 ./pm_nl_ctl flush -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1 flags subflow -ip netns exec $ns1 ./pm_nl_ctl set 10.0.1.1 flags backup -check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \ -subflow,backup 10.0.1.1" "set flags (backup)" -ip netns exec $ns1 ./pm_nl_ctl set 10.0.1.1 flags nobackup -check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \ -subflow 10.0.1.1" " (nobackup)" +check "show_endpoints" \ + "$(format_endpoints "1,10.0.1.1" \ + "3,10.0.1.3,signal backup" \ + "4,10.0.1.4,signal" \ + "5,10.0.1.5,signal" \ + "6,10.0.1.6,signal" \ + "7,10.0.1.7,signal" \ + "8,10.0.1.8,signal")" "id limit" + +flush_endpoint +check "show_endpoints" "" "flush addrs" + +set_limits 9 1 2>/dev/null +check "get_limits" "${default_limits}" "rcv addrs above hard limit" + +set_limits 1 9 2>/dev/null +check "get_limits" "${default_limits}" "subflows above hard limit" + +set_limits 8 8 +check "get_limits" "$(format_limits 8 8)" "set limits" + +flush_endpoint +add_endpoint 10.0.1.1 +add_endpoint 10.0.1.2 +add_endpoint 10.0.1.3 id 100 +add_endpoint 10.0.1.4 +add_endpoint 10.0.1.5 id 254 +add_endpoint 10.0.1.6 +add_endpoint 10.0.1.7 +add_endpoint 10.0.1.8 +check "show_endpoints" \ + "$(format_endpoints 
"1,10.0.1.1" \ + "2,10.0.1.2" \ + "3,10.0.1.7" \ + "4,10.0.1.8" \ + "100,10.0.1.3" \ + "101,10.0.1.4" \ + "254,10.0.1.5" \ + "255,10.0.1.6")" "set ids" + +flush_endpoint +add_endpoint 10.0.0.1 +add_endpoint 10.0.0.2 id 254 +add_endpoint 10.0.0.3 +add_endpoint 10.0.0.4 +add_endpoint 10.0.0.5 id 253 +add_endpoint 10.0.0.6 +add_endpoint 10.0.0.7 +add_endpoint 10.0.0.8 +check "show_endpoints" \ + "$(format_endpoints "1,10.0.0.1" \ + "2,10.0.0.4" \ + "3,10.0.0.6" \ + "4,10.0.0.7" \ + "5,10.0.0.8" \ + "253,10.0.0.5" \ + "254,10.0.0.2" \ + "255,10.0.0.3")" "wrap-around ids" + +flush_endpoint +add_endpoint 10.0.1.1 flags subflow +change_address 10.0.1.1 backup +check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow backup")" \ + "set flags (backup)" +change_address 10.0.1.1 nobackup +check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow")" \ + " (nobackup)" # fullmesh support has been added later -ip netns exec $ns1 ./pm_nl_ctl set id 1 flags fullmesh 2>/dev/null -if ip netns exec $ns1 ./pm_nl_ctl dump | grep -q "fullmesh" || +change_endpoint 1 fullmesh 2>/dev/null +if show_endpoints | grep -q "fullmesh" || mptcp_lib_expect_all_features; then - check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \ -subflow,fullmesh 10.0.1.1" " (fullmesh)" - ip netns exec $ns1 ./pm_nl_ctl set id 1 flags nofullmesh - check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \ -subflow 10.0.1.1" " (nofullmesh)" - ip netns exec $ns1 ./pm_nl_ctl set id 1 flags backup,fullmesh - check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \ -subflow,backup,fullmesh 10.0.1.1" " (backup,fullmesh)" + check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow fullmesh")" \ + " (fullmesh)" + change_endpoint 1 nofullmesh + check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow")" \ + " (nofullmesh)" + change_endpoint 1 backup,fullmesh + check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow backup fullmesh")" \ + " (backup,fullmesh)" else for st in fullmesh nofullmesh 
backup,fullmesh; do st=" (${st})" diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index 7322e1e4e5..f74e1c3c12 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -27,10 +27,11 @@ capout="" size=0 usage() { - echo "Usage: $0 [ -b ] [ -c ] [ -d ]" + echo "Usage: $0 [ -b ] [ -c ] [ -d ] [ -i]" echo -e "\t-b: bail out after first error, otherwise runs al testcases" echo -e "\t-c: capture packets for each test using tcpdump (default: no capture)" echo -e "\t-d: debug this script" + echo -e "\t-i: use 'ip mptcp' instead of 'pm_nl_ctl'" } # This function is used in the cleanup trap @@ -45,7 +46,7 @@ cleanup() } mptcp_lib_check_mptcp -mptcp_lib_check_tools ip +mptcp_lib_check_tools ip tc # "$ns1" ns2 ns3 # ns1eth1 ns2eth1 ns2eth3 ns3eth1 @@ -85,8 +86,8 @@ setup() ip -net "$ns1" route add default via 10.0.2.2 metric 101 ip -net "$ns1" route add default via dead:beef:2::2 metric 101 - ip netns exec "$ns1" ./pm_nl_ctl limits 1 1 - ip netns exec "$ns1" ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags subflow + mptcp_lib_pm_nl_set_limits "${ns1}" 1 1 + mptcp_lib_pm_nl_add_endpoint "${ns1}" 10.0.2.1 dev ns1eth2 flags subflow ip -net "$ns2" addr add 10.0.1.2/24 dev ns2eth1 ip -net "$ns2" addr add dead:beef:1::2/64 dev ns2eth1 nodad @@ -108,7 +109,7 @@ setup() ip -net "$ns3" route add default via 10.0.3.2 ip -net "$ns3" route add default via dead:beef:3::2 - ip netns exec "$ns3" ./pm_nl_ctl limits 1 1 + mptcp_lib_pm_nl_set_limits "${ns3}" 1 1 # debug build can slow down measurably the test program # we use quite tight time limit on the run-time, to ensure @@ -259,7 +260,7 @@ run_test() fi } -while getopts "bcdh" option;do +while getopts "bcdhi" option;do case "$option" in "h") usage $0 @@ -274,6 +275,9 @@ while getopts "bcdh" option;do "d") set -x ;; + "i") + mptcp_lib_set_ip_mptcp + ;; "?") usage $0 exit ${KSFT_FAIL} diff --git 
a/tools/testing/selftests/net/nat6to4.bpf.c b/tools/testing/selftests/net/nat6to4.bpf.c new file mode 100644 index 0000000000..ac54c36b25 --- /dev/null +++ b/tools/testing/selftests/net/nat6to4.bpf.c @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This code is taken from the Android Open Source Project and the author + * (Maciej Żenczykowski) has gave permission to relicense it under the + * GPLv2. Therefore this program is free software; + * You can redistribute it and/or modify it under the terms of the GNU + * General Public License version 2 as published by the Free Software + * Foundation + + * The original headers, including the original license headers, are + * included below for completeness. + * + * Copyright (C) 2019 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include + +#include +#include + +#define IP_DF 0x4000 // Flag: "Don't Fragment" + +SEC("schedcls/ingress6/nat_6") +int sched_cls_ingress6_nat_6_prog(struct __sk_buff *skb) +{ + const int l2_header_size = sizeof(struct ethhdr); + void *data = (void *)(long)skb->data; + const void *data_end = (void *)(long)skb->data_end; + const struct ethhdr * const eth = data; // used iff is_ethernet + const struct ipv6hdr * const ip6 = (void *)(eth + 1); + + // Require ethernet dst mac address to be our unicast address. 
+ if (skb->pkt_type != PACKET_HOST) + return TC_ACT_OK; + + // Must be meta-ethernet IPv6 frame + if (skb->protocol != bpf_htons(ETH_P_IPV6)) + return TC_ACT_OK; + + // Must have (ethernet and) ipv6 header + if (data + l2_header_size + sizeof(*ip6) > data_end) + return TC_ACT_OK; + + // Ethertype - if present - must be IPv6 + if (eth->h_proto != bpf_htons(ETH_P_IPV6)) + return TC_ACT_OK; + + // IP version must be 6 + if (ip6->version != 6) + return TC_ACT_OK; + // Maximum IPv6 payload length that can be translated to IPv4 + if (bpf_ntohs(ip6->payload_len) > 0xFFFF - sizeof(struct iphdr)) + return TC_ACT_OK; + switch (ip6->nexthdr) { + case IPPROTO_TCP: // For TCP & UDP the checksum neutrality of the chosen IPv6 + case IPPROTO_UDP: // address means there is no need to update their checksums. + case IPPROTO_GRE: // We do not need to bother looking at GRE/ESP headers, + case IPPROTO_ESP: // since there is never a checksum to update. + break; + default: // do not know how to handle anything else + return TC_ACT_OK; + } + + struct ethhdr eth2; // used iff is_ethernet + + eth2 = *eth; // Copy over the ethernet header (src/dst mac) + eth2.h_proto = bpf_htons(ETH_P_IP); // But replace the ethertype + + struct iphdr ip = { + .version = 4, // u4 + .ihl = sizeof(struct iphdr) / sizeof(__u32), // u4 + .tos = (ip6->priority << 4) + (ip6->flow_lbl[0] >> 4), // u8 + .tot_len = bpf_htons(bpf_ntohs(ip6->payload_len) + sizeof(struct iphdr)), // u16 + .id = 0, // u16 + .frag_off = bpf_htons(IP_DF), // u16 + .ttl = ip6->hop_limit, // u8 + .protocol = ip6->nexthdr, // u8 + .check = 0, // u16 + .saddr = 0x0201a8c0, // u32 + .daddr = 0x0101a8c0, // u32 + }; + + // Calculate the IPv4 one's complement checksum of the IPv4 header. + __wsum sum4 = 0; + + for (int i = 0; i < sizeof(ip) / sizeof(__u16); ++i) + sum4 += ((__u16 *)&ip)[i]; + + // Note that sum4 is guaranteed to be non-zero by virtue of ip.version == 4 + sum4 = (sum4 & 0xFFFF) + (sum4 >> 16); // collapse u32 into range 1 .. 
0x1FFFE + sum4 = (sum4 & 0xFFFF) + (sum4 >> 16); // collapse any potential carry into u16 + ip.check = (__u16)~sum4; // sum4 cannot be zero, so this is never 0xFFFF + + // Calculate the *negative* IPv6 16-bit one's complement checksum of the IPv6 header. + __wsum sum6 = 0; + // We'll end up with a non-zero sum due to ip6->version == 6 (which has '0' bits) + for (int i = 0; i < sizeof(*ip6) / sizeof(__u16); ++i) + sum6 += ~((__u16 *)ip6)[i]; // note the bitwise negation + + // Note that there is no L4 checksum update: we are relying on the checksum neutrality + // of the ipv6 address chosen by netd's ClatdController. + + // Packet mutations begin - point of no return, but if this first modification fails + // the packet is probably still pristine, so let clatd handle it. + if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IP), 0)) + return TC_ACT_OK; + bpf_csum_update(skb, sum6); + + data = (void *)(long)skb->data; + data_end = (void *)(long)skb->data_end; + if (data + l2_header_size + sizeof(struct iphdr) > data_end) + return TC_ACT_SHOT; + + struct ethhdr *new_eth = data; + + // Copy over the updated ethernet header + *new_eth = eth2; + + // Copy over the new ipv4 header. 
+ *(struct iphdr *)(new_eth + 1) = ip; + return bpf_redirect(skb->ifindex, BPF_F_INGRESS); +} + +SEC("schedcls/egress4/snat4") +int sched_cls_egress4_snat4_prog(struct __sk_buff *skb) +{ + const int l2_header_size = sizeof(struct ethhdr); + void *data = (void *)(long)skb->data; + const void *data_end = (void *)(long)skb->data_end; + const struct ethhdr *const eth = data; // used iff is_ethernet + const struct iphdr *const ip4 = (void *)(eth + 1); + + // Must be meta-ethernet IPv4 frame + if (skb->protocol != bpf_htons(ETH_P_IP)) + return TC_ACT_OK; + + // Must have ipv4 header + if (data + l2_header_size + sizeof(struct ipv6hdr) > data_end) + return TC_ACT_OK; + + // Ethertype - if present - must be IPv4 + if (eth->h_proto != bpf_htons(ETH_P_IP)) + return TC_ACT_OK; + + // IP version must be 4 + if (ip4->version != 4) + return TC_ACT_OK; + + // We cannot handle IP options, just standard 20 byte == 5 dword minimal IPv4 header + if (ip4->ihl != 5) + return TC_ACT_OK; + + // Maximum IPv6 payload length that can be translated to IPv4 + if (bpf_htons(ip4->tot_len) > 0xFFFF - sizeof(struct ipv6hdr)) + return TC_ACT_OK; + + // Calculate the IPv4 one's complement checksum of the IPv4 header. + __wsum sum4 = 0; + + for (int i = 0; i < sizeof(*ip4) / sizeof(__u16); ++i) + sum4 += ((__u16 *)ip4)[i]; + + // Note that sum4 is guaranteed to be non-zero by virtue of ip4->version == 4 + sum4 = (sum4 & 0xFFFF) + (sum4 >> 16); // collapse u32 into range 1 .. 
0x1FFFE + sum4 = (sum4 & 0xFFFF) + (sum4 >> 16); // collapse any potential carry into u16 + // for a correct checksum we should get *a* zero, but sum4 must be positive, ie 0xFFFF + if (sum4 != 0xFFFF) + return TC_ACT_OK; + + // Minimum IPv4 total length is the size of the header + if (bpf_ntohs(ip4->tot_len) < sizeof(*ip4)) + return TC_ACT_OK; + + // We are incapable of dealing with IPv4 fragments + if (ip4->frag_off & ~bpf_htons(IP_DF)) + return TC_ACT_OK; + + switch (ip4->protocol) { + case IPPROTO_TCP: // For TCP & UDP the checksum neutrality of the chosen IPv6 + case IPPROTO_GRE: // address means there is no need to update their checksums. + case IPPROTO_ESP: // We do not need to bother looking at GRE/ESP headers, + break; // since there is never a checksum to update. + + case IPPROTO_UDP: // See above comment, but must also have UDP header... + if (data + sizeof(*ip4) + sizeof(struct udphdr) > data_end) + return TC_ACT_OK; + const struct udphdr *uh = (const struct udphdr *)(ip4 + 1); + // If IPv4/UDP checksum is 0 then fallback to clatd so it can calculate the + // checksum. Otherwise the network or more likely the NAT64 gateway might + // drop the packet because in most cases IPv6/UDP packets with a zero checksum + // are invalid. See RFC 6935. 
TODO: calculate checksum via bpf_csum_diff() + if (!uh->check) + return TC_ACT_OK; + break; + + default: // do not know how to handle anything else + return TC_ACT_OK; + } + struct ethhdr eth2; // used iff is_ethernet + + eth2 = *eth; // Copy over the ethernet header (src/dst mac) + eth2.h_proto = bpf_htons(ETH_P_IPV6); // But replace the ethertype + + struct ipv6hdr ip6 = { + .version = 6, // __u8:4 + .priority = ip4->tos >> 4, // __u8:4 + .flow_lbl = {(ip4->tos & 0xF) << 4, 0, 0}, // __u8[3] + .payload_len = bpf_htons(bpf_ntohs(ip4->tot_len) - 20), // __be16 + .nexthdr = ip4->protocol, // __u8 + .hop_limit = ip4->ttl, // __u8 + }; + ip6.saddr.in6_u.u6_addr32[0] = bpf_htonl(0x20010db8); + ip6.saddr.in6_u.u6_addr32[1] = 0; + ip6.saddr.in6_u.u6_addr32[2] = 0; + ip6.saddr.in6_u.u6_addr32[3] = bpf_htonl(1); + ip6.daddr.in6_u.u6_addr32[0] = bpf_htonl(0x20010db8); + ip6.daddr.in6_u.u6_addr32[1] = 0; + ip6.daddr.in6_u.u6_addr32[2] = 0; + ip6.daddr.in6_u.u6_addr32[3] = bpf_htonl(2); + + // Calculate the IPv6 16-bit one's complement checksum of the IPv6 header. + __wsum sum6 = 0; + // We'll end up with a non-zero sum due to ip6.version == 6 + for (int i = 0; i < sizeof(ip6) / sizeof(__u16); ++i) + sum6 += ((__u16 *)&ip6)[i]; + + // Packet mutations begin - point of no return, but if this first modification fails + // the packet is probably still pristine, so let clatd handle it. + if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IPV6), 0)) + return TC_ACT_OK; + + // This takes care of updating the skb->csum field for a CHECKSUM_COMPLETE packet. + // In such a case, skb->csum is a 16-bit one's complement sum of the entire payload, + // thus we need to subtract out the ipv4 header's sum, and add in the ipv6 header's sum. + // However, we've already verified the ipv4 checksum is correct and thus 0. + // Thus we only need to add the ipv6 header's sum. + // + // bpf_csum_update() always succeeds if the skb is CHECKSUM_COMPLETE and returns an error + // (-ENOTSUPP) if it isn't. 
So we just ignore the return code (see above for more details). + bpf_csum_update(skb, sum6); + + // bpf_skb_change_proto() invalidates all pointers - reload them. + data = (void *)(long)skb->data; + data_end = (void *)(long)skb->data_end; + + // I cannot think of any valid way for this error condition to trigger, however I do + // believe the explicit check is required to keep the in kernel ebpf verifier happy. + if (data + l2_header_size + sizeof(ip6) > data_end) + return TC_ACT_SHOT; + + struct ethhdr *new_eth = data; + + // Copy over the updated ethernet header + *new_eth = eth2; + // Copy over the new ipv4 header. + *(struct ipv6hdr *)(new_eth + 1) = ip6; + return TC_ACT_OK; +} + +char _license[] SEC("license") = ("GPL"); diff --git a/tools/testing/selftests/net/nat6to4.c b/tools/testing/selftests/net/nat6to4.c deleted file mode 100644 index ac54c36b25..0000000000 --- a/tools/testing/selftests/net/nat6to4.c +++ /dev/null @@ -1,285 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * This code is taken from the Android Open Source Project and the author - * (Maciej Żenczykowski) has gave permission to relicense it under the - * GPLv2. Therefore this program is free software; - * You can redistribute it and/or modify it under the terms of the GNU - * General Public License version 2 as published by the Free Software - * Foundation - - * The original headers, including the original license headers, are - * included below for completeness. - * - * Copyright (C) 2019 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include - -#include -#include - -#define IP_DF 0x4000 // Flag: "Don't Fragment" - -SEC("schedcls/ingress6/nat_6") -int sched_cls_ingress6_nat_6_prog(struct __sk_buff *skb) -{ - const int l2_header_size = sizeof(struct ethhdr); - void *data = (void *)(long)skb->data; - const void *data_end = (void *)(long)skb->data_end; - const struct ethhdr * const eth = data; // used iff is_ethernet - const struct ipv6hdr * const ip6 = (void *)(eth + 1); - - // Require ethernet dst mac address to be our unicast address. - if (skb->pkt_type != PACKET_HOST) - return TC_ACT_OK; - - // Must be meta-ethernet IPv6 frame - if (skb->protocol != bpf_htons(ETH_P_IPV6)) - return TC_ACT_OK; - - // Must have (ethernet and) ipv6 header - if (data + l2_header_size + sizeof(*ip6) > data_end) - return TC_ACT_OK; - - // Ethertype - if present - must be IPv6 - if (eth->h_proto != bpf_htons(ETH_P_IPV6)) - return TC_ACT_OK; - - // IP version must be 6 - if (ip6->version != 6) - return TC_ACT_OK; - // Maximum IPv6 payload length that can be translated to IPv4 - if (bpf_ntohs(ip6->payload_len) > 0xFFFF - sizeof(struct iphdr)) - return TC_ACT_OK; - switch (ip6->nexthdr) { - case IPPROTO_TCP: // For TCP & UDP the checksum neutrality of the chosen IPv6 - case IPPROTO_UDP: // address means there is no need to update their checksums. - case IPPROTO_GRE: // We do not need to bother looking at GRE/ESP headers, - case IPPROTO_ESP: // since there is never a checksum to update. 
- break; - default: // do not know how to handle anything else - return TC_ACT_OK; - } - - struct ethhdr eth2; // used iff is_ethernet - - eth2 = *eth; // Copy over the ethernet header (src/dst mac) - eth2.h_proto = bpf_htons(ETH_P_IP); // But replace the ethertype - - struct iphdr ip = { - .version = 4, // u4 - .ihl = sizeof(struct iphdr) / sizeof(__u32), // u4 - .tos = (ip6->priority << 4) + (ip6->flow_lbl[0] >> 4), // u8 - .tot_len = bpf_htons(bpf_ntohs(ip6->payload_len) + sizeof(struct iphdr)), // u16 - .id = 0, // u16 - .frag_off = bpf_htons(IP_DF), // u16 - .ttl = ip6->hop_limit, // u8 - .protocol = ip6->nexthdr, // u8 - .check = 0, // u16 - .saddr = 0x0201a8c0, // u32 - .daddr = 0x0101a8c0, // u32 - }; - - // Calculate the IPv4 one's complement checksum of the IPv4 header. - __wsum sum4 = 0; - - for (int i = 0; i < sizeof(ip) / sizeof(__u16); ++i) - sum4 += ((__u16 *)&ip)[i]; - - // Note that sum4 is guaranteed to be non-zero by virtue of ip.version == 4 - sum4 = (sum4 & 0xFFFF) + (sum4 >> 16); // collapse u32 into range 1 .. 0x1FFFE - sum4 = (sum4 & 0xFFFF) + (sum4 >> 16); // collapse any potential carry into u16 - ip.check = (__u16)~sum4; // sum4 cannot be zero, so this is never 0xFFFF - - // Calculate the *negative* IPv6 16-bit one's complement checksum of the IPv6 header. - __wsum sum6 = 0; - // We'll end up with a non-zero sum due to ip6->version == 6 (which has '0' bits) - for (int i = 0; i < sizeof(*ip6) / sizeof(__u16); ++i) - sum6 += ~((__u16 *)ip6)[i]; // note the bitwise negation - - // Note that there is no L4 checksum update: we are relying on the checksum neutrality - // of the ipv6 address chosen by netd's ClatdController. - - // Packet mutations begin - point of no return, but if this first modification fails - // the packet is probably still pristine, so let clatd handle it. 
- if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IP), 0)) - return TC_ACT_OK; - bpf_csum_update(skb, sum6); - - data = (void *)(long)skb->data; - data_end = (void *)(long)skb->data_end; - if (data + l2_header_size + sizeof(struct iphdr) > data_end) - return TC_ACT_SHOT; - - struct ethhdr *new_eth = data; - - // Copy over the updated ethernet header - *new_eth = eth2; - - // Copy over the new ipv4 header. - *(struct iphdr *)(new_eth + 1) = ip; - return bpf_redirect(skb->ifindex, BPF_F_INGRESS); -} - -SEC("schedcls/egress4/snat4") -int sched_cls_egress4_snat4_prog(struct __sk_buff *skb) -{ - const int l2_header_size = sizeof(struct ethhdr); - void *data = (void *)(long)skb->data; - const void *data_end = (void *)(long)skb->data_end; - const struct ethhdr *const eth = data; // used iff is_ethernet - const struct iphdr *const ip4 = (void *)(eth + 1); - - // Must be meta-ethernet IPv4 frame - if (skb->protocol != bpf_htons(ETH_P_IP)) - return TC_ACT_OK; - - // Must have ipv4 header - if (data + l2_header_size + sizeof(struct ipv6hdr) > data_end) - return TC_ACT_OK; - - // Ethertype - if present - must be IPv4 - if (eth->h_proto != bpf_htons(ETH_P_IP)) - return TC_ACT_OK; - - // IP version must be 4 - if (ip4->version != 4) - return TC_ACT_OK; - - // We cannot handle IP options, just standard 20 byte == 5 dword minimal IPv4 header - if (ip4->ihl != 5) - return TC_ACT_OK; - - // Maximum IPv6 payload length that can be translated to IPv4 - if (bpf_htons(ip4->tot_len) > 0xFFFF - sizeof(struct ipv6hdr)) - return TC_ACT_OK; - - // Calculate the IPv4 one's complement checksum of the IPv4 header. - __wsum sum4 = 0; - - for (int i = 0; i < sizeof(*ip4) / sizeof(__u16); ++i) - sum4 += ((__u16 *)ip4)[i]; - - // Note that sum4 is guaranteed to be non-zero by virtue of ip4->version == 4 - sum4 = (sum4 & 0xFFFF) + (sum4 >> 16); // collapse u32 into range 1 .. 
0x1FFFE - sum4 = (sum4 & 0xFFFF) + (sum4 >> 16); // collapse any potential carry into u16 - // for a correct checksum we should get *a* zero, but sum4 must be positive, ie 0xFFFF - if (sum4 != 0xFFFF) - return TC_ACT_OK; - - // Minimum IPv4 total length is the size of the header - if (bpf_ntohs(ip4->tot_len) < sizeof(*ip4)) - return TC_ACT_OK; - - // We are incapable of dealing with IPv4 fragments - if (ip4->frag_off & ~bpf_htons(IP_DF)) - return TC_ACT_OK; - - switch (ip4->protocol) { - case IPPROTO_TCP: // For TCP & UDP the checksum neutrality of the chosen IPv6 - case IPPROTO_GRE: // address means there is no need to update their checksums. - case IPPROTO_ESP: // We do not need to bother looking at GRE/ESP headers, - break; // since there is never a checksum to update. - - case IPPROTO_UDP: // See above comment, but must also have UDP header... - if (data + sizeof(*ip4) + sizeof(struct udphdr) > data_end) - return TC_ACT_OK; - const struct udphdr *uh = (const struct udphdr *)(ip4 + 1); - // If IPv4/UDP checksum is 0 then fallback to clatd so it can calculate the - // checksum. Otherwise the network or more likely the NAT64 gateway might - // drop the packet because in most cases IPv6/UDP packets with a zero checksum - // are invalid. See RFC 6935. 
TODO: calculate checksum via bpf_csum_diff() - if (!uh->check) - return TC_ACT_OK; - break; - - default: // do not know how to handle anything else - return TC_ACT_OK; - } - struct ethhdr eth2; // used iff is_ethernet - - eth2 = *eth; // Copy over the ethernet header (src/dst mac) - eth2.h_proto = bpf_htons(ETH_P_IPV6); // But replace the ethertype - - struct ipv6hdr ip6 = { - .version = 6, // __u8:4 - .priority = ip4->tos >> 4, // __u8:4 - .flow_lbl = {(ip4->tos & 0xF) << 4, 0, 0}, // __u8[3] - .payload_len = bpf_htons(bpf_ntohs(ip4->tot_len) - 20), // __be16 - .nexthdr = ip4->protocol, // __u8 - .hop_limit = ip4->ttl, // __u8 - }; - ip6.saddr.in6_u.u6_addr32[0] = bpf_htonl(0x20010db8); - ip6.saddr.in6_u.u6_addr32[1] = 0; - ip6.saddr.in6_u.u6_addr32[2] = 0; - ip6.saddr.in6_u.u6_addr32[3] = bpf_htonl(1); - ip6.daddr.in6_u.u6_addr32[0] = bpf_htonl(0x20010db8); - ip6.daddr.in6_u.u6_addr32[1] = 0; - ip6.daddr.in6_u.u6_addr32[2] = 0; - ip6.daddr.in6_u.u6_addr32[3] = bpf_htonl(2); - - // Calculate the IPv6 16-bit one's complement checksum of the IPv6 header. - __wsum sum6 = 0; - // We'll end up with a non-zero sum due to ip6.version == 6 - for (int i = 0; i < sizeof(ip6) / sizeof(__u16); ++i) - sum6 += ((__u16 *)&ip6)[i]; - - // Packet mutations begin - point of no return, but if this first modification fails - // the packet is probably still pristine, so let clatd handle it. - if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IPV6), 0)) - return TC_ACT_OK; - - // This takes care of updating the skb->csum field for a CHECKSUM_COMPLETE packet. - // In such a case, skb->csum is a 16-bit one's complement sum of the entire payload, - // thus we need to subtract out the ipv4 header's sum, and add in the ipv6 header's sum. - // However, we've already verified the ipv4 checksum is correct and thus 0. - // Thus we only need to add the ipv6 header's sum. - // - // bpf_csum_update() always succeeds if the skb is CHECKSUM_COMPLETE and returns an error - // (-ENOTSUPP) if it isn't. 
So we just ignore the return code (see above for more details). - bpf_csum_update(skb, sum6); - - // bpf_skb_change_proto() invalidates all pointers - reload them. - data = (void *)(long)skb->data; - data_end = (void *)(long)skb->data_end; - - // I cannot think of any valid way for this error condition to trigger, however I do - // believe the explicit check is required to keep the in kernel ebpf verifier happy. - if (data + l2_header_size + sizeof(ip6) > data_end) - return TC_ACT_SHOT; - - struct ethhdr *new_eth = data; - - // Copy over the updated ethernet header - *new_eth = eth2; - // Copy over the new ipv4 header. - *(struct ipv6hdr *)(new_eth + 1) = ip6; - return TC_ACT_OK; -} - -char _license[] SEC("license") = ("GPL"); diff --git a/tools/testing/selftests/net/netfilter/.gitignore b/tools/testing/selftests/net/netfilter/.gitignore new file mode 100644 index 0000000000..0a64d6d0e2 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/.gitignore @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0-only +audit_logread +connect_close +conntrack_dump_flush +sctp_collision +nf_queue diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile new file mode 100644 index 0000000000..47945b2b3f --- /dev/null +++ b/tools/testing/selftests/net/netfilter/Makefile @@ -0,0 +1,52 @@ +# SPDX-License-Identifier: GPL-2.0 + +top_srcdir = ../../../../.. 
+ +HOSTPKG_CONFIG := pkg-config +MNL_CFLAGS := $(shell $(HOSTPKG_CONFIG) --cflags libmnl 2>/dev/null) +MNL_LDLIBS := $(shell $(HOSTPKG_CONFIG) --libs libmnl 2>/dev/null || echo -lmnl) + +TEST_PROGS := br_netfilter.sh bridge_brouter.sh +TEST_PROGS += conntrack_icmp_related.sh +TEST_PROGS += conntrack_ipip_mtu.sh +TEST_PROGS += conntrack_tcp_unreplied.sh +TEST_PROGS += conntrack_sctp_collision.sh +TEST_PROGS += conntrack_vrf.sh +TEST_PROGS += ipvs.sh +TEST_PROGS += nf_conntrack_packetdrill.sh +TEST_PROGS += nf_nat_edemux.sh +TEST_PROGS += nft_audit.sh +TEST_PROGS += nft_concat_range.sh +TEST_PROGS += nft_conntrack_helper.sh +TEST_PROGS += nft_fib.sh +TEST_PROGS += nft_flowtable.sh +TEST_PROGS += nft_meta.sh +TEST_PROGS += nft_nat.sh +TEST_PROGS += nft_nat_zones.sh +TEST_PROGS += nft_queue.sh +TEST_PROGS += nft_synproxy.sh +TEST_PROGS += nft_zones_many.sh +TEST_PROGS += rpath.sh +TEST_PROGS += xt_string.sh + +TEST_PROGS_EXTENDED = nft_concat_range_perf.sh + +TEST_GEN_PROGS = conntrack_dump_flush + +TEST_GEN_FILES = audit_logread +TEST_GEN_FILES += connect_close nf_queue +TEST_GEN_FILES += sctp_collision + +include ../../lib.mk + +$(OUTPUT)/nf_queue: CFLAGS += $(MNL_CFLAGS) +$(OUTPUT)/nf_queue: LDLIBS += $(MNL_LDLIBS) + +$(OUTPUT)/conntrack_dump_flush: CFLAGS += $(MNL_CFLAGS) +$(OUTPUT)/conntrack_dump_flush: LDLIBS += $(MNL_LDLIBS) + +TEST_FILES := lib.sh +TEST_FILES += packetdrill + +TEST_INCLUDES := \ + ../lib.sh diff --git a/tools/testing/selftests/net/netfilter/audit_logread.c b/tools/testing/selftests/net/netfilter/audit_logread.c new file mode 100644 index 0000000000..a0a880fc2d --- /dev/null +++ b/tools/testing/selftests/net/netfilter/audit_logread.c @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int fd; + +#define MAX_AUDIT_MESSAGE_LENGTH 8970 +struct audit_message { + struct nlmsghdr nlh; + union { + 
struct audit_status s; + char data[MAX_AUDIT_MESSAGE_LENGTH]; + } u; +}; + +int audit_recv(int fd, struct audit_message *rep) +{ + struct sockaddr_nl addr; + socklen_t addrlen = sizeof(addr); + int ret; + + do { + ret = recvfrom(fd, rep, sizeof(*rep), 0, + (struct sockaddr *)&addr, &addrlen); + } while (ret < 0 && errno == EINTR); + + if (ret < 0 || + addrlen != sizeof(addr) || + addr.nl_pid != 0 || + rep->nlh.nlmsg_type == NLMSG_ERROR) /* short-cut for now */ + return -1; + + return ret; +} + +int audit_send(int fd, uint16_t type, uint32_t key, uint32_t val) +{ + static int seq = 0; + struct audit_message msg = { + .nlh = { + .nlmsg_len = NLMSG_SPACE(sizeof(msg.u.s)), + .nlmsg_type = type, + .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK, + .nlmsg_seq = ++seq, + }, + .u.s = { + .mask = key, + .enabled = key == AUDIT_STATUS_ENABLED ? val : 0, + .pid = key == AUDIT_STATUS_PID ? val : 0, + } + }; + struct sockaddr_nl addr = { + .nl_family = AF_NETLINK, + }; + int ret; + + do { + ret = sendto(fd, &msg, msg.nlh.nlmsg_len, 0, + (struct sockaddr *)&addr, sizeof(addr)); + } while (ret < 0 && errno == EINTR); + + if (ret != (int)msg.nlh.nlmsg_len) + return -1; + return 0; +} + +int audit_set(int fd, uint32_t key, uint32_t val) +{ + struct audit_message rep = { 0 }; + int ret; + + ret = audit_send(fd, AUDIT_SET, key, val); + if (ret) + return ret; + + ret = audit_recv(fd, &rep); + if (ret < 0) + return ret; + return 0; +} + +int readlog(int fd) +{ + struct audit_message rep = { 0 }; + int ret = audit_recv(fd, &rep); + const char *sep = ""; + char *k, *v; + + if (ret < 0) + return ret; + + if (rep.nlh.nlmsg_type != AUDIT_NETFILTER_CFG) + return 0; + + /* skip the initial "audit(...): " part */ + strtok(rep.u.data, " "); + + while ((k = strtok(NULL, "="))) { + v = strtok(NULL, " "); + + /* these vary and/or are uninteresting, ignore */ + if (!strcmp(k, "pid") || + !strcmp(k, "comm") || + !strcmp(k, "subj")) + continue; + + /* strip the varying sequence number */ + if (!strcmp(k, 
"table")) + *strchrnul(v, ':') = '\0'; + + printf("%s%s=%s", sep, k, v); + sep = " "; + } + if (*sep) { + printf("\n"); + fflush(stdout); + } + return 0; +} + +void cleanup(int sig) +{ + audit_set(fd, AUDIT_STATUS_ENABLED, 0); + close(fd); + if (sig) + exit(0); +} + +int main(int argc, char **argv) +{ + struct sigaction act = { + .sa_handler = cleanup, + }; + + fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_AUDIT); + if (fd < 0) { + perror("Can't open netlink socket"); + return -1; + } + + if (sigaction(SIGTERM, &act, NULL) < 0 || + sigaction(SIGINT, &act, NULL) < 0) { + perror("Can't set signal handler"); + close(fd); + return -1; + } + + audit_set(fd, AUDIT_STATUS_ENABLED, 1); + audit_set(fd, AUDIT_STATUS_PID, getpid()); + + while (1) + readlog(fd); +} diff --git a/tools/testing/selftests/net/netfilter/br_netfilter.sh b/tools/testing/selftests/net/netfilter/br_netfilter.sh new file mode 100755 index 0000000000..c28379a965 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/br_netfilter.sh @@ -0,0 +1,171 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test for legacy br_netfilter module combined with connection tracking, +# a combination that doesn't really work. +# Multicast/broadcast packets race for hash table insertion. + +# eth0 br0 eth0 +# setup is: ns1 <->,ns0 <-> ns3 +# ns2 <-' `'-> ns4 + +source lib.sh + +checktool "nft --version" "run test without nft tool" + +cleanup() { + cleanup_all_ns +} + +trap cleanup EXIT + +setup_ns ns0 ns1 ns2 ns3 ns4 + +ret=0 + +do_ping() +{ + fromns="$1" + dstip="$2" + + if ! ip netns exec "$fromns" ping -c 1 -q "$dstip" > /dev/null; then + echo "ERROR: ping from $fromns to $dstip" + ip netns exec "$ns0" nft list ruleset + ret=1 + fi +} + +bcast_ping() +{ + fromns="$1" + dstip="$2" + + local packets=500 + + [ "$KSFT_MACHINE_SLOW" = yes ] && packets=100 + + for i in $(seq 1 $packets); do + if ! 
ip netns exec "$fromns" ping -q -f -b -c 1 -q "$dstip" > /dev/null 2>&1; then + echo "ERROR: ping -b from $fromns to $dstip" + ip netns exec "$ns0" nft list ruleset + ret=1 + break + fi + done +} + +ip netns exec "$ns0" sysctl -q net.ipv4.conf.all.rp_filter=0 +ip netns exec "$ns0" sysctl -q net.ipv4.conf.default.rp_filter=0 + +if ! ip link add veth1 netns "$ns0" type veth peer name eth0 netns "$ns1"; then + echo "SKIP: Can't create veth device" + exit $ksft_skip +fi + +ip link add veth2 netns "$ns0" type veth peer name eth0 netns "$ns2" +ip link add veth3 netns "$ns0" type veth peer name eth0 netns "$ns3" +ip link add veth4 netns "$ns0" type veth peer name eth0 netns "$ns4" + +for i in $(seq 1 4); do + ip -net "$ns0" link set "veth$i" up +done + +if ! ip -net "$ns0" link add br0 type bridge stp_state 0 forward_delay 0 nf_call_iptables 1 nf_call_ip6tables 1 nf_call_arptables 1; then + echo "SKIP: Can't create bridge br0" + exit $ksft_skip +fi + +# make veth0,1,2 part of bridge. +for i in $(seq 1 3); do + ip -net "$ns0" link set "veth$i" master br0 +done + +# add a macvlan on top of the bridge. +MACVLAN_ADDR=ba:f3:13:37:42:23 +ip -net "$ns0" link add link br0 name macvlan0 type macvlan mode private +ip -net "$ns0" link set macvlan0 address ${MACVLAN_ADDR} +ip -net "$ns0" link set macvlan0 up +ip -net "$ns0" addr add 10.23.0.1/24 dev macvlan0 + +# add a macvlan on top of veth4. +MACVLAN_ADDR=ba:f3:13:37:42:24 +ip -net "$ns0" link add link veth4 name macvlan4 type macvlan mode passthru +ip -net "$ns0" link set macvlan4 address ${MACVLAN_ADDR} +ip -net "$ns0" link set macvlan4 up + +# make the macvlan part of the bridge. +# veth4 is not a bridge port, only the macvlan on top of it. +ip -net "$ns0" link set macvlan4 master br0 + +ip -net "$ns0" link set br0 up +ip -net "$ns0" addr add 10.0.0.1/24 dev br0 + +modprobe -q br_netfilter +if ! 
ip netns exec "$ns0" sysctl -q net.bridge.bridge-nf-call-iptables=1; then + echo "SKIP: bridge netfilter not available" + ret=$ksft_skip +fi + +# for testing, so namespaces will reply to ping -b probes. +ip netns exec "$ns0" sysctl -q net.ipv4.icmp_echo_ignore_broadcasts=0 + +# enable conntrack in ns0 and drop broadcast packets in forward to +# avoid them from getting confirmed in the postrouting hook before +# the cloned skb is passed up the stack. +ip netns exec "$ns0" nft -f - < nsbr <-> ns2 + +source lib.sh + +if ! ebtables -V > /dev/null 2>&1;then + echo "SKIP: Could not run test without ebtables" + exit $ksft_skip +fi + +cleanup() { + cleanup_all_ns +} + +trap cleanup EXIT + +setup_ns nsbr ns1 ns2 + +ip netns exec "$nsbr" sysctl -q net.ipv4.conf.default.rp_filter=0 +ip netns exec "$nsbr" sysctl -q net.ipv4.conf.all.rp_filter=0 +if ! ip link add veth0 netns "$nsbr" type veth peer name eth0 netns "$ns1"; then + echo "SKIP: Can't create veth device" + exit $ksft_skip +fi +ip link add veth1 netns "$nsbr" type veth peer name eth0 netns "$ns2" + +if ! ip -net "$nsbr" link add br0 type bridge; then + echo "SKIP: Can't create bridge br0" + exit $ksft_skip +fi + +ip -net "$nsbr" link set veth0 up +ip -net "$nsbr" link set veth1 up + +ip -net "$nsbr" link set veth0 master br0 +ip -net "$nsbr" link set veth1 master br0 +ip -net "$nsbr" link set br0 up +ip -net "$nsbr" addr add 10.0.0.1/24 dev br0 + +# place both in same subnet, ${ns1} and ${ns2} connected via ${nsbr}:br0 +ip -net "$ns1" link set eth0 up +ip -net "$ns2" link set eth0 up +ip -net "$ns1" addr add 10.0.0.11/24 dev eth0 +ip -net "$ns2" addr add 10.0.0.12/24 dev eth0 + +test_ebtables_broute() +{ + # redirect is needed so the dstmac is rewritten to the bridge itself, + # ip stack won't process OTHERHOST (foreign unicast mac) packets. + if ! 
ip netns exec "$nsbr" ebtables -t broute -A BROUTING -p ipv4 --ip-protocol icmp -j redirect --redirect-target=DROP; then + echo "SKIP: Could not add ebtables broute redirect rule" + return $ksft_skip + fi + + ip netns exec "$nsbr" sysctl -q net.ipv4.conf.veth0.forwarding=0 + + # ping net${ns1}, expected to not work (ip forwarding is off) + if ip netns exec "$ns1" ping -q -c 1 10.0.0.12 -W 0.5 > /dev/null 2>&1; then + echo "ERROR: ping works, should have failed" 1>&2 + return 1 + fi + + # enable forwarding on both interfaces. + # neither needs an ip address, but at least the bridge needs + # an ip address in same network segment as ${ns1} and ${ns2} (${nsbr} + # needs to be able to determine route for to-be-forwarded packet). + ip netns exec "$nsbr" sysctl -q net.ipv4.conf.veth0.forwarding=1 + ip netns exec "$nsbr" sysctl -q net.ipv4.conf.veth1.forwarding=1 + + if ! ip netns exec "$ns1" ping -q -c 1 10.0.0.12 > /dev/null; then + echo "ERROR: ping did not work, but it should (broute+forward)" 1>&2 + return 1 + fi + + echo "PASS: ${ns1}/${ns2} connectivity with active broute rule" + ip netns exec "$nsbr" ebtables -t broute -F + + # ping net${ns1}, expected to work (frames are bridged) + if ! ip netns exec "$ns1" ping -q -c 1 10.0.0.12 > /dev/null; then + echo "ERROR: ping did not work, but it should (bridged)" 1>&2 + return 1 + fi + + ip netns exec "$nsbr" ebtables -t filter -A FORWARD -p ipv4 --ip-protocol icmp -j DROP + + # ping net${ns1}, expected to not work (DROP in bridge forward) + if ip netns exec "$ns1" ping -q -c 1 10.0.0.12 -W 0.5 > /dev/null 2>&1; then + echo "ERROR: ping works, should have failed (icmp forward drop)" 1>&2 + return 1 + fi + + # re-activate brouter + ip netns exec "$nsbr" ebtables -t broute -A BROUTING -p ipv4 --ip-protocol icmp -j redirect --redirect-target=DROP + + if ! 
ip netns exec "$ns2" ping -q -c 1 10.0.0.11 > /dev/null; then + echo "ERROR: ping did not work, but it should (broute+forward 2)" 1>&2 + return 1 + fi + + echo "PASS: ${ns1}/${ns2} connectivity with active broute rule and bridge forward drop" + return 0 +} + +# test basic connectivity +if ! ip netns exec "$ns1" ping -c 1 -q 10.0.0.12 > /dev/null; then + echo "ERROR: Could not reach ${ns2} from ${ns1}" 1>&2 + exit 1 +fi + +if ! ip netns exec "$ns2" ping -c 1 -q 10.0.0.11 > /dev/null; then + echo "ERROR: Could not reach ${ns1} from ${ns2}" 1>&2 + exit 1 +fi + +test_ebtables_broute +exit $? diff --git a/tools/testing/selftests/net/netfilter/config b/tools/testing/selftests/net/netfilter/config new file mode 100644 index 0000000000..63ef80ef47 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/config @@ -0,0 +1,89 @@ +CONFIG_AUDIT=y +CONFIG_BPF_SYSCALL=y +CONFIG_BRIDGE=m +CONFIG_BRIDGE_EBT_BROUTE=m +CONFIG_BRIDGE_EBT_IP=m +CONFIG_BRIDGE_EBT_REDIRECT=m +CONFIG_BRIDGE_EBT_T_FILTER=m +CONFIG_BRIDGE_NETFILTER=m +CONFIG_BRIDGE_NF_EBTABLES=m +CONFIG_CGROUP_BPF=y +CONFIG_DUMMY=m +CONFIG_INET_ESP=m +CONFIG_IP_NF_MATCH_RPFILTER=m +CONFIG_IP6_NF_MATCH_RPFILTER=m +CONFIG_IP_NF_IPTABLES=m +CONFIG_IP6_NF_IPTABLES=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP6_NF_FILTER=m +CONFIG_IP_NF_RAW=m +CONFIG_IP6_NF_RAW=m +CONFIG_IP_SCTP=m +CONFIG_IP_VS=m +CONFIG_IP_VS_PROTO_TCP=y +CONFIG_IP_VS_RR=m +CONFIG_IPV6=y +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_MACVLAN=m +CONFIG_NAMESPACES=y +CONFIG_NET_CLS_U32=m +CONFIG_NET_L3_MASTER_DEV=y +CONFIG_NET_NS=y +CONFIG_NET_SCH_NETEM=m +CONFIG_NET_SCH_HTB=m +CONFIG_NET_IPIP=m +CONFIG_NET_VRF=y +CONFIG_NETFILTER=y +CONFIG_NETFILTER_ADVANCED=y +CONFIG_NETFILTER_NETLINK=m +CONFIG_NETFILTER_NETLINK_QUEUE=m +CONFIG_NETFILTER_SYNPROXY=m +CONFIG_NETFILTER_XTABLES=m +CONFIG_NETFILTER_XT_NAT=m +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m +CONFIG_NETFILTER_XT_MATCH_STATE=m +CONFIG_NETFILTER_XT_MATCH_STRING=m +CONFIG_NETFILTER_XT_TARGET_REDIRECT=m +CONFIG_NF_CONNTRACK=m 
+CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CONNTRACK_FTP=m +CONFIG_NF_CONNTRACK_MARK=y +CONFIG_NF_CONNTRACK_ZONES=y +CONFIG_NF_CT_NETLINK=m +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_FLOW_TABLE=m +CONFIG_NF_LOG_IPV4=m +CONFIG_NF_LOG_IPV6=m +CONFIG_NF_NAT=m +CONFIG_NF_NAT_REDIRECT=y +CONFIG_NF_NAT_MASQUERADE=y +CONFIG_NF_TABLES=m +CONFIG_NF_TABLES_BRIDGE=m +CONFIG_NF_TABLES_INET=y +CONFIG_NF_TABLES_IPV4=y +CONFIG_NF_TABLES_IPV6=y +CONFIG_NF_TABLES_NETDEV=y +CONFIG_NF_FLOW_TABLE_INET=m +CONFIG_NFT_BRIDGE_META=m +CONFIG_NFT_COMPAT=m +CONFIG_NFT_CT=m +CONFIG_NFT_FIB=m +CONFIG_NFT_FIB_INET=m +CONFIG_NFT_FIB_IPV4=m +CONFIG_NFT_FIB_IPV6=m +CONFIG_NFT_FLOW_OFFLOAD=m +CONFIG_NFT_LIMIT=m +CONFIG_NFT_LOG=m +CONFIG_NFT_MASQ=m +CONFIG_NFT_NAT=m +CONFIG_NFT_NUMGEN=m +CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m +CONFIG_NFT_REDIR=m +CONFIG_NFT_SYNPROXY=m +CONFIG_VETH=m +CONFIG_VLAN_8021Q=m +CONFIG_XFRM_USER=m +CONFIG_XFRM_STATISTICS=y +CONFIG_NET_PKTGEN=m +CONFIG_TUN=m diff --git a/tools/testing/selftests/net/netfilter/connect_close.c b/tools/testing/selftests/net/netfilter/connect_close.c new file mode 100644 index 0000000000..1c3b0add54 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/connect_close.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include + +#include +#include + +#define PORT 12345 +#define RUNTIME 10 + +static struct { + unsigned int timeout; + unsigned int port; +} opts = { + .timeout = RUNTIME, + .port = PORT, +}; + +static void handler(int sig) +{ + _exit(sig == SIGALRM ? 
0 : 1); +} + +static void set_timeout(void) +{ + struct sigaction action = { + .sa_handler = handler, + }; + + sigaction(SIGALRM, &action, NULL); + + alarm(opts.timeout); +} + +static void do_connect(const struct sockaddr_in *dst) +{ + int s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + + if (s >= 0) + fcntl(s, F_SETFL, O_NONBLOCK); + + connect(s, (struct sockaddr *)dst, sizeof(*dst)); + close(s); +} + +static void do_accept(const struct sockaddr_in *src) +{ + int c, one = 1, s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + + if (s < 0) + return; + + setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)); + setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)); + + bind(s, (struct sockaddr *)src, sizeof(*src)); + + listen(s, 16); + + c = accept(s, NULL, NULL); + if (c >= 0) + close(c); + + close(s); +} + +static int accept_loop(void) +{ + struct sockaddr_in src = { + .sin_family = AF_INET, + .sin_port = htons(opts.port), + }; + + inet_pton(AF_INET, "127.0.0.1", &src.sin_addr); + + set_timeout(); + + for (;;) + do_accept(&src); + + return 1; +} + +static int connect_loop(void) +{ + struct sockaddr_in dst = { + .sin_family = AF_INET, + .sin_port = htons(opts.port), + }; + + inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr); + + set_timeout(); + + for (;;) + do_connect(&dst); + + return 1; +} + +static void parse_opts(int argc, char **argv) +{ + int c; + + while ((c = getopt(argc, argv, "t:p:")) != -1) { + switch (c) { + case 't': + opts.timeout = atoi(optarg); + break; + case 'p': + opts.port = atoi(optarg); + break; + } + } +} + +int main(int argc, char *argv[]) +{ + pid_t p; + + parse_opts(argc, argv); + + p = fork(); + if (p < 0) + return 111; + + if (p > 0) + return accept_loop(); + + return connect_loop(); +} diff --git a/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c b/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c new file mode 100644 index 0000000000..bd9317bf5a --- /dev/null +++ 
b/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c @@ -0,0 +1,469 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE + +#include +#include +#include + +#include +#include +#include +#include +#include "../../kselftest_harness.h" + +#define TEST_ZONE_ID 123 +#define NF_CT_DEFAULT_ZONE_ID 0 + +static int reply_counter; + +static int build_cta_tuple_v4(struct nlmsghdr *nlh, int type, + uint32_t src_ip, uint32_t dst_ip, + uint16_t src_port, uint16_t dst_port) +{ + struct nlattr *nest, *nest_ip, *nest_proto; + + nest = mnl_attr_nest_start(nlh, type); + if (!nest) + return -1; + + nest_ip = mnl_attr_nest_start(nlh, CTA_TUPLE_IP); + if (!nest_ip) + return -1; + mnl_attr_put_u32(nlh, CTA_IP_V4_SRC, src_ip); + mnl_attr_put_u32(nlh, CTA_IP_V4_DST, dst_ip); + mnl_attr_nest_end(nlh, nest_ip); + + nest_proto = mnl_attr_nest_start(nlh, CTA_TUPLE_PROTO); + if (!nest_proto) + return -1; + mnl_attr_put_u8(nlh, CTA_PROTO_NUM, 6); + mnl_attr_put_u16(nlh, CTA_PROTO_SRC_PORT, htons(src_port)); + mnl_attr_put_u16(nlh, CTA_PROTO_DST_PORT, htons(dst_port)); + mnl_attr_nest_end(nlh, nest_proto); + + mnl_attr_nest_end(nlh, nest); +} + +static int build_cta_tuple_v6(struct nlmsghdr *nlh, int type, + struct in6_addr src_ip, struct in6_addr dst_ip, + uint16_t src_port, uint16_t dst_port) +{ + struct nlattr *nest, *nest_ip, *nest_proto; + + nest = mnl_attr_nest_start(nlh, type); + if (!nest) + return -1; + + nest_ip = mnl_attr_nest_start(nlh, CTA_TUPLE_IP); + if (!nest_ip) + return -1; + mnl_attr_put(nlh, CTA_IP_V6_SRC, sizeof(struct in6_addr), &src_ip); + mnl_attr_put(nlh, CTA_IP_V6_DST, sizeof(struct in6_addr), &dst_ip); + mnl_attr_nest_end(nlh, nest_ip); + + nest_proto = mnl_attr_nest_start(nlh, CTA_TUPLE_PROTO); + if (!nest_proto) + return -1; + mnl_attr_put_u8(nlh, CTA_PROTO_NUM, 6); + mnl_attr_put_u16(nlh, CTA_PROTO_SRC_PORT, htons(src_port)); + mnl_attr_put_u16(nlh, CTA_PROTO_DST_PORT, htons(dst_port)); + mnl_attr_nest_end(nlh, nest_proto); + + 
mnl_attr_nest_end(nlh, nest); +} + +static int build_cta_proto(struct nlmsghdr *nlh) +{ + struct nlattr *nest, *nest_proto; + + nest = mnl_attr_nest_start(nlh, CTA_PROTOINFO); + if (!nest) + return -1; + + nest_proto = mnl_attr_nest_start(nlh, CTA_PROTOINFO_TCP); + if (!nest_proto) + return -1; + mnl_attr_put_u8(nlh, CTA_PROTOINFO_TCP_STATE, TCP_CONNTRACK_ESTABLISHED); + mnl_attr_put_u16(nlh, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL, 0x0a0a); + mnl_attr_put_u16(nlh, CTA_PROTOINFO_TCP_FLAGS_REPLY, 0x0a0a); + mnl_attr_nest_end(nlh, nest_proto); + + mnl_attr_nest_end(nlh, nest); +} + +static int conntrack_data_insert(struct mnl_socket *sock, struct nlmsghdr *nlh, + uint16_t zone) +{ + char buf[MNL_SOCKET_BUFFER_SIZE]; + struct nlmsghdr *rplnlh; + unsigned int portid; + int err, ret; + + portid = mnl_socket_get_portid(sock); + + ret = build_cta_proto(nlh); + if (ret < 0) { + perror("build_cta_proto"); + return -1; + } + mnl_attr_put_u32(nlh, CTA_TIMEOUT, htonl(20000)); + mnl_attr_put_u16(nlh, CTA_ZONE, htons(zone)); + + if (mnl_socket_sendto(sock, nlh, nlh->nlmsg_len) < 0) { + perror("mnl_socket_sendto"); + return -1; + } + + ret = mnl_socket_recvfrom(sock, buf, MNL_SOCKET_BUFFER_SIZE); + if (ret < 0) { + perror("mnl_socket_recvfrom"); + return ret; + } + + ret = mnl_cb_run(buf, ret, nlh->nlmsg_seq, portid, NULL, NULL); + if (ret < 0) { + if (errno == EEXIST) { + /* The entries are probably still there from a previous + * run. 
So we are good + */ + return 0; + } + perror("mnl_cb_run"); + return ret; + } + + return 0; +} + +static int conntrack_data_generate_v4(struct mnl_socket *sock, uint32_t src_ip, + uint32_t dst_ip, uint16_t zone) +{ + char buf[MNL_SOCKET_BUFFER_SIZE]; + struct nlmsghdr *nlh; + struct nfgenmsg *nfh; + int ret; + + nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_NEW; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | + NLM_F_ACK | NLM_F_EXCL; + nlh->nlmsg_seq = time(NULL); + + nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(struct nfgenmsg)); + nfh->nfgen_family = AF_INET; + nfh->version = NFNETLINK_V0; + nfh->res_id = 0; + + ret = build_cta_tuple_v4(nlh, CTA_TUPLE_ORIG, src_ip, dst_ip, 12345, 443); + if (ret < 0) { + perror("build_cta_tuple_v4"); + return ret; + } + ret = build_cta_tuple_v4(nlh, CTA_TUPLE_REPLY, dst_ip, src_ip, 443, 12345); + if (ret < 0) { + perror("build_cta_tuple_v4"); + return ret; + } + return conntrack_data_insert(sock, nlh, zone); +} + +static int conntrack_data_generate_v6(struct mnl_socket *sock, + struct in6_addr src_ip, + struct in6_addr dst_ip, + uint16_t zone) +{ + char buf[MNL_SOCKET_BUFFER_SIZE]; + struct nlmsghdr *nlh; + struct nfgenmsg *nfh; + int ret; + + nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_NEW; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | + NLM_F_ACK | NLM_F_EXCL; + nlh->nlmsg_seq = time(NULL); + + nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(struct nfgenmsg)); + nfh->nfgen_family = AF_INET6; + nfh->version = NFNETLINK_V0; + nfh->res_id = 0; + + ret = build_cta_tuple_v6(nlh, CTA_TUPLE_ORIG, src_ip, dst_ip, + 12345, 443); + if (ret < 0) { + perror("build_cta_tuple_v6"); + return ret; + } + ret = build_cta_tuple_v6(nlh, CTA_TUPLE_REPLY, dst_ip, src_ip, + 12345, 443); + if (ret < 0) { + perror("build_cta_tuple_v6"); + return ret; + } + return conntrack_data_insert(sock, nlh, zone); +} + +static int count_entries(const 
struct nlmsghdr *nlh, void *data) +{ + reply_counter++; +} + +static int conntracK_count_zone(struct mnl_socket *sock, uint16_t zone) +{ + char buf[MNL_SOCKET_BUFFER_SIZE]; + struct nlmsghdr *nlh, *rplnlh; + struct nfgenmsg *nfh; + struct nlattr *nest; + unsigned int portid; + int err, ret; + + portid = mnl_socket_get_portid(sock); + + nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_GET; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + nlh->nlmsg_seq = time(NULL); + + nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(struct nfgenmsg)); + nfh->nfgen_family = AF_UNSPEC; + nfh->version = NFNETLINK_V0; + nfh->res_id = 0; + + mnl_attr_put_u16(nlh, CTA_ZONE, htons(zone)); + + ret = mnl_socket_sendto(sock, nlh, nlh->nlmsg_len); + if (ret < 0) { + perror("mnl_socket_sendto"); + return ret; + } + + reply_counter = 0; + ret = mnl_socket_recvfrom(sock, buf, MNL_SOCKET_BUFFER_SIZE); + while (ret > 0) { + ret = mnl_cb_run(buf, ret, nlh->nlmsg_seq, portid, + count_entries, NULL); + if (ret <= MNL_CB_STOP) + break; + + ret = mnl_socket_recvfrom(sock, buf, MNL_SOCKET_BUFFER_SIZE); + } + if (ret < 0) { + perror("mnl_socket_recvfrom"); + return ret; + } + + return reply_counter; +} + +static int conntrack_flush_zone(struct mnl_socket *sock, uint16_t zone) +{ + char buf[MNL_SOCKET_BUFFER_SIZE]; + struct nlmsghdr *nlh, *rplnlh; + struct nfgenmsg *nfh; + struct nlattr *nest; + unsigned int portid; + int err, ret; + + portid = mnl_socket_get_portid(sock); + + nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_DELETE; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + nlh->nlmsg_seq = time(NULL); + + nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(struct nfgenmsg)); + nfh->nfgen_family = AF_UNSPEC; + nfh->version = NFNETLINK_V0; + nfh->res_id = 0; + + mnl_attr_put_u16(nlh, CTA_ZONE, htons(zone)); + + ret = mnl_socket_sendto(sock, nlh, nlh->nlmsg_len); + if (ret < 0) { + perror("mnl_socket_sendto"); 
+ return ret; + } + + ret = mnl_socket_recvfrom(sock, buf, MNL_SOCKET_BUFFER_SIZE); + if (ret < 0) { + perror("mnl_socket_recvfrom"); + return ret; + } + + ret = mnl_cb_run(buf, ret, nlh->nlmsg_seq, portid, NULL, NULL); + if (ret < 0) { + perror("mnl_cb_run"); + return ret; + } + + return 0; +} + +FIXTURE(conntrack_dump_flush) +{ + struct mnl_socket *sock; +}; + +FIXTURE_SETUP(conntrack_dump_flush) +{ + struct in6_addr src, dst; + int ret; + + self->sock = mnl_socket_open(NETLINK_NETFILTER); + if (!self->sock) { + perror("mnl_socket_open"); + SKIP(return, "cannot open netlink_netfilter socket"); + } + + ret = mnl_socket_bind(self->sock, 0, MNL_SOCKET_AUTOPID); + EXPECT_EQ(ret, 0); + + ret = conntracK_count_zone(self->sock, TEST_ZONE_ID); + if (ret < 0 && errno == EPERM) + SKIP(return, "Needs to be run as root"); + else if (ret < 0 && errno == EOPNOTSUPP) + SKIP(return, "Kernel does not seem to support conntrack zones"); + + ret = conntrack_data_generate_v4(self->sock, 0xf0f0f0f0, 0xf1f1f1f1, + TEST_ZONE_ID); + EXPECT_EQ(ret, 0); + ret = conntrack_data_generate_v4(self->sock, 0xf2f2f2f2, 0xf3f3f3f3, + TEST_ZONE_ID + 1); + EXPECT_EQ(ret, 0); + ret = conntrack_data_generate_v4(self->sock, 0xf4f4f4f4, 0xf5f5f5f5, + TEST_ZONE_ID + 2); + EXPECT_EQ(ret, 0); + ret = conntrack_data_generate_v4(self->sock, 0xf6f6f6f6, 0xf7f7f7f7, + NF_CT_DEFAULT_ZONE_ID); + EXPECT_EQ(ret, 0); + + src = (struct in6_addr) {{ + .__u6_addr32 = { + 0xb80d0120, + 0x00000000, + 0x00000000, + 0x01000000 + } + }}; + dst = (struct in6_addr) {{ + .__u6_addr32 = { + 0xb80d0120, + 0x00000000, + 0x00000000, + 0x02000000 + } + }}; + ret = conntrack_data_generate_v6(self->sock, src, dst, + TEST_ZONE_ID); + EXPECT_EQ(ret, 0); + src = (struct in6_addr) {{ + .__u6_addr32 = { + 0xb80d0120, + 0x00000000, + 0x00000000, + 0x03000000 + } + }}; + dst = (struct in6_addr) {{ + .__u6_addr32 = { + 0xb80d0120, + 0x00000000, + 0x00000000, + 0x04000000 + } + }}; + ret = conntrack_data_generate_v6(self->sock, src, dst, + 
TEST_ZONE_ID + 1); + EXPECT_EQ(ret, 0); + src = (struct in6_addr) {{ + .__u6_addr32 = { + 0xb80d0120, + 0x00000000, + 0x00000000, + 0x05000000 + } + }}; + dst = (struct in6_addr) {{ + .__u6_addr32 = { + 0xb80d0120, + 0x00000000, + 0x00000000, + 0x06000000 + } + }}; + ret = conntrack_data_generate_v6(self->sock, src, dst, + TEST_ZONE_ID + 2); + EXPECT_EQ(ret, 0); + + src = (struct in6_addr) {{ + .__u6_addr32 = { + 0xb80d0120, + 0x00000000, + 0x00000000, + 0x07000000 + } + }}; + dst = (struct in6_addr) {{ + .__u6_addr32 = { + 0xb80d0120, + 0x00000000, + 0x00000000, + 0x08000000 + } + }}; + ret = conntrack_data_generate_v6(self->sock, src, dst, + NF_CT_DEFAULT_ZONE_ID); + EXPECT_EQ(ret, 0); + + ret = conntracK_count_zone(self->sock, TEST_ZONE_ID); + EXPECT_GE(ret, 2); + if (ret > 2) + SKIP(return, "kernel does not support filtering by zone"); +} + +FIXTURE_TEARDOWN(conntrack_dump_flush) +{ +} + +TEST_F(conntrack_dump_flush, test_dump_by_zone) +{ + int ret; + + ret = conntracK_count_zone(self->sock, TEST_ZONE_ID); + EXPECT_EQ(ret, 2); +} + +TEST_F(conntrack_dump_flush, test_flush_by_zone) +{ + int ret; + + ret = conntrack_flush_zone(self->sock, TEST_ZONE_ID); + EXPECT_EQ(ret, 0); + ret = conntracK_count_zone(self->sock, TEST_ZONE_ID); + EXPECT_EQ(ret, 0); + ret = conntracK_count_zone(self->sock, TEST_ZONE_ID + 1); + EXPECT_EQ(ret, 2); + ret = conntracK_count_zone(self->sock, TEST_ZONE_ID + 2); + EXPECT_EQ(ret, 2); + ret = conntracK_count_zone(self->sock, NF_CT_DEFAULT_ZONE_ID); + EXPECT_EQ(ret, 2); +} + +TEST_F(conntrack_dump_flush, test_flush_by_zone_default) +{ + int ret; + + ret = conntrack_flush_zone(self->sock, NF_CT_DEFAULT_ZONE_ID); + EXPECT_EQ(ret, 0); + ret = conntracK_count_zone(self->sock, TEST_ZONE_ID); + EXPECT_EQ(ret, 2); + ret = conntracK_count_zone(self->sock, TEST_ZONE_ID + 1); + EXPECT_EQ(ret, 2); + ret = conntracK_count_zone(self->sock, TEST_ZONE_ID + 2); + EXPECT_EQ(ret, 2); + ret = conntracK_count_zone(self->sock, NF_CT_DEFAULT_ZONE_ID); + 
EXPECT_EQ(ret, 0); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/netfilter/conntrack_icmp_related.sh b/tools/testing/selftests/net/netfilter/conntrack_icmp_related.sh new file mode 100755 index 0000000000..c63d840ead --- /dev/null +++ b/tools/testing/selftests/net/netfilter/conntrack_icmp_related.sh @@ -0,0 +1,278 @@ +#!/bin/bash +# +# check that ICMP df-needed/pkttoobig icmp are set are set as related +# state +# +# Setup is: +# +# nsclient1 -> nsrouter1 -> nsrouter2 -> nsclient2 +# MTU 1500, except for nsrouter2 <-> nsclient2 link (1280). +# ping nsclient2 from nsclient1, checking that conntrack did set RELATED +# 'fragmentation needed' icmp packet. +# +# In addition, nsrouter1 will perform IP masquerading, i.e. also +# check the icmp errors are propagated to the correct host as per +# nat of "established" icmp-echo "connection". + +source lib.sh + +if ! nft --version > /dev/null 2>&1;then + echo "SKIP: Could not run test without nft tool" + exit $ksft_skip +fi + +cleanup() { + cleanup_all_ns +} + +trap cleanup EXIT + +setup_ns nsclient1 nsclient2 nsrouter1 nsrouter2 + +ret=0 + +add_addr() +{ + ns=$1 + dev=$2 + i=$3 + + ip -net "$ns" link set "$dev" up + ip -net "$ns" addr add "192.168.$i.2/24" dev "$dev" + ip -net "$ns" addr add "dead:$i::2/64" dev "$dev" nodad +} + +check_counter() +{ + ns=$1 + name=$2 + expect=$3 + local lret=0 + + if ! ip netns exec "$ns" nft list counter inet filter "$name" | grep -q "$expect"; then + echo "ERROR: counter $name in $ns has unexpected value (expected $expect)" 1>&2 + ip netns exec "$ns" nft list counter inet filter "$name" 1>&2 + lret=1 + fi + + return $lret +} + +check_unknown() +{ + expect="packets 0 bytes 0" + for n in ${nsclient1} ${nsclient2} ${nsrouter1} ${nsrouter2}; do + if ! 
check_counter "$n" "unknown" "$expect"; then + return 1 + fi + done + + return 0 +} + +DEV=veth0 +ip link add "$DEV" netns "$nsclient1" type veth peer name eth1 netns "$nsrouter1" +ip link add "$DEV" netns "$nsclient2" type veth peer name eth1 netns "$nsrouter2" +ip link add "$DEV" netns "$nsrouter1" type veth peer name eth2 netns "$nsrouter2" + +add_addr "$nsclient1" $DEV 1 +add_addr "$nsclient2" $DEV 2 + +ip -net "$nsrouter1" link set eth1 up +ip -net "$nsrouter1" link set $DEV up + +ip -net "$nsrouter2" link set eth1 mtu 1280 up +ip -net "$nsrouter2" link set eth2 up + +ip -net "$nsclient1" route add default via 192.168.1.1 +ip -net "$nsclient1" -6 route add default via dead:1::1 + +ip -net "$nsclient2" route add default via 192.168.2.1 +ip -net "$nsclient2" route add default via dead:2::1 +ip -net "$nsclient2" link set veth0 mtu 1280 + +ip -net "$nsrouter1" addr add 192.168.1.1/24 dev eth1 +ip -net "$nsrouter1" addr add 192.168.3.1/24 dev veth0 +ip -net "$nsrouter1" addr add dead:1::1/64 dev eth1 nodad +ip -net "$nsrouter1" addr add dead:3::1/64 dev veth0 nodad +ip -net "$nsrouter1" route add default via 192.168.3.10 +ip -net "$nsrouter1" -6 route add default via dead:3::10 + +ip -net "$nsrouter2" addr add 192.168.2.1/24 dev eth1 +ip -net "$nsrouter2" addr add 192.168.3.10/24 dev eth2 +ip -net "$nsrouter2" addr add dead:2::1/64 dev eth1 nodad +ip -net "$nsrouter2" addr add dead:3::10/64 dev eth2 nodad +ip -net "$nsrouter2" route add default via 192.168.3.1 +ip -net "$nsrouter2" route add default via dead:3::1 + +for i in 4 6; do + ip netns exec "$nsrouter1" sysctl -q net.ipv$i.conf.all.forwarding=1 + ip netns exec "$nsrouter2" sysctl -q net.ipv$i.conf.all.forwarding=1 +done + +for netns in "$nsrouter1" "$nsrouter2"; do +ip netns exec "$netns" nft -f - </dev/null; then + echo "ERROR: netns ip routing/connectivity broken" 1>&2 + exit 1 +fi +if ! 
ip netns exec "$nsclient1" ping -c 1 -s 1000 -q dead:2::2 >/dev/null; then + echo "ERROR: netns ipv6 routing/connectivity broken" 1>&2 + exit 1 +fi + +if ! check_unknown; then + ret=1 +fi + +expect="packets 0 bytes 0" +for netns in "$nsrouter1" "$nsrouter2" "$nsclient1";do + if ! check_counter "$netns" "related" "$expect"; then + ret=1 + fi +done + +expect="packets 2 bytes 2076" +if ! check_counter "$nsclient2" "new" "$expect"; then + ret=1 +fi + +if ip netns exec "$nsclient1" ping -W 0.5 -q -c 1 -s 1300 -M "do" 192.168.2.2 > /dev/null; then + echo "ERROR: ping should have failed with PMTU too big error" 1>&2 + ret=1 +fi + +# nsrouter2 should have generated the icmp error, so +# related counter should be 0 (its in forward). +expect="packets 0 bytes 0" +if ! check_counter "$nsrouter2" "related" "$expect"; then + ret=1 +fi + +# but nsrouter1 should have seen it, same for nsclient1. +expect="packets 1 bytes 576" +for netns in ${nsrouter1} ${nsclient1};do + if ! check_counter "$netns" "related" "$expect"; then + ret=1 + fi +done + +if ip netns exec "${nsclient1}" ping6 -W 0.5 -c 1 -s 1300 dead:2::2 > /dev/null; then + echo "ERROR: ping6 should have failed with PMTU too big error" 1>&2 + ret=1 +fi + +expect="packets 2 bytes 1856" +for netns in "${nsrouter1}" "${nsclient1}";do + if ! check_counter "$netns" "related" "$expect"; then + ret=1 + fi +done + +if [ $ret -eq 0 ];then + echo "PASS: icmp mtu error had RELATED state" +else + echo "ERROR: icmp error RELATED state test has failed" +fi + +# add 'bad' route, expect icmp REDIRECT to be generated +ip netns exec "${nsclient1}" ip route add 192.168.1.42 via 192.168.1.1 +ip netns exec "${nsclient1}" ip route add dead:1::42 via dead:1::1 + +ip netns exec "$nsclient1" ping -W 1 -q -i 0.5 -c 2 192.168.1.42 > /dev/null + +expect="packets 1 bytes 112" +if ! check_counter "$nsclient1" "redir4" "$expect"; then + ret=1 +fi + +ip netns exec "$nsclient1" ping -W 1 -c 1 dead:1::42 > /dev/null +expect="packets 1 bytes 192" +if ! 
check_counter "$nsclient1" "redir6" "$expect"; then + ret=1 +fi + +if [ $ret -eq 0 ];then + echo "PASS: icmp redirects had RELATED state" +else + echo "ERROR: icmp redirect RELATED state test has failed" +fi + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh b/tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh new file mode 100755 index 0000000000..9832a5d019 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh @@ -0,0 +1,191 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source lib.sh + +# Conntrack needs to reassemble fragments in order to have complete +# packets for rule matching. Reassembly can lead to packet loss. + +# Consider the following setup: +# +--------+ +---------+ +--------+ +# |Router A|-------|Wanrouter|-------|Router B| +# | |.IPIP..| |..IPIP.| | +# +--------+ +---------+ +--------+ +# / mtu 1400 \ +# / \ +#+--------+ +--------+ +#|Client A| |Client B| +#| | | | +#+--------+ +--------+ + +# Router A and Router B use IPIP tunnel interfaces to tunnel traffic +# between Client A and Client B over WAN. Wanrouter has MTU 1400 set +# on its interfaces. 
+ +rx=$(mktemp) + +checktool "iptables --version" "run test without iptables" +checktool "socat -h" "run test without socat" + +setup_ns r_a r_b r_w c_a c_b + +cleanup() { + cleanup_all_ns + rm -f "$rx" +} + +trap cleanup EXIT + +listener_ready() +{ + ns="$1" + port="$2" + ss -N "$ns" -lnu -o "sport = :$port" | grep -q "$port" +} + +test_path() { + msg="$1" + + ip netns exec "$c_b" socat -t 3 - udp4-listen:5000,reuseaddr > "$rx" < /dev/null & + + busywait $BUSYWAIT_TIMEOUT listener_ready "$c_b" 5000 + + for i in 1 2 3; do + head -c1400 /dev/zero | tr "\000" "a" | \ + ip netns exec "$c_a" socat -t 1 -u STDIN UDP:192.168.20.2:5000 + done + + wait + + bytes=$(wc -c < "$rx") + + if [ "$bytes" -eq 1400 ];then + echo "OK: PMTU $msg connection tracking" + else + echo "FAIL: PMTU $msg connection tracking: got $bytes, expected 1400" + exit 1 + fi +} + +# Detailed setup for Router A +# --------------------------- +# Interfaces: +# eth0: 10.2.2.1/24 +# eth1: 192.168.10.1/24 +# ipip0: No IP address, local 10.2.2.1 remote 10.4.4.1 +# Routes: +# 192.168.20.0/24 dev ipip0 (192.168.20.0/24 is subnet of Client B) +# 10.4.4.1 via 10.2.2.254 (Router B via Wanrouter) +# No iptables rules at all. 
+ +ip link add veth0 netns "$r_a" type veth peer name veth0 netns "$r_w" +ip link add veth1 netns "$r_a" type veth peer name veth0 netns "$c_a" + +l_addr="10.2.2.1" +r_addr="10.4.4.1" +ip netns exec "$r_a" ip link add ipip0 type ipip local "$l_addr" remote "$r_addr" mode ipip || exit $ksft_skip + +for dev in lo veth0 veth1 ipip0; do + ip -net "$r_a" link set "$dev" up +done + +ip -net "$r_a" addr add 10.2.2.1/24 dev veth0 +ip -net "$r_a" addr add 192.168.10.1/24 dev veth1 + +ip -net "$r_a" route add 192.168.20.0/24 dev ipip0 +ip -net "$r_a" route add 10.4.4.0/24 via 10.2.2.254 + +ip netns exec "$r_a" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null + +# Detailed setup for Router B +# --------------------------- +# Interfaces: +# eth0: 10.4.4.1/24 +# eth1: 192.168.20.1/24 +# ipip0: No IP address, local 10.4.4.1 remote 10.2.2.1 +# Routes: +# 192.168.10.0/24 dev ipip0 (192.168.10.0/24 is subnet of Client A) +# 10.2.2.1 via 10.4.4.254 (Router A via Wanrouter) +# No iptables rules at all. 
+ +ip link add veth0 netns "$r_b" type veth peer name veth1 netns "$r_w" +ip link add veth1 netns "$r_b" type veth peer name veth0 netns "$c_b" + +l_addr="10.4.4.1" +r_addr="10.2.2.1" + +ip netns exec "$r_b" ip link add ipip0 type ipip local "${l_addr}" remote "${r_addr}" mode ipip || exit $ksft_skip + +for dev in veth0 veth1 ipip0; do + ip -net "$r_b" link set $dev up +done + +ip -net "$r_b" addr add 10.4.4.1/24 dev veth0 +ip -net "$r_b" addr add 192.168.20.1/24 dev veth1 + +ip -net "$r_b" route add 192.168.10.0/24 dev ipip0 +ip -net "$r_b" route add 10.2.2.0/24 via 10.4.4.254 +ip netns exec "$r_b" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null + +# Client A +ip -net "$c_a" addr add 192.168.10.2/24 dev veth0 +ip -net "$c_a" link set dev veth0 up +ip -net "$c_a" route add default via 192.168.10.1 + +# Client A +ip -net "$c_b" addr add 192.168.20.2/24 dev veth0 +ip -net "$c_b" link set dev veth0 up +ip -net "$c_b" route add default via 192.168.20.1 + +# Wan +ip -net "$r_w" addr add 10.2.2.254/24 dev veth0 +ip -net "$r_w" addr add 10.4.4.254/24 dev veth1 + +ip -net "$r_w" link set dev veth0 up mtu 1400 +ip -net "$r_w" link set dev veth1 up mtu 1400 + +ip -net "$r_a" link set dev veth0 mtu 1400 +ip -net "$r_b" link set dev veth0 mtu 1400 + +ip netns exec "$r_w" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null + +# Path MTU discovery +# ------------------ +# Running tracepath from Client A to Client B shows PMTU discovery is working +# as expected: +# +# clienta:~# tracepath 192.168.20.2 +# 1?: [LOCALHOST] pmtu 1500 +# 1: 192.168.10.1 0.867ms +# 1: 192.168.10.1 0.302ms +# 2: 192.168.10.1 0.312ms pmtu 1480 +# 2: no reply +# 3: 192.168.10.1 0.510ms pmtu 1380 +# 3: 192.168.20.2 2.320ms reached +# Resume: pmtu 1380 hops 3 back 3 + +# ip netns exec ${c_a} traceroute --mtu 192.168.20.2 + +# Router A has learned PMTU (1400) to Router B from Wanrouter. +# Client A has learned PMTU (1400 - IPIP overhead = 1380) to Client B +# from Router A. 
+ +#Send large UDP packet +#--------------------- +#Now we send a 1400 bytes UDP packet from Client A to Client B: + +# clienta:~# head -c1400 /dev/zero | tr "\000" "a" | socat -u STDIN UDP:192.168.20.2:5000 +test_path "without" + +# The IPv4 stack on Client A already knows the PMTU to Client B, so the +# UDP packet is sent as two fragments (1380 + 20). Router A forwards the +# fragments between eth1 and ipip0. The fragments fit into the tunnel and +# reach their destination. + +#When sending the large UDP packet again, Router A now reassembles the +#fragments before routing the packet over ipip0. The resulting IPIP +#packet is too big (1400) for the tunnel PMTU (1380) to Router B, it is +#dropped on Router A before sending. + +ip netns exec "$r_a" iptables -A FORWARD -m conntrack --ctstate NEW +test_path "with" diff --git a/tools/testing/selftests/net/netfilter/conntrack_sctp_collision.sh b/tools/testing/selftests/net/netfilter/conntrack_sctp_collision.sh new file mode 100755 index 0000000000..d860f7d974 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/conntrack_sctp_collision.sh @@ -0,0 +1,87 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Testing For SCTP COLLISION SCENARIO as Below: +# +# 14:35:47.655279 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [INIT] [init tag: 2017837359] +# 14:35:48.353250 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [INIT] [init tag: 1187206187] +# 14:35:48.353275 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [INIT ACK] [init tag: 2017837359] +# 14:35:48.353283 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [COOKIE ECHO] +# 14:35:48.353977 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [COOKIE ACK] +# 14:35:48.855335 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [INIT ACK] [init tag: 164579970] +# +# TOPO: SERVER_NS (link0)<--->(link1) ROUTER_NS (link2)<--->(link3) CLIENT_NS + +source lib.sh + +CLIENT_IP="198.51.200.1" +CLIENT_PORT=1234 + +SERVER_IP="198.51.100.1" +SERVER_PORT=1234 + +CLIENT_GW="198.51.200.2" 
+SERVER_GW="198.51.100.2" + +# setup the topo +setup() { + setup_ns CLIENT_NS SERVER_NS ROUTER_NS + ip -n "$SERVER_NS" link add link0 type veth peer name link1 netns "$ROUTER_NS" + ip -n "$CLIENT_NS" link add link3 type veth peer name link2 netns "$ROUTER_NS" + + ip -n "$SERVER_NS" link set link0 up + ip -n "$SERVER_NS" addr add $SERVER_IP/24 dev link0 + ip -n "$SERVER_NS" route add $CLIENT_IP dev link0 via $SERVER_GW + + ip -n "$ROUTER_NS" link set link1 up + ip -n "$ROUTER_NS" link set link2 up + ip -n "$ROUTER_NS" addr add $SERVER_GW/24 dev link1 + ip -n "$ROUTER_NS" addr add $CLIENT_GW/24 dev link2 + ip net exec "$ROUTER_NS" sysctl -wq net.ipv4.ip_forward=1 + + ip -n "$CLIENT_NS" link set link3 up + ip -n "$CLIENT_NS" addr add $CLIENT_IP/24 dev link3 + ip -n "$CLIENT_NS" route add $SERVER_IP dev link3 via $CLIENT_GW + + # simulate the delay on OVS upcall by setting up a delay for INIT_ACK with + # tc on $SERVER_NS side + tc -n "$SERVER_NS" qdisc add dev link0 root handle 1: htb r2q 64 + tc -n "$SERVER_NS" class add dev link0 parent 1: classid 1:1 htb rate 100mbit + tc -n "$SERVER_NS" filter add dev link0 parent 1: protocol ip u32 match ip protocol 132 \ + 0xff match u8 2 0xff at 32 flowid 1:1 + if ! 
tc -n "$SERVER_NS" qdisc add dev link0 parent 1:1 handle 10: netem delay 1200ms; then + echo "SKIP: Cannot add netem qdisc" + exit $ksft_skip + fi + + # simulate the ctstate check on OVS nf_conntrack + ip net exec "$ROUTER_NS" iptables -A FORWARD -m state --state INVALID,UNTRACKED -j DROP + ip net exec "$ROUTER_NS" iptables -A INPUT -p sctp -j DROP + + # use a smaller number for assoc's max_retrans to reproduce the issue + modprobe -q sctp + ip net exec "$CLIENT_NS" sysctl -wq net.sctp.association_max_retrans=3 +} + +cleanup() { + ip net exec "$CLIENT_NS" pkill sctp_collision >/dev/null 2>&1 + ip net exec "$SERVER_NS" pkill sctp_collision >/dev/null 2>&1 + cleanup_all_ns +} + +do_test() { + ip net exec "$SERVER_NS" ./sctp_collision server \ + $SERVER_IP $SERVER_PORT $CLIENT_IP $CLIENT_PORT & + ip net exec "$CLIENT_NS" ./sctp_collision client \ + $CLIENT_IP $CLIENT_PORT $SERVER_IP $SERVER_PORT +} + +# NOTE: one way to work around the issue is set a smaller hb_interval +# ip net exec $CLIENT_NS sysctl -wq net.sctp.hb_interval=3500 + +# run the test case +trap cleanup EXIT +setup && \ +echo "Test for SCTP Collision in nf_conntrack:" && \ +do_test && echo "PASS!" +exit $? diff --git a/tools/testing/selftests/net/netfilter/conntrack_tcp_unreplied.sh b/tools/testing/selftests/net/netfilter/conntrack_tcp_unreplied.sh new file mode 100755 index 0000000000..121ea93c01 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/conntrack_tcp_unreplied.sh @@ -0,0 +1,164 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Check that UNREPLIED tcp conntrack will eventually timeout. +# + +source lib.sh + +if ! nft --version > /dev/null 2>&1;then + echo "SKIP: Could not run test without nft tool" + exit $ksft_skip +fi + +if ! 
conntrack --version > /dev/null 2>&1;then + echo "SKIP: Could not run test without conntrack tool" + exit $ksft_skip +fi + +ret=0 + +cleanup() { + ip netns pids "$ns1" | xargs kill 2>/dev/null + ip netns pids "$ns2" | xargs kill 2>/dev/null + + cleanup_all_ns +} + +ipv4() { + echo -n 192.168."$1".2 +} + +check_counter() +{ + ns=$1 + name=$2 + expect=$3 + local lret=0 + + if ! ip netns exec "$ns2" nft list counter inet filter "$name" | grep -q "$expect"; then + echo "ERROR: counter $name in $ns2 has unexpected value (expected $expect)" 1>&2 + ip netns exec "$ns2" nft list counter inet filter "$name" 1>&2 + lret=1 + fi + + return $lret +} + +trap cleanup EXIT + +# Create test namespaces +setup_ns ns1 ns2 + +# Connect the namespace to the host using a veth pair +ip -net "$ns1" link add name veth1 type veth peer name veth2 +ip -net "$ns1" link set netns "$ns2" dev veth2 + +ip -net "$ns1" link set up dev lo +ip -net "$ns2" link set up dev lo +ip -net "$ns1" link set up dev veth1 +ip -net "$ns2" link set up dev veth2 + +ip -net "$ns2" addr add 10.11.11.2/24 dev veth2 +ip -net "$ns2" route add default via 10.11.11.1 + +ip netns exec "$ns2" sysctl -q net.ipv4.conf.veth2.forwarding=1 + +# add a rule inside NS so we enable conntrack +ip netns exec "$ns1" nft -f - </dev/null || exit 1 + +ip netns exec "$ns2" socat -u -4 TCP-LISTEN:8080,reuseaddr STDOUT & + +ip netns exec "$ns2" nft -f - < $ns2 to the virtual ip" +ip netns exec "$ns1" bash -c 'for i in $(seq 1 $BUSYWAIT_TIMEOUT) ; do + socat -u STDIN TCP:10.99.99.99:80 < /dev/null + sleep 0.1 + done' & + +wait_for_attempt() +{ + count=$(ip netns exec "$ns2" conntrack -L -p tcp --dport 80 2>/dev/null | wc -l) + if [ "$count" -gt 0 ]; then + return 0 + fi + + return 1 +} + +# wait for conntrack to pick the new connection request up before loading +# the nat redirect rule. +if ! 
busywait "$BUSYWAIT_TIMEOUT" wait_for_attempt; then + echo "ERROR: $ns2 did not pick up tcp connection from peer" + exit 1 +fi + +ip netns exec "$ns2" nft -f - </dev/null | wc -l) + if [ "$count" -gt 0 ]; then + return 0 + fi + + return 1 +} +echo "INFO: NAT redirect added in ns $ns2, waiting for $BUSYWAIT_TIMEOUT ms for nat to take effect" + +busywait "$BUSYWAIT_TIMEOUT" wait_for_redirect +ret=$? + +expect="packets 1 bytes 60" +if ! check_counter "$ns2" "redir" "$expect"; then + ret=1 +fi + +if [ $ret -eq 0 ];then + echo "PASS: redirection counter has expected values" +else + echo "ERROR: no tcp connection was redirected" +fi + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/conntrack_vrf.sh b/tools/testing/selftests/net/netfilter/conntrack_vrf.sh new file mode 100755 index 0000000000..073e8e62d3 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/conntrack_vrf.sh @@ -0,0 +1,220 @@ +#!/bin/bash + +# This script demonstrates interaction of conntrack and vrf. +# The vrf driver calls the netfilter hooks again, with oif/iif +# pointing at the VRF device. +# +# For ingress, this means first iteration has iifname of lower/real +# device. In this script, thats veth0. +# Second iteration is iifname set to vrf device, tvrf in this script. +# +# For egress, this is reversed: first iteration has the vrf device, +# second iteration is done with the lower/real/veth0 device. +# +# test_ct_zone_in demonstrates unexpected change of nftables +# behavior # caused by commit 09e856d54bda5f28 "vrf: Reset skb conntrack +# connection on VRF rcv" +# +# It was possible to assign conntrack zone to a packet (or mark it for +# `notracking`) in the prerouting chain before conntrack, based on real iif. +# +# After the change, the zone assignment is lost and the zone is assigned based +# on the VRF master interface (in case such a rule exists). +# assignment is lost. 
Instead, assignment based on the `iif` matching +# Thus it is impossible to distinguish packets based on the original +# interface. +# +# test_masquerade_vrf and test_masquerade_veth0 demonstrate the problem +# that was supposed to be fixed by the commit mentioned above to make sure +# that any fix to test case 1 won't break masquerade again. + +source lib.sh + +IP0=172.30.30.1 +IP1=172.30.30.2 +PFXL=30 +ret=0 + +cleanup() +{ + ip netns pids $ns0 | xargs kill 2>/dev/null + ip netns pids $ns1 | xargs kill 2>/dev/null + + cleanup_all_ns +} + +checktool "nft --version" "run test without nft" +checktool "conntrack --version" "run test without conntrack" +checktool "socat -h" "run test without socat" + +trap cleanup EXIT + +setup_ns ns0 ns1 + +ip netns exec "$ns0" sysctl -q -w net.ipv4.conf.default.rp_filter=0 +ip netns exec "$ns0" sysctl -q -w net.ipv4.conf.all.rp_filter=0 +ip netns exec "$ns0" sysctl -q -w net.ipv4.conf.all.rp_filter=0 + +if ! ip link add veth0 netns "$ns0" type veth peer name veth0 netns "$ns1" > /dev/null 2>&1; then + echo "SKIP: Could not add veth device" + exit $ksft_skip +fi + +if ! ip -net "$ns0" li add tvrf type vrf table 9876; then + echo "SKIP: Could not add vrf device" + exit $ksft_skip +fi + +ip -net "$ns0" li set veth0 master tvrf +ip -net "$ns0" li set tvrf up +ip -net "$ns0" li set veth0 up +ip -net "$ns1" li set veth0 up + +ip -net "$ns0" addr add $IP0/$PFXL dev veth0 +ip -net "$ns1" addr add $IP1/$PFXL dev veth0 + +listener_ready() +{ + local ns="$1" + + ss -N "$ns" -l -n -t -o "sport = :55555" | grep -q "55555" +} + +ip netns exec "$ns1" socat -u -4 TCP-LISTEN:55555,reuseaddr,fork STDOUT > /dev/null & +busywait $BUSYWAIT_TIMEOUT listener_ready "$ns1" + +# test vrf ingress handling. +# The incoming connection should be placed in conntrack zone 1, +# as decided by the first iteration of the ruleset. 
+test_ct_zone_in() +{ +ip netns exec "$ns0" nft -f - < /dev/null + + # should be in zone 1, not zone 2 + count=$(ip netns exec "$ns0" conntrack -L -s $IP1 -d $IP0 -p icmp --zone 1 2>/dev/null | wc -l) + if [ "$count" -eq 1 ]; then + echo "PASS: entry found in conntrack zone 1" + else + echo "FAIL: entry not found in conntrack zone 1" + count=$(ip netns exec "$ns0" conntrack -L -s $IP1 -d $IP0 -p icmp --zone 2 2> /dev/null | wc -l) + if [ "$count" -eq 1 ]; then + echo "FAIL: entry found in zone 2 instead" + else + echo "FAIL: entry not in zone 1 or 2, dumping table" + ip netns exec "$ns0" conntrack -L + ip netns exec "$ns0" nft list ruleset + fi + fi +} + +# add masq rule that gets evaluated w. outif set to vrf device. +# This tests the first iteration of the packet through conntrack, +# oifname is the vrf device. +test_masquerade_vrf() +{ + local qdisc=$1 + + if [ "$qdisc" != "default" ]; then + tc -net "$ns0" qdisc add dev tvrf root "$qdisc" + fi + + ip netns exec "$ns0" conntrack -F 2>/dev/null + +ip netns exec "$ns0" nft -f - < /dev/null;then + echo "FAIL: connect failure with masquerade + sport rewrite on vrf device" + ret=1 + return + fi + + # must also check that nat table was evaluated on second (lower device) iteration. + if ip netns exec "$ns0" nft list table ip nat |grep -q 'counter packets 1' && + ip netns exec "$ns0" nft list table ip nat |grep -q 'untracked counter packets [1-9]'; then + echo "PASS: connect with masquerade + sport rewrite on vrf device ($qdisc qdisc)" + else + echo "FAIL: vrf rules have unexpected counter value" + ret=1 + fi + + if [ "$qdisc" != "default" ]; then + tc -net "$ns0" qdisc del dev tvrf root + fi +} + +# add masq rule that gets evaluated w. outif set to veth device. +# This tests the 2nd iteration of the packet through conntrack, +# oifname is the lower device (veth0 in this case). 
+test_masquerade_veth() +{ + ip netns exec "$ns0" conntrack -F 2>/dev/null +ip netns exec "$ns0" nft -f - < /dev/null;then + echo "FAIL: connect failure with masquerade + sport rewrite on veth device" + ret=1 + return + fi + + # must also check that nat table was evaluated on second (lower device) iteration. + if ip netns exec "$ns0" nft list table ip nat |grep -q 'counter packets 1'; then + echo "PASS: connect with masquerade + sport rewrite on veth device" + else + echo "FAIL: vrf masq rule has unexpected counter value" + ret=1 + fi +} + +test_ct_zone_in +test_masquerade_vrf "default" +test_masquerade_vrf "pfifo" +test_masquerade_veth + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/ipvs.sh b/tools/testing/selftests/net/netfilter/ipvs.sh new file mode 100755 index 0000000000..4ceee9fb39 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/ipvs.sh @@ -0,0 +1,211 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# End-to-end ipvs test suite +# Topology: +#--------------------------------------------------------------+ +# | | +# ns0 | ns1 | +# ----------- | ----------- ----------- | +# | veth01 | --------- | veth10 | | veth12 | | +# ----------- peer ----------- ----------- | +# | | | | +# ----------- | | | +# | br0 | |----------------- peer |--------------| +# ----------- | | | +# | | | | +# ---------- peer ---------- ----------- | +# | veth02 | --------- | veth20 | | veth21 | | +# ---------- | ---------- ----------- | +# | ns2 | +# | | +#--------------------------------------------------------------+ +# +# We assume that all network driver are loaded +# + +source lib.sh + +ret=0 +GREEN='\033[0;92m' +RED='\033[0;31m' +NC='\033[0m' # No Color + +readonly port=8080 + +readonly vip_v4=207.175.44.110 +readonly cip_v4=10.0.0.2 +readonly gip_v4=10.0.0.1 +readonly dip_v4=172.16.0.1 +readonly rip_v4=172.16.0.2 +readonly sip_v4=10.0.0.3 + +readonly infile="$(mktemp)" +readonly outfile="$(mktemp)" +readonly datalen=32 + 
+sysipvsnet="/proc/sys/net/ipv4/vs/"
+if [ ! -d "$sysipvsnet" ]; then	# ipvs sysctl dir missing: try to load the module
+	if ! modprobe -q ip_vs; then
+		echo "skip: could not run test without ipvs module"
+		exit $ksft_skip
+	fi
+fi
+
+checktool "ipvsadm -v" "run test without ipvsadm"
+checktool "socat -h" "run test without socat"
+
+setup() {	# create ns0/ns1/ns2, wire the veth pairs and bridge, write the random payload
+	setup_ns ns0 ns1 ns2
+
+	ip link add veth01 netns "${ns0}" type veth peer name veth10 netns "${ns1}"
+	ip link add veth02 netns "${ns0}" type veth peer name veth20 netns "${ns2}"
+	ip link add veth12 netns "${ns1}" type veth peer name veth21 netns "${ns2}"
+
+	ip netns exec "${ns0}" ip link set veth01 up
+	ip netns exec "${ns0}" ip link set veth02 up
+	ip netns exec "${ns0}" ip link add br0 type bridge
+	ip netns exec "${ns0}" ip link set veth01 master br0
+	ip netns exec "${ns0}" ip link set veth02 master br0
+	ip netns exec "${ns0}" ip link set br0 up
+	ip netns exec "${ns0}" ip addr add "${cip_v4}/24" dev br0
+
+	ip netns exec "${ns1}" ip link set veth10 up
+	ip netns exec "${ns1}" ip addr add "${gip_v4}/24" dev veth10
+	ip netns exec "${ns1}" ip link set veth12 up
+	ip netns exec "${ns1}" ip addr add "${dip_v4}/24" dev veth12
+
+	ip netns exec "${ns2}" ip link set veth21 up
+	ip netns exec "${ns2}" ip addr add "${rip_v4}/24" dev veth21
+	ip netns exec "${ns2}" ip link set veth20 up
+	ip netns exec "${ns2}" ip addr add "${sip_v4}/24" dev veth20
+
+	sleep 1
+
+	dd if=/dev/urandom of="${infile}" bs="${datalen}" count=1 status=none
+}
+
+cleanup() {	# remove all namespaces and both temp files; also installed as the EXIT trap
+	cleanup_all_ns
+
+	if [ -f "${outfile}" ]; then
+		rm "${outfile}"
+	fi
+	if [ -f "${infile}" ]; then
+		rm "${infile}"
+	fi
+}
+
+server_listen() {	# start the real server in ns2; whatever it receives lands in ${outfile}
+	ip netns exec "$ns2" socat -u -4 TCP-LISTEN:"${port}",reuseaddr STDOUT > "${outfile}" &
+	server_pid=$!
+	sleep 0.2	# crude wait for the background listener before the client connects
+}
+
+client_connect() {	# send ${infile} from ns0 to the virtual IP, giving up after 2 seconds
+	ip netns exec "${ns0}" timeout 2 socat -u -4 STDIN TCP:"${vip_v4}":"${port}" < "${infile}"
+}
+
+verify_data() {	# wait for the server to exit, then succeed only if both files are identical
+	wait "${server_pid}"
+	cmp "$infile" "$outfile" 2>/dev/null
+}
+
+test_service() {	# one transfer through the virtual service; status is verify_data's status
+	server_listen
+	client_connect
+	verify_data
+}
+
+
+test_dr() {
+	ip netns exec "${ns0}" ip route add "${vip_v4}" via "${gip_v4}" dev br0
+
+	ip netns exec "${ns1}" sysctl -qw net.ipv4.ip_forward=1
+	ip netns exec "${ns1}" ipvsadm -A -t "${vip_v4}:${port}" -s rr
+	ip netns exec "${ns1}" ipvsadm -a -t "${vip_v4}:${port}" -r "${rip_v4}:${port}"	# no -m/-i: ipvsadm's default forwarding (direct routing)
+	ip netns exec "${ns1}" ip addr add "${vip_v4}/32" dev lo:1
+
+	# avoid incorrect arp response
+	ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.arp_ignore=1
+	ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.arp_announce=2
+	# avoid reverse route lookup
+	ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.rp_filter=0
+	ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.veth21.rp_filter=0
+	ip netns exec "${ns2}" ip addr add "${vip_v4}/32" dev lo:1
+
+	test_service
+}
+
+test_nat() {
+	ip netns exec "${ns0}" ip route add "${vip_v4}" via "${gip_v4}" dev br0
+
+	ip netns exec "${ns1}" sysctl -qw net.ipv4.ip_forward=1
+	ip netns exec "${ns1}" ipvsadm -A -t "${vip_v4}:${port}" -s rr
+	ip netns exec "${ns1}" ipvsadm -a -m -t "${vip_v4}:${port}" -r "${rip_v4}:${port}"	# -m: masquerading (NAT)
+	ip netns exec "${ns1}" ip addr add "${vip_v4}/32" dev lo:1
+
+	ip netns exec "${ns2}" ip link del veth20
+	ip netns exec "${ns2}" ip route add default via "${dip_v4}" dev veth21
+
+	test_service
+}
+
+test_tun() {
+	ip netns exec "${ns0}" ip route add "${vip_v4}" via "${gip_v4}" dev br0
+
+	ip netns exec "${ns1}" modprobe -q ipip
+	ip netns exec "${ns1}" ip link set tunl0 up
+	ip netns exec "${ns1}" sysctl -qw net.ipv4.ip_forward=0
+	ip netns exec "${ns1}" sysctl -qw net.ipv4.conf.all.send_redirects=0
+	ip netns exec "${ns1}" sysctl -qw net.ipv4.conf.default.send_redirects=0
+	ip netns exec "${ns1}" ipvsadm -A -t "${vip_v4}:${port}" -s rr
+	ip netns exec "${ns1}" ipvsadm -a -i -t "${vip_v4}:${port}" -r "${rip_v4}:${port}"	# -i: ipip tunneling
+	ip netns exec "${ns1}" ip addr add "${vip_v4}/32" dev lo:1
+
+	ip netns exec "${ns2}" modprobe -q ipip
+	ip netns exec "${ns2}" ip link set tunl0 up
+	ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.arp_ignore=1
+	ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.arp_announce=2
+	ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.rp_filter=0
+	ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.tunl0.rp_filter=0
+	ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.veth21.rp_filter=0
+	ip netns exec "${ns2}" ip addr add "${vip_v4}/32" dev lo:1
+
+	test_service
+}
+
+run_tests() {	# run each mode on a fresh topology; returns the sum of the three exit statuses
+	local errors=0
+
+	echo "Testing DR mode..."
+	cleanup
+	setup
+	test_dr
+	errors=$(( errors + $? ))
+
+	echo "Testing NAT mode..."
+	cleanup
+	setup
+	test_nat
+	errors=$(( errors + $? ))
+
+	echo "Testing Tunnel mode..."
+	cleanup
+	setup
+	test_tun
+	errors=$(( errors + $? ))
+
+	return $errors
+}
+
+trap cleanup EXIT
+
+run_tests
+
+if [ $? -ne 0 ]; then
+	echo -e "$(basename "$0"): ${RED}FAIL${NC}"
+	exit 1
+fi
+echo -e "$(basename "$0"): ${GREEN}PASS${NC}"
+exit 0
diff --git a/tools/testing/selftests/net/netfilter/lib.sh b/tools/testing/selftests/net/netfilter/lib.sh
new file mode 100644
index 0000000000..bedd35298e
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/lib.sh
@@ -0,0 +1,10 @@
+net_netfilter_dir=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
+
+source "$net_netfilter_dir/../lib.sh"
+
+checktool (){	# $1: probe command, deliberately word-split and run; $2: text for the SKIP message
+	if ! $1 > /dev/null 2>&1; then
+		echo "SKIP: Could not $2"
+		exit $ksft_skip
+	fi
+}
diff --git a/tools/testing/selftests/net/netfilter/nf_conntrack_packetdrill.sh b/tools/testing/selftests/net/netfilter/nf_conntrack_packetdrill.sh
new file mode 100755
index 0000000000..c6fdd2079f
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nf_conntrack_packetdrill.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source lib.sh
+
+checktool "conntrack --version" "run test without conntrack"
+checktool "iptables --version" "run test without iptables"
+checktool "ip6tables --version" "run test without ip6tables"
+
+modprobe -q tun
+modprobe -q nf_conntrack
+# echo 1 > /proc/sys/net/netfilter/nf_log_all_netns
+
+PDRILL_TIMEOUT=10
+
+files="
+conntrack_ack_loss_stall.pkt
+conntrack_inexact_rst.pkt
+conntrack_syn_challenge_ack.pkt
+conntrack_synack_old.pkt
+conntrack_synack_reuse.pkt
+conntrack_rst_invalid.pkt
+"
+
+if ! packetdrill --dry_run --verbose "packetdrill/conntrack_ack_loss_stall.pkt";then
+	echo "SKIP: packetdrill not installed"
+	exit ${ksft_skip}
+fi
+
+ret=0
+
+run_packetdrill()	# $1: script path, $2: ipv4|ipv6; status is that of the timeout/packetdrill run
+{
+	local filename="$1"
+	local ipver="$2"
+	local mtu=1500
+
+	export NFCT_IP_VERSION="$ipver"
+
+	if [ "$ipver" = "ipv4" ];then
+		export xtables="iptables"
+	elif [ "$ipver" = "ipv6" ];then
+		export xtables="ip6tables"
+		mtu=1520
+	fi
+
+	timeout "$PDRILL_TIMEOUT" unshare -n packetdrill --ip_version="$ipver" --mtu="$mtu" \
+		--tolerance_usecs=1000000 --non_fatal packet "$filename"
+}
+
+run_one_test_file()	# $1: script path; replays it for both IP versions, sets ret=1 on any failure
+{
+	local filename="$1"
+
+	for v in ipv4 ipv6;do
+		printf "%-50s(%s)%-20s" "$filename" "$v" ""
+		if run_packetdrill "$filename" "$v";then	# use our own argument, not the caller's loop variable
+			echo OK
+		else
+			echo FAIL
+			ret=1
+		fi
+	done
+}
+
+echo "Replaying packetdrill test cases:"
+for f in $files;do
+	run_one_test_file packetdrill/"$f"
+done
+
+exit $ret
diff --git a/tools/testing/selftests/net/netfilter/nf_nat_edemux.sh b/tools/testing/selftests/net/netfilter/nf_nat_edemux.sh
new file mode 100755
index 
0000000000..1014551dd7
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nf_nat_edemux.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test NAT source port clash resolution
+#
+
+source lib.sh
+ret=0
+socatpid=0	# stays 0 until the listener is started; cleanup() only kills a pid > 0
+
+cleanup()
+{
+	[ "$socatpid" -gt 0 ] && kill "$socatpid"
+
+	cleanup_all_ns
+}
+
+checktool "socat -h" "run test without socat"
+checktool "iptables --version" "run test without iptables"
+
+trap cleanup EXIT
+
+setup_ns ns1 ns2
+
+# Connect the namespaces using a veth pair
+ip link add name veth2 type veth peer name veth1
+ip link set netns "$ns1" dev veth1
+ip link set netns "$ns2" dev veth2
+
+ip netns exec "$ns1" ip link set up dev lo
+ip netns exec "$ns1" ip link set up dev veth1
+ip netns exec "$ns1" ip addr add 192.168.1.1/24 dev veth1
+
+ip netns exec "$ns2" ip link set up dev lo
+ip netns exec "$ns2" ip link set up dev veth2
+ip netns exec "$ns2" ip addr add 192.168.1.2/24 dev veth2
+
+# Create a server in one namespace
+ip netns exec "$ns1" socat -u TCP-LISTEN:5201,fork OPEN:/dev/null,wronly=1 &
+socatpid=$!	# remembered so cleanup() can kill the listener
+
+# Restrict source port to just one so we don't have to exhaust
+# all others.
+ip netns exec "$ns2" sysctl -q net.ipv4.ip_local_port_range="10000 10000"
+
+# add a virtual IP using DNAT
+ip netns exec "$ns2" iptables -t nat -A OUTPUT -d 10.96.0.1/32 -p tcp --dport 443 -j DNAT --to-destination 192.168.1.1:5201
+
+# ... and route it to the other namespace
+ip netns exec "$ns2" ip route add 10.96.0.1 via 192.168.1.1
+
+# add a persistent connection from the other namespace
+ip netns exec "$ns2" socat -t 10 - TCP:192.168.1.1:5201 > /dev/null &
+
+sleep 1
+
+# ip daddr:dport will be rewritten to 192.168.1.1 5201
+# NAT must reallocate source port 10000 because
+# 192.168.1.2:10000 -> 192.168.1.1:5201 is already in use
+echo test | ip netns exec "$ns2" socat -t 3 -u STDIN TCP:10.96.0.1:443,connect-timeout=3 >/dev/null
+ret=$?	# exit status of the socat client above
+
+# Check socat can connect to 10.96.0.1:443 (aka 192.168.1.1:5201).
+if [ $ret -eq 0 ]; then
+	echo "PASS: socat can connect via NAT'd address"
+else
+	echo "FAIL: socat cannot connect via NAT'd address"
+fi
+
+# check sport clashres.
+ip netns exec "$ns1" iptables -t nat -A PREROUTING -p tcp --dport 5202 -j REDIRECT --to-ports 5201
+ip netns exec "$ns1" iptables -t nat -A PREROUTING -p tcp --dport 5203 -j REDIRECT --to-ports 5201
+
+sleep 5 | ip netns exec "$ns2" socat -t 5 -u STDIN TCP:192.168.1.1:5202,connect-timeout=5 >/dev/null &
+
+# if connect succeeds, client closes instantly due to EOF on stdin.
+# if connect hangs, it will time out after 5s.
+echo | ip netns exec "$ns2" socat -t 3 -u STDIN TCP:192.168.1.1:5203,connect-timeout=5 >/dev/null &
+cpid2=$!
+
+time_then=$(date +%s)
+wait $cpid2
+rv=$?
+time_now=$(date +%s)
+
+# Check how much time has elapsed, expectation is for
+# 'cpid2' to connect and then exit (and no connect delay).
+delta=$((time_now - time_then))
+
+if [ $delta -lt 2 ] && [ $rv -eq 0 ]; then
+	echo "PASS: could connect to service via redirected ports"
+else
+	echo "FAIL: socat cannot connect to service via redirect ($delta seconds elapsed, returned $rv)"
+	ret=1
+fi
+
+exit $ret
diff --git a/tools/testing/selftests/net/netfilter/nf_queue.c b/tools/testing/selftests/net/netfilter/nf_queue.c
new file mode 100644
index 0000000000..9e56b9d470
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nf_queue.c
@@ -0,0 +1,421 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <time.h>
+#include <arpa/inet.h>
+
+#include <libmnl/libmnl.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_queue.h>
+
+/* Run-time options; main() sets defaults, parse_opts() the overrides. */
+struct options {
+	bool count_packets;	/* -c: count packets per hook, print on exit */
+	bool gso_enabled;	/* cleared by -G; requests NFQA_CFG_F_GSO */
+	int verbose;		/* incremented by each -v */
+	unsigned int queue_num;	/* -q: queue this program binds to */
+	unsigned int timeout;	/* -t: receive timeout in seconds, 0 = none */
+	uint32_t verdict;	/* verdict sent for every message */
+	uint32_t delay_ms;	/* -d: delay before each verdict */
+};
+
+/* Packets seen per hook; only updated when -c was given. */
+static unsigned int queue_stats[5];
+static struct options opts;
+
+static void help(const char *p)
+{
+	printf("Usage: %s [-c|-v [-vv] ] [-t timeout] [-q queue_num] [-Qdst_queue ] [ -d ms_delay ] [-G]\n", p);
+}
+
+/*
+ * mnl_attr_parse() callback: validate the attributes this tool knows about
+ * and store each one in the NFQA_* indexed table passed in @data.
+ */
+static int parse_attr_cb(const struct nlattr *attr, void *data)
+{
+	const struct nlattr **tb = data;
+	int type = mnl_attr_get_type(attr);
+
+	/* skip unsupported attribute in user-space */
+	if (mnl_attr_type_valid(attr, NFQA_MAX) < 0)
+		return MNL_CB_OK;
+
+	switch (type) {
+	case NFQA_MARK:
+	case NFQA_IFINDEX_INDEV:
+	case NFQA_IFINDEX_OUTDEV:
+	case NFQA_IFINDEX_PHYSINDEV:
+	case NFQA_IFINDEX_PHYSOUTDEV:
+		if (mnl_attr_validate(attr, MNL_TYPE_U32) < 0) {
+			perror("mnl_attr_validate");
+			return MNL_CB_ERROR;
+		}
+		break;
+	case NFQA_TIMESTAMP:
+		if (mnl_attr_validate2(attr, MNL_TYPE_UNSPEC,
+		    sizeof(struct nfqnl_msg_packet_timestamp)) < 0) {
+			perror("mnl_attr_validate2");
+			return MNL_CB_ERROR;
+		}
+		break;
+	case NFQA_HWADDR:
+		if (mnl_attr_validate2(attr, MNL_TYPE_UNSPEC,
+		    sizeof(struct nfqnl_msg_packet_hw)) < 0) {
+			perror("mnl_attr_validate2");
+			return MNL_CB_ERROR;
+		}
+		break;
+	case NFQA_PAYLOAD:
+		break;
+	}
+	tb[type] = attr;
+	return MNL_CB_OK;
+}
+
+/*
+ * mnl_cb_run() callback for one queued-packet message.
+ *
+ * Optionally logs and counts the packet.  The packet id (0 when the message
+ * carries no packet header) is handed back to mainloop() as MNL_CB_OK + id.
+ */
+static int queue_cb(const struct nlmsghdr *nlh, void *data)
+{
+	struct nlattr *tb[NFQA_MAX+1] = { 0 };
+	struct nfqnl_msg_packet_hdr *ph = NULL;
+	uint32_t id = 0;
+
+	(void)data;
+
+	mnl_attr_parse(nlh, sizeof(struct nfgenmsg), parse_attr_cb, tb);
+	if (tb[NFQA_PACKET_HDR]) {
+		ph = mnl_attr_get_payload(tb[NFQA_PACKET_HDR]);
+		id = ntohl(ph->packet_id);
+
+		/* arguments follow the format string: hook first, then hwproto */
+		if (opts.verbose > 0)
+			printf("packet hook=%u, hwproto 0x%x",
+				ph->hook, ntohs(ph->hw_protocol));
+
+		/* hook indexes queue_stats[], which has 5 slots */
+		if (ph->hook >= 5) {
+			fprintf(stderr, "Unknown hook %d\n", ph->hook);
+			return MNL_CB_ERROR;
+		}
+
+		if (opts.verbose > 0) {
+			uint32_t skbinfo = 0;
+
+			if (tb[NFQA_SKB_INFO])
+				skbinfo = ntohl(mnl_attr_get_u32(tb[NFQA_SKB_INFO]));
+			if (skbinfo & NFQA_SKB_CSUMNOTREADY)
+				printf(" csumnotready");
+			if (skbinfo & NFQA_SKB_GSO)
+				printf(" gso");
+			if (skbinfo & NFQA_SKB_CSUM_NOTVERIFIED)
+				printf(" csumnotverified");
+			puts("");
+		}
+
+		if (opts.count_packets)
+			queue_stats[ph->hook]++;
+	}
+
+	return MNL_CB_OK + id;
+}
+
+/* Build a config message in @buf that sends @command to @queue_num. */
+static struct nlmsghdr *
+nfq_build_cfg_request(char *buf, uint8_t command, int queue_num)
+{
+	struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
+	struct nfqnl_msg_config_cmd cmd = {
+		.command = command,
+		.pf = htons(AF_INET),
+	};
+	struct nfgenmsg *nfg;
+
+	nlh->nlmsg_type = (NFNL_SUBSYS_QUEUE << 8) | NFQNL_MSG_CONFIG;
+	nlh->nlmsg_flags = NLM_F_REQUEST;
+
+	nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg));
+
+	nfg->nfgen_family = AF_UNSPEC;
+	nfg->version = NFNETLINK_V0;
+	nfg->res_id = htons(queue_num);
+
+	mnl_attr_put(nlh, NFQA_CFG_CMD, sizeof(cmd), &cmd);
+
+	return nlh;
+}
+
+/* Build a config message in @buf that sets copy @mode and @range. */
+static struct nlmsghdr *
+nfq_build_cfg_params(char *buf, uint8_t mode, int range, int queue_num)
+{
+	struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
+	struct nfqnl_msg_config_params params = {
+		.copy_range = htonl(range),
+		.copy_mode = mode,
+	};
+	struct nfgenmsg *nfg;
+
+	nlh->nlmsg_type = (NFNL_SUBSYS_QUEUE << 8) | NFQNL_MSG_CONFIG;
+	nlh->nlmsg_flags = NLM_F_REQUEST;
+
+	nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg));
+	nfg->nfgen_family = AF_UNSPEC;
+	nfg->version = NFNETLINK_V0;
+	nfg->res_id = htons(queue_num);
+
+	mnl_attr_put(nlh, NFQA_CFG_PARAMS, sizeof(params), &params);
+
+	return nlh;
+}
+
+/* Build a verdict message in @buf applying @verd to packet @id. */
+static struct nlmsghdr *
+nfq_build_verdict(char *buf, int id, int queue_num, uint32_t verd)
+{
+	struct nfqnl_msg_verdict_hdr vh = {
+		.verdict = htonl(verd),
+		.id = htonl(id),
+	};
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfg;
+
+	nlh = mnl_nlmsg_put_header(buf);
+	nlh->nlmsg_type = (NFNL_SUBSYS_QUEUE << 8) | NFQNL_MSG_VERDICT;
+	nlh->nlmsg_flags = NLM_F_REQUEST;
+	nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg));
+	nfg->nfgen_family = AF_UNSPEC;
+	nfg->version = NFNETLINK_V0;
+	nfg->res_id = htons(queue_num);
+
+	mnl_attr_put(nlh, NFQA_VERDICT_HDR, sizeof(vh), &vh);
+
+	return nlh;
+}
+
+/* Print the per-hook packet counters and their sum. */
+static void print_stats(void)
+{
+	unsigned int total = 0;
+	int i;
+
+	for (i = 0; i < 5; i++) {
+		printf("hook %d packets %08u\n", i, queue_stats[i]);
+		total += queue_stats[i];
+	}
+
+	printf("%u packets total\n", total);
+}
+
+/*
+ * Open a netfilter netlink socket, bind it to opts.queue_num, request packet
+ * copies and apply the optional receive timeout.  Exits on any failure.
+ */
+struct mnl_socket *open_queue(void)
+{
+	char buf[MNL_SOCKET_BUFFER_SIZE];
+	unsigned int queue_num;
+	struct mnl_socket *nl;
+	struct nlmsghdr *nlh;
+	struct timeval tv;
+	uint32_t flags;
+
+	nl = mnl_socket_open(NETLINK_NETFILTER);
+	if (nl == NULL) {
+		perror("mnl_socket_open");
+		exit(EXIT_FAILURE);
+	}
+
+	if (mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID) < 0) {
+		perror("mnl_socket_bind");
+		exit(EXIT_FAILURE);
+	}
+
+	queue_num = opts.queue_num;
+	nlh = nfq_build_cfg_request(buf, NFQNL_CFG_CMD_BIND, queue_num);
+
+	if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) {
+		perror("mnl_socket_sendto");
+		exit(EXIT_FAILURE);
+	}
+
+	nlh = nfq_build_cfg_params(buf, NFQNL_COPY_PACKET, 0xFFFF, queue_num);
+
+	flags = opts.gso_enabled ? NFQA_CFG_F_GSO : 0;
+	flags |= NFQA_CFG_F_UID_GID;
+	mnl_attr_put_u32(nlh, NFQA_CFG_FLAGS, htonl(flags));
+	mnl_attr_put_u32(nlh, NFQA_CFG_MASK, htonl(flags));
+
+	if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) {
+		perror("mnl_socket_sendto");
+		exit(EXIT_FAILURE);
+	}
+
+	memset(&tv, 0, sizeof(tv));
+	tv.tv_sec = opts.timeout;
+	if (opts.timeout && setsockopt(mnl_socket_get_fd(nl),
+				       SOL_SOCKET, SO_RCVTIMEO,
+				       &tv, sizeof(tv))) {
+		perror("setsockopt(SO_RCVTIMEO)");
+		exit(EXIT_FAILURE);
+	}
+
+	return nl;
+}
+
+/* Sleep for @delay milliseconds. */
+static void sleep_ms(uint32_t delay)
+{
+	struct timespec ts = { .tv_sec = delay / 1000 };
+
+	delay %= 1000;
+
+	ts.tv_nsec = delay * 1000llu * 1000llu;
+
+	nanosleep(&ts, NULL);
+}
+
+/*
+ * Receive queued packets and answer each message with opts.verdict until the
+ * receive timeout expires (EAGAIN).  Returns 0 on timeout, exits on error.
+ */
+static int mainloop(void)
+{
+	unsigned int buflen = 64 * 1024 + MNL_SOCKET_BUFFER_SIZE;
+	struct mnl_socket *nl;
+	struct nlmsghdr *nlh;
+	unsigned int portid;
+	char *buf;
+	int ret;
+
+	buf = malloc(buflen);
+	if (!buf) {
+		perror("malloc");
+		exit(EXIT_FAILURE);
+	}
+
+	nl = open_queue();
+	portid = mnl_socket_get_portid(nl);
+
+	for (;;) {
+		uint32_t id;
+
+		ret = mnl_socket_recvfrom(nl, buf, buflen);
+		if (ret == -1) {
+			if (errno == ENOBUFS || errno == EINTR)
+				continue;
+
+			if (errno == EAGAIN) {
+				errno = 0;
+				ret = 0;
+				break;
+			}
+
+			perror("mnl_socket_recvfrom");
+			exit(EXIT_FAILURE);
+		}
+
+		ret = mnl_cb_run(buf, ret, 0, portid, queue_cb, NULL);
+		if (ret < 0) {
+			perror("mnl_cb_run");
+			exit(EXIT_FAILURE);
+		}
+
+		/* queue_cb() returned MNL_CB_OK + packet id */
+		id = ret - MNL_CB_OK;
+		if (opts.delay_ms)
+			sleep_ms(opts.delay_ms);
+
+		nlh = nfq_build_verdict(buf, id, opts.queue_num, opts.verdict);
+		if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) {
+			perror("mnl_socket_sendto");
+			exit(EXIT_FAILURE);
+		}
+	}
+
+	mnl_socket_close(nl);
+	free(buf);
+
+	return ret;
+}
+
+/*
+ * Parse the command line into opts.  Exits on a bad -Q or -d value and when
+ * the -Q destination queue equals the source queue.
+ */
+static void parse_opts(int argc, char **argv)
+{
+	int c;
+
+	while ((c = getopt(argc, argv, "chvt:q:Q:d:G")) != -1) {
+		switch (c) {
+		case 'c':
+			opts.count_packets = true;
+			break;
+		case 'h':
+			help(argv[0]);
+			exit(0);
+			break;
+		case 'q':
+			opts.queue_num = atoi(optarg);
+			if (opts.queue_num > 0xffff)
+				opts.queue_num = 0;
+			break;
+		case 'Q':
+			opts.verdict = atoi(optarg);
+			if (opts.verdict > 0xffff) {
+				fprintf(stderr, "Expected destination queue number\n");
+				exit(1);
+			}
+
+			/* destination queue goes in the upper 16 bits */
+			opts.verdict <<= 16;
+			opts.verdict |= NF_QUEUE;
+			break;
+		case 'd':
+			opts.delay_ms = atoi(optarg);
+			if (opts.delay_ms == 0) {
+				fprintf(stderr, "Expected nonzero delay (in milliseconds)\n");
+				exit(1);
+			}
+			break;
+		case 't':
+			opts.timeout = atoi(optarg);
+			break;
+		case 'G':
+			opts.gso_enabled = false;
+			break;
+		case 'v':
+			opts.verbose++;
+			break;
+		}
+	}
+
+	if (opts.verdict != NF_ACCEPT && (opts.verdict >> 16 == opts.queue_num)) {
+		fprintf(stderr, "Cannot use same destination and source queue\n");
+		exit(1);
+	}
+}
+
+int main(int argc, char *argv[])
+{
+	int ret;
+
+	opts.verdict = NF_ACCEPT;
+	opts.gso_enabled = true;
+
+	parse_opts(argc, argv);
+
+	ret = mainloop();
+	if (opts.count_packets)
+		print_stats();
+
+	return ret;
+}
diff --git a/tools/testing/selftests/net/netfilter/nft_audit.sh b/tools/testing/selftests/net/netfilter/nft_audit.sh
new file mode 100755
index 0000000000..902f8114bc
--- /dev/null
+++ 
b/tools/testing/selftests/net/netfilter/nft_audit.sh @@ -0,0 +1,268 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Check that audit logs generated for nft commands are as expected. + +SKIP_RC=4 +RC=0 + +if [ -r /var/run/auditd.pid ];then + read pid < /var/run/auditd.pid + p=$(pgrep ^auditd$) + + if [ "$pid" -eq "$p" ]; then + echo "SKIP: auditd is running" + exit $SKIP_RC + fi +fi + +nft --version >/dev/null 2>&1 || { + echo "SKIP: missing nft tool" + exit $SKIP_RC +} + +# nft must be recent enough to support "reset" keyword. +nft --check -f /dev/stdin >/dev/null 2>&1 <"$logfile" & +logread_pid=$! +trap 'kill $logread_pid; rm -f $logfile $rulefile' EXIT +exec 3<"$logfile" + +do_test() { # (cmd, log) + echo -n "testing for cmd: $1 ... " + cat <&3 >/dev/null + $1 >/dev/null || exit 1 + sleep 0.1 + res=$(diff -a -u <(echo "$2") - <&3) + [ $? -eq 0 ] && { echo "OK"; return; } + echo "FAIL" + grep -v '^\(---\|+++\|@@\)' <<< "$res" + ((RC--)) +} + +nft flush ruleset + +# adding tables, chains and rules + +for table in t1 t2; do + do_test "nft add table $table" \ + "table=$table family=2 entries=1 op=nft_register_table" + + do_test "nft add chain $table c1" \ + "table=$table family=2 entries=1 op=nft_register_chain" + + do_test "nft add chain $table c2; add chain $table c3" \ + "table=$table family=2 entries=2 op=nft_register_chain" + + cmd="add rule $table c1 counter" + + do_test "nft $cmd" \ + "table=$table family=2 entries=1 op=nft_register_rule" + + do_test "nft $cmd; $cmd" \ + "table=$table family=2 entries=2 op=nft_register_rule" + + cmd="" + sep="" + for chain in c2 c3; do + for i in {1..3}; do + cmd+="$sep add rule $table $chain counter" + sep=";" + done + done + do_test "nft $cmd" \ + "table=$table family=2 entries=6 op=nft_register_rule" +done + +for ((i = 0; i < 500; i++)); do + echo "add rule t2 c3 counter accept comment \"rule $i\"" +done > "$rulefile" +do_test "nft -f $rulefile" \ +'table=t2 family=2 entries=500 op=nft_register_rule' + +# adding 
sets and elements + +settype='type inet_service; counter' +setelem='{ 22, 80, 443 }' +setblock="{ $settype; elements = $setelem; }" +do_test "nft add set t1 s $setblock" \ +"table=t1 family=2 entries=4 op=nft_register_set" + +do_test "nft add set t1 s2 $setblock; add set t1 s3 { $settype; }" \ +"table=t1 family=2 entries=5 op=nft_register_set" + +do_test "nft add element t1 s3 $setelem" \ +"table=t1 family=2 entries=3 op=nft_register_setelem" + +# adding counters + +do_test 'nft add counter t1 c1' \ +'table=t1 family=2 entries=1 op=nft_register_obj' + +do_test 'nft add counter t2 c1; add counter t2 c2' \ +'table=t2 family=2 entries=2 op=nft_register_obj' + +for ((i = 3; i <= 500; i++)); do + echo "add counter t2 c$i" +done > "$rulefile" +do_test "nft -f $rulefile" \ +'table=t2 family=2 entries=498 op=nft_register_obj' + +# adding/updating quotas + +do_test 'nft add quota t1 q1 { 10 bytes }' \ +'table=t1 family=2 entries=1 op=nft_register_obj' + +do_test 'nft add quota t2 q1 { 10 bytes }; add quota t2 q2 { 10 bytes }' \ +'table=t2 family=2 entries=2 op=nft_register_obj' + +for ((i = 3; i <= 500; i++)); do + echo "add quota t2 q$i { 10 bytes }" +done > "$rulefile" +do_test "nft -f $rulefile" \ +'table=t2 family=2 entries=498 op=nft_register_obj' + +# changing the quota value triggers obj update path +do_test 'nft add quota t1 q1 { 20 bytes }' \ +'table=t1 family=2 entries=1 op=nft_register_obj' + +# resetting rules + +do_test 'nft reset rules t1 c2' \ +'table=t1 family=2 entries=3 op=nft_reset_rule' + +do_test 'nft reset rules table t1' \ +'table=t1 family=2 entries=3 op=nft_reset_rule +table=t1 family=2 entries=3 op=nft_reset_rule +table=t1 family=2 entries=3 op=nft_reset_rule' + +do_test 'nft reset rules t2 c3' \ +'table=t2 family=2 entries=189 op=nft_reset_rule +table=t2 family=2 entries=188 op=nft_reset_rule +table=t2 family=2 entries=126 op=nft_reset_rule' + +do_test 'nft reset rules t2' \ +'table=t2 family=2 entries=3 op=nft_reset_rule +table=t2 family=2 
entries=3 op=nft_reset_rule +table=t2 family=2 entries=186 op=nft_reset_rule +table=t2 family=2 entries=188 op=nft_reset_rule +table=t2 family=2 entries=129 op=nft_reset_rule' + +do_test 'nft reset rules' \ +'table=t1 family=2 entries=3 op=nft_reset_rule +table=t1 family=2 entries=3 op=nft_reset_rule +table=t1 family=2 entries=3 op=nft_reset_rule +table=t2 family=2 entries=3 op=nft_reset_rule +table=t2 family=2 entries=3 op=nft_reset_rule +table=t2 family=2 entries=180 op=nft_reset_rule +table=t2 family=2 entries=188 op=nft_reset_rule +table=t2 family=2 entries=135 op=nft_reset_rule' + +# resetting sets and elements + +elem=(22 ",80" ",443") +relem="" +for i in {1..3}; do + relem+="${elem[((i - 1))]}" + do_test "nft reset element t1 s { $relem }" \ + "table=t1 family=2 entries=$i op=nft_reset_setelem" +done + +do_test 'nft reset set t1 s' \ +'table=t1 family=2 entries=3 op=nft_reset_setelem' + +# resetting counters + +do_test 'nft reset counter t1 c1' \ +'table=t1 family=2 entries=1 op=nft_reset_obj' + +do_test 'nft reset counters t1' \ +'table=t1 family=2 entries=1 op=nft_reset_obj' + +do_test 'nft reset counters t2' \ +'table=t2 family=2 entries=342 op=nft_reset_obj +table=t2 family=2 entries=158 op=nft_reset_obj' + +do_test 'nft reset counters' \ +'table=t1 family=2 entries=1 op=nft_reset_obj +table=t2 family=2 entries=341 op=nft_reset_obj +table=t2 family=2 entries=159 op=nft_reset_obj' + +# resetting quotas + +do_test 'nft reset quota t1 q1' \ +'table=t1 family=2 entries=1 op=nft_reset_obj' + +do_test 'nft reset quotas t1' \ +'table=t1 family=2 entries=1 op=nft_reset_obj' + +do_test 'nft reset quotas t2' \ +'table=t2 family=2 entries=315 op=nft_reset_obj +table=t2 family=2 entries=185 op=nft_reset_obj' + +do_test 'nft reset quotas' \ +'table=t1 family=2 entries=1 op=nft_reset_obj +table=t2 family=2 entries=314 op=nft_reset_obj +table=t2 family=2 entries=186 op=nft_reset_obj' + +# deleting rules + +readarray -t handles < <(nft -a list chain t1 c1 | \ + sed -n 
's/.*counter.* handle \(.*\)$/\1/p') + +do_test "nft delete rule t1 c1 handle ${handles[0]}" \ +'table=t1 family=2 entries=1 op=nft_unregister_rule' + +cmd='delete rule t1 c1 handle' +do_test "nft $cmd ${handles[1]}; $cmd ${handles[2]}" \ +'table=t1 family=2 entries=2 op=nft_unregister_rule' + +do_test 'nft flush chain t1 c2' \ +'table=t1 family=2 entries=3 op=nft_unregister_rule' + +do_test 'nft flush table t2' \ +'table=t2 family=2 entries=509 op=nft_unregister_rule' + +# deleting chains + +do_test 'nft delete chain t2 c2' \ +'table=t2 family=2 entries=1 op=nft_unregister_chain' + +# deleting sets and elements + +do_test 'nft delete element t1 s { 22 }' \ +'table=t1 family=2 entries=1 op=nft_unregister_setelem' + +do_test 'nft delete element t1 s { 80, 443 }' \ +'table=t1 family=2 entries=2 op=nft_unregister_setelem' + +do_test 'nft flush set t1 s2' \ +'table=t1 family=2 entries=3 op=nft_unregister_setelem' + +do_test 'nft delete set t1 s2' \ +'table=t1 family=2 entries=1 op=nft_unregister_set' + +do_test 'nft delete set t1 s3' \ +'table=t1 family=2 entries=1 op=nft_unregister_set' + +exit $RC diff --git a/tools/testing/selftests/net/netfilter/nft_concat_range.sh b/tools/testing/selftests/net/netfilter/nft_concat_range.sh new file mode 100755 index 0000000000..6d66240e14 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_concat_range.sh @@ -0,0 +1,1622 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# nft_concat_range.sh - Tests for sets with concatenation of ranged fields +# +# Copyright (c) 2019 Red Hat GmbH +# +# Author: Stefano Brivio +# +# shellcheck disable=SC2154,SC2034,SC2016,SC2030,SC2031,SC2317 +# ^ Configuration and templates sourced with eval, counters reused in subshells + +source lib.sh + +# Available test groups: +# - reported_issues: check for issues that were reported in the past +# - correctness: check that packets match given entries, and only those +# - concurrency: attempt races between insertion, deletion and lookup +# - 
timeout: check that packets match entries until they expire +# - performance: estimate matching rate, compare with rbtree and hash baselines +TESTS="reported_issues correctness concurrency timeout" +[ -n "$NFT_CONCAT_RANGE_TESTS" ] && TESTS="${NFT_CONCAT_RANGE_TESTS}" + +# Set types, defined by TYPE_ variables below +TYPES="net_port port_net net6_port port_proto net6_port_mac net6_port_mac_proto + net_port_net net_mac mac_net net_mac_icmp net6_mac_icmp + net6_port_net6_port net_port_mac_proto_net" + +# Reported bugs, also described by TYPE_ variables below +BUGS="flush_remove_add reload" + +# List of possible paths to pktgen script from kernel tree for performance tests +PKTGEN_SCRIPT_PATHS=" + ../../../../../samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh + pktgen/pktgen_bench_xmit_mode_netif_receive.sh" + +# Definition of set types: +# display display text for test report +# type_spec nftables set type specifier +# chain_spec nftables type specifier for rules mapping to set +# dst call sequence of format_*() functions for destination fields +# src call sequence of format_*() functions for source fields +# start initial integer used to generate addresses and ports +# count count of entries to generate and match +# src_delta number summed to destination generator for source fields +# tools list of tools for correctness and timeout tests, any can be used +# proto L4 protocol of test packets +# +# race_repeat race attempts per thread, 0 disables concurrency test for type +# flood_tools list of tools for concurrency tests, any can be used +# flood_proto L4 protocol of test packets for concurrency tests +# flood_spec nftables type specifier for concurrency tests +# +# perf_duration duration of single pktgen injection test +# perf_spec nftables type specifier for performance tests +# perf_dst format_*() functions for destination fields in performance test +# perf_src format_*() functions for source fields in performance test +# perf_entries number of set entries 
for performance test +# perf_proto L3 protocol of test packets +TYPE_net_port=" +display net,port +type_spec ipv4_addr . inet_service +chain_spec ip daddr . udp dport +dst addr4 port +src +start 1 +count 5 +src_delta 2000 +tools sendip bash +proto udp + +race_repeat 3 +flood_tools iperf3 iperf netperf +flood_proto udp +flood_spec ip daddr . udp dport + +perf_duration 5 +perf_spec ip daddr . udp dport +perf_dst addr4 port +perf_src +perf_entries 1000 +perf_proto ipv4 +" + +TYPE_port_net=" +display port,net +type_spec inet_service . ipv4_addr +chain_spec udp dport . ip daddr +dst port addr4 +src +start 1 +count 5 +src_delta 2000 +tools sendip socat bash +proto udp + +race_repeat 3 +flood_tools iperf3 iperf netperf +flood_proto udp +flood_spec udp dport . ip daddr + +perf_duration 5 +perf_spec udp dport . ip daddr +perf_dst port addr4 +perf_src +perf_entries 100 +perf_proto ipv4 +" + +TYPE_net6_port=" +display net6,port +type_spec ipv6_addr . inet_service +chain_spec ip6 daddr . udp dport +dst addr6 port +src +start 10 +count 5 +src_delta 2000 +tools sendip socat bash +proto udp6 + +race_repeat 3 +flood_tools iperf3 iperf netperf +flood_proto tcp6 +flood_spec ip6 daddr . udp dport + +perf_duration 5 +perf_spec ip6 daddr . udp dport +perf_dst addr6 port +perf_src +perf_entries 1000 +perf_proto ipv6 +" + +TYPE_port_proto=" +display port,proto +type_spec inet_service . inet_proto +chain_spec udp dport . meta l4proto +dst port proto +src +start 1 +count 5 +src_delta 2000 +tools sendip socat bash +proto udp + +race_repeat 0 + +perf_duration 5 +perf_spec udp dport . meta l4proto +perf_dst port proto +perf_src +perf_entries 30000 +perf_proto ipv4 +" + +TYPE_net6_port_mac=" +display net6,port,mac +type_spec ipv6_addr . inet_service . ether_addr +chain_spec ip6 daddr . udp dport . ether saddr +dst addr6 port +src mac +start 10 +count 5 +src_delta 2000 +tools sendip socat bash +proto udp6 + +race_repeat 0 + +perf_duration 5 +perf_spec ip6 daddr . udp dport . 
ether daddr +perf_dst addr6 port mac +perf_src +perf_entries 10 +perf_proto ipv6 +" + +TYPE_net6_port_mac_proto=" +display net6,port,mac,proto +type_spec ipv6_addr . inet_service . ether_addr . inet_proto +chain_spec ip6 daddr . udp dport . ether saddr . meta l4proto +dst addr6 port +src mac proto +start 10 +count 5 +src_delta 2000 +tools sendip socat bash +proto udp6 + +race_repeat 0 + +perf_duration 5 +perf_spec ip6 daddr . udp dport . ether daddr . meta l4proto +perf_dst addr6 port mac proto +perf_src +perf_entries 1000 +perf_proto ipv6 +" + +TYPE_net_port_net=" +display net,port,net +type_spec ipv4_addr . inet_service . ipv4_addr +chain_spec ip daddr . udp dport . ip saddr +dst addr4 port +src addr4 +start 1 +count 5 +src_delta 2000 +tools sendip socat bash +proto udp + +race_repeat 3 +flood_tools iperf3 iperf netperf +flood_proto tcp +flood_spec ip daddr . udp dport . ip saddr + +perf_duration 0 +" + +TYPE_net6_port_net6_port=" +display net6,port,net6,port +type_spec ipv6_addr . inet_service . ipv6_addr . inet_service +chain_spec ip6 daddr . udp dport . ip6 saddr . udp sport +dst addr6 port +src addr6 port +start 10 +count 5 +src_delta 2000 +tools sendip socat +proto udp6 + +race_repeat 3 +flood_tools iperf3 iperf netperf +flood_proto tcp6 +flood_spec ip6 daddr . tcp dport . ip6 saddr . tcp sport + +perf_duration 0 +" + +TYPE_net_port_mac_proto_net=" +display net,port,mac,proto,net +type_spec ipv4_addr . inet_service . ether_addr . inet_proto . ipv4_addr +chain_spec ip daddr . udp dport . ether saddr . meta l4proto . ip saddr +dst addr4 port +src mac proto addr4 +start 1 +count 5 +src_delta 2000 +tools sendip socat bash +proto udp + +race_repeat 0 + +perf_duration 0 +" + +TYPE_net_mac=" +display net,mac +type_spec ipv4_addr . ether_addr +chain_spec ip daddr . ether saddr +dst addr4 +src mac +start 1 +count 5 +src_delta 2000 +tools sendip socat bash +proto udp + +race_repeat 0 + +perf_duration 5 +perf_spec ip daddr . 
ether daddr +perf_dst addr4 mac +perf_src +perf_entries 1000 +perf_proto ipv4 +" + +TYPE_mac_net=" +display mac,net +type_spec ether_addr . ipv4_addr +chain_spec ether saddr . ip saddr +dst +src mac addr4 +start 1 +count 5 +src_delta 2000 +tools sendip socat bash +proto udp + +race_repeat 0 + +perf_duration 0 +" + +TYPE_net_mac_icmp=" +display net,mac - ICMP +type_spec ipv4_addr . ether_addr +chain_spec ip daddr . ether saddr +dst addr4 +src mac +start 1 +count 5 +src_delta 2000 +tools ping +proto icmp + +race_repeat 0 + +perf_duration 0 +" + +TYPE_net6_mac_icmp=" +display net6,mac - ICMPv6 +type_spec ipv6_addr . ether_addr +chain_spec ip6 daddr . ether saddr +dst addr6 +src mac +start 10 +count 50 +src_delta 2000 +tools ping +proto icmp6 + +race_repeat 0 + +perf_duration 0 +" + +TYPE_net_port_proto_net=" +display net,port,proto,net +type_spec ipv4_addr . inet_service . inet_proto . ipv4_addr +chain_spec ip daddr . udp dport . meta l4proto . ip saddr +dst addr4 port proto +src addr4 +start 1 +count 5 +src_delta 2000 +tools sendip socat +proto udp + +race_repeat 3 +flood_tools iperf3 iperf netperf +flood_proto tcp +flood_spec ip daddr . tcp dport . meta l4proto . ip saddr + +perf_duration 0 +" + +# Definition of tests for bugs reported in the past: +# display display text for test report +TYPE_flush_remove_add=" +display Add two elements, flush, re-add +" + +TYPE_reload=" +display net,mac with reload +type_spec ipv4_addr . ether_addr +chain_spec ip daddr . 
ether saddr +dst addr4 +src mac +start 1 +count 1 +src_delta 2000 +tools sendip socat bash +proto udp + +race_repeat 0 + +perf_duration 0 +" + +# Set template for all tests, types and rules are filled in depending on test +set_template=' +flush ruleset + +table inet filter { + counter test { + packets 0 bytes 0 + } + + set test { + type ${type_spec} + flags interval,timeout + } + + chain input { + type filter hook prerouting priority 0; policy accept; + ${chain_spec} @test counter name \"test\" + } +} + +table netdev perf { + counter test { + packets 0 bytes 0 + } + + counter match { + packets 0 bytes 0 + } + + set test { + type ${type_spec} + flags interval + } + + set norange { + type ${type_spec} + } + + set noconcat { + type ${type_spec%% *} + flags interval + } + + chain test { + type filter hook ingress device veth_a priority 0; + } +} +' + +err_buf= +info_buf= + +# Append string to error buffer +err() { + err_buf="${err_buf}${1} +" +} + +# Append string to information buffer +info() { + info_buf="${info_buf}${1} +" +} + +# Flush error buffer to stdout +err_flush() { + printf "%s" "${err_buf}" + err_buf= +} + +# Flush information buffer to stdout +info_flush() { + printf "%s" "${info_buf}" + info_buf= +} + +# Setup veth pair: this namespace receives traffic, B generates it +setup_veth() { + ip netns add B + ip link add veth_a type veth peer name veth_b || return 1 + + ip link set veth_a up + ip link set veth_b netns B + + ip -n B link set veth_b up + + ip addr add dev veth_a 10.0.0.1 + ip route add default dev veth_a + + ip -6 addr add fe80::1/64 dev veth_a nodad + ip -6 addr add 2001:db8::1/64 dev veth_a nodad + ip -6 route add default dev veth_a + + ip -n B route add default dev veth_b + + ip -6 -n B addr add fe80::2/64 dev veth_b nodad + ip -6 -n B addr add 2001:db8::2/64 dev veth_b nodad + ip -6 -n B route add default dev veth_b + + B() { + ip netns exec B "$@" >/dev/null 2>&1 + } +} + +# Fill in set template and initialise set +setup_set() { + eval "echo 
\"${set_template}\"" | nft -f - +} + +# Check that at least one of the needed tools is available +check_tools() { + [ -z "${tools}" ] && return 0 + + __tools= + for tool in ${tools}; do + __tools="${__tools} ${tool}" + + command -v "${tool}" >/dev/null && return 0 + done + err "need one of:${__tools}, skipping" && return 1 +} + +# Set up function to send ICMP packets +setup_send_icmp() { + send_icmp() { + B ping -c1 -W1 "${dst_addr4}" >/dev/null 2>&1 + } +} + +# Set up function to send ICMPv6 packets +setup_send_icmp6() { + if command -v ping6 >/dev/null; then + send_icmp6() { + ip -6 addr add "${dst_addr6}" dev veth_a nodad \ + 2>/dev/null + B ping6 -q -c1 -W1 "${dst_addr6}" + } + else + send_icmp6() { + ip -6 addr add "${dst_addr6}" dev veth_a nodad \ + 2>/dev/null + B ping -q -6 -c1 -W1 "${dst_addr6}" + } + fi +} + +# Set up function to send single UDP packets on IPv4 +setup_send_udp() { + if command -v sendip >/dev/null; then + send_udp() { + [ -n "${src_port}" ] && src_port="-us ${src_port}" + [ -n "${dst_port}" ] && dst_port="-ud ${dst_port}" + [ -n "${src_addr4}" ] && src_addr4="-is ${src_addr4}" + + # shellcheck disable=SC2086 # sendip needs split options + B sendip -p ipv4 -p udp ${src_addr4} ${src_port} \ + ${dst_port} "${dst_addr4}" + + src_port= + dst_port= + src_addr4= + } + elif command -v socat -v >/dev/null; then + send_udp() { + if [ -n "${src_addr4}" ]; then + B ip addr add "${src_addr4}" dev veth_b + __socatbind=",bind=${src_addr4}" + if [ -n "${src_port}" ];then + __socatbind="${__socatbind}:${src_port}" + fi + fi + + ip addr add "${dst_addr4}" dev veth_a 2>/dev/null + [ -z "${dst_port}" ] && dst_port=12345 + + echo "test4" | B socat -t 0.01 STDIN UDP4-DATAGRAM:"$dst_addr4":"$dst_port""${__socatbind}" + + src_addr4= + src_port= + } + elif [ -z "$(bash -c 'type -p')" ]; then + send_udp() { + ip addr add "${dst_addr4}" dev veth_a 2>/dev/null + if [ -n "${src_addr4}" ]; then + B ip addr add "${src_addr4}/16" dev veth_b + B ip route add default dev 
veth_b + fi + + B bash -c "echo > /dev/udp/${dst_addr4}/${dst_port}" + + if [ -n "${src_addr4}" ]; then + B ip addr del "${src_addr4}/16" dev veth_b + fi + src_addr4= + } + else + return 1 + fi +} + +# Set up function to send single UDP packets on IPv6 +setup_send_udp6() { + if command -v sendip >/dev/null; then + send_udp6() { + [ -n "${src_port}" ] && src_port="-us ${src_port}" + [ -n "${dst_port}" ] && dst_port="-ud ${dst_port}" + if [ -n "${src_addr6}" ]; then + src_addr6="-6s ${src_addr6}" + else + src_addr6="-6s 2001:db8::2" + fi + ip -6 addr add "${dst_addr6}" dev veth_a nodad \ + 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + B sendip -p ipv6 -p udp ${src_addr6} ${src_port} \ + ${dst_port} "${dst_addr6}" + + src_port= + dst_port= + src_addr6= + } + elif command -v socat -v >/dev/null; then + send_udp6() { + ip -6 addr add "${dst_addr6}" dev veth_a nodad \ + 2>/dev/null + + __socatbind6= + + if [ -n "${src_addr6}" ]; then + B ip addr add "${src_addr6}" dev veth_b nodad + + __socatbind6=",bind=[${src_addr6}]" + + if [ -n "${src_port}" ] ;then + __socatbind6="${__socatbind6}:${src_port}" + fi + fi + + echo "test6" | B socat -t 0.01 STDIN UDP6-DATAGRAM:["$dst_addr6"]:"$dst_port""${__socatbind6}" + } + elif [ -z "$(bash -c 'type -p')" ]; then + send_udp6() { + ip -6 addr add "${dst_addr6}" dev veth_a nodad \ + 2>/dev/null + B ip addr add "${src_addr6}" dev veth_b nodad + B bash -c "echo > /dev/udp/${dst_addr6}/${dst_port}" + ip -6 addr del "${dst_addr6}" dev veth_a 2>/dev/null + } + else + return 1 + fi +} + +listener_ready() +{ + port="$1" + ss -lnt -o "sport = :$port" | grep -q "$port" +} + +# Set up function to send TCP traffic on IPv4 +setup_flood_tcp() { + if command -v iperf3 >/dev/null; then + flood_tcp() { + local n_port="${dst_port}" + [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" + if [ -n "${src_addr4}" ]; then + B ip addr add "${src_addr4}/16" dev veth_b + src_addr4="-B ${src_addr4}" + else + B ip addr add dev veth_b 
10.0.0.2 + src_addr4="-B 10.0.0.2" + fi + if [ -n "${src_port}" ]; then + src_port="--cport ${src_port}" + fi + B ip route add default dev veth_b 2>/dev/null + ip addr add "${dst_addr4}" dev veth_a 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + iperf3 -s -DB "${dst_addr4}" ${dst_port} >/dev/null 2>&1 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port" + + # shellcheck disable=SC2086 # this needs split options + B iperf3 -c "${dst_addr4}" ${dst_port} ${src_port} \ + ${src_addr4} -l16 -t 1000 + + src_addr4= + src_port= + dst_port= + } + elif command -v iperf >/dev/null; then + flood_tcp() { + local n_port="${dst_port}" + [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" + if [ -n "${src_addr4}" ]; then + B ip addr add "${src_addr4}/16" dev veth_b + src_addr4="-B ${src_addr4}" + else + B ip addr add dev veth_b 10.0.0.2 2>/dev/null + src_addr4="-B 10.0.0.2" + fi + if [ -n "${src_port}" ]; then + src_addr4="${src_addr4}:${src_port}" + fi + B ip route add default dev veth_b + ip addr add "${dst_addr4}" dev veth_a 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + iperf -s -DB "${dst_addr4}" ${dst_port} >/dev/null 2>&1 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port" + + # shellcheck disable=SC2086 # this needs split options + B iperf -c "${dst_addr4}" ${dst_port} ${src_addr4} \ + -l20 -t 1000 + + src_addr4= + src_port= + dst_port= + } + elif command -v netperf >/dev/null; then + flood_tcp() { + local n_port="${dst_port}" + [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" + if [ -n "${src_addr4}" ]; then + B ip addr add "${src_addr4}/16" dev veth_b + else + B ip addr add dev veth_b 10.0.0.2 + src_addr4="10.0.0.2" + fi + if [ -n "${src_port}" ]; then + dst_port="${dst_port},${src_port}" + fi + B ip route add default dev veth_b + ip addr add "${dst_addr4}" dev veth_a 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + netserver -4 ${dst_port} -L "${dst_addr4}" \ + >/dev/null 2>&1 + busywait 
"$BUSYWAIT_TIMEOUT" listener_ready "${n_port}" + + # shellcheck disable=SC2086 # this needs split options + B netperf -4 -H "${dst_addr4}" ${dst_port} \ + -L "${src_addr4}" -l 1000 -t TCP_STREAM + + src_addr4= + src_port= + dst_port= + } + else + return 1 + fi +} + +# Set up function to send TCP traffic on IPv6 +setup_flood_tcp6() { + if command -v iperf3 >/dev/null; then + flood_tcp6() { + local n_port="${dst_port}" + [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" + if [ -n "${src_addr6}" ]; then + B ip addr add "${src_addr6}" dev veth_b nodad + src_addr6="-B ${src_addr6}" + else + src_addr6="-B 2001:db8::2" + fi + if [ -n "${src_port}" ]; then + src_port="--cport ${src_port}" + fi + B ip route add default dev veth_b + ip -6 addr add "${dst_addr6}" dev veth_a nodad \ + 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + iperf3 -s -DB "${dst_addr6}" ${dst_port} >/dev/null 2>&1 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "${n_port}" + + # shellcheck disable=SC2086 # this needs split options + B iperf3 -c "${dst_addr6}" ${dst_port} \ + ${src_port} ${src_addr6} -l16 -t 1000 + + src_addr6= + src_port= + dst_port= + } + elif command -v iperf >/dev/null; then + flood_tcp6() { + local n_port="${dst_port}" + [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" + if [ -n "${src_addr6}" ]; then + B ip addr add "${src_addr6}" dev veth_b nodad + src_addr6="-B ${src_addr6}" + else + src_addr6="-B 2001:db8::2" + fi + if [ -n "${src_port}" ]; then + src_addr6="${src_addr6}:${src_port}" + fi + B ip route add default dev veth_b + ip -6 addr add "${dst_addr6}" dev veth_a nodad \ + 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + iperf -s -VDB "${dst_addr6}" ${dst_port} >/dev/null 2>&1 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port" + + # shellcheck disable=SC2086 # this needs split options + B iperf -c "${dst_addr6}" -V ${dst_port} \ + ${src_addr6} -l1 -t 1000 + + src_addr6= + src_port= + dst_port= + } + elif command -v netperf 
>/dev/null; then + flood_tcp6() { + local n_port="${dst_port}" + [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" + if [ -n "${src_addr6}" ]; then + B ip addr add "${src_addr6}" dev veth_b nodad + else + src_addr6="2001:db8::2" + fi + if [ -n "${src_port}" ]; then + dst_port="${dst_port},${src_port}" + fi + B ip route add default dev veth_b + ip -6 addr add "${dst_addr6}" dev veth_a nodad \ + 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + netserver -6 ${dst_port} -L "${dst_addr6}" \ + >/dev/null 2>&1 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port" + + # shellcheck disable=SC2086 # this needs split options + B netperf -6 -H "${dst_addr6}" ${dst_port} \ + -L "${src_addr6}" -l 1000 -t TCP_STREAM + + src_addr6= + src_port= + dst_port= + } + else + return 1 + fi +} + +# Set up function to send UDP traffic on IPv4 +setup_flood_udp() { + if command -v iperf3 >/dev/null; then + flood_udp() { + local n_port="${dst_port}" + [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" + if [ -n "${src_addr4}" ]; then + B ip addr add "${src_addr4}/16" dev veth_b + src_addr4="-B ${src_addr4}" + else + B ip addr add dev veth_b 10.0.0.2 2>/dev/null + src_addr4="-B 10.0.0.2" + fi + if [ -n "${src_port}" ]; then + src_port="--cport ${src_port}" + fi + B ip route add default dev veth_b + ip addr add "${dst_addr4}" dev veth_a 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + iperf3 -s -DB "${dst_addr4}" ${dst_port} + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port" + + # shellcheck disable=SC2086 # this needs split options + B iperf3 -u -c "${dst_addr4}" -Z -b 100M -l16 -t1000 \ + ${dst_port} ${src_port} ${src_addr4} + + src_addr4= + src_port= + dst_port= + } + elif command -v iperf >/dev/null; then + flood_udp() { + local n_port="${dst_port}" + [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" + if [ -n "${src_addr4}" ]; then + B ip addr add "${src_addr4}/16" dev veth_b + src_addr4="-B ${src_addr4}" + else + B ip addr add dev 
veth_b 10.0.0.2 + src_addr4="-B 10.0.0.2" + fi + if [ -n "${src_port}" ]; then + src_addr4="${src_addr4}:${src_port}" + fi + B ip route add default dev veth_b + ip addr add "${dst_addr4}" dev veth_a 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + iperf -u -sDB "${dst_addr4}" ${dst_port} >/dev/null 2>&1 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port" + + # shellcheck disable=SC2086 # this needs split options + B iperf -u -c "${dst_addr4}" -b 100M -l1 -t1000 \ + ${dst_port} ${src_addr4} + + src_addr4= + src_port= + dst_port= + } + elif command -v netperf >/dev/null; then + flood_udp() { + local n_port="${dst_port}" + [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" + if [ -n "${src_addr4}" ]; then + B ip addr add "${src_addr4}/16" dev veth_b + else + B ip addr add dev veth_b 10.0.0.2 + src_addr4="10.0.0.2" + fi + if [ -n "${src_port}" ]; then + dst_port="${dst_port},${src_port}" + fi + B ip route add default dev veth_b + ip addr add "${dst_addr4}" dev veth_a 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + netserver -4 ${dst_port} -L "${dst_addr4}" \ + >/dev/null 2>&1 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port" + + # shellcheck disable=SC2086 # this needs split options + B netperf -4 -H "${dst_addr4}" ${dst_port} \ + -L "${src_addr4}" -l 1000 -t UDP_STREAM + + src_addr4= + src_port= + dst_port= + } + else + return 1 + fi +} + +# Find pktgen script and set up function to start pktgen injection +setup_perf() { + for pktgen_script_path in ${PKTGEN_SCRIPT_PATHS} __notfound; do + command -v "${pktgen_script_path}" >/dev/null && break + done + [ "${pktgen_script_path}" = "__notfound" ] && return 1 + + perf_ipv4() { + ${pktgen_script_path} -s80 \ + -i veth_a -d "${dst_addr4}" -p "${dst_port}" \ + -m "${dst_mac}" \ + -t $(($(nproc) / 5 + 1)) -b10000 -n0 2>/dev/null & + perf_pid=$! 
+ } + perf_ipv6() { + IP6=6 ${pktgen_script_path} -s100 \ + -i veth_a -d "${dst_addr6}" -p "${dst_port}" \ + -m "${dst_mac}" \ + -t $(($(nproc) / 5 + 1)) -b10000 -n0 2>/dev/null & + perf_pid=$! + } +} + +# Clean up before each test +cleanup() { + nft reset counter inet filter test >/dev/null 2>&1 + nft flush ruleset >/dev/null 2>&1 + ip link del dummy0 2>/dev/null + ip route del default 2>/dev/null + ip -6 route del default 2>/dev/null + ip netns pids B 2>/dev/null | xargs kill 2>/dev/null + ip netns del B 2>/dev/null + ip link del veth_a 2>/dev/null + timeout= + killall iperf3 2>/dev/null + killall iperf 2>/dev/null + killall netperf 2>/dev/null + killall netserver 2>/dev/null +} + +cleanup_exit() { + cleanup + rm -f "$tmp" +} + +# Entry point for setup functions +setup() { + if [ "$(id -u)" -ne 0 ]; then + echo " need to run as root" + exit ${ksft_skip} + fi + + cleanup + check_tools || return 1 + for arg do + if ! eval setup_"${arg}"; then + err " ${arg} not supported" + return 1 + fi + done +} + +# Format integer into IPv4 address, summing 10.0.0.5 (arbitrary) to it +format_addr4() { + a=$((${1} + 16777216 * 10 + 5)) + printf "%i.%i.%i.%i" \ + "$((a / 16777216))" "$((a % 16777216 / 65536))" \ + "$((a % 65536 / 256))" "$((a % 256))" +} + +# Format integer into IPv6 address, summing 2001:db8:: to it +format_addr6() { + printf "2001:db8::%04x:%04x" "$((${1} / 65536))" "$((${1} % 65536))" +} + +# Format integer into EUI-48 address, summing 00:01:00:00:00:00 to it +format_mac() { + printf "00:01:%02x:%02x:%02x:%02x" \ + "$((${1} / 16777216))" "$((${1} % 16777216 / 65536))" \ + "$((${1} % 65536 / 256))" "$((${1} % 256))" +} + +# Format integer into port, avoid 0 port +format_port() { + printf "%i" "$((${1} % 65534 + 1))" +} + +# Drop suffixed '6' from L4 protocol, if any +format_proto() { + printf "%s" "${proto}" | tr -d 6 +} + +# Format destination and source fields into nft concatenated type +format() { + __start= + __end= + __expr="{ " + + for f in ${dst}; do + [ 
"${__expr}" != "{ " ] && __expr="${__expr} . " + + __start="$(eval format_"${f}" "${start}")" + __end="$(eval format_"${f}" "${end}")" + + if [ "${f}" = "proto" ]; then + __expr="${__expr}${__start}" + else + __expr="${__expr}${__start}-${__end}" + fi + done + for f in ${src}; do + [ "${__expr}" != "{ " ] && __expr="${__expr} . " + + __start="$(eval format_"${f}" "${srcstart}")" + __end="$(eval format_"${f}" "${srcend}")" + + if [ "${f}" = "proto" ]; then + __expr="${__expr}${__start}" + else + __expr="${__expr}${__start}-${__end}" + fi + done + + if [ -n "${timeout}" ]; then + echo "${__expr} timeout ${timeout}s }" + else + echo "${__expr} }" + fi +} + +# Format destination and source fields into nft type, start element only +format_norange() { + __expr="{ " + + for f in ${dst}; do + [ "${__expr}" != "{ " ] && __expr="${__expr} . " + + __expr="${__expr}$(eval format_"${f}" "${start}")" + done + for f in ${src}; do + __expr="${__expr} . $(eval format_"${f}" "${start}")" + done + + echo "${__expr} }" +} + +# Format first destination field into nft type +format_noconcat() { + for f in ${dst}; do + __start="$(eval format_"${f}" "${start}")" + __end="$(eval format_"${f}" "${end}")" + + if [ "${f}" = "proto" ]; then + echo "{ ${__start} }" + else + echo "{ ${__start}-${__end} }" + fi + return + done +} + +# Add single entry to 'test' set in 'inet filter' table +add() { + if ! nft add element inet filter test "${1}"; then + err "Failed to add ${1} given ruleset:" + err "$(nft -a list ruleset)" + return 1 + fi +} + +# Format and output entries for sets in 'netdev perf' table +add_perf() { + if [ "${1}" = "test" ]; then + echo "add element netdev perf test $(format)" + elif [ "${1}" = "norange" ]; then + echo "add element netdev perf norange $(format_norange)" + elif [ "${1}" = "noconcat" ]; then + echo "add element netdev perf noconcat $(format_noconcat)" + fi +} + +# Add single entry to 'norange' set in 'netdev perf' table +add_perf_norange() { + if ! 
nft add element netdev perf norange "${1}"; then + err "Failed to add ${1} given ruleset:" + err "$(nft -a list ruleset)" + return 1 + fi +} + +# Add single entry to 'noconcat' set in 'netdev perf' table +add_perf_noconcat() { + if ! nft add element netdev perf noconcat "${1}"; then + err "Failed to add ${1} given ruleset:" + err "$(nft -a list ruleset)" + return 1 + fi +} + +# Delete single entry from set +del() { + if ! nft delete element inet filter test "${1}"; then + err "Failed to delete ${1} given ruleset:" + err "$(nft -a list ruleset)" + return 1 + fi +} + +# Return packet count from 'test' counter in 'inet filter' table +count_packets() { + found=0 + for token in $(nft list counter inet filter test); do + [ ${found} -eq 1 ] && echo "${token}" && return + [ "${token}" = "packets" ] && found=1 + done +} + +# Return packet count from 'test' counter in 'netdev perf' table +count_perf_packets() { + found=0 + for token in $(nft list counter netdev perf test); do + [ ${found} -eq 1 ] && echo "${token}" && return + [ "${token}" = "packets" ] && found=1 + done +} + +# Set MAC addresses, send traffic according to specifier +flood() { + ip link set veth_a address "$(format_mac "${1}")" + ip -n B link set veth_b address "$(format_mac "${2}")" + + for f in ${dst}; do + eval dst_"$f"=\$\(format_\$f "${1}"\) + done + for f in ${src}; do + eval src_"$f"=\$\(format_\$f "${2}"\) + done + eval flood_\$proto +} + +# Set MAC addresses, start pktgen injection +perf() { + dst_mac="$(format_mac "${1}")" + ip link set veth_a address "${dst_mac}" + + for f in ${dst}; do + eval dst_"$f"=\$\(format_\$f "${1}"\) + done + for f in ${src}; do + eval src_"$f"=\$\(format_\$f "${2}"\) + done + eval perf_\$perf_proto +} + +# Set MAC addresses, send single packet, check that it matches, reset counter +send_match() { + ip link set veth_a address "$(format_mac "${1}")" + ip -n B link set veth_b address "$(format_mac "${2}")" + + for f in ${dst}; do + eval dst_"$f"=\$\(format_\$f "${1}"\) + 
done + for f in ${src}; do + eval src_"$f"=\$\(format_\$f "${2}"\) + done + eval send_\$proto + if [ "$(count_packets)" != "1" ]; then + err "${proto} packet to:" + err " $(for f in ${dst}; do + eval format_\$f "${1}"; printf ' '; done)" + err "from:" + err " $(for f in ${src}; do + eval format_\$f "${2}"; printf ' '; done)" + err "should have matched ruleset:" + err "$(nft -a list ruleset)" + return 1 + fi + nft reset counter inet filter test >/dev/null +} + +# Set MAC addresses, send single packet, check that it doesn't match +send_nomatch() { + ip link set veth_a address "$(format_mac "${1}")" + ip -n B link set veth_b address "$(format_mac "${2}")" + + for f in ${dst}; do + eval dst_"$f"=\$\(format_\$f "${1}"\) + done + for f in ${src}; do + eval src_"$f"=\$\(format_\$f "${2}"\) + done + eval send_\$proto + if [ "$(count_packets)" != "0" ]; then + err "${proto} packet to:" + err " $(for f in ${dst}; do + eval format_\$f "${1}"; printf ' '; done)" + err "from:" + err " $(for f in ${src}; do + eval format_\$f "${2}"; printf ' '; done)" + err "should not have matched ruleset:" + err "$(nft -a list ruleset)" + return 1 + fi +} + +# Correctness test template: +# - add ranged element, check that packets match it +# - check that packets outside range don't match it +# - remove some elements, check that packets don't match anymore +test_correctness() { + setup veth send_"${proto}" set || return ${ksft_skip} + + range_size=1 + for i in $(seq "${start}" $((start + count))); do + end=$((start + range_size)) + + # Avoid negative or zero-sized port ranges + if [ $((end / 65534)) -gt $((start / 65534)) ]; then + start=${end} + end=$((end + 1)) + fi + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + add "$(format)" || return 1 + for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do + send_match "${j}" $((j + src_delta)) || return 1 + done + send_nomatch $((end + 1)) $((end + 1 + src_delta)) || return 1 + + # Delete elements now and then + if [ $((i % 
3)) -eq 0 ]; then + del "$(format)" || return 1 + for j in $(seq "$start" \ + $((range_size / 2 + 1)) ${end}); do + send_nomatch "${j}" $((j + src_delta)) \ + || return 1 + done + fi + + range_size=$((range_size + 1)) + start=$((end + range_size)) + done +} + +# Concurrency test template: +# - add all the elements +# - start a thread for each physical thread that: +# - adds all the elements +# - flushes the set +# - adds all the elements +# - flushes the entire ruleset +# - adds the set back +# - adds all the elements +# - delete all the elements +test_concurrency() { + proto=${flood_proto} + tools=${flood_tools} + chain_spec=${flood_spec} + setup veth flood_"${proto}" set || return ${ksft_skip} + + range_size=1 + cstart=${start} + flood_pids= + for i in $(seq "$start" $((start + count))); do + end=$((start + range_size)) + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + add "$(format)" || return 1 + + flood "${i}" $((i + src_delta)) & flood_pids="${flood_pids} $!" + + range_size=$((range_size + 1)) + start=$((end + range_size)) + done + + sleep $((RANDOM%10)) + + pids= + for c in $(seq 1 "$(nproc)"); do ( + for r in $(seq 1 "${race_repeat}"); do + range_size=1 + + # $start needs to be local to this subshell + # shellcheck disable=SC2030 + start=${cstart} + for i in $(seq "$start" $((start + count))); do + end=$((start + range_size)) + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + add "$(format)" 2>/dev/null + + range_size=$((range_size + 1)) + start=$((end + range_size)) + done + + nft flush inet filter test 2>/dev/null + + range_size=1 + start=${cstart} + for i in $(seq "$start" $((start + count))); do + end=$((start + range_size)) + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + add "$(format)" 2>/dev/null + + range_size=$((range_size + 1)) + start=$((end + range_size)) + done + + nft flush ruleset + setup set 2>/dev/null + + range_size=1 + start=${cstart} + for i in $(seq "$start" $((start + count))); do 
+ end=$((start + range_size)) + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + add "$(format)" 2>/dev/null + + range_size=$((range_size + 1)) + start=$((end + range_size)) + done + + range_size=1 + start=${cstart} + for i in $(seq "$start" $((start + count))); do + end=$((start + range_size)) + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + del "$(format)" 2>/dev/null + + range_size=$((range_size + 1)) + start=$((end + range_size)) + done + done + ) & pids="${pids} $!" + done + + # shellcheck disable=SC2046,SC2086 # word splitting wanted here + wait $(for pid in ${pids}; do echo ${pid}; done) + # shellcheck disable=SC2046,SC2086 + kill $(for pid in ${flood_pids}; do echo ${pid}; done) 2>/dev/null + # shellcheck disable=SC2046,SC2086 + wait $(for pid in ${flood_pids}; do echo ${pid}; done) 2>/dev/null + + return 0 +} + +# Timeout test template: +# - add all the elements with 3s timeout while checking that packets match +# - wait 3s after the last insertion, check that packets don't match any entry +test_timeout() { + setup veth send_"${proto}" set || return ${ksft_skip} + + timeout=3 + + [ "$KSFT_MACHINE_SLOW" = "yes" ] && timeout=8 + + range_size=1 + for i in $(seq "$start" $((start + count))); do + end=$((start + range_size)) + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + add "$(format)" || return 1 + + for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do + send_match "${j}" $((j + src_delta)) || return 1 + done + + range_size=$((range_size + 1)) + start=$((end + range_size)) + done + sleep $timeout + for i in $(seq "$start" $((start + count))); do + end=$((start + range_size)) + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do + send_nomatch "${j}" $((j + src_delta)) || return 1 + done + + range_size=$((range_size + 1)) + start=$((end + range_size)) + done +} + +# Performance test template: +# - add concatenated ranged 
entries +# - add non-ranged concatenated entries (for hash set matching rate baseline) +# - add ranged entries with first field only (for rbhash baseline) +# - start pktgen injection directly on device rx path of this namespace +# - measure drop only rate, hash and rbtree baselines, then matching rate +test_performance() { + chain_spec=${perf_spec} + dst="${perf_dst}" + src="${perf_src}" + setup veth perf set || return ${ksft_skip} + + first=${start} + range_size=1 + for set in test norange noconcat; do + start=${first} + for i in $(seq "$start" $((start + perf_entries))); do + end=$((start + range_size)) + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + if [ $((end / 65534)) -gt $((start / 65534)) ]; then + start=${end} + end=$((end + 1)) + elif [ "$start" -eq "$end" ]; then + end=$((start + 1)) + fi + + add_perf ${set} + + start=$((end + range_size)) + done > "${tmp}" + nft -f "${tmp}" + done + + perf $((end - 1)) "$srcstart" + + sleep 2 + + nft add rule netdev perf test counter name \"test\" drop + nft reset counter netdev perf test >/dev/null 2>&1 + sleep "${perf_duration}" + pps="$(printf %10s $(($(count_perf_packets) / perf_duration)))" + info " baseline (drop from netdev hook): ${pps}pps" + handle="$(nft -a list chain netdev perf test | grep counter)" + handle="${handle##* }" + nft delete rule netdev perf test handle "${handle}" + + nft add rule "netdev perf test ${chain_spec} @norange \ + counter name \"test\" drop" + nft reset counter netdev perf test >/dev/null 2>&1 + sleep "${perf_duration}" + pps="$(printf %10s $(($(count_perf_packets) / perf_duration)))" + info " baseline hash (non-ranged entries): ${pps}pps" + handle="$(nft -a list chain netdev perf test | grep counter)" + handle="${handle##* }" + nft delete rule netdev perf test handle "${handle}" + + nft add rule "netdev perf test ${chain_spec%%. 
*} @noconcat \ + counter name \"test\" drop" + nft reset counter netdev perf test >/dev/null 2>&1 + sleep "${perf_duration}" + pps="$(printf %10s $(($(count_perf_packets) / perf_duration)))" + info " baseline rbtree (match on first field only): ${pps}pps" + handle="$(nft -a list chain netdev perf test | grep counter)" + handle="${handle##* }" + nft delete rule netdev perf test handle "${handle}" + + nft add rule "netdev perf test ${chain_spec} @test \ + counter name \"test\" drop" + nft reset counter netdev perf test >/dev/null 2>&1 + sleep "${perf_duration}" + pps="$(printf %10s $(($(count_perf_packets) / perf_duration)))" + p5="$(printf %5s "${perf_entries}")" + info " set with ${p5} full, ranged entries: ${pps}pps" + kill "${perf_pid}" +} + +test_bug_flush_remove_add() { + rounds=100 + [ "$KSFT_MACHINE_SLOW" = "yes" ] && rounds=10 + + set_cmd='{ set s { type ipv4_addr . inet_service; flags interval; }; }' + elem1='{ 10.0.0.1 . 22-25, 10.0.0.1 . 10-20 }' + elem2='{ 10.0.0.1 . 10-20, 10.0.0.1 . 
22-25 }' + for i in $(seq 1 $rounds); do + nft add table t "$set_cmd" || return ${ksft_skip} + nft add element t s "$elem1" 2>/dev/null || return 1 + nft flush set t s 2>/dev/null || return 1 + nft add element t s "$elem2" 2>/dev/null || return 1 + done + nft flush ruleset +} + +# - add ranged element, check that packets match it +# - reload the set, check packets still match +test_bug_reload() { + setup veth send_"${proto}" set || return ${ksft_skip} + rstart=${start} + + range_size=1 + for i in $(seq "${start}" $((start + count))); do + end=$((start + range_size)) + + # Avoid negative or zero-sized port ranges + if [ $((end / 65534)) -gt $((start / 65534)) ]; then + start=${end} + end=$((end + 1)) + fi + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + add "$(format)" || return 1 + range_size=$((range_size + 1)) + start=$((end + range_size)) + done + + # check kernel does allocate pcpu sctrach map + # for reload with no elemet add/delete + ( echo flush set inet filter test ; + nft list set inet filter test ) | nft -f - + + start=${rstart} + range_size=1 + + for i in $(seq "${start}" $((start + count))); do + end=$((start + range_size)) + + # Avoid negative or zero-sized port ranges + if [ $((end / 65534)) -gt $((start / 65534)) ]; then + start=${end} + end=$((end + 1)) + fi + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do + send_match "${j}" $((j + src_delta)) || return 1 + done + + range_size=$((range_size + 1)) + start=$((end + range_size)) + done + + nft flush ruleset +} + +test_reported_issues() { + eval test_bug_"${subtest}" +} + +# Run everything in a separate network namespace +[ "${1}" != "run" ] && { unshare -n "${0}" run; exit $?; } +tmp="$(mktemp)" +trap cleanup_exit EXIT + +# Entry point for test runs +passed=0 +for name in ${TESTS}; do + printf "TEST: %s\n" "$(echo "$name" | tr '_' ' ')" + if [ "${name}" = "reported_issues" ]; then + SUBTESTS="${BUGS}" + 
else + SUBTESTS="${TYPES}" + fi + + for subtest in ${SUBTESTS}; do + eval desc=\$TYPE_"${subtest}" + IFS=' +' + for __line in ${desc}; do + # shellcheck disable=SC2086 + eval ${__line%% *}=\"${__line##* }\"; + done + IFS=' +' + + if [ "${name}" = "concurrency" ] && \ + [ "${race_repeat}" = "0" ]; then + continue + fi + if [ "${name}" = "performance" ] && \ + [ "${perf_duration}" = "0" ]; then + continue + fi + + [ "$KSFT_MACHINE_SLOW" = "yes" ] && count=1 + + printf " %-32s " "${display}" + tthen=$(date +%s) + eval test_"${name}" + ret=$? + + tnow=$(date +%s) + printf "%5ds%-30s" $((tnow-tthen)) + + if [ $ret -eq 0 ]; then + printf "[ OK ]\n" + info_flush + passed=$((passed + 1)) + elif [ $ret -eq 1 ]; then + printf "[FAIL]\n" + err_flush + exit 1 + elif [ $ret -eq ${ksft_skip} ]; then + printf "[SKIP]\n" + err_flush + fi + done +done + +[ ${passed} -eq 0 ] && exit ${ksft_skip} || exit 0 diff --git a/tools/testing/selftests/net/netfilter/nft_concat_range_perf.sh b/tools/testing/selftests/net/netfilter/nft_concat_range_perf.sh new file mode 100755 index 0000000000..5d276995a5 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_concat_range_perf.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# + +source lib.sh + +[ "$KSFT_MACHINE_SLOW" = yes ] && exit ${ksft_skip} + +NFT_CONCAT_RANGE_TESTS="performance" exec ./nft_concat_range.sh diff --git a/tools/testing/selftests/net/netfilter/nft_conntrack_helper.sh b/tools/testing/selftests/net/netfilter/nft_conntrack_helper.sh new file mode 100755 index 0000000000..abcaa73371 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_conntrack_helper.sh @@ -0,0 +1,171 @@ +#!/bin/bash +# +# This tests connection tracking helper assignment: +# 1. can attach ftp helper to a connection from nft ruleset. +# 2. auto-assign still works. +# +# Kselftest framework requirement - SKIP code is 4. 
+ +source lib.sh + +ret=0 + +testipv6=1 + +checktool "socat -h" "run test without socat" +checktool "conntrack --version" "run test without conntrack" +checktool "nft --version" "run test without nft" + +cleanup() +{ + ip netns pids "$ns1" | xargs kill 2>/dev/null + + ip netns del "$ns1" + ip netns del "$ns2" +} + +trap cleanup EXIT + +setup_ns ns1 ns2 + +if ! ip link add veth0 netns "$ns1" type veth peer name veth0 netns "$ns2" > /dev/null 2>&1;then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip +fi + +ip -net "$ns1" link set veth0 up +ip -net "$ns2" link set veth0 up + +ip -net "$ns1" addr add 10.0.1.1/24 dev veth0 +ip -net "$ns1" addr add dead:1::1/64 dev veth0 nodad + +ip -net "$ns2" addr add 10.0.1.2/24 dev veth0 +ip -net "$ns2" addr add dead:1::2/64 dev veth0 nodad + +load_ruleset_family() { + local family=$1 + local ns=$2 + +ip netns exec "$ns" nft -f - < /dev/null |grep -q 'helper=ftp';then + if [ "$autoassign" -eq 0 ] ;then + echo "FAIL: ${netns} did not show attached helper $message" 1>&2 + ret=1 + else + echo "PASS: ${netns} did not show attached helper $message" 1>&2 + fi + else + if [ "$autoassign" -eq 0 ] ;then + echo "PASS: ${netns} connection on port $port has ftp helper attached" 1>&2 + else + echo "FAIL: ${netns} connection on port $port has ftp helper attached" 1>&2 + ret=1 + fi + fi + + return 0 +} + +listener_ready() +{ + ns="$1" + port="$2" + proto="$3" + ss -N "$ns" -lnt -o "sport = :$port" | grep -q "$port" +} + +test_helper() +{ + local port=$1 + local autoassign=$2 + + if [ "$autoassign" -eq 0 ] ;then + msg="set via ruleset" + else + msg="auto-assign" + fi + + ip netns exec "$ns2" socat -t 3 -u -4 TCP-LISTEN:"$port",reuseaddr STDOUT > /dev/null & + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns2" "$port" "-4" + + ip netns exec "$ns1" socat -u -4 STDIN TCP:10.0.1.2:"$port" < /dev/null > /dev/null + + check_for_helper "$ns1" "ip $msg" "$port" "$autoassign" + check_for_helper "$ns2" "ip $msg" "$port" 
"$autoassign" + + if [ $testipv6 -eq 0 ] ;then + return 0 + fi + + ip netns exec "$ns1" conntrack -F 2> /dev/null + ip netns exec "$ns2" conntrack -F 2> /dev/null + + ip netns exec "$ns2" socat -t 3 -u -6 TCP-LISTEN:"$port",reuseaddr STDOUT > /dev/null & + busywait $BUSYWAIT_TIMEOUT listener_ready "$ns2" "$port" "-6" + + ip netns exec "$ns1" socat -t 3 -u -6 STDIN TCP:"[dead:1::2]":"$port" < /dev/null > /dev/null + + check_for_helper "$ns1" "ipv6 $msg" "$port" + check_for_helper "$ns2" "ipv6 $msg" "$port" +} + +if ! load_ruleset_family ip "$ns1"; then + echo "FAIL: ${ns1} cannot load ip ruleset" 1>&2 + exit 1 +fi + +if ! load_ruleset_family ip6 "$ns1"; then + echo "SKIP: ${ns1} cannot load ip6 ruleset" 1>&2 + testipv6=0 +fi + +if ! load_ruleset_family inet "${ns2}"; then + echo "SKIP: ${ns1} cannot load inet ruleset" 1>&2 + if ! load_ruleset_family ip "${ns2}"; then + echo "FAIL: ${ns2} cannot load ip ruleset" 1>&2 + exit 1 + fi + + if [ "$testipv6" -eq 1 ] ;then + if ! load_ruleset_family ip6 "$ns2"; then + echo "FAIL: ${ns2} cannot load ip6 ruleset" 1>&2 + exit 1 + fi + fi +fi + +test_helper 2121 0 +ip netns exec "$ns1" sysctl -qe 'net.netfilter.nf_conntrack_helper=1' +ip netns exec "$ns2" sysctl -qe 'net.netfilter.nf_conntrack_helper=1' +test_helper 21 1 + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/nft_fib.sh b/tools/testing/selftests/net/netfilter/nft_fib.sh new file mode 100755 index 0000000000..ce1451c275 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_fib.sh @@ -0,0 +1,234 @@ +#!/bin/bash +# +# This tests the fib expression. +# +# Kselftest framework requirement - SKIP code is 4. 
+ +source lib.sh + +ret=0 + +timeout=4 + +log_netns=$(sysctl -n net.netfilter.nf_log_all_netns) + +cleanup() +{ + cleanup_all_ns + + [ "$log_netns" -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns +} + +checktool "nft --version" "run test without nft" + +setup_ns nsrouter ns1 ns2 + +trap cleanup EXIT + +if dmesg | grep -q ' nft_rpfilter: ';then + dmesg -c | grep ' nft_rpfilter: ' + echo "WARN: a previous test run has failed" 1>&2 +fi + +sysctl -q net.netfilter.nf_log_all_netns=1 + +load_ruleset() { + local netns=$1 + +ip netns exec "$netns" nft -f /dev/stdin <&2 + ip netns exec "$ns" nft list table inet filter + return 1 + fi + + if [ "$want" -gt 0 ]; then + echo "PASS: fib expression did drop packets for $address" + fi + + return 0 +} + +load_ruleset "$nsrouter" +load_ruleset "$ns1" +load_ruleset "$ns2" + +if ! ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1; then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip +fi +ip link add veth1 netns "$nsrouter" type veth peer name eth0 netns "$ns2" + +ip -net "$nsrouter" link set veth0 up +ip -net "$nsrouter" addr add 10.0.1.1/24 dev veth0 +ip -net "$nsrouter" addr add dead:1::1/64 dev veth0 nodad + +ip -net "$nsrouter" link set veth1 up +ip -net "$nsrouter" addr add 10.0.2.1/24 dev veth1 +ip -net "$nsrouter" addr add dead:2::1/64 dev veth1 nodad + +ip -net "$ns1" link set eth0 up +ip -net "$ns2" link set eth0 up + +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 +ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad +ip -net "$ns1" route add default via 10.0.1.1 +ip -net "$ns1" route add default via dead:1::1 + +ip -net "$ns2" addr add 10.0.2.99/24 dev eth0 +ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad +ip -net "$ns2" route add default via 10.0.2.1 +ip -net "$ns2" route add default via dead:2::1 + +test_ping() { + local daddr4=$1 + local daddr6=$2 + + if ! 
ip netns exec "$ns1" ping -c 1 -q "$daddr4" > /dev/null; then + check_drops + echo "FAIL: ${ns1} cannot reach $daddr4, ret $ret" 1>&2 + return 1 + fi + + if ! ip netns exec "$ns1" ping -c 1 -q "$daddr6" > /dev/null; then + check_drops + echo "FAIL: ${ns1} cannot reach $daddr6, ret $ret" 1>&2 + return 1 + fi + + return 0 +} + +ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.all.rp_filter=0 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.rp_filter=0 > /dev/null + +test_ping 10.0.2.1 dead:2::1 || exit 1 +check_drops || exit 1 + +test_ping 10.0.2.99 dead:2::99 || exit 1 +check_drops || exit 1 + +echo "PASS: fib expression did not cause unwanted packet drops" + +ip netns exec "$nsrouter" nft flush table inet filter + +ip -net "$ns1" route del default +ip -net "$ns1" -6 route del default + +ip -net "$ns1" addr del 10.0.1.99/24 dev eth0 +ip -net "$ns1" addr del dead:1::99/64 dev eth0 + +ip -net "$ns1" addr add 10.0.2.99/24 dev eth0 +ip -net "$ns1" addr add dead:2::99/64 dev eth0 nodad + +ip -net "$ns1" route add default via 10.0.2.1 +ip -net "$ns1" -6 route add default via dead:2::1 + +ip -net "$nsrouter" addr add dead:2::1/64 dev veth0 nodad + +# switch to ruleset that doesn't log, this time +# its expected that this does drop the packets. +load_ruleset_count "$nsrouter" + +# ns1 has a default route, but nsrouter does not. +# must not check return value, ping to 1.1.1.1 will +# fail. 
+check_fib_counter 0 "$nsrouter" 1.1.1.1 || exit 1 +check_fib_counter 0 "$nsrouter" 1c3::c01d || exit 1 + +ip netns exec "$ns1" ping -W 0.5 -c 1 -q 1.1.1.1 > /dev/null +check_fib_counter 1 "$nsrouter" 1.1.1.1 || exit 1 + +ip netns exec "$ns1" ping -W 0.5 -i 0.1 -c 3 -q 1c3::c01d > /dev/null +check_fib_counter 3 "$nsrouter" 1c3::c01d || exit 1 + +# delete all rules +ip netns exec "$ns1" nft flush ruleset +ip netns exec "$ns2" nft flush ruleset +ip netns exec "$nsrouter" nft flush ruleset + +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 +ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad + +ip -net "$ns1" addr del 10.0.2.99/24 dev eth0 +ip -net "$ns1" addr del dead:2::99/64 dev eth0 + +ip -net "$nsrouter" addr del dead:2::1/64 dev veth0 + +# ... pbr ruleset for the router, check iif+oif. +if ! load_pbr_ruleset "$nsrouter";then + echo "SKIP: Could not load fib forward ruleset" + exit $ksft_skip +fi + +ip -net "$nsrouter" rule add from all table 128 +ip -net "$nsrouter" rule add from all iif veth0 table 129 +ip -net "$nsrouter" route add table 128 to 10.0.1.0/24 dev veth0 +ip -net "$nsrouter" route add table 129 to 10.0.2.0/24 dev veth1 + +# drop main ipv4 table +ip -net "$nsrouter" -4 rule delete table main + +if ! test_ping 10.0.2.99 dead:2::99;then + ip -net "$nsrouter" nft list ruleset + echo "FAIL: fib mismatch in pbr setup" + exit 1 +fi + +echo "PASS: fib expression forward check with policy based routing" +exit 0 diff --git a/tools/testing/selftests/net/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh new file mode 100755 index 0000000000..b399555085 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh @@ -0,0 +1,671 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# This tests basic flowtable functionality. 
+# Creates following default topology: +# +# Originator (MTU 9000) <-Router1-> MTU 1500 <-Router2-> Responder (MTU 2000) +# Router1 is the one doing flow offloading, Router2 has no special +# purpose other than having a link that is smaller than either Originator +# and responder, i.e. TCPMSS announced values are too large and will still +# result in fragmentation and/or PMTU discovery. +# +# You can check with different Orgininator/Link/Responder MTU eg: +# nft_flowtable.sh -o8000 -l1500 -r2000 +# + +source lib.sh + +ret=0 +SOCAT_TIMEOUT=60 + +nsin="" +ns1out="" +ns2out="" + +log_netns=$(sysctl -n net.netfilter.nf_log_all_netns) + +checktool "nft --version" "run test without nft tool" +checktool "socat -h" "run test without socat" + +setup_ns ns1 ns2 nsr1 nsr2 + +cleanup() { + ip netns pids "$ns1" | xargs kill 2>/dev/null + ip netns pids "$ns2" | xargs kill 2>/dev/null + + cleanup_all_ns + + rm -f "$nsin" "$ns1out" "$ns2out" + + [ "$log_netns" -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns="$log_netns" +} + +trap cleanup EXIT + +sysctl -q net.netfilter.nf_log_all_netns=1 + +ip link add veth0 netns "$nsr1" type veth peer name eth0 netns "$ns1" +ip link add veth1 netns "$nsr1" type veth peer name veth0 netns "$nsr2" + +ip link add veth1 netns "$nsr2" type veth peer name eth0 netns "$ns2" + +for dev in veth0 veth1; do + ip -net "$nsr1" link set "$dev" up + ip -net "$nsr2" link set "$dev" up +done + +ip -net "$nsr1" addr add 10.0.1.1/24 dev veth0 +ip -net "$nsr1" addr add dead:1::1/64 dev veth0 nodad + +ip -net "$nsr2" addr add 10.0.2.1/24 dev veth1 +ip -net "$nsr2" addr add dead:2::1/64 dev veth1 nodad + +# set different MTUs so we need to push packets coming from ns1 (large MTU) +# to ns2 (smaller MTU) to stack either to perform fragmentation (ip_no_pmtu_disc=1), +# or to do PTMU discovery (send ICMP error back to originator). +# ns2 is going via nsr2 with a smaller mtu, so that TCPMSS announced by both peers +# is NOT the lowest link mtu. 
+ +omtu=9000 +lmtu=1500 +rmtu=2000 + +usage(){ + echo "nft_flowtable.sh [OPTIONS]" + echo + echo "MTU options" + echo " -o originator" + echo " -l link" + echo " -r responder" + exit 1 +} + +while getopts "o:l:r:" o +do + case $o in + o) omtu=$OPTARG;; + l) lmtu=$OPTARG;; + r) rmtu=$OPTARG;; + *) usage;; + esac +done + +if ! ip -net "$nsr1" link set veth0 mtu "$omtu"; then + exit 1 +fi + +ip -net "$ns1" link set eth0 mtu "$omtu" + +if ! ip -net "$nsr2" link set veth1 mtu "$rmtu"; then + exit 1 +fi + +if ! ip -net "$nsr1" link set veth1 mtu "$lmtu"; then + exit 1 +fi + +if ! ip -net "$nsr2" link set veth0 mtu "$lmtu"; then + exit 1 +fi + +ip -net "$ns2" link set eth0 mtu "$rmtu" + +# transfer-net between nsr1 and nsr2. +# these addresses are not used for connections. +ip -net "$nsr1" addr add 192.168.10.1/24 dev veth1 +ip -net "$nsr1" addr add fee1:2::1/64 dev veth1 nodad + +ip -net "$nsr2" addr add 192.168.10.2/24 dev veth0 +ip -net "$nsr2" addr add fee1:2::2/64 dev veth0 nodad + +for i in 0 1; do + ip netns exec "$nsr1" sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null + ip netns exec "$nsr2" sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null +done + +for ns in "$ns1" "$ns2";do + ip -net "$ns" link set eth0 up + + if ! 
ip netns exec "$ns" sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null; then + echo "ERROR: Check Originator/Responder values (problem during address addition)" + exit 1 + fi + # don't set ip DF bit for first two tests + ip netns exec "$ns" sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null +done + +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 +ip -net "$ns2" addr add 10.0.2.99/24 dev eth0 +ip -net "$ns1" route add default via 10.0.1.1 +ip -net "$ns2" route add default via 10.0.2.1 +ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad +ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad +ip -net "$ns1" route add default via dead:1::1 +ip -net "$ns2" route add default via dead:2::1 + +ip -net "$nsr1" route add default via 192.168.10.2 +ip -net "$nsr2" route add default via 192.168.10.1 + +ip netns exec "$nsr1" nft -f - < /dev/null; then + echo "ERROR: $ns1 cannot reach ns2" 1>&2 + exit 1 +fi + +if ! ip netns exec "$ns2" ping -c 1 -q 10.0.1.99 > /dev/null; then + echo "ERROR: $ns2 cannot reach $ns1" 1>&2 + exit 1 +fi + +nsin=$(mktemp) +ns1out=$(mktemp) +ns2out=$(mktemp) + +make_file() +{ + name=$1 + + SIZE=$((RANDOM % (1024 * 128))) + SIZE=$((SIZE + (1024 * 8))) + TSIZE=$((SIZE * 1024)) + + dd if=/dev/urandom of="$name" bs=1024 count=$SIZE 2> /dev/null + + SIZE=$((RANDOM % 1024)) + SIZE=$((SIZE + 128)) + TSIZE=$((TSIZE + SIZE)) + dd if=/dev/urandom conf=notrunc of="$name" bs=1 count=$SIZE 2> /dev/null +} + +check_counters() +{ + local what=$1 + local ok=1 + + local orig repl + orig=$(ip netns exec "$nsr1" nft reset counter inet filter routed_orig | grep packets) + repl=$(ip netns exec "$nsr1" nft reset counter inet filter routed_repl | grep packets) + + local orig_cnt=${orig#*bytes} + local repl_cnt=${repl#*bytes} + + local fs + fs=$(du -sb "$nsin") + local max_orig=${fs%%/*} + local max_repl=$((max_orig/4)) + + # flowtable fastpath should bypass normal routing one, i.e. 
the counters in forward hook + # should always be lower than the size of the transmitted file (max_orig). + if [ "$orig_cnt" -gt "$max_orig" ];then + echo "FAIL: $what: original counter $orig_cnt exceeds expected value $max_orig" 1>&2 + ret=1 + ok=0 + fi + + if [ "$repl_cnt" -gt $max_repl ];then + echo "FAIL: $what: reply counter $repl_cnt exceeds expected value $max_repl" 1>&2 + ret=1 + ok=0 + fi + + if [ $ok -eq 1 ]; then + echo "PASS: $what" + fi +} + +check_dscp() +{ + local what=$1 + local ok=1 + + local counter + counter=$(ip netns exec "$ns2" nft reset counter inet filter ip4dscp3 | grep packets) + + local pc4=${counter%*bytes*} + local pc4=${pc4#*packets} + + counter=$(ip netns exec "$ns2" nft reset counter inet filter ip4dscp0 | grep packets) + local pc4z=${counter%*bytes*} + local pc4z=${pc4z#*packets} + + case "$what" in + "dscp_none") + if [ "$pc4" -gt 0 ] || [ "$pc4z" -eq 0 ]; then + echo "FAIL: dscp counters do not match, expected dscp3 == 0, dscp0 > 0, but got $pc4,$pc4z" 1>&2 + ret=1 + ok=0 + fi + ;; + "dscp_fwd") + if [ "$pc4" -eq 0 ] || [ "$pc4z" -eq 0 ]; then + echo "FAIL: dscp counters do not match, expected dscp3 and dscp0 > 0 but got $pc4,$pc4z" 1>&2 + ret=1 + ok=0 + fi + ;; + "dscp_ingress") + if [ "$pc4" -eq 0 ] || [ "$pc4z" -gt 0 ]; then + echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2 + ret=1 + ok=0 + fi + ;; + "dscp_egress") + if [ "$pc4" -eq 0 ] || [ "$pc4z" -gt 0 ]; then + echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2 + ret=1 + ok=0 + fi + ;; + *) + echo "FAIL: Unknown DSCP check" 1>&2 + ret=1 + ok=0 + esac + + if [ "$ok" -eq 1 ] ;then + echo "PASS: $what: dscp packet counters match" + fi +} + +check_transfer() +{ + in=$1 + out=$2 + what=$3 + + if ! 
cmp "$in" "$out" > /dev/null 2>&1; then + echo "FAIL: file mismatch for $what" 1>&2 + ls -l "$in" + ls -l "$out" + return 1 + fi + + return 0 +} + +listener_ready() +{ + ss -N "$nsb" -lnt -o "sport = :12345" | grep -q 12345 +} + +test_tcp_forwarding_ip() +{ + local nsa=$1 + local nsb=$2 + local dstip=$3 + local dstport=$4 + local lret=0 + + timeout "$SOCAT_TIMEOUT" ip netns exec "$nsb" socat -4 TCP-LISTEN:12345,reuseaddr STDIO < "$nsin" > "$ns2out" & + lpid=$! + + busywait 1000 listener_ready + + timeout "$SOCAT_TIMEOUT" ip netns exec "$nsa" socat -4 TCP:"$dstip":"$dstport" STDIO < "$nsin" > "$ns1out" + + wait $lpid + + if ! check_transfer "$nsin" "$ns2out" "ns1 -> ns2"; then + lret=1 + ret=1 + fi + + if ! check_transfer "$nsin" "$ns1out" "ns1 <- ns2"; then + lret=1 + ret=1 + fi + + return $lret +} + +test_tcp_forwarding() +{ + test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345 + + return $? +} + +test_tcp_forwarding_set_dscp() +{ + check_dscp "dscp_none" + +ip netns exec "$nsr1" nft -f - <&2 + ip netns exec "$nsr1" nft list ruleset + ret=1 +fi + +# delete default route, i.e. ns2 won't be able to reach ns1 and +# will depend on ns1 being masqueraded in nsr1. +# expect ns1 has nsr1 address. +ip -net "$ns2" route del default via 10.0.2.1 +ip -net "$ns2" route del default via dead:2::1 +ip -net "$ns2" route add 192.168.10.1 via 10.0.2.1 + +# Second test: +# Same, but with NAT enabled. Same as in first test: we expect normal forward path +# to handle most packets. +ip netns exec "$nsr1" nft -f - <&2 + exit 0 +fi + +if ! test_tcp_forwarding_nat "$ns1" "$ns2" 0 ""; then + echo "FAIL: flow offload for ns1/ns2 with NAT" 1>&2 + ip netns exec "$nsr1" nft list ruleset + ret=1 +fi + +# Third test: +# Same as second test, but with PMTU discovery enabled. This +# means that we expect the fastpath to handle packets as soon +# as the endpoints adjust the packet size. 
+ip netns exec "$ns1" sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null +ip netns exec "$ns2" sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null + +# reset counters. +# With pmtu in-place we'll also check that nft counters +# are lower than file size and packets were forwarded via flowtable layer. +# For earlier tests (large mtus), packets cannot be handled via flowtable +# (except pure acks and other small packets). +ip netns exec "$nsr1" nft reset counters table inet filter >/dev/null + +if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 ""; then + echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2 + ip netns exec "$nsr1" nft list ruleset +fi + +# Another test: +# Add bridge interface br0 to Router1, with NAT enabled. +test_bridge() { +if ! ip -net "$nsr1" link add name br0 type bridge 2>/dev/null;then + echo "SKIP: could not add bridge br0" + [ "$ret" -eq 0 ] && ret=$ksft_skip + return +fi +ip -net "$nsr1" addr flush dev veth0 +ip -net "$nsr1" link set up dev veth0 +ip -net "$nsr1" link set veth0 master br0 +ip -net "$nsr1" addr add 10.0.1.1/24 dev br0 +ip -net "$nsr1" addr add dead:1::1/64 dev br0 nodad +ip -net "$nsr1" link set up dev br0 + +ip netns exec "$nsr1" sysctl net.ipv4.conf.br0.forwarding=1 > /dev/null + +# br0 with NAT enabled. +ip netns exec "$nsr1" nft -f - <&2 + ip netns exec "$nsr1" nft list ruleset + ret=1 +fi + + +# Another test: +# Add bridge interface br0 to Router1, with NAT and VLAN. 
+ip -net "$nsr1" link set veth0 nomaster +ip -net "$nsr1" link set down dev veth0 +ip -net "$nsr1" link add link veth0 name veth0.10 type vlan id 10 +ip -net "$nsr1" link set up dev veth0 +ip -net "$nsr1" link set up dev veth0.10 +ip -net "$nsr1" link set veth0.10 master br0 + +ip -net "$ns1" addr flush dev eth0 +ip -net "$ns1" link add link eth0 name eth0.10 type vlan id 10 +ip -net "$ns1" link set eth0 up +ip -net "$ns1" link set eth0.10 up +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0.10 +ip -net "$ns1" route add default via 10.0.1.1 +ip -net "$ns1" addr add dead:1::99/64 dev eth0.10 nodad + +if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "bridge and VLAN"; then + echo "FAIL: flow offload for ns1/ns2 with bridge NAT and VLAN" 1>&2 + ip netns exec "$nsr1" nft list ruleset + ret=1 +fi + +# restore test topology (remove bridge and VLAN) +ip -net "$nsr1" link set veth0 nomaster +ip -net "$nsr1" link set veth0 down +ip -net "$nsr1" link set veth0.10 down +ip -net "$nsr1" link delete veth0.10 type vlan +ip -net "$nsr1" link delete br0 type bridge +ip -net "$ns1" addr flush dev eth0.10 +ip -net "$ns1" link set eth0.10 down +ip -net "$ns1" link set eth0 down +ip -net "$ns1" link delete eth0.10 type vlan + +# restore address in ns1 and nsr1 +ip -net "$ns1" link set eth0 up +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 +ip -net "$ns1" route add default via 10.0.1.1 +ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad +ip -net "$ns1" route add default via dead:1::1 +ip -net "$nsr1" addr add 10.0.1.1/24 dev veth0 +ip -net "$nsr1" addr add dead:1::1/64 dev veth0 nodad +ip -net "$nsr1" link set up dev veth0 +} + +test_bridge + +KEY_SHA="0x"$(ps -af | sha1sum | cut -d " " -f 1) +KEY_AES="0x"$(ps -af | md5sum | cut -d " " -f 1) +SPI1=$RANDOM +SPI2=$RANDOM + +if [ $SPI1 -eq $SPI2 ]; then + SPI2=$((SPI2+1)) +fi + +do_esp() { + local ns=$1 + local me=$2 + local remote=$3 + local lnet=$4 + local rnet=$5 + local spi_out=$6 + local spi_in=$7 + + ip -net "$ns" xfrm state add src 
"$remote" dst "$me" proto esp spi "$spi_in" enc aes "$KEY_AES" auth sha1 "$KEY_SHA" mode tunnel sel src "$rnet" dst "$lnet" + ip -net "$ns" xfrm state add src "$me" dst "$remote" proto esp spi "$spi_out" enc aes "$KEY_AES" auth sha1 "$KEY_SHA" mode tunnel sel src "$lnet" dst "$rnet" + + # to encrypt packets as they go out (includes forwarded packets that need encapsulation) + ip -net "$ns" xfrm policy add src "$lnet" dst "$rnet" dir out tmpl src "$me" dst "$remote" proto esp mode tunnel priority 1 action allow + # to fwd decrypted packets after esp processing: + ip -net "$ns" xfrm policy add src "$rnet" dst "$lnet" dir fwd tmpl src "$remote" dst "$me" proto esp mode tunnel priority 1 action allow +} + +do_esp "$nsr1" 192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 "$SPI1" "$SPI2" + +do_esp "$nsr2" 192.168.10.2 192.168.10.1 10.0.2.0/24 10.0.1.0/24 "$SPI2" "$SPI1" + +ip netns exec "$nsr1" nft delete table ip nat + +# restore default routes +ip -net "$ns2" route del 192.168.10.1 via 10.0.2.1 +ip -net "$ns2" route add default via 10.0.2.1 +ip -net "$ns2" route add default via dead:2::1 + +if test_tcp_forwarding "$ns1" "$ns2"; then + check_counters "ipsec tunnel mode for ns1/ns2" +else + echo "FAIL: ipsec tunnel mode for ns1/ns2" + ip netns exec "$nsr1" nft list ruleset 1>&2 + ip netns exec "$nsr1" cat /proc/net/xfrm_stat 1>&2 +fi + +if [ "$1" = "" ]; then + low=1280 + mtu=$((65536 - low)) + o=$(((RANDOM%mtu) + low)) + l=$(((RANDOM%mtu) + low)) + r=$(((RANDOM%mtu) + low)) + + echo "re-run with random mtus: -o $o -l $l -r $r" + $0 -o "$o" -l "$l" -r "$r" +fi + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/nft_meta.sh b/tools/testing/selftests/net/netfilter/nft_meta.sh new file mode 100755 index 0000000000..71505b6cb2 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_meta.sh @@ -0,0 +1,142 @@ +#!/bin/bash + +# check iif/iifname/oifgroup/iiftype match. + +# Kselftest framework requirement - SKIP code is 4. 
+ksft_skip=4 +sfx=$(mktemp -u "XXXXXXXX") +ns0="ns0-$sfx" + +if ! nft --version > /dev/null 2>&1; then + echo "SKIP: Could not run test without nft tool" + exit $ksft_skip +fi + +cleanup() +{ + ip netns del "$ns0" +} + +ip netns add "$ns0" +ip -net "$ns0" link set lo up +ip -net "$ns0" addr add 127.0.0.1 dev lo + +trap cleanup EXIT + +currentyear=$(date +%Y) +lastyear=$((currentyear-1)) +ip netns exec "$ns0" nft -f /dev/stdin < /dev/null + +check_lo_counters "2" true + +check_one_counter oskuidcounter "1" true +check_one_counter oskgidcounter "1" true +check_one_counter imarkcounter "1" true +check_one_counter omarkcounter "1" true +check_one_counter ilastyearcounter "0" true + +if [ $ret -eq 0 ];then + echo "OK: nftables meta iif/oif counters at expected values" +else + exit $ret +fi + +#First CPU execution and counter +taskset -p 01 $$ > /dev/null +ip netns exec "$ns0" nft reset counters > /dev/null +ip netns exec "$ns0" ping -q -c 1 127.0.0.1 > /dev/null +check_one_counter icpu0counter "2" true + +if [ $ret -eq 0 ];then + echo "OK: nftables meta cpu counter at expected values" +fi + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/nft_nat.sh b/tools/testing/selftests/net/netfilter/nft_nat.sh new file mode 100755 index 0000000000..9e39de2645 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_nat.sh @@ -0,0 +1,1156 @@ +#!/bin/bash +# +# This test is for basic NAT functionality: snat, dnat, redirect, masquerade. +# + +source lib.sh + +ret=0 +test_inet_nat=true + +checktool "nft --version" "run test without nft tool" +checktool "socat -h" "run test without socat" + +cleanup() +{ + ip netns pids "$ns0" | xargs kill 2>/dev/null + ip netns pids "$ns1" | xargs kill 2>/dev/null + ip netns pids "$ns2" | xargs kill 2>/dev/null + + rm -f "$INFILE" "$OUTFILE" + + cleanup_all_ns +} + +trap cleanup EXIT + +INFILE=$(mktemp) +OUTFILE=$(mktemp) + +setup_ns ns0 ns1 ns2 + +if ! 
ip link add veth0 netns "$ns0" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1;then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip +fi +ip link add veth1 netns "$ns0" type veth peer name eth0 netns "$ns2" + +ip -net "$ns0" link set veth0 up +ip -net "$ns0" addr add 10.0.1.1/24 dev veth0 +ip -net "$ns0" addr add dead:1::1/64 dev veth0 nodad + +ip -net "$ns0" link set veth1 up +ip -net "$ns0" addr add 10.0.2.1/24 dev veth1 +ip -net "$ns0" addr add dead:2::1/64 dev veth1 nodad + +do_config() +{ + ns="$1" + subnet="$2" + + ip -net "$ns" link set eth0 up + ip -net "$ns" addr add "10.0.$subnet.99/24" dev eth0 + ip -net "$ns" route add default via "10.0.$subnet.1" + ip -net "$ns" addr add "dead:$subnet::99/64" dev eth0 nodad + ip -net "$ns" route add default via "dead:$subnet::1" +} + +do_config "$ns1" 1 +do_config "$ns2" 2 + +bad_counter() +{ + local ns=$1 + local counter=$2 + local expect=$3 + local tag=$4 + + echo "ERROR: $counter counter in $ns has unexpected value (expected $expect) at $tag" 1>&2 + ip netns exec "$ns" nft list counter inet filter "$counter" 1>&2 +} + +check_counters() +{ + ns=$1 + local lret=0 + + if ! ip netns exec "$ns" nft list counter inet filter ns0in | grep -q "packets 1 bytes 84";then + bad_counter "$ns" ns0in "packets 1 bytes 84" "check_counters 1" + lret=1 + fi + + if ! ip netns exec "$ns" nft list counter inet filter ns0out | grep -q "packets 1 bytes 84";then + bad_counter "$ns" ns0out "packets 1 bytes 84" "check_counters 2" + lret=1 + fi + + expect="packets 1 bytes 104" + if ! ip netns exec "$ns" nft list counter inet filter ns0in6 | grep -q "$expect";then + bad_counter "$ns" ns0in6 "$expect" "check_counters 3" + lret=1 + fi + if ! ip netns exec "$ns" nft list counter inet filter ns0out6 | grep -q "$expect";then + bad_counter "$ns" ns0out6 "$expect" "check_counters 4" + lret=1 + fi + + return $lret +} + +check_ns0_counters() +{ + local ns=$1 + local lret=0 + + if ! 
ip netns exec "$ns0" nft list counter inet filter ns0in | grep -q "packets 0 bytes 0";then + bad_counter "$ns0" ns0in "packets 0 bytes 0" "check_ns0_counters 1" + lret=1 + fi + + if ! ip netns exec "$ns0" nft list counter inet filter ns0in6 | grep -q "packets 0 bytes 0";then + bad_counter "$ns0" ns0in6 "packets 0 bytes 0" + lret=1 + fi + + if ! ip netns exec "$ns0" nft list counter inet filter ns0out | grep -q "packets 0 bytes 0";then + bad_counter "$ns0" ns0out "packets 0 bytes 0" "check_ns0_counters 2" + lret=1 + fi + if ! ip netns exec "$ns0" nft list counter inet filter ns0out6 | grep -q "packets 0 bytes 0";then + bad_counter "$ns0" ns0out6 "packets 0 bytes 0" "check_ns0_counters3 " + lret=1 + fi + + for dir in "in" "out" ; do + expect="packets 1 bytes 84" + if ! ip netns exec "$ns0" nft list counter inet filter "${ns}${dir}" | grep -q "$expect";then + bad_counter "$ns0" "$ns${dir}" "$expect" "check_ns0_counters 4" + lret=1 + fi + + expect="packets 1 bytes 104" + if ! ip netns exec "$ns0" nft list counter inet filter "${ns}${dir}6" | grep -q "$expect";then + bad_counter "$ns0" "$ns${dir}6" "$expect" "check_ns0_counters 5" + lret=1 + fi + done + + return $lret +} + +reset_counters() +{ + for i in "$ns0" "$ns1" "$ns2" ;do + ip netns exec "$i" nft reset counters inet > /dev/null + done +} + +test_local_dnat6() +{ + local family=$1 + local lret=0 + local IPF="" + + if [ "$family" = "inet" ];then + IPF="ip6" + fi + +ip netns exec "$ns0" nft -f /dev/stdin < /dev/null;then + lret=1 + echo "ERROR: ping6 failed" + return $lret + fi + + expect="packets 0 bytes 0" + for dir in "in6" "out6" ; do + if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then + bad_counter "$ns0" ns1$dir "$expect" "test_local_dnat6 1" + lret=1 + fi + done + + expect="packets 1 bytes 104" + for dir in "in6" "out6" ; do + if ! 
ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then + bad_counter "$ns0" ns2$dir "$expect" "test_local_dnat6 2" + lret=1 + fi + done + + # expect 0 count in ns1 + expect="packets 0 bytes 0" + for dir in "in6" "out6" ; do + if ! ip netns exec "$ns1" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then + bad_counter "$ns1" ns0$dir "$expect" "test_local_dnat6 3" + lret=1 + fi + done + + # expect 1 packet in ns2 + expect="packets 1 bytes 104" + for dir in "in6" "out6" ; do + if ! ip netns exec "$ns2" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then + bad_counter "$ns2" ns0$dir "$expect" "test_local_dnat6 4" + lret=1 + fi + done + + test $lret -eq 0 && echo "PASS: ipv6 ping to $ns1 was $family NATted to $ns2" + ip netns exec "$ns0" nft flush chain ip6 nat output + + return $lret +} + +test_local_dnat() +{ + local family=$1 + local lret=0 + local IPF="" + + if [ "$family" = "inet" ];then + IPF="ip" + fi + +ip netns exec "$ns0" nft -f /dev/stdin </dev/null +table $family nat { + chain output { + type nat hook output priority 0; policy accept; + ip daddr 10.0.1.99 dnat $IPF to 10.0.2.99 + } +} +EOF + if [ $? -ne 0 ]; then + if [ "$family" = "inet" ];then + echo "SKIP: inet nat tests" + test_inet_nat=false + return $ksft_skip + fi + echo "SKIP: Could not add add $family dnat hook" + return $ksft_skip + fi + + # ping netns1, expect rewrite to netns2 + if ! ip netns exec "$ns0" ping -q -c 1 10.0.1.99 > /dev/null;then + lret=1 + echo "ERROR: ping failed" + return $lret + fi + + expect="packets 0 bytes 0" + for dir in "in" "out" ; do + if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then + bad_counter "$ns0" "ns1$dir" "$expect" "test_local_dnat 1" + lret=1 + fi + done + + expect="packets 1 bytes 84" + for dir in "in" "out" ; do + if ! 
ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then + bad_counter "$ns0" "ns2$dir" "$expect" "test_local_dnat 2" + lret=1 + fi + done + + # expect 0 count in ns1 + expect="packets 0 bytes 0" + for dir in "in" "out" ; do + if ! ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect";then + bad_counter "$ns1" "ns0$dir" "$expect" "test_local_dnat 3" + lret=1 + fi + done + + # expect 1 packet in ns2 + expect="packets 1 bytes 84" + for dir in "in" "out" ; do + if ! ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect";then + bad_counter "$ns2" "ns0$dir" "$expect" "test_local_dnat 4" + lret=1 + fi + done + + test $lret -eq 0 && echo "PASS: ping to $ns1 was $family NATted to $ns2" + + ip netns exec "$ns0" nft flush chain "$family" nat output + + reset_counters + if ! ip netns exec "$ns0" ping -q -c 1 10.0.1.99 > /dev/null;then + lret=1 + echo "ERROR: ping failed" + return $lret + fi + + expect="packets 1 bytes 84" + for dir in "in" "out" ; do + if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then + bad_counter "$ns1" ns1$dir "$expect" "test_local_dnat 5" + lret=1 + fi + done + expect="packets 0 bytes 0" + for dir in "in" "out" ; do + if ! ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then + bad_counter "$ns0" ns2$dir "$expect" "test_local_dnat 6" + lret=1 + fi + done + + # expect 1 count in ns1 + expect="packets 1 bytes 84" + for dir in "in" "out" ; do + if ! ip netns exec "$ns1" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then + bad_counter "$ns0" ns0$dir "$expect" "test_local_dnat 7" + lret=1 + fi + done + + # expect 0 packet in ns2 + expect="packets 0 bytes 0" + for dir in "in" "out" ; do + if ! 
ip netns exec "$ns2" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then + bad_counter "$ns2" ns0$dir "$expect" "test_local_dnat 8" + lret=1 + fi + done + + test $lret -eq 0 && echo "PASS: ping to $ns1 OK after $family nat output chain flush" + + return $lret +} + +listener_ready() +{ + local ns="$1" + local port="$2" + local proto="$3" + ss -N "$ns" -ln "$proto" -o "sport = :$port" | grep -q "$port" +} + +test_local_dnat_portonly() +{ + local family=$1 + local daddr=$2 + local lret=0 + +ip netns exec "$ns0" nft -f /dev/stdin < /dev/null + + if ! ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then + echo "ERROR: cannot ping $ns1 from $ns2 via ipv6" + return 1 + fi + + expect="packets 1 bytes 104" + for dir in "in6" "out6" ; do + if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then + bad_counter "$ns1" "ns2$dir" "$expect" "test_masquerade6 1" + lret=1 + fi + + if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then + bad_counter "$ns2" "ns1$dir" "$expect" "test_masquerade6 2" + lret=1 + fi + done + + reset_counters + +# add masquerading rule +ip netns exec "$ns0" nft -f /dev/stdin < /dev/null;then + echo "ERROR: cannot ping $ns1 from $ns2 with active $family masquerade $natflags" + lret=1 + fi + + # ns1 should have seen packets from ns0, due to masquerade + expect="packets 1 bytes 104" + for dir in "in6" "out6" ; do + if ! ip netns exec "$ns1" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then + bad_counter "$ns1" ns0$dir "$expect" "test_masquerade6 3" + lret=1 + fi + + if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then + bad_counter "$ns2" ns1$dir "$expect" "test_masquerade6 4" + lret=1 + fi + done + + # ns1 should not have seen packets from ns2, due to masquerade + expect="packets 0 bytes 0" + for dir in "in6" "out6" ; do + if ! 
ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then + bad_counter "$ns1" ns0$dir "$expect" "test_masquerade6 5" + lret=1 + fi + + if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then + bad_counter "$ns0" "ns1$dir" "$expect" "test_masquerade6 6" + lret=1 + fi + done + + if ! ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then + echo "ERROR: cannot ping $ns1 from $ns2 with active ipv6 masquerade $natflags (attempt 2)" + lret=1 + fi + + if ! ip netns exec "$ns0" nft flush chain "$family" nat postrouting;then + echo "ERROR: Could not flush $family nat postrouting" 1>&2 + lret=1 + fi + + test $lret -eq 0 && echo "PASS: $family IPv6 masquerade $natflags for $ns2" + + return $lret +} + +test_masquerade() +{ + local family=$1 + local natflags=$2 + local lret=0 + + ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null + ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null + + if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then + echo "ERROR: cannot ping $ns1 from $ns2 $natflags" + lret=1 + fi + + expect="packets 1 bytes 84" + for dir in "in" "out" ; do + if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then + bad_counter "$ns1" "ns2$dir" "$expect" "test_masquerade 1" + lret=1 + fi + + if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then + bad_counter "$ns2" "ns1$dir" "$expect" "test_masquerade 2" + lret=1 + fi + done + + reset_counters + +# add masquerading rule +ip netns exec "$ns0" nft -f /dev/stdin < /dev/null;then + echo "ERROR: cannot ping $ns1 from $ns2 with active $family masquerade $natflags" + lret=1 + fi + + # ns1 should have seen packets from ns0, due to masquerade + expect="packets 1 bytes 84" + for dir in "in" "out" ; do + if ! 
ip netns exec "$ns1" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then + bad_counter "$ns1" "ns0$dir" "$expect" "test_masquerade 3" + lret=1 + fi + + if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then + bad_counter "$ns2" "ns1$dir" "$expect" "test_masquerade 4" + lret=1 + fi + done + + # ns1 should not have seen packets from ns2, due to masquerade + expect="packets 0 bytes 0" + for dir in "in" "out" ; do + if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then + bad_counter "$ns1" "ns0$dir" "$expect" "test_masquerade 5" + lret=1 + fi + + if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then + bad_counter "$ns0" "ns1$dir" "$expect" "test_masquerade 6" + lret=1 + fi + done + + if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then + echo "ERROR: cannot ping $ns1 from $ns2 with active ip masquerade $natflags (attempt 2)" + lret=1 + fi + + if ! ip netns exec "$ns0" nft flush chain "$family" nat postrouting; then + echo "ERROR: Could not flush $family nat postrouting" 1>&2 + lret=1 + fi + + test $lret -eq 0 && echo "PASS: $family IP masquerade $natflags for $ns2" + + return $lret +} + +test_redirect6() +{ + local family=$1 + local lret=0 + + ip netns exec "$ns0" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null + + if ! ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then + echo "ERROR: cannnot ping $ns1 from $ns2 via ipv6" + lret=1 + fi + + expect="packets 1 bytes 104" + for dir in "in6" "out6" ; do + if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then + bad_counter "$ns1" ns2$dir "$expect" "test_redirect6 1" + lret=1 + fi + + if ! 
ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then + bad_counter "$ns2" ns1$dir "$expect" "test_redirect6 2" + lret=1 + fi + done + + reset_counters + +# add redirect rule +ip netns exec "$ns0" nft -f /dev/stdin < /dev/null;then + echo "ERROR: cannot ping $ns1 from $ns2 via ipv6 with active $family redirect" + lret=1 + fi + + # ns1 should have seen no packets from ns2, due to redirection + expect="packets 0 bytes 0" + for dir in "in6" "out6" ; do + if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then + bad_counter "$ns1" ns0$dir "$expect" "test_redirect6 3" + lret=1 + fi + done + + # ns0 should have seen packets from ns2, due to masquerade + expect="packets 1 bytes 104" + for dir in "in6" "out6" ; do + if ! ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then + bad_counter "$ns1" ns0$dir "$expect" "test_redirect6 4" + lret=1 + fi + done + + if ! ip netns exec "$ns0" nft delete table "$family" nat;then + echo "ERROR: Could not delete $family nat table" 1>&2 + lret=1 + fi + + test $lret -eq 0 && echo "PASS: $family IPv6 redirection for $ns2" + + return $lret +} + +test_redirect() +{ + local family=$1 + local lret=0 + + ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null + ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null + + if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then + echo "ERROR: cannot ping $ns1 from $ns2" + lret=1 + fi + + expect="packets 1 bytes 84" + for dir in "in" "out" ; do + if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then + bad_counter "$ns1" "$ns2$dir" "$expect" "test_redirect 1" + lret=1 + fi + + if ! 
ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect";then + bad_counter "$ns2" ns1$dir "$expect" "test_redirect 2" + lret=1 + fi + done + + reset_counters + +# add redirect rule +ip netns exec "$ns0" nft -f /dev/stdin < /dev/null;then + echo "ERROR: cannot ping $ns1 from $ns2 with active $family ip redirect" + lret=1 + fi + + # ns1 should have seen no packets from ns2, due to redirection + expect="packets 0 bytes 0" + for dir in "in" "out" ; do + + if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then + bad_counter "$ns1" ns0$dir "$expect" "test_redirect 3" + lret=1 + fi + done + + # ns0 should have seen packets from ns2, due to masquerade + expect="packets 1 bytes 84" + for dir in "in" "out" ; do + if ! ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then + bad_counter "$ns0" ns0$dir "$expect" "test_redirect 4" + lret=1 + fi + done + + if ! ip netns exec "$ns0" nft delete table "$family" nat;then + echo "ERROR: Could not delete $family nat table" 1>&2 + lret=1 + fi + + test $lret -eq 0 && echo "PASS: $family IP redirection for $ns2" + + return $lret +} + +# test port shadowing. +# create two listening services, one on router (ns0), one +# on client (ns2), which is masqueraded from ns1 point of view. +# ns2 sends udp packet coming from service port to ns1, on a highport. +# Later, if n1 uses same highport to connect to ns0:service, packet +# might be port-forwarded to ns2 instead. + +# second argument tells if we expect the 'fake-entry' to take effect +# (CLIENT) or not (ROUTER). +test_port_shadow() +{ + local test=$1 + local expect=$2 + local daddrc="10.0.1.99" + local daddrs="10.0.1.1" + local result="" + local logmsg="" + + # make shadow entry, from client (ns2), going to (ns1), port 41404, sport 1405. 
+ echo "fake-entry" | ip netns exec "$ns2" timeout 1 socat -u STDIN UDP:"$daddrc":41404,sourceport=1405 + + echo ROUTER | ip netns exec "$ns0" timeout 3 socat -T 3 -u STDIN UDP4-LISTEN:1405 2>/dev/null & + local sc_r=$! + echo CLIENT | ip netns exec "$ns2" timeout 3 socat -T 3 -u STDIN UDP4-LISTEN:1405,reuseport 2>/dev/null & + local sc_c=$! + + busywait $BUSYWAIT_TIMEOUT listener_ready "$ns0" 1405 "-u" + busywait $BUSYWAIT_TIMEOUT listener_ready "$ns2" 1405 "-u" + + # ns1 tries to connect to ns0:1405. With default settings this should connect + # to client, it matches the conntrack entry created above. + + result=$(echo "data" | ip netns exec "$ns1" timeout 1 socat - UDP:"$daddrs":1405,sourceport=41404) + + if [ "$result" = "$expect" ] ;then + echo "PASS: portshadow test $test: got reply from ${expect}${logmsg}" + else + echo "ERROR: portshadow test $test: got reply from \"$result\", not $expect as intended" + ret=1 + fi + + kill $sc_r $sc_c 2>/dev/null + + # flush udp entries for next test round, if any + ip netns exec "$ns0" conntrack -F >/dev/null 2>&1 +} + +# This prevents port shadow of router service via packet filter, +# packets claiming to originate from service port from internal +# network are dropped. +test_port_shadow_filter() +{ + local family=$1 + +ip netns exec "$ns0" nft -f /dev/stdin </dev/null 2>&1;then + echo "SKIP: Could not run nat port shadowing test without conntrack tool" + return + fi + + if ! socat -h > /dev/null 2>&1;then + echo "SKIP: Could not run nat port shadowing test without socat tool" + return + fi + + ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null + ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null + + ip netns exec "$ns0" nft -f /dev/stdin < /dev/null + ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null + + if ! 
ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then + echo "ERROR: cannot ping $ns1 from $ns2 before loading stateless rules" + return 1 + fi + +ip netns exec "$ns0" nft -f /dev/stdin < /dev/null; then + echo "ERROR: cannot ping $ns1 from $ns2 with stateless rules" + lret=1 + fi + + # ns1 should have seen packets from .2.2, due to stateless rewrite. + expect="packets 1 bytes 84" + if ! ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect";then + bad_counter "$ns1" ns0insl "$expect" "test_stateless 1" + lret=1 + fi + + for dir in "in" "out" ; do + if ! ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect";then + bad_counter "$ns2" ns1$dir "$expect" "test_stateless 2" + lret=1 + fi + done + + # ns1 should not have seen packets from ns2, due to masquerade + expect="packets 0 bytes 0" + for dir in "in" "out" ; do + if ! ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect";then + bad_counter "$ns1" ns0$dir "$expect" "test_stateless 3" + lret=1 + fi + + if ! ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect";then + bad_counter "$ns0" ns1$dir "$expect" "test_stateless 4" + lret=1 + fi + done + + reset_counters + + if ! socat -h > /dev/null 2>&1;then + echo "SKIP: Could not run stateless nat frag test without socat tool" + if [ $lret -eq 0 ]; then + return $ksft_skip + fi + + ip netns exec "$ns0" nft delete table ip stateless + return $lret + fi + + dd if=/dev/urandom of="$INFILE" bs=4096 count=1 2>/dev/null + + ip netns exec "$ns1" timeout 3 socat -u UDP4-RECV:4233 OPEN:"$OUTFILE" < /dev/null 2>/dev/null & + + busywait $BUSYWAIT_TIMEOUT listener_ready "$ns1" 4233 "-u" + + # re-do with large ping -> ip fragmentation + if ! ip netns exec "$ns2" timeout 3 socat -u STDIN UDP4-SENDTO:"10.0.1.99:4233" < "$INFILE" > /dev/null;then + echo "ERROR: failed to test udp $ns1 to $ns2 with stateless ip nat" 1>&2 + lret=1 + fi + + wait + + if ! 
cmp "$INFILE" "$OUTFILE";then + ls -l "$INFILE" "$OUTFILE" + echo "ERROR: in and output file mismatch when checking udp with stateless nat" 1>&2 + lret=1 + fi + + :> "$OUTFILE" + + # ns1 should have seen packets from 2.2, due to stateless rewrite. + expect="packets 3 bytes 4164" + if ! ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect";then + bad_counter "$ns1" ns0insl "$expect" "test_stateless 5" + lret=1 + fi + + if ! ip netns exec "$ns0" nft delete table ip stateless; then + echo "ERROR: Could not delete table ip stateless" 1>&2 + lret=1 + fi + + test $lret -eq 0 && echo "PASS: IP statless for $ns2" + + return $lret +} + +# ip netns exec "$ns0" ping -c 1 -q 10.0.$i.99 +for i in "$ns0" "$ns1" "$ns2" ;do +ip netns exec "$i" nft -f /dev/stdin < /dev/null;then + echo "ERROR: Could not reach other namespace(s)" 1>&2 + ret=1 + fi + + if ! ip netns exec "$ns0" ping -c 1 -q dead:"$i"::99 > /dev/null;then + echo "ERROR: Could not reach other namespace(s) via ipv6" 1>&2 + ret=1 + fi +} + +test_basic_conn() +{ + local nsexec + name="$1" + + nsexec=$(eval echo \$"$1") + + ping_basic 1 + ping_basic 2 + + if ! check_counters "$nsexec";then + return 1 + fi + + if ! check_ns0_counters "$name";then + return 1 + fi + + reset_counters + return 0 +} + +if ! test_basic_conn "ns1" ; then + echo "ERROR: basic test for ns1 failed" 1>&2 + exit 1 +fi +if ! 
test_basic_conn "ns2"; then + echo "ERROR: basic test for ns1 failed" 1>&2 +fi + +if [ $ret -eq 0 ];then + echo "PASS: netns routing/connectivity: $ns0 can reach $ns1 and $ns2" +fi + +reset_counters +test_local_dnat ip +test_local_dnat6 ip6 + +reset_counters +test_local_dnat_portonly inet 10.0.1.99 + +reset_counters +$test_inet_nat && test_local_dnat inet +$test_inet_nat && test_local_dnat6 inet + +for flags in "" "fully-random"; do +reset_counters +test_masquerade ip $flags +test_masquerade6 ip6 $flags +reset_counters +$test_inet_nat && test_masquerade inet $flags +$test_inet_nat && test_masquerade6 inet $flags +done + +reset_counters +test_redirect ip +test_redirect6 ip6 +reset_counters +$test_inet_nat && test_redirect inet +$test_inet_nat && test_redirect6 inet + +test_port_shadowing +test_stateless_nat_ip + +if [ $ret -ne 0 ];then + echo -n "FAIL: " + nft --version +fi + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/nft_nat_zones.sh b/tools/testing/selftests/net/netfilter/nft_nat_zones.sh new file mode 100755 index 0000000000..3b81d88bdd --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_nat_zones.sh @@ -0,0 +1,267 @@ +#!/bin/bash +# +# Test connection tracking zone and NAT source port reallocation support. +# + +source lib.sh + +# Don't increase too much, 2000 clients should work +# just fine but script can then take several minutes with +# KASAN/debug builds. +maxclients=100 + +have_socat=0 +ret=0 + +[ "$KSFT_MACHINE_SLOW" = yes ] && maxclients=40 +# client1---. +# veth1-. +# | +# NAT Gateway --veth0--> Server +# | | +# veth2-' | +# client2---' | +# .... | +# clientX----vethX---' + +# All clients share identical IP address. +# NAT Gateway uses policy routing and conntrack zones to isolate client +# namespaces. Each client connects to Server, each with colliding tuples: +# clientsaddr:10000 -> serveraddr:dport +# NAT Gateway is supposed to do port reallocation for each of the +# connections. 
+ +v4gc1=$(sysctl -n net.ipv4.neigh.default.gc_thresh1 2>/dev/null) +v4gc2=$(sysctl -n net.ipv4.neigh.default.gc_thresh2 2>/dev/null) +v4gc3=$(sysctl -n net.ipv4.neigh.default.gc_thresh3 2>/dev/null) +v6gc1=$(sysctl -n net.ipv6.neigh.default.gc_thresh1 2>/dev/null) +v6gc2=$(sysctl -n net.ipv6.neigh.default.gc_thresh2 2>/dev/null) +v6gc3=$(sysctl -n net.ipv6.neigh.default.gc_thresh3 2>/dev/null) + +cleanup() +{ + cleanup_all_ns + + sysctl -q net.ipv4.neigh.default.gc_thresh1="$v4gc1" 2>/dev/null + sysctl -q net.ipv4.neigh.default.gc_thresh2="$v4gc2" 2>/dev/null + sysctl -q net.ipv4.neigh.default.gc_thresh3="$v4gc3" 2>/dev/null + sysctl -q net.ipv6.neigh.default.gc_thresh1="$v6gc1" 2>/dev/null + sysctl -q net.ipv6.neigh.default.gc_thresh2="$v6gc2" 2>/dev/null + sysctl -q net.ipv6.neigh.default.gc_thresh3="$v6gc3" 2>/dev/null +} + +checktool "nft --version" echo "run test without nft tool" +checktool "conntrack -V" "run test without conntrack tool" + +if socat -h >/dev/null 2>&1; then + have_socat=1 +fi + +setup_ns gw srv + +trap cleanup EXIT + +ip link add veth0 netns "$gw" type veth peer name eth0 netns "$srv" +ip -net "$gw" link set veth0 up +ip -net "$srv" link set eth0 up + +sysctl -q net.ipv6.neigh.default.gc_thresh1=512 2>/dev/null +sysctl -q net.ipv6.neigh.default.gc_thresh2=1024 2>/dev/null +sysctl -q net.ipv6.neigh.default.gc_thresh3=4096 2>/dev/null +sysctl -q net.ipv4.neigh.default.gc_thresh1=512 2>/dev/null +sysctl -q net.ipv4.neigh.default.gc_thresh2=1024 2>/dev/null +sysctl -q net.ipv4.neigh.default.gc_thresh3=4096 2>/dev/null + +for i in $(seq 1 "$maxclients");do + setup_ns "cl$i" + + cl=$(eval echo \$cl"$i") + if ! 
ip link add veth"$i" netns "$gw" type veth peer name eth0 netns "$cl" > /dev/null 2>&1;then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip + fi +done + +for i in $(seq 1 "$maxclients");do + cl=$(eval echo \$cl"$i") + echo netns exec "$cl" ip link set eth0 up + echo netns exec "$cl" sysctl -q net.ipv4.tcp_syn_retries=2 + echo netns exec "$gw" ip link set "veth$i" up + echo netns exec "$gw" sysctl -q net.ipv4.conf.veth"$i".arp_ignore=2 + echo netns exec "$gw" sysctl -q net.ipv4.conf.veth"$i".rp_filter=0 + + # clients have same IP addresses. + echo netns exec "$cl" ip addr add 10.1.0.3/24 dev eth0 + echo netns exec "$cl" ip addr add dead:1::3/64 dev eth0 nodad + echo netns exec "$cl" ip route add default via 10.1.0.2 dev eth0 + echo netns exec "$cl" ip route add default via dead:1::2 dev eth0 + + # NB: same addresses on client-facing interfaces. + echo netns exec "$gw" ip addr add 10.1.0.2/24 dev "veth$i" + echo netns exec "$gw" ip addr add dead:1::2/64 dev "veth$i" nodad + + # gw: policy routing + echo netns exec "$gw" ip route add 10.1.0.0/24 dev "veth$i" table $((1000+i)) + echo netns exec "$gw" ip route add dead:1::0/64 dev "veth$i" table $((1000+i)) + echo netns exec "$gw" ip route add 10.3.0.0/24 dev veth0 table $((1000+i)) + echo netns exec "$gw" ip route add dead:3::0/64 dev veth0 table $((1000+i)) + echo netns exec "$gw" ip rule add fwmark "$i" lookup $((1000+i)) +done | ip -batch /dev/stdin + +ip -net "$gw" addr add 10.3.0.1/24 dev veth0 +ip -net "$gw" addr add dead:3::1/64 dev veth0 nodad + +ip -net "$srv" addr add 10.3.0.99/24 dev eth0 +ip -net "$srv" addr add dead:3::99/64 dev eth0 nodad + +ip netns exec "$gw" nft -f /dev/stdin< /dev/null +ip netns exec "$gw" sysctl -q net.ipv6.conf.all.forwarding=1 > /dev/null +ip netns exec "$gw" sysctl -q net.ipv4.conf.all.rp_filter=0 >/dev/null + +# useful for debugging: allows to use 'ping' from clients to gateway. 
+ip netns exec "$gw" sysctl -q net.ipv4.fwmark_reflect=1 > /dev/null +ip netns exec "$gw" sysctl -q net.ipv6.fwmark_reflect=1 > /dev/null + +for i in $(seq 1 "$maxclients"); do + cl=$(eval echo \$cl"$i") + ip netns exec "$cl" ping -i 0.5 -q -c 3 10.3.0.99 > /dev/null 2>&1 & +done + +wait || ret=1 + +[ "$ret" -ne 0 ] && "FAIL: Ping failure from $cl" 1>&2 + +for i in $(seq 1 "$maxclients"); do + if ! ip netns exec "$gw" nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" | grep -q "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 counter packets 3 bytes 252 }"; then + ret=1 + echo "FAIL: counter icmp mismatch for veth$i" 1>&2 + ip netns exec "$gw" nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" 1>&2 + break + fi +done + +if ! ip netns exec "$gw" nft get element inet raw inicmp "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 }" | grep -q "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * maxclients)) bytes $((252 * maxclients)) }"; then + ret=1 + echo "FAIL: counter icmp mismatch for veth0: { 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * maxclients)) bytes $((252 * maxclients)) }" + ip netns exec "$gw" nft get element inet raw inicmp "{ 10.3.99 . \"veth0\" . 10.3.0.1 }" 1>&2 +fi + +if [ $ret -eq 0 ]; then + echo "PASS: ping test from all $maxclients namespaces" +fi + +if [ $have_socat -eq 0 ];then + echo "SKIP: socat not installed" + if [ $ret -ne 0 ];then + exit $ret + fi + exit $ksft_skip +fi + +listener_ready() +{ + ss -N "$1" -lnt -o "sport = :5201" | grep -q 5201 +} + +ip netns exec "$srv" socat -u TCP-LISTEN:5201,fork STDOUT > /dev/null 2>/dev/null & +socatpid=$! + +busywait 1000 listener_ready "$srv" + +for i in $(seq 1 "$maxclients"); do + if [ $ret -ne 0 ]; then + break + fi + cl=$(eval echo \$cl"$i") + if ! 
ip netns exec "$cl" socat -4 -u STDIN TCP:10.3.0.99:5201,sourceport=10000 < /dev/null > /dev/null; then + echo "FAIL: Failure to connect for $cl" 1>&2 + ip netns exec "$gw" conntrack -S 1>&2 + ret=1 + fi +done +if [ $ret -eq 0 ];then + echo "PASS: socat connections for all $maxclients net namespaces" +fi + +kill $socatpid +wait + +for i in $(seq 1 "$maxclients"); do + if ! ip netns exec "$gw" nft get element inet raw inflows "{ 10.1.0.3 . 10000 . \"veth$i\" . 10.3.0.99 . 5201 }" > /dev/null;then + ret=1 + echo "FAIL: can't find expected tcp entry for veth$i" 1>&2 + break + fi +done +if [ $ret -eq 0 ];then + echo "PASS: Found client connection for all $maxclients net namespaces" +fi + +if ! ip netns exec "$gw" nft get element inet raw inflows "{ 10.3.0.99 . 5201 . \"veth0\" . 10.3.0.1 . 10000 }" > /dev/null;then + ret=1 + echo "FAIL: cannot find return entry on veth0" 1>&2 +fi + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/nft_queue.sh b/tools/testing/selftests/net/netfilter/nft_queue.sh new file mode 100755 index 0000000000..8538f08c64 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_queue.sh @@ -0,0 +1,417 @@ +#!/bin/bash +# +# This tests nf_queue: +# 1. can process packets from all hooks +# 2. support running nfqueue from more than one base chain +# +# shellcheck disable=SC2162,SC2317 + +source lib.sh +ret=0 +timeout=2 + +cleanup() +{ + ip netns pids "$ns1" | xargs kill 2>/dev/null + ip netns pids "$ns2" | xargs kill 2>/dev/null + ip netns pids "$nsrouter" | xargs kill 2>/dev/null + + cleanup_all_ns + + rm -f "$TMPINPUT" + rm -f "$TMPFILE0" + rm -f "$TMPFILE1" + rm -f "$TMPFILE2" "$TMPFILE3" +} + +checktool "nft --version" "test without nft tool" + +trap cleanup EXIT + +setup_ns ns1 ns2 nsrouter + +TMPFILE0=$(mktemp) +TMPFILE1=$(mktemp) +TMPFILE2=$(mktemp) +TMPFILE3=$(mktemp) + +TMPINPUT=$(mktemp) +dd conv=sparse status=none if=/dev/zero bs=1M count=200 of="$TMPINPUT" + +if ! 
ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1; then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip +fi +ip link add veth1 netns "$nsrouter" type veth peer name eth0 netns "$ns2" + +ip -net "$nsrouter" link set veth0 up +ip -net "$nsrouter" addr add 10.0.1.1/24 dev veth0 +ip -net "$nsrouter" addr add dead:1::1/64 dev veth0 nodad + +ip -net "$nsrouter" link set veth1 up +ip -net "$nsrouter" addr add 10.0.2.1/24 dev veth1 +ip -net "$nsrouter" addr add dead:2::1/64 dev veth1 nodad + +ip -net "$ns1" link set eth0 up +ip -net "$ns2" link set eth0 up + +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 +ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad +ip -net "$ns1" route add default via 10.0.1.1 +ip -net "$ns1" route add default via dead:1::1 + +ip -net "$ns2" addr add 10.0.2.99/24 dev eth0 +ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad +ip -net "$ns2" route add default via 10.0.2.1 +ip -net "$ns2" route add default via dead:2::1 + +load_ruleset() { + local name=$1 + local prio=$2 + +ip netns exec "$nsrouter" nft -f /dev/stdin < /dev/null; then + return 1 + fi + + if ! ip netns exec "$ns1" ping -c 1 -q dead:2::99 > /dev/null; then + return 2 + fi + + return 0 +} + +test_ping_router() { + if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.1 > /dev/null; then + return 3 + fi + + if ! ip netns exec "$ns1" ping -c 1 -q dead:2::1 > /dev/null; then + return 4 + fi + + return 0 +} + +test_queue_blackhole() { + local proto=$1 + +ip netns exec "$nsrouter" nft -f /dev/stdin < /dev/null + lret=$? + elif [ "$proto" = "ip6" ]; then + ip netns exec "$ns1" ping -W 2 -c 1 -q dead:2::99 > /dev/null + lret=$? + else + lret=111 + fi + + # queue without bypass keyword should drop traffic if no listener exists. + if [ "$lret" -eq 0 ];then + echo "FAIL: $proto expected failure, got $lret" 1>&2 + exit 1 + fi + + if ! 
ip netns exec "$nsrouter" nft delete table "$proto" blackh; then + echo "FAIL: $proto: Could not delete blackh table" + exit 1 + fi + + echo "PASS: $proto: statement with no listener results in packet drop" +} + +nf_queue_wait() +{ + local procfile="/proc/self/net/netfilter/nfnetlink_queue" + local netns id + + netns="$1" + id="$2" + + # if this file doesn't exist, nfnetlink_module isn't loaded. + # rather than loading it ourselves, wait for kernel module autoload + # completion, nfnetlink should do so automatically because nf_queue + # helper program, spawned in the background, asked for this functionality. + test -f "$procfile" && + ip netns exec "$netns" cat "$procfile" | grep -q "^ *$id " +} + +test_queue() +{ + local expected="$1" + local last="" + + # spawn nf_queue listeners + ip netns exec "$nsrouter" ./nf_queue -c -q 0 -t $timeout > "$TMPFILE0" & + ip netns exec "$nsrouter" ./nf_queue -c -q 1 -t $timeout > "$TMPFILE1" & + + busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 0 + busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 1 + + if ! test_ping;then + echo "FAIL: netns routing/connectivity with active listener on queues 0 and 1: $ret" 1>&2 + exit $ret + fi + + if ! test_ping_router;then + echo "FAIL: netns router unreachable listener on queue 0 and 1: $ret" 1>&2 + exit $ret + fi + + wait + ret=$? + + for file in $TMPFILE0 $TMPFILE1; do + last=$(tail -n1 "$file") + if [ x"$last" != x"$expected packets total" ]; then + echo "FAIL: Expected $expected packets total, but got $last" 1>&2 + ip netns exec "$nsrouter" nft list ruleset + exit 1 + fi + done + + echo "PASS: Expected and received $last" +} + +listener_ready() +{ + ss -N "$1" -lnt -o "sport = :12345" | grep -q 12345 +} + +test_tcp_forward() +{ + ip netns exec "$nsrouter" ./nf_queue -q 2 -t "$timeout" & + local nfqpid=$! + + timeout 5 ip netns exec "$ns2" socat -u TCP-LISTEN:12345 STDOUT >/dev/null & + local rpid=$! 
+ + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns2" + + ip netns exec "$ns1" socat -u STDIN TCP:10.0.2.99:12345 <"$TMPINPUT" >/dev/null + + wait "$rpid" && echo "PASS: tcp and nfqueue in forward chain" +} + +test_tcp_localhost() +{ + dd conv=sparse status=none if=/dev/zero bs=1M count=200 of="$TMPINPUT" + timeout 5 ip netns exec "$nsrouter" socat -u TCP-LISTEN:12345 STDOUT >/dev/null & + local rpid=$! + + ip netns exec "$nsrouter" ./nf_queue -q 3 -t "$timeout" & + local nfqpid=$! + + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$nsrouter" + + ip netns exec "$nsrouter" socat -u STDIN TCP:127.0.0.1:12345 <"$TMPINPUT" >/dev/null + + wait "$rpid" && echo "PASS: tcp via loopback" + wait 2>/dev/null +} + +test_tcp_localhost_connectclose() +{ + ip netns exec "$nsrouter" ./connect_close -p 23456 -t "$timeout" & + ip netns exec "$nsrouter" ./nf_queue -q 3 -t "$timeout" & + + busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 3 + + wait && echo "PASS: tcp via loopback with connect/close" + wait 2>/dev/null +} + +test_tcp_localhost_requeue() +{ +ip netns exec "$nsrouter" nft -f /dev/stdin </dev/null & + local rpid=$! + + ip netns exec "$nsrouter" ./nf_queue -c -q 1 -t "$timeout" > "$TMPFILE2" & + + # nfqueue 1 will be called via output hook. But this time, + # re-queue the packet to nfqueue program on queue 2. + ip netns exec "$nsrouter" ./nf_queue -G -d 150 -c -q 0 -Q 1 -t "$timeout" > "$TMPFILE3" & + + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$nsrouter" + ip netns exec "$nsrouter" socat -u STDIN TCP:127.0.0.1:12345 <"$TMPINPUT" > /dev/null + + wait + + if ! diff -u "$TMPFILE2" "$TMPFILE3" ; then + echo "FAIL: lost packets during requeue?!" 1>&2 + return + fi + + echo "PASS: tcp via loopback and re-queueing" +} + +test_icmp_vrf() { + if ! 
ip -net "$ns1" link add tvrf type vrf table 9876;then + echo "SKIP: Could not add vrf device" + return + fi + + ip -net "$ns1" li set eth0 master tvrf + ip -net "$ns1" li set tvrf up + + ip -net "$ns1" route add 10.0.2.0/24 via 10.0.1.1 dev eth0 table 9876 +ip netns exec "$ns1" nft -f /dev/stdin < /dev/null + + for n in output post; do + for d in tvrf eth0; do + if ! ip netns exec "$ns1" nft list chain inet filter "$n" | grep -q "oifname \"$d\" icmp type echo-request counter packets 1"; then + echo "FAIL: chain $n: icmp packet counter mismatch for device $d" 1>&2 + ip netns exec "$ns1" nft list ruleset + ret=1 + return + fi + done + done + + wait "$nfqpid" && echo "PASS: icmp+nfqueue via vrf" + wait 2>/dev/null +} + +ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null + +load_ruleset "filter" 0 + +if test_ping; then + # queue bypass works (rules were skipped, no listener) + echo "PASS: ${ns1} can reach ${ns2}" +else + echo "FAIL: ${ns1} cannot reach ${ns2}: $ret" 1>&2 + exit $ret +fi + +test_queue_blackhole ip +test_queue_blackhole ip6 + +# dummy ruleset to add base chains between the +# queueing rules. We don't want the second reinject +# to re-execute the old hooks. +load_counter_ruleset 10 + +# we are hooking all: prerouting/input/forward/output/postrouting. +# we ping ${ns2} from ${ns1} via ${nsrouter} using ipv4 and ipv6, so: +# 1x icmp prerouting,forward,postrouting -> 3 queue events (6 incl. reply). +# 1x icmp prerouting,input,output postrouting -> 4 queue events incl. reply. +# so we expect that userspace program receives 10 packets. +test_queue 10 + +# same. We queue to a second program as well. 
+load_ruleset "filter2" 20 +test_queue 20 + +test_tcp_forward +test_tcp_localhost +test_tcp_localhost_connectclose +test_tcp_localhost_requeue +test_icmp_vrf + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/nft_synproxy.sh b/tools/testing/selftests/net/netfilter/nft_synproxy.sh new file mode 100755 index 0000000000..293f667a6a --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_synproxy.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source lib.sh + +ret=0 + +checktool "nft --version" "run test without nft tool" +checktool "iperf3 --version" "run test without iperf3" + +setup_ns nsr ns1 ns2 + +modprobe -q nf_conntrack + +cleanup() { + ip netns pids "$ns1" | xargs kill 2>/dev/null + ip netns pids "$ns2" | xargs kill 2>/dev/null + + cleanup_all_ns +} + +trap cleanup EXIT + +ip link add veth0 netns "$nsr" type veth peer name eth0 netns "$ns1" +ip link add veth1 netns "$nsr" type veth peer name eth0 netns "$ns2" + +for dev in veth0 veth1; do + ip -net "$nsr" link set "$dev" up +done + +ip -net "$nsr" addr add 10.0.1.1/24 dev veth0 +ip -net "$nsr" addr add 10.0.2.1/24 dev veth1 + +ip netns exec "$nsr" sysctl -q net.ipv4.conf.veth0.forwarding=1 +ip netns exec "$nsr" sysctl -q net.ipv4.conf.veth1.forwarding=1 +ip netns exec "$nsr" sysctl -q net.netfilter.nf_conntrack_tcp_loose=0 + +for n in $ns1 $ns2; do + ip -net "$n" link set eth0 up +done +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 +ip -net "$ns2" addr add 10.0.2.99/24 dev eth0 +ip -net "$ns1" route add default via 10.0.1.1 +ip -net "$ns2" route add default via 10.0.2.1 + +# test basic connectivity +if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.99 > /dev/null; then + echo "ERROR: $ns1 cannot reach $ns2" 1>&2 + exit 1 +fi + +if ! 
ip netns exec "$ns2" ping -c 1 -q 10.0.1.99 > /dev/null; then + echo "ERROR: $ns2 cannot reach $ns1" 1>&2 + exit 1 +fi + +ip netns exec "$ns2" iperf3 -s > /dev/null 2>&1 & +# ip netns exec $nsr tcpdump -vvv -n -i veth1 tcp | head -n 10 & + +sleep 1 + +ip netns exec "$nsr" nft -f - < /dev/null; then + echo "FAIL: iperf3 returned an error" 1>&2 + ret=1 + ip netns exec "$nsr" nft list ruleset +else + echo "PASS: synproxy connection successful" +fi + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/nft_zones_many.sh b/tools/testing/selftests/net/netfilter/nft_zones_many.sh new file mode 100755 index 0000000000..7db9982ba5 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_zones_many.sh @@ -0,0 +1,164 @@ +#!/bin/bash + +# Test insertion speed for packets with identical addresses/ports +# that are all placed in distinct conntrack zones. + +source lib.sh + +zones=2000 +[ "$KSFT_MACHINE_SLOW" = yes ] && zones=500 + +have_ct_tool=0 +ret=0 + +cleanup() +{ + cleanup_all_ns +} + +checktool "nft --version" "run test without nft tool" +checktool "socat -V" "run test without socat tool" + +setup_ns ns1 + +trap cleanup EXIT + +if conntrack -V > /dev/null 2>&1; then + have_ct_tool=1 +fi + +test_zones() { + local max_zones=$1 + +ip netns exec "$ns1" nft -f /dev/stdin</dev/null | ip netns exec "$ns1" socat -u STDIN UDP:127.0.0.1:12345,sourceport=12345 + if [ $? 
-ne 0 ] ;then + ret=1 + break + fi + + stop=$(date +%s%3N) + local duration=$((stop-start)) + echo "PASS: added 1000 entries in $duration ms (now $i total, loop $j)" + done + + if [ "$have_ct_tool" -eq 1 ]; then + local count duration + count=$(ip netns exec "$ns1" conntrack -C) + duration=$((stop-outerstart)) + + if [ "$count" -ge "$max_zones" ]; then + echo "PASS: inserted $count entries from packet path in $duration ms total" + else + ip netns exec "$ns1" conntrack -S 1>&2 + echo "FAIL: inserted $count entries from packet path in $duration ms total, expected $max_zones entries" + ret=1 + fi + fi + + if [ $ret -ne 0 ];then + echo "FAIL: insert $max_zones entries from packet path" 1>&2 + fi +} + +test_conntrack_tool() { + local max_zones=$1 + + ip netns exec "$ns1" conntrack -F >/dev/null 2>/dev/null + + local outerstart start stop i + outerstart=$(date +%s%3N) + start=$(date +%s%3N) + stop="$start" + i=0 + while [ "$i" -lt "$max_zones" ]; do + i=$((i + 1)) + ip netns exec "$ns1" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \ + --timeout 3600 --state ESTABLISHED --sport 12345 --dport 1000 --zone $i >/dev/null 2>&1 + if [ $? 
-ne 0 ];then + ip netns exec "$ns1" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \ + --timeout 3600 --state ESTABLISHED --sport 12345 --dport 1000 --zone $i > /dev/null + echo "FAIL: conntrack -I returned an error" + ret=1 + break + fi + + if [ $((i%1000)) -eq 0 ];then + stop=$(date +%s%3N) + + local duration=$((stop-start)) + echo "PASS: added 1000 entries in $duration ms (now $i total)" + start=$stop + fi + done + + local count + local duration + count=$(ip netns exec "$ns1" conntrack -C) + duration=$((stop-outerstart)) + + if [ "$count" -eq "$max_zones" ]; then + echo "PASS: inserted $count entries via ctnetlink in $duration ms" + else + ip netns exec "$ns1" conntrack -S 1>&2 + echo "FAIL: inserted $count entries via ctnetlink in $duration ms, expected $max_zones entries ($duration ms)" + ret=1 + fi +} + +test_zones $zones + +if [ "$have_ct_tool" -eq 1 ];then + test_conntrack_tool $zones +else + echo "SKIP: Could not run ctnetlink insertion test without conntrack tool" + if [ $ret -eq 0 ];then + exit $ksft_skip + fi +fi + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/packetdrill/common.sh b/tools/testing/selftests/net/netfilter/packetdrill/common.sh new file mode 100755 index 0000000000..ed36d53519 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/packetdrill/common.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# for debugging set net.netfilter.nf_log_all_netns=1 in init_net +# or do not use net namespaces. +modprobe -q nf_conntrack +sysctl -q net.netfilter.nf_conntrack_log_invalid=6 + +# Flush old cached data (fastopen cookies). +ip tcp_metrics flush all > /dev/null 2>&1 + +# TCP min, default, and max receive and send buffer sizes. +sysctl -q net.ipv4.tcp_rmem="4096 540000 $((15*1024*1024))" +sysctl -q net.ipv4.tcp_wmem="4096 $((256*1024)) 4194304" + +# TCP congestion control. +sysctl -q net.ipv4.tcp_congestion_control=cubic + +# TCP slow start after idle. 
+sysctl -q net.ipv4.tcp_slow_start_after_idle=0 + +# TCP Explicit Congestion Notification (ECN) +sysctl -q net.ipv4.tcp_ecn=0 + +sysctl -q net.ipv4.tcp_notsent_lowat=4294967295 > /dev/null 2>&1 + +# Override the default qdisc on the tun device. +# Many tests fail with timing errors if the default +# is FQ and that paces their flows. +tc qdisc add dev tun0 root pfifo + +# Enable conntrack +$xtables -A INPUT -m conntrack --ctstate NEW -p tcp --syn diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_ack_loss_stall.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_ack_loss_stall.pkt new file mode 100644 index 0000000000..d755bd64c5 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_ack_loss_stall.pkt @@ -0,0 +1,118 @@ +// check that already-acked (retransmitted) packet is let through rather +// than tagged as INVALID. + +`packetdrill/common.sh` + +// should set -P DROP but it disconnects VM w.o. extra netns ++0 `$xtables -A INPUT -m conntrack --ctstate INVALID -j DROP` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 ++0 bind(3, ..., ...) = 0 ++0 listen(3, 10) = 0 + ++0 < S 0:0(0) win 32792 ++0 > S. 0:0(0) ack 1 ++.01 < . 1:1(0) ack 1 win 65535 ++0 accept(3, ..., ...) = 4 + ++0.0001 < P. 1:1461(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 1461 win 65535 ++0.0001 < P. 1461:2921(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 2921 win 65535 ++0.0001 < P. 2921:4381(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 4381 win 65535 ++0.0001 < P. 4381:5841(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 5841 win 65535 ++0.0001 < P. 5841:7301(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 7301 win 65535 ++0.0001 < P. 7301:8761(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 8761 win 65535 ++0.0001 < P. 8761:10221(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 10221 win 65535 ++0.0001 < P. 10221:11681(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 11681 win 65535 ++0.0001 < P. 11681:13141(1460) ack 1 win 257 ++.0 > . 
1:1(0) ack 13141 win 65535 ++0.0001 < P. 13141:14601(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 14601 win 65535 ++0.0001 < P. 14601:16061(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 16061 win 65535 ++0.0001 < P. 16061:17521(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 17521 win 65535 ++0.0001 < P. 17521:18981(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 18981 win 65535 ++0.0001 < P. 18981:20441(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 20441 win 65535 ++0.0001 < P. 20441:21901(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 21901 win 65535 ++0.0001 < P. 21901:23361(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 23361 win 65535 ++0.0001 < P. 23361:24821(1460) ack 1 win 257 +0.055 > . 1:1(0) ack 24821 win 65535 ++0.0001 < P. 24821:26281(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 26281 win 65535 ++0.0001 < P. 26281:27741(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 27741 win 65535 ++0.0001 < P. 27741:29201(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 29201 win 65535 ++0.0001 < P. 29201:30661(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 30661 win 65535 ++0.0001 < P. 30661:32121(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 32121 win 65535 ++0.0001 < P. 32121:33581(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 33581 win 65535 ++0.0001 < P. 33581:35041(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 35041 win 65535 ++0.0001 < P. 35041:36501(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 36501 win 65535 ++0.0001 < P. 36501:37961(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 37961 win 65535 ++0.0001 < P. 37961:39421(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 39421 win 65535 ++0.0001 < P. 39421:40881(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 40881 win 65535 ++0.0001 < P. 40881:42341(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 42341 win 65535 ++0.0001 < P. 42341:43801(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 43801 win 65535 ++0.0001 < P. 43801:45261(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 45261 win 65535 ++0.0001 < P. 45261:46721(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 46721 win 65535 ++0.0001 < P. 46721:48181(1460) ack 1 win 257 ++.0 > . 
1:1(0) ack 48181 win 65535 ++0.0001 < P. 48181:49641(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 49641 win 65535 ++0.0001 < P. 49641:51101(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 51101 win 65535 ++0.0001 < P. 51101:52561(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 52561 win 65535 ++0.0001 < P. 52561:54021(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 54021 win 65535 ++0.0001 < P. 54021:55481(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 55481 win 65535 ++0.0001 < P. 55481:56941(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 56941 win 65535 ++0.0001 < P. 56941:58401(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 58401 win 65535 ++0.0001 < P. 58401:59861(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 59861 win 65535 ++0.0001 < P. 59861:61321(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 61321 win 65535 ++0.0001 < P. 61321:62781(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 62781 win 65535 ++0.0001 < P. 62781:64241(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 64241 win 65535 ++0.0001 < P. 64241:65701(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 65701 win 65535 ++0.0001 < P. 65701:67161(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 67161 win 65535 + +// nf_ct_proto_6: SEQ is under the lower bound (already ACKed data retransmitted) IN=tun0 OUT= MAC= SRC=192.0.2.1 DST=192.168.24.72 LEN=1500 TOS=0x00 PREC=0x00 TTL=255 ID=0 PROTO=TCP SPT=34375 DPT=8080 SEQ=1 ACK=4162510439 WINDOW=257 RES=0x00 ACK PSH URGP=0 ++0.0001 < P. 1:1461(1460) ack 1 win 257 + +// only sent if above packet isn't flagged as invalid ++.0 > . 
1:1(0) ack 67161 win 65535 + ++0 `$xtables -D INPUT -m conntrack --ctstate INVALID -j DROP` diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_inexact_rst.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_inexact_rst.pkt new file mode 100644 index 0000000000..dccdd4c009 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_inexact_rst.pkt @@ -0,0 +1,62 @@ +// check RST packet that doesn't exactly match expected next sequence +// number still transitions conntrack state to CLOSE iff its already in +// FIN/CLOSE_WAIT. + +`packetdrill/common.sh` + +// 5.771921 server_ip > client_ip TLSv1.2 337 [Packet size limited during capture] +// 5.771994 server_ip > client_ip TLSv1.2 337 [Packet size limited during capture] +// 5.772212 client_ip > server_ip TCP 66 45020 > 443 [ACK] Seq=1905874048 Ack=781810658 Win=36352 Len=0 TSval=3317842872 TSecr=675936334 +// 5.787924 server_ip > client_ip TLSv1.2 1300 [Packet size limited during capture] +// 5.788126 server_ip > client_ip TLSv1.2 90 Application Data +// 5.788207 server_ip > client_ip TCP 66 443 > 45020 [FIN, ACK] Seq=781811916 Ack=1905874048 Win=31104 Len=0 TSval=675936350 TSecr=3317842872 +// 5.788447 client_ip > server_ip TLSv1.2 90 Application Data +// 5.788479 client_ip > server_ip TCP 66 45020 > 443 [RST, ACK] Seq=1905874072 Ack=781811917 Win=39040 Len=0 TSval=3317842889 TSecr=675936350 +// 5.788581 server_ip > client_ip TCP 54 8443 > 45020 [RST] Seq=781811892 Win=0 Len=0 + ++0 `iptables -A INPUT -p tcp -m conntrack --ctstate INVALID -j DROP` ++0 `iptables -A OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0.1 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) + +0.1 > S 0:0(0) win 65535 + ++0.1 < S. 1:1(0) ack 1 win 65535 + ++0 > . 1:1(0) ack 1 win 65535 ++0 < . 1:1001(1000) ack 1 win 65535 ++0 < . 1001:2001(1000) ack 1 win 65535 ++0 < . 
2001:3001(1000) ack 1 win 65535 + ++0 > . 1:1(0) ack 1001 win 65535 ++0 > . 1:1(0) ack 2001 win 65535 ++0 > . 1:1(0) ack 3001 win 65535 + ++0 write(3, ..., 1000) = 1000 + ++0.0 > P. 1:1001(1000) ack 3001 win 65535 + ++0.1 read(3, ..., 1000) = 1000 + +// Conntrack should move to FIN_WAIT, then CLOSE_WAIT. ++0 < F. 3001:3001(0) ack 1001 win 65535 ++0 > . 1001:1001(0) ack 3002 win 65535 + ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q CLOSE_WAIT` + ++1 close(3) = 0 +// RST: unread data. FIN was seen, hence ack + 1 ++0 > R. 1001:1001(0) ack 3002 win 65535 +// ... and then, CLOSE. ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q CLOSE\ ` + +// Spurious RST from peer -- no sk state. Should NOT get +// marked INVALID, because conntrack is already closing. ++0.1 < R 2001:2001(0) win 0 + +// No packets should have been marked INVALID ++0 `iptables -v -S INPUT | grep INVALID | grep -q -- "-c 0 0"` ++0 `iptables -v -S OUTPUT | grep INVALID | grep -q -- "-c 0 0"` diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_rst_invalid.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_rst_invalid.pkt new file mode 100644 index 0000000000..686f18a3d9 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_rst_invalid.pkt @@ -0,0 +1,59 @@ +// check that out of window resets are marked as INVALID and conntrack remains +// in ESTABLISHED state. + +`packetdrill/common.sh` + ++0 `$xtables -A INPUT -p tcp -m conntrack --ctstate INVALID -j DROP` ++0 `$xtables -A OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0.1 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) + +0.1 > S 0:0(0) win 65535 + ++0.1 < S. 1:1(0) ack 1 win 65535 + ++0 > . 1:1(0) ack 1 win 65535 ++0 < . 1:1001(1000) ack 1 win 65535 ++0 < . 1001:2001(1000) ack 1 win 65535 ++0 < . 
2001:3001(1000) ack 1 win 65535 + ++0 > . 1:1(0) ack 1001 win 65535 ++0 > . 1:1(0) ack 2001 win 65535 ++0 > . 1:1(0) ack 3001 win 65535 + ++0 write(3, ..., 1000) = 1000 + +// out of window ++0.0 < R 0:0(0) win 0 ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED` + +// out of window ++0.0 < R 1000000:1000000(0) win 0 ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED` + +// in-window but not exact match ++0.0 < R 42:42(0) win 0 ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED` + ++0.0 > P. 1:1001(1000) ack 3001 win 65535 + ++0.1 read(3, ..., 1000) = 1000 ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED` + ++0 < . 3001:3001(0) ack 1001 win 65535 + ++0.0 < R. 3000:3000(0) ack 1001 win 0 ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED` + +// exact next sequence ++0.0 < R. 3001:3001(0) ack 1001 win 0 +// Conntrack should move to CLOSE + +// Expect four invalid RSTs ++0 `$xtables -v -S INPUT | grep INVALID | grep -q -- "-c 4 "` ++0 `$xtables -v -S OUTPUT | grep INVALID | grep -q -- "-c 0 0"` + ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q CLOSE\ ` diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_syn_challenge_ack.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_syn_challenge_ack.pkt new file mode 100644 index 0000000000..3442cd29bc --- /dev/null +++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_syn_challenge_ack.pkt @@ -0,0 +1,44 @@ +// Check connection re-use, i.e. peer that receives the SYN answers with +// a challenge-ACK. +// Check that conntrack lets all packets pass, including the challenge ack, +// and that a new connection is established. + +`packetdrill/common.sh` + +// S > +// . < (challnge-ack) +// R. > +// S > +// S. < +// Expected outcome: established connection. 
+ ++0 `$xtables -A INPUT -p tcp -m conntrack --ctstate INVALID -j DROP` ++0 `$xtables -A OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0.1 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) +0.1 > S 0:0(0) win 65535 + +// Challenge ACK, old incarnation. +0.1 < . 145824453:145824453(0) ack 643160523 win 240 + ++0.01 > R 643160523:643160523(0) win 0 + ++0.01 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep UNREPLIED | grep -q SYN_SENT` + +// Must go through. ++0.01 > S 0:0(0) win 65535 + +// correct synack ++0.1 < S. 0:0(0) ack 1 win 250 + +// 3whs completes. ++0.01 > . 1:1(0) ack 1 win 256 + ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep ESTABLISHED | grep -q ASSURED` + +// No packets should have been marked INVALID ++0 `$xtables -v -S INPUT | grep INVALID | grep -q -- "-c 0 0"` ++0 `$xtables -v -S OUTPUT | grep INVALID | grep -q -- "-c 0 0"` diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_old.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_old.pkt new file mode 100644 index 0000000000..3047160c4b --- /dev/null +++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_old.pkt @@ -0,0 +1,51 @@ +// Check conntrack copes with syn/ack reply for a previous, old incarnation. 
+ +// tcpdump with buggy sequence +// 10.176.25.8.829 > 10.192.171.30.2049: Flags [S], seq 2375731741, win 29200, options [mss 1460,sackOK,TS val 2083107423 ecr 0,nop,wscale 7], length 0 +// OLD synack, for old/previous S +// 10.192.171.30.2049 > 10.176.25.8.829: Flags [S.], seq 145824453, ack 643160523, win 65535, options [mss 8952,nop,wscale 5,TS val 3215437785 ecr 2082921663,nop,nop], length 0 +// This reset never makes it to the endpoint, elided in the packetdrill script +// 10.192.171.30.2049 > 10.176.25.8.829: Flags [R.], seq 1, ack 1, win 65535, options [mss 8952,nop,wscale 5,TS val 3215443451 ecr 2082921663,nop,nop], length 0 +// Syn retransmit, no change +// 10.176.25.8.829 > 10.192.171.30.2049: Flags [S], seq 2375731741, win 29200, options [mss 1460,sackOK,TS val 2083115583 ecr 0,nop,wscale 7], length 0 +// CORRECT synack, should be accepted, but conntrack classified this as INVALID: +// SEQ is over the upper bound (over the window of the receiver) IN=tun0 OUT= MAC= SRC=192.0.2.1 DST=192.168.37.78 LEN=40 TOS=0x00 PREC=0x00 TTL=255 ID=0 PROTO=TCP SPT=8080 DPT=34500 SEQ=162602411 ACK=2124350315 .. +// 10.192.171.30.2049 > 10.176.25.8.829: Flags [S.], seq 162602410, ack 2375731742, win 65535, options [mss 8952,nop,wscale 5,TS val 3215445754 ecr 2083115583,nop,nop], length 0 + +`packetdrill/common.sh` + ++0 `$xtables -A INPUT -p tcp -m conntrack --ctstate INVALID -j DROP` ++0 `$xtables -A OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0.1 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) +0.1 > S 0:0(0) win 65535 + +// bogus/outdated synack, invalid ack value +0.1 < S. 145824453:145824453(0) ack 643160523 win 240 + +// syn retransmitted +1.01 > S 0:0(0) win 65535 ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep UNREPLIED | grep -q SYN_SENT` + +// correct synack ++0 < S. 
145758918:145758918(0) ack 1 win 250 ++0 write(3, ..., 1) = 1 + +// with buggy conntrack above packet is dropped, so SYN rtx is seen: +// script packet: 1.054007 . 1:1(0) ack 16777958 win 256 +// actual packet: 3.010000 S 0:0(0) win 65535 ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep ESTABLISHED | grep -q ASSURED` + ++0 > P. 1:2(1) ack 4294901762 win 256 + ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep ASSURED | grep -q ESTABLISHED` + +// No packets should have been marked INVALID in OUTPUT direction, 1 in INPUT ++0 `$xtables -v -S OUTPUT | grep INVALID | grep -q -- "-c 0 0"` ++0 `$xtables -v -S INPUT | grep INVALID | grep -q -- "-c 1 "` + ++0 `$xtables -D INPUT -p tcp -m conntrack --ctstate INVALID -j DROP` ++0 `$xtables -D OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP` diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt new file mode 100644 index 0000000000..842242f8cc --- /dev/null +++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt @@ -0,0 +1,34 @@ +// Check reception of another SYN while we have an established conntrack state. +// Challenge ACK is supposed to pass through, RST reply should clear conntrack +// state and SYN retransmit should give us new 'SYN_RECV' connection state. + +`packetdrill/common.sh` + +// should show a match if bug is present: ++0 `iptables -A INPUT -m conntrack --ctstate INVALID -p tcp --tcp-flags SYN,ACK SYN,ACK` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 ++0 bind(3, ..., ...) = 0 ++0 listen(3, 10) = 0 + ++0 < S 0:0(0) win 32792 ++0 > S. 0:0(0) ack 1 ++.01 < . 1:1(0) ack 1 win 257 ++0 accept(3, ..., ...) = 4 + ++0 < P. 1:101(100) ack 1 win 257 ++.001 > . 
1:1(0) ack 101 win 256 ++0 read(4, ..., 101) = 100 + +1.0 < S 2000:2000(0) win 32792 +// Won't expect this: challenge ack. + ++0 > . 1:1(0) ack 101 win 256 ++0 < R. 101:101(0) ack 1 win 257 ++0 close(4) = 0 + +1.5 < S 2000:2000(0) win 32792 + ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep -q SYN_RECV` ++0 `iptables -v -S INPUT | grep INVALID | grep -q -- "-c 0 0"` diff --git a/tools/testing/selftests/net/netfilter/rpath.sh b/tools/testing/selftests/net/netfilter/rpath.sh new file mode 100755 index 0000000000..4485fd7675 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/rpath.sh @@ -0,0 +1,175 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# return code to signal skipped test +ksft_skip=4 + +# search for legacy iptables (it uses the xtables extensions +if iptables-legacy --version >/dev/null 2>&1; then + iptables='iptables-legacy' +elif iptables --version >/dev/null 2>&1; then + iptables='iptables' +else + iptables='' +fi + +if ip6tables-legacy --version >/dev/null 2>&1; then + ip6tables='ip6tables-legacy' +elif ip6tables --version >/dev/null 2>&1; then + ip6tables='ip6tables' +else + ip6tables='' +fi + +if nft --version >/dev/null 2>&1; then + nft='nft' +else + nft='' +fi + +if [ -z "$iptables$ip6tables$nft" ]; then + echo "SKIP: Test needs iptables, ip6tables or nft" + exit $ksft_skip +fi + +sfx=$(mktemp -u "XXXXXXXX") +ns1="ns1-$sfx" +ns2="ns2-$sfx" +trap "ip netns del $ns1; ip netns del $ns2" EXIT + +# create two netns, disable rp_filter in ns2 and +# keep IPv6 address when moving into VRF +ip netns add "$ns1" +ip netns add "$ns2" +ip netns exec "$ns2" sysctl -q net.ipv4.conf.all.rp_filter=0 +ip netns exec "$ns2" sysctl -q net.ipv4.conf.default.rp_filter=0 +ip netns exec "$ns2" sysctl -q net.ipv6.conf.all.keep_addr_on_down=1 + +# a standard connection between the netns, should not trigger rp filter +ip -net "$ns1" link add v0 type veth peer name v0 netns "$ns2" +ip -net "$ns1" link set v0 up; ip -net "$ns2" link set 
v0 up +ip -net "$ns1" a a 192.168.23.2/24 dev v0 +ip -net "$ns2" a a 192.168.23.1/24 dev v0 +ip -net "$ns1" a a fec0:23::2/64 dev v0 nodad +ip -net "$ns2" a a fec0:23::1/64 dev v0 nodad + +# rp filter testing: ns1 sends packets via v0 which ns2 would route back via d0 +ip -net "$ns2" link add d0 type dummy +ip -net "$ns2" link set d0 up +ip -net "$ns1" a a 192.168.42.2/24 dev v0 +ip -net "$ns2" a a 192.168.42.1/24 dev d0 +ip -net "$ns1" a a fec0:42::2/64 dev v0 nodad +ip -net "$ns2" a a fec0:42::1/64 dev d0 nodad + +# firewall matches to test +[ -n "$iptables" ] && { + common='-t raw -A PREROUTING -s 192.168.0.0/16' + if ! ip netns exec "$ns2" "$iptables" $common -m rpfilter;then + echo "Cannot add rpfilter rule" + exit $ksft_skip + fi + ip netns exec "$ns2" "$iptables" $common -m rpfilter --invert +} +[ -n "$ip6tables" ] && { + common='-t raw -A PREROUTING -s fec0::/16' + if ! ip netns exec "$ns2" "$ip6tables" $common -m rpfilter;then + echo "Cannot add rpfilter rule" + exit $ksft_skip + fi + ip netns exec "$ns2" "$ip6tables" $common -m rpfilter --invert +} +[ -n "$nft" ] && ip netns exec "$ns2" $nft -f - </dev/null +} + +clear_counters() { + [ -n "$iptables" ] && ip netns exec "$ns2" "$iptables" -t raw -Z + [ -n "$ip6tables" ] && ip netns exec "$ns2" "$ip6tables" -t raw -Z + if [ -n "$nft" ]; then + ( + echo "delete table inet t"; + ip netns exec "$ns2" $nft -s list table inet t; + ) | ip netns exec "$ns2" $nft -f - + fi +} + +testrun() { + clear_counters + + # test 1: martian traffic should fail rpfilter matches + netns_ping "$ns1" -I v0 192.168.42.1 && \ + die "martian ping 192.168.42.1 succeeded" + netns_ping "$ns1" -I v0 fec0:42::1 && \ + die "martian ping fec0:42::1 succeeded" + + ipt_zero_rule "$iptables" || die "iptables matched martian" + ipt_zero_rule "$ip6tables" || die "ip6tables matched martian" + ipt_zero_reverse_rule "$iptables" && die "iptables not matched martian" + ipt_zero_reverse_rule "$ip6tables" && die "ip6tables not matched martian" + 
nft_zero_rule ip || die "nft IPv4 matched martian" + nft_zero_rule ip6 || die "nft IPv6 matched martian" + + clear_counters + + # test 2: rpfilter match should pass for regular traffic + netns_ping "$ns1" 192.168.23.1 || \ + die "regular ping 192.168.23.1 failed" + netns_ping "$ns1" fec0:23::1 || \ + die "regular ping fec0:23::1 failed" + + ipt_zero_rule "$iptables" && die "iptables match not effective" + ipt_zero_rule "$ip6tables" && die "ip6tables match not effective" + ipt_zero_reverse_rule "$iptables" || die "iptables match over-effective" + ipt_zero_reverse_rule "$ip6tables" || die "ip6tables match over-effective" + nft_zero_rule ip && die "nft IPv4 match not effective" + nft_zero_rule ip6 && die "nft IPv6 match not effective" + +} + +testrun + +# repeat test with vrf device in $ns2 +ip -net "$ns2" link add vrf0 type vrf table 10 +ip -net "$ns2" link set vrf0 up +ip -net "$ns2" link set v0 master vrf0 + +testrun + +echo "PASS: netfilter reverse path match works as intended" +exit 0 diff --git a/tools/testing/selftests/net/netfilter/sctp_collision.c b/tools/testing/selftests/net/netfilter/sctp_collision.c new file mode 100644 index 0000000000..21bb1cfd8a --- /dev/null +++ b/tools/testing/selftests/net/netfilter/sctp_collision.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include + +int main(int argc, char *argv[]) +{ + struct sockaddr_in saddr = {}, daddr = {}; + int sd, ret, len = sizeof(daddr); + struct timeval tv = {25, 0}; + char buf[] = "hello"; + + if (argc != 6 || (strcmp(argv[1], "server") && strcmp(argv[1], "client"))) { + printf("%s \n", + argv[0]); + return -1; + } + + sd = socket(AF_INET, SOCK_SEQPACKET, IPPROTO_SCTP); + if (sd < 0) { + printf("Failed to create sd\n"); + return -1; + } + + saddr.sin_family = AF_INET; + saddr.sin_addr.s_addr = inet_addr(argv[2]); + saddr.sin_port = htons(atoi(argv[3])); + + ret = bind(sd, (struct sockaddr *)&saddr, sizeof(saddr)); + if (ret < 0) { + printf("Failed 
to bind to address\n"); + goto out; + } + + ret = listen(sd, 5); + if (ret < 0) { + printf("Failed to listen on port\n"); + goto out; + } + + daddr.sin_family = AF_INET; + daddr.sin_addr.s_addr = inet_addr(argv[4]); + daddr.sin_port = htons(atoi(argv[5])); + + /* make test shorter than 25s */ + ret = setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)); + if (ret < 0) { + printf("Failed to setsockopt SO_RCVTIMEO\n"); + goto out; + } + + if (!strcmp(argv[1], "server")) { + sleep(1); /* wait a bit for client's INIT */ + ret = connect(sd, (struct sockaddr *)&daddr, len); + if (ret < 0) { + printf("Failed to connect to peer\n"); + goto out; + } + ret = recvfrom(sd, buf, sizeof(buf), 0, (struct sockaddr *)&daddr, &len); + if (ret < 0) { + printf("Failed to recv msg %d\n", ret); + goto out; + } + ret = sendto(sd, buf, strlen(buf) + 1, 0, (struct sockaddr *)&daddr, len); + if (ret < 0) { + printf("Failed to send msg %d\n", ret); + goto out; + } + printf("Server: sent! %d\n", ret); + } + + if (!strcmp(argv[1], "client")) { + usleep(300000); /* wait a bit for server's listening */ + ret = connect(sd, (struct sockaddr *)&daddr, len); + if (ret < 0) { + printf("Failed to connect to peer\n"); + goto out; + } + sleep(1); /* wait a bit for server's delayed INIT_ACK to reproduce the issue */ + ret = sendto(sd, buf, strlen(buf) + 1, 0, (struct sockaddr *)&daddr, len); + if (ret < 0) { + printf("Failed to send msg %d\n", ret); + goto out; + } + ret = recvfrom(sd, buf, sizeof(buf), 0, (struct sockaddr *)&daddr, &len); + if (ret < 0) { + printf("Failed to recv msg %d\n", ret); + goto out; + } + printf("Client: rcvd! 
%d\n", ret); + } + ret = 0; +out: + close(sd); + return ret; +} diff --git a/tools/testing/selftests/net/netfilter/settings b/tools/testing/selftests/net/netfilter/settings new file mode 100644 index 0000000000..abc5648b59 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/settings @@ -0,0 +1 @@ +timeout=1800 diff --git a/tools/testing/selftests/net/netfilter/xt_string.sh b/tools/testing/selftests/net/netfilter/xt_string.sh new file mode 100755 index 0000000000..8d401c69e3 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/xt_string.sh @@ -0,0 +1,133 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# return code to signal skipped test +ksft_skip=4 +rc=0 + +source lib.sh + +checktool "socat -h" "run test without socat" +checktool "iptables --version" "test needs iptables" + +infile=$(mktemp) + +cleanup() +{ + ip netns del "$netns" + rm -f "$infile" +} + +trap cleanup EXIT + +setup_ns netns + +ip -net "$netns" link add d0 type dummy +ip -net "$netns" link set d0 up +ip -net "$netns" addr add 10.1.2.1/24 dev d0 + +pattern="foo bar baz" +patlen=11 +hdrlen=$((20 + 8)) # IPv4 + UDP + +#ip netns exec "$netns" tcpdump -npXi d0 & +#tcpdump_pid=$! 
+#trap 'kill $tcpdump_pid; ip netns del $netns' EXIT + +add_rule() { # (alg, from, to) + ip netns exec "$netns" \ + iptables -A OUTPUT -o d0 -m string \ + --string "$pattern" --algo "$1" --from "$2" --to "$3" +} +showrules() { # () + ip netns exec "$netns" iptables -v -S OUTPUT | grep '^-A' +} +zerorules() { + ip netns exec "$netns" iptables -Z OUTPUT +} +countrule() { # (pattern) + showrules | grep -c -- "$*" +} +send() { # (offset) + ( for ((i = 0; i < $1 - hdrlen; i++)); do + echo -n " " + done + echo -n "$pattern" + ) > "$infile" + + ip netns exec "$netns" socat -t 1 -u STDIN UDP-SENDTO:10.1.2.2:27374 < "$infile" +} + +add_rule bm 1000 1500 +add_rule bm 1400 1600 +add_rule kmp 1000 1500 +add_rule kmp 1400 1600 + +zerorules +send 0 +send $((1000 - patlen)) +if [ "$(countrule -c 0 0)" -ne 4 ]; then + echo "FAIL: rules match data before --from" + showrules + ((rc--)) +fi + +zerorules +send 1000 +send $((1400 - patlen)) +if [ "$(countrule -c 2)" -ne 2 ]; then + echo "FAIL: only two rules should match at low offset" + showrules + ((rc--)) +fi + +zerorules +send $((1500 - patlen)) +if [ "$(countrule -c 1)" -ne 4 ]; then + echo "FAIL: all rules should match at end of packet" + showrules + ((rc--)) +fi + +zerorules +send 1495 +if [ "$(countrule -c 1)" -ne 1 ]; then + echo "FAIL: only kmp with proper --to should match pattern spanning fragments" + showrules + ((rc--)) +fi + +zerorules +send 1500 +if [ "$(countrule -c 1)" -ne 2 ]; then + echo "FAIL: two rules should match pattern at start of second fragment" + showrules + ((rc--)) +fi + +zerorules +send $((1600 - patlen)) +if [ "$(countrule -c 1)" -ne 2 ]; then + echo "FAIL: two rules should match pattern at end of largest --to" + showrules + ((rc--)) +fi + +zerorules +send $((1600 - patlen + 1)) +if [ "$(countrule -c 1)" -ne 0 ]; then + echo "FAIL: no rules should match pattern extending largest --to" + showrules + ((rc--)) +fi + +zerorules +send 1600 +if [ "$(countrule -c 1)" -ne 0 ]; then + echo "FAIL: no rule should 
match pattern past largest --to" + showrules + ((rc--)) +fi + +[ $rc -eq 0 ] && echo "PASS: string match tests" +exit $rc diff --git a/tools/testing/selftests/net/nl_netdev.py b/tools/testing/selftests/net/nl_netdev.py new file mode 100755 index 0000000000..93d9d91452 --- /dev/null +++ b/tools/testing/selftests/net/nl_netdev.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import time +from lib.py import ksft_run, ksft_exit, ksft_pr +from lib.py import ksft_eq, ksft_ge, ksft_busy_wait +from lib.py import NetdevFamily, NetdevSimDev, ip + + +def empty_check(nf) -> None: + devs = nf.dev_get({}, dump=True) + ksft_ge(len(devs), 1) + + +def lo_check(nf) -> None: + lo_info = nf.dev_get({"ifindex": 1}) + ksft_eq(len(lo_info['xdp-features']), 0) + ksft_eq(len(lo_info['xdp-rx-metadata-features']), 0) + + +def page_pool_check(nf) -> None: + with NetdevSimDev() as nsimdev: + nsim = nsimdev.nsims[0] + + def up(): + ip(f"link set dev {nsim.ifname} up") + + def down(): + ip(f"link set dev {nsim.ifname} down") + + def get_pp(): + pp_list = nf.page_pool_get({}, dump=True) + return [pp for pp in pp_list if pp.get("ifindex") == nsim.ifindex] + + # No page pools when down + down() + ksft_eq(len(get_pp()), 0) + + # Up, empty page pool appears + up() + pp_list = get_pp() + ksft_ge(len(pp_list), 0) + refs = sum([pp["inflight"] for pp in pp_list]) + ksft_eq(refs, 0) + + # Down, it disappears, again + down() + pp_list = get_pp() + ksft_eq(len(pp_list), 0) + + # Up, allocate a page + up() + nsim.dfs_write("pp_hold", "y") + pp_list = nf.page_pool_get({}, dump=True) + refs = sum([pp["inflight"] for pp in pp_list if pp.get("ifindex") == nsim.ifindex]) + ksft_ge(refs, 1) + + # Now let's leak a page + down() + pp_list = get_pp() + ksft_eq(len(pp_list), 1) + refs = sum([pp["inflight"] for pp in pp_list]) + ksft_eq(refs, 1) + attached = [pp for pp in pp_list if "detach-time" not in pp] + ksft_eq(len(attached), 0) + + # New pp can get created, and we'll have two + 
up() + pp_list = get_pp() + attached = [pp for pp in pp_list if "detach-time" not in pp] + detached = [pp for pp in pp_list if "detach-time" in pp] + ksft_eq(len(attached), 1) + ksft_eq(len(detached), 1) + + # Free the old page and the old pp is gone + nsim.dfs_write("pp_hold", "n") + # Freeing check is once a second so we may need to retry + ksft_busy_wait(lambda: len(get_pp()) == 1, deadline=2) + + # And down... + down() + ksft_eq(len(get_pp()), 0) + + # Last, leave the page hanging for destroy, nothing to check + # we're trying to exercise the orphaning path in the kernel + up() + nsim.dfs_write("pp_hold", "y") + + +def main() -> None: + nf = NetdevFamily() + ksft_run([empty_check, lo_check, page_pool_check], + args=(nf, )) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py index 8b12071876..9f8dec2f65 100644 --- a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py +++ b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py @@ -489,7 +489,7 @@ class ovsactions(nla): actstr, reason = parse_extract_field( actstr, "drop(", - "([0-9]+)", + r"([0-9]+)", lambda x: int(x, 0), False, None, @@ -502,9 +502,9 @@ class ovsactions(nla): actstr = actstr[len("drop"): ] return (totallen - len(actstr)) - elif parse_starts_block(actstr, "^(\d+)", False, True): + elif parse_starts_block(actstr, r"^(\d+)", False, True): actstr, output = parse_extract_field( - actstr, None, "(\d+)", lambda x: int(x), False, "0" + actstr, None, r"(\d+)", lambda x: int(x), False, "0" ) self["attrs"].append(["OVS_ACTION_ATTR_OUTPUT", output]) parsed = True @@ -512,7 +512,7 @@ class ovsactions(nla): actstr, recircid = parse_extract_field( actstr, "recirc(", - "([0-9a-fA-Fx]+)", + r"([0-9a-fA-Fx]+)", lambda x: int(x, 0), False, 0, @@ -588,17 +588,17 @@ class ovsactions(nla): actstr = actstr[3:] actstr, ip_block_min = parse_extract_field( - actstr, "=", "([0-9a-fA-F\.]+)", str, 
False + actstr, "=", r"([0-9a-fA-F\.]+)", str, False ) actstr, ip_block_max = parse_extract_field( - actstr, "-", "([0-9a-fA-F\.]+)", str, False + actstr, "-", r"([0-9a-fA-F\.]+)", str, False ) actstr, proto_min = parse_extract_field( - actstr, ":", "(\d+)", int, False + actstr, ":", r"(\d+)", int, False ) actstr, proto_max = parse_extract_field( - actstr, "-", "(\d+)", int, False + actstr, "-", r"(\d+)", int, False ) if t is not None: diff --git a/tools/testing/selftests/net/sample_map_ret0.bpf.c b/tools/testing/selftests/net/sample_map_ret0.bpf.c new file mode 100644 index 0000000000..43ca925949 --- /dev/null +++ b/tools/testing/selftests/net/sample_map_ret0.bpf.c @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) */ +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, __u32); + __type(value, long); + __uint(max_entries, 2); +} htab SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, long); + __uint(max_entries, 2); +} array SEC(".maps"); + +/* Sample program which should always load for testing control paths. */ +SEC("xdp") int func() +{ + __u64 key64 = 0; + __u32 key = 0; + long *value; + + value = bpf_map_lookup_elem(&htab, &key); + if (!value) + return 1; + value = bpf_map_lookup_elem(&array, &key64); + if (!value) + return 1; + + return 0; +} diff --git a/tools/testing/selftests/net/sample_ret0.bpf.c b/tools/testing/selftests/net/sample_ret0.bpf.c new file mode 100644 index 0000000000..1df5ca98bb --- /dev/null +++ b/tools/testing/selftests/net/sample_ret0.bpf.c @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) */ + +#define SEC(name) __attribute__((section(name), used)) + +/* Sample program which should always load for testing control paths. 
*/ +SEC("xdp") +int func() +{ + return 0; +} diff --git a/tools/testing/selftests/net/srv6_end_dx4_netfilter_test.sh b/tools/testing/selftests/net/srv6_end_dx4_netfilter_test.sh new file mode 100755 index 0000000000..e23210aa54 --- /dev/null +++ b/tools/testing/selftests/net/srv6_end_dx4_netfilter_test.sh @@ -0,0 +1,335 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# author: Jianguo Wu +# +# Mostly copied from tools/testing/selftests/net/srv6_end_dt4_l3vpn_test.sh. +# +# This script is designed for testing the support of netfilter hooks for +# SRv6 End.DX4 behavior. +# +# Hereafter a network diagram is shown, where one tenants (named 100) offer +# IPv4 L3 VPN services allowing hosts to communicate with each other across +# an IPv6 network. +# +# Routers rt-1 and rt-2 implement IPv4 L3 VPN services leveraging the SRv6 +# architecture. The key components for such VPNs are: a) SRv6 Encap behavior, +# b) SRv6 End.DX4 behavior. +# +# To explain how an IPv4 L3 VPN based on SRv6 works, let us briefly consider an +# example where, within the same domain of tenant 100, the host hs-1 pings +# the host hs-2. +# +# First of all, L2 reachability of the host hs-2 is taken into account by +# the router rt-1 which acts as an arp proxy. +# +# When the host hs-1 sends an IPv4 packet destined to hs-2, the router rt-1 +# receives the packet on the internal veth-t100 interface, rt-1 contains the +# SRv6 Encap route for encapsulating the IPv4 packet in a IPv6 plus the Segment +# Routing Header (SRH) packet. This packet is sent through the (IPv6) core +# network up to the router rt-2 that receives it on veth0 interface. +# +# The rt-2 router uses the 'localsid' routing table to process incoming +# IPv6+SRH packets which belong to the VPN of the tenant 100. For each of these +# packets, the SRv6 End.DX4 behavior removes the outer IPv6+SRH headers and +# routs the packet to the specified nexthop. Afterwards, the packet is sent to +# the host hs-2 through the veth-t100 interface. 
+# +# The ping response follows the same processing but this time the role of rt-1 +# and rt-2 are swapped. +# +# And when net.netfilter.nf_hooks_lwtunnel is set to 1 in rt-1 or rt-2, and a +# rpfilter iptables rule is added, SRv6 packets will go through netfilter PREROUTING +# hooks. +# +# +# +-------------------+ +-------------------+ +# | | | | +# | hs-1 netns | | hs-2 netns | +# | | | | +# | +-------------+ | | +-------------+ | +# | | veth0 | | | | veth0 | | +# | | 10.0.0.1/24 | | | | 10.0.0.2/24 | | +# | +-------------+ | | +-------------+ | +# | . | | . | +# +-------------------+ +-------------------+ +# . . +# . . +# . . +# +-----------------------------------+ +-----------------------------------+ +# | . | | . | +# | +---------------+ | | +---------------- | +# | | veth-t100 | | | | veth-t100 | | +# | | 10.0.0.11/24 | +----------+ | | +----------+ | 10.0.0.22/24 | | +# | +-------+-------+ | route | | | | route | +-------+-------- | +# | | table | | | | table | | +# | +----------+ | | +----------+ | +# | +--------------+ | | +--------------+ | +# | | veth0 | | | | veth0 | | +# | | 2001:11::1/64 |.|...|.| 2001:11::2/64 | | +# | +--------------+ | | +--------------+ | +# | | | | +# | rt-1 netns | | rt-2 netns | +# | | | | +# +-----------------------------------+ +-----------------------------------+ +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# | Network configuration | +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# rt-1: localsid table +# +----------------------------------------------------------------+ +# |SID |Action | +# +----------------------------------------------------------------+ +# |fc00:21:100::6004|apply SRv6 End.DX4 nh4 10.0.0.1 dev veth-t100 | +# +----------------------------------------------------------------+ +# +# rt-1: route table +# +---------------------------------------------------+ +# |host |Action | +# +---------------------------------------------------+ +# |10.0.0.2 |apply seg6 encap segs fc00:12:100::6004| +# 
+---------------------------------------------------+ +# |10.0.0.0/24|forward to dev veth_t100 | +# +---------------------------------------------------+ +# +# +# rt-2: localsid table +# +---------------------------------------------------------------+ +# |SID |Action | +# +---------------------------------------------------------------+ +# |fc00:12:100::6004|apply SRv6 End.DX4 nh4 10.0.0.2 dev veth-t100| +# +---------------------------------------------------------------+ +# +# rt-2: route table +# +---------------------------------------------------+ +# |host |Action | +# +---------------------------------------------------+ +# |10.0.0.1 |apply seg6 encap segs fc00:21:100::6004| +# +---------------------------------------------------+ +# |10.0.0.0/24|forward to dev veth_t100 | +# +---------------------------------------------------+ +# + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +readonly IPv6_RT_NETWORK=2001:11 +readonly IPv4_HS_NETWORK=10.0.0 +readonly SID_LOCATOR=fc00 + +PING_TIMEOUT_SEC=4 + +ret=0 + +PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no} + +log_test() +{ + local rc=$1 + local expected=$2 + local msg="$3" + + if [ ${rc} -eq ${expected} ]; then + nsuccess=$((nsuccess+1)) + printf "\n TEST: %-60s [ OK ]\n" "${msg}" + else + ret=1 + nfail=$((nfail+1)) + printf "\n TEST: %-60s [FAIL]\n" "${msg}" + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read a + [ "$a" = "q" ] && exit 1 + fi + fi +} + +print_log_test_results() +{ + if [ "$TESTS" != "none" ]; then + printf "\nTests passed: %3d\n" ${nsuccess} + printf "Tests failed: %3d\n" ${nfail} + fi +} + +log_section() +{ + echo + echo "################################################################################" + echo "TEST SECTION: $*" + echo "################################################################################" +} + +cleanup() +{ + ip link del veth-rt-1 2>/dev/null || true + ip link del veth-rt-2 2>/dev/null || true + + # destroy 
routers rt-* and hosts hs-* + for ns in $(ip netns show | grep -E 'rt-*|hs-*'); do + ip netns del ${ns} || true + done +} + +# Setup the basic networking for the routers +setup_rt_networking() +{ + local rt=$1 + local nsname=rt-${rt} + + ip netns add ${nsname} + + ip netns exec ${nsname} sysctl -wq net.ipv6.conf.all.accept_dad=0 + ip netns exec ${nsname} sysctl -wq net.ipv6.conf.default.accept_dad=0 + + ip link set veth-rt-${rt} netns ${nsname} + ip -netns ${nsname} link set veth-rt-${rt} name veth0 + + ip -netns ${nsname} addr add ${IPv6_RT_NETWORK}::${rt}/64 dev veth0 nodad + ip -netns ${nsname} link set veth0 up + ip -netns ${nsname} link set lo up + + ip netns exec ${nsname} sysctl -wq net.ipv4.ip_forward=1 + ip netns exec ${nsname} sysctl -wq net.ipv6.conf.all.forwarding=1 +} + +setup_rt_netfilter() +{ + local rt=$1 + local nsname=rt-${rt} + + ip netns exec ${nsname} sysctl -wq net.netfilter.nf_hooks_lwtunnel=1 + ip netns exec ${nsname} iptables -t raw -A PREROUTING -m rpfilter --invert -j DROP +} + +setup_hs() +{ + local hs=$1 + local rt=$2 + local tid=$3 + local hsname=hs-${hs} + local rtname=rt-${rt} + local rtveth=veth-t${tid} + + # set the networking for the host + ip netns add ${hsname} + + ip -netns ${hsname} link add veth0 type veth peer name ${rtveth} + ip -netns ${hsname} link set ${rtveth} netns ${rtname} + ip -netns ${hsname} addr add ${IPv4_HS_NETWORK}.${hs}/24 dev veth0 + ip -netns ${hsname} link set veth0 up + ip -netns ${hsname} link set lo up + + ip -netns ${rtname} addr add ${IPv4_HS_NETWORK}.${rt}${hs}/24 dev ${rtveth} + ip -netns ${rtname} link set ${rtveth} up + + ip netns exec ${rtname} sysctl -wq net.ipv4.conf.${rtveth}.proxy_arp=1 +} + +setup_vpn_config() +{ + local hssrc=$1 + local rtsrc=$2 + local hsdst=$3 + local rtdst=$4 + local tid=$5 + + local hssrc_name=hs-t${tid}-${hssrc} + local hsdst_name=hs-t${tid}-${hsdst} + local rtsrc_name=rt-${rtsrc} + local rtdst_name=rt-${rtdst} + local 
vpn_sid=${SID_LOCATOR}:${hssrc}${hsdst}:${tid}::6004 + + # set the encap route for encapsulating packets which arrive from the + # host hssrc and destined to the access router rtsrc. + ip -netns ${rtsrc_name} -4 route add ${IPv4_HS_NETWORK}.${hsdst}/32 \ + encap seg6 mode encap segs ${vpn_sid} dev veth0 + ip -netns ${rtsrc_name} -6 route add ${vpn_sid}/128 \ + via 2001:11::${rtdst} dev veth0 + + # set the decap route for decapsulating packets which arrive from + # the rtdst router and destined to the hsdst host. + ip -netns ${rtdst_name} -6 route add ${vpn_sid}/128 \ + encap seg6local action End.DX4 nh4 ${IPv4_HS_NETWORK}.${hsdst} dev veth-t${tid} +} + +setup() +{ + ip link add veth-rt-1 type veth peer name veth-rt-2 + # setup the networking for router rt-1 and router rt-2 + setup_rt_networking 1 + setup_rt_networking 2 + + # setup two hosts for the tenant 100. + # - host hs-1 is directly connected to the router rt-1; + # - host hs-2 is directly connected to the router rt-2. + setup_hs 1 1 100 + setup_hs 2 2 100 + + # setup the IPv4 L3 VPN which connects the host hs-1 and host hs-2. + setup_vpn_config 1 1 2 2 100 #args: src_host src_router dst_host dst_router tenant + setup_vpn_config 2 2 1 1 100 +} + +check_hs_connectivity() +{ + local hssrc=$1 + local hsdst=$2 + local tid=$3 + + ip netns exec hs-${hssrc} ping -c 1 -W ${PING_TIMEOUT_SEC} \ + ${IPv4_HS_NETWORK}.${hsdst} >/dev/null 2>&1 +} + +check_and_log_hs_connectivity() +{ + local hssrc=$1 + local hsdst=$2 + local tid=$3 + + check_hs_connectivity ${hssrc} ${hsdst} ${tid} + log_test $? 
0 "Hosts connectivity: hs-${hssrc} -> hs-${hsdst} (tenant ${tid})" +} + +host_tests() +{ + log_section "SRv6 VPN connectivity test among hosts in the same tenant" + + check_and_log_hs_connectivity 1 2 100 + check_and_log_hs_connectivity 2 1 100 +} + +router_netfilter_tests() +{ + log_section "SRv6 VPN connectivity test with netfilter enabled in routers" + setup_rt_netfilter 1 + setup_rt_netfilter 2 + + check_and_log_hs_connectivity 1 2 100 + check_and_log_hs_connectivity 2 1 100 +} + +if [ "$(id -u)" -ne 0 ];then + echo "SKIP: Need root privileges" + exit $ksft_skip +fi + +if [ ! -x "$(command -v ip)" ]; then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +cleanup &>/dev/null + +setup + +host_tests +router_netfilter_tests + +print_log_test_results + +cleanup &>/dev/null + +exit ${ret} diff --git a/tools/testing/selftests/net/srv6_end_dx6_netfilter_test.sh b/tools/testing/selftests/net/srv6_end_dx6_netfilter_test.sh new file mode 100755 index 0000000000..9e69a2ed5b --- /dev/null +++ b/tools/testing/selftests/net/srv6_end_dx6_netfilter_test.sh @@ -0,0 +1,340 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# author: Jianguo Wu +# +# Mostly copied from tools/testing/selftests/net/srv6_end_dt6_l3vpn_test.sh. +# +# This script is designed for testing the support of netfilter hooks for +# SRv6 End.DX6 behavior. +# +# Hereafter a network diagram is shown, where one tenant (named 100) offers +# IPv6 L3 VPN services allowing hosts to communicate with each other across +# an IPv6 network. +# +# Routers rt-1 and rt-2 implement IPv6 L3 VPN services leveraging the SRv6 +# architecture. The key components for such VPNs are: a) SRv6 Encap behavior, +# b) SRv6 End.DX6 behavior. +# +# To explain how an IPv6 L3 VPN based on SRv6 works, let us briefly consider an +# example where, within the same domain of tenant 100, the host hs-1 pings +# the host hs-2. 
+# +# First of all, L2 reachability of the host hs-2 is taken into account by +# the router rt-1 which acts as an NDP proxy. +# +# When the host hs-1 sends an IPv6 packet destined to hs-2, the router rt-1 +# receives the packet on the internal veth-t100 interface, rt-1 contains the +# SRv6 Encap route for encapsulating the IPv6 packet in an IPv6 plus the Segment +# Routing Header (SRH) packet. This packet is sent through the (IPv6) core +# network up to the router rt-2 that receives it on veth0 interface. +# +# The rt-2 router uses the 'localsid' routing table to process incoming +# IPv6+SRH packets which belong to the VPN of the tenant 100. For each of these +# packets, the SRv6 End.DX6 behavior removes the outer IPv6+SRH headers and +# routes the packet to the specified nexthop. Afterwards, the packet is sent to +# the host hs-2 through the veth-t100 interface. +# +# The ping response follows the same processing but this time the roles of rt-1 +# and rt-2 are swapped. +# +# And when net.netfilter.nf_hooks_lwtunnel is set to 1 in rt-1 or rt-2, and a +# rpfilter ip6tables rule is added, SRv6 packets will go through netfilter PREROUTING +# hooks. +# +# +# +-------------------+ +-------------------+ +# | | | | +# | hs-1 netns | | hs-2 netns | +# | | | | +# | +-------------+ | | +-------------+ | +# | | veth0 | | | | veth0 | | +# | | cafe::1/64 | | | | cafe::2/64 | | +# | +-------------+ | | +-------------+ | +# | . | | . | +# +-------------------+ +-------------------+ +# . . +# . . +# . . +# +-----------------------------------+ +-----------------------------------+ +# | . | | . 
| +# | +---------------+ | | +---------------- | +# | | veth-t100 | | | | veth-t100 | | +# | | cafe::11/64 | +----------+ | | +----------+ | cafe::22/64 | | +# | +-------+-------+ | route | | | | route | +-------+-------- | +# | | table | | | | table | | +# | +----------+ | | +----------+ | +# | +--------------+ | | +--------------+ | +# | | veth0 | | | | veth0 | | +# | | 2001:11::1/64 |.|...|.| 2001:11::2/64 | | +# | +--------------+ | | +--------------+ | +# | | | | +# | rt-1 netns | | rt-2 netns | +# | | | | +# +-----------------------------------+ +-----------------------------------+ +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# | Network configuration | +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# rt-1: localsid table +# +----------------------------------------------------------------+ +# |SID |Action | +# +----------------------------------------------------------------+ +# |fc00:21:100::6004|apply SRv6 End.DX6 nh6 cafe::1 dev veth-t100 | +# +----------------------------------------------------------------+ +# +# rt-1: route table +# +---------------------------------------------------+ +# |host |Action | +# +---------------------------------------------------+ +# |cafe::2 |apply seg6 encap segs fc00:12:100::6004| +# +---------------------------------------------------+ +# |cafe::/64 |forward to dev veth_t100 | +# +---------------------------------------------------+ +# +# +# rt-2: localsid table +# +---------------------------------------------------------------+ +# |SID |Action | +# +---------------------------------------------------------------+ +# |fc00:12:100::6004|apply SRv6 End.DX6 nh6 cafe::2 dev veth-t100 | +# +---------------------------------------------------------------+ +# +# rt-2: route table +# +---------------------------------------------------+ +# |host |Action | +# +---------------------------------------------------+ +# |cafe::1 |apply seg6 encap segs fc00:21:100::6004| +# +---------------------------------------------------+ +# |cafe::/64 |forward to dev 
veth_t100 | +# +---------------------------------------------------+ +# + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +readonly IPv6_RT_NETWORK=2001:11 +readonly IPv6_HS_NETWORK=cafe +readonly SID_LOCATOR=fc00 + +PING_TIMEOUT_SEC=4 + +ret=0 + +PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no} + +log_test() +{ + local rc=$1 + local expected=$2 + local msg="$3" + + if [ ${rc} -eq ${expected} ]; then + nsuccess=$((nsuccess+1)) + printf "\n TEST: %-60s [ OK ]\n" "${msg}" + else + ret=1 + nfail=$((nfail+1)) + printf "\n TEST: %-60s [FAIL]\n" "${msg}" + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read a + [ "$a" = "q" ] && exit 1 + fi + fi +} + +print_log_test_results() +{ + if [ "$TESTS" != "none" ]; then + printf "\nTests passed: %3d\n" ${nsuccess} + printf "Tests failed: %3d\n" ${nfail} + fi +} + +log_section() +{ + echo + echo "################################################################################" + echo "TEST SECTION: $*" + echo "################################################################################" +} + +cleanup() +{ + ip link del veth-rt-1 2>/dev/null || true + ip link del veth-rt-2 2>/dev/null || true + + # destroy routers rt-* and hosts hs-* + for ns in $(ip netns show | grep -E 'rt-*|hs-*'); do + ip netns del ${ns} || true + done +} + +# Setup the basic networking for the routers +setup_rt_networking() +{ + local rt=$1 + local nsname=rt-${rt} + + ip netns add ${nsname} + + ip netns exec ${nsname} sysctl -wq net.ipv6.conf.all.accept_dad=0 + ip netns exec ${nsname} sysctl -wq net.ipv6.conf.default.accept_dad=0 + + ip link set veth-rt-${rt} netns ${nsname} + ip -netns ${nsname} link set veth-rt-${rt} name veth0 + + ip -netns ${nsname} addr add ${IPv6_RT_NETWORK}::${rt}/64 dev veth0 nodad + ip -netns ${nsname} link set veth0 up + ip -netns ${nsname} link set lo up + + ip netns exec ${nsname} sysctl -wq net.ipv6.conf.all.forwarding=1 +} + +setup_rt_netfilter() +{ + local rt=$1 + local 
nsname=rt-${rt} + + ip netns exec ${nsname} sysctl -wq net.netfilter.nf_hooks_lwtunnel=1 + ip netns exec ${nsname} ip6tables -t raw -A PREROUTING -m rpfilter --invert -j DROP +} + +setup_hs() +{ + local hs=$1 + local rt=$2 + local tid=$3 + local hsname=hs-${hs} + local rtname=rt-${rt} + local rtveth=veth-t${tid} + + # set the networking for the host + ip netns add ${hsname} + + ip -netns ${hsname} link add veth0 type veth peer name ${rtveth} + ip -netns ${hsname} link set ${rtveth} netns ${rtname} + ip -netns ${hsname} addr add ${IPv6_HS_NETWORK}::${hs}/64 dev veth0 nodad + ip -netns ${hsname} link set veth0 up + ip -netns ${hsname} link set lo up + + ip -netns ${rtname} addr add ${IPv6_HS_NETWORK}::${rt}${hs}/64 dev ${rtveth} + ip -netns ${rtname} link set ${rtveth} up + + ip netns exec ${rtname} sysctl -wq net.ipv6.conf.all.accept_dad=0 + ip netns exec ${rtname} sysctl -wq net.ipv6.conf.default.accept_dad=0 + + ip netns exec ${rtname} sysctl -wq net.ipv6.conf.${rtveth}.proxy_ndp=1 +} + +setup_vpn_config() +{ + local hssrc=$1 + local rtsrc=$2 + local hsdst=$3 + local rtdst=$4 + local tid=$5 + + local hssrc_name=hs-t${tid}-${hssrc} + local hsdst_name=hs-t${tid}-${hsdst} + local rtsrc_name=rt-${rtsrc} + local rtdst_name=rt-${rtdst} + local rtveth=veth-t${tid} + local vpn_sid=${SID_LOCATOR}:${hssrc}${hsdst}:${tid}::6004 + + ip -netns ${rtsrc_name} -6 neigh add proxy ${IPv6_HS_NETWORK}::${hsdst} dev ${rtveth} + + # set the encap route for encapsulating packets which arrive from the + # host hssrc and destined to the access router rtsrc. + ip -netns ${rtsrc_name} -6 route add ${IPv6_HS_NETWORK}::${hsdst}/128 \ + encap seg6 mode encap segs ${vpn_sid} dev veth0 + ip -netns ${rtsrc_name} -6 route add ${vpn_sid}/128 \ + via 2001:11::${rtdst} dev veth0 + + # set the decap route for decapsulating packets which arrive from + # the rtdst router and destined to the hsdst host. 
+ ip -netns ${rtdst_name} -6 route add ${vpn_sid}/128 \ + encap seg6local action End.DX6 nh6 ${IPv6_HS_NETWORK}::${hsdst} dev veth-t${tid} +} + +setup() +{ + ip link add veth-rt-1 type veth peer name veth-rt-2 + # setup the networking for router rt-1 and router rt-2 + setup_rt_networking 1 + setup_rt_networking 2 + + # setup two hosts for the tenant 100. + # - host hs-1 is directly connected to the router rt-1; + # - host hs-2 is directly connected to the router rt-2. + setup_hs 1 1 100 + setup_hs 2 2 100 + + # setup the IPv6 L3 VPN which connects the host hs-1 and host hs-2. + setup_vpn_config 1 1 2 2 100 #args: src_host src_router dst_host dst_router tenant + setup_vpn_config 2 2 1 1 100 +} + +check_hs_connectivity() +{ + local hssrc=$1 + local hsdst=$2 + local tid=$3 + + ip netns exec hs-${hssrc} ping -6 -c 1 -W ${PING_TIMEOUT_SEC} \ + ${IPv6_HS_NETWORK}::${hsdst} >/dev/null 2>&1 +} + +check_and_log_hs_connectivity() +{ + local hssrc=$1 + local hsdst=$2 + local tid=$3 + + check_hs_connectivity ${hssrc} ${hsdst} ${tid} + log_test $? 0 "Hosts connectivity: hs-${hssrc} -> hs-${hsdst} (tenant ${tid})" +} + +host_tests() +{ + log_section "SRv6 VPN connectivity test among hosts in the same tenant" + + check_and_log_hs_connectivity 1 2 100 + check_and_log_hs_connectivity 2 1 100 +} + +router_netfilter_tests() +{ + log_section "SRv6 VPN connectivity test with netfilter enabled in routers" + setup_rt_netfilter 1 + setup_rt_netfilter 2 + + check_and_log_hs_connectivity 1 2 100 + check_and_log_hs_connectivity 2 1 100 +} + +if [ "$(id -u)" -ne 0 ];then + echo "SKIP: Need root privileges" + exit $ksft_skip +fi + +if [ ! 
-x "$(command -v ip)" ]; then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +cleanup &>/dev/null + +setup + +host_tests +router_netfilter_tests + +print_log_test_results + +cleanup &>/dev/null + +exit ${ret} diff --git a/tools/testing/selftests/net/udpgro.sh b/tools/testing/selftests/net/udpgro.sh index 8802604148..11a1ebda56 100755 --- a/tools/testing/selftests/net/udpgro.sh +++ b/tools/testing/selftests/net/udpgro.sh @@ -7,7 +7,7 @@ source net_helper.sh readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)" -BPF_FILE="xdp_dummy.o" +BPF_FILE="xdp_dummy.bpf.o" # set global exit status, but never reset nonzero one. check_err() diff --git a/tools/testing/selftests/net/udpgro_bench.sh b/tools/testing/selftests/net/udpgro_bench.sh index 7080eae531..c51ea90a13 100755 --- a/tools/testing/selftests/net/udpgro_bench.sh +++ b/tools/testing/selftests/net/udpgro_bench.sh @@ -7,7 +7,7 @@ source net_helper.sh readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)" -BPF_FILE="xdp_dummy.o" +BPF_FILE="xdp_dummy.bpf.o" cleanup() { local -r jobs="$(jobs -p)" diff --git a/tools/testing/selftests/net/udpgro_frglist.sh b/tools/testing/selftests/net/udpgro_frglist.sh index e1ff645bd3..17404f49cd 100755 --- a/tools/testing/selftests/net/udpgro_frglist.sh +++ b/tools/testing/selftests/net/udpgro_frglist.sh @@ -7,7 +7,7 @@ source net_helper.sh readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)" -BPF_FILE="xdp_dummy.o" +BPF_FILE="xdp_dummy.bpf.o" cleanup() { local -r jobs="$(jobs -p)" @@ -42,8 +42,8 @@ run_one() { ip -n "${PEER_NS}" link set veth1 xdp object ${BPF_FILE} section xdp tc -n "${PEER_NS}" qdisc add dev veth1 clsact - tc -n "${PEER_NS}" filter add dev veth1 ingress prio 4 protocol ipv6 bpf object-file nat6to4.o section schedcls/ingress6/nat_6 direct-action - tc -n "${PEER_NS}" filter add dev veth1 egress prio 4 protocol ip bpf object-file nat6to4.o section schedcls/egress4/snat4 direct-action + tc -n "${PEER_NS}" filter add dev veth1 ingress prio 4 protocol ipv6 bpf 
object-file nat6to4.bpf.o section schedcls/ingress6/nat_6 direct-action + tc -n "${PEER_NS}" filter add dev veth1 egress prio 4 protocol ip bpf object-file nat6to4.bpf.o section schedcls/egress4/snat4 direct-action echo ${rx_args} ip netns exec "${PEER_NS}" ./udpgso_bench_rx ${rx_args} -r & @@ -89,7 +89,7 @@ if [ ! -f ${BPF_FILE} ]; then exit -1 fi -if [ ! -f nat6to4.o ]; then +if [ ! -f nat6to4.bpf.o ]; then echo "Missing nat6to4 helper. Run 'make' first" exit -1 fi diff --git a/tools/testing/selftests/net/udpgro_fwd.sh b/tools/testing/selftests/net/udpgro_fwd.sh index 83ed987cff..550d8eb3e2 100755 --- a/tools/testing/selftests/net/udpgro_fwd.sh +++ b/tools/testing/selftests/net/udpgro_fwd.sh @@ -3,7 +3,7 @@ source net_helper.sh -BPF_FILE="xdp_dummy.o" +BPF_FILE="xdp_dummy.bpf.o" readonly BASE="ns-$(mktemp -u XXXXXX)" readonly SRC=2 readonly DST=1 diff --git a/tools/testing/selftests/net/veth.sh b/tools/testing/selftests/net/veth.sh index 3a394b43e2..4f1edbafb9 100755 --- a/tools/testing/selftests/net/veth.sh +++ b/tools/testing/selftests/net/veth.sh @@ -1,7 +1,7 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 -BPF_FILE="xdp_dummy.o" +BPF_FILE="xdp_dummy.bpf.o" readonly STATS="$(mktemp -p /tmp ns-XXXXXX)" readonly BASE=`basename $STATS` readonly SRC=2 diff --git a/tools/testing/selftests/net/xdp_dummy.bpf.c b/tools/testing/selftests/net/xdp_dummy.bpf.c new file mode 100644 index 0000000000..d988b2e0ce --- /dev/null +++ b/tools/testing/selftests/net/xdp_dummy.bpf.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define KBUILD_MODNAME "xdp_dummy" +#include +#include + +SEC("xdp") +int xdp_dummy_prog(struct xdp_md *ctx) +{ + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/net/xdp_dummy.c b/tools/testing/selftests/net/xdp_dummy.c deleted file mode 100644 index d988b2e0ce..0000000000 --- a/tools/testing/selftests/net/xdp_dummy.c +++ /dev/null @@ -1,13 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#define 
KBUILD_MODNAME "xdp_dummy" -#include -#include - -SEC("xdp") -int xdp_dummy_prog(struct xdp_md *ctx) -{ - return XDP_PASS; -} - -char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/net/xfrm_policy.sh b/tools/testing/selftests/net/xfrm_policy.sh index 4577895306..3eeeeffb40 100755 --- a/tools/testing/selftests/net/xfrm_policy.sh +++ b/tools/testing/selftests/net/xfrm_policy.sh @@ -293,7 +293,7 @@ check_random_order() local ns=$1 local log=$2 - for i in $(seq 100); do + for i in $(seq 50); do ip -net $ns xfrm policy flush for j in $(seq 0 16 255 | sort -R); do ip -net $ns xfrm policy add dst $j.0.0.0/24 dir out priority 10 action allow @@ -306,7 +306,7 @@ check_random_order() done done - for i in $(seq 100); do + for i in $(seq 50); do ip -net $ns xfrm policy flush for j in $(seq 0 16 255 | sort -R); do local addr=$(printf "e000:0000:%02x00::/56" $j) diff --git a/tools/testing/selftests/netfilter/.gitignore b/tools/testing/selftests/netfilter/.gitignore deleted file mode 100644 index c2229b3e40..0000000000 --- a/tools/testing/selftests/netfilter/.gitignore +++ /dev/null @@ -1,6 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -nf-queue -connect_close -audit_logread -conntrack_dump_flush -sctp_collision diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile deleted file mode 100644 index 936c3085bb..0000000000 --- a/tools/testing/selftests/netfilter/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# Makefile for netfilter selftests - -TEST_PROGS := nft_trans_stress.sh nft_fib.sh nft_nat.sh bridge_brouter.sh \ - conntrack_icmp_related.sh nft_flowtable.sh ipvs.sh \ - nft_concat_range.sh nft_conntrack_helper.sh \ - nft_queue.sh nft_meta.sh nf_nat_edemux.sh \ - ipip-conntrack-mtu.sh conntrack_tcp_unreplied.sh \ - conntrack_vrf.sh nft_synproxy.sh rpath.sh nft_audit.sh \ - conntrack_sctp_collision.sh xt_string.sh \ - bridge_netfilter.sh - -HOSTPKG_CONFIG := pkg-config - 
-CFLAGS += $(shell $(HOSTPKG_CONFIG) --cflags libmnl 2>/dev/null) -LDLIBS += $(shell $(HOSTPKG_CONFIG) --libs libmnl 2>/dev/null || echo -lmnl) - -TEST_GEN_FILES = nf-queue connect_close audit_logread sctp_collision \ - conntrack_dump_flush - -include ../lib.mk diff --git a/tools/testing/selftests/netfilter/audit_logread.c b/tools/testing/selftests/netfilter/audit_logread.c deleted file mode 100644 index a0a880fc2d..0000000000 --- a/tools/testing/selftests/netfilter/audit_logread.c +++ /dev/null @@ -1,165 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static int fd; - -#define MAX_AUDIT_MESSAGE_LENGTH 8970 -struct audit_message { - struct nlmsghdr nlh; - union { - struct audit_status s; - char data[MAX_AUDIT_MESSAGE_LENGTH]; - } u; -}; - -int audit_recv(int fd, struct audit_message *rep) -{ - struct sockaddr_nl addr; - socklen_t addrlen = sizeof(addr); - int ret; - - do { - ret = recvfrom(fd, rep, sizeof(*rep), 0, - (struct sockaddr *)&addr, &addrlen); - } while (ret < 0 && errno == EINTR); - - if (ret < 0 || - addrlen != sizeof(addr) || - addr.nl_pid != 0 || - rep->nlh.nlmsg_type == NLMSG_ERROR) /* short-cut for now */ - return -1; - - return ret; -} - -int audit_send(int fd, uint16_t type, uint32_t key, uint32_t val) -{ - static int seq = 0; - struct audit_message msg = { - .nlh = { - .nlmsg_len = NLMSG_SPACE(sizeof(msg.u.s)), - .nlmsg_type = type, - .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK, - .nlmsg_seq = ++seq, - }, - .u.s = { - .mask = key, - .enabled = key == AUDIT_STATUS_ENABLED ? val : 0, - .pid = key == AUDIT_STATUS_PID ? 
val : 0, - } - }; - struct sockaddr_nl addr = { - .nl_family = AF_NETLINK, - }; - int ret; - - do { - ret = sendto(fd, &msg, msg.nlh.nlmsg_len, 0, - (struct sockaddr *)&addr, sizeof(addr)); - } while (ret < 0 && errno == EINTR); - - if (ret != (int)msg.nlh.nlmsg_len) - return -1; - return 0; -} - -int audit_set(int fd, uint32_t key, uint32_t val) -{ - struct audit_message rep = { 0 }; - int ret; - - ret = audit_send(fd, AUDIT_SET, key, val); - if (ret) - return ret; - - ret = audit_recv(fd, &rep); - if (ret < 0) - return ret; - return 0; -} - -int readlog(int fd) -{ - struct audit_message rep = { 0 }; - int ret = audit_recv(fd, &rep); - const char *sep = ""; - char *k, *v; - - if (ret < 0) - return ret; - - if (rep.nlh.nlmsg_type != AUDIT_NETFILTER_CFG) - return 0; - - /* skip the initial "audit(...): " part */ - strtok(rep.u.data, " "); - - while ((k = strtok(NULL, "="))) { - v = strtok(NULL, " "); - - /* these vary and/or are uninteresting, ignore */ - if (!strcmp(k, "pid") || - !strcmp(k, "comm") || - !strcmp(k, "subj")) - continue; - - /* strip the varying sequence number */ - if (!strcmp(k, "table")) - *strchrnul(v, ':') = '\0'; - - printf("%s%s=%s", sep, k, v); - sep = " "; - } - if (*sep) { - printf("\n"); - fflush(stdout); - } - return 0; -} - -void cleanup(int sig) -{ - audit_set(fd, AUDIT_STATUS_ENABLED, 0); - close(fd); - if (sig) - exit(0); -} - -int main(int argc, char **argv) -{ - struct sigaction act = { - .sa_handler = cleanup, - }; - - fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_AUDIT); - if (fd < 0) { - perror("Can't open netlink socket"); - return -1; - } - - if (sigaction(SIGTERM, &act, NULL) < 0 || - sigaction(SIGINT, &act, NULL) < 0) { - perror("Can't set signal handler"); - close(fd); - return -1; - } - - audit_set(fd, AUDIT_STATUS_ENABLED, 1); - audit_set(fd, AUDIT_STATUS_PID, getpid()); - - while (1) - readlog(fd); -} diff --git a/tools/testing/selftests/netfilter/bridge_brouter.sh b/tools/testing/selftests/netfilter/bridge_brouter.sh deleted 
file mode 100755 index 29f3955b9a..0000000000 --- a/tools/testing/selftests/netfilter/bridge_brouter.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/bin/bash -# -# This test is for bridge 'brouting', i.e. make some packets being routed -# rather than getting bridged even though they arrive on interface that is -# part of a bridge. - -# eth0 br0 eth0 -# setup is: ns1 <-> ns0 <-> ns2 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -ebtables -V > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ebtables" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ip netns add ns0 -ip netns add ns1 -ip netns add ns2 - -ip link add veth0 netns ns0 type veth peer name eth0 netns ns1 -if [ $? -ne 0 ]; then - echo "SKIP: Can't create veth device" - exit $ksft_skip -fi -ip link add veth1 netns ns0 type veth peer name eth0 netns ns2 - -ip -net ns0 link set lo up -ip -net ns0 link set veth0 up -ip -net ns0 link set veth1 up - -ip -net ns0 link add br0 type bridge -if [ $? -ne 0 ]; then - echo "SKIP: Can't create bridge br0" - exit $ksft_skip -fi - -ip -net ns0 link set veth0 master br0 -ip -net ns0 link set veth1 master br0 -ip -net ns0 link set br0 up -ip -net ns0 addr add 10.0.0.1/24 dev br0 - -# place both in same subnet, ns1 and ns2 connected via ns0:br0 -for i in 1 2; do - ip -net ns$i link set lo up - ip -net ns$i link set eth0 up - ip -net ns$i addr add 10.0.0.1$i/24 dev eth0 -done - -test_ebtables_broute() -{ - local cipt - - # redirect is needed so the dstmac is rewritten to the bridge itself, - # ip stack won't process OTHERHOST (foreign unicast mac) packets. - ip netns exec ns0 ebtables -t broute -A BROUTING -p ipv4 --ip-protocol icmp -j redirect --redirect-target=DROP - if [ $? 
-ne 0 ]; then - echo "SKIP: Could not add ebtables broute redirect rule" - return $ksft_skip - fi - - # ping netns1, expected to not work (ip forwarding is off) - ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null 2>&1 - if [ $? -eq 0 ]; then - echo "ERROR: ping works, should have failed" 1>&2 - return 1 - fi - - # enable forwarding on both interfaces. - # neither needs an ip address, but at least the bridge needs - # an ip address in same network segment as ns1 and ns2 (ns0 - # needs to be able to determine route for to-be-forwarded packet). - ip netns exec ns0 sysctl -q net.ipv4.conf.veth0.forwarding=1 - ip netns exec ns0 sysctl -q net.ipv4.conf.veth1.forwarding=1 - - sleep 1 - - ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null - if [ $? -ne 0 ]; then - echo "ERROR: ping did not work, but it should (broute+forward)" 1>&2 - return 1 - fi - - echo "PASS: ns1/ns2 connectivity with active broute rule" - ip netns exec ns0 ebtables -t broute -F - - # ping netns1, expected to work (frames are bridged) - ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null - if [ $? -ne 0 ]; then - echo "ERROR: ping did not work, but it should (bridged)" 1>&2 - return 1 - fi - - ip netns exec ns0 ebtables -t filter -A FORWARD -p ipv4 --ip-protocol icmp -j DROP - - # ping netns1, expected to not work (DROP in bridge forward) - ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null 2>&1 - if [ $? -eq 0 ]; then - echo "ERROR: ping works, should have failed (icmp forward drop)" 1>&2 - return 1 - fi - - # re-activate brouter - ip netns exec ns0 ebtables -t broute -A BROUTING -p ipv4 --ip-protocol icmp -j redirect --redirect-target=DROP - - ip netns exec ns2 ping -q -c 1 10.0.0.11 > /dev/null - if [ $? 
-ne 0 ]; then - echo "ERROR: ping did not work, but it should (broute+forward 2)" 1>&2 - return 1 - fi - - echo "PASS: ns1/ns2 connectivity with active broute rule and bridge forward drop" - return 0 -} - -# test basic connectivity -ip netns exec ns1 ping -c 1 -q 10.0.0.12 > /dev/null -if [ $? -ne 0 ]; then - echo "ERROR: Could not reach ns2 from ns1" 1>&2 - ret=1 -fi - -ip netns exec ns2 ping -c 1 -q 10.0.0.11 > /dev/null -if [ $? -ne 0 ]; then - echo "ERROR: Could not reach ns1 from ns2" 1>&2 - ret=1 -fi - -if [ $ret -eq 0 ];then - echo "PASS: netns connectivity: ns1 and ns2 can reach each other" -fi - -test_ebtables_broute -ret=$? -for i in 0 1 2; do ip netns del ns$i;done - -exit $ret diff --git a/tools/testing/selftests/netfilter/bridge_netfilter.sh b/tools/testing/selftests/netfilter/bridge_netfilter.sh deleted file mode 100644 index 659b3ab02c..0000000000 --- a/tools/testing/selftests/netfilter/bridge_netfilter.sh +++ /dev/null @@ -1,188 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Test bridge netfilter + conntrack, a combination that doesn't really work, -# with multicast/broadcast packets racing for hash table insertion. - -# eth0 br0 eth0 -# setup is: ns1 <->,ns0 <-> ns3 -# ns2 <-' `'-> ns4 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -sfx=$(mktemp -u "XXXXXXXX") -ns0="ns0-$sfx" -ns1="ns1-$sfx" -ns2="ns2-$sfx" -ns3="ns3-$sfx" -ns4="ns4-$sfx" - -ebtables -V > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ebtables" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -for i in $(seq 0 4); do - eval ip netns add \$ns$i -done - -cleanup() { - for i in $(seq 0 4); do eval ip netns del \$ns$i;done -} - -trap cleanup EXIT - -do_ping() -{ - fromns="$1" - dstip="$2" - - ip netns exec $fromns ping -c 1 -q $dstip > /dev/null - if [ $? 
-ne 0 ]; then - echo "ERROR: ping from $fromns to $dstip" - ip netns exec ${ns0} nft list ruleset - ret=1 - fi -} - -bcast_ping() -{ - fromns="$1" - dstip="$2" - - for i in $(seq 1 1000); do - ip netns exec $fromns ping -q -f -b -c 1 -q $dstip > /dev/null 2>&1 - if [ $? -ne 0 ]; then - echo "ERROR: ping -b from $fromns to $dstip" - ip netns exec ${ns0} nft list ruleset - fi - done -} - -ip link add veth1 netns ${ns0} type veth peer name eth0 netns ${ns1} -if [ $? -ne 0 ]; then - echo "SKIP: Can't create veth device" - exit $ksft_skip -fi - -ip link add veth2 netns ${ns0} type veth peer name eth0 netns $ns2 -ip link add veth3 netns ${ns0} type veth peer name eth0 netns $ns3 -ip link add veth4 netns ${ns0} type veth peer name eth0 netns $ns4 - -ip -net ${ns0} link set lo up - -for i in $(seq 1 4); do - ip -net ${ns0} link set veth$i up -done - -ip -net ${ns0} link add br0 type bridge stp_state 0 forward_delay 0 nf_call_iptables 1 nf_call_ip6tables 1 nf_call_arptables 1 -if [ $? -ne 0 ]; then - echo "SKIP: Can't create bridge br0" - exit $ksft_skip -fi - -# make veth0,1,2 part of bridge. -for i in $(seq 1 3); do - ip -net ${ns0} link set veth$i master br0 -done - -# add a macvlan on top of the bridge. -MACVLAN_ADDR=ba:f3:13:37:42:23 -ip -net ${ns0} link add link br0 name macvlan0 type macvlan mode private -ip -net ${ns0} link set macvlan0 address ${MACVLAN_ADDR} -ip -net ${ns0} link set macvlan0 up -ip -net ${ns0} addr add 10.23.0.1/24 dev macvlan0 - -# add a macvlan on top of veth4. -MACVLAN_ADDR=ba:f3:13:37:42:24 -ip -net ${ns0} link add link veth4 name macvlan4 type macvlan mode vepa -ip -net ${ns0} link set macvlan4 address ${MACVLAN_ADDR} -ip -net ${ns0} link set macvlan4 up - -# make the macvlan part of the bridge. -# veth4 is not a bridge port, only the macvlan on top of it. 
-ip -net ${ns0} link set macvlan4 master br0 - -ip -net ${ns0} link set br0 up -ip -net ${ns0} addr add 10.0.0.1/24 dev br0 -ip netns exec ${ns0} sysctl -q net.bridge.bridge-nf-call-iptables=1 -ret=$? -if [ $ret -ne 0 ] ; then - echo "SKIP: bridge netfilter not available" - ret=$ksft_skip -fi - -# for testing, so namespaces will reply to ping -b probes. -ip netns exec ${ns0} sysctl -q net.ipv4.icmp_echo_ignore_broadcasts=0 - -# enable conntrack in ns0 and drop broadcast packets in forward to -# avoid them from getting confirmed in the postrouting hook before -# the cloned skb is passed up the stack. -ip netns exec ${ns0} nft -f - < -#include -#include -#include -#include -#include - -#include -#include - -#define PORT 12345 -#define RUNTIME 10 - -static struct { - unsigned int timeout; - unsigned int port; -} opts = { - .timeout = RUNTIME, - .port = PORT, -}; - -static void handler(int sig) -{ - _exit(sig == SIGALRM ? 0 : 1); -} - -static void set_timeout(void) -{ - struct sigaction action = { - .sa_handler = handler, - }; - - sigaction(SIGALRM, &action, NULL); - - alarm(opts.timeout); -} - -static void do_connect(const struct sockaddr_in *dst) -{ - int s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); - - if (s >= 0) - fcntl(s, F_SETFL, O_NONBLOCK); - - connect(s, (struct sockaddr *)dst, sizeof(*dst)); - close(s); -} - -static void do_accept(const struct sockaddr_in *src) -{ - int c, one = 1, s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); - - if (s < 0) - return; - - setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)); - setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)); - - bind(s, (struct sockaddr *)src, sizeof(*src)); - - listen(s, 16); - - c = accept(s, NULL, NULL); - if (c >= 0) - close(c); - - close(s); -} - -static int accept_loop(void) -{ - struct sockaddr_in src = { - .sin_family = AF_INET, - .sin_port = htons(opts.port), - }; - - inet_pton(AF_INET, "127.0.0.1", &src.sin_addr); - - set_timeout(); - - for (;;) - do_accept(&src); - - return 1; 
-} - -static int connect_loop(void) -{ - struct sockaddr_in dst = { - .sin_family = AF_INET, - .sin_port = htons(opts.port), - }; - - inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr); - - set_timeout(); - - for (;;) - do_connect(&dst); - - return 1; -} - -static void parse_opts(int argc, char **argv) -{ - int c; - - while ((c = getopt(argc, argv, "t:p:")) != -1) { - switch (c) { - case 't': - opts.timeout = atoi(optarg); - break; - case 'p': - opts.port = atoi(optarg); - break; - } - } -} - -int main(int argc, char *argv[]) -{ - pid_t p; - - parse_opts(argc, argv); - - p = fork(); - if (p < 0) - return 111; - - if (p > 0) - return accept_loop(); - - return connect_loop(); -} diff --git a/tools/testing/selftests/netfilter/conntrack_dump_flush.c b/tools/testing/selftests/netfilter/conntrack_dump_flush.c deleted file mode 100644 index b11ea8ee67..0000000000 --- a/tools/testing/selftests/netfilter/conntrack_dump_flush.c +++ /dev/null @@ -1,471 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#define _GNU_SOURCE - -#include -#include -#include - -#include -#include -#include -#include -#include "../kselftest_harness.h" - -#define TEST_ZONE_ID 123 -#define NF_CT_DEFAULT_ZONE_ID 0 - -static int reply_counter; - -static int build_cta_tuple_v4(struct nlmsghdr *nlh, int type, - uint32_t src_ip, uint32_t dst_ip, - uint16_t src_port, uint16_t dst_port) -{ - struct nlattr *nest, *nest_ip, *nest_proto; - - nest = mnl_attr_nest_start(nlh, type); - if (!nest) - return -1; - - nest_ip = mnl_attr_nest_start(nlh, CTA_TUPLE_IP); - if (!nest_ip) - return -1; - mnl_attr_put_u32(nlh, CTA_IP_V4_SRC, src_ip); - mnl_attr_put_u32(nlh, CTA_IP_V4_DST, dst_ip); - mnl_attr_nest_end(nlh, nest_ip); - - nest_proto = mnl_attr_nest_start(nlh, CTA_TUPLE_PROTO); - if (!nest_proto) - return -1; - mnl_attr_put_u8(nlh, CTA_PROTO_NUM, 6); - mnl_attr_put_u16(nlh, CTA_PROTO_SRC_PORT, htons(src_port)); - mnl_attr_put_u16(nlh, CTA_PROTO_DST_PORT, htons(dst_port)); - mnl_attr_nest_end(nlh, nest_proto); - - 
mnl_attr_nest_end(nlh, nest); -} - -static int build_cta_tuple_v6(struct nlmsghdr *nlh, int type, - struct in6_addr src_ip, struct in6_addr dst_ip, - uint16_t src_port, uint16_t dst_port) -{ - struct nlattr *nest, *nest_ip, *nest_proto; - - nest = mnl_attr_nest_start(nlh, type); - if (!nest) - return -1; - - nest_ip = mnl_attr_nest_start(nlh, CTA_TUPLE_IP); - if (!nest_ip) - return -1; - mnl_attr_put(nlh, CTA_IP_V6_SRC, sizeof(struct in6_addr), &src_ip); - mnl_attr_put(nlh, CTA_IP_V6_DST, sizeof(struct in6_addr), &dst_ip); - mnl_attr_nest_end(nlh, nest_ip); - - nest_proto = mnl_attr_nest_start(nlh, CTA_TUPLE_PROTO); - if (!nest_proto) - return -1; - mnl_attr_put_u8(nlh, CTA_PROTO_NUM, 6); - mnl_attr_put_u16(nlh, CTA_PROTO_SRC_PORT, htons(src_port)); - mnl_attr_put_u16(nlh, CTA_PROTO_DST_PORT, htons(dst_port)); - mnl_attr_nest_end(nlh, nest_proto); - - mnl_attr_nest_end(nlh, nest); -} - -static int build_cta_proto(struct nlmsghdr *nlh) -{ - struct nlattr *nest, *nest_proto; - - nest = mnl_attr_nest_start(nlh, CTA_PROTOINFO); - if (!nest) - return -1; - - nest_proto = mnl_attr_nest_start(nlh, CTA_PROTOINFO_TCP); - if (!nest_proto) - return -1; - mnl_attr_put_u8(nlh, CTA_PROTOINFO_TCP_STATE, TCP_CONNTRACK_ESTABLISHED); - mnl_attr_put_u16(nlh, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL, 0x0a0a); - mnl_attr_put_u16(nlh, CTA_PROTOINFO_TCP_FLAGS_REPLY, 0x0a0a); - mnl_attr_nest_end(nlh, nest_proto); - - mnl_attr_nest_end(nlh, nest); -} - -static int conntrack_data_insert(struct mnl_socket *sock, struct nlmsghdr *nlh, - uint16_t zone) -{ - char buf[MNL_SOCKET_BUFFER_SIZE]; - struct nlmsghdr *rplnlh; - unsigned int portid; - int err, ret; - - portid = mnl_socket_get_portid(sock); - - ret = build_cta_proto(nlh); - if (ret < 0) { - perror("build_cta_proto"); - return -1; - } - mnl_attr_put_u32(nlh, CTA_TIMEOUT, htonl(20000)); - mnl_attr_put_u16(nlh, CTA_ZONE, htons(zone)); - - if (mnl_socket_sendto(sock, nlh, nlh->nlmsg_len) < 0) { - perror("mnl_socket_sendto"); - return -1; - } - - ret 
= mnl_socket_recvfrom(sock, buf, MNL_SOCKET_BUFFER_SIZE); - if (ret < 0) { - perror("mnl_socket_recvfrom"); - return ret; - } - - ret = mnl_cb_run(buf, ret, nlh->nlmsg_seq, portid, NULL, NULL); - if (ret < 0) { - if (errno == EEXIST) { - /* The entries are probably still there from a previous - * run. So we are good - */ - return 0; - } - perror("mnl_cb_run"); - return ret; - } - - return 0; -} - -static int conntrack_data_generate_v4(struct mnl_socket *sock, uint32_t src_ip, - uint32_t dst_ip, uint16_t zone) -{ - char buf[MNL_SOCKET_BUFFER_SIZE]; - struct nlmsghdr *nlh; - struct nfgenmsg *nfh; - int ret; - - nlh = mnl_nlmsg_put_header(buf); - nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_NEW; - nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | - NLM_F_ACK | NLM_F_EXCL; - nlh->nlmsg_seq = time(NULL); - - nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(struct nfgenmsg)); - nfh->nfgen_family = AF_INET; - nfh->version = NFNETLINK_V0; - nfh->res_id = 0; - - ret = build_cta_tuple_v4(nlh, CTA_TUPLE_ORIG, src_ip, dst_ip, 12345, 443); - if (ret < 0) { - perror("build_cta_tuple_v4"); - return ret; - } - ret = build_cta_tuple_v4(nlh, CTA_TUPLE_REPLY, dst_ip, src_ip, 443, 12345); - if (ret < 0) { - perror("build_cta_tuple_v4"); - return ret; - } - return conntrack_data_insert(sock, nlh, zone); -} - -static int conntrack_data_generate_v6(struct mnl_socket *sock, - struct in6_addr src_ip, - struct in6_addr dst_ip, - uint16_t zone) -{ - char buf[MNL_SOCKET_BUFFER_SIZE]; - struct nlmsghdr *nlh; - struct nfgenmsg *nfh; - int ret; - - nlh = mnl_nlmsg_put_header(buf); - nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_NEW; - nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | - NLM_F_ACK | NLM_F_EXCL; - nlh->nlmsg_seq = time(NULL); - - nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(struct nfgenmsg)); - nfh->nfgen_family = AF_INET6; - nfh->version = NFNETLINK_V0; - nfh->res_id = 0; - - ret = build_cta_tuple_v6(nlh, CTA_TUPLE_ORIG, src_ip, dst_ip, - 12345, 
443); - if (ret < 0) { - perror("build_cta_tuple_v6"); - return ret; - } - ret = build_cta_tuple_v6(nlh, CTA_TUPLE_REPLY, dst_ip, src_ip, - 12345, 443); - if (ret < 0) { - perror("build_cta_tuple_v6"); - return ret; - } - return conntrack_data_insert(sock, nlh, zone); -} - -static int count_entries(const struct nlmsghdr *nlh, void *data) -{ - reply_counter++; -} - -static int conntracK_count_zone(struct mnl_socket *sock, uint16_t zone) -{ - char buf[MNL_SOCKET_BUFFER_SIZE]; - struct nlmsghdr *nlh, *rplnlh; - struct nfgenmsg *nfh; - struct nlattr *nest; - unsigned int portid; - int err, ret; - - portid = mnl_socket_get_portid(sock); - - nlh = mnl_nlmsg_put_header(buf); - nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_GET; - nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; - nlh->nlmsg_seq = time(NULL); - - nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(struct nfgenmsg)); - nfh->nfgen_family = AF_UNSPEC; - nfh->version = NFNETLINK_V0; - nfh->res_id = 0; - - mnl_attr_put_u16(nlh, CTA_ZONE, htons(zone)); - - ret = mnl_socket_sendto(sock, nlh, nlh->nlmsg_len); - if (ret < 0) { - perror("mnl_socket_sendto"); - return ret; - } - - reply_counter = 0; - ret = mnl_socket_recvfrom(sock, buf, MNL_SOCKET_BUFFER_SIZE); - while (ret > 0) { - ret = mnl_cb_run(buf, ret, nlh->nlmsg_seq, portid, - count_entries, NULL); - if (ret <= MNL_CB_STOP) - break; - - ret = mnl_socket_recvfrom(sock, buf, MNL_SOCKET_BUFFER_SIZE); - } - if (ret < 0) { - perror("mnl_socket_recvfrom"); - return ret; - } - - return reply_counter; -} - -static int conntrack_flush_zone(struct mnl_socket *sock, uint16_t zone) -{ - char buf[MNL_SOCKET_BUFFER_SIZE]; - struct nlmsghdr *nlh, *rplnlh; - struct nfgenmsg *nfh; - struct nlattr *nest; - unsigned int portid; - int err, ret; - - portid = mnl_socket_get_portid(sock); - - nlh = mnl_nlmsg_put_header(buf); - nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_DELETE; - nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; - nlh->nlmsg_seq = time(NULL); 
- - nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(struct nfgenmsg)); - nfh->nfgen_family = AF_UNSPEC; - nfh->version = NFNETLINK_V0; - nfh->res_id = 0; - - mnl_attr_put_u16(nlh, CTA_ZONE, htons(zone)); - - ret = mnl_socket_sendto(sock, nlh, nlh->nlmsg_len); - if (ret < 0) { - perror("mnl_socket_sendto"); - return ret; - } - - ret = mnl_socket_recvfrom(sock, buf, MNL_SOCKET_BUFFER_SIZE); - if (ret < 0) { - perror("mnl_socket_recvfrom"); - return ret; - } - - ret = mnl_cb_run(buf, ret, nlh->nlmsg_seq, portid, NULL, NULL); - if (ret < 0) { - perror("mnl_cb_run"); - return ret; - } - - return 0; -} - -FIXTURE(conntrack_dump_flush) -{ - struct mnl_socket *sock; -}; - -FIXTURE_SETUP(conntrack_dump_flush) -{ - struct in6_addr src, dst; - int ret; - - self->sock = mnl_socket_open(NETLINK_NETFILTER); - if (!self->sock) { - perror("mnl_socket_open"); - exit(EXIT_FAILURE); - } - - if (mnl_socket_bind(self->sock, 0, MNL_SOCKET_AUTOPID) < 0) { - perror("mnl_socket_bind"); - exit(EXIT_FAILURE); - } - - ret = conntracK_count_zone(self->sock, TEST_ZONE_ID); - if (ret < 0 && errno == EPERM) - SKIP(return, "Needs to be run as root"); - else if (ret < 0 && errno == EOPNOTSUPP) - SKIP(return, "Kernel does not seem to support conntrack zones"); - - ret = conntrack_data_generate_v4(self->sock, 0xf0f0f0f0, 0xf1f1f1f1, - TEST_ZONE_ID); - EXPECT_EQ(ret, 0); - ret = conntrack_data_generate_v4(self->sock, 0xf2f2f2f2, 0xf3f3f3f3, - TEST_ZONE_ID + 1); - EXPECT_EQ(ret, 0); - ret = conntrack_data_generate_v4(self->sock, 0xf4f4f4f4, 0xf5f5f5f5, - TEST_ZONE_ID + 2); - EXPECT_EQ(ret, 0); - ret = conntrack_data_generate_v4(self->sock, 0xf6f6f6f6, 0xf7f7f7f7, - NF_CT_DEFAULT_ZONE_ID); - EXPECT_EQ(ret, 0); - - src = (struct in6_addr) {{ - .__u6_addr32 = { - 0xb80d0120, - 0x00000000, - 0x00000000, - 0x01000000 - } - }}; - dst = (struct in6_addr) {{ - .__u6_addr32 = { - 0xb80d0120, - 0x00000000, - 0x00000000, - 0x02000000 - } - }}; - ret = conntrack_data_generate_v6(self->sock, src, dst, - TEST_ZONE_ID); 
- EXPECT_EQ(ret, 0); - src = (struct in6_addr) {{ - .__u6_addr32 = { - 0xb80d0120, - 0x00000000, - 0x00000000, - 0x03000000 - } - }}; - dst = (struct in6_addr) {{ - .__u6_addr32 = { - 0xb80d0120, - 0x00000000, - 0x00000000, - 0x04000000 - } - }}; - ret = conntrack_data_generate_v6(self->sock, src, dst, - TEST_ZONE_ID + 1); - EXPECT_EQ(ret, 0); - src = (struct in6_addr) {{ - .__u6_addr32 = { - 0xb80d0120, - 0x00000000, - 0x00000000, - 0x05000000 - } - }}; - dst = (struct in6_addr) {{ - .__u6_addr32 = { - 0xb80d0120, - 0x00000000, - 0x00000000, - 0x06000000 - } - }}; - ret = conntrack_data_generate_v6(self->sock, src, dst, - TEST_ZONE_ID + 2); - EXPECT_EQ(ret, 0); - - src = (struct in6_addr) {{ - .__u6_addr32 = { - 0xb80d0120, - 0x00000000, - 0x00000000, - 0x07000000 - } - }}; - dst = (struct in6_addr) {{ - .__u6_addr32 = { - 0xb80d0120, - 0x00000000, - 0x00000000, - 0x08000000 - } - }}; - ret = conntrack_data_generate_v6(self->sock, src, dst, - NF_CT_DEFAULT_ZONE_ID); - EXPECT_EQ(ret, 0); - - ret = conntracK_count_zone(self->sock, TEST_ZONE_ID); - EXPECT_GE(ret, 2); - if (ret > 2) - SKIP(return, "kernel does not support filtering by zone"); -} - -FIXTURE_TEARDOWN(conntrack_dump_flush) -{ -} - -TEST_F(conntrack_dump_flush, test_dump_by_zone) -{ - int ret; - - ret = conntracK_count_zone(self->sock, TEST_ZONE_ID); - EXPECT_EQ(ret, 2); -} - -TEST_F(conntrack_dump_flush, test_flush_by_zone) -{ - int ret; - - ret = conntrack_flush_zone(self->sock, TEST_ZONE_ID); - EXPECT_EQ(ret, 0); - ret = conntracK_count_zone(self->sock, TEST_ZONE_ID); - EXPECT_EQ(ret, 0); - ret = conntracK_count_zone(self->sock, TEST_ZONE_ID + 1); - EXPECT_EQ(ret, 2); - ret = conntracK_count_zone(self->sock, TEST_ZONE_ID + 2); - EXPECT_EQ(ret, 2); - ret = conntracK_count_zone(self->sock, NF_CT_DEFAULT_ZONE_ID); - EXPECT_EQ(ret, 2); -} - -TEST_F(conntrack_dump_flush, test_flush_by_zone_default) -{ - int ret; - - ret = conntrack_flush_zone(self->sock, NF_CT_DEFAULT_ZONE_ID); - EXPECT_EQ(ret, 0); - ret = 
conntracK_count_zone(self->sock, TEST_ZONE_ID); - EXPECT_EQ(ret, 2); - ret = conntracK_count_zone(self->sock, TEST_ZONE_ID + 1); - EXPECT_EQ(ret, 2); - ret = conntracK_count_zone(self->sock, TEST_ZONE_ID + 2); - EXPECT_EQ(ret, 2); - ret = conntracK_count_zone(self->sock, NF_CT_DEFAULT_ZONE_ID); - EXPECT_EQ(ret, 0); -} - -TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/netfilter/conntrack_icmp_related.sh b/tools/testing/selftests/netfilter/conntrack_icmp_related.sh deleted file mode 100755 index 76645aaf2b..0000000000 --- a/tools/testing/selftests/netfilter/conntrack_icmp_related.sh +++ /dev/null @@ -1,315 +0,0 @@ -#!/bin/bash -# -# check that ICMP df-needed/pkttoobig icmp are set are set as related -# state -# -# Setup is: -# -# nsclient1 -> nsrouter1 -> nsrouter2 -> nsclient2 -# MTU 1500, except for nsrouter2 <-> nsclient2 link (1280). -# ping nsclient2 from nsclient1, checking that conntrack did set RELATED -# 'fragmentation needed' icmp packet. -# -# In addition, nsrouter1 will perform IP masquerading, i.e. also -# check the icmp errors are propagated to the correct host as per -# nat of "established" icmp-echo "connection". - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -cleanup() { - for i in 1 2;do ip netns del nsclient$i;done - for i in 1 2;do ip netns del nsrouter$i;done -} - -trap cleanup EXIT - -ipv4() { - echo -n 192.168.$1.2 -} - -ipv6 () { - echo -n dead:$1::2 -} - -check_counter() -{ - ns=$1 - name=$2 - expect=$3 - local lret=0 - - cnt=$(ip netns exec $ns nft list counter inet filter "$name" | grep -q "$expect") - if [ $? 
-ne 0 ]; then - echo "ERROR: counter $name in $ns has unexpected value (expected $expect)" 1>&2 - ip netns exec $ns nft list counter inet filter "$name" 1>&2 - lret=1 - fi - - return $lret -} - -check_unknown() -{ - expect="packets 0 bytes 0" - for n in nsclient1 nsclient2 nsrouter1 nsrouter2; do - check_counter $n "unknown" "$expect" - if [ $? -ne 0 ] ;then - return 1 - fi - done - - return 0 -} - -for n in nsclient1 nsclient2 nsrouter1 nsrouter2; do - ip netns add $n - ip -net $n link set lo up -done - -DEV=veth0 -ip link add $DEV netns nsclient1 type veth peer name eth1 netns nsrouter1 -DEV=veth0 -ip link add $DEV netns nsclient2 type veth peer name eth1 netns nsrouter2 - -DEV=veth0 -ip link add $DEV netns nsrouter1 type veth peer name eth2 netns nsrouter2 - -DEV=veth0 -for i in 1 2; do - ip -net nsclient$i link set $DEV up - ip -net nsclient$i addr add $(ipv4 $i)/24 dev $DEV - ip -net nsclient$i addr add $(ipv6 $i)/64 dev $DEV -done - -ip -net nsrouter1 link set eth1 up -ip -net nsrouter1 link set veth0 up - -ip -net nsrouter2 link set eth1 up -ip -net nsrouter2 link set eth2 up - -ip -net nsclient1 route add default via 192.168.1.1 -ip -net nsclient1 -6 route add default via dead:1::1 - -ip -net nsclient2 route add default via 192.168.2.1 -ip -net nsclient2 route add default via dead:2::1 - -i=3 -ip -net nsrouter1 addr add 192.168.1.1/24 dev eth1 -ip -net nsrouter1 addr add 192.168.3.1/24 dev veth0 -ip -net nsrouter1 addr add dead:1::1/64 dev eth1 -ip -net nsrouter1 addr add dead:3::1/64 dev veth0 -ip -net nsrouter1 route add default via 192.168.3.10 -ip -net nsrouter1 -6 route add default via dead:3::10 - -ip -net nsrouter2 addr add 192.168.2.1/24 dev eth1 -ip -net nsrouter2 addr add 192.168.3.10/24 dev eth2 -ip -net nsrouter2 addr add dead:2::1/64 dev eth1 -ip -net nsrouter2 addr add dead:3::10/64 dev eth2 -ip -net nsrouter2 route add default via 192.168.3.1 -ip -net nsrouter2 route add default via dead:3::1 - -sleep 2 -for i in 4 6; do - ip netns exec 
nsrouter1 sysctl -q net.ipv$i.conf.all.forwarding=1 - ip netns exec nsrouter2 sysctl -q net.ipv$i.conf.all.forwarding=1 -done - -for netns in nsrouter1 nsrouter2; do -ip netns exec $netns nft -f - </dev/null -if [ $? -ne 0 ]; then - echo "ERROR: netns ip routing/connectivity broken" 1>&2 - cleanup - exit 1 -fi -ip netns exec nsclient1 ping6 -q -c 1 -s 1000 dead:2::2 >/dev/null -if [ $? -ne 0 ]; then - echo "ERROR: netns ipv6 routing/connectivity broken" 1>&2 - cleanup - exit 1 -fi - -check_unknown -if [ $? -ne 0 ]; then - ret=1 -fi - -expect="packets 0 bytes 0" -for netns in nsrouter1 nsrouter2 nsclient1;do - check_counter "$netns" "related" "$expect" - if [ $? -ne 0 ]; then - ret=1 - fi -done - -expect="packets 2 bytes 2076" -check_counter nsclient2 "new" "$expect" -if [ $? -ne 0 ]; then - ret=1 -fi - -ip netns exec nsclient1 ping -q -c 1 -s 1300 -M do 192.168.2.2 > /dev/null -if [ $? -eq 0 ]; then - echo "ERROR: ping should have failed with PMTU too big error" 1>&2 - ret=1 -fi - -# nsrouter2 should have generated the icmp error, so -# related counter should be 0 (its in forward). -expect="packets 0 bytes 0" -check_counter "nsrouter2" "related" "$expect" -if [ $? -ne 0 ]; then - ret=1 -fi - -# but nsrouter1 should have seen it, same for nsclient1. -expect="packets 1 bytes 576" -for netns in nsrouter1 nsclient1;do - check_counter "$netns" "related" "$expect" - if [ $? -ne 0 ]; then - ret=1 - fi -done - -ip netns exec nsclient1 ping6 -c 1 -s 1300 dead:2::2 > /dev/null -if [ $? -eq 0 ]; then - echo "ERROR: ping6 should have failed with PMTU too big error" 1>&2 - ret=1 -fi - -expect="packets 2 bytes 1856" -for netns in nsrouter1 nsclient1;do - check_counter "$netns" "related" "$expect" - if [ $? 
-ne 0 ]; then - ret=1 - fi -done - -if [ $ret -eq 0 ];then - echo "PASS: icmp mtu error had RELATED state" -else - echo "ERROR: icmp error RELATED state test has failed" -fi - -# add 'bad' route, expect icmp REDIRECT to be generated -ip netns exec nsclient1 ip route add 192.168.1.42 via 192.168.1.1 -ip netns exec nsclient1 ip route add dead:1::42 via dead:1::1 - -ip netns exec "nsclient1" ping -q -c 2 192.168.1.42 > /dev/null - -expect="packets 1 bytes 112" -check_counter nsclient1 "redir4" "$expect" -if [ $? -ne 0 ];then - ret=1 -fi - -ip netns exec "nsclient1" ping -c 1 dead:1::42 > /dev/null -expect="packets 1 bytes 192" -check_counter nsclient1 "redir6" "$expect" -if [ $? -ne 0 ];then - ret=1 -fi - -if [ $ret -eq 0 ];then - echo "PASS: icmp redirects had RELATED state" -else - echo "ERROR: icmp redirect RELATED state test has failed" -fi - -exit $ret diff --git a/tools/testing/selftests/netfilter/conntrack_sctp_collision.sh b/tools/testing/selftests/netfilter/conntrack_sctp_collision.sh deleted file mode 100755 index a924e595cf..0000000000 --- a/tools/testing/selftests/netfilter/conntrack_sctp_collision.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Testing For SCTP COLLISION SCENARIO as Below: -# -# 14:35:47.655279 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [INIT] [init tag: 2017837359] -# 14:35:48.353250 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [INIT] [init tag: 1187206187] -# 14:35:48.353275 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [INIT ACK] [init tag: 2017837359] -# 14:35:48.353283 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [COOKIE ECHO] -# 14:35:48.353977 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [COOKIE ACK] -# 14:35:48.855335 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [INIT ACK] [init tag: 164579970] -# -# TOPO: SERVER_NS (link0)<--->(link1) ROUTER_NS (link2)<--->(link3) CLIENT_NS - -CLIENT_NS=$(mktemp -u client-XXXXXXXX) -CLIENT_IP="198.51.200.1" -CLIENT_PORT=1234 - -SERVER_NS=$(mktemp -u 
server-XXXXXXXX) -SERVER_IP="198.51.100.1" -SERVER_PORT=1234 - -ROUTER_NS=$(mktemp -u router-XXXXXXXX) -CLIENT_GW="198.51.200.2" -SERVER_GW="198.51.100.2" - -# setup the topo -setup() { - ip net add $CLIENT_NS - ip net add $SERVER_NS - ip net add $ROUTER_NS - ip -n $SERVER_NS link add link0 type veth peer name link1 netns $ROUTER_NS - ip -n $CLIENT_NS link add link3 type veth peer name link2 netns $ROUTER_NS - - ip -n $SERVER_NS link set link0 up - ip -n $SERVER_NS addr add $SERVER_IP/24 dev link0 - ip -n $SERVER_NS route add $CLIENT_IP dev link0 via $SERVER_GW - - ip -n $ROUTER_NS link set link1 up - ip -n $ROUTER_NS link set link2 up - ip -n $ROUTER_NS addr add $SERVER_GW/24 dev link1 - ip -n $ROUTER_NS addr add $CLIENT_GW/24 dev link2 - ip net exec $ROUTER_NS sysctl -wq net.ipv4.ip_forward=1 - - ip -n $CLIENT_NS link set link3 up - ip -n $CLIENT_NS addr add $CLIENT_IP/24 dev link3 - ip -n $CLIENT_NS route add $SERVER_IP dev link3 via $CLIENT_GW - - # simulate the delay on OVS upcall by setting up a delay for INIT_ACK with - # tc on $SERVER_NS side - tc -n $SERVER_NS qdisc add dev link0 root handle 1: htb - tc -n $SERVER_NS class add dev link0 parent 1: classid 1:1 htb rate 100mbit - tc -n $SERVER_NS filter add dev link0 parent 1: protocol ip u32 match ip protocol 132 \ - 0xff match u8 2 0xff at 32 flowid 1:1 - tc -n $SERVER_NS qdisc add dev link0 parent 1:1 handle 10: netem delay 1200ms - - # simulate the ctstate check on OVS nf_conntrack - ip net exec $ROUTER_NS iptables -A FORWARD -m state --state INVALID,UNTRACKED -j DROP - ip net exec $ROUTER_NS iptables -A INPUT -p sctp -j DROP - - # use a smaller number for assoc's max_retrans to reproduce the issue - modprobe sctp - ip net exec $CLIENT_NS sysctl -wq net.sctp.association_max_retrans=3 -} - -cleanup() { - ip net exec $CLIENT_NS pkill sctp_collision 2>&1 >/dev/null - ip net exec $SERVER_NS pkill sctp_collision 2>&1 >/dev/null - ip net del "$CLIENT_NS" - ip net del "$SERVER_NS" - ip net del "$ROUTER_NS" -} - 
-do_test() { - ip net exec $SERVER_NS ./sctp_collision server \ - $SERVER_IP $SERVER_PORT $CLIENT_IP $CLIENT_PORT & - ip net exec $CLIENT_NS ./sctp_collision client \ - $CLIENT_IP $CLIENT_PORT $SERVER_IP $SERVER_PORT -} - -# NOTE: one way to work around the issue is set a smaller hb_interval -# ip net exec $CLIENT_NS sysctl -wq net.sctp.hb_interval=3500 - -# run the test case -trap cleanup EXIT -setup && \ -echo "Test for SCTP Collision in nf_conntrack:" && \ -do_test && echo "PASS!" -exit $? diff --git a/tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh b/tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh deleted file mode 100755 index e7d7bf13cf..0000000000 --- a/tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh +++ /dev/null @@ -1,167 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Check that UNREPLIED tcp conntrack will eventually timeout. -# - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -waittime=20 -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -cleanup() { - ip netns pids $ns1 | xargs kill 2>/dev/null - ip netns pids $ns2 | xargs kill 2>/dev/null - - ip netns del $ns1 - ip netns del $ns2 -} - -ipv4() { - echo -n 192.168.$1.2 -} - -check_counter() -{ - ns=$1 - name=$2 - expect=$3 - local lret=0 - - cnt=$(ip netns exec $ns2 nft list counter inet filter "$name" | grep -q "$expect") - if [ $? 
-ne 0 ]; then - echo "ERROR: counter $name in $ns2 has unexpected value (expected $expect)" 1>&2 - ip netns exec $ns2 nft list counter inet filter "$name" 1>&2 - lret=1 - fi - - return $lret -} - -# Create test namespaces -ip netns add $ns1 || exit 1 - -trap cleanup EXIT - -ip netns add $ns2 || exit 1 - -# Connect the namespace to the host using a veth pair -ip -net $ns1 link add name veth1 type veth peer name veth2 -ip -net $ns1 link set netns $ns2 dev veth2 - -ip -net $ns1 link set up dev lo -ip -net $ns2 link set up dev lo -ip -net $ns1 link set up dev veth1 -ip -net $ns2 link set up dev veth2 - -ip -net $ns2 addr add 10.11.11.2/24 dev veth2 -ip -net $ns2 route add default via 10.11.11.1 - -ip netns exec $ns2 sysctl -q net.ipv4.conf.veth2.forwarding=1 - -# add a rule inside NS so we enable conntrack -ip netns exec $ns1 iptables -A INPUT -m state --state established,related -j ACCEPT - -ip -net $ns1 addr add 10.11.11.1/24 dev veth1 -ip -net $ns1 route add 10.99.99.99 via 10.11.11.2 - -# Check connectivity works -ip netns exec $ns1 ping -q -c 2 10.11.11.2 >/dev/null || exit 1 - -ip netns exec $ns2 nc -l -p 8080 < /dev/null & - -# however, conntrack entries are there - -ip netns exec $ns2 nft -f - < $ns2 to the virtual ip" -ip netns exec $ns1 bash -c 'while true ; do - nc -p 60000 10.99.99.99 80 - sleep 1 - done' & - -sleep 1 - -ip netns exec $ns2 nft -f - </dev/null | wc -l) -if [ $count -eq 0 ]; then - echo "ERROR: $ns2 did not pick up tcp connection from peer" - exit 1 -fi - -echo "INFO: NAT redirect added in ns $ns2, waiting for $waittime seconds for nat to take effect" -for i in $(seq 1 $waittime); do - echo -n "." 
- - sleep 1 - - count=$(ip netns exec $ns2 conntrack -L -p tcp --reply-port-src 8080 2>/dev/null | wc -l) - if [ $count -gt 0 ]; then - echo - echo "PASS: redirection took effect after $i seconds" - break - fi - - m=$((i%20)) - if [ $m -eq 0 ]; then - echo " waited for $i seconds" - fi -done - -expect="packets 1 bytes 60" -check_counter "$ns2" "redir" "$expect" -if [ $? -ne 0 ]; then - ret=1 -fi - -if [ $ret -eq 0 ];then - echo "PASS: redirection counter has expected values" -else - echo "ERROR: no tcp connection was redirected" -fi - -exit $ret diff --git a/tools/testing/selftests/netfilter/conntrack_vrf.sh b/tools/testing/selftests/netfilter/conntrack_vrf.sh deleted file mode 100755 index 8b5ea92345..0000000000 --- a/tools/testing/selftests/netfilter/conntrack_vrf.sh +++ /dev/null @@ -1,241 +0,0 @@ -#!/bin/sh - -# This script demonstrates interaction of conntrack and vrf. -# The vrf driver calls the netfilter hooks again, with oif/iif -# pointing at the VRF device. -# -# For ingress, this means first iteration has iifname of lower/real -# device. In this script, thats veth0. -# Second iteration is iifname set to vrf device, tvrf in this script. -# -# For egress, this is reversed: first iteration has the vrf device, -# second iteration is done with the lower/real/veth0 device. -# -# test_ct_zone_in demonstrates unexpected change of nftables -# behavior # caused by commit 09e856d54bda5f28 "vrf: Reset skb conntrack -# connection on VRF rcv" -# -# It was possible to assign conntrack zone to a packet (or mark it for -# `notracking`) in the prerouting chain before conntrack, based on real iif. -# -# After the change, the zone assignment is lost and the zone is assigned based -# on the VRF master interface (in case such a rule exists). -# assignment is lost. Instead, assignment based on the `iif` matching -# Thus it is impossible to distinguish packets based on the original -# interface. 
-# -# test_masquerade_vrf and test_masquerade_veth0 demonstrate the problem -# that was supposed to be fixed by the commit mentioned above to make sure -# that any fix to test case 1 won't break masquerade again. - -ksft_skip=4 - -IP0=172.30.30.1 -IP1=172.30.30.2 -PFXL=30 -ret=0 - -sfx=$(mktemp -u "XXXXXXXX") -ns0="ns0-$sfx" -ns1="ns1-$sfx" - -cleanup() -{ - ip netns pids $ns0 | xargs kill 2>/dev/null - ip netns pids $ns1 | xargs kill 2>/dev/null - - ip netns del $ns0 $ns1 -} - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ip netns add "$ns0" -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace $ns0" - exit $ksft_skip -fi -ip netns add "$ns1" - -trap cleanup EXIT - -ip netns exec $ns0 sysctl -q -w net.ipv4.conf.default.rp_filter=0 -ip netns exec $ns0 sysctl -q -w net.ipv4.conf.all.rp_filter=0 -ip netns exec $ns0 sysctl -q -w net.ipv4.conf.all.rp_filter=0 - -ip link add veth0 netns "$ns0" type veth peer name veth0 netns "$ns1" > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not add veth device" - exit $ksft_skip -fi - -ip -net $ns0 li add tvrf type vrf table 9876 -if [ $? -ne 0 ];then - echo "SKIP: Could not add vrf device" - exit $ksft_skip -fi - -ip -net $ns0 li set lo up - -ip -net $ns0 li set veth0 master tvrf -ip -net $ns0 li set tvrf up -ip -net $ns0 li set veth0 up -ip -net $ns1 li set veth0 up - -ip -net $ns0 addr add $IP0/$PFXL dev veth0 -ip -net $ns1 addr add $IP1/$PFXL dev veth0 - -ip netns exec $ns1 iperf3 -s > /dev/null 2>&1& -if [ $? -ne 0 ];then - echo "SKIP: Could not start iperf3" - exit $ksft_skip -fi - -# test vrf ingress handling. -# The incoming connection should be placed in conntrack zone 1, -# as decided by the first iteration of the ruleset. 
-test_ct_zone_in() -{ -ip netns exec $ns0 nft -f - < /dev/null - - # should be in zone 1, not zone 2 - count=$(ip netns exec $ns0 conntrack -L -s $IP1 -d $IP0 -p icmp --zone 1 2>/dev/null | wc -l) - if [ $count -eq 1 ]; then - echo "PASS: entry found in conntrack zone 1" - else - echo "FAIL: entry not found in conntrack zone 1" - count=$(ip netns exec $ns0 conntrack -L -s $IP1 -d $IP0 -p icmp --zone 2 2> /dev/null | wc -l) - if [ $count -eq 1 ]; then - echo "FAIL: entry found in zone 2 instead" - else - echo "FAIL: entry not in zone 1 or 2, dumping table" - ip netns exec $ns0 conntrack -L - ip netns exec $ns0 nft list ruleset - fi - fi -} - -# add masq rule that gets evaluated w. outif set to vrf device. -# This tests the first iteration of the packet through conntrack, -# oifname is the vrf device. -test_masquerade_vrf() -{ - local qdisc=$1 - - if [ "$qdisc" != "default" ]; then - tc -net $ns0 qdisc add dev tvrf root $qdisc - fi - - ip netns exec $ns0 conntrack -F 2>/dev/null - -ip netns exec $ns0 nft -f - </dev/null - if [ $? -ne 0 ]; then - echo "FAIL: iperf3 connect failure with masquerade + sport rewrite on vrf device" - ret=1 - return - fi - - # must also check that nat table was evaluated on second (lower device) iteration. - ip netns exec $ns0 nft list table ip nat |grep -q 'counter packets 2' && - ip netns exec $ns0 nft list table ip nat |grep -q 'untracked counter packets [1-9]' - if [ $? -eq 0 ]; then - echo "PASS: iperf3 connect with masquerade + sport rewrite on vrf device ($qdisc qdisc)" - else - echo "FAIL: vrf rules have unexpected counter value" - ret=1 - fi - - if [ "$qdisc" != "default" ]; then - tc -net $ns0 qdisc del dev tvrf root - fi -} - -# add masq rule that gets evaluated w. outif set to veth device. -# This tests the 2nd iteration of the packet through conntrack, -# oifname is the lower device (veth0 in this case). -test_masquerade_veth() -{ - ip netns exec $ns0 conntrack -F 2>/dev/null -ip netns exec $ns0 nft -f - < /dev/null - if [ $? 
-ne 0 ]; then - echo "FAIL: iperf3 connect failure with masquerade + sport rewrite on veth device" - ret=1 - return - fi - - # must also check that nat table was evaluated on second (lower device) iteration. - ip netns exec $ns0 nft list table ip nat |grep -q 'counter packets 2' - if [ $? -eq 0 ]; then - echo "PASS: iperf3 connect with masquerade + sport rewrite on veth device" - else - echo "FAIL: vrf masq rule has unexpected counter value" - ret=1 - fi -} - -test_ct_zone_in -test_masquerade_vrf "default" -test_masquerade_vrf "pfifo" -test_masquerade_veth - -exit $ret diff --git a/tools/testing/selftests/netfilter/ipip-conntrack-mtu.sh b/tools/testing/selftests/netfilter/ipip-conntrack-mtu.sh deleted file mode 100755 index eb9553e498..0000000000 --- a/tools/testing/selftests/netfilter/ipip-conntrack-mtu.sh +++ /dev/null @@ -1,207 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -# Conntrack needs to reassemble fragments in order to have complete -# packets for rule matching. Reassembly can lead to packet loss. - -# Consider the following setup: -# +--------+ +---------+ +--------+ -# |Router A|-------|Wanrouter|-------|Router B| -# | |.IPIP..| |..IPIP.| | -# +--------+ +---------+ +--------+ -# / mtu 1400 \ -# / \ -#+--------+ +--------+ -#|Client A| |Client B| -#| | | | -#+--------+ +--------+ - -# Router A and Router B use IPIP tunnel interfaces to tunnel traffic -# between Client A and Client B over WAN. Wanrouter has MTU 1400 set -# on its interfaces. - -rnd=$(mktemp -u XXXXXXXX) -rx=$(mktemp) - -r_a="ns-ra-$rnd" -r_b="ns-rb-$rnd" -r_w="ns-rw-$rnd" -c_a="ns-ca-$rnd" -c_b="ns-cb-$rnd" - -checktool (){ - if ! 
$1 > /dev/null 2>&1; then - echo "SKIP: Could not $2" - exit $ksft_skip - fi -} - -checktool "iptables --version" "run test without iptables" -checktool "ip -Version" "run test without ip tool" -checktool "which socat" "run test without socat" -checktool "ip netns add ${r_a}" "create net namespace" - -for n in ${r_b} ${r_w} ${c_a} ${c_b};do - ip netns add ${n} -done - -cleanup() { - for n in ${r_a} ${r_b} ${r_w} ${c_a} ${c_b};do - ip netns del ${n} - done - rm -f ${rx} -} - -trap cleanup EXIT - -test_path() { - msg="$1" - - ip netns exec ${c_b} socat -t 3 - udp4-listen:5000,reuseaddr > ${rx} < /dev/null & - - sleep 1 - for i in 1 2 3; do - head -c1400 /dev/zero | tr "\000" "a" | \ - ip netns exec ${c_a} socat -t 1 -u STDIN UDP:192.168.20.2:5000 - done - - wait - - bytes=$(wc -c < ${rx}) - - if [ $bytes -eq 1400 ];then - echo "OK: PMTU $msg connection tracking" - else - echo "FAIL: PMTU $msg connection tracking: got $bytes, expected 1400" - exit 1 - fi -} - -# Detailed setup for Router A -# --------------------------- -# Interfaces: -# eth0: 10.2.2.1/24 -# eth1: 192.168.10.1/24 -# ipip0: No IP address, local 10.2.2.1 remote 10.4.4.1 -# Routes: -# 192.168.20.0/24 dev ipip0 (192.168.20.0/24 is subnet of Client B) -# 10.4.4.1 via 10.2.2.254 (Router B via Wanrouter) -# No iptables rules at all. 
- -ip link add veth0 netns ${r_a} type veth peer name veth0 netns ${r_w} -ip link add veth1 netns ${r_a} type veth peer name veth0 netns ${c_a} - -l_addr="10.2.2.1" -r_addr="10.4.4.1" -ip netns exec ${r_a} ip link add ipip0 type ipip local ${l_addr} remote ${r_addr} mode ipip || exit $ksft_skip - -for dev in lo veth0 veth1 ipip0; do - ip -net ${r_a} link set $dev up -done - -ip -net ${r_a} addr add 10.2.2.1/24 dev veth0 -ip -net ${r_a} addr add 192.168.10.1/24 dev veth1 - -ip -net ${r_a} route add 192.168.20.0/24 dev ipip0 -ip -net ${r_a} route add 10.4.4.0/24 via 10.2.2.254 - -ip netns exec ${r_a} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null - -# Detailed setup for Router B -# --------------------------- -# Interfaces: -# eth0: 10.4.4.1/24 -# eth1: 192.168.20.1/24 -# ipip0: No IP address, local 10.4.4.1 remote 10.2.2.1 -# Routes: -# 192.168.10.0/24 dev ipip0 (192.168.10.0/24 is subnet of Client A) -# 10.2.2.1 via 10.4.4.254 (Router A via Wanrouter) -# No iptables rules at all. - -ip link add veth0 netns ${r_b} type veth peer name veth1 netns ${r_w} -ip link add veth1 netns ${r_b} type veth peer name veth0 netns ${c_b} - -l_addr="10.4.4.1" -r_addr="10.2.2.1" - -ip netns exec ${r_b} ip link add ipip0 type ipip local ${l_addr} remote ${r_addr} mode ipip || exit $ksft_skip - -for dev in lo veth0 veth1 ipip0; do - ip -net ${r_b} link set $dev up -done - -ip -net ${r_b} addr add 10.4.4.1/24 dev veth0 -ip -net ${r_b} addr add 192.168.20.1/24 dev veth1 - -ip -net ${r_b} route add 192.168.10.0/24 dev ipip0 -ip -net ${r_b} route add 10.2.2.0/24 via 10.4.4.254 -ip netns exec ${r_b} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null - -# Client A -ip -net ${c_a} addr add 192.168.10.2/24 dev veth0 -ip -net ${c_a} link set dev lo up -ip -net ${c_a} link set dev veth0 up -ip -net ${c_a} route add default via 192.168.10.1 - -# Client A -ip -net ${c_b} addr add 192.168.20.2/24 dev veth0 -ip -net ${c_b} link set dev veth0 up -ip -net ${c_b} link set dev lo up -ip -net 
${c_b} route add default via 192.168.20.1 - -# Wan -ip -net ${r_w} addr add 10.2.2.254/24 dev veth0 -ip -net ${r_w} addr add 10.4.4.254/24 dev veth1 - -ip -net ${r_w} link set dev lo up -ip -net ${r_w} link set dev veth0 up mtu 1400 -ip -net ${r_w} link set dev veth1 up mtu 1400 - -ip -net ${r_a} link set dev veth0 mtu 1400 -ip -net ${r_b} link set dev veth0 mtu 1400 - -ip netns exec ${r_w} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null - -# Path MTU discovery -# ------------------ -# Running tracepath from Client A to Client B shows PMTU discovery is working -# as expected: -# -# clienta:~# tracepath 192.168.20.2 -# 1?: [LOCALHOST] pmtu 1500 -# 1: 192.168.10.1 0.867ms -# 1: 192.168.10.1 0.302ms -# 2: 192.168.10.1 0.312ms pmtu 1480 -# 2: no reply -# 3: 192.168.10.1 0.510ms pmtu 1380 -# 3: 192.168.20.2 2.320ms reached -# Resume: pmtu 1380 hops 3 back 3 - -# ip netns exec ${c_a} traceroute --mtu 192.168.20.2 - -# Router A has learned PMTU (1400) to Router B from Wanrouter. -# Client A has learned PMTU (1400 - IPIP overhead = 1380) to Client B -# from Router A. - -#Send large UDP packet -#--------------------- -#Now we send a 1400 bytes UDP packet from Client A to Client B: - -# clienta:~# head -c1400 /dev/zero | tr "\000" "a" | socat -u STDIN UDP:192.168.20.2:5000 -test_path "without" - -# The IPv4 stack on Client A already knows the PMTU to Client B, so the -# UDP packet is sent as two fragments (1380 + 20). Router A forwards the -# fragments between eth1 and ipip0. The fragments fit into the tunnel and -# reach their destination. - -#When sending the large UDP packet again, Router A now reassembles the -#fragments before routing the packet over ipip0. The resulting IPIP -#packet is too big (1400) for the tunnel PMTU (1380) to Router B, it is -#dropped on Router A before sending. 
- -ip netns exec ${r_a} iptables -A FORWARD -m conntrack --ctstate NEW -test_path "with" diff --git a/tools/testing/selftests/netfilter/ipvs.sh b/tools/testing/selftests/netfilter/ipvs.sh deleted file mode 100755 index c3b8f90c49..0000000000 --- a/tools/testing/selftests/netfilter/ipvs.sh +++ /dev/null @@ -1,228 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# -# End-to-end ipvs test suite -# Topology: -#--------------------------------------------------------------+ -# | | -# ns0 | ns1 | -# ----------- | ----------- ----------- | -# | veth01 | --------- | veth10 | | veth12 | | -# ----------- peer ----------- ----------- | -# | | | | -# ----------- | | | -# | br0 | |----------------- peer |--------------| -# ----------- | | | -# | | | | -# ---------- peer ---------- ----------- | -# | veth02 | --------- | veth20 | | veth21 | | -# ---------- | ---------- ----------- | -# | ns2 | -# | | -#--------------------------------------------------------------+ -# -# We assume that all network driver are loaded -# - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 -GREEN='\033[0;92m' -RED='\033[0;31m' -NC='\033[0m' # No Color - -readonly port=8080 - -readonly vip_v4=207.175.44.110 -readonly cip_v4=10.0.0.2 -readonly gip_v4=10.0.0.1 -readonly dip_v4=172.16.0.1 -readonly rip_v4=172.16.0.2 -readonly sip_v4=10.0.0.3 - -readonly infile="$(mktemp)" -readonly outfile="$(mktemp)" -readonly datalen=32 - -sysipvsnet="/proc/sys/net/ipv4/vs/" -if [ ! -d $sysipvsnet ]; then - modprobe -q ip_vs - if [ $? -ne 0 ]; then - echo "skip: could not run test without ipvs module" - exit $ksft_skip - fi -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ]; then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ipvsadm -v > /dev/null 2>&1 -if [ $? 
-ne 0 ]; then - echo "SKIP: Could not run test without ipvsadm" - exit $ksft_skip -fi - -setup() { - ip netns add ns0 - ip netns add ns1 - ip netns add ns2 - - ip link add veth01 netns ns0 type veth peer name veth10 netns ns1 - ip link add veth02 netns ns0 type veth peer name veth20 netns ns2 - ip link add veth12 netns ns1 type veth peer name veth21 netns ns2 - - ip netns exec ns0 ip link set veth01 up - ip netns exec ns0 ip link set veth02 up - ip netns exec ns0 ip link add br0 type bridge - ip netns exec ns0 ip link set veth01 master br0 - ip netns exec ns0 ip link set veth02 master br0 - ip netns exec ns0 ip link set br0 up - ip netns exec ns0 ip addr add ${cip_v4}/24 dev br0 - - ip netns exec ns1 ip link set lo up - ip netns exec ns1 ip link set veth10 up - ip netns exec ns1 ip addr add ${gip_v4}/24 dev veth10 - ip netns exec ns1 ip link set veth12 up - ip netns exec ns1 ip addr add ${dip_v4}/24 dev veth12 - - ip netns exec ns2 ip link set lo up - ip netns exec ns2 ip link set veth21 up - ip netns exec ns2 ip addr add ${rip_v4}/24 dev veth21 - ip netns exec ns2 ip link set veth20 up - ip netns exec ns2 ip addr add ${sip_v4}/24 dev veth20 - - sleep 1 - - dd if=/dev/urandom of="${infile}" bs="${datalen}" count=1 status=none -} - -cleanup() { - for i in 0 1 2 - do - ip netns del ns$i > /dev/null 2>&1 - done - - if [ -f "${outfile}" ]; then - rm "${outfile}" - fi - if [ -f "${infile}" ]; then - rm "${infile}" - fi -} - -server_listen() { - ip netns exec ns2 nc -l -p 8080 > "${outfile}" & - server_pid=$! 
- sleep 0.2 -} - -client_connect() { - ip netns exec ns0 timeout 2 nc -w 1 ${vip_v4} ${port} < "${infile}" -} - -verify_data() { - wait "${server_pid}" - cmp "$infile" "$outfile" 2>/dev/null -} - -test_service() { - server_listen - client_connect - verify_data -} - - -test_dr() { - ip netns exec ns0 ip route add ${vip_v4} via ${gip_v4} dev br0 - - ip netns exec ns1 sysctl -qw net.ipv4.ip_forward=1 - ip netns exec ns1 ipvsadm -A -t ${vip_v4}:${port} -s rr - ip netns exec ns1 ipvsadm -a -t ${vip_v4}:${port} -r ${rip_v4}:${port} - ip netns exec ns1 ip addr add ${vip_v4}/32 dev lo:1 - - # avoid incorrect arp response - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_ignore=1 - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_announce=2 - # avoid reverse route lookup - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.rp_filter=0 - ip netns exec ns2 sysctl -qw net.ipv4.conf.veth21.rp_filter=0 - ip netns exec ns2 ip addr add ${vip_v4}/32 dev lo:1 - - test_service -} - -test_nat() { - ip netns exec ns0 ip route add ${vip_v4} via ${gip_v4} dev br0 - - ip netns exec ns1 sysctl -qw net.ipv4.ip_forward=1 - ip netns exec ns1 ipvsadm -A -t ${vip_v4}:${port} -s rr - ip netns exec ns1 ipvsadm -a -m -t ${vip_v4}:${port} -r ${rip_v4}:${port} - ip netns exec ns1 ip addr add ${vip_v4}/32 dev lo:1 - - ip netns exec ns2 ip link del veth20 - ip netns exec ns2 ip route add default via ${dip_v4} dev veth21 - - test_service -} - -test_tun() { - ip netns exec ns0 ip route add ${vip_v4} via ${gip_v4} dev br0 - - ip netns exec ns1 modprobe ipip - ip netns exec ns1 ip link set tunl0 up - ip netns exec ns1 sysctl -qw net.ipv4.ip_forward=0 - ip netns exec ns1 sysctl -qw net.ipv4.conf.all.send_redirects=0 - ip netns exec ns1 sysctl -qw net.ipv4.conf.default.send_redirects=0 - ip netns exec ns1 ipvsadm -A -t ${vip_v4}:${port} -s rr - ip netns exec ns1 ipvsadm -a -i -t ${vip_v4}:${port} -r ${rip_v4}:${port} - ip netns exec ns1 ip addr add ${vip_v4}/32 dev lo:1 - - ip netns exec ns2 modprobe ipip 
- ip netns exec ns2 ip link set tunl0 up - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_ignore=1 - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_announce=2 - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.rp_filter=0 - ip netns exec ns2 sysctl -qw net.ipv4.conf.tunl0.rp_filter=0 - ip netns exec ns2 sysctl -qw net.ipv4.conf.veth21.rp_filter=0 - ip netns exec ns2 ip addr add ${vip_v4}/32 dev lo:1 - - test_service -} - -run_tests() { - local errors= - - echo "Testing DR mode..." - cleanup - setup - test_dr - errors=$(( $errors + $? )) - - echo "Testing NAT mode..." - cleanup - setup - test_nat - errors=$(( $errors + $? )) - - echo "Testing Tunnel mode..." - cleanup - setup - test_tun - errors=$(( $errors + $? )) - - return $errors -} - -trap cleanup EXIT - -run_tests - -if [ $? -ne 0 ]; then - echo -e "$(basename $0): ${RED}FAIL${NC}" - exit 1 -fi -echo -e "$(basename $0): ${GREEN}PASS${NC}" -exit 0 diff --git a/tools/testing/selftests/netfilter/nf-queue.c b/tools/testing/selftests/netfilter/nf-queue.c deleted file mode 100644 index 9e56b9d470..0000000000 --- a/tools/testing/selftests/netfilter/nf-queue.c +++ /dev/null @@ -1,395 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -struct options { - bool count_packets; - bool gso_enabled; - int verbose; - unsigned int queue_num; - unsigned int timeout; - uint32_t verdict; - uint32_t delay_ms; -}; - -static unsigned int queue_stats[5]; -static struct options opts; - -static void help(const char *p) -{ - printf("Usage: %s [-c|-v [-vv] ] [-t timeout] [-q queue_num] [-Qdst_queue ] [ -d ms_delay ] [-G]\n", p); -} - -static int parse_attr_cb(const struct nlattr *attr, void *data) -{ - const struct nlattr **tb = data; - int type = mnl_attr_get_type(attr); - - /* skip unsupported attribute in user-space */ - if (mnl_attr_type_valid(attr, NFQA_MAX) < 0) - return MNL_CB_OK; - - switch 
(type) { - case NFQA_MARK: - case NFQA_IFINDEX_INDEV: - case NFQA_IFINDEX_OUTDEV: - case NFQA_IFINDEX_PHYSINDEV: - case NFQA_IFINDEX_PHYSOUTDEV: - if (mnl_attr_validate(attr, MNL_TYPE_U32) < 0) { - perror("mnl_attr_validate"); - return MNL_CB_ERROR; - } - break; - case NFQA_TIMESTAMP: - if (mnl_attr_validate2(attr, MNL_TYPE_UNSPEC, - sizeof(struct nfqnl_msg_packet_timestamp)) < 0) { - perror("mnl_attr_validate2"); - return MNL_CB_ERROR; - } - break; - case NFQA_HWADDR: - if (mnl_attr_validate2(attr, MNL_TYPE_UNSPEC, - sizeof(struct nfqnl_msg_packet_hw)) < 0) { - perror("mnl_attr_validate2"); - return MNL_CB_ERROR; - } - break; - case NFQA_PAYLOAD: - break; - } - tb[type] = attr; - return MNL_CB_OK; -} - -static int queue_cb(const struct nlmsghdr *nlh, void *data) -{ - struct nlattr *tb[NFQA_MAX+1] = { 0 }; - struct nfqnl_msg_packet_hdr *ph = NULL; - uint32_t id = 0; - - (void)data; - - mnl_attr_parse(nlh, sizeof(struct nfgenmsg), parse_attr_cb, tb); - if (tb[NFQA_PACKET_HDR]) { - ph = mnl_attr_get_payload(tb[NFQA_PACKET_HDR]); - id = ntohl(ph->packet_id); - - if (opts.verbose > 0) - printf("packet hook=%u, hwproto 0x%x", - ntohs(ph->hw_protocol), ph->hook); - - if (ph->hook >= 5) { - fprintf(stderr, "Unknown hook %d\n", ph->hook); - return MNL_CB_ERROR; - } - - if (opts.verbose > 0) { - uint32_t skbinfo = 0; - - if (tb[NFQA_SKB_INFO]) - skbinfo = ntohl(mnl_attr_get_u32(tb[NFQA_SKB_INFO])); - if (skbinfo & NFQA_SKB_CSUMNOTREADY) - printf(" csumnotready"); - if (skbinfo & NFQA_SKB_GSO) - printf(" gso"); - if (skbinfo & NFQA_SKB_CSUM_NOTVERIFIED) - printf(" csumnotverified"); - puts(""); - } - - if (opts.count_packets) - queue_stats[ph->hook]++; - } - - return MNL_CB_OK + id; -} - -static struct nlmsghdr * -nfq_build_cfg_request(char *buf, uint8_t command, int queue_num) -{ - struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf); - struct nfqnl_msg_config_cmd cmd = { - .command = command, - .pf = htons(AF_INET), - }; - struct nfgenmsg *nfg; - - nlh->nlmsg_type = 
(NFNL_SUBSYS_QUEUE << 8) | NFQNL_MSG_CONFIG; - nlh->nlmsg_flags = NLM_F_REQUEST; - - nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg)); - - nfg->nfgen_family = AF_UNSPEC; - nfg->version = NFNETLINK_V0; - nfg->res_id = htons(queue_num); - - mnl_attr_put(nlh, NFQA_CFG_CMD, sizeof(cmd), &cmd); - - return nlh; -} - -static struct nlmsghdr * -nfq_build_cfg_params(char *buf, uint8_t mode, int range, int queue_num) -{ - struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf); - struct nfqnl_msg_config_params params = { - .copy_range = htonl(range), - .copy_mode = mode, - }; - struct nfgenmsg *nfg; - - nlh->nlmsg_type = (NFNL_SUBSYS_QUEUE << 8) | NFQNL_MSG_CONFIG; - nlh->nlmsg_flags = NLM_F_REQUEST; - - nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg)); - nfg->nfgen_family = AF_UNSPEC; - nfg->version = NFNETLINK_V0; - nfg->res_id = htons(queue_num); - - mnl_attr_put(nlh, NFQA_CFG_PARAMS, sizeof(params), ¶ms); - - return nlh; -} - -static struct nlmsghdr * -nfq_build_verdict(char *buf, int id, int queue_num, uint32_t verd) -{ - struct nfqnl_msg_verdict_hdr vh = { - .verdict = htonl(verd), - .id = htonl(id), - }; - struct nlmsghdr *nlh; - struct nfgenmsg *nfg; - - nlh = mnl_nlmsg_put_header(buf); - nlh->nlmsg_type = (NFNL_SUBSYS_QUEUE << 8) | NFQNL_MSG_VERDICT; - nlh->nlmsg_flags = NLM_F_REQUEST; - nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg)); - nfg->nfgen_family = AF_UNSPEC; - nfg->version = NFNETLINK_V0; - nfg->res_id = htons(queue_num); - - mnl_attr_put(nlh, NFQA_VERDICT_HDR, sizeof(vh), &vh); - - return nlh; -} - -static void print_stats(void) -{ - unsigned int last, total; - int i; - - total = 0; - last = queue_stats[0]; - - for (i = 0; i < 5; i++) { - printf("hook %d packets %08u\n", i, queue_stats[i]); - last = queue_stats[i]; - total += last; - } - - printf("%u packets total\n", total); -} - -struct mnl_socket *open_queue(void) -{ - char buf[MNL_SOCKET_BUFFER_SIZE]; - unsigned int queue_num; - struct mnl_socket *nl; - struct nlmsghdr *nlh; - struct timeval tv; - 
uint32_t flags; - - nl = mnl_socket_open(NETLINK_NETFILTER); - if (nl == NULL) { - perror("mnl_socket_open"); - exit(EXIT_FAILURE); - } - - if (mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID) < 0) { - perror("mnl_socket_bind"); - exit(EXIT_FAILURE); - } - - queue_num = opts.queue_num; - nlh = nfq_build_cfg_request(buf, NFQNL_CFG_CMD_BIND, queue_num); - - if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { - perror("mnl_socket_sendto"); - exit(EXIT_FAILURE); - } - - nlh = nfq_build_cfg_params(buf, NFQNL_COPY_PACKET, 0xFFFF, queue_num); - - flags = opts.gso_enabled ? NFQA_CFG_F_GSO : 0; - flags |= NFQA_CFG_F_UID_GID; - mnl_attr_put_u32(nlh, NFQA_CFG_FLAGS, htonl(flags)); - mnl_attr_put_u32(nlh, NFQA_CFG_MASK, htonl(flags)); - - if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { - perror("mnl_socket_sendto"); - exit(EXIT_FAILURE); - } - - memset(&tv, 0, sizeof(tv)); - tv.tv_sec = opts.timeout; - if (opts.timeout && setsockopt(mnl_socket_get_fd(nl), - SOL_SOCKET, SO_RCVTIMEO, - &tv, sizeof(tv))) { - perror("setsockopt(SO_RCVTIMEO)"); - exit(EXIT_FAILURE); - } - - return nl; -} - -static void sleep_ms(uint32_t delay) -{ - struct timespec ts = { .tv_sec = delay / 1000 }; - - delay %= 1000; - - ts.tv_nsec = delay * 1000llu * 1000llu; - - nanosleep(&ts, NULL); -} - -static int mainloop(void) -{ - unsigned int buflen = 64 * 1024 + MNL_SOCKET_BUFFER_SIZE; - struct mnl_socket *nl; - struct nlmsghdr *nlh; - unsigned int portid; - char *buf; - int ret; - - buf = malloc(buflen); - if (!buf) { - perror("malloc"); - exit(EXIT_FAILURE); - } - - nl = open_queue(); - portid = mnl_socket_get_portid(nl); - - for (;;) { - uint32_t id; - - ret = mnl_socket_recvfrom(nl, buf, buflen); - if (ret == -1) { - if (errno == ENOBUFS || errno == EINTR) - continue; - - if (errno == EAGAIN) { - errno = 0; - ret = 0; - break; - } - - perror("mnl_socket_recvfrom"); - exit(EXIT_FAILURE); - } - - ret = mnl_cb_run(buf, ret, 0, portid, queue_cb, NULL); - if (ret < 0) { - perror("mnl_cb_run"); - 
exit(EXIT_FAILURE); - } - - id = ret - MNL_CB_OK; - if (opts.delay_ms) - sleep_ms(opts.delay_ms); - - nlh = nfq_build_verdict(buf, id, opts.queue_num, opts.verdict); - if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { - perror("mnl_socket_sendto"); - exit(EXIT_FAILURE); - } - } - - mnl_socket_close(nl); - - return ret; -} - -static void parse_opts(int argc, char **argv) -{ - int c; - - while ((c = getopt(argc, argv, "chvt:q:Q:d:G")) != -1) { - switch (c) { - case 'c': - opts.count_packets = true; - break; - case 'h': - help(argv[0]); - exit(0); - break; - case 'q': - opts.queue_num = atoi(optarg); - if (opts.queue_num > 0xffff) - opts.queue_num = 0; - break; - case 'Q': - opts.verdict = atoi(optarg); - if (opts.verdict > 0xffff) { - fprintf(stderr, "Expected destination queue number\n"); - exit(1); - } - - opts.verdict <<= 16; - opts.verdict |= NF_QUEUE; - break; - case 'd': - opts.delay_ms = atoi(optarg); - if (opts.delay_ms == 0) { - fprintf(stderr, "Expected nonzero delay (in milliseconds)\n"); - exit(1); - } - break; - case 't': - opts.timeout = atoi(optarg); - break; - case 'G': - opts.gso_enabled = false; - break; - case 'v': - opts.verbose++; - break; - } - } - - if (opts.verdict != NF_ACCEPT && (opts.verdict >> 16 == opts.queue_num)) { - fprintf(stderr, "Cannot use same destination and source queue\n"); - exit(1); - } -} - -int main(int argc, char *argv[]) -{ - int ret; - - opts.verdict = NF_ACCEPT; - opts.gso_enabled = true; - - parse_opts(argc, argv); - - ret = mainloop(); - if (opts.count_packets) - print_stats(); - - return ret; -} diff --git a/tools/testing/selftests/netfilter/nf_nat_edemux.sh b/tools/testing/selftests/netfilter/nf_nat_edemux.sh deleted file mode 100755 index a1aa8f4a58..0000000000 --- a/tools/testing/selftests/netfilter/nf_nat_edemux.sh +++ /dev/null @@ -1,127 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Test NAT source port clash resolution -# - -# Kselftest framework requirement - SKIP code is 4. 
-ksft_skip=4 -ret=0 - -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" -socatpid=0 - -cleanup() -{ - [ $socatpid -gt 0 ] && kill $socatpid - ip netns del $ns1 - ip netns del $ns2 -} - -socat -h > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without socat" - exit $ksft_skip -fi - -iptables --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without iptables" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ip netns add "$ns1" -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace $ns1" - exit $ksft_skip -fi - -trap cleanup EXIT - -ip netns add $ns2 - -# Connect the namespaces using a veth pair -ip link add name veth2 type veth peer name veth1 -ip link set netns $ns1 dev veth1 -ip link set netns $ns2 dev veth2 - -ip netns exec $ns1 ip link set up dev lo -ip netns exec $ns1 ip link set up dev veth1 -ip netns exec $ns1 ip addr add 192.168.1.1/24 dev veth1 - -ip netns exec $ns2 ip link set up dev lo -ip netns exec $ns2 ip link set up dev veth2 -ip netns exec $ns2 ip addr add 192.168.1.2/24 dev veth2 - -# Create a server in one namespace -ip netns exec $ns1 socat -u TCP-LISTEN:5201,fork OPEN:/dev/null,wronly=1 & -socatpid=$! - -# Restrict source port to just one so we don't have to exhaust -# all others. -ip netns exec $ns2 sysctl -q net.ipv4.ip_local_port_range="10000 10000" - -# add a virtual IP using DNAT -ip netns exec $ns2 iptables -t nat -A OUTPUT -d 10.96.0.1/32 -p tcp --dport 443 -j DNAT --to-destination 192.168.1.1:5201 - -# ... 
and route it to the other namespace -ip netns exec $ns2 ip route add 10.96.0.1 via 192.168.1.1 - -sleep 1 - -# add a persistent connection from the other namespace -ip netns exec $ns2 socat -t 10 - TCP:192.168.1.1:5201 > /dev/null & - -sleep 1 - -# ip daddr:dport will be rewritten to 192.168.1.1 5201 -# NAT must reallocate source port 10000 because -# 192.168.1.2:10000 -> 192.168.1.1:5201 is already in use -echo test | ip netns exec $ns2 socat -t 3 -u STDIN TCP:10.96.0.1:443,connect-timeout=3 >/dev/null -ret=$? - -# Check socat can connect to 10.96.0.1:443 (aka 192.168.1.1:5201). -if [ $ret -eq 0 ]; then - echo "PASS: socat can connect via NAT'd address" -else - echo "FAIL: socat cannot connect via NAT'd address" -fi - -# check sport clashres. -ip netns exec $ns1 iptables -t nat -A PREROUTING -p tcp --dport 5202 -j REDIRECT --to-ports 5201 -ip netns exec $ns1 iptables -t nat -A PREROUTING -p tcp --dport 5203 -j REDIRECT --to-ports 5201 - -sleep 5 | ip netns exec $ns2 socat -t 5 -u STDIN TCP:192.168.1.1:5202,connect-timeout=5 >/dev/null & -cpid1=$! -sleep 1 - -# if connect succeeds, client closes instantly due to EOF on stdin. -# if connect hangs, it will time out after 5s. -echo | ip netns exec $ns2 socat -t 3 -u STDIN TCP:192.168.1.1:5203,connect-timeout=5 >/dev/null & -cpid2=$! - -time_then=$(date +%s) -wait $cpid2 -rv=$? -time_now=$(date +%s) - -# Check how much time has elapsed, expectation is for -# 'cpid2' to connect and then exit (and no connect delay). 
-delta=$((time_now - time_then)) - -if [ $delta -lt 2 -a $rv -eq 0 ]; then - echo "PASS: could connect to service via redirected ports" -else - echo "FAIL: socat cannot connect to service via redirect ($delta seconds elapsed, returned $rv)" - ret=1 -fi - -exit $ret diff --git a/tools/testing/selftests/netfilter/nft_audit.sh b/tools/testing/selftests/netfilter/nft_audit.sh deleted file mode 100755 index 99ed5bd6e8..0000000000 --- a/tools/testing/selftests/netfilter/nft_audit.sh +++ /dev/null @@ -1,245 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Check that audit logs generated for nft commands are as expected. - -SKIP_RC=4 -RC=0 - -nft --version >/dev/null 2>&1 || { - echo "SKIP: missing nft tool" - exit $SKIP_RC -} - -# Run everything in a separate network namespace -[ "${1}" != "run" ] && { unshare -n "${0}" run; exit $?; } - -# give other scripts a chance to finish - audit_logread sees all activity -sleep 1 - -logfile=$(mktemp) -rulefile=$(mktemp) -echo "logging into $logfile" -./audit_logread >"$logfile" & -logread_pid=$! -trap 'kill $logread_pid; rm -f $logfile $rulefile' EXIT -exec 3<"$logfile" - -do_test() { # (cmd, log) - echo -n "testing for cmd: $1 ... " - cat <&3 >/dev/null - $1 >/dev/null || exit 1 - sleep 0.1 - res=$(diff -a -u <(echo "$2") - <&3) - [ $? 
-eq 0 ] && { echo "OK"; return; } - echo "FAIL" - grep -v '^\(---\|+++\|@@\)' <<< "$res" - ((RC--)) -} - -nft flush ruleset - -# adding tables, chains and rules - -for table in t1 t2; do - do_test "nft add table $table" \ - "table=$table family=2 entries=1 op=nft_register_table" - - do_test "nft add chain $table c1" \ - "table=$table family=2 entries=1 op=nft_register_chain" - - do_test "nft add chain $table c2; add chain $table c3" \ - "table=$table family=2 entries=2 op=nft_register_chain" - - cmd="add rule $table c1 counter" - - do_test "nft $cmd" \ - "table=$table family=2 entries=1 op=nft_register_rule" - - do_test "nft $cmd; $cmd" \ - "table=$table family=2 entries=2 op=nft_register_rule" - - cmd="" - sep="" - for chain in c2 c3; do - for i in {1..3}; do - cmd+="$sep add rule $table $chain counter" - sep=";" - done - done - do_test "nft $cmd" \ - "table=$table family=2 entries=6 op=nft_register_rule" -done - -for ((i = 0; i < 500; i++)); do - echo "add rule t2 c3 counter accept comment \"rule $i\"" -done >$rulefile -do_test "nft -f $rulefile" \ -'table=t2 family=2 entries=500 op=nft_register_rule' - -# adding sets and elements - -settype='type inet_service; counter' -setelem='{ 22, 80, 443 }' -setblock="{ $settype; elements = $setelem; }" -do_test "nft add set t1 s $setblock" \ -"table=t1 family=2 entries=4 op=nft_register_set" - -do_test "nft add set t1 s2 $setblock; add set t1 s3 { $settype; }" \ -"table=t1 family=2 entries=5 op=nft_register_set" - -do_test "nft add element t1 s3 $setelem" \ -"table=t1 family=2 entries=3 op=nft_register_setelem" - -# adding counters - -do_test 'nft add counter t1 c1' \ -'table=t1 family=2 entries=1 op=nft_register_obj' - -do_test 'nft add counter t2 c1; add counter t2 c2' \ -'table=t2 family=2 entries=2 op=nft_register_obj' - -for ((i = 3; i <= 500; i++)); do - echo "add counter t2 c$i" -done >$rulefile -do_test "nft -f $rulefile" \ -'table=t2 family=2 entries=498 op=nft_register_obj' - -# adding/updating quotas - -do_test 
'nft add quota t1 q1 { 10 bytes }' \ -'table=t1 family=2 entries=1 op=nft_register_obj' - -do_test 'nft add quota t2 q1 { 10 bytes }; add quota t2 q2 { 10 bytes }' \ -'table=t2 family=2 entries=2 op=nft_register_obj' - -for ((i = 3; i <= 500; i++)); do - echo "add quota t2 q$i { 10 bytes }" -done >$rulefile -do_test "nft -f $rulefile" \ -'table=t2 family=2 entries=498 op=nft_register_obj' - -# changing the quota value triggers obj update path -do_test 'nft add quota t1 q1 { 20 bytes }' \ -'table=t1 family=2 entries=1 op=nft_register_obj' - -# resetting rules - -do_test 'nft reset rules t1 c2' \ -'table=t1 family=2 entries=3 op=nft_reset_rule' - -do_test 'nft reset rules table t1' \ -'table=t1 family=2 entries=3 op=nft_reset_rule -table=t1 family=2 entries=3 op=nft_reset_rule -table=t1 family=2 entries=3 op=nft_reset_rule' - -do_test 'nft reset rules t2 c3' \ -'table=t2 family=2 entries=189 op=nft_reset_rule -table=t2 family=2 entries=188 op=nft_reset_rule -table=t2 family=2 entries=126 op=nft_reset_rule' - -do_test 'nft reset rules t2' \ -'table=t2 family=2 entries=3 op=nft_reset_rule -table=t2 family=2 entries=3 op=nft_reset_rule -table=t2 family=2 entries=186 op=nft_reset_rule -table=t2 family=2 entries=188 op=nft_reset_rule -table=t2 family=2 entries=129 op=nft_reset_rule' - -do_test 'nft reset rules' \ -'table=t1 family=2 entries=3 op=nft_reset_rule -table=t1 family=2 entries=3 op=nft_reset_rule -table=t1 family=2 entries=3 op=nft_reset_rule -table=t2 family=2 entries=3 op=nft_reset_rule -table=t2 family=2 entries=3 op=nft_reset_rule -table=t2 family=2 entries=180 op=nft_reset_rule -table=t2 family=2 entries=188 op=nft_reset_rule -table=t2 family=2 entries=135 op=nft_reset_rule' - -# resetting sets and elements - -elem=(22 ,80 ,443) -relem="" -for i in {1..3}; do - relem+="${elem[((i - 1))]}" - do_test "nft reset element t1 s { $relem }" \ - "table=t1 family=2 entries=$i op=nft_reset_setelem" -done - -do_test 'nft reset set t1 s' \ -'table=t1 family=2 entries=3 
op=nft_reset_setelem' - -# resetting counters - -do_test 'nft reset counter t1 c1' \ -'table=t1 family=2 entries=1 op=nft_reset_obj' - -do_test 'nft reset counters t1' \ -'table=t1 family=2 entries=1 op=nft_reset_obj' - -do_test 'nft reset counters t2' \ -'table=t2 family=2 entries=342 op=nft_reset_obj -table=t2 family=2 entries=158 op=nft_reset_obj' - -do_test 'nft reset counters' \ -'table=t1 family=2 entries=1 op=nft_reset_obj -table=t2 family=2 entries=341 op=nft_reset_obj -table=t2 family=2 entries=159 op=nft_reset_obj' - -# resetting quotas - -do_test 'nft reset quota t1 q1' \ -'table=t1 family=2 entries=1 op=nft_reset_obj' - -do_test 'nft reset quotas t1' \ -'table=t1 family=2 entries=1 op=nft_reset_obj' - -do_test 'nft reset quotas t2' \ -'table=t2 family=2 entries=315 op=nft_reset_obj -table=t2 family=2 entries=185 op=nft_reset_obj' - -do_test 'nft reset quotas' \ -'table=t1 family=2 entries=1 op=nft_reset_obj -table=t2 family=2 entries=314 op=nft_reset_obj -table=t2 family=2 entries=186 op=nft_reset_obj' - -# deleting rules - -readarray -t handles < <(nft -a list chain t1 c1 | \ - sed -n 's/.*counter.* handle \(.*\)$/\1/p') - -do_test "nft delete rule t1 c1 handle ${handles[0]}" \ -'table=t1 family=2 entries=1 op=nft_unregister_rule' - -cmd='delete rule t1 c1 handle' -do_test "nft $cmd ${handles[1]}; $cmd ${handles[2]}" \ -'table=t1 family=2 entries=2 op=nft_unregister_rule' - -do_test 'nft flush chain t1 c2' \ -'table=t1 family=2 entries=3 op=nft_unregister_rule' - -do_test 'nft flush table t2' \ -'table=t2 family=2 entries=509 op=nft_unregister_rule' - -# deleting chains - -do_test 'nft delete chain t2 c2' \ -'table=t2 family=2 entries=1 op=nft_unregister_chain' - -# deleting sets and elements - -do_test 'nft delete element t1 s { 22 }' \ -'table=t1 family=2 entries=1 op=nft_unregister_setelem' - -do_test 'nft delete element t1 s { 80, 443 }' \ -'table=t1 family=2 entries=2 op=nft_unregister_setelem' - -do_test 'nft flush set t1 s2' \ -'table=t1 
family=2 entries=3 op=nft_unregister_setelem' - -do_test 'nft delete set t1 s2' \ -'table=t1 family=2 entries=1 op=nft_unregister_set' - -do_test 'nft delete set t1 s3' \ -'table=t1 family=2 entries=1 op=nft_unregister_set' - -exit $RC diff --git a/tools/testing/selftests/netfilter/nft_concat_range.sh b/tools/testing/selftests/netfilter/nft_concat_range.sh deleted file mode 100755 index e908009576..0000000000 --- a/tools/testing/selftests/netfilter/nft_concat_range.sh +++ /dev/null @@ -1,1645 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# -# nft_concat_range.sh - Tests for sets with concatenation of ranged fields -# -# Copyright (c) 2019 Red Hat GmbH -# -# Author: Stefano Brivio -# -# shellcheck disable=SC2154,SC2034,SC2016,SC2030,SC2031 -# ^ Configuration and templates sourced with eval, counters reused in subshells - -KSELFTEST_SKIP=4 - -# Available test groups: -# - reported_issues: check for issues that were reported in the past -# - correctness: check that packets match given entries, and only those -# - concurrency: attempt races between insertion, deletion and lookup -# - timeout: check that packets match entries until they expire -# - performance: estimate matching rate, compare with rbtree and hash baselines -TESTS="reported_issues correctness concurrency timeout" -[ "${quicktest}" != "1" ] && TESTS="${TESTS} performance" - -# Set types, defined by TYPE_ variables below -TYPES="net_port port_net net6_port port_proto net6_port_mac net6_port_mac_proto - net_port_net net_mac mac_net net_mac_icmp net6_mac_icmp - net6_port_net6_port net_port_mac_proto_net" - -# Reported bugs, also described by TYPE_ variables below -BUGS="flush_remove_add reload" - -# List of possible paths to pktgen script from kernel tree for performance tests -PKTGEN_SCRIPT_PATHS=" - ../../../../samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh - pktgen/pktgen_bench_xmit_mode_netif_receive.sh" - -# Definition of set types: -# display display text for test report -# type_spec 
nftables set type specifier -# chain_spec nftables type specifier for rules mapping to set -# dst call sequence of format_*() functions for destination fields -# src call sequence of format_*() functions for source fields -# start initial integer used to generate addresses and ports -# count count of entries to generate and match -# src_delta number summed to destination generator for source fields -# tools list of tools for correctness and timeout tests, any can be used -# proto L4 protocol of test packets -# -# race_repeat race attempts per thread, 0 disables concurrency test for type -# flood_tools list of tools for concurrency tests, any can be used -# flood_proto L4 protocol of test packets for concurrency tests -# flood_spec nftables type specifier for concurrency tests -# -# perf_duration duration of single pktgen injection test -# perf_spec nftables type specifier for performance tests -# perf_dst format_*() functions for destination fields in performance test -# perf_src format_*() functions for source fields in performance test -# perf_entries number of set entries for performance test -# perf_proto L3 protocol of test packets -TYPE_net_port=" -display net,port -type_spec ipv4_addr . inet_service -chain_spec ip daddr . udp dport -dst addr4 port -src -start 1 -count 5 -src_delta 2000 -tools sendip nc bash -proto udp - -race_repeat 3 -flood_tools iperf3 iperf netperf -flood_proto udp -flood_spec ip daddr . udp dport - -perf_duration 5 -perf_spec ip daddr . udp dport -perf_dst addr4 port -perf_src -perf_entries 1000 -perf_proto ipv4 -" - -TYPE_port_net=" -display port,net -type_spec inet_service . ipv4_addr -chain_spec udp dport . ip daddr -dst port addr4 -src -start 1 -count 5 -src_delta 2000 -tools sendip socat nc bash -proto udp - -race_repeat 3 -flood_tools iperf3 iperf netperf -flood_proto udp -flood_spec udp dport . ip daddr - -perf_duration 5 -perf_spec udp dport . 
ip daddr -perf_dst port addr4 -perf_src -perf_entries 100 -perf_proto ipv4 -" - -TYPE_net6_port=" -display net6,port -type_spec ipv6_addr . inet_service -chain_spec ip6 daddr . udp dport -dst addr6 port -src -start 10 -count 5 -src_delta 2000 -tools sendip socat nc bash -proto udp6 - -race_repeat 3 -flood_tools iperf3 iperf netperf -flood_proto tcp6 -flood_spec ip6 daddr . udp dport - -perf_duration 5 -perf_spec ip6 daddr . udp dport -perf_dst addr6 port -perf_src -perf_entries 1000 -perf_proto ipv6 -" - -TYPE_port_proto=" -display port,proto -type_spec inet_service . inet_proto -chain_spec udp dport . meta l4proto -dst port proto -src -start 1 -count 5 -src_delta 2000 -tools sendip socat nc bash -proto udp - -race_repeat 0 - -perf_duration 5 -perf_spec udp dport . meta l4proto -perf_dst port proto -perf_src -perf_entries 30000 -perf_proto ipv4 -" - -TYPE_net6_port_mac=" -display net6,port,mac -type_spec ipv6_addr . inet_service . ether_addr -chain_spec ip6 daddr . udp dport . ether saddr -dst addr6 port -src mac -start 10 -count 5 -src_delta 2000 -tools sendip socat nc bash -proto udp6 - -race_repeat 0 - -perf_duration 5 -perf_spec ip6 daddr . udp dport . ether daddr -perf_dst addr6 port mac -perf_src -perf_entries 10 -perf_proto ipv6 -" - -TYPE_net6_port_mac_proto=" -display net6,port,mac,proto -type_spec ipv6_addr . inet_service . ether_addr . inet_proto -chain_spec ip6 daddr . udp dport . ether saddr . meta l4proto -dst addr6 port -src mac proto -start 10 -count 5 -src_delta 2000 -tools sendip socat nc bash -proto udp6 - -race_repeat 0 - -perf_duration 5 -perf_spec ip6 daddr . udp dport . ether daddr . meta l4proto -perf_dst addr6 port mac proto -perf_src -perf_entries 1000 -perf_proto ipv6 -" - -TYPE_net_port_net=" -display net,port,net -type_spec ipv4_addr . inet_service . ipv4_addr -chain_spec ip daddr . udp dport . 
ip saddr -dst addr4 port -src addr4 -start 1 -count 5 -src_delta 2000 -tools sendip socat nc bash -proto udp - -race_repeat 3 -flood_tools iperf3 iperf netperf -flood_proto tcp -flood_spec ip daddr . udp dport . ip saddr - -perf_duration 0 -" - -TYPE_net6_port_net6_port=" -display net6,port,net6,port -type_spec ipv6_addr . inet_service . ipv6_addr . inet_service -chain_spec ip6 daddr . udp dport . ip6 saddr . udp sport -dst addr6 port -src addr6 port -start 10 -count 5 -src_delta 2000 -tools sendip socat nc -proto udp6 - -race_repeat 3 -flood_tools iperf3 iperf netperf -flood_proto tcp6 -flood_spec ip6 daddr . tcp dport . ip6 saddr . tcp sport - -perf_duration 0 -" - -TYPE_net_port_mac_proto_net=" -display net,port,mac,proto,net -type_spec ipv4_addr . inet_service . ether_addr . inet_proto . ipv4_addr -chain_spec ip daddr . udp dport . ether saddr . meta l4proto . ip saddr -dst addr4 port -src mac proto addr4 -start 1 -count 5 -src_delta 2000 -tools sendip socat nc bash -proto udp - -race_repeat 0 - -perf_duration 0 -" - -TYPE_net_mac=" -display net,mac -type_spec ipv4_addr . ether_addr -chain_spec ip daddr . ether saddr -dst addr4 -src mac -start 1 -count 5 -src_delta 2000 -tools sendip socat nc bash -proto udp - -race_repeat 0 - -perf_duration 5 -perf_spec ip daddr . ether daddr -perf_dst addr4 mac -perf_src -perf_entries 1000 -perf_proto ipv4 -" - -TYPE_mac_net=" -display mac,net -type_spec ether_addr . ipv4_addr -chain_spec ether saddr . ip saddr -dst -src mac addr4 -start 1 -count 5 -src_delta 2000 -tools sendip socat nc bash -proto udp - -race_repeat 0 - -perf_duration 0 -" - -TYPE_net_mac_icmp=" -display net,mac - ICMP -type_spec ipv4_addr . ether_addr -chain_spec ip daddr . ether saddr -dst addr4 -src mac -start 1 -count 5 -src_delta 2000 -tools ping -proto icmp - -race_repeat 0 - -perf_duration 0 -" - -TYPE_net6_mac_icmp=" -display net6,mac - ICMPv6 -type_spec ipv6_addr . ether_addr -chain_spec ip6 daddr . 
ether saddr -dst addr6 -src mac -start 10 -count 50 -src_delta 2000 -tools ping -proto icmp6 - -race_repeat 0 - -perf_duration 0 -" - -TYPE_net_port_proto_net=" -display net,port,proto,net -type_spec ipv4_addr . inet_service . inet_proto . ipv4_addr -chain_spec ip daddr . udp dport . meta l4proto . ip saddr -dst addr4 port proto -src addr4 -start 1 -count 5 -src_delta 2000 -tools sendip socat nc -proto udp - -race_repeat 3 -flood_tools iperf3 iperf netperf -flood_proto tcp -flood_spec ip daddr . tcp dport . meta l4proto . ip saddr - -perf_duration 0 -" - -# Definition of tests for bugs reported in the past: -# display display text for test report -TYPE_flush_remove_add=" -display Add two elements, flush, re-add -" - -TYPE_reload=" -display net,mac with reload -type_spec ipv4_addr . ether_addr -chain_spec ip daddr . ether saddr -dst addr4 -src mac -start 1 -count 1 -src_delta 2000 -tools sendip socat nc bash -proto udp - -race_repeat 0 - -perf_duration 0 -" - -# Set template for all tests, types and rules are filled in depending on test -set_template=' -flush ruleset - -table inet filter { - counter test { - packets 0 bytes 0 - } - - set test { - type ${type_spec} - flags interval,timeout - } - - chain input { - type filter hook prerouting priority 0; policy accept; - ${chain_spec} @test counter name \"test\" - } -} - -table netdev perf { - counter test { - packets 0 bytes 0 - } - - counter match { - packets 0 bytes 0 - } - - set test { - type ${type_spec} - flags interval - } - - set norange { - type ${type_spec} - } - - set noconcat { - type ${type_spec%% *} - flags interval - } - - chain test { - type filter hook ingress device veth_a priority 0; - } -} -' - -err_buf= -info_buf= - -# Append string to error buffer -err() { - err_buf="${err_buf}${1} -" -} - -# Append string to information buffer -info() { - info_buf="${info_buf}${1} -" -} - -# Flush error buffer to stdout -err_flush() { - printf "%s" "${err_buf}" - err_buf= -} - -# Flush information buffer to 
stdout -info_flush() { - printf "%s" "${info_buf}" - info_buf= -} - -# Setup veth pair: this namespace receives traffic, B generates it -setup_veth() { - ip netns add B - ip link add veth_a type veth peer name veth_b || return 1 - - ip link set veth_a up - ip link set veth_b netns B - - ip -n B link set veth_b up - - ip addr add dev veth_a 10.0.0.1 - ip route add default dev veth_a - - ip -6 addr add fe80::1/64 dev veth_a nodad - ip -6 addr add 2001:db8::1/64 dev veth_a nodad - ip -6 route add default dev veth_a - - ip -n B route add default dev veth_b - - ip -6 -n B addr add fe80::2/64 dev veth_b nodad - ip -6 -n B addr add 2001:db8::2/64 dev veth_b nodad - ip -6 -n B route add default dev veth_b - - B() { - ip netns exec B "$@" >/dev/null 2>&1 - } - - sleep 2 -} - -# Fill in set template and initialise set -setup_set() { - eval "echo \"${set_template}\"" | nft -f - -} - -# Check that at least one of the needed tools is available -check_tools() { - [ -z "${tools}" ] && return 0 - - __tools= - for tool in ${tools}; do - if [ "${tool}" = "nc" ] && [ "${proto}" = "udp6" ] && \ - ! 
nc -u -w0 1.1.1.1 1 2>/dev/null; then - # Some GNU netcat builds might not support IPv6 - __tools="${__tools} netcat-openbsd" - continue - fi - __tools="${__tools} ${tool}" - - command -v "${tool}" >/dev/null && return 0 - done - err "need one of:${__tools}, skipping" && return 1 -} - -# Set up function to send ICMP packets -setup_send_icmp() { - send_icmp() { - B ping -c1 -W1 "${dst_addr4}" >/dev/null 2>&1 - } -} - -# Set up function to send ICMPv6 packets -setup_send_icmp6() { - if command -v ping6 >/dev/null; then - send_icmp6() { - ip -6 addr add "${dst_addr6}" dev veth_a nodad \ - 2>/dev/null - B ping6 -q -c1 -W1 "${dst_addr6}" - } - else - send_icmp6() { - ip -6 addr add "${dst_addr6}" dev veth_a nodad \ - 2>/dev/null - B ping -q -6 -c1 -W1 "${dst_addr6}" - } - fi -} - -# Set up function to send single UDP packets on IPv4 -setup_send_udp() { - if command -v sendip >/dev/null; then - send_udp() { - [ -n "${src_port}" ] && src_port="-us ${src_port}" - [ -n "${dst_port}" ] && dst_port="-ud ${dst_port}" - [ -n "${src_addr4}" ] && src_addr4="-is ${src_addr4}" - - # shellcheck disable=SC2086 # sendip needs split options - B sendip -p ipv4 -p udp ${src_addr4} ${src_port} \ - ${dst_port} "${dst_addr4}" - - src_port= - dst_port= - src_addr4= - } - elif command -v socat -v >/dev/null; then - send_udp() { - if [ -n "${src_addr4}" ]; then - B ip addr add "${src_addr4}" dev veth_b - __socatbind=",bind=${src_addr4}" - if [ -n "${src_port}" ];then - __socatbind="${__socatbind}:${src_port}" - fi - fi - - ip addr add "${dst_addr4}" dev veth_a 2>/dev/null - [ -z "${dst_port}" ] && dst_port=12345 - - echo "test4" | B socat -t 0.01 STDIN UDP4-DATAGRAM:${dst_addr4}:${dst_port}"${__socatbind}" - - src_addr4= - src_port= - } - elif command -v nc >/dev/null; then - if nc -u -w0 1.1.1.1 1 2>/dev/null; then - # OpenBSD netcat - nc_opt="-w0" - else - # GNU netcat - nc_opt="-q0" - fi - - send_udp() { - if [ -n "${src_addr4}" ]; then - B ip addr add "${src_addr4}" dev veth_b - 
__src_addr4="-s ${src_addr4}" - fi - ip addr add "${dst_addr4}" dev veth_a 2>/dev/null - [ -n "${src_port}" ] && src_port="-p ${src_port}" - - echo "" | B nc -u "${nc_opt}" "${__src_addr4}" \ - "${src_port}" "${dst_addr4}" "${dst_port}" - - src_addr4= - src_port= - } - elif [ -z "$(bash -c 'type -p')" ]; then - send_udp() { - ip addr add "${dst_addr4}" dev veth_a 2>/dev/null - if [ -n "${src_addr4}" ]; then - B ip addr add "${src_addr4}/16" dev veth_b - B ip route add default dev veth_b - fi - - B bash -c "echo > /dev/udp/${dst_addr4}/${dst_port}" - - if [ -n "${src_addr4}" ]; then - B ip addr del "${src_addr4}/16" dev veth_b - fi - src_addr4= - } - else - return 1 - fi -} - -# Set up function to send single UDP packets on IPv6 -setup_send_udp6() { - if command -v sendip >/dev/null; then - send_udp6() { - [ -n "${src_port}" ] && src_port="-us ${src_port}" - [ -n "${dst_port}" ] && dst_port="-ud ${dst_port}" - if [ -n "${src_addr6}" ]; then - src_addr6="-6s ${src_addr6}" - else - src_addr6="-6s 2001:db8::2" - fi - ip -6 addr add "${dst_addr6}" dev veth_a nodad \ - 2>/dev/null - - # shellcheck disable=SC2086 # this needs split options - B sendip -p ipv6 -p udp ${src_addr6} ${src_port} \ - ${dst_port} "${dst_addr6}" - - src_port= - dst_port= - src_addr6= - } - elif command -v socat -v >/dev/null; then - send_udp6() { - ip -6 addr add "${dst_addr6}" dev veth_a nodad \ - 2>/dev/null - - __socatbind6= - - if [ -n "${src_addr6}" ]; then - if [ -n "${src_addr6} != "${src_addr6_added} ]; then - B ip addr add "${src_addr6}" dev veth_b nodad - - src_addr6_added=${src_addr6} - fi - - __socatbind6=",bind=[${src_addr6}]" - - if [ -n "${src_port}" ] ;then - __socatbind6="${__socatbind6}:${src_port}" - fi - fi - - echo "test6" | B socat -t 0.01 STDIN UDP6-DATAGRAM:[${dst_addr6}]:${dst_port}"${__socatbind6}" - } - elif command -v nc >/dev/null && nc -u -w0 1.1.1.1 1 2>/dev/null; then - # GNU netcat might not work with IPv6, try next tool - send_udp6() { - ip -6 addr add 
"${dst_addr6}" dev veth_a nodad \ - 2>/dev/null - if [ -n "${src_addr6}" ]; then - B ip addr add "${src_addr6}" dev veth_b nodad - else - src_addr6="2001:db8::2" - fi - [ -n "${src_port}" ] && src_port="-p ${src_port}" - - # shellcheck disable=SC2086 # this needs split options - echo "" | B nc -u w0 "-s${src_addr6}" ${src_port} \ - ${dst_addr6} ${dst_port} - - src_addr6= - src_port= - } - elif [ -z "$(bash -c 'type -p')" ]; then - send_udp6() { - ip -6 addr add "${dst_addr6}" dev veth_a nodad \ - 2>/dev/null - B ip addr add "${src_addr6}" dev veth_b nodad - B bash -c "echo > /dev/udp/${dst_addr6}/${dst_port}" - ip -6 addr del "${dst_addr6}" dev veth_a 2>/dev/null - } - else - return 1 - fi -} - -# Set up function to send TCP traffic on IPv4 -setup_flood_tcp() { - if command -v iperf3 >/dev/null; then - flood_tcp() { - [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" - if [ -n "${src_addr4}" ]; then - B ip addr add "${src_addr4}/16" dev veth_b - src_addr4="-B ${src_addr4}" - else - B ip addr add dev veth_b 10.0.0.2 - src_addr4="-B 10.0.0.2" - fi - if [ -n "${src_port}" ]; then - src_port="--cport ${src_port}" - fi - B ip route add default dev veth_b 2>/dev/null - ip addr add "${dst_addr4}" dev veth_a 2>/dev/null - - # shellcheck disable=SC2086 # this needs split options - iperf3 -s -DB "${dst_addr4}" ${dst_port} >/dev/null 2>&1 - sleep 2 - - # shellcheck disable=SC2086 # this needs split options - B iperf3 -c "${dst_addr4}" ${dst_port} ${src_port} \ - ${src_addr4} -l16 -t 1000 - - src_addr4= - src_port= - dst_port= - } - elif command -v iperf >/dev/null; then - flood_tcp() { - [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" - if [ -n "${src_addr4}" ]; then - B ip addr add "${src_addr4}/16" dev veth_b - src_addr4="-B ${src_addr4}" - else - B ip addr add dev veth_b 10.0.0.2 2>/dev/null - src_addr4="-B 10.0.0.2" - fi - if [ -n "${src_port}" ]; then - src_addr4="${src_addr4}:${src_port}" - fi - B ip route add default dev veth_b - ip addr add "${dst_addr4}" dev veth_a 
2>/dev/null - - # shellcheck disable=SC2086 # this needs split options - iperf -s -DB "${dst_addr4}" ${dst_port} >/dev/null 2>&1 - sleep 2 - - # shellcheck disable=SC2086 # this needs split options - B iperf -c "${dst_addr4}" ${dst_port} ${src_addr4} \ - -l20 -t 1000 - - src_addr4= - src_port= - dst_port= - } - elif command -v netperf >/dev/null; then - flood_tcp() { - [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" - if [ -n "${src_addr4}" ]; then - B ip addr add "${src_addr4}/16" dev veth_b - else - B ip addr add dev veth_b 10.0.0.2 - src_addr4="10.0.0.2" - fi - if [ -n "${src_port}" ]; then - dst_port="${dst_port},${src_port}" - fi - B ip route add default dev veth_b - ip addr add "${dst_addr4}" dev veth_a 2>/dev/null - - # shellcheck disable=SC2086 # this needs split options - netserver -4 ${dst_port} -L "${dst_addr4}" \ - >/dev/null 2>&1 - sleep 2 - - # shellcheck disable=SC2086 # this needs split options - B netperf -4 -H "${dst_addr4}" ${dst_port} \ - -L "${src_addr4}" -l 1000 -t TCP_STREAM - - src_addr4= - src_port= - dst_port= - } - else - return 1 - fi -} - -# Set up function to send TCP traffic on IPv6 -setup_flood_tcp6() { - if command -v iperf3 >/dev/null; then - flood_tcp6() { - [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" - if [ -n "${src_addr6}" ]; then - B ip addr add "${src_addr6}" dev veth_b nodad - src_addr6="-B ${src_addr6}" - else - src_addr6="-B 2001:db8::2" - fi - if [ -n "${src_port}" ]; then - src_port="--cport ${src_port}" - fi - B ip route add default dev veth_b - ip -6 addr add "${dst_addr6}" dev veth_a nodad \ - 2>/dev/null - - # shellcheck disable=SC2086 # this needs split options - iperf3 -s -DB "${dst_addr6}" ${dst_port} >/dev/null 2>&1 - sleep 2 - - # shellcheck disable=SC2086 # this needs split options - B iperf3 -c "${dst_addr6}" ${dst_port} \ - ${src_port} ${src_addr6} -l16 -t 1000 - - src_addr6= - src_port= - dst_port= - } - elif command -v iperf >/dev/null; then - flood_tcp6() { - [ -n "${dst_port}" ] && dst_port="-p 
${dst_port}" - if [ -n "${src_addr6}" ]; then - B ip addr add "${src_addr6}" dev veth_b nodad - src_addr6="-B ${src_addr6}" - else - src_addr6="-B 2001:db8::2" - fi - if [ -n "${src_port}" ]; then - src_addr6="${src_addr6}:${src_port}" - fi - B ip route add default dev veth_b - ip -6 addr add "${dst_addr6}" dev veth_a nodad \ - 2>/dev/null - - # shellcheck disable=SC2086 # this needs split options - iperf -s -VDB "${dst_addr6}" ${dst_port} >/dev/null 2>&1 - sleep 2 - - # shellcheck disable=SC2086 # this needs split options - B iperf -c "${dst_addr6}" -V ${dst_port} \ - ${src_addr6} -l1 -t 1000 - - src_addr6= - src_port= - dst_port= - } - elif command -v netperf >/dev/null; then - flood_tcp6() { - [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" - if [ -n "${src_addr6}" ]; then - B ip addr add "${src_addr6}" dev veth_b nodad - else - src_addr6="2001:db8::2" - fi - if [ -n "${src_port}" ]; then - dst_port="${dst_port},${src_port}" - fi - B ip route add default dev veth_b - ip -6 addr add "${dst_addr6}" dev veth_a nodad \ - 2>/dev/null - - # shellcheck disable=SC2086 # this needs split options - netserver -6 ${dst_port} -L "${dst_addr6}" \ - >/dev/null 2>&1 - sleep 2 - - # shellcheck disable=SC2086 # this needs split options - B netperf -6 -H "${dst_addr6}" ${dst_port} \ - -L "${src_addr6}" -l 1000 -t TCP_STREAM - - src_addr6= - src_port= - dst_port= - } - else - return 1 - fi -} - -# Set up function to send UDP traffic on IPv4 -setup_flood_udp() { - if command -v iperf3 >/dev/null; then - flood_udp() { - [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" - if [ -n "${src_addr4}" ]; then - B ip addr add "${src_addr4}/16" dev veth_b - src_addr4="-B ${src_addr4}" - else - B ip addr add dev veth_b 10.0.0.2 2>/dev/null - src_addr4="-B 10.0.0.2" - fi - if [ -n "${src_port}" ]; then - src_port="--cport ${src_port}" - fi - B ip route add default dev veth_b - ip addr add "${dst_addr4}" dev veth_a 2>/dev/null - - # shellcheck disable=SC2086 # this needs split options - iperf3 
-s -DB "${dst_addr4}" ${dst_port} - sleep 2 - - # shellcheck disable=SC2086 # this needs split options - B iperf3 -u -c "${dst_addr4}" -Z -b 100M -l16 -t1000 \ - ${dst_port} ${src_port} ${src_addr4} - - src_addr4= - src_port= - dst_port= - } - elif command -v iperf >/dev/null; then - flood_udp() { - [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" - if [ -n "${src_addr4}" ]; then - B ip addr add "${src_addr4}/16" dev veth_b - src_addr4="-B ${src_addr4}" - else - B ip addr add dev veth_b 10.0.0.2 - src_addr4="-B 10.0.0.2" - fi - if [ -n "${src_port}" ]; then - src_addr4="${src_addr4}:${src_port}" - fi - B ip route add default dev veth_b - ip addr add "${dst_addr4}" dev veth_a 2>/dev/null - - # shellcheck disable=SC2086 # this needs split options - iperf -u -sDB "${dst_addr4}" ${dst_port} >/dev/null 2>&1 - sleep 2 - - # shellcheck disable=SC2086 # this needs split options - B iperf -u -c "${dst_addr4}" -b 100M -l1 -t1000 \ - ${dst_port} ${src_addr4} - - src_addr4= - src_port= - dst_port= - } - elif command -v netperf >/dev/null; then - flood_udp() { - [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" - if [ -n "${src_addr4}" ]; then - B ip addr add "${src_addr4}/16" dev veth_b - else - B ip addr add dev veth_b 10.0.0.2 - src_addr4="10.0.0.2" - fi - if [ -n "${src_port}" ]; then - dst_port="${dst_port},${src_port}" - fi - B ip route add default dev veth_b - ip addr add "${dst_addr4}" dev veth_a 2>/dev/null - - # shellcheck disable=SC2086 # this needs split options - netserver -4 ${dst_port} -L "${dst_addr4}" \ - >/dev/null 2>&1 - sleep 2 - - # shellcheck disable=SC2086 # this needs split options - B netperf -4 -H "${dst_addr4}" ${dst_port} \ - -L "${src_addr4}" -l 1000 -t UDP_STREAM - - src_addr4= - src_port= - dst_port= - } - else - return 1 - fi -} - -# Find pktgen script and set up function to start pktgen injection -setup_perf() { - for pktgen_script_path in ${PKTGEN_SCRIPT_PATHS} __notfound; do - command -v "${pktgen_script_path}" >/dev/null && break - done - [ 
"${pktgen_script_path}" = "__notfound" ] && return 1 - - perf_ipv4() { - ${pktgen_script_path} -s80 \ - -i veth_a -d "${dst_addr4}" -p "${dst_port}" \ - -m "${dst_mac}" \ - -t $(($(nproc) / 5 + 1)) -b10000 -n0 2>/dev/null & - perf_pid=$! - } - perf_ipv6() { - IP6=6 ${pktgen_script_path} -s100 \ - -i veth_a -d "${dst_addr6}" -p "${dst_port}" \ - -m "${dst_mac}" \ - -t $(($(nproc) / 5 + 1)) -b10000 -n0 2>/dev/null & - perf_pid=$! - } -} - -# Clean up before each test -cleanup() { - nft reset counter inet filter test >/dev/null 2>&1 - nft flush ruleset >/dev/null 2>&1 - ip link del dummy0 2>/dev/null - ip route del default 2>/dev/null - ip -6 route del default 2>/dev/null - ip netns del B 2>/dev/null - ip link del veth_a 2>/dev/null - timeout= - killall iperf3 2>/dev/null - killall iperf 2>/dev/null - killall netperf 2>/dev/null - killall netserver 2>/dev/null - rm -f ${tmp} - sleep 2 -} - -# Entry point for setup functions -setup() { - if [ "$(id -u)" -ne 0 ]; then - echo " need to run as root" - exit ${KSELFTEST_SKIP} - fi - - cleanup - check_tools || return 1 - for arg do - if ! 
eval setup_"${arg}"; then - err " ${arg} not supported" - return 1 - fi - done -} - -# Format integer into IPv4 address, summing 10.0.0.5 (arbitrary) to it -format_addr4() { - a=$((${1} + 16777216 * 10 + 5)) - printf "%i.%i.%i.%i" \ - "$((a / 16777216))" "$((a % 16777216 / 65536))" \ - "$((a % 65536 / 256))" "$((a % 256))" -} - -# Format integer into IPv6 address, summing 2001:db8:: to it -format_addr6() { - printf "2001:db8::%04x:%04x" "$((${1} / 65536))" "$((${1} % 65536))" -} - -# Format integer into EUI-48 address, summing 00:01:00:00:00:00 to it -format_mac() { - printf "00:01:%02x:%02x:%02x:%02x" \ - "$((${1} / 16777216))" "$((${1} % 16777216 / 65536))" \ - "$((${1} % 65536 / 256))" "$((${1} % 256))" -} - -# Format integer into port, avoid 0 port -format_port() { - printf "%i" "$((${1} % 65534 + 1))" -} - -# Drop suffixed '6' from L4 protocol, if any -format_proto() { - printf "%s" "${proto}" | tr -d 6 -} - -# Format destination and source fields into nft concatenated type -format() { - __start= - __end= - __expr="{ " - - for f in ${dst}; do - [ "${__expr}" != "{ " ] && __expr="${__expr} . " - - __start="$(eval format_"${f}" "${start}")" - __end="$(eval format_"${f}" "${end}")" - - if [ "${f}" = "proto" ]; then - __expr="${__expr}${__start}" - else - __expr="${__expr}${__start}-${__end}" - fi - done - for f in ${src}; do - [ "${__expr}" != "{ " ] && __expr="${__expr} . " - - __start="$(eval format_"${f}" "${srcstart}")" - __end="$(eval format_"${f}" "${srcend}")" - - if [ "${f}" = "proto" ]; then - __expr="${__expr}${__start}" - else - __expr="${__expr}${__start}-${__end}" - fi - done - - if [ -n "${timeout}" ]; then - echo "${__expr} timeout ${timeout}s }" - else - echo "${__expr} }" - fi -} - -# Format destination and source fields into nft type, start element only -format_norange() { - __expr="{ " - - for f in ${dst}; do - [ "${__expr}" != "{ " ] && __expr="${__expr} . 
" - - __expr="${__expr}$(eval format_"${f}" "${start}")" - done - for f in ${src}; do - __expr="${__expr} . $(eval format_"${f}" "${start}")" - done - - echo "${__expr} }" -} - -# Format first destination field into nft type -format_noconcat() { - for f in ${dst}; do - __start="$(eval format_"${f}" "${start}")" - __end="$(eval format_"${f}" "${end}")" - - if [ "${f}" = "proto" ]; then - echo "{ ${__start} }" - else - echo "{ ${__start}-${__end} }" - fi - return - done -} - -# Add single entry to 'test' set in 'inet filter' table -add() { - if ! nft add element inet filter test "${1}"; then - err "Failed to add ${1} given ruleset:" - err "$(nft -a list ruleset)" - return 1 - fi -} - -# Format and output entries for sets in 'netdev perf' table -add_perf() { - if [ "${1}" = "test" ]; then - echo "add element netdev perf test $(format)" - elif [ "${1}" = "norange" ]; then - echo "add element netdev perf norange $(format_norange)" - elif [ "${1}" = "noconcat" ]; then - echo "add element netdev perf noconcat $(format_noconcat)" - fi -} - -# Add single entry to 'norange' set in 'netdev perf' table -add_perf_norange() { - if ! nft add element netdev perf norange "${1}"; then - err "Failed to add ${1} given ruleset:" - err "$(nft -a list ruleset)" - return 1 - fi -} - -# Add single entry to 'noconcat' set in 'netdev perf' table -add_perf_noconcat() { - if ! nft add element netdev perf noconcat "${1}"; then - err "Failed to add ${1} given ruleset:" - err "$(nft -a list ruleset)" - return 1 - fi -} - -# Delete single entry from set -del() { - if ! 
nft delete element inet filter test "${1}"; then - err "Failed to delete ${1} given ruleset:" - err "$(nft -a list ruleset)" - return 1 - fi -} - -# Return packet count from 'test' counter in 'inet filter' table -count_packets() { - found=0 - for token in $(nft list counter inet filter test); do - [ ${found} -eq 1 ] && echo "${token}" && return - [ "${token}" = "packets" ] && found=1 - done -} - -# Return packet count from 'test' counter in 'netdev perf' table -count_perf_packets() { - found=0 - for token in $(nft list counter netdev perf test); do - [ ${found} -eq 1 ] && echo "${token}" && return - [ "${token}" = "packets" ] && found=1 - done -} - -# Set MAC addresses, send traffic according to specifier -flood() { - ip link set veth_a address "$(format_mac "${1}")" - ip -n B link set veth_b address "$(format_mac "${2}")" - - for f in ${dst}; do - eval dst_"$f"=\$\(format_\$f "${1}"\) - done - for f in ${src}; do - eval src_"$f"=\$\(format_\$f "${2}"\) - done - eval flood_\$proto -} - -# Set MAC addresses, start pktgen injection -perf() { - dst_mac="$(format_mac "${1}")" - ip link set veth_a address "${dst_mac}" - - for f in ${dst}; do - eval dst_"$f"=\$\(format_\$f "${1}"\) - done - for f in ${src}; do - eval src_"$f"=\$\(format_\$f "${2}"\) - done - eval perf_\$perf_proto -} - -# Set MAC addresses, send single packet, check that it matches, reset counter -send_match() { - ip link set veth_a address "$(format_mac "${1}")" - ip -n B link set veth_b address "$(format_mac "${2}")" - - for f in ${dst}; do - eval dst_"$f"=\$\(format_\$f "${1}"\) - done - for f in ${src}; do - eval src_"$f"=\$\(format_\$f "${2}"\) - done - eval send_\$proto - if [ "$(count_packets)" != "1" ]; then - err "${proto} packet to:" - err " $(for f in ${dst}; do - eval format_\$f "${1}"; printf ' '; done)" - err "from:" - err " $(for f in ${src}; do - eval format_\$f "${2}"; printf ' '; done)" - err "should have matched ruleset:" - err "$(nft -a list ruleset)" - return 1 - fi - nft reset 
counter inet filter test >/dev/null -} - -# Set MAC addresses, send single packet, check that it doesn't match -send_nomatch() { - ip link set veth_a address "$(format_mac "${1}")" - ip -n B link set veth_b address "$(format_mac "${2}")" - - for f in ${dst}; do - eval dst_"$f"=\$\(format_\$f "${1}"\) - done - for f in ${src}; do - eval src_"$f"=\$\(format_\$f "${2}"\) - done - eval send_\$proto - if [ "$(count_packets)" != "0" ]; then - err "${proto} packet to:" - err " $(for f in ${dst}; do - eval format_\$f "${1}"; printf ' '; done)" - err "from:" - err " $(for f in ${src}; do - eval format_\$f "${2}"; printf ' '; done)" - err "should not have matched ruleset:" - err "$(nft -a list ruleset)" - return 1 - fi -} - -# Correctness test template: -# - add ranged element, check that packets match it -# - check that packets outside range don't match it -# - remove some elements, check that packets don't match anymore -test_correctness() { - setup veth send_"${proto}" set || return ${KSELFTEST_SKIP} - - range_size=1 - for i in $(seq "${start}" $((start + count))); do - end=$((start + range_size)) - - # Avoid negative or zero-sized port ranges - if [ $((end / 65534)) -gt $((start / 65534)) ]; then - start=${end} - end=$((end + 1)) - fi - srcstart=$((start + src_delta)) - srcend=$((end + src_delta)) - - add "$(format)" || return 1 - for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do - send_match "${j}" $((j + src_delta)) || return 1 - done - send_nomatch $((end + 1)) $((end + 1 + src_delta)) || return 1 - - # Delete elements now and then - if [ $((i % 3)) -eq 0 ]; then - del "$(format)" || return 1 - for j in $(seq ${start} \ - $((range_size / 2 + 1)) ${end}); do - send_nomatch "${j}" $((j + src_delta)) \ - || return 1 - done - fi - - range_size=$((range_size + 1)) - start=$((end + range_size)) - done -} - -# Concurrency test template: -# - add all the elements -# - start a thread for each physical thread that: -# - adds all the elements -# - flushes the set -# - 
adds all the elements -# - flushes the entire ruleset -# - adds the set back -# - adds all the elements -# - delete all the elements -test_concurrency() { - proto=${flood_proto} - tools=${flood_tools} - chain_spec=${flood_spec} - setup veth flood_"${proto}" set || return ${KSELFTEST_SKIP} - - range_size=1 - cstart=${start} - flood_pids= - for i in $(seq ${start} $((start + count))); do - end=$((start + range_size)) - srcstart=$((start + src_delta)) - srcend=$((end + src_delta)) - - add "$(format)" || return 1 - - flood "${i}" $((i + src_delta)) & flood_pids="${flood_pids} $!" - - range_size=$((range_size + 1)) - start=$((end + range_size)) - done - - sleep 10 - - pids= - for c in $(seq 1 "$(nproc)"); do ( - for r in $(seq 1 "${race_repeat}"); do - range_size=1 - - # $start needs to be local to this subshell - # shellcheck disable=SC2030 - start=${cstart} - for i in $(seq ${start} $((start + count))); do - end=$((start + range_size)) - srcstart=$((start + src_delta)) - srcend=$((end + src_delta)) - - add "$(format)" 2>/dev/null - - range_size=$((range_size + 1)) - start=$((end + range_size)) - done - - nft flush inet filter test 2>/dev/null - - range_size=1 - start=${cstart} - for i in $(seq ${start} $((start + count))); do - end=$((start + range_size)) - srcstart=$((start + src_delta)) - srcend=$((end + src_delta)) - - add "$(format)" 2>/dev/null - - range_size=$((range_size + 1)) - start=$((end + range_size)) - done - - nft flush ruleset - setup set 2>/dev/null - - range_size=1 - start=${cstart} - for i in $(seq ${start} $((start + count))); do - end=$((start + range_size)) - srcstart=$((start + src_delta)) - srcend=$((end + src_delta)) - - add "$(format)" 2>/dev/null - - range_size=$((range_size + 1)) - start=$((end + range_size)) - done - - range_size=1 - start=${cstart} - for i in $(seq ${start} $((start + count))); do - end=$((start + range_size)) - srcstart=$((start + src_delta)) - srcend=$((end + src_delta)) - - del "$(format)" 2>/dev/null - - 
range_size=$((range_size + 1)) - start=$((end + range_size)) - done - done - ) & pids="${pids} $!" - done - - # shellcheck disable=SC2046,SC2086 # word splitting wanted here - wait $(for pid in ${pids}; do echo ${pid}; done) - # shellcheck disable=SC2046,SC2086 - kill $(for pid in ${flood_pids}; do echo ${pid}; done) 2>/dev/null - # shellcheck disable=SC2046,SC2086 - wait $(for pid in ${flood_pids}; do echo ${pid}; done) 2>/dev/null - - return 0 -} - -# Timeout test template: -# - add all the elements with 3s timeout while checking that packets match -# - wait 3s after the last insertion, check that packets don't match any entry -test_timeout() { - setup veth send_"${proto}" set || return ${KSELFTEST_SKIP} - - timeout=3 - range_size=1 - for i in $(seq "${start}" $((start + count))); do - end=$((start + range_size)) - srcstart=$((start + src_delta)) - srcend=$((end + src_delta)) - - add "$(format)" || return 1 - - for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do - send_match "${j}" $((j + src_delta)) || return 1 - done - - range_size=$((range_size + 1)) - start=$((end + range_size)) - done - sleep 3 - for i in $(seq ${start} $((start + count))); do - end=$((start + range_size)) - srcstart=$((start + src_delta)) - srcend=$((end + src_delta)) - - for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do - send_nomatch "${j}" $((j + src_delta)) || return 1 - done - - range_size=$((range_size + 1)) - start=$((end + range_size)) - done -} - -# Performance test template: -# - add concatenated ranged entries -# - add non-ranged concatenated entries (for hash set matching rate baseline) -# - add ranged entries with first field only (for rbhash baseline) -# - start pktgen injection directly on device rx path of this namespace -# - measure drop only rate, hash and rbtree baselines, then matching rate -test_performance() { - chain_spec=${perf_spec} - dst="${perf_dst}" - src="${perf_src}" - setup veth perf set || return ${KSELFTEST_SKIP} - - first=${start} - 
range_size=1 - for set in test norange noconcat; do - start=${first} - for i in $(seq ${start} $((start + perf_entries))); do - end=$((start + range_size)) - srcstart=$((start + src_delta)) - srcend=$((end + src_delta)) - - if [ $((end / 65534)) -gt $((start / 65534)) ]; then - start=${end} - end=$((end + 1)) - elif [ ${start} -eq ${end} ]; then - end=$((start + 1)) - fi - - add_perf ${set} - - start=$((end + range_size)) - done > "${tmp}" - nft -f "${tmp}" - done - - perf $((end - 1)) ${srcstart} - - sleep 2 - - nft add rule netdev perf test counter name \"test\" drop - nft reset counter netdev perf test >/dev/null 2>&1 - sleep "${perf_duration}" - pps="$(printf %10s $(($(count_perf_packets) / perf_duration)))" - info " baseline (drop from netdev hook): ${pps}pps" - handle="$(nft -a list chain netdev perf test | grep counter)" - handle="${handle##* }" - nft delete rule netdev perf test handle "${handle}" - - nft add rule "netdev perf test ${chain_spec} @norange \ - counter name \"test\" drop" - nft reset counter netdev perf test >/dev/null 2>&1 - sleep "${perf_duration}" - pps="$(printf %10s $(($(count_perf_packets) / perf_duration)))" - info " baseline hash (non-ranged entries): ${pps}pps" - handle="$(nft -a list chain netdev perf test | grep counter)" - handle="${handle##* }" - nft delete rule netdev perf test handle "${handle}" - - nft add rule "netdev perf test ${chain_spec%%. 
*} @noconcat \ - counter name \"test\" drop" - nft reset counter netdev perf test >/dev/null 2>&1 - sleep "${perf_duration}" - pps="$(printf %10s $(($(count_perf_packets) / perf_duration)))" - info " baseline rbtree (match on first field only): ${pps}pps" - handle="$(nft -a list chain netdev perf test | grep counter)" - handle="${handle##* }" - nft delete rule netdev perf test handle "${handle}" - - nft add rule "netdev perf test ${chain_spec} @test \ - counter name \"test\" drop" - nft reset counter netdev perf test >/dev/null 2>&1 - sleep "${perf_duration}" - pps="$(printf %10s $(($(count_perf_packets) / perf_duration)))" - p5="$(printf %5s "${perf_entries}")" - info " set with ${p5} full, ranged entries: ${pps}pps" - kill "${perf_pid}" -} - -test_bug_flush_remove_add() { - set_cmd='{ set s { type ipv4_addr . inet_service; flags interval; }; }' - elem1='{ 10.0.0.1 . 22-25, 10.0.0.1 . 10-20 }' - elem2='{ 10.0.0.1 . 10-20, 10.0.0.1 . 22-25 }' - for i in `seq 1 100`; do - nft add table t ${set_cmd} || return ${KSELFTEST_SKIP} - nft add element t s ${elem1} 2>/dev/null || return 1 - nft flush set t s 2>/dev/null || return 1 - nft add element t s ${elem2} 2>/dev/null || return 1 - done - nft flush ruleset -} - -# - add ranged element, check that packets match it -# - reload the set, check packets still match -test_bug_reload() { - setup veth send_"${proto}" set || return ${KSELFTEST_SKIP} - rstart=${start} - - range_size=1 - for i in $(seq "${start}" $((start + count))); do - end=$((start + range_size)) - - # Avoid negative or zero-sized port ranges - if [ $((end / 65534)) -gt $((start / 65534)) ]; then - start=${end} - end=$((end + 1)) - fi - srcstart=$((start + src_delta)) - srcend=$((end + src_delta)) - - add "$(format)" || return 1 - range_size=$((range_size + 1)) - start=$((end + range_size)) - done - - # check kernel does allocate pcpu sctrach map - # for reload with no elemet add/delete - ( echo flush set inet filter test ; - nft list set inet filter test ) | 
nft -f - - - start=${rstart} - range_size=1 - - for i in $(seq "${start}" $((start + count))); do - end=$((start + range_size)) - - # Avoid negative or zero-sized port ranges - if [ $((end / 65534)) -gt $((start / 65534)) ]; then - start=${end} - end=$((end + 1)) - fi - srcstart=$((start + src_delta)) - srcend=$((end + src_delta)) - - for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do - send_match "${j}" $((j + src_delta)) || return 1 - done - - range_size=$((range_size + 1)) - start=$((end + range_size)) - done - - nft flush ruleset -} - -test_reported_issues() { - eval test_bug_"${subtest}" -} - -# Run everything in a separate network namespace -[ "${1}" != "run" ] && { unshare -n "${0}" run; exit $?; } -tmp="$(mktemp)" -trap cleanup EXIT - -# Entry point for test runs -passed=0 -for name in ${TESTS}; do - printf "TEST: %s\n" "$(echo ${name} | tr '_' ' ')" - if [ "${name}" = "reported_issues" ]; then - SUBTESTS="${BUGS}" - else - SUBTESTS="${TYPES}" - fi - - for subtest in ${SUBTESTS}; do - eval desc=\$TYPE_"${subtest}" - IFS=' -' - for __line in ${desc}; do - # shellcheck disable=SC2086 - eval ${__line%% *}=\"${__line##* }\"; - done - IFS=' -' - - if [ "${name}" = "concurrency" ] && \ - [ "${race_repeat}" = "0" ]; then - continue - fi - if [ "${name}" = "performance" ] && \ - [ "${perf_duration}" = "0" ]; then - continue - fi - - printf " %-60s " "${display}" - eval test_"${name}" - ret=$? 
- - if [ $ret -eq 0 ]; then - printf "[ OK ]\n" - info_flush - passed=$((passed + 1)) - elif [ $ret -eq 1 ]; then - printf "[FAIL]\n" - err_flush - exit 1 - elif [ $ret -eq ${KSELFTEST_SKIP} ]; then - printf "[SKIP]\n" - err_flush - fi - done -done - -[ ${passed} -eq 0 ] && exit ${KSELFTEST_SKIP} || exit 0 diff --git a/tools/testing/selftests/netfilter/nft_conntrack_helper.sh b/tools/testing/selftests/netfilter/nft_conntrack_helper.sh deleted file mode 100755 index faa7778d7b..0000000000 --- a/tools/testing/selftests/netfilter/nft_conntrack_helper.sh +++ /dev/null @@ -1,197 +0,0 @@ -#!/bin/bash -# -# This tests connection tracking helper assignment: -# 1. can attach ftp helper to a connection from nft ruleset. -# 2. auto-assign still works. -# -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" -testipv6=1 - -cleanup() -{ - ip netns del ${ns1} - ip netns del ${ns2} -} - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -conntrack -V > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without conntrack tool" - exit $ksft_skip -fi - -which nc >/dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without netcat tool" - exit $ksft_skip -fi - -trap cleanup EXIT - -ip netns add ${ns1} -ip netns add ${ns2} - -ip link add veth0 netns ${ns1} type veth peer name veth0 netns ${ns2} > /dev/null 2>&1 -if [ $? 
-ne 0 ];then - echo "SKIP: No virtual ethernet pair device support in kernel" - exit $ksft_skip -fi - -ip -net ${ns1} link set lo up -ip -net ${ns1} link set veth0 up - -ip -net ${ns2} link set lo up -ip -net ${ns2} link set veth0 up - -ip -net ${ns1} addr add 10.0.1.1/24 dev veth0 -ip -net ${ns1} addr add dead:1::1/64 dev veth0 - -ip -net ${ns2} addr add 10.0.1.2/24 dev veth0 -ip -net ${ns2} addr add dead:1::2/64 dev veth0 - -load_ruleset_family() { - local family=$1 - local ns=$2 - -ip netns exec ${ns} nft -f - < /dev/null |grep -q 'helper=ftp' - if [ $? -ne 0 ] ; then - if [ $autoassign -eq 0 ] ;then - echo "FAIL: ${netns} did not show attached helper $message" 1>&2 - ret=1 - else - echo "PASS: ${netns} did not show attached helper $message" 1>&2 - fi - else - if [ $autoassign -eq 0 ] ;then - echo "PASS: ${netns} connection on port $port has ftp helper attached" 1>&2 - else - echo "FAIL: ${netns} connection on port $port has ftp helper attached" 1>&2 - ret=1 - fi - fi - - return 0 -} - -test_helper() -{ - local port=$1 - local autoassign=$2 - - if [ $autoassign -eq 0 ] ;then - msg="set via ruleset" - else - msg="auto-assign" - fi - - sleep 3 | ip netns exec ${ns2} nc -w 2 -l -p $port > /dev/null & - - sleep 1 | ip netns exec ${ns1} nc -w 2 10.0.1.2 $port > /dev/null & - sleep 1 - - check_for_helper "$ns1" "ip $msg" $port $autoassign - check_for_helper "$ns2" "ip $msg" $port $autoassign - - wait - - if [ $testipv6 -eq 0 ] ;then - return 0 - fi - - ip netns exec ${ns1} conntrack -F 2> /dev/null - ip netns exec ${ns2} conntrack -F 2> /dev/null - - sleep 3 | ip netns exec ${ns2} nc -w 2 -6 -l -p $port > /dev/null & - - sleep 1 | ip netns exec ${ns1} nc -w 2 -6 dead:1::2 $port > /dev/null & - sleep 1 - - check_for_helper "$ns1" "ipv6 $msg" $port - check_for_helper "$ns2" "ipv6 $msg" $port - - wait -} - -load_ruleset_family ip ${ns1} -if [ $? -ne 0 ];then - echo "FAIL: ${ns1} cannot load ip ruleset" 1>&2 - exit 1 -fi - -load_ruleset_family ip6 ${ns1} -if [ $? 
-ne 0 ];then - echo "SKIP: ${ns1} cannot load ip6 ruleset" 1>&2 - testipv6=0 -fi - -load_ruleset_family inet ${ns2} -if [ $? -ne 0 ];then - echo "SKIP: ${ns1} cannot load inet ruleset" 1>&2 - load_ruleset_family ip ${ns2} - if [ $? -ne 0 ];then - echo "FAIL: ${ns2} cannot load ip ruleset" 1>&2 - exit 1 - fi - - if [ $testipv6 -eq 1 ] ;then - load_ruleset_family ip6 ${ns2} - if [ $? -ne 0 ];then - echo "FAIL: ${ns2} cannot load ip6 ruleset" 1>&2 - exit 1 - fi - fi -fi - -test_helper 2121 0 -ip netns exec ${ns1} sysctl -qe 'net.netfilter.nf_conntrack_helper=1' -ip netns exec ${ns2} sysctl -qe 'net.netfilter.nf_conntrack_helper=1' -test_helper 21 1 - -exit $ret diff --git a/tools/testing/selftests/netfilter/nft_fib.sh b/tools/testing/selftests/netfilter/nft_fib.sh deleted file mode 100755 index dff476e45e..0000000000 --- a/tools/testing/selftests/netfilter/nft_fib.sh +++ /dev/null @@ -1,273 +0,0 @@ -#!/bin/bash -# -# This tests the fib expression. -# -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" -nsrouter="nsrouter-$sfx" -timeout=4 - -log_netns=$(sysctl -n net.netfilter.nf_log_all_netns) - -cleanup() -{ - ip netns del ${ns1} - ip netns del ${ns2} - ip netns del ${nsrouter} - - [ $log_netns -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns -} - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ip netns add ${nsrouter} -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace" - exit $ksft_skip -fi - -trap cleanup EXIT - -dmesg | grep -q ' nft_rpfilter: ' -if [ $? 
-eq 0 ]; then - dmesg -c | grep ' nft_rpfilter: ' - echo "WARN: a previous test run has failed" 1>&2 -fi - -sysctl -q net.netfilter.nf_log_all_netns=1 -ip netns add ${ns1} -ip netns add ${ns2} - -load_ruleset() { - local netns=$1 - -ip netns exec ${netns} nft -f /dev/stdin <&2 - ip netns exec ${ns} nft list table inet filter - return 1 - fi - - if [ $want -gt 0 ]; then - echo "PASS: fib expression did drop packets for $address" - fi - - return 0 -} - -load_ruleset ${nsrouter} -load_ruleset ${ns1} -load_ruleset ${ns2} - -ip link add veth0 netns ${nsrouter} type veth peer name eth0 netns ${ns1} > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: No virtual ethernet pair device support in kernel" - exit $ksft_skip -fi -ip link add veth1 netns ${nsrouter} type veth peer name eth0 netns ${ns2} - -ip -net ${nsrouter} link set lo up -ip -net ${nsrouter} link set veth0 up -ip -net ${nsrouter} addr add 10.0.1.1/24 dev veth0 -ip -net ${nsrouter} addr add dead:1::1/64 dev veth0 - -ip -net ${nsrouter} link set veth1 up -ip -net ${nsrouter} addr add 10.0.2.1/24 dev veth1 -ip -net ${nsrouter} addr add dead:2::1/64 dev veth1 - -ip -net ${ns1} link set lo up -ip -net ${ns1} link set eth0 up - -ip -net ${ns2} link set lo up -ip -net ${ns2} link set eth0 up - -ip -net ${ns1} addr add 10.0.1.99/24 dev eth0 -ip -net ${ns1} addr add dead:1::99/64 dev eth0 -ip -net ${ns1} route add default via 10.0.1.1 -ip -net ${ns1} route add default via dead:1::1 - -ip -net ${ns2} addr add 10.0.2.99/24 dev eth0 -ip -net ${ns2} addr add dead:2::99/64 dev eth0 -ip -net ${ns2} route add default via 10.0.2.1 -ip -net ${ns2} route add default via dead:2::1 - -test_ping() { - local daddr4=$1 - local daddr6=$2 - - ip netns exec ${ns1} ping -c 1 -q $daddr4 > /dev/null - ret=$? - if [ $ret -ne 0 ];then - check_drops - echo "FAIL: ${ns1} cannot reach $daddr4, ret $ret" 1>&2 - return 1 - fi - - ip netns exec ${ns1} ping -c 3 -q $daddr6 > /dev/null - ret=$? 
- if [ $ret -ne 0 ];then - check_drops - echo "FAIL: ${ns1} cannot reach $daddr6, ret $ret" 1>&2 - return 1 - fi - - return 0 -} - -ip netns exec ${nsrouter} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.all.rp_filter=0 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.rp_filter=0 > /dev/null - -sleep 3 - -test_ping 10.0.2.1 dead:2::1 || exit 1 -check_drops || exit 1 - -test_ping 10.0.2.99 dead:2::99 || exit 1 -check_drops || exit 1 - -echo "PASS: fib expression did not cause unwanted packet drops" - -ip netns exec ${nsrouter} nft flush table inet filter - -ip -net ${ns1} route del default -ip -net ${ns1} -6 route del default - -ip -net ${ns1} addr del 10.0.1.99/24 dev eth0 -ip -net ${ns1} addr del dead:1::99/64 dev eth0 - -ip -net ${ns1} addr add 10.0.2.99/24 dev eth0 -ip -net ${ns1} addr add dead:2::99/64 dev eth0 - -ip -net ${ns1} route add default via 10.0.2.1 -ip -net ${ns1} -6 route add default via dead:2::1 - -ip -net ${nsrouter} addr add dead:2::1/64 dev veth0 - -# switch to ruleset that doesn't log, this time -# its expected that this does drop the packets. -load_ruleset_count ${nsrouter} - -# ns1 has a default route, but nsrouter does not. -# must not check return value, ping to 1.1.1.1 will -# fail. 
-check_fib_counter 0 ${nsrouter} 1.1.1.1 || exit 1 -check_fib_counter 0 ${nsrouter} 1c3::c01d || exit 1 - -ip netns exec ${ns1} ping -c 1 -W 1 -q 1.1.1.1 > /dev/null -check_fib_counter 1 ${nsrouter} 1.1.1.1 || exit 1 - -sleep 2 -ip netns exec ${ns1} ping -c 3 -q 1c3::c01d > /dev/null -check_fib_counter 3 ${nsrouter} 1c3::c01d || exit 1 - -# delete all rules -ip netns exec ${ns1} nft flush ruleset -ip netns exec ${ns2} nft flush ruleset -ip netns exec ${nsrouter} nft flush ruleset - -ip -net ${ns1} addr add 10.0.1.99/24 dev eth0 -ip -net ${ns1} addr add dead:1::99/64 dev eth0 - -ip -net ${ns1} addr del 10.0.2.99/24 dev eth0 -ip -net ${ns1} addr del dead:2::99/64 dev eth0 - -ip -net ${nsrouter} addr del dead:2::1/64 dev veth0 - -# ... pbr ruleset for the router, check iif+oif. -load_pbr_ruleset ${nsrouter} -if [ $? -ne 0 ] ; then - echo "SKIP: Could not load fib forward ruleset" - exit $ksft_skip -fi - -ip -net ${nsrouter} rule add from all table 128 -ip -net ${nsrouter} rule add from all iif veth0 table 129 -ip -net ${nsrouter} route add table 128 to 10.0.1.0/24 dev veth0 -ip -net ${nsrouter} route add table 129 to 10.0.2.0/24 dev veth1 - -# drop main ipv4 table -ip -net ${nsrouter} -4 rule delete table main - -test_ping 10.0.2.99 dead:2::99 -if [ $? -ne 0 ] ; then - ip -net ${nsrouter} nft list ruleset - echo "FAIL: fib mismatch in pbr setup" - exit 1 -fi - -echo "PASS: fib expression forward check with policy based routing" -exit 0 diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh deleted file mode 100755 index a32f490f75..0000000000 --- a/tools/testing/selftests/netfilter/nft_flowtable.sh +++ /dev/null @@ -1,672 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# This tests basic flowtable functionality. 
-# Creates following default topology: -# -# Originator (MTU 9000) <-Router1-> MTU 1500 <-Router2-> Responder (MTU 2000) -# Router1 is the one doing flow offloading, Router2 has no special -# purpose other than having a link that is smaller than either Originator -# and responder, i.e. TCPMSS announced values are too large and will still -# result in fragmentation and/or PMTU discovery. -# -# You can check with different Orgininator/Link/Responder MTU eg: -# nft_flowtable.sh -o8000 -l1500 -r2000 -# - -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" -nsr1="nsr1-$sfx" -nsr2="nsr2-$sfx" - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -nsin="" -ns1out="" -ns2out="" - -log_netns=$(sysctl -n net.netfilter.nf_log_all_netns) - -checktool (){ - if ! $1 > /dev/null 2>&1; then - echo "SKIP: Could not $2" - exit $ksft_skip - fi -} - -checktool "nft --version" "run test without nft tool" -checktool "ip -Version" "run test without ip tool" -checktool "which nc" "run test without nc (netcat)" -checktool "ip netns add $nsr1" "create net namespace $nsr1" - -ip netns add $ns1 -ip netns add $ns2 -ip netns add $nsr2 - -cleanup() { - ip netns del $ns1 - ip netns del $ns2 - ip netns del $nsr1 - ip netns del $nsr2 - - rm -f "$nsin" "$ns1out" "$ns2out" - - [ $log_netns -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns -} - -trap cleanup EXIT - -sysctl -q net.netfilter.nf_log_all_netns=1 - -ip link add veth0 netns $nsr1 type veth peer name eth0 netns $ns1 -ip link add veth1 netns $nsr1 type veth peer name veth0 netns $nsr2 - -ip link add veth1 netns $nsr2 type veth peer name eth0 netns $ns2 - -for dev in lo veth0 veth1; do - ip -net $nsr1 link set $dev up - ip -net $nsr2 link set $dev up -done - -ip -net $nsr1 addr add 10.0.1.1/24 dev veth0 -ip -net $nsr1 addr add dead:1::1/64 dev veth0 - -ip -net $nsr2 addr add 10.0.2.1/24 dev veth1 -ip -net $nsr2 addr add dead:2::1/64 dev veth1 - -# set different MTUs so we need to push packets coming 
from ns1 (large MTU) -# to ns2 (smaller MTU) to stack either to perform fragmentation (ip_no_pmtu_disc=1), -# or to do PTMU discovery (send ICMP error back to originator). -# ns2 is going via nsr2 with a smaller mtu, so that TCPMSS announced by both peers -# is NOT the lowest link mtu. - -omtu=9000 -lmtu=1500 -rmtu=2000 - -usage(){ - echo "nft_flowtable.sh [OPTIONS]" - echo - echo "MTU options" - echo " -o originator" - echo " -l link" - echo " -r responder" - exit 1 -} - -while getopts "o:l:r:" o -do - case $o in - o) omtu=$OPTARG;; - l) lmtu=$OPTARG;; - r) rmtu=$OPTARG;; - *) usage;; - esac -done - -if ! ip -net $nsr1 link set veth0 mtu $omtu; then - exit 1 -fi - -ip -net $ns1 link set eth0 mtu $omtu - -if ! ip -net $nsr2 link set veth1 mtu $rmtu; then - exit 1 -fi - -ip -net $ns2 link set eth0 mtu $rmtu - -# transfer-net between nsr1 and nsr2. -# these addresses are not used for connections. -ip -net $nsr1 addr add 192.168.10.1/24 dev veth1 -ip -net $nsr1 addr add fee1:2::1/64 dev veth1 - -ip -net $nsr2 addr add 192.168.10.2/24 dev veth0 -ip -net $nsr2 addr add fee1:2::2/64 dev veth0 - -for i in 0 1; do - ip netns exec $nsr1 sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null - ip netns exec $nsr2 sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null -done - -for ns in $ns1 $ns2;do - ip -net $ns link set lo up - ip -net $ns link set eth0 up - - if ! 
ip netns exec $ns sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null; then - echo "ERROR: Check Originator/Responder values (problem during address addition)" - exit 1 - fi - # don't set ip DF bit for first two tests - ip netns exec $ns sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null -done - -ip -net $ns1 addr add 10.0.1.99/24 dev eth0 -ip -net $ns2 addr add 10.0.2.99/24 dev eth0 -ip -net $ns1 route add default via 10.0.1.1 -ip -net $ns2 route add default via 10.0.2.1 -ip -net $ns1 addr add dead:1::99/64 dev eth0 -ip -net $ns2 addr add dead:2::99/64 dev eth0 -ip -net $ns1 route add default via dead:1::1 -ip -net $ns2 route add default via dead:2::1 - -ip -net $nsr1 route add default via 192.168.10.2 -ip -net $nsr2 route add default via 192.168.10.1 - -ip netns exec $nsr1 nft -f - < /dev/null; then - echo "ERROR: $ns1 cannot reach ns2" 1>&2 - exit 1 -fi - -if ! ip netns exec $ns2 ping -c 1 -q 10.0.1.99 > /dev/null; then - echo "ERROR: $ns2 cannot reach $ns1" 1>&2 - exit 1 -fi - -if [ $ret -eq 0 ];then - echo "PASS: netns routing/connectivity: $ns1 can reach $ns2" -fi - -nsin=$(mktemp) -ns1out=$(mktemp) -ns2out=$(mktemp) - -make_file() -{ - name=$1 - - SIZE=$((RANDOM % (1024 * 128))) - SIZE=$((SIZE + (1024 * 8))) - TSIZE=$((SIZE * 1024)) - - dd if=/dev/urandom of="$name" bs=1024 count=$SIZE 2> /dev/null - - SIZE=$((RANDOM % 1024)) - SIZE=$((SIZE + 128)) - TSIZE=$((TSIZE + SIZE)) - dd if=/dev/urandom conf=notrunc of="$name" bs=1 count=$SIZE 2> /dev/null -} - -check_counters() -{ - local what=$1 - local ok=1 - - local orig=$(ip netns exec $nsr1 nft reset counter inet filter routed_orig | grep packets) - local repl=$(ip netns exec $nsr1 nft reset counter inet filter routed_repl | grep packets) - - local orig_cnt=${orig#*bytes} - local repl_cnt=${repl#*bytes} - - local fs=$(du -sb $nsin) - local max_orig=${fs%%/*} - local max_repl=$((max_orig/4)) - - if [ $orig_cnt -gt $max_orig ];then - echo "FAIL: $what: original counter $orig_cnt exceeds expected value $max_orig" 1>&2 - 
ret=1 - ok=0 - fi - - if [ $repl_cnt -gt $max_repl ];then - echo "FAIL: $what: reply counter $repl_cnt exceeds expected value $max_repl" 1>&2 - ret=1 - ok=0 - fi - - if [ $ok -eq 1 ]; then - echo "PASS: $what" - fi -} - -check_dscp() -{ - local what=$1 - local ok=1 - - local counter=$(ip netns exec $ns2 nft reset counter inet filter ip4dscp3 | grep packets) - - local pc4=${counter%*bytes*} - local pc4=${pc4#*packets} - - local counter=$(ip netns exec $ns2 nft reset counter inet filter ip4dscp0 | grep packets) - local pc4z=${counter%*bytes*} - local pc4z=${pc4z#*packets} - - case "$what" in - "dscp_none") - if [ $pc4 -gt 0 ] || [ $pc4z -eq 0 ]; then - echo "FAIL: dscp counters do not match, expected dscp3 == 0, dscp0 > 0, but got $pc4,$pc4z" 1>&2 - ret=1 - ok=0 - fi - ;; - "dscp_fwd") - if [ $pc4 -eq 0 ] || [ $pc4z -eq 0 ]; then - echo "FAIL: dscp counters do not match, expected dscp3 and dscp0 > 0 but got $pc4,$pc4z" 1>&2 - ret=1 - ok=0 - fi - ;; - "dscp_ingress") - if [ $pc4 -eq 0 ] || [ $pc4z -gt 0 ]; then - echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2 - ret=1 - ok=0 - fi - ;; - "dscp_egress") - if [ $pc4 -eq 0 ] || [ $pc4z -gt 0 ]; then - echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2 - ret=1 - ok=0 - fi - ;; - *) - echo "FAIL: Unknown DSCP check" 1>&2 - ret=1 - ok=0 - esac - - if [ $ok -eq 1 ] ;then - echo "PASS: $what: dscp packet counters match" - fi -} - -check_transfer() -{ - in=$1 - out=$2 - what=$3 - - if ! cmp "$in" "$out" > /dev/null 2>&1; then - echo "FAIL: file mismatch for $what" 1>&2 - ls -l "$in" - ls -l "$out" - return 1 - fi - - return 0 -} - -test_tcp_forwarding_ip() -{ - local nsa=$1 - local nsb=$2 - local dstip=$3 - local dstport=$4 - local lret=0 - - ip netns exec $nsb nc -w 5 -l -p 12345 < "$nsin" > "$ns2out" & - lpid=$! - - sleep 1 - ip netns exec $nsa nc -w 4 "$dstip" "$dstport" < "$nsin" > "$ns1out" & - cpid=$! 
- - sleep 1 - - prev="$(ls -l $ns1out $ns2out)" - sleep 1 - - while [[ "$prev" != "$(ls -l $ns1out $ns2out)" ]]; do - sleep 1; - prev="$(ls -l $ns1out $ns2out)" - done - - if test -d /proc/"$lpid"/; then - kill $lpid - fi - - if test -d /proc/"$cpid"/; then - kill $cpid - fi - - wait $lpid - wait $cpid - - if ! check_transfer "$nsin" "$ns2out" "ns1 -> ns2"; then - lret=1 - fi - - if ! check_transfer "$nsin" "$ns1out" "ns1 <- ns2"; then - lret=1 - fi - - return $lret -} - -test_tcp_forwarding() -{ - test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345 - - return $? -} - -test_tcp_forwarding_set_dscp() -{ - check_dscp "dscp_none" - -ip netns exec $nsr1 nft -f - <&2 - ip netns exec $nsr1 nft list ruleset - ret=1 -fi - -# delete default route, i.e. ns2 won't be able to reach ns1 and -# will depend on ns1 being masqueraded in nsr1. -# expect ns1 has nsr1 address. -ip -net $ns2 route del default via 10.0.2.1 -ip -net $ns2 route del default via dead:2::1 -ip -net $ns2 route add 192.168.10.1 via 10.0.2.1 - -# Second test: -# Same, but with NAT enabled. Same as in first test: we expect normal forward path -# to handle most packets. -ip netns exec $nsr1 nft -f - <&2 - exit 0 -fi - -if ! test_tcp_forwarding_nat $ns1 $ns2 0 ""; then - echo "FAIL: flow offload for ns1/ns2 with NAT" 1>&2 - ip netns exec $nsr1 nft list ruleset - ret=1 -fi - -# Third test: -# Same as second test, but with PMTU discovery enabled. This -# means that we expect the fastpath to handle packets as soon -# as the endpoints adjust the packet size. -ip netns exec $ns1 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null -ip netns exec $ns2 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null - -# reset counters. -# With pmtu in-place we'll also check that nft counters -# are lower than file size and packets were forwarded via flowtable layer. -# For earlier tests (large mtus), packets cannot be handled via flowtable -# (except pure acks and other small packets). 
-ip netns exec $nsr1 nft reset counters table inet filter >/dev/null - -if ! test_tcp_forwarding_nat $ns1 $ns2 1 ""; then - echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2 - ip netns exec $nsr1 nft list ruleset -fi - -# Another test: -# Add bridge interface br0 to Router1, with NAT enabled. -ip -net $nsr1 link add name br0 type bridge -ip -net $nsr1 addr flush dev veth0 -ip -net $nsr1 link set up dev veth0 -ip -net $nsr1 link set veth0 master br0 -ip -net $nsr1 addr add 10.0.1.1/24 dev br0 -ip -net $nsr1 addr add dead:1::1/64 dev br0 -ip -net $nsr1 link set up dev br0 - -ip netns exec $nsr1 sysctl net.ipv4.conf.br0.forwarding=1 > /dev/null - -# br0 with NAT enabled. -ip netns exec $nsr1 nft -f - <&2 - ip netns exec $nsr1 nft list ruleset - ret=1 -fi - - -# Another test: -# Add bridge interface br0 to Router1, with NAT and VLAN. -ip -net $nsr1 link set veth0 nomaster -ip -net $nsr1 link set down dev veth0 -ip -net $nsr1 link add link veth0 name veth0.10 type vlan id 10 -ip -net $nsr1 link set up dev veth0 -ip -net $nsr1 link set up dev veth0.10 -ip -net $nsr1 link set veth0.10 master br0 - -ip -net $ns1 addr flush dev eth0 -ip -net $ns1 link add link eth0 name eth0.10 type vlan id 10 -ip -net $ns1 link set eth0 up -ip -net $ns1 link set eth0.10 up -ip -net $ns1 addr add 10.0.1.99/24 dev eth0.10 -ip -net $ns1 route add default via 10.0.1.1 -ip -net $ns1 addr add dead:1::99/64 dev eth0.10 - -if ! 
test_tcp_forwarding_nat $ns1 $ns2 1 "bridge and VLAN"; then - echo "FAIL: flow offload for ns1/ns2 with bridge NAT and VLAN" 1>&2 - ip netns exec $nsr1 nft list ruleset - ret=1 -fi - -# restore test topology (remove bridge and VLAN) -ip -net $nsr1 link set veth0 nomaster -ip -net $nsr1 link set veth0 down -ip -net $nsr1 link set veth0.10 down -ip -net $nsr1 link delete veth0.10 type vlan -ip -net $nsr1 link delete br0 type bridge -ip -net $ns1 addr flush dev eth0.10 -ip -net $ns1 link set eth0.10 down -ip -net $ns1 link set eth0 down -ip -net $ns1 link delete eth0.10 type vlan - -# restore address in ns1 and nsr1 -ip -net $ns1 link set eth0 up -ip -net $ns1 addr add 10.0.1.99/24 dev eth0 -ip -net $ns1 route add default via 10.0.1.1 -ip -net $ns1 addr add dead:1::99/64 dev eth0 -ip -net $ns1 route add default via dead:1::1 -ip -net $nsr1 addr add 10.0.1.1/24 dev veth0 -ip -net $nsr1 addr add dead:1::1/64 dev veth0 -ip -net $nsr1 link set up dev veth0 - -KEY_SHA="0x"$(ps -af | sha1sum | cut -d " " -f 1) -KEY_AES="0x"$(ps -af | md5sum | cut -d " " -f 1) -SPI1=$RANDOM -SPI2=$RANDOM - -if [ $SPI1 -eq $SPI2 ]; then - SPI2=$((SPI2+1)) -fi - -do_esp() { - local ns=$1 - local me=$2 - local remote=$3 - local lnet=$4 - local rnet=$5 - local spi_out=$6 - local spi_in=$7 - - ip -net $ns xfrm state add src $remote dst $me proto esp spi $spi_in enc aes $KEY_AES auth sha1 $KEY_SHA mode tunnel sel src $rnet dst $lnet - ip -net $ns xfrm state add src $me dst $remote proto esp spi $spi_out enc aes $KEY_AES auth sha1 $KEY_SHA mode tunnel sel src $lnet dst $rnet - - # to encrypt packets as they go out (includes forwarded packets that need encapsulation) - ip -net $ns xfrm policy add src $lnet dst $rnet dir out tmpl src $me dst $remote proto esp mode tunnel priority 1 action allow - # to fwd decrypted packets after esp processing: - ip -net $ns xfrm policy add src $rnet dst $lnet dir fwd tmpl src $remote dst $me proto esp mode tunnel priority 1 action allow - -} - -do_esp $nsr1 
192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 $SPI1 $SPI2 - -do_esp $nsr2 192.168.10.2 192.168.10.1 10.0.2.0/24 10.0.1.0/24 $SPI2 $SPI1 - -ip netns exec $nsr1 nft delete table ip nat - -# restore default routes -ip -net $ns2 route del 192.168.10.1 via 10.0.2.1 -ip -net $ns2 route add default via 10.0.2.1 -ip -net $ns2 route add default via dead:2::1 - -if test_tcp_forwarding $ns1 $ns2; then - check_counters "ipsec tunnel mode for ns1/ns2" -else - echo "FAIL: ipsec tunnel mode for ns1/ns2" - ip netns exec $nsr1 nft list ruleset 1>&2 - ip netns exec $nsr1 cat /proc/net/xfrm_stat 1>&2 -fi - -exit $ret diff --git a/tools/testing/selftests/netfilter/nft_meta.sh b/tools/testing/selftests/netfilter/nft_meta.sh deleted file mode 100755 index f33154c04d..0000000000 --- a/tools/testing/selftests/netfilter/nft_meta.sh +++ /dev/null @@ -1,142 +0,0 @@ -#!/bin/bash - -# check iif/iifname/oifgroup/iiftype match. - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -sfx=$(mktemp -u "XXXXXXXX") -ns0="ns0-$sfx" - -if ! 
nft --version > /dev/null 2>&1; then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -cleanup() -{ - ip netns del "$ns0" -} - -ip netns add "$ns0" -ip -net "$ns0" link set lo up -ip -net "$ns0" addr add 127.0.0.1 dev lo - -trap cleanup EXIT - -currentyear=$(date +%Y) -lastyear=$((currentyear-1)) -ip netns exec "$ns0" nft -f /dev/stdin < /dev/null - -check_lo_counters "2" true - -check_one_counter oskuidcounter "1" true -check_one_counter oskgidcounter "1" true -check_one_counter imarkcounter "1" true -check_one_counter omarkcounter "1" true -check_one_counter ilastyearcounter "0" true - -if [ $ret -eq 0 ];then - echo "OK: nftables meta iif/oif counters at expected values" -else - exit $ret -fi - -#First CPU execution and counter -taskset -p 01 $$ > /dev/null -ip netns exec "$ns0" nft reset counters > /dev/null -ip netns exec "$ns0" ping -q -c 1 127.0.0.1 > /dev/null -check_one_counter icpu0counter "2" true - -if [ $ret -eq 0 ];then - echo "OK: nftables meta cpu counter at expected values" -fi - -exit $ret diff --git a/tools/testing/selftests/netfilter/nft_nat.sh b/tools/testing/selftests/netfilter/nft_nat.sh deleted file mode 100755 index dd40d9f6f2..0000000000 --- a/tools/testing/selftests/netfilter/nft_nat.sh +++ /dev/null @@ -1,1224 +0,0 @@ -#!/bin/bash -# -# This test is for basic NAT functionality: snat, dnat, redirect, masquerade. -# - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 -test_inet_nat=true - -sfx=$(mktemp -u "XXXXXXXX") -ns0="ns0-$sfx" -ns1="ns1-$sfx" -ns2="ns2-$sfx" - -cleanup() -{ - for i in 0 1 2; do ip netns del ns$i-"$sfx";done -} - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ip netns add "$ns0" -if [ $? 
-ne 0 ];then - echo "SKIP: Could not create net namespace $ns0" - exit $ksft_skip -fi - -trap cleanup EXIT - -ip netns add "$ns1" -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace $ns1" - exit $ksft_skip -fi - -ip netns add "$ns2" -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace $ns2" - exit $ksft_skip -fi - -ip link add veth0 netns "$ns0" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: No virtual ethernet pair device support in kernel" - exit $ksft_skip -fi -ip link add veth1 netns "$ns0" type veth peer name eth0 netns "$ns2" - -ip -net "$ns0" link set lo up -ip -net "$ns0" link set veth0 up -ip -net "$ns0" addr add 10.0.1.1/24 dev veth0 -ip -net "$ns0" addr add dead:1::1/64 dev veth0 - -ip -net "$ns0" link set veth1 up -ip -net "$ns0" addr add 10.0.2.1/24 dev veth1 -ip -net "$ns0" addr add dead:2::1/64 dev veth1 - -for i in 1 2; do - ip -net ns$i-$sfx link set lo up - ip -net ns$i-$sfx link set eth0 up - ip -net ns$i-$sfx addr add 10.0.$i.99/24 dev eth0 - ip -net ns$i-$sfx route add default via 10.0.$i.1 - ip -net ns$i-$sfx addr add dead:$i::99/64 dev eth0 - ip -net ns$i-$sfx route add default via dead:$i::1 -done - -bad_counter() -{ - local ns=$1 - local counter=$2 - local expect=$3 - local tag=$4 - - echo "ERROR: $counter counter in $ns has unexpected value (expected $expect) at $tag" 1>&2 - ip netns exec $ns nft list counter inet filter $counter 1>&2 -} - -check_counters() -{ - ns=$1 - local lret=0 - - cnt=$(ip netns exec $ns nft list counter inet filter ns0in | grep -q "packets 1 bytes 84") - if [ $? -ne 0 ]; then - bad_counter $ns ns0in "packets 1 bytes 84" "check_counters 1" - lret=1 - fi - cnt=$(ip netns exec $ns nft list counter inet filter ns0out | grep -q "packets 1 bytes 84") - if [ $? 
-ne 0 ]; then - bad_counter $ns ns0out "packets 1 bytes 84" "check_counters 2" - lret=1 - fi - - expect="packets 1 bytes 104" - cnt=$(ip netns exec $ns nft list counter inet filter ns0in6 | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter $ns ns0in6 "$expect" "check_counters 3" - lret=1 - fi - cnt=$(ip netns exec $ns nft list counter inet filter ns0out6 | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter $ns ns0out6 "$expect" "check_counters 4" - lret=1 - fi - - return $lret -} - -check_ns0_counters() -{ - local ns=$1 - local lret=0 - - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0in | grep -q "packets 0 bytes 0") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns0in "packets 0 bytes 0" "check_ns0_counters 1" - lret=1 - fi - - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0in6 | grep -q "packets 0 bytes 0") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns0in6 "packets 0 bytes 0" - lret=1 - fi - - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0out | grep -q "packets 0 bytes 0") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns0out "packets 0 bytes 0" "check_ns0_counters 2" - lret=1 - fi - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0out6 | grep -q "packets 0 bytes 0") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns0out6 "packets 0 bytes 0" "check_ns0_counters3 " - lret=1 - fi - - for dir in "in" "out" ; do - expect="packets 1 bytes 84" - cnt=$(ip netns exec "$ns0" nft list counter inet filter ${ns}${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" $ns$dir "$expect" "check_ns0_counters 4" - lret=1 - fi - - expect="packets 1 bytes 104" - cnt=$(ip netns exec "$ns0" nft list counter inet filter ${ns}${dir}6 | grep -q "$expect") - if [ $? 
-ne 0 ]; then - bad_counter "$ns0" $ns$dir6 "$expect" "check_ns0_counters 5" - lret=1 - fi - done - - return $lret -} - -reset_counters() -{ - for i in 0 1 2;do - ip netns exec ns$i-$sfx nft reset counters inet > /dev/null - done -} - -test_local_dnat6() -{ - local family=$1 - local lret=0 - local IPF="" - - if [ $family = "inet" ];then - IPF="ip6" - fi - -ip netns exec "$ns0" nft -f /dev/stdin < /dev/null - if [ $? -ne 0 ]; then - lret=1 - echo "ERROR: ping6 failed" - return $lret - fi - - expect="packets 0 bytes 0" - for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns1$dir "$expect" "test_local_dnat6 1" - lret=1 - fi - done - - expect="packets 1 bytes 104" - for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns2$dir "$expect" "test_local_dnat6 2" - lret=1 - fi - done - - # expect 0 count in ns1 - expect="packets 0 bytes 0" - for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_local_dnat6 3" - lret=1 - fi - done - - # expect 1 packet in ns2 - expect="packets 1 bytes 104" - for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? 
-ne 0 ]; then - bad_counter "$ns2" ns0$dir "$expect" "test_local_dnat6 4" - lret=1 - fi - done - - test $lret -eq 0 && echo "PASS: ipv6 ping to $ns1 was $family NATted to $ns2" - ip netns exec "$ns0" nft flush chain ip6 nat output - - return $lret -} - -test_local_dnat() -{ - local family=$1 - local lret=0 - local IPF="" - - if [ $family = "inet" ];then - IPF="ip" - fi - -ip netns exec "$ns0" nft -f /dev/stdin </dev/null -table $family nat { - chain output { - type nat hook output priority 0; policy accept; - ip daddr 10.0.1.99 dnat $IPF to 10.0.2.99 - } -} -EOF - if [ $? -ne 0 ]; then - if [ $family = "inet" ];then - echo "SKIP: inet nat tests" - test_inet_nat=false - return $ksft_skip - fi - echo "SKIP: Could not add add $family dnat hook" - return $ksft_skip - fi - - # ping netns1, expect rewrite to netns2 - ip netns exec "$ns0" ping -q -c 1 10.0.1.99 > /dev/null - if [ $? -ne 0 ]; then - lret=1 - echo "ERROR: ping failed" - return $lret - fi - - expect="packets 0 bytes 0" - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns1$dir "$expect" "test_local_dnat 1" - lret=1 - fi - done - - expect="packets 1 bytes 84" - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns2$dir "$expect" "test_local_dnat 2" - lret=1 - fi - done - - # expect 0 count in ns1 - expect="packets 0 bytes 0" - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_local_dnat 3" - lret=1 - fi - done - - # expect 1 packet in ns2 - expect="packets 1 bytes 84" - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? 
-ne 0 ]; then - bad_counter "$ns2" ns0$dir "$expect" "test_local_dnat 4" - lret=1 - fi - done - - test $lret -eq 0 && echo "PASS: ping to $ns1 was $family NATted to $ns2" - - ip netns exec "$ns0" nft flush chain $family nat output - - reset_counters - ip netns exec "$ns0" ping -q -c 1 10.0.1.99 > /dev/null - if [ $? -ne 0 ]; then - lret=1 - echo "ERROR: ping failed" - return $lret - fi - - expect="packets 1 bytes 84" - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns1$dir "$expect" "test_local_dnat 5" - lret=1 - fi - done - expect="packets 0 bytes 0" - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns2$dir "$expect" "test_local_dnat 6" - lret=1 - fi - done - - # expect 1 count in ns1 - expect="packets 1 bytes 84" - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns0$dir "$expect" "test_local_dnat 7" - lret=1 - fi - done - - # expect 0 packet in ns2 - expect="packets 0 bytes 0" - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns2" ns0$dir "$expect" "test_local_dnat 8" - lret=1 - fi - done - - test $lret -eq 0 && echo "PASS: ping to $ns1 OK after $family nat output chain flush" - - return $lret -} - -test_local_dnat_portonly() -{ - local family=$1 - local daddr=$2 - local lret=0 - local sr_s - local sr_r - -ip netns exec "$ns0" nft -f /dev/stdin < /dev/null - - ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1 - if [ $? 
-ne 0 ] ; then - echo "ERROR: cannot ping $ns1 from $ns2 via ipv6" - return 1 - lret=1 - fi - - expect="packets 1 bytes 104" - for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns2$dir "$expect" "test_masquerade6 1" - lret=1 - fi - - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns2" ns1$dir "$expect" "test_masquerade6 2" - lret=1 - fi - done - - reset_counters - -# add masquerading rule -ip netns exec "$ns0" nft -f /dev/stdin < /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then - echo "ERROR: cannot ping $ns1 from $ns2 with active $family masquerade $natflags" - lret=1 - fi - - # ns1 should have seen packets from ns0, due to masquerade - expect="packets 1 bytes 104" - for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_masquerade6 3" - lret=1 - fi - - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns2" ns1$dir "$expect" "test_masquerade6 4" - lret=1 - fi - done - - # ns1 should not have seen packets from ns2, due to masquerade - expect="packets 0 bytes 0" - for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_masquerade6 5" - lret=1 - fi - - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns1$dir "$expect" "test_masquerade6 6" - lret=1 - fi - done - - ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1 - if [ $? 
-ne 0 ] ; then - echo "ERROR: cannot ping $ns1 from $ns2 with active ipv6 masquerade $natflags (attempt 2)" - lret=1 - fi - - ip netns exec "$ns0" nft flush chain $family nat postrouting - if [ $? -ne 0 ]; then - echo "ERROR: Could not flush $family nat postrouting" 1>&2 - lret=1 - fi - - test $lret -eq 0 && echo "PASS: $family IPv6 masquerade $natflags for $ns2" - - return $lret -} - -test_masquerade() -{ - local family=$1 - local natflags=$2 - local lret=0 - - ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null - ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null - - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then - echo "ERROR: cannot ping $ns1 from "$ns2" $natflags" - lret=1 - fi - - expect="packets 1 bytes 84" - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns2$dir "$expect" "test_masquerade 1" - lret=1 - fi - - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns2" ns1$dir "$expect" "test_masquerade 2" - lret=1 - fi - done - - reset_counters - -# add masquerading rule -ip netns exec "$ns0" nft -f /dev/stdin < /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then - echo "ERROR: cannot ping $ns1 from $ns2 with active $family masquerade $natflags" - lret=1 - fi - - # ns1 should have seen packets from ns0, due to masquerade - expect="packets 1 bytes 84" - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_masquerade 3" - lret=1 - fi - - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? 
-ne 0 ]; then - bad_counter "$ns2" ns1$dir "$expect" "test_masquerade 4" - lret=1 - fi - done - - # ns1 should not have seen packets from ns2, due to masquerade - expect="packets 0 bytes 0" - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_masquerade 5" - lret=1 - fi - - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns1$dir "$expect" "test_masquerade 6" - lret=1 - fi - done - - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then - echo "ERROR: cannot ping $ns1 from $ns2 with active ip masquerade $natflags (attempt 2)" - lret=1 - fi - - ip netns exec "$ns0" nft flush chain $family nat postrouting - if [ $? -ne 0 ]; then - echo "ERROR: Could not flush $family nat postrouting" 1>&2 - lret=1 - fi - - test $lret -eq 0 && echo "PASS: $family IP masquerade $natflags for $ns2" - - return $lret -} - -test_redirect6() -{ - local family=$1 - local lret=0 - - ip netns exec "$ns0" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null - - ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then - echo "ERROR: cannnot ping $ns1 from $ns2 via ipv6" - lret=1 - fi - - expect="packets 1 bytes 104" - for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns2$dir "$expect" "test_redirect6 1" - lret=1 - fi - - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns2" ns1$dir "$expect" "test_redirect6 2" - lret=1 - fi - done - - reset_counters - -# add redirect rule -ip netns exec "$ns0" nft -f /dev/stdin < /dev/null # ping ns2->ns1 - if [ $? 
-ne 0 ] ; then - echo "ERROR: cannot ping $ns1 from $ns2 via ipv6 with active $family redirect" - lret=1 - fi - - # ns1 should have seen no packets from ns2, due to redirection - expect="packets 0 bytes 0" - for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_redirect6 3" - lret=1 - fi - done - - # ns0 should have seen packets from ns2, due to masquerade - expect="packets 1 bytes 104" - for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_redirect6 4" - lret=1 - fi - done - - ip netns exec "$ns0" nft delete table $family nat - if [ $? -ne 0 ]; then - echo "ERROR: Could not delete $family nat table" 1>&2 - lret=1 - fi - - test $lret -eq 0 && echo "PASS: $family IPv6 redirection for $ns2" - - return $lret -} - -test_redirect() -{ - local family=$1 - local lret=0 - - ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null - ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null - - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then - echo "ERROR: cannot ping $ns1 from $ns2" - lret=1 - fi - - expect="packets 1 bytes 84" - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" $ns2$dir "$expect" "test_redirect 1" - lret=1 - fi - - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns2" ns1$dir "$expect" "test_redirect 2" - lret=1 - fi - done - - reset_counters - -# add redirect rule -ip netns exec "$ns0" nft -f /dev/stdin < /dev/null # ping ns2->ns1 - if [ $? 
-ne 0 ] ; then - echo "ERROR: cannot ping $ns1 from $ns2 with active $family ip redirect" - lret=1 - fi - - # ns1 should have seen no packets from ns2, due to redirection - expect="packets 0 bytes 0" - for dir in "in" "out" ; do - - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_redirect 3" - lret=1 - fi - done - - # ns0 should have seen packets from ns2, due to masquerade - expect="packets 1 bytes 84" - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns0$dir "$expect" "test_redirect 4" - lret=1 - fi - done - - ip netns exec "$ns0" nft delete table $family nat - if [ $? -ne 0 ]; then - echo "ERROR: Could not delete $family nat table" 1>&2 - lret=1 - fi - - test $lret -eq 0 && echo "PASS: $family IP redirection for $ns2" - - return $lret -} - -# test port shadowing. -# create two listening services, one on router (ns0), one -# on client (ns2), which is masqueraded from ns1 point of view. -# ns2 sends udp packet coming from service port to ns1, on a highport. -# Later, if n1 uses same highport to connect to ns0:service, packet -# might be port-forwarded to ns2 instead. - -# second argument tells if we expect the 'fake-entry' to take effect -# (CLIENT) or not (ROUTER). -test_port_shadow() -{ - local test=$1 - local expect=$2 - local daddrc="10.0.1.99" - local daddrs="10.0.1.1" - local result="" - local logmsg="" - - # make shadow entry, from client (ns2), going to (ns1), port 41404, sport 1405. - echo "fake-entry" | ip netns exec "$ns2" timeout 1 socat -u STDIN UDP:"$daddrc":41404,sourceport=1405 - - echo ROUTER | ip netns exec "$ns0" timeout 5 socat -u STDIN UDP4-LISTEN:1405 & - sc_r=$! - - echo CLIENT | ip netns exec "$ns2" timeout 5 socat -u STDIN UDP4-LISTEN:1405,reuseport & - sc_c=$! - - sleep 0.3 - - # ns1 tries to connect to ns0:1405. 
With default settings this should connect - # to client, it matches the conntrack entry created above. - - result=$(echo "data" | ip netns exec "$ns1" timeout 1 socat - UDP:"$daddrs":1405,sourceport=41404) - - if [ "$result" = "$expect" ] ;then - echo "PASS: portshadow test $test: got reply from ${expect}${logmsg}" - else - echo "ERROR: portshadow test $test: got reply from \"$result\", not $expect as intended" - ret=1 - fi - - kill $sc_r $sc_c 2>/dev/null - - # flush udp entries for next test round, if any - ip netns exec "$ns0" conntrack -F >/dev/null 2>&1 -} - -# This prevents port shadow of router service via packet filter, -# packets claiming to originate from service port from internal -# network are dropped. -test_port_shadow_filter() -{ - local family=$1 - -ip netns exec "$ns0" nft -f /dev/stdin </dev/null 2>&1 - if [ $? -ne 0 ];then - echo "SKIP: Could not run nat port shadowing test without conntrack tool" - return - fi - - socat -h > /dev/null 2>&1 - if [ $? -ne 0 ];then - echo "SKIP: Could not run nat port shadowing test without socat tool" - return - fi - - ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null - ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null - - ip netns exec "$ns0" nft -f /dev/stdin < /dev/null - ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null - - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then - echo "ERROR: cannot ping $ns1 from $ns2 before loading stateless rules" - return 1 - fi - -ip netns exec "$ns0" nft -f /dev/stdin < /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then - echo "ERROR: cannot ping $ns1 from $ns2 with stateless rules" - lret=1 - fi - - # ns1 should have seen packets from .2.2, due to stateless rewrite. - expect="packets 1 bytes 84" - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect") - if [ $? 
-ne 0 ]; then - bad_counter "$ns1" ns0insl "$expect" "test_stateless 1" - lret=1 - fi - - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns2" ns1$dir "$expect" "test_stateless 2" - lret=1 - fi - done - - # ns1 should not have seen packets from ns2, due to masquerade - expect="packets 0 bytes 0" - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_stateless 3" - lret=1 - fi - - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns1$dir "$expect" "test_stateless 4" - lret=1 - fi - done - - reset_counters - - socat -h > /dev/null 2>&1 - if [ $? -ne 0 ];then - echo "SKIP: Could not run stateless nat frag test without socat tool" - if [ $lret -eq 0 ]; then - return $ksft_skip - fi - - ip netns exec "$ns0" nft delete table ip stateless - return $lret - fi - - local tmpfile=$(mktemp) - dd if=/dev/urandom of=$tmpfile bs=4096 count=1 2>/dev/null - - local outfile=$(mktemp) - ip netns exec "$ns1" timeout 3 socat -u UDP4-RECV:4233 OPEN:$outfile < /dev/null & - sc_r=$! - - sleep 1 - # re-do with large ping -> ip fragmentation - ip netns exec "$ns2" timeout 3 socat - UDP4-SENDTO:"10.0.1.99:4233" < "$tmpfile" > /dev/null - if [ $? -ne 0 ] ; then - echo "ERROR: failed to test udp $ns1 to $ns2 with stateless ip nat" 1>&2 - lret=1 - fi - - wait - - cmp "$tmpfile" "$outfile" - if [ $? -ne 0 ]; then - ls -l "$tmpfile" "$outfile" - echo "ERROR: in and output file mismatch when checking udp with stateless nat" 1>&2 - lret=1 - fi - - rm -f "$tmpfile" "$outfile" - - # ns1 should have seen packets from 2.2, due to stateless rewrite. - expect="packets 3 bytes 4164" - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect") - if [ $? 
-ne 0 ]; then - bad_counter "$ns1" ns0insl "$expect" "test_stateless 5" - lret=1 - fi - - ip netns exec "$ns0" nft delete table ip stateless - if [ $? -ne 0 ]; then - echo "ERROR: Could not delete table ip stateless" 1>&2 - lret=1 - fi - - test $lret -eq 0 && echo "PASS: IP statless for $ns2" - - return $lret -} - -# ip netns exec "$ns0" ping -c 1 -q 10.0.$i.99 -for i in 0 1 2; do -ip netns exec ns$i-$sfx nft -f /dev/stdin < /dev/null - if [ $? -ne 0 ];then - echo "ERROR: Could not reach other namespace(s)" 1>&2 - ret=1 - fi - - ip netns exec "$ns0" ping -c 1 -q dead:$i::99 > /dev/null - if [ $? -ne 0 ];then - echo "ERROR: Could not reach other namespace(s) via ipv6" 1>&2 - ret=1 - fi - check_counters ns$i-$sfx - if [ $? -ne 0 ]; then - ret=1 - fi - - check_ns0_counters ns$i - if [ $? -ne 0 ]; then - ret=1 - fi - reset_counters -done - -if [ $ret -eq 0 ];then - echo "PASS: netns routing/connectivity: $ns0 can reach $ns1 and $ns2" -fi - -reset_counters -test_local_dnat ip -test_local_dnat6 ip6 - -reset_counters -test_local_dnat_portonly inet 10.0.1.99 - -reset_counters -$test_inet_nat && test_local_dnat inet -$test_inet_nat && test_local_dnat6 inet - -for flags in "" "fully-random"; do -reset_counters -test_masquerade ip $flags -test_masquerade6 ip6 $flags -reset_counters -$test_inet_nat && test_masquerade inet $flags -$test_inet_nat && test_masquerade6 inet $flags -done - -reset_counters -test_redirect ip -test_redirect6 ip6 -reset_counters -$test_inet_nat && test_redirect inet -$test_inet_nat && test_redirect6 inet - -test_port_shadowing -test_stateless_nat_ip - -if [ $ret -ne 0 ];then - echo -n "FAIL: " - nft --version -fi - -exit $ret diff --git a/tools/testing/selftests/netfilter/nft_nat_zones.sh b/tools/testing/selftests/netfilter/nft_nat_zones.sh deleted file mode 100755 index b9ab37380f..0000000000 --- a/tools/testing/selftests/netfilter/nft_nat_zones.sh +++ /dev/null @@ -1,309 +0,0 @@ -#!/bin/bash -# -# Test connection tracking zone and NAT source port 
reallocation support. -# - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -# Don't increase too much, 2000 clients should work -# just fine but script can then take several minutes with -# KASAN/debug builds. -maxclients=100 - -have_iperf=1 -ret=0 - -# client1---. -# veth1-. -# | -# NAT Gateway --veth0--> Server -# | | -# veth2-' | -# client2---' | -# .... | -# clientX----vethX---' - -# All clients share identical IP address. -# NAT Gateway uses policy routing and conntrack zones to isolate client -# namespaces. Each client connects to Server, each with colliding tuples: -# clientsaddr:10000 -> serveraddr:dport -# NAT Gateway is supposed to do port reallocation for each of the -# connections. - -sfx=$(mktemp -u "XXXXXXXX") -gw="ns-gw-$sfx" -cl1="ns-cl1-$sfx" -cl2="ns-cl2-$sfx" -srv="ns-srv-$sfx" - -v4gc1=$(sysctl -n net.ipv4.neigh.default.gc_thresh1 2>/dev/null) -v4gc2=$(sysctl -n net.ipv4.neigh.default.gc_thresh2 2>/dev/null) -v4gc3=$(sysctl -n net.ipv4.neigh.default.gc_thresh3 2>/dev/null) -v6gc1=$(sysctl -n net.ipv6.neigh.default.gc_thresh1 2>/dev/null) -v6gc2=$(sysctl -n net.ipv6.neigh.default.gc_thresh2 2>/dev/null) -v6gc3=$(sysctl -n net.ipv6.neigh.default.gc_thresh3 2>/dev/null) - -cleanup() -{ - ip netns del $gw - ip netns del $srv - for i in $(seq 1 $maxclients); do - ip netns del ns-cl$i-$sfx 2>/dev/null - done - - sysctl -q net.ipv4.neigh.default.gc_thresh1=$v4gc1 2>/dev/null - sysctl -q net.ipv4.neigh.default.gc_thresh2=$v4gc2 2>/dev/null - sysctl -q net.ipv4.neigh.default.gc_thresh3=$v4gc3 2>/dev/null - sysctl -q net.ipv6.neigh.default.gc_thresh1=$v6gc1 2>/dev/null - sysctl -q net.ipv6.neigh.default.gc_thresh2=$v6gc2 2>/dev/null - sysctl -q net.ipv6.neigh.default.gc_thresh3=$v6gc3 2>/dev/null -} - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? 
-ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -conntrack -V > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without conntrack tool" - exit $ksft_skip -fi - -iperf3 -v >/dev/null 2>&1 -if [ $? -ne 0 ];then - have_iperf=0 -fi - -ip netns add "$gw" -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace $gw" - exit $ksft_skip -fi -ip -net "$gw" link set lo up - -trap cleanup EXIT - -ip netns add "$srv" -if [ $? -ne 0 ];then - echo "SKIP: Could not create server netns $srv" - exit $ksft_skip -fi - -ip link add veth0 netns "$gw" type veth peer name eth0 netns "$srv" -ip -net "$gw" link set veth0 up -ip -net "$srv" link set lo up -ip -net "$srv" link set eth0 up - -sysctl -q net.ipv6.neigh.default.gc_thresh1=512 2>/dev/null -sysctl -q net.ipv6.neigh.default.gc_thresh2=1024 2>/dev/null -sysctl -q net.ipv6.neigh.default.gc_thresh3=4096 2>/dev/null -sysctl -q net.ipv4.neigh.default.gc_thresh1=512 2>/dev/null -sysctl -q net.ipv4.neigh.default.gc_thresh2=1024 2>/dev/null -sysctl -q net.ipv4.neigh.default.gc_thresh3=4096 2>/dev/null - -for i in $(seq 1 $maxclients);do - cl="ns-cl$i-$sfx" - - ip netns add "$cl" - if [ $? -ne 0 ];then - echo "SKIP: Could not create client netns $cl" - exit $ksft_skip - fi - ip link add veth$i netns "$gw" type veth peer name eth0 netns "$cl" > /dev/null 2>&1 - if [ $? -ne 0 ];then - echo "SKIP: No virtual ethernet pair device support in kernel" - exit $ksft_skip - fi -done - -for i in $(seq 1 $maxclients);do - cl="ns-cl$i-$sfx" - echo netns exec "$cl" ip link set lo up - echo netns exec "$cl" ip link set eth0 up - echo netns exec "$cl" sysctl -q net.ipv4.tcp_syn_retries=2 - echo netns exec "$gw" ip link set veth$i up - echo netns exec "$gw" sysctl -q net.ipv4.conf.veth$i.arp_ignore=2 - echo netns exec "$gw" sysctl -q net.ipv4.conf.veth$i.rp_filter=0 - - # clients have same IP addresses. 
- echo netns exec "$cl" ip addr add 10.1.0.3/24 dev eth0 - echo netns exec "$cl" ip addr add dead:1::3/64 dev eth0 - echo netns exec "$cl" ip route add default via 10.1.0.2 dev eth0 - echo netns exec "$cl" ip route add default via dead:1::2 dev eth0 - - # NB: same addresses on client-facing interfaces. - echo netns exec "$gw" ip addr add 10.1.0.2/24 dev veth$i - echo netns exec "$gw" ip addr add dead:1::2/64 dev veth$i - - # gw: policy routing - echo netns exec "$gw" ip route add 10.1.0.0/24 dev veth$i table $((1000+i)) - echo netns exec "$gw" ip route add dead:1::0/64 dev veth$i table $((1000+i)) - echo netns exec "$gw" ip route add 10.3.0.0/24 dev veth0 table $((1000+i)) - echo netns exec "$gw" ip route add dead:3::0/64 dev veth0 table $((1000+i)) - echo netns exec "$gw" ip rule add fwmark $i lookup $((1000+i)) -done | ip -batch /dev/stdin - -ip -net "$gw" addr add 10.3.0.1/24 dev veth0 -ip -net "$gw" addr add dead:3::1/64 dev veth0 - -ip -net "$srv" addr add 10.3.0.99/24 dev eth0 -ip -net "$srv" addr add dead:3::99/64 dev eth0 - -ip netns exec $gw nft -f /dev/stdin< /dev/null -ip netns exec "$gw" sysctl -q net.ipv6.conf.all.forwarding=1 > /dev/null -ip netns exec "$gw" sysctl -q net.ipv4.conf.all.rp_filter=0 >/dev/null - -# useful for debugging: allows to use 'ping' from clients to gateway. -ip netns exec "$gw" sysctl -q net.ipv4.fwmark_reflect=1 > /dev/null -ip netns exec "$gw" sysctl -q net.ipv6.fwmark_reflect=1 > /dev/null - -for i in $(seq 1 $maxclients); do - cl="ns-cl$i-$sfx" - ip netns exec $cl ping -i 0.5 -q -c 3 10.3.0.99 > /dev/null 2>&1 & - if [ $? -ne 0 ]; then - echo FAIL: Ping failure from $cl 1>&2 - ret=1 - break - fi -done - -wait - -for i in $(seq 1 $maxclients); do - ip netns exec $gw nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" | grep -q "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 counter packets 3 bytes 252 }" - if [ $? 
-ne 0 ];then - ret=1 - echo "FAIL: counter icmp mismatch for veth$i" 1>&2 - ip netns exec $gw nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" 1>&2 - break - fi -done - -ip netns exec $gw nft get element inet raw inicmp "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 }" | grep -q "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * $maxclients)) bytes $((252 * $maxclients)) }" -if [ $? -ne 0 ];then - ret=1 - echo "FAIL: counter icmp mismatch for veth0: { 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * $maxclients)) bytes $((252 * $maxclients)) }" - ip netns exec $gw nft get element inet raw inicmp "{ 10.3.99 . \"veth0\" . 10.3.0.1 }" 1>&2 -fi - -if [ $ret -eq 0 ]; then - echo "PASS: ping test from all $maxclients namespaces" -fi - -if [ $have_iperf -eq 0 ];then - echo "SKIP: iperf3 not installed" - if [ $ret -ne 0 ];then - exit $ret - fi - exit $ksft_skip -fi - -ip netns exec $srv iperf3 -s > /dev/null 2>&1 & -iperfpid=$! -sleep 1 - -for i in $(seq 1 $maxclients); do - if [ $ret -ne 0 ]; then - break - fi - cl="ns-cl$i-$sfx" - ip netns exec $cl iperf3 -c 10.3.0.99 --cport 10000 -n 1 > /dev/null - if [ $? -ne 0 ]; then - echo FAIL: Failure to connect for $cl 1>&2 - ip netns exec $gw conntrack -S 1>&2 - ret=1 - fi -done -if [ $ret -eq 0 ];then - echo "PASS: iperf3 connections for all $maxclients net namespaces" -fi - -kill $iperfpid -wait - -for i in $(seq 1 $maxclients); do - ip netns exec $gw nft get element inet raw inflows "{ 10.1.0.3 . 10000 . \"veth$i\" . 10.3.0.99 . 5201 }" > /dev/null - if [ $? -ne 0 ];then - ret=1 - echo "FAIL: can't find expected tcp entry for veth$i" 1>&2 - break - fi -done -if [ $ret -eq 0 ];then - echo "PASS: Found client connection for all $maxclients net namespaces" -fi - -ip netns exec $gw nft get element inet raw inflows "{ 10.3.0.99 . 5201 . \"veth0\" . 10.3.0.1 . 10000 }" > /dev/null -if [ $? 
-ne 0 ];then - ret=1 - echo "FAIL: cannot find return entry on veth0" 1>&2 -fi - -exit $ret diff --git a/tools/testing/selftests/netfilter/nft_queue.sh b/tools/testing/selftests/netfilter/nft_queue.sh deleted file mode 100755 index e127297533..0000000000 --- a/tools/testing/selftests/netfilter/nft_queue.sh +++ /dev/null @@ -1,449 +0,0 @@ -#!/bin/bash -# -# This tests nf_queue: -# 1. can process packets from all hooks -# 2. support running nfqueue from more than one base chain -# -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" -nsrouter="nsrouter-$sfx" -timeout=4 - -cleanup() -{ - ip netns pids ${ns1} | xargs kill 2>/dev/null - ip netns pids ${ns2} | xargs kill 2>/dev/null - ip netns pids ${nsrouter} | xargs kill 2>/dev/null - - ip netns del ${ns1} - ip netns del ${ns2} - ip netns del ${nsrouter} - rm -f "$TMPFILE0" - rm -f "$TMPFILE1" - rm -f "$TMPFILE2" "$TMPFILE3" -} - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ip netns add ${nsrouter} -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace" - exit $ksft_skip -fi - -TMPFILE0=$(mktemp) -TMPFILE1=$(mktemp) -TMPFILE2=$(mktemp) -TMPFILE3=$(mktemp) -trap cleanup EXIT - -ip netns add ${ns1} -ip netns add ${ns2} - -ip link add veth0 netns ${nsrouter} type veth peer name eth0 netns ${ns1} > /dev/null 2>&1 -if [ $? 
-ne 0 ];then - echo "SKIP: No virtual ethernet pair device support in kernel" - exit $ksft_skip -fi -ip link add veth1 netns ${nsrouter} type veth peer name eth0 netns ${ns2} - -ip -net ${nsrouter} link set lo up -ip -net ${nsrouter} link set veth0 up -ip -net ${nsrouter} addr add 10.0.1.1/24 dev veth0 -ip -net ${nsrouter} addr add dead:1::1/64 dev veth0 - -ip -net ${nsrouter} link set veth1 up -ip -net ${nsrouter} addr add 10.0.2.1/24 dev veth1 -ip -net ${nsrouter} addr add dead:2::1/64 dev veth1 - -ip -net ${ns1} link set lo up -ip -net ${ns1} link set eth0 up - -ip -net ${ns2} link set lo up -ip -net ${ns2} link set eth0 up - -ip -net ${ns1} addr add 10.0.1.99/24 dev eth0 -ip -net ${ns1} addr add dead:1::99/64 dev eth0 -ip -net ${ns1} route add default via 10.0.1.1 -ip -net ${ns1} route add default via dead:1::1 - -ip -net ${ns2} addr add 10.0.2.99/24 dev eth0 -ip -net ${ns2} addr add dead:2::99/64 dev eth0 -ip -net ${ns2} route add default via 10.0.2.1 -ip -net ${ns2} route add default via dead:2::1 - -load_ruleset() { - local name=$1 - local prio=$2 - -ip netns exec ${nsrouter} nft -f /dev/stdin < /dev/null - if [ $? -ne 0 ];then - return 1 - fi - - ip netns exec ${ns1} ping -c 1 -q dead:2::99 > /dev/null - if [ $? -ne 0 ];then - return 1 - fi - - return 0 -} - -test_ping_router() { - ip netns exec ${ns1} ping -c 1 -q 10.0.2.1 > /dev/null - if [ $? -ne 0 ];then - return 1 - fi - - ip netns exec ${ns1} ping -c 1 -q dead:2::1 > /dev/null - if [ $? -ne 0 ];then - return 1 - fi - - return 0 -} - -test_queue_blackhole() { - local proto=$1 - -ip netns exec ${nsrouter} nft -f /dev/stdin < /dev/null - lret=$? - elif [ $proto = "ip6" ]; then - ip netns exec ${ns1} ping -W 2 -c 1 -q dead:2::99 > /dev/null - lret=$? - else - lret=111 - fi - - # queue without bypass keyword should drop traffic if no listener exists. 
- if [ $lret -eq 0 ];then - echo "FAIL: $proto expected failure, got $lret" 1>&2 - exit 1 - fi - - ip netns exec ${nsrouter} nft delete table $proto blackh - if [ $? -ne 0 ] ;then - echo "FAIL: $proto: Could not delete blackh table" - exit 1 - fi - - echo "PASS: $proto: statement with no listener results in packet drop" -} - -test_queue() -{ - local expected=$1 - local last="" - - # spawn nf-queue listeners - ip netns exec ${nsrouter} ./nf-queue -c -q 0 -t $timeout > "$TMPFILE0" & - ip netns exec ${nsrouter} ./nf-queue -c -q 1 -t $timeout > "$TMPFILE1" & - sleep 1 - test_ping - ret=$? - if [ $ret -ne 0 ];then - echo "FAIL: netns routing/connectivity with active listener on queue $queue: $ret" 1>&2 - exit $ret - fi - - test_ping_router - ret=$? - if [ $ret -ne 0 ];then - echo "FAIL: netns router unreachable listener on queue $queue: $ret" 1>&2 - exit $ret - fi - - wait - ret=$? - - for file in $TMPFILE0 $TMPFILE1; do - last=$(tail -n1 "$file") - if [ x"$last" != x"$expected packets total" ]; then - echo "FAIL: Expected $expected packets total, but got $last" 1>&2 - cat "$file" 1>&2 - - ip netns exec ${nsrouter} nft list ruleset - exit 1 - fi - done - - echo "PASS: Expected and received $last" -} - -test_tcp_forward() -{ - ip netns exec ${nsrouter} ./nf-queue -q 2 -t $timeout & - local nfqpid=$! - - tmpfile=$(mktemp) || exit 1 - dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile - ip netns exec ${ns2} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null & - local rpid=$! - - sleep 1 - ip netns exec ${ns1} nc -w 5 10.0.2.99 12345 <"$tmpfile" >/dev/null & - - rm -f "$tmpfile" - - wait $rpid - wait $lpid - [ $? -eq 0 ] && echo "PASS: tcp and nfqueue in forward chain" -} - -test_tcp_localhost() -{ - tmpfile=$(mktemp) || exit 1 - - dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile - ip netns exec ${nsrouter} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null & - local rpid=$! - - ip netns exec ${nsrouter} ./nf-queue -q 3 -t $timeout & - local nfqpid=$! 
- - sleep 1 - ip netns exec ${nsrouter} nc -w 5 127.0.0.1 12345 <"$tmpfile" > /dev/null - rm -f "$tmpfile" - - wait $rpid - [ $? -eq 0 ] && echo "PASS: tcp via loopback" - wait 2>/dev/null -} - -test_tcp_localhost_connectclose() -{ - tmpfile=$(mktemp) || exit 1 - - ip netns exec ${nsrouter} ./connect_close -p 23456 -t $timeout & - - ip netns exec ${nsrouter} ./nf-queue -q 3 -t $timeout & - local nfqpid=$! - - sleep 1 - rm -f "$tmpfile" - - wait $rpid - [ $? -eq 0 ] && echo "PASS: tcp via loopback with connect/close" - wait 2>/dev/null -} - -test_tcp_localhost_requeue() -{ -ip netns exec ${nsrouter} nft -f /dev/stdin </dev/null & - local rpid=$! - - ip netns exec ${nsrouter} ./nf-queue -c -q 1 -t $timeout > "$TMPFILE2" & - - # nfqueue 1 will be called via output hook. But this time, - # re-queue the packet to nfqueue program on queue 2. - ip netns exec ${nsrouter} ./nf-queue -G -d 150 -c -q 0 -Q 1 -t $timeout > "$TMPFILE3" & - - sleep 1 - ip netns exec ${nsrouter} nc -w 5 127.0.0.1 12345 <"$tmpfile" > /dev/null - rm -f "$tmpfile" - - wait - - if ! diff -u "$TMPFILE2" "$TMPFILE3" ; then - echo "FAIL: lost packets during requeue?!" 1>&2 - return - fi - - echo "PASS: tcp via loopback and re-queueing" -} - -test_icmp_vrf() { - ip -net $ns1 link add tvrf type vrf table 9876 - if [ $? -ne 0 ];then - echo "SKIP: Could not add vrf device" - return - fi - - ip -net $ns1 li set eth0 master tvrf - ip -net $ns1 li set tvrf up - - ip -net $ns1 route add 10.0.2.0/24 via 10.0.1.1 dev eth0 table 9876 -ip netns exec ${ns1} nft -f /dev/stdin < /dev/null - - for n in output post; do - for d in tvrf eth0; do - ip netns exec ${ns1} nft list chain inet filter $n | grep -q "oifname \"$d\" icmp type echo-request counter packets 1" - if [ $? -ne 0 ] ; then - echo "FAIL: chain $n: icmp packet counter mismatch for device $d" 1>&2 - ip netns exec ${ns1} nft list ruleset - ret=1 - return - fi - done - done - - wait $nfqpid - [ $? 
-eq 0 ] && echo "PASS: icmp+nfqueue via vrf" - wait 2>/dev/null -} - -ip netns exec ${nsrouter} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null - -load_ruleset "filter" 0 - -sleep 3 - -test_ping -ret=$? -if [ $ret -eq 0 ];then - # queue bypass works (rules were skipped, no listener) - echo "PASS: ${ns1} can reach ${ns2}" -else - echo "FAIL: ${ns1} cannot reach ${ns2}: $ret" 1>&2 - exit $ret -fi - -test_queue_blackhole ip -test_queue_blackhole ip6 - -# dummy ruleset to add base chains between the -# queueing rules. We don't want the second reinject -# to re-execute the old hooks. -load_counter_ruleset 10 - -# we are hooking all: prerouting/input/forward/output/postrouting. -# we ping ${ns2} from ${ns1} via ${nsrouter} using ipv4 and ipv6, so: -# 1x icmp prerouting,forward,postrouting -> 3 queue events (6 incl. reply). -# 1x icmp prerouting,input,output postrouting -> 4 queue events incl. reply. -# so we expect that userspace program receives 10 packets. -test_queue 10 - -# same. We queue to a second program as well. -load_ruleset "filter2" 20 -test_queue 20 - -test_tcp_forward -test_tcp_localhost -test_tcp_localhost_connectclose -test_tcp_localhost_requeue -test_icmp_vrf - -exit $ret diff --git a/tools/testing/selftests/netfilter/nft_synproxy.sh b/tools/testing/selftests/netfilter/nft_synproxy.sh deleted file mode 100755 index b62933b680..0000000000 --- a/tools/testing/selftests/netfilter/nft_synproxy.sh +++ /dev/null @@ -1,117 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -rnd=$(mktemp -u XXXXXXXX) -nsr="nsr-$rnd" # synproxy machine -ns1="ns1-$rnd" # iperf client -ns2="ns2-$rnd" # iperf server - -checktool (){ - if ! 
$1 > /dev/null 2>&1; then - echo "SKIP: Could not $2" - exit $ksft_skip - fi -} - -checktool "nft --version" "run test without nft tool" -checktool "ip -Version" "run test without ip tool" -checktool "iperf3 --version" "run test without iperf3" -checktool "ip netns add $nsr" "create net namespace" - -modprobe -q nf_conntrack - -ip netns add $ns1 -ip netns add $ns2 - -cleanup() { - ip netns pids $ns1 | xargs kill 2>/dev/null - ip netns pids $ns2 | xargs kill 2>/dev/null - ip netns del $ns1 - ip netns del $ns2 - - ip netns del $nsr -} - -trap cleanup EXIT - -ip link add veth0 netns $nsr type veth peer name eth0 netns $ns1 -ip link add veth1 netns $nsr type veth peer name eth0 netns $ns2 - -for dev in lo veth0 veth1; do -ip -net $nsr link set $dev up -done - -ip -net $nsr addr add 10.0.1.1/24 dev veth0 -ip -net $nsr addr add 10.0.2.1/24 dev veth1 - -ip netns exec $nsr sysctl -q net.ipv4.conf.veth0.forwarding=1 -ip netns exec $nsr sysctl -q net.ipv4.conf.veth1.forwarding=1 -ip netns exec $nsr sysctl -q net.netfilter.nf_conntrack_tcp_loose=0 - -for n in $ns1 $ns2; do - ip -net $n link set lo up - ip -net $n link set eth0 up -done -ip -net $ns1 addr add 10.0.1.99/24 dev eth0 -ip -net $ns2 addr add 10.0.2.99/24 dev eth0 -ip -net $ns1 route add default via 10.0.1.1 -ip -net $ns2 route add default via 10.0.2.1 - -# test basic connectivity -if ! ip netns exec $ns1 ping -c 1 -q 10.0.2.99 > /dev/null; then - echo "ERROR: $ns1 cannot reach $ns2" 1>&2 - exit 1 -fi - -if ! ip netns exec $ns2 ping -c 1 -q 10.0.1.99 > /dev/null; then - echo "ERROR: $ns2 cannot reach $ns1" 1>&2 - exit 1 -fi - -ip netns exec $ns2 iperf3 -s > /dev/null 2>&1 & -# ip netns exec $nsr tcpdump -vvv -n -i veth1 tcp | head -n 10 & - -sleep 1 - -ip netns exec $nsr nft -f - < /dev/null - -if [ $? -ne 0 ]; then - echo "FAIL: iperf3 returned an error" 1>&2 - ret=$? 
- ip netns exec $nsr nft list ruleset -else - echo "PASS: synproxy connection successful" -fi - -exit $ret diff --git a/tools/testing/selftests/netfilter/nft_trans_stress.sh b/tools/testing/selftests/netfilter/nft_trans_stress.sh deleted file mode 100755 index 2ffba45a78..0000000000 --- a/tools/testing/selftests/netfilter/nft_trans_stress.sh +++ /dev/null @@ -1,151 +0,0 @@ -#!/bin/bash -# -# This test is for stress-testing the nf_tables config plane path vs. -# packet path processing: Make sure we never release rules that are -# still visible to other cpus. -# -# set -e - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -testns=testns-$(mktemp -u "XXXXXXXX") -tmp="" - -tables="foo bar baz quux" -global_ret=0 -eret=0 -lret=0 - -cleanup() { - ip netns pids "$testns" | xargs kill 2>/dev/null - ip netns del "$testns" - - rm -f "$tmp" -} - -check_result() -{ - local r=$1 - local OK="PASS" - - if [ $r -ne 0 ] ;then - OK="FAIL" - global_ret=$r - fi - - echo "$OK: nft $2 test returned $r" - - eret=0 -} - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? 
-ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -trap cleanup EXIT -tmp=$(mktemp) - -for table in $tables; do - echo add table inet "$table" >> "$tmp" - echo flush table inet "$table" >> "$tmp" - - echo "add chain inet $table INPUT { type filter hook input priority 0; }" >> "$tmp" - echo "add chain inet $table OUTPUT { type filter hook output priority 0; }" >> "$tmp" - for c in $(seq 1 400); do - chain=$(printf "chain%03u" "$c") - echo "add chain inet $table $chain" >> "$tmp" - done - - for c in $(seq 1 400); do - chain=$(printf "chain%03u" "$c") - for BASE in INPUT OUTPUT; do - echo "add rule inet $table $BASE counter jump $chain" >> "$tmp" - done - echo "add rule inet $table $chain counter return" >> "$tmp" - done -done - -ip netns add "$testns" -ip -netns "$testns" link set lo up - -lscpu | grep ^CPU\(s\): | ( read cpu cpunum ; -cpunum=$((cpunum-1)) -for i in $(seq 0 $cpunum);do - mask=$(printf 0x%x $((1<<$i))) - ip netns exec "$testns" taskset $mask ping -4 127.0.0.1 -fq > /dev/null & - ip netns exec "$testns" taskset $mask ping -6 ::1 -fq > /dev/null & -done) - -sleep 1 - -ip netns exec "$testns" nft -f "$tmp" -for i in $(seq 1 10) ; do ip netns exec "$testns" nft -f "$tmp" & done - -for table in $tables;do - randsleep=$((RANDOM%2)) - sleep $randsleep - ip netns exec "$testns" nft delete table inet $table - lret=$? - if [ $lret -ne 0 ]; then - eret=$lret - fi -done - -check_result $eret "add/delete" - -for i in $(seq 1 10) ; do - (echo "flush ruleset"; cat "$tmp") | ip netns exec "$testns" nft -f /dev/stdin - - lret=$? - if [ $lret -ne 0 ]; then - eret=$lret - fi -done - -check_result $eret "reload" - -for i in $(seq 1 10) ; do - (echo "flush ruleset"; cat "$tmp" - echo "insert rule inet foo INPUT meta nftrace set 1" - echo "insert rule inet foo OUTPUT meta nftrace set 1" - ) | ip netns exec "$testns" nft -f /dev/stdin - lret=$? 
- if [ $lret -ne 0 ]; then - eret=$lret - fi - - (echo "flush ruleset"; cat "$tmp" - ) | ip netns exec "$testns" nft -f /dev/stdin - - lret=$? - if [ $lret -ne 0 ]; then - eret=$lret - fi -done - -check_result $eret "add/delete with nftrace enabled" - -echo "insert rule inet foo INPUT meta nftrace set 1" >> $tmp -echo "insert rule inet foo OUTPUT meta nftrace set 1" >> $tmp - -for i in $(seq 1 10) ; do - (echo "flush ruleset"; cat "$tmp") | ip netns exec "$testns" nft -f /dev/stdin - - lret=$? - if [ $lret -ne 0 ]; then - eret=1 - fi -done - -check_result $lret "add/delete with nftrace enabled" - -exit $global_ret diff --git a/tools/testing/selftests/netfilter/nft_zones_many.sh b/tools/testing/selftests/netfilter/nft_zones_many.sh deleted file mode 100755 index 5a8db0b489..0000000000 --- a/tools/testing/selftests/netfilter/nft_zones_many.sh +++ /dev/null @@ -1,163 +0,0 @@ -#!/bin/bash - -# Test insertion speed for packets with identical addresses/ports -# that are all placed in distinct conntrack zones. - -sfx=$(mktemp -u "XXXXXXXX") -ns="ns-$sfx" - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -zones=2000 -have_ct_tool=0 -ret=0 - -cleanup() -{ - ip netns del $ns -} - -checktool (){ - if ! $1 > /dev/null 2>&1; then - echo "SKIP: Could not $2" - exit $ksft_skip - fi -} - -checktool "nft --version" "run test without nft tool" -checktool "ip -Version" "run test without ip tool" -checktool "socat -V" "run test without socat tool" -checktool "ip netns add $ns" "create net namespace" - -trap cleanup EXIT - -conntrack -V > /dev/null 2>&1 -if [ $? -eq 0 ];then - have_ct_tool=1 -fi - -ip -net "$ns" link set lo up - -test_zones() { - local max_zones=$1 - -ip netns exec $ns sysctl -q net.netfilter.nf_conntrack_udp_timeout=3600 -ip netns exec $ns nft -f /dev/stdin</dev/null | ip netns exec "$ns" socat STDIN UDP:127.0.0.1:12345,sourceport=12345 - if [ $? 
-ne 0 ] ;then - ret=1 - break - fi - - stop=$(date +%s%3N) - local duration=$((stop-start)) - echo "PASS: added 1000 entries in $duration ms (now $i total, loop $j)" - done - - if [ $have_ct_tool -eq 1 ]; then - local count=$(ip netns exec "$ns" conntrack -C) - local duration=$((stop-outerstart)) - - if [ $count -eq $max_zones ]; then - echo "PASS: inserted $count entries from packet path in $duration ms total" - else - ip netns exec $ns conntrack -S 1>&2 - echo "FAIL: inserted $count entries from packet path in $duration ms total, expected $max_zones entries" - ret=1 - fi - fi - - if [ $ret -ne 0 ];then - echo "FAIL: insert $max_zones entries from packet path" 1>&2 - fi -} - -test_conntrack_tool() { - local max_zones=$1 - - ip netns exec $ns conntrack -F >/dev/null 2>/dev/null - - local outerstart=$(date +%s%3N) - local start=$(date +%s%3N) - local stop=$start - local i=0 - while [ $i -lt $max_zones ]; do - i=$((i + 1)) - ip netns exec "$ns" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \ - --timeout 3600 --state ESTABLISHED --sport 12345 --dport 1000 --zone $i >/dev/null 2>&1 - if [ $? 
-ne 0 ];then - ip netns exec "$ns" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \ - --timeout 3600 --state ESTABLISHED --sport 12345 --dport 1000 --zone $i > /dev/null - echo "FAIL: conntrack -I returned an error" - ret=1 - break - fi - - if [ $((i%1000)) -eq 0 ];then - stop=$(date +%s%3N) - - local duration=$((stop-start)) - echo "PASS: added 1000 entries in $duration ms (now $i total)" - start=$stop - fi - done - - local count=$(ip netns exec "$ns" conntrack -C) - local duration=$((stop-outerstart)) - - if [ $count -eq $max_zones ]; then - echo "PASS: inserted $count entries via ctnetlink in $duration ms" - else - ip netns exec $ns conntrack -S 1>&2 - echo "FAIL: inserted $count entries via ctnetlink in $duration ms, expected $max_zones entries ($duration ms)" - ret=1 - fi -} - -test_zones $zones - -if [ $have_ct_tool -eq 1 ];then - test_conntrack_tool $zones -else - echo "SKIP: Could not run ctnetlink insertion test without conntrack tool" - if [ $ret -eq 0 ];then - exit $ksft_skip - fi -fi - -exit $ret diff --git a/tools/testing/selftests/netfilter/rpath.sh b/tools/testing/selftests/netfilter/rpath.sh deleted file mode 100755 index 5289c8447a..0000000000 --- a/tools/testing/selftests/netfilter/rpath.sh +++ /dev/null @@ -1,169 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -# return code to signal skipped test -ksft_skip=4 - -# search for legacy iptables (it uses the xtables extensions -if iptables-legacy --version >/dev/null 2>&1; then - iptables='iptables-legacy' -elif iptables --version >/dev/null 2>&1; then - iptables='iptables' -else - iptables='' -fi - -if ip6tables-legacy --version >/dev/null 2>&1; then - ip6tables='ip6tables-legacy' -elif ip6tables --version >/dev/null 2>&1; then - ip6tables='ip6tables' -else - ip6tables='' -fi - -if nft --version >/dev/null 2>&1; then - nft='nft' -else - nft='' -fi - -if [ -z "$iptables$ip6tables$nft" ]; then - echo "SKIP: Test needs iptables, ip6tables or nft" - exit $ksft_skip -fi - -sfx=$(mktemp -u 
"XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" -trap "ip netns del $ns1; ip netns del $ns2" EXIT - -# create two netns, disable rp_filter in ns2 and -# keep IPv6 address when moving into VRF -ip netns add "$ns1" -ip netns add "$ns2" -ip netns exec "$ns2" sysctl -q net.ipv4.conf.all.rp_filter=0 -ip netns exec "$ns2" sysctl -q net.ipv4.conf.default.rp_filter=0 -ip netns exec "$ns2" sysctl -q net.ipv6.conf.all.keep_addr_on_down=1 - -# a standard connection between the netns, should not trigger rp filter -ip -net "$ns1" link add v0 type veth peer name v0 netns "$ns2" -ip -net "$ns1" link set v0 up; ip -net "$ns2" link set v0 up -ip -net "$ns1" a a 192.168.23.2/24 dev v0 -ip -net "$ns2" a a 192.168.23.1/24 dev v0 -ip -net "$ns1" a a fec0:23::2/64 dev v0 nodad -ip -net "$ns2" a a fec0:23::1/64 dev v0 nodad - -# rp filter testing: ns1 sends packets via v0 which ns2 would route back via d0 -ip -net "$ns2" link add d0 type dummy -ip -net "$ns2" link set d0 up -ip -net "$ns1" a a 192.168.42.2/24 dev v0 -ip -net "$ns2" a a 192.168.42.1/24 dev d0 -ip -net "$ns1" a a fec0:42::2/64 dev v0 nodad -ip -net "$ns2" a a fec0:42::1/64 dev d0 nodad - -# firewall matches to test -[ -n "$iptables" ] && { - common='-t raw -A PREROUTING -s 192.168.0.0/16' - ip netns exec "$ns2" "$iptables" $common -m rpfilter - ip netns exec "$ns2" "$iptables" $common -m rpfilter --invert -} -[ -n "$ip6tables" ] && { - common='-t raw -A PREROUTING -s fec0::/16' - ip netns exec "$ns2" "$ip6tables" $common -m rpfilter - ip netns exec "$ns2" "$ip6tables" $common -m rpfilter --invert -} -[ -n "$nft" ] && ip netns exec "$ns2" $nft -f - </dev/null -} - -clear_counters() { - [ -n "$iptables" ] && ip netns exec "$ns2" "$iptables" -t raw -Z - [ -n "$ip6tables" ] && ip netns exec "$ns2" "$ip6tables" -t raw -Z - if [ -n "$nft" ]; then - ( - echo "delete table inet t"; - ip netns exec "$ns2" $nft -s list table inet t; - ) | ip netns exec "$ns2" $nft -f - - fi -} - -testrun() { - clear_counters - - # test 1: martian 
traffic should fail rpfilter matches - netns_ping "$ns1" -I v0 192.168.42.1 && \ - die "martian ping 192.168.42.1 succeeded" - netns_ping "$ns1" -I v0 fec0:42::1 && \ - die "martian ping fec0:42::1 succeeded" - - ipt_zero_rule "$iptables" || die "iptables matched martian" - ipt_zero_rule "$ip6tables" || die "ip6tables matched martian" - ipt_zero_reverse_rule "$iptables" && die "iptables not matched martian" - ipt_zero_reverse_rule "$ip6tables" && die "ip6tables not matched martian" - nft_zero_rule ip || die "nft IPv4 matched martian" - nft_zero_rule ip6 || die "nft IPv6 matched martian" - - clear_counters - - # test 2: rpfilter match should pass for regular traffic - netns_ping "$ns1" 192.168.23.1 || \ - die "regular ping 192.168.23.1 failed" - netns_ping "$ns1" fec0:23::1 || \ - die "regular ping fec0:23::1 failed" - - ipt_zero_rule "$iptables" && die "iptables match not effective" - ipt_zero_rule "$ip6tables" && die "ip6tables match not effective" - ipt_zero_reverse_rule "$iptables" || die "iptables match over-effective" - ipt_zero_reverse_rule "$ip6tables" || die "ip6tables match over-effective" - nft_zero_rule ip && die "nft IPv4 match not effective" - nft_zero_rule ip6 && die "nft IPv6 match not effective" - -} - -testrun - -# repeat test with vrf device in $ns2 -ip -net "$ns2" link add vrf0 type vrf table 10 -ip -net "$ns2" link set vrf0 up -ip -net "$ns2" link set v0 master vrf0 - -testrun - -echo "PASS: netfilter reverse path match works as intended" -exit 0 diff --git a/tools/testing/selftests/netfilter/sctp_collision.c b/tools/testing/selftests/netfilter/sctp_collision.c deleted file mode 100644 index 21bb1cfd8a..0000000000 --- a/tools/testing/selftests/netfilter/sctp_collision.c +++ /dev/null @@ -1,99 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include -#include - -int main(int argc, char *argv[]) -{ - struct sockaddr_in saddr = {}, daddr = {}; - int sd, ret, len = sizeof(daddr); - struct timeval tv = {25, 0}; - char 
buf[] = "hello"; - - if (argc != 6 || (strcmp(argv[1], "server") && strcmp(argv[1], "client"))) { - printf("%s \n", - argv[0]); - return -1; - } - - sd = socket(AF_INET, SOCK_SEQPACKET, IPPROTO_SCTP); - if (sd < 0) { - printf("Failed to create sd\n"); - return -1; - } - - saddr.sin_family = AF_INET; - saddr.sin_addr.s_addr = inet_addr(argv[2]); - saddr.sin_port = htons(atoi(argv[3])); - - ret = bind(sd, (struct sockaddr *)&saddr, sizeof(saddr)); - if (ret < 0) { - printf("Failed to bind to address\n"); - goto out; - } - - ret = listen(sd, 5); - if (ret < 0) { - printf("Failed to listen on port\n"); - goto out; - } - - daddr.sin_family = AF_INET; - daddr.sin_addr.s_addr = inet_addr(argv[4]); - daddr.sin_port = htons(atoi(argv[5])); - - /* make test shorter than 25s */ - ret = setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)); - if (ret < 0) { - printf("Failed to setsockopt SO_RCVTIMEO\n"); - goto out; - } - - if (!strcmp(argv[1], "server")) { - sleep(1); /* wait a bit for client's INIT */ - ret = connect(sd, (struct sockaddr *)&daddr, len); - if (ret < 0) { - printf("Failed to connect to peer\n"); - goto out; - } - ret = recvfrom(sd, buf, sizeof(buf), 0, (struct sockaddr *)&daddr, &len); - if (ret < 0) { - printf("Failed to recv msg %d\n", ret); - goto out; - } - ret = sendto(sd, buf, strlen(buf) + 1, 0, (struct sockaddr *)&daddr, len); - if (ret < 0) { - printf("Failed to send msg %d\n", ret); - goto out; - } - printf("Server: sent! 
%d\n", ret); - } - - if (!strcmp(argv[1], "client")) { - usleep(300000); /* wait a bit for server's listening */ - ret = connect(sd, (struct sockaddr *)&daddr, len); - if (ret < 0) { - printf("Failed to connect to peer\n"); - goto out; - } - sleep(1); /* wait a bit for server's delayed INIT_ACK to reproduce the issue */ - ret = sendto(sd, buf, strlen(buf) + 1, 0, (struct sockaddr *)&daddr, len); - if (ret < 0) { - printf("Failed to send msg %d\n", ret); - goto out; - } - ret = recvfrom(sd, buf, sizeof(buf), 0, (struct sockaddr *)&daddr, &len); - if (ret < 0) { - printf("Failed to recv msg %d\n", ret); - goto out; - } - printf("Client: rcvd! %d\n", ret); - } - ret = 0; -out: - close(sd); - return ret; -} diff --git a/tools/testing/selftests/netfilter/settings b/tools/testing/selftests/netfilter/settings deleted file mode 100644 index 6091b45d22..0000000000 --- a/tools/testing/selftests/netfilter/settings +++ /dev/null @@ -1 +0,0 @@ -timeout=120 diff --git a/tools/testing/selftests/netfilter/xt_string.sh b/tools/testing/selftests/netfilter/xt_string.sh deleted file mode 100755 index 1802653a47..0000000000 --- a/tools/testing/selftests/netfilter/xt_string.sh +++ /dev/null @@ -1,128 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -# return code to signal skipped test -ksft_skip=4 -rc=0 - -if ! iptables --version >/dev/null 2>&1; then - echo "SKIP: Test needs iptables" - exit $ksft_skip -fi -if ! ip -V >/dev/null 2>&1; then - echo "SKIP: Test needs iproute2" - exit $ksft_skip -fi -if ! nc -h >/dev/null 2>&1; then - echo "SKIP: Test needs netcat" - exit $ksft_skip -fi - -pattern="foo bar baz" -patlen=11 -hdrlen=$((20 + 8)) # IPv4 + UDP -ns="ns-$(mktemp -u XXXXXXXX)" -trap 'ip netns del $ns' EXIT -ip netns add "$ns" -ip -net "$ns" link add d0 type dummy -ip -net "$ns" link set d0 up -ip -net "$ns" addr add 10.1.2.1/24 dev d0 - -#ip netns exec "$ns" tcpdump -npXi d0 & -#tcpdump_pid=$! 
-#trap 'kill $tcpdump_pid; ip netns del $ns' EXIT - -add_rule() { # (alg, from, to) - ip netns exec "$ns" \ - iptables -A OUTPUT -o d0 -m string \ - --string "$pattern" --algo $1 --from $2 --to $3 -} -showrules() { # () - ip netns exec "$ns" iptables -v -S OUTPUT | grep '^-A' -} -zerorules() { - ip netns exec "$ns" iptables -Z OUTPUT -} -countrule() { # (pattern) - showrules | grep -c -- "$*" -} -send() { # (offset) - ( for ((i = 0; i < $1 - $hdrlen; i++)); do - printf " " - done - printf "$pattern" - ) | ip netns exec "$ns" nc -w 1 -u 10.1.2.2 27374 -} - -add_rule bm 1000 1500 -add_rule bm 1400 1600 -add_rule kmp 1000 1500 -add_rule kmp 1400 1600 - -zerorules -send 0 -send $((1000 - $patlen)) -if [ $(countrule -c 0 0) -ne 4 ]; then - echo "FAIL: rules match data before --from" - showrules - ((rc--)) -fi - -zerorules -send 1000 -send $((1400 - $patlen)) -if [ $(countrule -c 2) -ne 2 ]; then - echo "FAIL: only two rules should match at low offset" - showrules - ((rc--)) -fi - -zerorules -send $((1500 - $patlen)) -if [ $(countrule -c 1) -ne 4 ]; then - echo "FAIL: all rules should match at end of packet" - showrules - ((rc--)) -fi - -zerorules -send 1495 -if [ $(countrule -c 1) -ne 1 ]; then - echo "FAIL: only kmp with proper --to should match pattern spanning fragments" - showrules - ((rc--)) -fi - -zerorules -send 1500 -if [ $(countrule -c 1) -ne 2 ]; then - echo "FAIL: two rules should match pattern at start of second fragment" - showrules - ((rc--)) -fi - -zerorules -send $((1600 - $patlen)) -if [ $(countrule -c 1) -ne 2 ]; then - echo "FAIL: two rules should match pattern at end of largest --to" - showrules - ((rc--)) -fi - -zerorules -send $((1600 - $patlen + 1)) -if [ $(countrule -c 1) -ne 0 ]; then - echo "FAIL: no rules should match pattern extending largest --to" - showrules - ((rc--)) -fi - -zerorules -send 1600 -if [ $(countrule -c 1) -ne 0 ]; then - echo "FAIL: no rule should match pattern past largest --to" - showrules - ((rc--)) -fi - -exit $rc diff 
--git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 6ba4f8275a..994477ee87 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -600,6 +601,25 @@ int expect_strne(const char *expr, int llen, const char *cmp) return ret; } +#define EXPECT_STRBUFEQ(cond, expr, buf, val, cmp) \ + do { if (!(cond)) result(llen, SKIPPED); else ret += expect_str_buf_eq(expr, buf, val, llen, cmp); } while (0) + +static __attribute__((unused)) +int expect_str_buf_eq(size_t expr, const char *buf, size_t val, int llen, const char *cmp) +{ + llen += printf(" = %lu <%s> ", (unsigned long)expr, buf); + if (strcmp(buf, cmp) != 0) { + result(llen, FAIL); + return 1; + } + if (expr != val) { + result(llen, FAIL); + return 1; + } + + result(llen, OK); + return 0; +} /* declare tests based on line numbers. There must be exactly one test per line. 
*/ #define CASE_TEST(name) \ @@ -761,6 +781,45 @@ int test_stat_timestamps(void) return 0; } +int test_uname(void) +{ + struct utsname buf; + char osrelease[sizeof(buf.release)]; + ssize_t r; + int fd; + + memset(&buf.domainname, 'P', sizeof(buf.domainname)); + + if (uname(&buf)) + return 1; + + if (strncmp("Linux", buf.sysname, sizeof(buf.sysname))) + return 1; + + fd = open("/proc/sys/kernel/osrelease", O_RDONLY); + if (fd == -1) + return 1; + + r = read(fd, osrelease, sizeof(osrelease)); + if (r == -1) + return 1; + + close(fd); + + if (osrelease[r - 1] == '\n') + r--; + + /* Validate one of the later fields to ensure field sizes are correct */ + if (strncmp(osrelease, buf.release, r)) + return 1; + + /* Ensure the field domainname is set, it is missing from struct old_utsname */ + if (strnlen(buf.domainname, sizeof(buf.domainname)) == sizeof(buf.domainname)) + return 1; + + return 0; +} + int test_mmap_munmap(void) { int ret, fd, i, page_size; @@ -966,6 +1025,8 @@ int run_syscall(int min, int max) CASE_TEST(stat_fault); EXPECT_SYSER(1, stat(NULL, &stat_buf), -1, EFAULT); break; CASE_TEST(stat_timestamps); EXPECT_SYSZR(1, test_stat_timestamps()); break; CASE_TEST(symlink_root); EXPECT_SYSER(1, symlink("/", "/"), -1, EEXIST); break; + CASE_TEST(uname); EXPECT_SYSZR(proc, test_uname()); break; + CASE_TEST(uname_fault); EXPECT_SYSER(1, uname(NULL), -1, EFAULT); break; CASE_TEST(unlink_root); EXPECT_SYSER(1, unlink("/"), -1, EISDIR); break; CASE_TEST(unlink_blah); EXPECT_SYSER(1, unlink("/proc/self/blah"), -1, ENOENT); break; CASE_TEST(wait_child); EXPECT_SYSER(1, wait(&tmp), -1, ECHILD); break; @@ -991,6 +1052,14 @@ int run_stdlib(int min, int max) for (test = min; test >= 0 && test <= max; test++) { int llen = 0; /* line length */ + /* For functions that take a long buffer, like strlcat() + * Add some more chars after the \0, to test functions that overwrite the buffer set + * the \0 at the exact right position. 
+ */ + char buf[10] = "test123456"; + buf[4] = '\0'; + + /* avoid leaving empty lines below, this will insert holes into * test numbers. */ @@ -1007,6 +1076,19 @@ int run_stdlib(int min, int max) CASE_TEST(strchr_foobar_z); EXPECT_STRZR(1, strchr("foobar", 'z')); break; CASE_TEST(strrchr_foobar_o); EXPECT_STREQ(1, strrchr("foobar", 'o'), "obar"); break; CASE_TEST(strrchr_foobar_z); EXPECT_STRZR(1, strrchr("foobar", 'z')); break; +#ifdef NOLIBC + CASE_TEST(strlcat_0); EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 0), buf, 3, "test"); break; + CASE_TEST(strlcat_1); EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 1), buf, 4, "test"); break; + CASE_TEST(strlcat_5); EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 5), buf, 7, "test"); break; + CASE_TEST(strlcat_6); EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 6), buf, 7, "testb"); break; + CASE_TEST(strlcat_7); EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 7), buf, 7, "testba"); break; + CASE_TEST(strlcat_8); EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 8), buf, 7, "testbar"); break; + CASE_TEST(strlcpy_0); EXPECT_STRBUFEQ(1, strlcpy(buf, "bar", 0), buf, 3, "test"); break; + CASE_TEST(strlcpy_1); EXPECT_STRBUFEQ(1, strlcpy(buf, "bar", 1), buf, 3, ""); break; + CASE_TEST(strlcpy_2); EXPECT_STRBUFEQ(1, strlcpy(buf, "bar", 2), buf, 3, "b"); break; + CASE_TEST(strlcpy_3); EXPECT_STRBUFEQ(1, strlcpy(buf, "bar", 3), buf, 3, "ba"); break; + CASE_TEST(strlcpy_4); EXPECT_STRBUFEQ(1, strlcpy(buf, "bar", 4), buf, 3, "bar"); break; +#endif CASE_TEST(memcmp_20_20); EXPECT_EQ(1, memcmp("aaa\x20", "aaa\x20", 4), 0); break; CASE_TEST(memcmp_20_60); EXPECT_LT(1, memcmp("aaa\x20", "aaa\x60", 4), 0); break; CASE_TEST(memcmp_60_20); EXPECT_GT(1, memcmp("aaa\x60", "aaa\x20", 4), 0); break; diff --git a/tools/testing/selftests/openat2/Makefile b/tools/testing/selftests/openat2/Makefile index 254d676a26..185dc76ebb 100644 --- a/tools/testing/selftests/openat2/Makefile +++ b/tools/testing/selftests/openat2/Makefile @@ -1,8 +1,18 @@ # SPDX-License-Identifier: GPL-2.0-or-later -CFLAGS += 
-Wall -O2 -g -fsanitize=address -fsanitize=undefined -static-libasan +CFLAGS += -Wall -O2 -g -fsanitize=address -fsanitize=undefined TEST_GEN_PROGS := openat2_test resolve_test rename_attack_test +# gcc requires -static-libasan in order to ensure that Address Sanitizer's +# library is the first one loaded. However, clang already statically links the +# Address Sanitizer if -fsanitize is specified. Therefore, simply omit +# -static-libasan for clang builds. +ifeq ($(LLVM),) + CFLAGS += -static-libasan +endif + +LOCAL_HDRS += helpers.h + include ../lib.mk -$(TEST_GEN_PROGS): helpers.c helpers.h +$(TEST_GEN_PROGS): helpers.c diff --git a/tools/testing/selftests/perf_events/.gitignore b/tools/testing/selftests/perf_events/.gitignore index 790c47001e..ee93dc4969 100644 --- a/tools/testing/selftests/perf_events/.gitignore +++ b/tools/testing/selftests/perf_events/.gitignore @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only sigtrap_threads remove_on_exec +watermark_signal diff --git a/tools/testing/selftests/perf_events/Makefile b/tools/testing/selftests/perf_events/Makefile index db93c4ff08..70e3ff2112 100644 --- a/tools/testing/selftests/perf_events/Makefile +++ b/tools/testing/selftests/perf_events/Makefile @@ -2,5 +2,5 @@ CFLAGS += -Wl,-no-as-needed -Wall $(KHDR_INCLUDES) LDFLAGS += -lpthread -TEST_GEN_PROGS := sigtrap_threads remove_on_exec +TEST_GEN_PROGS := sigtrap_threads remove_on_exec watermark_signal include ../lib.mk diff --git a/tools/testing/selftests/perf_events/watermark_signal.c b/tools/testing/selftests/perf_events/watermark_signal.c new file mode 100644 index 0000000000..49dc1e8311 --- /dev/null +++ b/tools/testing/selftests/perf_events/watermark_signal.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest_harness.h" + +#define __maybe_unused 
__attribute__((__unused__)) + +static int sigio_count; + +static void handle_sigio(int signum __maybe_unused, + siginfo_t *oh __maybe_unused, + void *uc __maybe_unused) +{ + ++sigio_count; +} + +static void do_child(void) +{ + raise(SIGSTOP); + + for (int i = 0; i < 20; ++i) + sleep(1); + + raise(SIGSTOP); + + exit(0); +} + +TEST(watermark_signal) +{ + struct perf_event_attr attr; + struct perf_event_mmap_page *p = NULL; + struct sigaction previous_sigio, sigio = { 0 }; + pid_t child = -1; + int child_status; + int fd = -1; + long page_size = sysconf(_SC_PAGE_SIZE); + + sigio.sa_sigaction = handle_sigio; + EXPECT_EQ(sigaction(SIGIO, &sigio, &previous_sigio), 0); + + memset(&attr, 0, sizeof(attr)); + attr.size = sizeof(attr); + attr.type = PERF_TYPE_SOFTWARE; + attr.config = PERF_COUNT_SW_DUMMY; + attr.sample_period = 1; + attr.disabled = 1; + attr.watermark = 1; + attr.context_switch = 1; + attr.wakeup_watermark = 1; + + child = fork(); + EXPECT_GE(child, 0); + if (child == 0) + do_child(); + else if (child < 0) { + perror("fork()"); + goto cleanup; + } + + if (waitpid(child, &child_status, WSTOPPED) != child || + !(WIFSTOPPED(child_status) && WSTOPSIG(child_status) == SIGSTOP)) { + fprintf(stderr, + "failed to sycnhronize with child errno=%d status=%x\n", + errno, + child_status); + goto cleanup; + } + + fd = syscall(__NR_perf_event_open, &attr, child, -1, -1, + PERF_FLAG_FD_CLOEXEC); + if (fd < 0) { + fprintf(stderr, "failed opening event %llx\n", attr.config); + goto cleanup; + } + + if (fcntl(fd, F_SETFL, FASYNC)) { + perror("F_SETFL FASYNC"); + goto cleanup; + } + + if (fcntl(fd, F_SETOWN, getpid())) { + perror("F_SETOWN getpid()"); + goto cleanup; + } + + if (fcntl(fd, F_SETSIG, SIGIO)) { + perror("F_SETSIG SIGIO"); + goto cleanup; + } + + p = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (p == NULL) { + perror("mmap"); + goto cleanup; + } + + if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0)) { + perror("PERF_EVENT_IOC_ENABLE"); + goto 
cleanup; + } + + if (kill(child, SIGCONT) < 0) { + perror("SIGCONT"); + goto cleanup; + } + + if (waitpid(child, &child_status, WSTOPPED) != -1 || errno != EINTR) + fprintf(stderr, + "expected SIGIO to terminate wait errno=%d status=%x\n%d", + errno, + child_status, + sigio_count); + + EXPECT_GE(sigio_count, 1); + +cleanup: + if (p != NULL) + munmap(p, 2 * page_size); + + if (fd >= 0) + close(fd); + + if (child > 0) { + kill(child, SIGKILL); + waitpid(child, NULL, 0); + } + + sigaction(SIGIO, &previous_sigio, NULL); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c index 01cc37bf61..f062a986e3 100644 --- a/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c +++ b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c @@ -307,5 +307,5 @@ int main(int argc, char **argv) test_pidfd_fdinfo_nspid(); test_pidfd_dead_fdinfo(); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/pidfd/pidfd_open_test.c b/tools/testing/selftests/pidfd/pidfd_open_test.c index 8a59438ccc..c62564c264 100644 --- a/tools/testing/selftests/pidfd/pidfd_open_test.c +++ b/tools/testing/selftests/pidfd/pidfd_open_test.c @@ -159,5 +159,7 @@ on_error: if (pidfd >= 0) close(pidfd); - return !ret ? 
ksft_exit_pass() : ksft_exit_fail(); + if (ret) + ksft_exit_fail(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/pidfd/pidfd_poll_test.c b/tools/testing/selftests/pidfd/pidfd_poll_test.c index 6108112753..55d74a5035 100644 --- a/tools/testing/selftests/pidfd/pidfd_poll_test.c +++ b/tools/testing/selftests/pidfd/pidfd_poll_test.c @@ -112,5 +112,5 @@ int main(int argc, char **argv) } ksft_test_result_pass("pidfd poll test: pass\n"); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/pidfd/pidfd_test.c b/tools/testing/selftests/pidfd/pidfd_test.c index c081ae9131..9faa686f90 100644 --- a/tools/testing/selftests/pidfd/pidfd_test.c +++ b/tools/testing/selftests/pidfd/pidfd_test.c @@ -572,5 +572,5 @@ int main(int argc, char **argv) test_pidfd_send_signal_exited_fail(); test_pidfd_send_signal_recycled_pid_fail(); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile index c376151982..b175e94e19 100644 --- a/tools/testing/selftests/powerpc/Makefile +++ b/tools/testing/selftests/powerpc/Makefile @@ -7,12 +7,6 @@ ARCH := $(shell echo $(ARCH) | sed -e s/ppc.*/powerpc/) ifeq ($(ARCH),powerpc) -GIT_VERSION = $(shell git describe --always --long --dirty || echo "unknown") - -CFLAGS := -std=gnu99 -O2 -Wall -Werror -DGIT_VERSION='"$(GIT_VERSION)"' -I$(CURDIR)/include $(CFLAGS) - -export CFLAGS - SUB_DIRS = alignment \ benchmarks \ cache_shape \ @@ -46,6 +40,7 @@ $(SUB_DIRS): BUILD_TARGET=$(OUTPUT)/$@; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $@ all include ../lib.mk +include ./flags.mk override define RUN_TESTS +@for TARGET in $(SUB_DIRS); do \ @@ -57,14 +52,14 @@ endef override define INSTALL_RULE +@for TARGET in $(SUB_DIRS); do \ BUILD_TARGET=$(OUTPUT)/$$TARGET; \ - $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install;\ + $(MAKE) OUTPUT=$$BUILD_TARGET INSTALL_PATH=$$INSTALL_PATH/$$TARGET -C $$TARGET install;\ done; endef 
emit_tests: +@for TARGET in $(SUB_DIRS); do \ BUILD_TARGET=$(OUTPUT)/$$TARGET; \ - $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET $@;\ + $(MAKE) OUTPUT=$$BUILD_TARGET COLLECTION=$(COLLECTION)/$$TARGET -s -C $$TARGET $@;\ done; override define CLEAN diff --git a/tools/testing/selftests/powerpc/alignment/Makefile b/tools/testing/selftests/powerpc/alignment/Makefile index 93e9af3744..66d5d7aaeb 100644 --- a/tools/testing/selftests/powerpc/alignment/Makefile +++ b/tools/testing/selftests/powerpc/alignment/Makefile @@ -3,5 +3,6 @@ TEST_GEN_PROGS := copy_first_unaligned alignment_handler top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/benchmarks/Makefile b/tools/testing/selftests/powerpc/benchmarks/Makefile index a32a6ab899..1321922038 100644 --- a/tools/testing/selftests/powerpc/benchmarks/Makefile +++ b/tools/testing/selftests/powerpc/benchmarks/Makefile @@ -4,10 +4,11 @@ TEST_GEN_FILES := exec_target TEST_FILES := settings -CFLAGS += -O2 - top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk + +CFLAGS += -O2 $(TEST_GEN_PROGS): ../harness.c diff --git a/tools/testing/selftests/powerpc/cache_shape/Makefile b/tools/testing/selftests/powerpc/cache_shape/Makefile index 689f6c8ebc..3a3ca956ac 100644 --- a/tools/testing/selftests/powerpc/cache_shape/Makefile +++ b/tools/testing/selftests/powerpc/cache_shape/Makefile @@ -3,5 +3,6 @@ TEST_GEN_PROGS := cache_shape top_srcdir = ../../../../.. 
include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/copyloops/Makefile b/tools/testing/selftests/powerpc/copyloops/Makefile index 77594e697f..42940f92d8 100644 --- a/tools/testing/selftests/powerpc/copyloops/Makefile +++ b/tools/testing/selftests/powerpc/copyloops/Makefile @@ -1,14 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -# The loops are all 64-bit code -CFLAGS += -m64 -CFLAGS += -I$(CURDIR) -CFLAGS += -D SELFTEST -CFLAGS += -maltivec -CFLAGS += -mcpu=power4 - -# Use our CFLAGS for the implicit .S rule & set the asm machine type -ASFLAGS = $(CFLAGS) -Wa,-mpower4 - TEST_GEN_PROGS := copyuser_64_t0 copyuser_64_t1 copyuser_64_t2 \ copyuser_p7_t0 copyuser_p7_t1 \ memcpy_64_t0 memcpy_64_t1 memcpy_64_t2 \ @@ -20,6 +10,17 @@ EXTRA_SOURCES := validate.c ../harness.c stubs.S top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk + +# The loops are all 64-bit code +CFLAGS += -m64 +CFLAGS += -I$(CURDIR) +CFLAGS += -D SELFTEST +CFLAGS += -maltivec +CFLAGS += -mcpu=power4 + +# Use our CFLAGS for the implicit .S rule & set the asm machine type +ASFLAGS = $(CFLAGS) -Wa,-mpower4 $(OUTPUT)/copyuser_64_t%: copyuser_64.S $(EXTRA_SOURCES) $(CC) $(CPPFLAGS) $(CFLAGS) \ diff --git a/tools/testing/selftests/powerpc/dexcr/.gitignore b/tools/testing/selftests/powerpc/dexcr/.gitignore index b82f45dd46..11eefb4b9f 100644 --- a/tools/testing/selftests/powerpc/dexcr/.gitignore +++ b/tools/testing/selftests/powerpc/dexcr/.gitignore @@ -1,2 +1,4 @@ +dexcr_test hashchk_test +chdexcr lsdexcr diff --git a/tools/testing/selftests/powerpc/dexcr/Makefile b/tools/testing/selftests/powerpc/dexcr/Makefile index 829ad075b4..58cf9f7229 100644 --- a/tools/testing/selftests/powerpc/dexcr/Makefile +++ b/tools/testing/selftests/powerpc/dexcr/Makefile @@ -1,7 +1,10 @@ -TEST_GEN_PROGS := hashchk_test -TEST_GEN_FILES := lsdexcr +TEST_GEN_PROGS := dexcr_test hashchk_test +TEST_GEN_FILES := lsdexcr chdexcr include 
../../lib.mk +include ../flags.mk + +CFLAGS += $(KHDR_INCLUDES) $(OUTPUT)/hashchk_test: CFLAGS += -fno-pie -no-pie $(call cc-option,-mno-rop-protect) diff --git a/tools/testing/selftests/powerpc/dexcr/chdexcr.c b/tools/testing/selftests/powerpc/dexcr/chdexcr.c new file mode 100644 index 0000000000..c548d7a5bb --- /dev/null +++ b/tools/testing/selftests/powerpc/dexcr/chdexcr.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include +#include + +#include "dexcr.h" +#include "utils.h" + +static void die(const char *msg) +{ + printf("%s\n", msg); + exit(1); +} + +static void help(void) +{ + printf("Invoke a provided program with a custom DEXCR on-exec reset value\n" + "\n" + "usage: chdexcr [CHDEXCR OPTIONS] -- PROGRAM [ARGS...]\n" + "\n" + "Each configurable DEXCR aspect is exposed as an option.\n" + "\n" + "The normal option sets the aspect in the DEXCR. The --no- variant\n" + "clears that aspect. For example, --ibrtpd sets the IBRTPD aspect bit,\n" + "so indirect branch prediction will be disabled in the provided program.\n" + "Conversely, --no-ibrtpd clears the aspect bit, so indirect branch\n" + "prediction may occur.\n" + "\n" + "CHDEXCR OPTIONS:\n"); + + for (int i = 0; i < ARRAY_SIZE(aspects); i++) { + const struct dexcr_aspect *aspect = &aspects[i]; + + if (aspect->prctl == -1) + continue; + + printf(" --%-6s / --no-%-6s : %s\n", aspect->opt, aspect->opt, aspect->desc); + } +} + +static const struct dexcr_aspect *opt_to_aspect(const char *opt) +{ + for (int i = 0; i < ARRAY_SIZE(aspects); i++) + if (aspects[i].prctl != -1 && !strcmp(aspects[i].opt, opt)) + return &aspects[i]; + + return NULL; +} + +static int apply_option(const char *option) +{ + const struct dexcr_aspect *aspect; + const char *opt = NULL; + const char *set_prefix = "--"; + const char *clear_prefix = "--no-"; + unsigned long ctrl = 0; + int err; + + if (!strcmp(option, "-h") || !strcmp(option, "--help")) { + help(); + exit(0); + } + + 
/* Strip out --(no-) prefix and determine ctrl value */ + if (!strncmp(option, clear_prefix, strlen(clear_prefix))) { + opt = &option[strlen(clear_prefix)]; + ctrl |= PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC; + } else if (!strncmp(option, set_prefix, strlen(set_prefix))) { + opt = &option[strlen(set_prefix)]; + ctrl |= PR_PPC_DEXCR_CTRL_SET_ONEXEC; + } + + if (!opt || !*opt) + return 1; + + aspect = opt_to_aspect(opt); + if (!aspect) + die("unknown aspect"); + + err = pr_set_dexcr(aspect->prctl, ctrl); + if (err) + die("failed to apply option"); + + return 0; +} + +int main(int argc, char *const argv[]) +{ + int i; + + if (!dexcr_exists()) + die("DEXCR not detected on this hardware"); + + for (i = 1; i < argc; i++) + if (apply_option(argv[i])) + break; + + if (i < argc && !strcmp(argv[i], "--")) + i++; + + if (i >= argc) + die("missing command"); + + execvp(argv[i], &argv[i]); + perror("execve"); + + return errno; +} diff --git a/tools/testing/selftests/powerpc/dexcr/dexcr.c b/tools/testing/selftests/powerpc/dexcr/dexcr.c index 65ec5347de..468fd0dc99 100644 --- a/tools/testing/selftests/powerpc/dexcr/dexcr.c +++ b/tools/testing/selftests/powerpc/dexcr/dexcr.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -43,6 +44,45 @@ out: return exists; } +unsigned int pr_which_to_aspect(unsigned long which) +{ + switch (which) { + case PR_PPC_DEXCR_SBHE: + return DEXCR_PR_SBHE; + case PR_PPC_DEXCR_IBRTPD: + return DEXCR_PR_IBRTPD; + case PR_PPC_DEXCR_SRAPD: + return DEXCR_PR_SRAPD; + case PR_PPC_DEXCR_NPHIE: + return DEXCR_PR_NPHIE; + default: + FAIL_IF_EXIT_MSG(true, "unknown PR aspect"); + } +} + +int pr_get_dexcr(unsigned long which) +{ + return prctl(PR_PPC_GET_DEXCR, which, 0UL, 0UL, 0UL); +} + +int pr_set_dexcr(unsigned long which, unsigned long ctrl) +{ + return prctl(PR_PPC_SET_DEXCR, which, ctrl, 0UL, 0UL); +} + +bool pr_dexcr_aspect_supported(unsigned long which) +{ + if (pr_get_dexcr(which) == -1) + return errno == ENODEV; + + return true; +} + 
+bool pr_dexcr_aspect_editable(unsigned long which) +{ + return pr_get_dexcr(which) & PR_PPC_DEXCR_CTRL_EDITABLE; +} + /* * Just test if a bad hashchk triggers a signal, without checking * for support or if the NPHIE aspect is enabled. diff --git a/tools/testing/selftests/powerpc/dexcr/dexcr.h b/tools/testing/selftests/powerpc/dexcr/dexcr.h index f55cbbc864..51e9ba3b09 100644 --- a/tools/testing/selftests/powerpc/dexcr/dexcr.h +++ b/tools/testing/selftests/powerpc/dexcr/dexcr.h @@ -9,6 +9,7 @@ #define _SELFTESTS_POWERPC_DEXCR_DEXCR_H #include +#include #include #include "reg.h" @@ -26,8 +27,64 @@ #define PPC_RAW_HASHCHK(b, i, a) \ str(.long (0x7C0005E4 | PPC_RAW_HASH_ARGS(b, i, a));) +struct dexcr_aspect { + const char *name; /* Short display name */ + const char *opt; /* Option name for chdexcr */ + const char *desc; /* Expanded aspect meaning */ + unsigned int index; /* Aspect bit index in DEXCR */ + unsigned long prctl; /* 'which' value for get/set prctl */ +}; + +static const struct dexcr_aspect aspects[] = { + { + .name = "SBHE", + .opt = "sbhe", + .desc = "Speculative branch hint enable", + .index = 0, + .prctl = PR_PPC_DEXCR_SBHE, + }, + { + .name = "IBRTPD", + .opt = "ibrtpd", + .desc = "Indirect branch recurrent target prediction disable", + .index = 3, + .prctl = PR_PPC_DEXCR_IBRTPD, + }, + { + .name = "SRAPD", + .opt = "srapd", + .desc = "Subroutine return address prediction disable", + .index = 4, + .prctl = PR_PPC_DEXCR_SRAPD, + }, + { + .name = "NPHIE", + .opt = "nphie", + .desc = "Non-privileged hash instruction enable", + .index = 5, + .prctl = PR_PPC_DEXCR_NPHIE, + }, + { + .name = "PHIE", + .opt = "phie", + .desc = "Privileged hash instruction enable", + .index = 6, + .prctl = -1, + }, +}; + bool dexcr_exists(void); +bool pr_dexcr_aspect_supported(unsigned long which); + +bool pr_dexcr_aspect_editable(unsigned long which); + +int pr_get_dexcr(unsigned long pr_aspect); + +int pr_set_dexcr(unsigned long pr_aspect, unsigned long ctrl); + +unsigned 
int pr_which_to_aspect(unsigned long which); + bool hashchk_triggers(void); enum dexcr_source { diff --git a/tools/testing/selftests/powerpc/dexcr/dexcr_test.c b/tools/testing/selftests/powerpc/dexcr/dexcr_test.c new file mode 100644 index 0000000000..7a86571649 --- /dev/null +++ b/tools/testing/selftests/powerpc/dexcr/dexcr_test.c @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include +#include + +#include "dexcr.h" +#include "utils.h" + +/* + * Helper function for testing the behaviour of a newly exec-ed process + */ +static int dexcr_prctl_onexec_test_child(unsigned long which, const char *status) +{ + unsigned long dexcr = mfspr(SPRN_DEXCR_RO); + unsigned long aspect = pr_which_to_aspect(which); + int ctrl = pr_get_dexcr(which); + + if (!strcmp(status, "set")) { + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET), + "setting aspect across exec not applied"); + + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC), + "setting aspect across exec not inherited"); + + FAIL_IF_EXIT_MSG(!(aspect & dexcr), "setting aspect across exec did not take effect"); + } else if (!strcmp(status, "clear")) { + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR), + "clearing aspect across exec not applied"); + + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC), + "clearing aspect across exec not inherited"); + + FAIL_IF_EXIT_MSG(aspect & dexcr, "clearing aspect across exec did not take effect"); + } else { + FAIL_IF_EXIT_MSG(true, "unknown expected status"); + } + + return 0; +} + +/* + * Test that the given prctl value can be manipulated freely + */ +static int dexcr_prctl_aspect_test(unsigned long which) +{ + unsigned long aspect = pr_which_to_aspect(which); + pid_t pid; + int ctrl; + int err; + int errno_save; + + SKIP_IF_MSG(!dexcr_exists(), "DEXCR not supported"); + SKIP_IF_MSG(!pr_dexcr_aspect_supported(which), "DEXCR aspect not supported"); + SKIP_IF_MSG(!pr_dexcr_aspect_editable(which), "DEXCR aspect 
not editable with prctl"); + + /* We reject invalid combinations of arguments */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_CLEAR); + errno_save = errno; + FAIL_IF_MSG(err != -1, "simultaneous set and clear should be rejected"); + FAIL_IF_MSG(errno_save != EINVAL, "simultaneous set and clear should be rejected with EINVAL"); + + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET_ONEXEC | PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC); + errno_save = errno; + FAIL_IF_MSG(err != -1, "simultaneous set and clear on exec should be rejected"); + FAIL_IF_MSG(errno_save != EINVAL, "simultaneous set and clear on exec should be rejected with EINVAL"); + + /* We set the aspect */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_SET failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET), "config value not PR_PPC_DEXCR_CTRL_SET"); + FAIL_IF_MSG(ctrl & PR_PPC_DEXCR_CTRL_CLEAR, "config value unexpected clear flag"); + FAIL_IF_MSG(!(aspect & mfspr(SPRN_DEXCR_RO)), "setting aspect did not take effect"); + + /* We clear the aspect */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_CLEAR); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_CLEAR failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR), "config value not PR_PPC_DEXCR_CTRL_CLEAR"); + FAIL_IF_MSG(ctrl & PR_PPC_DEXCR_CTRL_SET, "config value unexpected set flag"); + FAIL_IF_MSG(aspect & mfspr(SPRN_DEXCR_RO), "clearing aspect did not take effect"); + + /* We make it set on exec (doesn't change our current value) */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET_ONEXEC); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_SET_ONEXEC failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR), "process aspect should still be cleared"); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC), "config value not PR_PPC_DEXCR_CTRL_SET_ONEXEC"); + FAIL_IF_MSG(ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC, "config value 
unexpected clear on exec flag"); + FAIL_IF_MSG(aspect & mfspr(SPRN_DEXCR_RO), "scheduling aspect to set on exec should not change it now"); + + /* We make it clear on exec (doesn't change our current value) */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR), "process aspect config should still be cleared"); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC), "config value not PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC"); + FAIL_IF_MSG(ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC, "config value unexpected set on exec flag"); + FAIL_IF_MSG(aspect & mfspr(SPRN_DEXCR_RO), "process aspect should still be cleared"); + + /* We allow setting the current and on-exec value in a single call */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET), "config value not PR_PPC_DEXCR_CTRL_SET"); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC), "config value not PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC"); + FAIL_IF_MSG(!(aspect & mfspr(SPRN_DEXCR_RO)), "process aspect should be set"); + + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_CLEAR | PR_PPC_DEXCR_CTRL_SET_ONEXEC); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_CLEAR | PR_PPC_DEXCR_CTRL_SET_ONEXEC failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR), "config value not PR_PPC_DEXCR_CTRL_CLEAR"); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC), "config value not PR_PPC_DEXCR_CTRL_SET_ONEXEC"); + FAIL_IF_MSG(aspect & mfspr(SPRN_DEXCR_RO), "process aspect should be clear"); + + /* Verify the onexec value is applied across exec */ + pid = fork(); + if (!pid) { + char which_str[32] = {}; + char *args[] = { "dexcr_prctl_onexec_test_child", which_str, "set", NULL }; + unsigned int 
ctrl = pr_get_dexcr(which); + + sprintf(which_str, "%lu", which); + + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC), + "setting aspect on exec not copied across fork"); + + FAIL_IF_EXIT_MSG(mfspr(SPRN_DEXCR_RO) & aspect, + "setting aspect on exec wrongly applied to fork"); + + execve("/proc/self/exe", args, NULL); + _exit(errno); + } + await_child_success(pid); + + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC failed"); + + pid = fork(); + if (!pid) { + char which_str[32] = {}; + char *args[] = { "dexcr_prctl_onexec_test_child", which_str, "clear", NULL }; + unsigned int ctrl = pr_get_dexcr(which); + + sprintf(which_str, "%lu", which); + + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC), + "clearing aspect on exec not copied across fork"); + + FAIL_IF_EXIT_MSG(!(mfspr(SPRN_DEXCR_RO) & aspect), + "clearing aspect on exec wrongly applied to fork"); + + execve("/proc/self/exe", args, NULL); + _exit(errno); + } + await_child_success(pid); + + return 0; +} + +static int dexcr_prctl_ibrtpd_test(void) +{ + return dexcr_prctl_aspect_test(PR_PPC_DEXCR_IBRTPD); +} + +static int dexcr_prctl_srapd_test(void) +{ + return dexcr_prctl_aspect_test(PR_PPC_DEXCR_SRAPD); +} + +static int dexcr_prctl_nphie_test(void) +{ + return dexcr_prctl_aspect_test(PR_PPC_DEXCR_NPHIE); +} + +int main(int argc, char *argv[]) +{ + int err = 0; + + /* + * Some tests require checking what happens across exec, so we may be + * invoked as the child of a particular test + */ + if (argc > 1) { + if (argc == 3 && !strcmp(argv[0], "dexcr_prctl_onexec_test_child")) { + unsigned long which; + + err = parse_ulong(argv[1], strlen(argv[1]), &which, 10); + FAIL_IF_MSG(err, "failed to parse which value for child"); + + return dexcr_prctl_onexec_test_child(which, argv[2]); + } + + FAIL_IF_MSG(true, "unknown test case"); + } + + /* + * Otherwise we are the main test invocation and run 
the full suite + */ + err |= test_harness(dexcr_prctl_ibrtpd_test, "dexcr_prctl_ibrtpd"); + err |= test_harness(dexcr_prctl_srapd_test, "dexcr_prctl_srapd"); + err |= test_harness(dexcr_prctl_nphie_test, "dexcr_prctl_nphie"); + + return err; +} diff --git a/tools/testing/selftests/powerpc/dexcr/hashchk_test.c b/tools/testing/selftests/powerpc/dexcr/hashchk_test.c index 7d5658c9eb..645224bdc1 100644 --- a/tools/testing/selftests/powerpc/dexcr/hashchk_test.c +++ b/tools/testing/selftests/powerpc/dexcr/hashchk_test.c @@ -21,8 +21,14 @@ static int require_nphie(void) { SKIP_IF_MSG(!dexcr_exists(), "DEXCR not supported"); + + pr_set_dexcr(PR_PPC_DEXCR_NPHIE, PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_SET_ONEXEC); + + if (get_dexcr(EFFECTIVE) & DEXCR_PR_NPHIE) + return 0; + SKIP_IF_MSG(!(get_dexcr(EFFECTIVE) & DEXCR_PR_NPHIE), - "DEXCR[NPHIE] not enabled"); + "Failed to enable DEXCR[NPHIE]"); return 0; } diff --git a/tools/testing/selftests/powerpc/dexcr/lsdexcr.c b/tools/testing/selftests/powerpc/dexcr/lsdexcr.c index 94abbfcc38..7588929180 100644 --- a/tools/testing/selftests/powerpc/dexcr/lsdexcr.c +++ b/tools/testing/selftests/powerpc/dexcr/lsdexcr.c @@ -1,9 +1,9 @@ // SPDX-License-Identifier: GPL-2.0+ -#include #include #include #include +#include #include "dexcr.h" #include "utils.h" @@ -12,40 +12,6 @@ static unsigned int dexcr; static unsigned int hdexcr; static unsigned int effective; -struct dexcr_aspect { - const char *name; - const char *desc; - unsigned int index; -}; - -static const struct dexcr_aspect aspects[] = { - { - .name = "SBHE", - .desc = "Speculative branch hint enable", - .index = 0, - }, - { - .name = "IBRTPD", - .desc = "Indirect branch recurrent target prediction disable", - .index = 3, - }, - { - .name = "SRAPD", - .desc = "Subroutine return address prediction disable", - .index = 4, - }, - { - .name = "NPHIE", - .desc = "Non-privileged hash instruction enable", - .index = 5, - }, - { - .name = "PHIE", - .desc = "Privileged hash instruction 
enable", - .index = 6, - }, -}; - static void print_list(const char *list[], size_t len) { for (size_t i = 0; i < len; i++) { @@ -60,7 +26,7 @@ static void print_dexcr(char *name, unsigned int bits) const char *enabled_aspects[ARRAY_SIZE(aspects) + 1] = {NULL}; size_t j = 0; - printf("%s: %08x", name, bits); + printf("%s: 0x%08x", name, bits); if (bits == 0) { printf("\n"); @@ -103,6 +69,63 @@ static void print_aspect(const struct dexcr_aspect *aspect) printf(" \t(%s)\n", aspect->desc); } +static void print_aspect_config(const struct dexcr_aspect *aspect) +{ + const char *reason = NULL; + const char *reason_hyp = NULL; + const char *reason_prctl = "no prctl"; + bool actual = effective & DEXCR_PR_BIT(aspect->index); + bool expected = actual; /* Assume it's fine if we don't expect a specific set/clear value */ + + if (actual) + reason = "set by unknown"; + else + reason = "cleared by unknown"; + + if (aspect->prctl != -1) { + int ctrl = pr_get_dexcr(aspect->prctl); + + if (ctrl < 0) { + reason_prctl = "failed to read prctl"; + } else { + if (ctrl & PR_PPC_DEXCR_CTRL_SET) { + reason_prctl = "set by prctl"; + expected = true; + } else if (ctrl & PR_PPC_DEXCR_CTRL_CLEAR) { + reason_prctl = "cleared by prctl"; + expected = false; + } else { + reason_prctl = "unknown prctl"; + } + + reason = reason_prctl; + } + } + + if (hdexcr & DEXCR_PR_BIT(aspect->index)) { + reason_hyp = "set by hypervisor"; + reason = reason_hyp; + expected = true; + } else { + reason_hyp = "not modified by hypervisor"; + } + + printf("%12s (%d): %-28s (%s, %s)\n", + aspect->name, + aspect->index, + reason, + reason_hyp, + reason_prctl); + + /* + * The checks are not atomic, so this can technically trigger if the + * hypervisor makes a change while we are checking each source. It's + * far more likely to be a bug if we see this though. + */ + if (actual != expected) + printf(" : ! 
actual %s does not match config\n", aspect->name); +} + int main(int argc, char *argv[]) { if (!dexcr_exists()) { @@ -114,6 +137,8 @@ int main(int argc, char *argv[]) hdexcr = get_dexcr(HDEXCR); effective = dexcr | hdexcr; + printf("current status:\n"); + print_dexcr(" DEXCR", dexcr); print_dexcr(" HDEXCR", hdexcr); print_dexcr("Effective", effective); @@ -136,6 +161,12 @@ int main(int argc, char *argv[]) else printf("ignored\n"); } + printf("\n"); + + printf("configuration:\n"); + for (size_t i = 0; i < ARRAY_SIZE(aspects); i++) + print_aspect_config(&aspects[i]); + printf("\n"); return 0; } diff --git a/tools/testing/selftests/powerpc/dscr/Makefile b/tools/testing/selftests/powerpc/dscr/Makefile index 9289d5febe..9fa9cb5bd9 100644 --- a/tools/testing/selftests/powerpc/dscr/Makefile +++ b/tools/testing/selftests/powerpc/dscr/Makefile @@ -5,6 +5,7 @@ TEST_GEN_PROGS := dscr_default_test dscr_explicit_test dscr_user_test \ top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(OUTPUT)/dscr_default_test: LDLIBS += -lpthread $(OUTPUT)/dscr_explicit_test: LDLIBS += -lpthread diff --git a/tools/testing/selftests/powerpc/eeh/Makefile b/tools/testing/selftests/powerpc/eeh/Makefile index ae963eb2dc..70797716f2 100644 --- a/tools/testing/selftests/powerpc/eeh/Makefile +++ b/tools/testing/selftests/powerpc/eeh/Makefile @@ -7,3 +7,4 @@ TEST_FILES := eeh-functions.sh settings top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk diff --git a/tools/testing/selftests/powerpc/flags.mk b/tools/testing/selftests/powerpc/flags.mk new file mode 100644 index 0000000000..abb9e58d95 --- /dev/null +++ b/tools/testing/selftests/powerpc/flags.mk @@ -0,0 +1,9 @@ +#This checks for any ENV variables and add those. 
+ +ifeq ($(GIT_VERSION),) +GIT_VERSION := $(shell git describe --always --long --dirty || echo "unknown") +export GIT_VERSION +endif + +CFLAGS := -std=gnu99 -O2 -Wall -Werror -DGIT_VERSION='"$(GIT_VERSION)"' -I$(selfdir)/powerpc/include $(USERCFLAGS) +export CFLAGS diff --git a/tools/testing/selftests/powerpc/math/Makefile b/tools/testing/selftests/powerpc/math/Makefile index 3948f7c510..b14fd2e0c6 100644 --- a/tools/testing/selftests/powerpc/math/Makefile +++ b/tools/testing/selftests/powerpc/math/Makefile @@ -3,6 +3,7 @@ TEST_GEN_PROGS := fpu_syscall fpu_preempt fpu_signal fpu_denormal vmx_syscall vm top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c $(TEST_GEN_PROGS): CFLAGS += -O2 -g -pthread -m64 -maltivec diff --git a/tools/testing/selftests/powerpc/mce/Makefile b/tools/testing/selftests/powerpc/mce/Makefile index 2424513982..ce4ed679aa 100644 --- a/tools/testing/selftests/powerpc/mce/Makefile +++ b/tools/testing/selftests/powerpc/mce/Makefile @@ -3,5 +3,6 @@ TEST_GEN_PROGS := inject-ra-err include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c diff --git a/tools/testing/selftests/powerpc/mm/Makefile b/tools/testing/selftests/powerpc/mm/Makefile index 4a6608beef..aab058ecb3 100644 --- a/tools/testing/selftests/powerpc/mm/Makefile +++ b/tools/testing/selftests/powerpc/mm/Makefile @@ -13,6 +13,7 @@ TEST_GEN_FILES := tempfile top_srcdir = ../../../../.. 
include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/nx-gzip/Makefile b/tools/testing/selftests/powerpc/nx-gzip/Makefile index 0785c2e99d..480d8ba94c 100644 --- a/tools/testing/selftests/powerpc/nx-gzip/Makefile +++ b/tools/testing/selftests/powerpc/nx-gzip/Makefile @@ -1,8 +1,9 @@ -CFLAGS = -O3 -m64 -I./include -I../include - TEST_GEN_FILES := gzfht_test gunz_test TEST_PROGS := nx-gzip-test.sh include ../../lib.mk +include ../flags.mk + +CFLAGS = -O3 -m64 -I./include -I../include $(TEST_GEN_FILES): gzip_vas.c ../utils.c diff --git a/tools/testing/selftests/powerpc/papr_attributes/Makefile b/tools/testing/selftests/powerpc/papr_attributes/Makefile index e899712d49..4064294990 100644 --- a/tools/testing/selftests/powerpc/papr_attributes/Makefile +++ b/tools/testing/selftests/powerpc/papr_attributes/Makefile @@ -3,5 +3,6 @@ TEST_GEN_PROGS := attr_test top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk -$(TEST_GEN_PROGS): ../harness.c ../utils.c \ No newline at end of file +$(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/papr_sysparm/Makefile b/tools/testing/selftests/powerpc/papr_sysparm/Makefile index 7f79e43763..fed4f2414d 100644 --- a/tools/testing/selftests/powerpc/papr_sysparm/Makefile +++ b/tools/testing/selftests/powerpc/papr_sysparm/Makefile @@ -6,6 +6,7 @@ TEST_GEN_PROGS := papr_sysparm top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/papr_vpd/Makefile b/tools/testing/selftests/powerpc/papr_vpd/Makefile index 06b719703b..b09852e408 100644 --- a/tools/testing/selftests/powerpc/papr_vpd/Makefile +++ b/tools/testing/selftests/powerpc/papr_vpd/Makefile @@ -6,6 +6,7 @@ TEST_GEN_PROGS := papr_vpd top_srcdir = ../../../../.. 
include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/pmu/Makefile b/tools/testing/selftests/powerpc/pmu/Makefile index a284fa874a..7e9dbf3d0d 100644 --- a/tools/testing/selftests/powerpc/pmu/Makefile +++ b/tools/testing/selftests/powerpc/pmu/Makefile @@ -7,8 +7,11 @@ EXTRA_SOURCES := ../harness.c event.c lib.c ../utils.c top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk -all: $(TEST_GEN_PROGS) ebb sampling_tests event_code_tests +SUB_DIRS := ebb sampling_tests event_code_tests + +all: $(TEST_GEN_PROGS) $(SUB_DIRS) $(TEST_GEN_PROGS): $(EXTRA_SOURCES) @@ -22,12 +25,16 @@ $(OUTPUT)/count_stcx_fail: loop.S $(EXTRA_SOURCES) $(OUTPUT)/per_event_excludes: ../utils.c +$(SUB_DIRS): + BUILD_TARGET=$(OUTPUT)/$@; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $@ all + DEFAULT_RUN_TESTS := $(RUN_TESTS) override define RUN_TESTS $(DEFAULT_RUN_TESTS) - +TARGET=ebb; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests - +TARGET=sampling_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests - +TARGET=event_code_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests + +@for TARGET in $(SUB_DIRS); do \ + BUILD_TARGET=$(OUTPUT)/$$TARGET; \ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests; \ + done; endef emit_tests: @@ -35,34 +42,29 @@ emit_tests: BASENAME_TEST=`basename $$TEST`; \ echo "$(COLLECTION):$$BASENAME_TEST"; \ done - +TARGET=ebb; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET emit_tests - +TARGET=sampling_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET emit_tests - +TARGET=event_code_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET emit_tests + +@for TARGET in $(SUB_DIRS); do \ + BUILD_TARGET=$(OUTPUT)/$$TARGET; \ + $(MAKE) OUTPUT=$$BUILD_TARGET 
COLLECTION=$(COLLECTION)/$$TARGET -s -C $$TARGET emit_tests; \ + done; DEFAULT_INSTALL_RULE := $(INSTALL_RULE) override define INSTALL_RULE $(DEFAULT_INSTALL_RULE) - +TARGET=ebb; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install - +TARGET=sampling_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install - +TARGET=event_code_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install + +@for TARGET in $(SUB_DIRS); do \ + BUILD_TARGET=$(OUTPUT)/$$TARGET; \ + $(MAKE) OUTPUT=$$BUILD_TARGET INSTALL_PATH=$$INSTALL_PATH/$$TARGET -C $$TARGET install; \ + done; endef DEFAULT_CLEAN := $(CLEAN) override define CLEAN $(DEFAULT_CLEAN) $(RM) $(TEST_GEN_PROGS) $(OUTPUT)/loop.o - +TARGET=ebb; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean - +TARGET=sampling_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean - +TARGET=event_code_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean + +@for TARGET in $(SUB_DIRS); do \ + BUILD_TARGET=$(OUTPUT)/$$TARGET; \ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean; \ + done; endef -ebb: - TARGET=$@; BUILD_TARGET=$$OUTPUT/$$TARGET; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $$TARGET all - -sampling_tests: - TARGET=$@; BUILD_TARGET=$$OUTPUT/$$TARGET; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $$TARGET all - -event_code_tests: - TARGET=$@; BUILD_TARGET=$$OUTPUT/$$TARGET; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $$TARGET all .PHONY: all run_tests ebb sampling_tests event_code_tests emit_tests diff --git a/tools/testing/selftests/powerpc/pmu/ebb/Makefile b/tools/testing/selftests/powerpc/pmu/ebb/Makefile index 0101606902..1b39af7c10 100644 --- a/tools/testing/selftests/powerpc/pmu/ebb/Makefile +++ b/tools/testing/selftests/powerpc/pmu/ebb/Makefile @@ -4,16 +4,6 @@ include 
../../../../../build/Build.include noarg: $(MAKE) -C ../../ -# The EBB handler is 64-bit code and everything links against it -CFLAGS += -m64 - -TMPOUT = $(OUTPUT)/TMPDIR/ -# Toolchains may build PIE by default which breaks the assembly -no-pie-option := $(call try-run, echo 'int main() { return 0; }' | \ - $(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) -no-pie -x c - -o "$$TMP", -no-pie) - -LDFLAGS += $(no-pie-option) - TEST_GEN_PROGS := reg_access_test event_attributes_test cycles_test \ cycles_with_freeze_test pmc56_overflow_test \ ebb_vs_cpu_event_test cpu_event_vs_ebb_test \ @@ -28,6 +18,17 @@ TEST_GEN_PROGS := reg_access_test event_attributes_test cycles_test \ top_srcdir = ../../../../../.. include ../../../lib.mk +include ../../flags.mk + +# The EBB handler is 64-bit code and everything links against it +CFLAGS += -m64 + +TMPOUT = $(OUTPUT)/TMPDIR/ +# Toolchains may build PIE by default which breaks the assembly +no-pie-option := $(call try-run, echo 'int main() { return 0; }' | \ + $(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) -no-pie -x c - -o "$$TMP", -no-pie) + +LDFLAGS += $(no-pie-option) $(TEST_GEN_PROGS): ../../harness.c ../../utils.c ../event.c ../lib.c \ ebb.c ebb_handler.S trace.c busy_loop.S diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile index 4e07d70464..fdb080b3fa 100644 --- a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile +++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile @@ -1,6 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -CFLAGS += -m64 - TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_test group_constraint_pmc_count_test \ group_constraint_repeat_test group_constraint_radix_scope_qual_test reserved_bits_mmcra_sample_elig_mode_test \ group_constraint_mmcra_sample_test invalid_event_code_test reserved_bits_mmcra_thresh_ctl_test \ @@ -11,5 +9,8 @@ TEST_GEN_PROGS := 
group_constraint_pmc56_test group_pmc56_exclude_constraints_te top_srcdir = ../../../../../.. include ../../../lib.mk +include ../../flags.mk + +CFLAGS += -m64 $(TEST_GEN_PROGS): ../../harness.c ../../utils.c ../event.c ../lib.c ../sampling_tests/misc.h ../sampling_tests/misc.c diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile index 9e67351fb2..9f79bec5fc 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile @@ -1,6 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -CFLAGS += -m64 - TEST_GEN_PROGS := mmcr0_exceptionbits_test mmcr0_cc56run_test mmcr0_pmccext_test \ mmcr0_pmcjce_test mmcr0_fc56_pmc1ce_test mmcr0_fc56_pmc56_test \ mmcr1_comb_test mmcr2_l2l3_test mmcr2_fcs_fch_test \ @@ -11,5 +9,8 @@ TEST_GEN_PROGS := mmcr0_exceptionbits_test mmcr0_cc56run_test mmcr0_pmccext_test top_srcdir = ../../../../../.. include ../../../lib.mk +include ../../flags.mk + +CFLAGS += -m64 $(TEST_GEN_PROGS): ../../harness.c ../../utils.c ../event.c ../lib.c misc.c misc.h ../loop.S ../branch_loops.S diff --git a/tools/testing/selftests/powerpc/primitives/Makefile b/tools/testing/selftests/powerpc/primitives/Makefile index 9b9491a632..23bd9a7590 100644 --- a/tools/testing/selftests/powerpc/primitives/Makefile +++ b/tools/testing/selftests/powerpc/primitives/Makefile @@ -1,9 +1,10 @@ # SPDX-License-Identifier: GPL-2.0-only -CFLAGS += -I$(CURDIR) - TEST_GEN_PROGS := load_unaligned_zeropad top_srcdir = ../../../../.. 
include ../../lib.mk +include ../flags.mk + +CFLAGS += -I$(CURDIR) $(TEST_GEN_PROGS): ../harness.c diff --git a/tools/testing/selftests/powerpc/ptrace/Makefile b/tools/testing/selftests/powerpc/ptrace/Makefile index 1b39b86849..59ca01d856 100644 --- a/tools/testing/selftests/powerpc/ptrace/Makefile +++ b/tools/testing/selftests/powerpc/ptrace/Makefile @@ -26,6 +26,7 @@ LOCAL_HDRS += $(patsubst %,$(selfdir)/powerpc/ptrace/%,$(wildcard *.h)) top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk TM_TESTS := $(patsubst %,$(OUTPUT)/%,$(TM_TESTS)) TESTS_64 := $(patsubst %,$(OUTPUT)/%,$(TESTS_64)) diff --git a/tools/testing/selftests/powerpc/security/Makefile b/tools/testing/selftests/powerpc/security/Makefile index e0d979ab02..3328603972 100644 --- a/tools/testing/selftests/powerpc/security/Makefile +++ b/tools/testing/selftests/powerpc/security/Makefile @@ -5,9 +5,10 @@ TEST_PROGS := mitigation-patching.sh top_srcdir = ../../../../.. -CFLAGS += $(KHDR_INCLUDES) - include ../../lib.mk +include ../flags.mk + +CFLAGS += $(KHDR_INCLUDES) $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/signal/Makefile b/tools/testing/selftests/powerpc/signal/Makefile index f679d260af..ece95bd52b 100644 --- a/tools/testing/selftests/powerpc/signal/Makefile +++ b/tools/testing/selftests/powerpc/signal/Makefile @@ -3,7 +3,6 @@ TEST_GEN_PROGS := signal signal_tm sigfuz sigreturn_vdso sig_sc_double_restart TEST_GEN_PROGS += sigreturn_kernel TEST_GEN_PROGS += sigreturn_unaligned -CFLAGS += -maltivec $(OUTPUT)/signal_tm: CFLAGS += -mhtm $(OUTPUT)/sigfuz: CFLAGS += -pthread -m64 @@ -11,5 +10,8 @@ TEST_FILES := settings top_srcdir = ../../../../.. 
include ../../lib.mk +include ../flags.mk + +CFLAGS += -maltivec $(TEST_GEN_PROGS): ../harness.c ../utils.c signal.S diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile b/tools/testing/selftests/powerpc/stringloops/Makefile index 9c39f55a58..4c9d9a58c9 100644 --- a/tools/testing/selftests/powerpc/stringloops/Makefile +++ b/tools/testing/selftests/powerpc/stringloops/Makefile @@ -1,7 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -# The loops are all 64-bit code -CFLAGS += -I$(CURDIR) - EXTRA_SOURCES := ../harness.c build_32bit = $(shell if ($(CC) $(CFLAGS) -m32 -o /dev/null memcmp.c >/dev/null 2>&1) then echo "1"; fi) @@ -27,9 +24,13 @@ $(OUTPUT)/strlen_32: CFLAGS += -m32 TEST_GEN_PROGS += strlen_32 endif -ASFLAGS = $(CFLAGS) - top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk + +# The loops are all 64-bit code +CFLAGS += -I$(CURDIR) + +ASFLAGS = $(CFLAGS) $(TEST_GEN_PROGS): $(EXTRA_SOURCES) diff --git a/tools/testing/selftests/powerpc/switch_endian/Makefile b/tools/testing/selftests/powerpc/switch_endian/Makefile index bdc081afed..0da2e0a742 100644 --- a/tools/testing/selftests/powerpc/switch_endian/Makefile +++ b/tools/testing/selftests/powerpc/switch_endian/Makefile @@ -1,12 +1,13 @@ # SPDX-License-Identifier: GPL-2.0 TEST_GEN_PROGS := switch_endian_test -ASFLAGS += -O2 -Wall -g -nostdlib -m64 - EXTRA_CLEAN = $(OUTPUT)/*.o $(OUTPUT)/check-reversed.S top_srcdir = ../../../../.. 
include ../../lib.mk +include ../flags.mk + +ASFLAGS += -O2 -Wall -g -nostdlib -m64 $(OUTPUT)/switch_endian_test: ASFLAGS += -I $(OUTPUT) $(OUTPUT)/switch_endian_test: $(OUTPUT)/check-reversed.S diff --git a/tools/testing/selftests/powerpc/syscalls/Makefile b/tools/testing/selftests/powerpc/syscalls/Makefile index ee1740ddfb..3bc07af88f 100644 --- a/tools/testing/selftests/powerpc/syscalls/Makefile +++ b/tools/testing/selftests/powerpc/syscalls/Makefile @@ -1,9 +1,10 @@ # SPDX-License-Identifier: GPL-2.0-only TEST_GEN_PROGS := ipc_unmuxed rtas_filter -CFLAGS += $(KHDR_INCLUDES) - top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk + +CFLAGS += $(KHDR_INCLUDES) $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile index 3876805c2f..f13f0ab360 100644 --- a/tools/testing/selftests/powerpc/tm/Makefile +++ b/tools/testing/selftests/powerpc/tm/Makefile @@ -11,6 +11,7 @@ TEST_FILES := settings top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/vphn/Makefile b/tools/testing/selftests/powerpc/vphn/Makefile index cf65cbf330..61d519a076 100644 --- a/tools/testing/selftests/powerpc/vphn/Makefile +++ b/tools/testing/selftests/powerpc/vphn/Makefile @@ -1,10 +1,11 @@ # SPDX-License-Identifier: GPL-2.0-only TEST_GEN_PROGS := test-vphn -CFLAGS += -m64 -I$(CURDIR) - top_srcdir = ../../../../.. 
include ../../lib.mk +include ../flags.mk + +CFLAGS += -m64 -I$(CURDIR) $(TEST_GEN_PROGS): ../harness.c diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index bbac5f4b03..990d24696f 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -391,7 +391,7 @@ __EOF__ forceflavor="`echo $flavor | sed -e 's/^CONFIG/CONFIG_FORCE/'`" deselectedflavors="`grep -v $flavor $T/rcutasksflavors | tr '\012' ' ' | tr -s ' ' | sed -e 's/ *$//'`" echo " --- Running RCU Tasks Trace flavor $flavor `date`" >> $rtfdir/log - tools/testing/selftests/rcutorture/bin/kvm.sh --datestamp "$ds/results-rcutasksflavors/$flavor" --buildonly --configs "TINY01 TREE04" --kconfig "CONFIG_RCU_EXPERT=y CONFIG_RCU_SCALE_TEST=y $forceflavor=y $deselectedflavors" --trust-make > $T/$flavor.out 2>&1 + tools/testing/selftests/rcutorture/bin/kvm.sh --datestamp "$ds/results-rcutasksflavors/$flavor" --buildonly --configs "TINY01 TREE04" --kconfig "CONFIG_RCU_EXPERT=y CONFIG_RCU_SCALE_TEST=y CONFIG_KPROBES=n CONFIG_RCU_TRACE=n CONFIG_TRACING=n CONFIG_BLK_DEV_IO_TRACE=n CONFIG_UPROBE_EVENTS=n $forceflavor=y $deselectedflavors" --trust-make > $T/$flavor.out 2>&1 retcode=$? if test "$retcode" -ne 0 then @@ -425,7 +425,7 @@ fi if test "$do_scftorture" = "yes" then # Scale memory based on the number of CPUs. 
- scfmem=$((2+HALF_ALLOTED_CPUS/16)) + scfmem=$((3+HALF_ALLOTED_CPUS/16)) torture_bootargs="scftorture.nthreads=$HALF_ALLOTED_CPUS torture.disable_onoff_at_boot csdlock_debug=1" torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory ${scfmem}G --trust-make fi @@ -559,7 +559,7 @@ do_kcsan="$do_kcsan_save" if test "$do_kvfree" = "yes" then torture_bootargs="rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 rcuscale.holdoff=20 rcuscale.kfree_loops=10000 torture.disable_onoff_at_boot" - torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 10 --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 2G --trust-make + torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration $duration_rcutorture --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 2G --trust-make fi if test "$do_clocksourcewd" = "yes" diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE09 b/tools/testing/selftests/rcutorture/configs/rcu/TREE09 index fc45645bb5..9ecd1b4e65 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE09 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE09 @@ -10,8 +10,9 @@ CONFIG_NO_HZ_FULL=n CONFIG_RCU_TRACE=n CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=n -CONFIG_RCU_BOOST=n +CONFIG_RCU_BOOST=y +CONFIG_RCU_BOOST_DELAY=100 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n -#CHECK#CONFIG_RCU_EXPERT=n +CONFIG_RCU_EXPERT=y CONFIG_KPROBES=n CONFIG_FTRACE=n diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index 178a41d4bb..55315ed695 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -128,7 +128,7 @@ static int check_results(struct resctrl_val_param *param, const char *cache_type return 
fail; } -void cat_test_cleanup(void) +static void cat_test_cleanup(void) { remove(RESULT_FILE_NAME); } @@ -284,13 +284,10 @@ static int cat_run_test(const struct resctrl_test *test, const struct user_param ret = cat_test(test, uparams, &param, span, start_mask); if (ret) - goto out; + return ret; ret = check_results(&param, test->resource, cache_total_size, full_cache_mask, start_mask); -out: - cat_test_cleanup(); - return ret; } @@ -385,6 +382,7 @@ struct resctrl_test l3_cat_test = { .resource = "L3", .feature_check = test_resource_feature_check, .run_test = cat_run_test, + .cleanup = cat_test_cleanup, }; struct resctrl_test l3_noncont_cat_test = { diff --git a/tools/testing/selftests/resctrl/cmt_test.c b/tools/testing/selftests/resctrl/cmt_test.c index a81f91222a..0105afec61 100644 --- a/tools/testing/selftests/resctrl/cmt_test.c +++ b/tools/testing/selftests/resctrl/cmt_test.c @@ -40,11 +40,11 @@ static int show_results_info(unsigned long sum_llc_val, int no_of_bits, int ret; avg_llc_val = sum_llc_val / num_of_runs; - avg_diff = (long)abs(cache_span - avg_llc_val); + avg_diff = (long)(cache_span - avg_llc_val); diff_percent = ((float)cache_span - avg_llc_val) / cache_span * 100; ret = platform && abs((int)diff_percent) > max_diff_percent && - abs(avg_diff) > max_diff; + labs(avg_diff) > max_diff; ksft_print_msg("%s Check cache miss rate within %lu%%\n", ret ? "Fail:" : "Pass:", max_diff_percent); @@ -91,7 +91,7 @@ static int check_results(struct resctrl_val_param *param, size_t span, int no_of MAX_DIFF, MAX_DIFF_PERCENT, runs - 1, true); } -void cmt_test_cleanup(void) +static void cmt_test_cleanup(void) { remove(RESULT_FILE_NAME); } @@ -161,7 +161,6 @@ static int cmt_run_test(const struct resctrl_test *test, const struct user_param ksft_print_msg("Intel CMT may be inaccurate when Sub-NUMA Clustering is enabled. 
Check BIOS configuration.\n"); out: - cmt_test_cleanup(); free(span_str); return ret; @@ -178,4 +177,5 @@ struct resctrl_test cmt_test = { .resource = "L3", .feature_check = cmt_feature_check, .run_test = cmt_run_test, + .cleanup = cmt_test_cleanup, }; diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c index 7946e32e85..a6ad39aae1 100644 --- a/tools/testing/selftests/resctrl/mba_test.c +++ b/tools/testing/selftests/resctrl/mba_test.c @@ -60,8 +60,8 @@ static bool show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc) /* Memory bandwidth from 100% down to 10% */ for (allocation = 0; allocation < ALLOCATION_MAX / ALLOCATION_STEP; allocation++) { - unsigned long avg_bw_imc, avg_bw_resc; unsigned long sum_bw_imc = 0, sum_bw_resc = 0; + long avg_bw_imc, avg_bw_resc; int avg_diff_per; float avg_diff; @@ -137,7 +137,7 @@ static int check_results(void) return show_mba_info(bw_imc, bw_resc); } -void mba_test_cleanup(void) +static void mba_test_cleanup(void) { remove(RESULT_FILE_NAME); } @@ -158,13 +158,10 @@ static int mba_run_test(const struct resctrl_test *test, const struct user_param ret = resctrl_val(test, uparams, uparams->benchmark_cmd, &param); if (ret) - goto out; + return ret; ret = check_results(); -out: - mba_test_cleanup(); - return ret; } @@ -180,4 +177,5 @@ struct resctrl_test mba_test = { .vendor_specific = ARCH_INTEL, .feature_check = mba_feature_check, .run_test = mba_run_test, + .cleanup = mba_test_cleanup, }; diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c index d67ffa3ec6..6fec51e1ff 100644 --- a/tools/testing/selftests/resctrl/mbm_test.c +++ b/tools/testing/selftests/resctrl/mbm_test.c @@ -17,8 +17,8 @@ static int show_bw_info(unsigned long *bw_imc, unsigned long *bw_resc, size_t span) { - unsigned long avg_bw_imc = 0, avg_bw_resc = 0; unsigned long sum_bw_imc = 0, sum_bw_resc = 0; + long avg_bw_imc = 0, avg_bw_resc = 0; int runs, ret, avg_diff_per; 
float avg_diff = 0; @@ -105,7 +105,7 @@ static int mbm_setup(const struct resctrl_test *test, return ret; } -void mbm_test_cleanup(void) +static void mbm_test_cleanup(void) { remove(RESULT_FILE_NAME); } @@ -126,15 +126,12 @@ static int mbm_run_test(const struct resctrl_test *test, const struct user_param ret = resctrl_val(test, uparams, uparams->benchmark_cmd, &param); if (ret) - goto out; + return ret; ret = check_results(DEFAULT_SPAN); if (ret && (get_vendor() == ARCH_INTEL)) ksft_print_msg("Intel MBM may be inaccurate when Sub-NUMA Clustering is enabled. Check BIOS configuration.\n"); -out: - mbm_test_cleanup(); - return ret; } @@ -150,4 +147,5 @@ struct resctrl_test mbm_test = { .vendor_specific = ARCH_INTEL, .feature_check = mbm_feature_check, .run_test = mbm_run_test, + .cleanup = mbm_test_cleanup, }; diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index 2051bd135e..00d51fa753 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -72,6 +72,7 @@ struct user_params { * @disabled: Test is disabled * @feature_check: Callback to check required resctrl features * @run_test: Callback to run the test + * @cleanup: Callback to cleanup after the test */ struct resctrl_test { const char *name; @@ -82,6 +83,7 @@ struct resctrl_test { bool (*feature_check)(const struct resctrl_test *test); int (*run_test)(const struct resctrl_test *test, const struct user_params *uparams); + void (*cleanup)(void); }; /* @@ -156,9 +158,6 @@ int resctrl_val(const struct resctrl_test *test, const struct user_params *uparams, const char * const *benchmark_cmd, struct resctrl_val_param *param); -void tests_cleanup(void); -void mbm_test_cleanup(void); -void mba_test_cleanup(void); unsigned long create_bit_mask(unsigned int start, unsigned int len); unsigned int count_contiguous_bits(unsigned long val, unsigned int *start); int get_full_cbm(const char *cache_type, unsigned long *mask); @@ -166,11 
+165,9 @@ int get_mask_no_shareable(const char *cache_type, unsigned long *mask); int get_cache_size(int cpu_no, const char *cache_type, unsigned long *cache_size); int resource_info_unsigned_get(const char *resource, const char *filename, unsigned int *val); void ctrlc_handler(int signum, siginfo_t *info, void *ptr); -int signal_handler_register(void); +int signal_handler_register(const struct resctrl_test *test); void signal_handler_unregister(void); -void cat_test_cleanup(void); unsigned int count_bits(unsigned long n); -void cmt_test_cleanup(void); void perf_event_attr_initialize(struct perf_event_attr *pea, __u64 config); void perf_event_initialize_read_format(struct perf_event_read *pe_read); diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index f3dc1b9696..ecbb7605a9 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -81,19 +81,11 @@ static void cmd_help(void) printf("\t-h: help\n"); } -void tests_cleanup(void) -{ - mbm_test_cleanup(); - mba_test_cleanup(); - cmt_test_cleanup(); - cat_test_cleanup(); -} - -static int test_prepare(void) +static int test_prepare(const struct resctrl_test *test) { int res; - res = signal_handler_register(); + res = signal_handler_register(test); if (res) { ksft_print_msg("Failed to register signal handler\n"); return res; @@ -108,8 +100,10 @@ static int test_prepare(void) return 0; } -static void test_cleanup(void) +static void test_cleanup(const struct resctrl_test *test) { + if (test->cleanup) + test->cleanup(); umount_resctrlfs(); signal_handler_unregister(); } @@ -136,7 +130,7 @@ static void run_single_test(const struct resctrl_test *test, const struct user_p ksft_print_msg("Starting %s test ...\n", test->name); - if (test_prepare()) { + if (test_prepare(test)) { ksft_exit_fail_msg("Abnormal failure when preparing for the test\n"); return; } @@ -151,7 +145,7 @@ static void run_single_test(const 
struct resctrl_test *test, const struct user_p ksft_test_result(!ret, "%s: test\n", test->name); cleanup: - test_cleanup(); + test_cleanup(test); } static void init_user_params(struct user_params *uparams) @@ -253,13 +247,13 @@ last_arg: * 2. We execute perf commands */ if (geteuid() != 0) - return ksft_exit_skip("Not running as root. Skipping...\n"); + ksft_exit_skip("Not running as root. Skipping...\n"); if (!check_resctrlfs_support()) - return ksft_exit_skip("resctrl FS does not exist. Enable X86_CPU_RESCTRL config option.\n"); + ksft_exit_skip("resctrl FS does not exist. Enable X86_CPU_RESCTRL config option.\n"); if (umount_resctrlfs()) - return ksft_exit_skip("resctrl FS unmount failed.\n"); + ksft_exit_skip("resctrl FS unmount failed.\n"); filter_dmesg(); diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c index 5a49f07a6c..f55f5989de 100644 --- a/tools/testing/selftests/resctrl/resctrl_val.c +++ b/tools/testing/selftests/resctrl/resctrl_val.c @@ -62,6 +62,7 @@ struct imc_counter_config { static char mbm_total_path[1024]; static int imcs; static struct imc_counter_config imc_counters_config[MAX_IMCS][2]; +static const struct resctrl_test *current_test; void membw_initialize_perf_event_attr(int i, int j) { @@ -292,6 +293,18 @@ static int initialize_mem_bw_imc(void) return 0; } +static void perf_close_imc_mem_bw(void) +{ + int mc; + + for (mc = 0; mc < imcs; mc++) { + if (imc_counters_config[mc][READ].fd != -1) + close(imc_counters_config[mc][READ].fd); + if (imc_counters_config[mc][WRITE].fd != -1) + close(imc_counters_config[mc][WRITE].fd); + } +} + /* * get_mem_bw_imc: Memory band width as reported by iMC counters * @cpu_no: CPU number that the benchmark PID is binded to @@ -305,26 +318,33 @@ static int initialize_mem_bw_imc(void) static int get_mem_bw_imc(int cpu_no, char *bw_report, float *bw_imc) { float reads, writes, of_mul_read, of_mul_write; - int imc, j, ret; + int imc, ret; + + for (imc = 0; imc 
< imcs; imc++) { + imc_counters_config[imc][READ].fd = -1; + imc_counters_config[imc][WRITE].fd = -1; + } /* Start all iMC counters to log values (both read and write) */ reads = 0, writes = 0, of_mul_read = 1, of_mul_write = 1; for (imc = 0; imc < imcs; imc++) { - for (j = 0; j < 2; j++) { - ret = open_perf_event(imc, cpu_no, j); - if (ret) - return -1; - } - for (j = 0; j < 2; j++) - membw_ioctl_perf_event_ioc_reset_enable(imc, j); + ret = open_perf_event(imc, cpu_no, READ); + if (ret) + goto close_fds; + ret = open_perf_event(imc, cpu_no, WRITE); + if (ret) + goto close_fds; + + membw_ioctl_perf_event_ioc_reset_enable(imc, READ); + membw_ioctl_perf_event_ioc_reset_enable(imc, WRITE); } sleep(1); /* Stop counters after a second to get results (both read and write) */ for (imc = 0; imc < imcs; imc++) { - for (j = 0; j < 2; j++) - membw_ioctl_perf_event_ioc_disable(imc, j); + membw_ioctl_perf_event_ioc_disable(imc, READ); + membw_ioctl_perf_event_ioc_disable(imc, WRITE); } /* @@ -340,15 +360,13 @@ static int get_mem_bw_imc(int cpu_no, char *bw_report, float *bw_imc) if (read(r->fd, &r->return_value, sizeof(struct membw_read_format)) == -1) { ksft_perror("Couldn't get read b/w through iMC"); - - return -1; + goto close_fds; } if (read(w->fd, &w->return_value, sizeof(struct membw_read_format)) == -1) { ksft_perror("Couldn't get write bw through iMC"); - - return -1; + goto close_fds; } __u64 r_time_enabled = r->return_value.time_enabled; @@ -368,10 +386,7 @@ static int get_mem_bw_imc(int cpu_no, char *bw_report, float *bw_imc) writes += w->return_value.value * of_mul_write * SCALE; } - for (imc = 0; imc < imcs; imc++) { - close(imc_counters_config[imc][READ].fd); - close(imc_counters_config[imc][WRITE].fd); - } + perf_close_imc_mem_bw(); if (strcmp(bw_report, "reads") == 0) { *bw_imc = reads; @@ -385,6 +400,10 @@ static int get_mem_bw_imc(int cpu_no, char *bw_report, float *bw_imc) *bw_imc = reads + writes; return 0; + +close_fds: + perf_close_imc_mem_bw(); + return 
-1; } void set_mbm_path(const char *ctrlgrp, const char *mongrp, int domain_id) @@ -472,7 +491,8 @@ void ctrlc_handler(int signum, siginfo_t *info, void *ptr) if (bm_pid) kill(bm_pid, SIGKILL); umount_resctrlfs(); - tests_cleanup(); + if (current_test && current_test->cleanup) + current_test->cleanup(); ksft_print_msg("Ending\n\n"); exit(EXIT_SUCCESS); @@ -482,13 +502,14 @@ void ctrlc_handler(int signum, siginfo_t *info, void *ptr) * Register CTRL-C handler for parent, as it has to kill * child process before exiting. */ -int signal_handler_register(void) +int signal_handler_register(const struct resctrl_test *test) { struct sigaction sigact = {}; int ret = 0; bm_pid = 0; + current_test = test; sigact.sa_sigaction = ctrlc_handler; sigemptyset(&sigact.sa_mask); sigact.sa_flags = SA_SIGINFO; @@ -510,6 +531,7 @@ void signal_handler_unregister(void) { struct sigaction sigact = {}; + current_test = NULL; sigact.sa_handler = SIG_DFL; sigemptyset(&sigact.sa_mask); if (sigaction(SIGINT, &sigact, NULL) || diff --git a/tools/testing/selftests/ring-buffer/.gitignore b/tools/testing/selftests/ring-buffer/.gitignore new file mode 100644 index 0000000000..3aed1a2a6c --- /dev/null +++ b/tools/testing/selftests/ring-buffer/.gitignore @@ -0,0 +1 @@ +map_test diff --git a/tools/testing/selftests/ring-buffer/Makefile b/tools/testing/selftests/ring-buffer/Makefile new file mode 100644 index 0000000000..627c5fa6d1 --- /dev/null +++ b/tools/testing/selftests/ring-buffer/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 +CFLAGS += -Wl,-no-as-needed -Wall +CFLAGS += $(KHDR_INCLUDES) +CFLAGS += -D_GNU_SOURCE + +TEST_GEN_PROGS = map_test + +include ../lib.mk diff --git a/tools/testing/selftests/ring-buffer/config b/tools/testing/selftests/ring-buffer/config new file mode 100644 index 0000000000..d936f8f00e --- /dev/null +++ b/tools/testing/selftests/ring-buffer/config @@ -0,0 +1,2 @@ +CONFIG_FTRACE=y +CONFIG_TRACER_SNAPSHOT=y diff --git 
a/tools/testing/selftests/ring-buffer/map_test.c b/tools/testing/selftests/ring-buffer/map_test.c new file mode 100644 index 0000000000..a9006fa709 --- /dev/null +++ b/tools/testing/selftests/ring-buffer/map_test.c @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Ring-buffer memory mapping tests + * + * Copyright (c) 2024 Vincent Donnefort + */ +#include <fcntl.h> +#include <sched.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +#include <linux/trace_mmap.h> + +#include <sys/mman.h> +#include <sys/ioctl.h> + +#include "../user_events/user_events_selftests.h" /* share tracefs setup */ +#include "../kselftest_harness.h" + +#define TRACEFS_ROOT "/sys/kernel/tracing" + +static int __tracefs_write(const char *path, const char *value) +{ + int fd, ret; + + fd = open(path, O_WRONLY | O_TRUNC); + if (fd < 0) + return fd; + + ret = write(fd, value, strlen(value)); + + close(fd); + + return ret == -1 ? -errno : 0; +} + +static int __tracefs_write_int(const char *path, int value) +{ + char *str; + int ret; + + if (asprintf(&str, "%d", value) < 0) + return -1; + + ret = __tracefs_write(path, str); + + free(str); + + return ret; +} + +#define tracefs_write_int(path, value) \ + ASSERT_EQ(__tracefs_write_int((path), (value)), 0) + +#define tracefs_write(path, value) \ + ASSERT_EQ(__tracefs_write((path), (value)), 0) + +static int tracefs_reset(void) +{ + if (__tracefs_write_int(TRACEFS_ROOT"/tracing_on", 0)) + return -1; + if (__tracefs_write(TRACEFS_ROOT"/trace", "")) + return -1; + if (__tracefs_write(TRACEFS_ROOT"/set_event", "")) + return -1; + if (__tracefs_write(TRACEFS_ROOT"/current_tracer", "nop")) + return -1; + + return 0; +} + +struct tracefs_cpu_map_desc { + struct trace_buffer_meta *meta; + int cpu_fd; +}; + +int tracefs_cpu_map(struct tracefs_cpu_map_desc *desc, int cpu) +{ + int page_size = getpagesize(); + char *cpu_path; + void *map; + + if (asprintf(&cpu_path, + TRACEFS_ROOT"/per_cpu/cpu%d/trace_pipe_raw", + cpu) < 0) + return -ENOMEM; + + desc->cpu_fd = open(cpu_path, O_RDONLY | O_NONBLOCK); + free(cpu_path); + if 
(desc->cpu_fd < 0) + return -ENODEV; + + map = mmap(NULL, page_size, PROT_READ, MAP_SHARED, desc->cpu_fd, 0); + if (map == MAP_FAILED) + return -errno; + + desc->meta = (struct trace_buffer_meta *)map; + + return 0; +} + +void tracefs_cpu_unmap(struct tracefs_cpu_map_desc *desc) +{ + munmap(desc->meta, desc->meta->meta_page_size); + close(desc->cpu_fd); +} + +FIXTURE(map) { + struct tracefs_cpu_map_desc map_desc; + bool umount; +}; + +FIXTURE_VARIANT(map) { + int subbuf_size; +}; + +FIXTURE_VARIANT_ADD(map, subbuf_size_4k) { + .subbuf_size = 4, +}; + +FIXTURE_VARIANT_ADD(map, subbuf_size_8k) { + .subbuf_size = 8, +}; + +FIXTURE_SETUP(map) +{ + int cpu = sched_getcpu(); + cpu_set_t cpu_mask; + bool fail, umount; + char *message; + + if (getuid() != 0) + SKIP(return, "Skipping: %s", "Please run the test as root"); + + if (!tracefs_enabled(&message, &fail, &umount)) { + if (fail) { + TH_LOG("Tracefs setup failed: %s", message); + ASSERT_FALSE(fail); + } + SKIP(return, "Skipping: %s", message); + } + + self->umount = umount; + + ASSERT_GE(cpu, 0); + + ASSERT_EQ(tracefs_reset(), 0); + + tracefs_write_int(TRACEFS_ROOT"/buffer_subbuf_size_kb", variant->subbuf_size); + + ASSERT_EQ(tracefs_cpu_map(&self->map_desc, cpu), 0); + + /* + * Ensure generated events will be found on this very same ring-buffer. 
+ */ + CPU_ZERO(&cpu_mask); + CPU_SET(cpu, &cpu_mask); + ASSERT_EQ(sched_setaffinity(0, sizeof(cpu_mask), &cpu_mask), 0); +} + +FIXTURE_TEARDOWN(map) +{ + tracefs_reset(); + + if (self->umount) + tracefs_unmount(); + + tracefs_cpu_unmap(&self->map_desc); +} + +TEST_F(map, meta_page_check) +{ + struct tracefs_cpu_map_desc *desc = &self->map_desc; + int cnt = 0; + + ASSERT_EQ(desc->meta->entries, 0); + ASSERT_EQ(desc->meta->overrun, 0); + ASSERT_EQ(desc->meta->read, 0); + + ASSERT_EQ(desc->meta->reader.id, 0); + ASSERT_EQ(desc->meta->reader.read, 0); + + ASSERT_EQ(ioctl(desc->cpu_fd, TRACE_MMAP_IOCTL_GET_READER), 0); + ASSERT_EQ(desc->meta->reader.id, 0); + + tracefs_write_int(TRACEFS_ROOT"/tracing_on", 1); + for (int i = 0; i < 16; i++) + tracefs_write_int(TRACEFS_ROOT"/trace_marker", i); +again: + ASSERT_EQ(ioctl(desc->cpu_fd, TRACE_MMAP_IOCTL_GET_READER), 0); + + ASSERT_EQ(desc->meta->entries, 16); + ASSERT_EQ(desc->meta->overrun, 0); + ASSERT_EQ(desc->meta->read, 16); + + ASSERT_EQ(desc->meta->reader.id, 1); + + if (!(cnt++)) + goto again; +} + +TEST_F(map, data_mmap) +{ + struct tracefs_cpu_map_desc *desc = &self->map_desc; + unsigned long meta_len, data_len; + void *data; + + meta_len = desc->meta->meta_page_size; + data_len = desc->meta->subbuf_size * desc->meta->nr_subbufs; + + /* Map all the available subbufs */ + data = mmap(NULL, data_len, PROT_READ, MAP_SHARED, + desc->cpu_fd, meta_len); + ASSERT_NE(data, MAP_FAILED); + munmap(data, data_len); + + /* Map all the available subbufs - 1 */ + data_len -= desc->meta->subbuf_size; + data = mmap(NULL, data_len, PROT_READ, MAP_SHARED, + desc->cpu_fd, meta_len); + ASSERT_NE(data, MAP_FAILED); + munmap(data, data_len); + + /* Overflow the available subbufs by 1 */ + meta_len += desc->meta->subbuf_size * 2; + data = mmap(NULL, data_len, PROT_READ, MAP_SHARED, + desc->cpu_fd, meta_len); + ASSERT_EQ(data, MAP_FAILED); +} + +FIXTURE(snapshot) { + bool umount; +}; + +FIXTURE_SETUP(snapshot) +{ + bool fail, umount; + 
struct stat sb; + char *message; + + if (getuid() != 0) + SKIP(return, "Skipping: %s", "Please run the test as root"); + + if (stat(TRACEFS_ROOT"/snapshot", &sb)) + SKIP(return, "Skipping: %s", "snapshot not available"); + + if (!tracefs_enabled(&message, &fail, &umount)) { + if (fail) { + TH_LOG("Tracefs setup failed: %s", message); + ASSERT_FALSE(fail); + } + SKIP(return, "Skipping: %s", message); + } + + self->umount = umount; +} + +FIXTURE_TEARDOWN(snapshot) +{ + __tracefs_write(TRACEFS_ROOT"/events/sched/sched_switch/trigger", + "!snapshot"); + tracefs_reset(); + + if (self->umount) + tracefs_unmount(); +} + +TEST_F(snapshot, excludes_map) +{ + struct tracefs_cpu_map_desc map_desc; + int cpu = sched_getcpu(); + + ASSERT_GE(cpu, 0); + tracefs_write(TRACEFS_ROOT"/events/sched/sched_switch/trigger", + "snapshot"); + ASSERT_EQ(tracefs_cpu_map(&map_desc, cpu), -EBUSY); +} + +TEST_F(snapshot, excluded_by_map) +{ + struct tracefs_cpu_map_desc map_desc; + int cpu = sched_getcpu(); + + ASSERT_EQ(tracefs_cpu_map(&map_desc, cpu), 0); + + ASSERT_EQ(__tracefs_write(TRACEFS_ROOT"/events/sched/sched_switch/trigger", + "snapshot"), -EBUSY); + ASSERT_EQ(__tracefs_write(TRACEFS_ROOT"/snapshot", + "1"), -EBUSY); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/riscv/Makefile b/tools/testing/selftests/riscv/Makefile index 4a9ff515a3..7ce03d832b 100644 --- a/tools/testing/selftests/riscv/Makefile +++ b/tools/testing/selftests/riscv/Makefile @@ -5,7 +5,7 @@ ARCH ?= $(shell uname -m 2>/dev/null || echo not) ifneq (,$(filter $(ARCH),riscv)) -RISCV_SUBTARGETS ?= hwprobe vector mm +RISCV_SUBTARGETS ?= hwprobe vector mm sigreturn else RISCV_SUBTARGETS := endif diff --git a/tools/testing/selftests/riscv/sigreturn/.gitignore b/tools/testing/selftests/riscv/sigreturn/.gitignore new file mode 100644 index 0000000000..35002b8ae7 --- /dev/null +++ b/tools/testing/selftests/riscv/sigreturn/.gitignore @@ -0,0 +1 @@ +sigreturn diff --git 
a/tools/testing/selftests/riscv/sigreturn/Makefile b/tools/testing/selftests/riscv/sigreturn/Makefile new file mode 100644 index 0000000000..eb8bac9279 --- /dev/null +++ b/tools/testing/selftests/riscv/sigreturn/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2021 ARM Limited +# Originally tools/testing/arm64/abi/Makefile + +CFLAGS += -I$(top_srcdir)/tools/include + +TEST_GEN_PROGS := sigreturn + +include ../../lib.mk + +$(OUTPUT)/sigreturn: sigreturn.c + $(CC) -static -o$@ $(CFLAGS) $(LDFLAGS) $^ diff --git a/tools/testing/selftests/riscv/sigreturn/sigreturn.c b/tools/testing/selftests/riscv/sigreturn/sigreturn.c new file mode 100644 index 0000000000..ed351a1cb9 --- /dev/null +++ b/tools/testing/selftests/riscv/sigreturn/sigreturn.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <ucontext.h> +#include <linux/ptrace.h> +#include "../../kselftest_harness.h" + +#define RISCV_V_MAGIC 0x53465457 +#define DEFAULT_VALUE 2 +#define SIGNAL_HANDLER_OVERRIDE 3 + +static void simple_handle(int sig_no, siginfo_t *info, void *vcontext) +{ + ucontext_t *context = vcontext; + + context->uc_mcontext.__gregs[REG_PC] = context->uc_mcontext.__gregs[REG_PC] + 4; +} + +static void vector_override(int sig_no, siginfo_t *info, void *vcontext) +{ + ucontext_t *context = vcontext; + + // vector state + struct __riscv_extra_ext_header *ext; + struct __riscv_v_ext_state *v_ext_state; + + /* Find the vector context. 
*/ + ext = (void *)(&context->uc_mcontext.__fpregs); + if (ext->hdr.magic != RISCV_V_MAGIC) { + fprintf(stderr, "bad vector magic: %x\n", ext->hdr.magic); + abort(); + } + + v_ext_state = (void *)((char *)(ext) + sizeof(*ext)); + + *(int *)v_ext_state->datap = SIGNAL_HANDLER_OVERRIDE; + + context->uc_mcontext.__gregs[REG_PC] = context->uc_mcontext.__gregs[REG_PC] + 4; +} + +static int vector_sigreturn(int data, void (*handler)(int, siginfo_t *, void *)) +{ + int after_sigreturn; + struct sigaction sig_action = { + .sa_sigaction = handler, + .sa_flags = SA_SIGINFO + }; + + sigaction(SIGSEGV, &sig_action, 0); + + asm(".option push \n\ + .option arch, +v \n\ + vsetivli x0, 1, e32, m1, ta, ma \n\ + vmv.s.x v0, %1 \n\ + # Generate SIGSEGV \n\ + lw a0, 0(x0) \n\ + vmv.x.s %0, v0 \n\ + .option pop" : "=r" (after_sigreturn) : "r" (data)); + + return after_sigreturn; +} + +TEST(vector_restore) +{ + int result; + + result = vector_sigreturn(DEFAULT_VALUE, &simple_handle); + + EXPECT_EQ(DEFAULT_VALUE, result); +} + +TEST(vector_restore_signal_handler_override) +{ + int result; + + result = vector_sigreturn(DEFAULT_VALUE, &vector_override); + + EXPECT_EQ(SIGNAL_HANDLER_OVERRIDE, result); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/seccomp/seccomp_benchmark.c b/tools/testing/selftests/seccomp/seccomp_benchmark.c index b83099160f..94886c82ae 100644 --- a/tools/testing/selftests/seccomp/seccomp_benchmark.c +++ b/tools/testing/selftests/seccomp/seccomp_benchmark.c @@ -194,14 +194,14 @@ int main(int argc, char *argv[]) ksft_set_plan(7); ksft_print_msg("Running on:\n"); - ksft_print_msg(""); + ksft_print_msg("%s", ""); system("uname -a"); ksft_print_msg("Current BPF sysctl settings:\n"); /* Avoid using "sysctl" which may not be installed. */ - ksft_print_msg(""); + ksft_print_msg("%s", ""); system("grep -H . /proc/sys/net/core/bpf_jit_enable"); - ksft_print_msg(""); + ksft_print_msg("%s", ""); system("grep -H . 
/proc/sys/net/core/bpf_jit_harden"); affinity(); diff --git a/tools/testing/selftests/sigaltstack/current_stack_pointer.h b/tools/testing/selftests/sigaltstack/current_stack_pointer.h index ea9bdf3a90..09da8f1011 100644 --- a/tools/testing/selftests/sigaltstack/current_stack_pointer.h +++ b/tools/testing/selftests/sigaltstack/current_stack_pointer.h @@ -8,7 +8,7 @@ register unsigned long sp asm("sp"); register unsigned long sp asm("esp"); #elif __loongarch64 register unsigned long sp asm("$sp"); -#elif __ppc__ +#elif __powerpc__ register unsigned long sp asm("r1"); #elif __s390x__ register unsigned long sp asm("%15"); diff --git a/tools/testing/selftests/sync/sync_test.c b/tools/testing/selftests/sync/sync_test.c index 414a617db9..93db5aa246 100644 --- a/tools/testing/selftests/sync/sync_test.c +++ b/tools/testing/selftests/sync/sync_test.c @@ -109,6 +109,5 @@ int main(void) ksft_exit_fail_msg("%d out of %d sync tests failed\n", err, ksft_test_num()); - /* need this return to keep gcc happy */ - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/adjtick.c b/tools/testing/selftests/timers/adjtick.c index 47e05fdc32..205b76a4ab 100644 --- a/tools/testing/selftests/timers/adjtick.c +++ b/tools/testing/selftests/timers/adjtick.c @@ -205,7 +205,7 @@ int main(int argc, char **argv) adjtimex(&tx1); if (err) - return ksft_exit_fail(); + ksft_exit_fail(); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/alarmtimer-suspend.c b/tools/testing/selftests/timers/alarmtimer-suspend.c index 4332b49410..ad52e608b8 100644 --- a/tools/testing/selftests/timers/alarmtimer-suspend.c +++ b/tools/testing/selftests/timers/alarmtimer-suspend.c @@ -173,6 +173,6 @@ int main(void) timer_delete(tm1); } if (final_ret) - return ksft_exit_fail(); - return ksft_exit_pass(); + ksft_exit_fail(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/change_skew.c 
b/tools/testing/selftests/timers/change_skew.c index 992a77f2a7..4421cd562c 100644 --- a/tools/testing/selftests/timers/change_skew.c +++ b/tools/testing/selftests/timers/change_skew.c @@ -89,8 +89,8 @@ int main(int argc, char **argv) if (ret) { printf("[FAIL]"); - return ksft_exit_fail(); + ksft_exit_fail(); } printf("[OK]"); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/freq-step.c b/tools/testing/selftests/timers/freq-step.c index 4b76450d78..73b636f89f 100644 --- a/tools/testing/selftests/timers/freq-step.c +++ b/tools/testing/selftests/timers/freq-step.c @@ -257,7 +257,7 @@ int main(int argc, char **argv) set_frequency(0.0); if (fails) - return ksft_exit_fail(); + ksft_exit_fail(); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/leap-a-day.c b/tools/testing/selftests/timers/leap-a-day.c index 23eb398c81..986abbdb15 100644 --- a/tools/testing/selftests/timers/leap-a-day.c +++ b/tools/testing/selftests/timers/leap-a-day.c @@ -268,7 +268,7 @@ int main(int argc, char **argv) if (ret < 0) { printf("Error: Problem setting STA_INS/STA_DEL!: %s\n", time_state_str(ret)); - return ksft_exit_fail(); + ksft_exit_fail(); } /* Validate STA_INS was set */ @@ -277,7 +277,7 @@ int main(int argc, char **argv) if (tx.status != STA_INS && tx.status != STA_DEL) { printf("Error: STA_INS/STA_DEL not set!: %s\n", time_state_str(ret)); - return ksft_exit_fail(); + ksft_exit_fail(); } if (tai_time) { @@ -295,7 +295,7 @@ int main(int argc, char **argv) se.sigev_value.sival_int = 0; if (timer_create(CLOCK_REALTIME, &se, &tm1) == -1) { printf("Error: timer_create failed\n"); - return ksft_exit_fail(); + ksft_exit_fail(); } its1.it_value.tv_sec = next_leap; its1.it_value.tv_nsec = 0; @@ -366,7 +366,7 @@ int main(int argc, char **argv) if (error_found) { printf("Errors observed\n"); clear_time_state(); - return ksft_exit_fail(); + ksft_exit_fail(); } printf("\n"); if ((iterations != -1) && 
!(--iterations)) @@ -374,5 +374,5 @@ int main(int argc, char **argv) } clear_time_state(); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/leapcrash.c b/tools/testing/selftests/timers/leapcrash.c index f70802c5dd..8fd065eec9 100644 --- a/tools/testing/selftests/timers/leapcrash.c +++ b/tools/testing/selftests/timers/leapcrash.c @@ -87,7 +87,7 @@ int main(void) tv.tv_usec = 0; if (settimeofday(&tv, NULL)) { printf("Error: You're likely not running with proper (ie: root) permissions\n"); - return ksft_exit_fail(); + ksft_exit_fail(); } tx.modes = 0; adjtimex(&tx); @@ -104,5 +104,5 @@ int main(void) fflush(stdout); } printf("[OK]\n"); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/mqueue-lat.c b/tools/testing/selftests/timers/mqueue-lat.c index 7916cf5cc6..f3179a605b 100644 --- a/tools/testing/selftests/timers/mqueue-lat.c +++ b/tools/testing/selftests/timers/mqueue-lat.c @@ -107,8 +107,8 @@ int main(int argc, char **argv) ret = mqueue_lat_test(); if (ret < 0) { printf("[FAILED]\n"); - return ksft_exit_fail(); + ksft_exit_fail(); } printf("[OK]\n"); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index c001dd7917..07c81c0093 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -260,16 +260,16 @@ int main(int argc, char **argv) ksft_print_msg("based timers if other threads run on the CPU...\n"); if (check_itimer(ITIMER_VIRTUAL) < 0) - return ksft_exit_fail(); + ksft_exit_fail(); if (check_itimer(ITIMER_PROF) < 0) - return ksft_exit_fail(); + ksft_exit_fail(); if (check_itimer(ITIMER_REAL) < 0) - return ksft_exit_fail(); + ksft_exit_fail(); if (check_timer_create(CLOCK_THREAD_CPUTIME_ID) < 0) - return ksft_exit_fail(); + ksft_exit_fail(); /* * It's unfortunately hard to reliably test a timer expiration @@ -281,10 
+281,10 @@ int main(int argc, char **argv) * find a better solution. */ if (check_timer_create(CLOCK_PROCESS_CPUTIME_ID) < 0) - return ksft_exit_fail(); + ksft_exit_fail(); if (check_timer_distribution() < 0) - return ksft_exit_fail(); + ksft_exit_fail(); ksft_finished(); } diff --git a/tools/testing/selftests/timers/raw_skew.c b/tools/testing/selftests/timers/raw_skew.c index 6eba203f9d..030143eb09 100644 --- a/tools/testing/selftests/timers/raw_skew.c +++ b/tools/testing/selftests/timers/raw_skew.c @@ -137,11 +137,11 @@ int main(int argc, char **argv) if (tx1.offset || tx2.offset || tx1.freq != tx2.freq || tx1.tick != tx2.tick) { printf(" [SKIP]\n"); - return ksft_exit_skip("The clock was adjusted externally. Shutdown NTPd or other time sync daemons\n"); + ksft_exit_skip("The clock was adjusted externally. Shutdown NTPd or other time sync daemons\n"); } printf(" [FAILED]\n"); - return ksft_exit_fail(); + ksft_exit_fail(); } printf(" [OK]\n"); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/set-2038.c b/tools/testing/selftests/timers/set-2038.c index 688cfd81b5..f7d978721b 100644 --- a/tools/testing/selftests/timers/set-2038.c +++ b/tools/testing/selftests/timers/set-2038.c @@ -128,6 +128,6 @@ out: /* restore clock */ settime(start); if (ret) - return ksft_exit_fail(); - return ksft_exit_pass(); + ksft_exit_fail(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/set-tai.c b/tools/testing/selftests/timers/set-tai.c index 8c4179ee2c..5b67462efc 100644 --- a/tools/testing/selftests/timers/set-tai.c +++ b/tools/testing/selftests/timers/set-tai.c @@ -61,9 +61,9 @@ int main(int argc, char **argv) ret = get_tai(); if (ret != i) { printf("[FAILED] expected: %i got %i\n", i, ret); - return ksft_exit_fail(); + ksft_exit_fail(); } } printf("[OK]\n"); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/set-timer-lat.c b/tools/testing/selftests/timers/set-timer-lat.c index 
50da45437d..7ce240c89b 100644 --- a/tools/testing/selftests/timers/set-timer-lat.c +++ b/tools/testing/selftests/timers/set-timer-lat.c @@ -278,6 +278,6 @@ int main(void) ret |= do_timer_oneshot(clock_id, 0); } if (ret) - return ksft_exit_fail(); - return ksft_exit_pass(); + ksft_exit_fail(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/set-tz.c b/tools/testing/selftests/timers/set-tz.c index 62bd33eb16..20daaf1782 100644 --- a/tools/testing/selftests/timers/set-tz.c +++ b/tools/testing/selftests/timers/set-tz.c @@ -102,9 +102,9 @@ int main(int argc, char **argv) printf("[OK]\n"); set_tz(min, dst); - return ksft_exit_pass(); + ksft_exit_pass(); err: set_tz(min, dst); - return ksft_exit_fail(); + ksft_exit_fail(); } diff --git a/tools/testing/selftests/timers/skew_consistency.c b/tools/testing/selftests/timers/skew_consistency.c index 63913f75b3..c8e6bffe4e 100644 --- a/tools/testing/selftests/timers/skew_consistency.c +++ b/tools/testing/selftests/timers/skew_consistency.c @@ -70,8 +70,8 @@ int main(int argc, char **argv) if (ret) { printf("[FAILED]\n"); - return ksft_exit_fail(); + ksft_exit_fail(); } printf("[OK]\n"); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/threadtest.c b/tools/testing/selftests/timers/threadtest.c index 80aed4bf06..76b38e41d9 100644 --- a/tools/testing/selftests/timers/threadtest.c +++ b/tools/testing/selftests/timers/threadtest.c @@ -189,5 +189,5 @@ out: /* die */ if (ret) ksft_exit_fail(); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/valid-adjtimex.c b/tools/testing/selftests/timers/valid-adjtimex.c index d13ebde203..d500884801 100644 --- a/tools/testing/selftests/timers/valid-adjtimex.c +++ b/tools/testing/selftests/timers/valid-adjtimex.c @@ -320,10 +320,10 @@ int validate_set_offset(void) int main(int argc, char **argv) { if (validate_freq()) - return ksft_exit_fail(); + ksft_exit_fail(); if (validate_set_offset()) - 
return ksft_exit_fail(); + ksft_exit_fail(); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/tty/tty_tstamp_update.c b/tools/testing/selftests/tty/tty_tstamp_update.c index 0ee97943dc..9e1a40f5db 100644 --- a/tools/testing/selftests/tty/tty_tstamp_update.c +++ b/tools/testing/selftests/tty/tty_tstamp_update.c @@ -47,42 +47,60 @@ int main(int argc, char **argv) int r; char tty[PATH_MAX] = {}; struct stat st1, st2; + int result = KSFT_FAIL; ksft_print_header(); ksft_set_plan(1); r = readlink("/proc/self/fd/0", tty, PATH_MAX); - if (r < 0) - ksft_exit_fail_msg("readlink on /proc/self/fd/0 failed: %m\n"); + if (r < 0) { + ksft_print_msg("readlink on /proc/self/fd/0 failed: %m\n"); + goto out; + } + + if (!tty_valid(tty)) { + ksft_print_msg("invalid tty path '%s'\n", tty); + result = KSFT_SKIP; + goto out; - if (!tty_valid(tty)) - ksft_exit_skip("invalid tty path '%s'\n", tty); + } r = stat(tty, &st1); - if (r < 0) - ksft_exit_fail_msg("stat failed on tty path '%s': %m\n", tty); + if (r < 0) { + ksft_print_msg("stat failed on tty path '%s': %m\n", tty); + goto out; + } /* We need to wait at least 8 seconds in order to observe timestamp change */ /* https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fbf47635315ab308c9b58a1ea0906e711a9228de */ sleep(10); r = write_dev_tty(); - if (r < 0) - ksft_exit_fail_msg("failed to write to /dev/tty: %s\n", - strerror(-r)); + if (r < 0) { + ksft_print_msg("failed to write to /dev/tty: %s\n", + strerror(-r)); + goto out; + } r = stat(tty, &st2); - if (r < 0) - ksft_exit_fail_msg("stat failed on tty path '%s': %m\n", tty); + if (r < 0) { + ksft_print_msg("stat failed on tty path '%s': %m\n", tty); + goto out; + } /* We wrote to the terminal so timestamps should have been updated */ if (st1.st_atim.tv_sec == st2.st_atim.tv_sec && st1.st_mtim.tv_sec == st2.st_mtim.tv_sec) { - ksft_test_result_fail("tty timestamps not updated\n"); - ksft_exit_fail(); + ksft_print_msg("tty 
timestamps not updated\n"); + goto out; } - ksft_test_result_pass( + ksft_print_msg( "timestamps of terminal '%s' updated after write to /dev/tty\n", tty); - return EXIT_SUCCESS; + result = KSFT_PASS; + +out: + ksft_test_result_report(result, "tty_tstamp_update\n"); + + ksft_finished(); } diff --git a/tools/testing/selftests/user_events/ftrace_test.c b/tools/testing/selftests/user_events/ftrace_test.c index dcd7509fe2..0bb46793dc 100644 --- a/tools/testing/selftests/user_events/ftrace_test.c +++ b/tools/testing/selftests/user_events/ftrace_test.c @@ -261,6 +261,12 @@ TEST_F(user, register_events) { ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); + /* Register without separator spacing should still match */ + reg.enable_bit = 29; + reg.name_args = (__u64)"__test_event u32 field1;u32 field2"; + ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); + ASSERT_EQ(0, reg.write_index); + /* Multiple registers to same name but different args should fail */ reg.enable_bit = 29; reg.name_args = (__u64)"__test_event u32 field1;"; @@ -288,6 +294,8 @@ TEST_F(user, register_events) { ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, &unreg)); unreg.disable_bit = 30; ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, &unreg)); + unreg.disable_bit = 29; + ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, &unreg)); /* Delete should have been auto-done after close and unregister */ close(self->data_fd); diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index d53a4d8008..98d8ba2afa 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -1,35 +1,30 @@ # SPDX-License-Identifier: GPL-2.0 -include ../lib.mk - uname_M := $(shell uname -m 2>/dev/null || echo not) ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) -TEST_GEN_PROGS := $(OUTPUT)/vdso_test_gettimeofday $(OUTPUT)/vdso_test_getcpu -TEST_GEN_PROGS += $(OUTPUT)/vdso_test_abi -TEST_GEN_PROGS += 
$(OUTPUT)/vdso_test_clock_getres +TEST_GEN_PROGS := vdso_test_gettimeofday +TEST_GEN_PROGS += vdso_test_getcpu +TEST_GEN_PROGS += vdso_test_abi +TEST_GEN_PROGS += vdso_test_clock_getres ifeq ($(ARCH),$(filter $(ARCH),x86 x86_64)) -TEST_GEN_PROGS += $(OUTPUT)/vdso_standalone_test_x86 +TEST_GEN_PROGS += vdso_standalone_test_x86 endif -TEST_GEN_PROGS += $(OUTPUT)/vdso_test_correctness +TEST_GEN_PROGS += vdso_test_correctness CFLAGS := -std=gnu99 -CFLAGS_vdso_standalone_test_x86 := -nostdlib -fno-asynchronous-unwind-tables -fno-stack-protector -LDFLAGS_vdso_test_correctness := -ldl + ifeq ($(CONFIG_X86_32),y) LDLIBS += -lgcc_s endif -all: $(TEST_GEN_PROGS) +include ../lib.mk $(OUTPUT)/vdso_test_gettimeofday: parse_vdso.c vdso_test_gettimeofday.c $(OUTPUT)/vdso_test_getcpu: parse_vdso.c vdso_test_getcpu.c $(OUTPUT)/vdso_test_abi: parse_vdso.c vdso_test_abi.c $(OUTPUT)/vdso_test_clock_getres: vdso_test_clock_getres.c + $(OUTPUT)/vdso_standalone_test_x86: vdso_standalone_test_x86.c parse_vdso.c - $(CC) $(CFLAGS) $(CFLAGS_vdso_standalone_test_x86) \ - vdso_standalone_test_x86.c parse_vdso.c \ - -o $@ +$(OUTPUT)/vdso_standalone_test_x86: CFLAGS +=-nostdlib -fno-asynchronous-unwind-tables -fno-stack-protector + $(OUTPUT)/vdso_test_correctness: vdso_test_correctness.c - $(CC) $(CFLAGS) \ - vdso_test_correctness.c \ - -o $@ \ - $(LDFLAGS_vdso_test_correctness) +$(OUTPUT)/vdso_test_correctness: LDFLAGS += -ldl diff --git a/tools/testing/selftests/wireguard/qemu/arch/riscv32.config b/tools/testing/selftests/wireguard/qemu/arch/riscv32.config index a7f8e8a956..66290cf289 100644 --- a/tools/testing/selftests/wireguard/qemu/arch/riscv32.config +++ b/tools/testing/selftests/wireguard/qemu/arch/riscv32.config @@ -2,7 +2,7 @@ CONFIG_NONPORTABLE=y CONFIG_ARCH_RV32I=y CONFIG_MMU=y CONFIG_FPU=y -CONFIG_SOC_VIRT=y +CONFIG_ARCH_VIRT=y CONFIG_RISCV_ISA_FALLBACK=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y diff --git a/tools/testing/selftests/wireguard/qemu/arch/riscv64.config 
b/tools/testing/selftests/wireguard/qemu/arch/riscv64.config index daeb3e5e09..db1aa9f388 100644 --- a/tools/testing/selftests/wireguard/qemu/arch/riscv64.config +++ b/tools/testing/selftests/wireguard/qemu/arch/riscv64.config @@ -1,7 +1,7 @@ CONFIG_ARCH_RV64I=y CONFIG_MMU=y CONFIG_FPU=y -CONFIG_SOC_VIRT=y +CONFIG_ARCH_VIRT=y CONFIG_RISCV_ISA_FALLBACK=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y diff --git a/tools/testing/selftests/wireguard/qemu/kernel.config b/tools/testing/selftests/wireguard/qemu/kernel.config index 507555714b..f314d3789f 100644 --- a/tools/testing/selftests/wireguard/qemu/kernel.config +++ b/tools/testing/selftests/wireguard/qemu/kernel.config @@ -41,7 +41,6 @@ CONFIG_KALLSYMS=y CONFIG_BUG=y CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y CONFIG_JUMP_LABEL=y -CONFIG_BASE_FULL=y CONFIG_FUTEX=y CONFIG_SHMEM=y CONFIG_SLUB=y diff --git a/tools/testing/selftests/x86/amx.c b/tools/testing/selftests/x86/amx.c index d884fd69dd..95aad6d884 100644 --- a/tools/testing/selftests/x86/amx.c +++ b/tools/testing/selftests/x86/amx.c @@ -103,21 +103,6 @@ static void clearhandler(int sig) #define CPUID_LEAF1_ECX_XSAVE_MASK (1 << 26) #define CPUID_LEAF1_ECX_OSXSAVE_MASK (1 << 27) -static inline void check_cpuid_xsave(void) -{ - uint32_t eax, ebx, ecx, edx; - - /* - * CPUID.1:ECX.XSAVE[bit 26] enumerates general - * support for the XSAVE feature set, including - * XGETBV. 
- */ - __cpuid_count(1, 0, eax, ebx, ecx, edx); - if (!(ecx & CPUID_LEAF1_ECX_XSAVE_MASK)) - fatal_error("cpuid: no CPU xsave support"); - if (!(ecx & CPUID_LEAF1_ECX_OSXSAVE_MASK)) - fatal_error("cpuid: no OS xsave support"); -} static uint32_t xbuf_size; @@ -350,6 +335,7 @@ enum expected_result { FAIL_EXPECTED, SUCCESS_EXPECTED }; /* arch_prctl() and sigaltstack() test */ +#define ARCH_GET_XCOMP_SUPP 0x1021 #define ARCH_GET_XCOMP_PERM 0x1022 #define ARCH_REQ_XCOMP_PERM 0x1023 @@ -928,8 +914,15 @@ static void test_ptrace(void) int main(void) { - /* Check hardware availability at first */ - check_cpuid_xsave(); + unsigned long features; + long rc; + + rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_SUPP, &features); + if (rc || (features & XFEATURE_MASK_XTILE) != XFEATURE_MASK_XTILE) { + ksft_print_msg("no AMX support\n"); + return KSFT_SKIP; + } + check_cpuid_xtiledata(); init_stashed_xsave(); diff --git a/tools/testing/selftests/x86/lam.c b/tools/testing/selftests/x86/lam.c index 215b8150b7..0ea4f68139 100644 --- a/tools/testing/selftests/x86/lam.c +++ b/tools/testing/selftests/x86/lam.c @@ -1183,7 +1183,7 @@ int main(int argc, char **argv) if (!cpu_has_lam()) { ksft_print_msg("Unsupported LAM feature!\n"); - return -1; + return KSFT_SKIP; } while ((c = getopt(argc, argv, "ht:")) != -1) { @@ -1237,5 +1237,5 @@ int main(int argc, char **argv) ksft_set_plan(tests_cnt); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/x86/test_mremap_vdso.c b/tools/testing/selftests/x86/test_mremap_vdso.c index f0d876d482..d53959e035 100644 --- a/tools/testing/selftests/x86/test_mremap_vdso.c +++ b/tools/testing/selftests/x86/test_mremap_vdso.c @@ -19,6 +19,7 @@ #include #include #include +#include "../kselftest.h" #define PAGE_SIZE 4096 @@ -29,13 +30,13 @@ static int try_to_remap(void *vdso_addr, unsigned long size) /* Searching for memory location where to remap */ dest_addr = mmap(0, size, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if 
(dest_addr == MAP_FAILED) { - printf("[WARN]\tmmap failed (%d): %m\n", errno); + ksft_print_msg("WARN: mmap failed (%d): %m\n", errno); return 0; } - printf("[NOTE]\tMoving vDSO: [%p, %#lx] -> [%p, %#lx]\n", - vdso_addr, (unsigned long)vdso_addr + size, - dest_addr, (unsigned long)dest_addr + size); + ksft_print_msg("Moving vDSO: [%p, %#lx] -> [%p, %#lx]\n", + vdso_addr, (unsigned long)vdso_addr + size, + dest_addr, (unsigned long)dest_addr + size); fflush(stdout); new_addr = mremap(vdso_addr, size, size, @@ -43,10 +44,10 @@ static int try_to_remap(void *vdso_addr, unsigned long size) if ((unsigned long)new_addr == (unsigned long)-1) { munmap(dest_addr, size); if (errno == EINVAL) { - printf("[NOTE]\tvDSO partial move failed, will try with bigger size\n"); + ksft_print_msg("vDSO partial move failed, will try with bigger size\n"); return -1; /* Retry with larger */ } - printf("[FAIL]\tmremap failed (%d): %m\n", errno); + ksft_print_msg("[FAIL]\tmremap failed (%d): %m\n", errno); return 1; } @@ -58,11 +59,12 @@ int main(int argc, char **argv, char **envp) { pid_t child; + ksft_print_header(); + ksft_set_plan(1); + child = fork(); - if (child == -1) { - printf("[WARN]\tfailed to fork (%d): %m\n", errno); - return 1; - } + if (child == -1) + ksft_exit_fail_msg("failed to fork (%d): %m\n", errno); if (child == 0) { unsigned long vdso_size = PAGE_SIZE; @@ -70,9 +72,9 @@ int main(int argc, char **argv, char **envp) int ret = -1; auxval = getauxval(AT_SYSINFO_EHDR); - printf("\tAT_SYSINFO_EHDR is %#lx\n", auxval); + ksft_print_msg("AT_SYSINFO_EHDR is %#lx\n", auxval); if (!auxval || auxval == -ENOENT) { - printf("[WARN]\tgetauxval failed\n"); + ksft_print_msg("WARN: getauxval failed\n"); return 0; } @@ -92,16 +94,13 @@ int main(int argc, char **argv, char **envp) int status; if (waitpid(child, &status, 0) != child || - !WIFEXITED(status)) { - printf("[FAIL]\tmremap() of the vDSO does not work on this kernel!\n"); - return 1; - } else if (WEXITSTATUS(status) != 0) { - 
printf("[FAIL]\tChild failed with %d\n", - WEXITSTATUS(status)); - return 1; - } - printf("[OK]\n"); + !WIFEXITED(status)) + ksft_test_result_fail("mremap() of the vDSO does not work on this kernel!\n"); + else if (WEXITSTATUS(status) != 0) + ksft_test_result_fail("Child failed with %d\n", WEXITSTATUS(status)); + else + ksft_test_result_pass("%s\n", __func__); } - return 0; + ksft_finished(); } diff --git a/tools/testing/selftests/x86/test_shadow_stack.c b/tools/testing/selftests/x86/test_shadow_stack.c index 757e6527f6..ee909a7927 100644 --- a/tools/testing/selftests/x86/test_shadow_stack.c +++ b/tools/testing/selftests/x86/test_shadow_stack.c @@ -556,7 +556,7 @@ struct node { * looked at the shadow stack gaps. * 5. See if it landed in the gap. */ -int test_guard_gap(void) +int test_guard_gap_other_gaps(void) { void *free_area, *shstk, *test_map = (void *)0xFFFFFFFFFFFFFFFF; struct node *head = NULL, *cur; @@ -593,11 +593,64 @@ int test_guard_gap(void) if (shstk - test_map - PAGE_SIZE != PAGE_SIZE) return 1; - printf("[OK]\tGuard gap test\n"); + printf("[OK]\tGuard gap test, other mapping's gaps\n"); return 0; } +/* Tests respecting the guard gap of the mapping getting placed */ +int test_guard_gap_new_mappings_gaps(void) +{ + void *free_area, *shstk_start, *test_map = (void *)0xFFFFFFFFFFFFFFFF; + struct node *head = NULL, *cur; + int ret = 0; + + free_area = mmap(0, PAGE_SIZE * 4, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + munmap(free_area, PAGE_SIZE * 4); + + /* Test letting map_shadow_stack find a free space */ + shstk_start = mmap(free_area, PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (shstk_start == MAP_FAILED || shstk_start != free_area) + return 1; + + while (test_map > shstk_start) { + test_map = (void *)syscall(__NR_map_shadow_stack, 0, PAGE_SIZE, 0); + if (test_map == MAP_FAILED) { + printf("[INFO]\tmap_shadow_stack MAP_FAILED\n"); + ret = 1; + break; + } + + cur = malloc(sizeof(*cur)); + 
cur->mapping = test_map; + + cur->next = head; + head = cur; + + if (test_map == free_area + PAGE_SIZE) { + printf("[INFO]\tNew mapping has other mapping in guard gap!\n"); + ret = 1; + break; + } + } + + while (head) { + cur = head; + head = cur->next; + munmap(cur->mapping, PAGE_SIZE); + free(cur); + } + + munmap(shstk_start, PAGE_SIZE); + + if (!ret) + printf("[OK]\tGuard gap test, placement mapping's gaps\n"); + + return ret; +} + /* * Too complicated to pull it out of the 32 bit header, but also get the * 64 bit one needed above. Just define a copy here. @@ -850,9 +903,15 @@ int main(int argc, char *argv[]) goto out; } - if (test_guard_gap()) { + if (test_guard_gap_other_gaps()) { + ret = 1; + printf("[FAIL]\tGuard gap test, other mappings' gaps\n"); + goto out; + } + + if (test_guard_gap_new_mappings_gaps()) { ret = 1; - printf("[FAIL]\tGuard gap test\n"); + printf("[FAIL]\tGuard gap test, placement mapping's gaps\n"); goto out; } diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c index 47cab97280..d4c8e8d79d 100644 --- a/tools/testing/selftests/x86/test_vsyscall.c +++ b/tools/testing/selftests/x86/test_vsyscall.c @@ -21,6 +21,13 @@ #include #include "helpers.h" +#include "../kselftest.h" + +#ifdef __x86_64__ +#define TOTAL_TESTS 13 +#else +#define TOTAL_TESTS 8 +#endif #ifdef __x86_64__ # define VSYS(x) (x) @@ -39,18 +46,6 @@ /* max length of lines in /proc/self/maps - anything longer is skipped here */ #define MAPS_LINE_LEN 128 -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - /* vsyscalls and vDSO */ bool vsyscall_map_r = false, vsyscall_map_x = false; @@ -75,83 +70,25 @@ static void init_vdso(void) if (!vdso) vdso = dlopen("linux-gate.so.1", RTLD_LAZY | 
RTLD_LOCAL | RTLD_NOLOAD); if (!vdso) { - printf("[WARN]\tfailed to find vDSO\n"); + ksft_print_msg("[WARN] failed to find vDSO\n"); return; } vdso_gtod = (gtod_t)dlsym(vdso, "__vdso_gettimeofday"); if (!vdso_gtod) - printf("[WARN]\tfailed to find gettimeofday in vDSO\n"); + ksft_print_msg("[WARN] failed to find gettimeofday in vDSO\n"); vdso_gettime = (vgettime_t)dlsym(vdso, "__vdso_clock_gettime"); if (!vdso_gettime) - printf("[WARN]\tfailed to find clock_gettime in vDSO\n"); + ksft_print_msg("[WARN] failed to find clock_gettime in vDSO\n"); vdso_time = (time_func_t)dlsym(vdso, "__vdso_time"); if (!vdso_time) - printf("[WARN]\tfailed to find time in vDSO\n"); + ksft_print_msg("[WARN] failed to find time in vDSO\n"); vdso_getcpu = (getcpu_t)dlsym(vdso, "__vdso_getcpu"); if (!vdso_getcpu) - printf("[WARN]\tfailed to find getcpu in vDSO\n"); -} - -static int init_vsys(void) -{ -#ifdef __x86_64__ - int nerrs = 0; - FILE *maps; - char line[MAPS_LINE_LEN]; - bool found = false; - - maps = fopen("/proc/self/maps", "r"); - if (!maps) { - printf("[WARN]\tCould not open /proc/self/maps -- assuming vsyscall is r-x\n"); - vsyscall_map_r = true; - return 0; - } - - while (fgets(line, MAPS_LINE_LEN, maps)) { - char r, x; - void *start, *end; - char name[MAPS_LINE_LEN]; - - /* sscanf() is safe here as strlen(name) >= strlen(line) */ - if (sscanf(line, "%p-%p %c-%cp %*x %*x:%*x %*u %s", - &start, &end, &r, &x, name) != 5) - continue; - - if (strcmp(name, "[vsyscall]")) - continue; - - printf("\tvsyscall map: %s", line); - - if (start != (void *)0xffffffffff600000 || - end != (void *)0xffffffffff601000) { - printf("[FAIL]\taddress range is nonsense\n"); - nerrs++; - } - - printf("\tvsyscall permissions are %c-%c\n", r, x); - vsyscall_map_r = (r == 'r'); - vsyscall_map_x = (x == 'x'); - - found = true; - break; - } - - fclose(maps); - - if (!found) { - printf("\tno vsyscall map in /proc/self/maps\n"); - vsyscall_map_r = false; - vsyscall_map_x = false; - } - - return nerrs; -#else 
- return 0; -#endif + ksft_print_msg("[WARN] failed to find getcpu in vDSO\n"); } /* syscalls */ @@ -176,98 +113,76 @@ static inline long sys_getcpu(unsigned * cpu, unsigned * node, return syscall(SYS_getcpu, cpu, node, cache); } -static jmp_buf jmpbuf; -static volatile unsigned long segv_err; - -static void sigsegv(int sig, siginfo_t *info, void *ctx_void) -{ - ucontext_t *ctx = (ucontext_t *)ctx_void; - - segv_err = ctx->uc_mcontext.gregs[REG_ERR]; - siglongjmp(jmpbuf, 1); -} - static double tv_diff(const struct timeval *a, const struct timeval *b) { return (double)(a->tv_sec - b->tv_sec) + (double)((int)a->tv_usec - (int)b->tv_usec) * 1e-6; } -static int check_gtod(const struct timeval *tv_sys1, - const struct timeval *tv_sys2, - const struct timezone *tz_sys, - const char *which, - const struct timeval *tv_other, - const struct timezone *tz_other) +static void check_gtod(const struct timeval *tv_sys1, + const struct timeval *tv_sys2, + const struct timezone *tz_sys, + const char *which, + const struct timeval *tv_other, + const struct timezone *tz_other) { - int nerrs = 0; double d1, d2; - if (tz_other && (tz_sys->tz_minuteswest != tz_other->tz_minuteswest || tz_sys->tz_dsttime != tz_other->tz_dsttime)) { - printf("[FAIL] %s tz mismatch\n", which); - nerrs++; - } + if (tz_other && (tz_sys->tz_minuteswest != tz_other->tz_minuteswest || + tz_sys->tz_dsttime != tz_other->tz_dsttime)) + ksft_print_msg("%s tz mismatch\n", which); d1 = tv_diff(tv_other, tv_sys1); d2 = tv_diff(tv_sys2, tv_other); - printf("\t%s time offsets: %lf %lf\n", which, d1, d2); - if (d1 < 0 || d2 < 0) { - printf("[FAIL]\t%s time was inconsistent with the syscall\n", which); - nerrs++; - } else { - printf("[OK]\t%s gettimeofday()'s timeval was okay\n", which); - } + ksft_print_msg("%s time offsets: %lf %lf\n", which, d1, d2); - return nerrs; + ksft_test_result(!(d1 < 0 || d2 < 0), "%s gettimeofday()'s timeval\n", which); } -static int test_gtod(void) +static void test_gtod(void) { struct 
timeval tv_sys1, tv_sys2, tv_vdso, tv_vsys; struct timezone tz_sys, tz_vdso, tz_vsys; long ret_vdso = -1; long ret_vsys = -1; - int nerrs = 0; - printf("[RUN]\ttest gettimeofday()\n"); + ksft_print_msg("test gettimeofday()\n"); if (sys_gtod(&tv_sys1, &tz_sys) != 0) - err(1, "syscall gettimeofday"); + ksft_exit_fail_msg("syscall gettimeofday: %s\n", strerror(errno)); if (vdso_gtod) ret_vdso = vdso_gtod(&tv_vdso, &tz_vdso); if (vsyscall_map_x) ret_vsys = vgtod(&tv_vsys, &tz_vsys); if (sys_gtod(&tv_sys2, &tz_sys) != 0) - err(1, "syscall gettimeofday"); + ksft_exit_fail_msg("syscall gettimeofday: %s\n", strerror(errno)); if (vdso_gtod) { - if (ret_vdso == 0) { - nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vDSO", &tv_vdso, &tz_vdso); - } else { - printf("[FAIL]\tvDSO gettimeofday() failed: %ld\n", ret_vdso); - nerrs++; - } + if (ret_vdso == 0) + check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vDSO", &tv_vdso, &tz_vdso); + else + ksft_test_result_fail("vDSO gettimeofday() failed: %ld\n", ret_vdso); + } else { + ksft_test_result_skip("vdso_gtod isn't set\n"); } if (vsyscall_map_x) { - if (ret_vsys == 0) { - nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vsyscall", &tv_vsys, &tz_vsys); - } else { - printf("[FAIL]\tvsys gettimeofday() failed: %ld\n", ret_vsys); - nerrs++; - } + if (ret_vsys == 0) + check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vsyscall", &tv_vsys, &tz_vsys); + else + ksft_test_result_fail("vsys gettimeofday() failed: %ld\n", ret_vsys); + } else { + ksft_test_result_skip("vsyscall_map_x isn't set\n"); } - - return nerrs; } -static int test_time(void) { - int nerrs = 0; - - printf("[RUN]\ttest time()\n"); +static void test_time(void) +{ long t_sys1, t_sys2, t_vdso = 0, t_vsys = 0; long t2_sys1 = -1, t2_sys2 = -1, t2_vdso = -1, t2_vsys = -1; + + ksft_print_msg("test time()\n"); t_sys1 = sys_time(&t2_sys1); if (vdso_time) t_vdso = vdso_time(&t2_vdso); @@ -275,56 +190,60 @@ static int test_time(void) { t_vsys = vtime(&t2_vsys); t_sys2 = sys_time(&t2_sys2); if 
(t_sys1 < 0 || t_sys1 != t2_sys1 || t_sys2 < 0 || t_sys2 != t2_sys2) { - printf("[FAIL]\tsyscall failed (ret1:%ld output1:%ld ret2:%ld output2:%ld)\n", t_sys1, t2_sys1, t_sys2, t2_sys2); - nerrs++; - return nerrs; + ksft_print_msg("syscall failed (ret1:%ld output1:%ld ret2:%ld output2:%ld)\n", + t_sys1, t2_sys1, t_sys2, t2_sys2); + ksft_test_result_skip("vdso_time\n"); + ksft_test_result_skip("vdso_time\n"); + return; } if (vdso_time) { - if (t_vdso < 0 || t_vdso != t2_vdso) { - printf("[FAIL]\tvDSO failed (ret:%ld output:%ld)\n", t_vdso, t2_vdso); - nerrs++; - } else if (t_vdso < t_sys1 || t_vdso > t_sys2) { - printf("[FAIL]\tvDSO returned the wrong time (%ld %ld %ld)\n", t_sys1, t_vdso, t_sys2); - nerrs++; - } else { - printf("[OK]\tvDSO time() is okay\n"); - } + if (t_vdso < 0 || t_vdso != t2_vdso) + ksft_test_result_fail("vDSO failed (ret:%ld output:%ld)\n", + t_vdso, t2_vdso); + else if (t_vdso < t_sys1 || t_vdso > t_sys2) + ksft_test_result_fail("vDSO returned the wrong time (%ld %ld %ld)\n", + t_sys1, t_vdso, t_sys2); + else + ksft_test_result_pass("vDSO time() is okay\n"); + } else { + ksft_test_result_skip("vdso_time isn't set\n"); } if (vsyscall_map_x) { - if (t_vsys < 0 || t_vsys != t2_vsys) { - printf("[FAIL]\tvsyscall failed (ret:%ld output:%ld)\n", t_vsys, t2_vsys); - nerrs++; - } else if (t_vsys < t_sys1 || t_vsys > t_sys2) { - printf("[FAIL]\tvsyscall returned the wrong time (%ld %ld %ld)\n", t_sys1, t_vsys, t_sys2); - nerrs++; - } else { - printf("[OK]\tvsyscall time() is okay\n"); - } + if (t_vsys < 0 || t_vsys != t2_vsys) + ksft_test_result_fail("vsyscall failed (ret:%ld output:%ld)\n", + t_vsys, t2_vsys); + else if (t_vsys < t_sys1 || t_vsys > t_sys2) + ksft_test_result_fail("vsyscall returned the wrong time (%ld %ld %ld)\n", + t_sys1, t_vsys, t_sys2); + else + ksft_test_result_pass("vsyscall time() is okay\n"); + } else { + ksft_test_result_skip("vsyscall_map_x isn't set\n"); } - - return nerrs; } -static int test_getcpu(int cpu) +static void 
test_getcpu(int cpu) { - int nerrs = 0; + unsigned int cpu_sys, cpu_vdso, cpu_vsys, node_sys, node_vdso, node_vsys; long ret_sys, ret_vdso = -1, ret_vsys = -1; + unsigned int node = 0; + bool have_node = false; + cpu_set_t cpuset; - printf("[RUN]\tgetcpu() on CPU %d\n", cpu); + ksft_print_msg("getcpu() on CPU %d\n", cpu); - cpu_set_t cpuset; CPU_ZERO(&cpuset); CPU_SET(cpu, &cpuset); if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) { - printf("[SKIP]\tfailed to force CPU %d\n", cpu); - return nerrs; + ksft_print_msg("failed to force CPU %d\n", cpu); + ksft_test_result_skip("vdso_getcpu\n"); + ksft_test_result_skip("vsyscall_map_x\n"); + + return; } - unsigned cpu_sys, cpu_vdso, cpu_vsys, node_sys, node_vdso, node_vsys; - unsigned node = 0; - bool have_node = false; ret_sys = sys_getcpu(&cpu_sys, &node_sys, 0); if (vdso_getcpu) ret_vdso = vdso_getcpu(&cpu_vdso, &node_vdso, 0); @@ -332,10 +251,9 @@ static int test_getcpu(int cpu) ret_vsys = vgetcpu(&cpu_vsys, &node_vsys, 0); if (ret_sys == 0) { - if (cpu_sys != cpu) { - printf("[FAIL]\tsyscall reported CPU %hu but should be %d\n", cpu_sys, cpu); - nerrs++; - } + if (cpu_sys != cpu) + ksft_print_msg("syscall reported CPU %hu but should be %d\n", + cpu_sys, cpu); have_node = true; node = node_sys; @@ -343,63 +261,84 @@ static int test_getcpu(int cpu) if (vdso_getcpu) { if (ret_vdso) { - printf("[FAIL]\tvDSO getcpu() failed\n"); - nerrs++; + ksft_test_result_fail("vDSO getcpu() failed\n"); } else { if (!have_node) { have_node = true; node = node_vdso; } - if (cpu_vdso != cpu) { - printf("[FAIL]\tvDSO reported CPU %hu but should be %d\n", cpu_vdso, cpu); - nerrs++; - } else { - printf("[OK]\tvDSO reported correct CPU\n"); - } - - if (node_vdso != node) { - printf("[FAIL]\tvDSO reported node %hu but should be %hu\n", node_vdso, node); - nerrs++; + if (cpu_vdso != cpu || node_vdso != node) { + if (cpu_vdso != cpu) + ksft_print_msg("vDSO reported CPU %hu but should be %d\n", + cpu_vdso, cpu); + if (node_vdso != node) 
+ ksft_print_msg("vDSO reported node %hu but should be %hu\n", + node_vdso, node); + ksft_test_result_fail("Wrong values\n"); } else { - printf("[OK]\tvDSO reported correct node\n"); + ksft_test_result_pass("vDSO reported correct CPU and node\n"); } } + } else { + ksft_test_result_skip("vdso_getcpu isn't set\n"); } if (vsyscall_map_x) { if (ret_vsys) { - printf("[FAIL]\tvsyscall getcpu() failed\n"); - nerrs++; + ksft_test_result_fail("vsyscall getcpu() failed\n"); } else { if (!have_node) { have_node = true; node = node_vsys; } - if (cpu_vsys != cpu) { - printf("[FAIL]\tvsyscall reported CPU %hu but should be %d\n", cpu_vsys, cpu); - nerrs++; + if (cpu_vsys != cpu || node_vsys != node) { + if (cpu_vsys != cpu) + ksft_print_msg("vsyscall reported CPU %hu but should be %d\n", + cpu_vsys, cpu); + if (node_vsys != node) + ksft_print_msg("vsyscall reported node %hu but should be %hu\n", + node_vsys, node); + ksft_test_result_fail("Wrong values\n"); } else { - printf("[OK]\tvsyscall reported correct CPU\n"); - } - - if (node_vsys != node) { - printf("[FAIL]\tvsyscall reported node %hu but should be %hu\n", node_vsys, node); - nerrs++; - } else { - printf("[OK]\tvsyscall reported correct node\n"); + ksft_test_result_pass("vsyscall reported correct CPU and node\n"); } } + } else { + ksft_test_result_skip("vsyscall_map_x isn't set\n"); } +} + +#ifdef __x86_64__ + +static jmp_buf jmpbuf; +static volatile unsigned long segv_err; + +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), + int flags) +{ + struct sigaction sa; - return nerrs; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + ksft_exit_fail_msg("sigaction failed\n"); } -static int test_vsys_r(void) +static void sigsegv(int sig, siginfo_t *info, void *ctx_void) { -#ifdef __x86_64__ - printf("[RUN]\tChecking read access to the vsyscall page\n"); + ucontext_t *ctx = (ucontext_t *)ctx_void; + + 
segv_err = ctx->uc_mcontext.gregs[REG_ERR]; + siglongjmp(jmpbuf, 1); +} + +static void test_vsys_r(void) +{ + ksft_print_msg("Checking read access to the vsyscall page\n"); bool can_read; if (sigsetjmp(jmpbuf, 1) == 0) { *(volatile int *)0xffffffffff600000; @@ -408,32 +347,25 @@ static int test_vsys_r(void) can_read = false; } - if (can_read && !vsyscall_map_r) { - printf("[FAIL]\tWe have read access, but we shouldn't\n"); - return 1; - } else if (!can_read && vsyscall_map_r) { - printf("[FAIL]\tWe don't have read access, but we should\n"); - return 1; - } else if (can_read) { - printf("[OK]\tWe have read access\n"); - } else { - printf("[OK]\tWe do not have read access: #PF(0x%lx)\n", - segv_err); - } -#endif - - return 0; + if (can_read && !vsyscall_map_r) + ksft_test_result_fail("We have read access, but we shouldn't\n"); + else if (!can_read && vsyscall_map_r) + ksft_test_result_fail("We don't have read access, but we should\n"); + else if (can_read) + ksft_test_result_pass("We have read access\n"); + else + ksft_test_result_pass("We do not have read access: #PF(0x%lx)\n", segv_err); } -static int test_vsys_x(void) +static void test_vsys_x(void) { -#ifdef __x86_64__ if (vsyscall_map_x) { /* We already tested this adequately. 
*/ - return 0; + ksft_test_result_pass("vsyscall_map_x is true\n"); + return; } - printf("[RUN]\tMake sure that vsyscalls really page fault\n"); + ksft_print_msg("Make sure that vsyscalls really page fault\n"); bool can_exec; if (sigsetjmp(jmpbuf, 1) == 0) { @@ -443,20 +375,14 @@ static int test_vsys_x(void) can_exec = false; } - if (can_exec) { - printf("[FAIL]\tExecuting the vsyscall did not page fault\n"); - return 1; - } else if (segv_err & (1 << 4)) { /* INSTR */ - printf("[OK]\tExecuting the vsyscall page failed: #PF(0x%lx)\n", - segv_err); - } else { - printf("[FAIL]\tExecution failed with the wrong error: #PF(0x%lx)\n", - segv_err); - return 1; - } -#endif - - return 0; + if (can_exec) + ksft_test_result_fail("Executing the vsyscall did not page fault\n"); + else if (segv_err & (1 << 4)) /* INSTR */ + ksft_test_result_pass("Executing the vsyscall page failed: #PF(0x%lx)\n", + segv_err); + else + ksft_test_result_fail("Execution failed with the wrong error: #PF(0x%lx)\n", + segv_err); } /* @@ -470,14 +396,13 @@ static int test_vsys_x(void) * fact that ptrace() ever worked was a nice courtesy of old kernels, * but the code to support it is fairly gross. */ -static int test_process_vm_readv(void) +static void test_process_vm_readv(void) { -#ifdef __x86_64__ char buf[4096]; struct iovec local, remote; int ret; - printf("[RUN]\tprocess_vm_readv() from vsyscall page\n"); + ksft_print_msg("process_vm_readv() from vsyscall page\n"); local.iov_base = buf; local.iov_len = 4096; @@ -489,27 +414,71 @@ static int test_process_vm_readv(void) * We expect process_vm_readv() to work if and only if the * vsyscall page is readable. */ - printf("[%s]\tprocess_vm_readv() failed (ret = %d, errno = %d)\n", vsyscall_map_r ? "FAIL" : "OK", ret, errno); - return vsyscall_map_r ? 
1 : 0; + ksft_test_result(!vsyscall_map_r, + "process_vm_readv() failed (ret = %d, errno = %d)\n", ret, errno); + return; } - if (vsyscall_map_r) { - if (!memcmp(buf, remote.iov_base, sizeof(buf))) { - printf("[OK]\tIt worked and read correct data\n"); - } else { - printf("[FAIL]\tIt worked but returned incorrect data\n"); - return 1; + if (vsyscall_map_r) + ksft_test_result(!memcmp(buf, remote.iov_base, sizeof(buf)), "Read data\n"); + else + ksft_test_result_fail("process_rm_readv() succeeded, but it should have failed in this configuration\n"); +} + +static void init_vsys(void) +{ + int nerrs = 0; + FILE *maps; + char line[MAPS_LINE_LEN]; + bool found = false; + + maps = fopen("/proc/self/maps", "r"); + if (!maps) { + ksft_test_result_skip("Could not open /proc/self/maps -- assuming vsyscall is r-x\n"); + vsyscall_map_r = true; + return; + } + + while (fgets(line, MAPS_LINE_LEN, maps)) { + char r, x; + void *start, *end; + char name[MAPS_LINE_LEN]; + + /* sscanf() is safe here as strlen(name) >= strlen(line) */ + if (sscanf(line, "%p-%p %c-%cp %*x %*x:%*x %*u %s", + &start, &end, &r, &x, name) != 5) + continue; + + if (strcmp(name, "[vsyscall]")) + continue; + + ksft_print_msg("vsyscall map: %s", line); + + if (start != (void *)0xffffffffff600000 || + end != (void *)0xffffffffff601000) { + ksft_print_msg("address range is nonsense\n"); + nerrs++; } - } else { - printf("[FAIL]\tprocess_rm_readv() succeeded, but it should have failed in this configuration\n"); - return 1; + + ksft_print_msg("vsyscall permissions are %c-%c\n", r, x); + vsyscall_map_r = (r == 'r'); + vsyscall_map_x = (x == 'x'); + + found = true; + break; } -#endif - return 0; + fclose(maps); + + if (!found) { + ksft_print_msg("no vsyscall map in /proc/self/maps\n"); + vsyscall_map_r = false; + vsyscall_map_x = false; + } + + ksft_test_result(!nerrs, "vsyscall map\n"); } -#ifdef __x86_64__ static volatile sig_atomic_t num_vsyscall_traps; static void sigtrap(int sig, siginfo_t *info, void *ctx_void) 
@@ -521,15 +490,17 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void) num_vsyscall_traps++; } -static int test_emulation(void) +static void test_emulation(void) { time_t tmp; bool is_native; - if (!vsyscall_map_x) - return 0; + if (!vsyscall_map_x) { + ksft_test_result_skip("vsyscall_map_x isn't set\n"); + return; + } - printf("[RUN]\tchecking that vsyscalls are emulated\n"); + ksft_print_msg("checking that vsyscalls are emulated\n"); sethandler(SIGTRAP, sigtrap, 0); set_eflags(get_eflags() | X86_EFLAGS_TF); vtime(&tmp); @@ -545,36 +516,35 @@ static int test_emulation(void) */ is_native = (num_vsyscall_traps > 1); - printf("[%s]\tvsyscalls are %s (%d instructions in vsyscall page)\n", - (is_native ? "FAIL" : "OK"), - (is_native ? "native" : "emulated"), - (int)num_vsyscall_traps); - - return is_native; + ksft_test_result(!is_native, "vsyscalls are %s (%d instructions in vsyscall page)\n", + (is_native ? "native" : "emulated"), (int)num_vsyscall_traps); } #endif int main(int argc, char **argv) { - int nerrs = 0; + int total_tests = TOTAL_TESTS; - init_vdso(); - nerrs += init_vsys(); + ksft_print_header(); + ksft_set_plan(total_tests); - nerrs += test_gtod(); - nerrs += test_time(); - nerrs += test_getcpu(0); - nerrs += test_getcpu(1); - - sethandler(SIGSEGV, sigsegv, 0); - nerrs += test_vsys_r(); - nerrs += test_vsys_x(); + init_vdso(); +#ifdef __x86_64__ + init_vsys(); +#endif - nerrs += test_process_vm_readv(); + test_gtod(); + test_time(); + test_getcpu(0); + test_getcpu(1); #ifdef __x86_64__ - nerrs += test_emulation(); + sethandler(SIGSEGV, sigsegv, 0); + test_vsys_r(); + test_vsys_x(); + test_process_vm_readv(); + test_emulation(); #endif - return nerrs ? 
1 : 0; + ksft_finished(); } diff --git a/tools/tracing/rtla/Makefile.config b/tools/tracing/rtla/Makefile.config index 6d4ba77847..0b7ecfb30d 100644 --- a/tools/tracing/rtla/Makefile.config +++ b/tools/tracing/rtla/Makefile.config @@ -3,7 +3,7 @@ STOP_ERROR := LIBTRACEEVENT_MIN_VERSION = 1.5 -LIBTRACEFS_MIN_VERSION = 1.3 +LIBTRACEFS_MIN_VERSION = 1.6 define lib_setup $(eval LIB_INCLUDES += $(shell sh -c "$(PKG_CONFIG) --cflags lib$(1)")) diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c index 01870d5094..7be17d09f7 100644 --- a/tools/tracing/rtla/src/osnoise_hist.c +++ b/tools/tracing/rtla/src/osnoise_hist.c @@ -36,13 +36,14 @@ struct osnoise_hist_params { cpu_set_t hk_cpu_set; struct sched_attr sched_param; struct trace_events *events; - char no_header; char no_summary; char no_index; char with_zeros; int bucket_size; int entries; + int warmup; + int buffer_size; }; struct osnoise_hist_cpu { @@ -436,9 +437,9 @@ static void osnoise_hist_usage(char *usage) static const char * const msg[] = { "", " usage: rtla osnoise hist [-h] [-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\", - " [-T us] [-t[=file]] [-e sys[:event]] [--filter ] [--trigger ] \\", + " [-T us] [-t[file]] [-e sys[:event]] [--filter ] [--trigger ] \\", " [-c cpu-list] [-H cpu-list] [-P priority] [-b N] [-E N] [--no-header] [--no-summary] \\", - " [--no-index] [--with-zeros] [-C[=cgroup_name]]", + " [--no-index] [--with-zeros] [-C[=cgroup_name]] [--warm-up]", "", " -h/--help: print this menu", " -a/--auto: set automatic trace mode, stopping the session if argument in us sample is hit", @@ -452,7 +453,7 @@ static void osnoise_hist_usage(char *usage) " -C/--cgroup[=cgroup_name]: set cgroup, if no cgroup_name is passed, the rtla's cgroup will be inherited", " -d/--duration time[s|m|h|d]: duration of the session", " -D/--debug: print debug info", - " -t/--trace[=file]: save the stopped trace to [file|osnoise_trace.txt]", + " -t/--trace[file]: save the stopped 
trace to [file|osnoise_trace.txt]", " -e/--event : enable the in the trace instance, multiple -e are allowed", " --filter : enable a trace event filter to the previous -e event", " --trigger : enable a trace event trigger to the previous -e event", @@ -468,6 +469,8 @@ static void osnoise_hist_usage(char *usage) " f:prio - use SCHED_FIFO with prio", " d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period", " in nanoseconds", + " --warm-up: let the workload run for s seconds before collecting data", + " --trace-buffer-size kB: set the per-cpu trace buffer size in kB", NULL, }; @@ -531,13 +534,15 @@ static struct osnoise_hist_params {"with-zeros", no_argument, 0, '3'}, {"trigger", required_argument, 0, '4'}, {"filter", required_argument, 0, '5'}, + {"warm-up", required_argument, 0, '6'}, + {"trace-buffer-size", required_argument, 0, '7'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; - c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:p:P:r:s:S:t::T:01234:5:", + c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:p:P:r:s:S:t::T:01234:5:6:7:", long_options, &option_index); /* detect the end of the options. 
*/ @@ -640,9 +645,13 @@ static struct osnoise_hist_params params->threshold = get_llong_from_str(optarg); break; case 't': - if (optarg) - /* skip = */ - params->trace_output = &optarg[1]; + if (optarg) { + if (optarg[0] == '=') + params->trace_output = &optarg[1]; + else + params->trace_output = &optarg[0]; + } else if (optind < argc && argv[optind][0] != '0') + params->trace_output = argv[optind]; else params->trace_output = "osnoise_trace.txt"; break; @@ -680,6 +689,12 @@ static struct osnoise_hist_params osnoise_hist_usage("--filter requires a previous -e\n"); } break; + case '6': + params->warmup = get_llong_from_str(optarg); + break; + case '7': + params->buffer_size = get_llong_from_str(optarg); + break; default: osnoise_hist_usage("Invalid option"); } @@ -886,6 +901,11 @@ int osnoise_hist_main(int argc, char *argv[]) goto out_hist; } + if (params->buffer_size > 0) { + retval = trace_set_buffer_size(&record->trace, params->buffer_size); + if (retval) + goto out_hist; + } } /* @@ -899,6 +919,25 @@ int osnoise_hist_main(int argc, char *argv[]) trace_instance_start(&record->trace); trace_instance_start(trace); + if (params->warmup > 0) { + debug_msg("Warming up for %d seconds\n", params->warmup); + sleep(params->warmup); + if (stop_tracing) + goto out_hist; + + /* + * Clean up the buffer. The osnoise workload do not run + * with tracing off to avoid creating a performance penalty + * when not needed. 
+ */ + retval = tracefs_instance_file_write(trace->inst, "trace", ""); + if (retval < 0) { + debug_msg("Error cleaning up the buffer"); + goto out_hist; + } + + } + tool->start_time = time(NULL); osnoise_hist_set_signals(params); diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c index 457360db07..07ba55d4ec 100644 --- a/tools/tracing/rtla/src/osnoise_top.c +++ b/tools/tracing/rtla/src/osnoise_top.c @@ -40,6 +40,8 @@ struct osnoise_top_params { int set_sched; int cgroup; int hk_cpus; + int warmup; + int buffer_size; cpu_set_t hk_cpu_set; struct sched_attr sched_param; struct trace_events *events; @@ -281,8 +283,8 @@ static void osnoise_top_usage(struct osnoise_top_params *params, char *usage) static const char * const msg[] = { " [-h] [-q] [-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\", - " [-T us] [-t[=file]] [-e sys[:event]] [--filter ] [--trigger ] \\", - " [-c cpu-list] [-H cpu-list] [-P priority] [-C[=cgroup_name]]", + " [-T us] [-t[file]] [-e sys[:event]] [--filter ] [--trigger ] \\", + " [-c cpu-list] [-H cpu-list] [-P priority] [-C[=cgroup_name]] [--warm-up s]", "", " -h/--help: print this menu", " -a/--auto: set automatic trace mode, stopping the session if argument in us sample is hit", @@ -296,7 +298,7 @@ static void osnoise_top_usage(struct osnoise_top_params *params, char *usage) " -C/--cgroup[=cgroup_name]: set cgroup, if no cgroup_name is passed, the rtla's cgroup will be inherited", " -d/--duration time[s|m|h|d]: duration of the session", " -D/--debug: print debug info", - " -t/--trace[=file]: save the stopped trace to [file|osnoise_trace.txt]", + " -t/--trace[file]: save the stopped trace to [file|osnoise_trace.txt]", " -e/--event : enable the in the trace instance, multiple -e are allowed", " --filter : enable a trace event filter to the previous -e event", " --trigger : enable a trace event trigger to the previous -e event", @@ -307,6 +309,8 @@ static void osnoise_top_usage(struct osnoise_top_params 
*params, char *usage) " f:prio - use SCHED_FIFO with prio", " d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period", " in nanoseconds", + " --warm-up s: let the workload run for s seconds before collecting data", + " --trace-buffer-size kB: set the per-cpu trace buffer size in kB", NULL, }; @@ -381,13 +385,15 @@ struct osnoise_top_params *osnoise_top_parse_args(int argc, char **argv) {"trace", optional_argument, 0, 't'}, {"trigger", required_argument, 0, '0'}, {"filter", required_argument, 0, '1'}, + {"warm-up", required_argument, 0, '2'}, + {"trace-buffer-size", required_argument, 0, '3'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; - c = getopt_long(argc, argv, "a:c:C::d:De:hH:p:P:qr:s:S:t::T:0:1:", + c = getopt_long(argc, argv, "a:c:C::d:De:hH:p:P:qr:s:S:t::T:0:1:2:3:", long_options, &option_index); /* Detect the end of the options. */ @@ -480,9 +486,13 @@ struct osnoise_top_params *osnoise_top_parse_args(int argc, char **argv) params->stop_total_us = get_llong_from_str(optarg); break; case 't': - if (optarg) - /* skip = */ - params->trace_output = &optarg[1]; + if (optarg) { + if (optarg[0] == '=') + params->trace_output = &optarg[1]; + else + params->trace_output = &optarg[0]; + } else if (optind < argc && argv[optind][0] != '-') + params->trace_output = argv[optind]; else params->trace_output = "osnoise_trace.txt"; break; @@ -511,6 +521,12 @@ struct osnoise_top_params *osnoise_top_parse_args(int argc, char **argv) osnoise_top_usage(params, "--filter requires a previous -e\n"); } break; + case '2': + params->warmup = get_llong_from_str(optarg); + break; + case '3': + params->buffer_size = get_llong_from_str(optarg); + break; default: osnoise_top_usage(params, "Invalid option"); } @@ -719,6 +735,12 @@ int osnoise_top_main(int argc, char **argv) if (retval) goto out_top; } + + if (params->buffer_size > 0) { + retval = trace_set_buffer_size(&record->trace, params->buffer_size); + if (retval) + 
goto out_top; + } } /* @@ -732,6 +754,25 @@ int osnoise_top_main(int argc, char **argv) trace_instance_start(&record->trace); trace_instance_start(trace); + if (params->warmup > 0) { + debug_msg("Warming up for %d seconds\n", params->warmup); + sleep(params->warmup); + if (stop_tracing) + goto out_top; + + /* + * Clean up the buffer. The osnoise workload do not run + * with tracing off to avoid creating a performance penalty + * when not needed. + */ + retval = tracefs_instance_file_write(trace->inst, "trace", ""); + if (retval < 0) { + debug_msg("Error cleaning up the buffer"); + goto out_top; + } + + } + tool->start_time = time(NULL); osnoise_top_set_signals(params); diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c index 5b869caed1..a3907c390d 100644 --- a/tools/tracing/rtla/src/timerlat_hist.c +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -40,6 +40,7 @@ struct timerlat_hist_params { int no_aa; int dump_tasks; int user_workload; + int kernel_workload; int user_hist; cpu_set_t hk_cpu_set; struct sched_attr sched_param; @@ -52,6 +53,8 @@ struct timerlat_hist_params { char with_zeros; int bucket_size; int entries; + int warmup; + int buffer_size; }; struct timerlat_hist_cpu { @@ -425,8 +428,135 @@ timerlat_print_summary(struct timerlat_hist_params *params, trace_seq_reset(trace->seq); } +static void +timerlat_print_stats_all(struct timerlat_hist_params *params, + struct trace_instance *trace, + struct timerlat_hist_data *data) +{ + struct timerlat_hist_cpu *cpu_data; + struct timerlat_hist_cpu sum; + int cpu; + + if (params->no_summary) + return; + + memset(&sum, 0, sizeof(sum)); + sum.min_irq = ~0; + sum.min_thread = ~0; + sum.min_user = ~0; + + for (cpu = 0; cpu < data->nr_cpus; cpu++) { + if (params->cpus && !CPU_ISSET(cpu, ¶ms->monitored_cpus)) + continue; + + if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) + continue; + + cpu_data = &data->hist[cpu]; + + sum.irq_count += cpu_data->irq_count; + 
update_min(&sum.min_irq, &cpu_data->min_irq); + update_sum(&sum.sum_irq, &cpu_data->sum_irq); + update_max(&sum.max_irq, &cpu_data->max_irq); + + sum.thread_count += cpu_data->thread_count; + update_min(&sum.min_thread, &cpu_data->min_thread); + update_sum(&sum.sum_thread, &cpu_data->sum_thread); + update_max(&sum.max_thread, &cpu_data->max_thread); + + sum.user_count += cpu_data->user_count; + update_min(&sum.min_user, &cpu_data->min_user); + update_sum(&sum.sum_user, &cpu_data->sum_user); + update_max(&sum.max_user, &cpu_data->max_user); + } + + if (!params->no_index) + trace_seq_printf(trace->seq, "ALL: "); + + if (!params->no_irq) + trace_seq_printf(trace->seq, " IRQ"); + + if (!params->no_thread) + trace_seq_printf(trace->seq, " Thr"); + + if (params->user_hist) + trace_seq_printf(trace->seq, " Usr"); + + trace_seq_printf(trace->seq, "\n"); + + if (!params->no_index) + trace_seq_printf(trace->seq, "count:"); + + if (!params->no_irq) + trace_seq_printf(trace->seq, "%9d ", + sum.irq_count); + + if (!params->no_thread) + trace_seq_printf(trace->seq, "%9d ", + sum.thread_count); + + if (params->user_hist) + trace_seq_printf(trace->seq, "%9d ", + sum.user_count); + + trace_seq_printf(trace->seq, "\n"); + + if (!params->no_index) + trace_seq_printf(trace->seq, "min: "); + + if (!params->no_irq) + trace_seq_printf(trace->seq, "%9llu ", + sum.min_irq); + + if (!params->no_thread) + trace_seq_printf(trace->seq, "%9llu ", + sum.min_thread); + + if (params->user_hist) + trace_seq_printf(trace->seq, "%9llu ", + sum.min_user); + + trace_seq_printf(trace->seq, "\n"); + + if (!params->no_index) + trace_seq_printf(trace->seq, "avg: "); + + if (!params->no_irq) + trace_seq_printf(trace->seq, "%9llu ", + sum.sum_irq / sum.irq_count); + + if (!params->no_thread) + trace_seq_printf(trace->seq, "%9llu ", + sum.sum_thread / sum.thread_count); + + if (params->user_hist) + trace_seq_printf(trace->seq, "%9llu ", + sum.sum_user / sum.user_count); + + trace_seq_printf(trace->seq, "\n"); 
+ + if (!params->no_index) + trace_seq_printf(trace->seq, "max: "); + + if (!params->no_irq) + trace_seq_printf(trace->seq, "%9llu ", + sum.max_irq); + + if (!params->no_thread) + trace_seq_printf(trace->seq, "%9llu ", + sum.max_thread); + + if (params->user_hist) + trace_seq_printf(trace->seq, "%9llu ", + sum.max_user); + + trace_seq_printf(trace->seq, "\n"); + trace_seq_do_printf(trace->seq); + trace_seq_reset(trace->seq); +} + /* - * timerlat_print_stats - print data for all CPUs + * timerlat_print_stats - print data for each CPUs */ static void timerlat_print_stats(struct timerlat_hist_params *params, struct osnoise_tool *tool) @@ -509,6 +639,7 @@ timerlat_print_stats(struct timerlat_hist_params *params, struct osnoise_tool *t trace_seq_reset(trace->seq); timerlat_print_summary(params, trace, data); + timerlat_print_stats_all(params, trace, data); } /* @@ -521,9 +652,10 @@ static void timerlat_hist_usage(char *usage) char *msg[] = { "", " usage: [rtla] timerlat hist [-h] [-q] [-d s] [-D] [-n] [-a us] [-p us] [-i us] [-T us] [-s us] \\", - " [-t[=file]] [-e sys[:event]] [--filter ] [--trigger ] [-c cpu-list] [-H cpu-list]\\", + " [-t[file]] [-e sys[:event]] [--filter ] [--trigger ] [-c cpu-list] [-H cpu-list]\\", " [-P priority] [-E N] [-b N] [--no-irq] [--no-thread] [--no-header] [--no-summary] \\", - " [--no-index] [--with-zeros] [--dma-latency us] [-C[=cgroup_name]] [--no-aa] [--dump-task] [-u]", + " [--no-index] [--with-zeros] [--dma-latency us] [-C[=cgroup_name]] [--no-aa] [--dump-task] [-u|-k]", + " [--warm-up s]", "", " -h/--help: print this menu", " -a/--auto: set automatic trace mode, stopping the session if argument in us latency is hit", @@ -537,7 +669,7 @@ static void timerlat_hist_usage(char *usage) " -d/--duration time[m|h|d]: duration of the session in seconds", " --dump-tasks: prints the task running on all CPUs if stop conditions are met (depends on !--no-aa)", " -D/--debug: print debug info", - " -t/--trace[=file]: save the stopped trace to 
[file|timerlat_trace.txt]", + " -t/--trace[file]: save the stopped trace to [file|timerlat_trace.txt]", " -e/--event : enable the in the trace instance, multiple -e are allowed", " --filter : enable a trace event filter to the previous -e event", " --trigger : enable a trace event trigger to the previous -e event", @@ -558,8 +690,11 @@ static void timerlat_hist_usage(char *usage) " f:prio - use SCHED_FIFO with prio", " d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period", " in nanoseconds", - " -u/--user-threads: use rtla user-space threads instead of in-kernel timerlat threads", + " -u/--user-threads: use rtla user-space threads instead of kernel-space timerlat threads", + " -k/--kernel-threads: use timerlat kernel-space threads instead of rtla user-space threads", " -U/--user-load: enable timerlat for user-defined user-space workload", + " --warm-up s: let the workload run for s seconds before collecting data", + " --trace-buffer-size kB: set the per-cpu trace buffer size in kB", NULL, }; @@ -621,6 +756,7 @@ static struct timerlat_hist_params {"thread", required_argument, 0, 'T'}, {"trace", optional_argument, 0, 't'}, {"user-threads", no_argument, 0, 'u'}, + {"kernel-threads", no_argument, 0, 'k'}, {"user-load", no_argument, 0, 'U'}, {"event", required_argument, 0, 'e'}, {"no-irq", no_argument, 0, '0'}, @@ -634,13 +770,15 @@ static struct timerlat_hist_params {"dma-latency", required_argument, 0, '8'}, {"no-aa", no_argument, 0, '9'}, {"dump-task", no_argument, 0, '\1'}, + {"warm-up", required_argument, 0, '\2'}, + {"trace-buffer-size", required_argument, 0, '\3'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; - c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:i:np:P:s:t::T:uU0123456:7:8:9\1", + c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3", long_options, &option_index); /* detect the end of the options. 
*/ @@ -723,6 +861,9 @@ static struct timerlat_hist_params case 'i': params->stop_us = get_llong_from_str(optarg); break; + case 'k': + params->kernel_workload = 1; + break; case 'n': params->output_divisor = 1; break; @@ -744,9 +885,13 @@ static struct timerlat_hist_params params->stop_total_us = get_llong_from_str(optarg); break; case 't': - if (optarg) - /* skip = */ - params->trace_output = &optarg[1]; + if (optarg) { + if (optarg[0] == '=') + params->trace_output = &optarg[1]; + else + params->trace_output = &optarg[0]; + } else if (optind < argc && argv[optind][0] != '-') + params->trace_output = argv[optind]; else params->trace_output = "timerlat_trace.txt"; break; @@ -809,6 +954,12 @@ static struct timerlat_hist_params case '\1': params->dump_tasks = 1; break; + case '\2': + params->warmup = get_llong_from_str(optarg); + break; + case '\3': + params->buffer_size = get_llong_from_str(optarg); + break; default: timerlat_hist_usage("Invalid option"); } @@ -831,6 +982,9 @@ static struct timerlat_hist_params if (!params->stop_us && !params->stop_total_us) params->no_aa = 1; + if (params->kernel_workload && params->user_workload) + timerlat_hist_usage("--kernel-threads and --user-threads are mutually exclusive!"); + return params; } @@ -906,6 +1060,22 @@ timerlat_hist_apply_config(struct osnoise_tool *tool, struct timerlat_hist_param auto_house_keeping(¶ms->monitored_cpus); } + /* + * If the user did not specify a type of thread, try user-threads first. + * Fall back to kernel threads otherwise. 
+ */ + if (!params->kernel_workload && !params->user_workload) { + retval = tracefs_file_exists(NULL, "osnoise/per_cpu/cpu0/timerlat_fd"); + if (retval) { + debug_msg("User-space interface detected, setting user-threads\n"); + params->user_workload = 1; + params->user_hist = 1; + } else { + debug_msg("User-space interface not detected, setting kernel-threads\n"); + params->kernel_workload = 1; + } + } + if (params->user_hist) { retval = osnoise_set_workload(tool->context, 0); if (retval) { @@ -1043,6 +1213,12 @@ int timerlat_hist_main(int argc, char *argv[]) if (retval) goto out_hist; } + + if (params->buffer_size > 0) { + retval = trace_set_buffer_size(&record->trace, params->buffer_size); + if (retval) + goto out_hist; + } } if (!params->no_aa) { @@ -1063,22 +1239,6 @@ int timerlat_hist_main(int argc, char *argv[]) } } - /* - * Start the tracers here, after having set all instances. - * - * Let the trace instance start first for the case of hitting a stop - * tracing while enabling other instances. The trace instance is the - * one with most valuable information. - */ - if (params->trace_output) - trace_instance_start(&record->trace); - if (!params->no_aa) - trace_instance_start(&aa->trace); - trace_instance_start(trace); - - tool->start_time = time(NULL); - timerlat_hist_set_signals(params); - if (params->user_workload) { /* rtla asked to stop */ params_u.should_run = 1; @@ -1098,6 +1258,29 @@ int timerlat_hist_main(int argc, char *argv[]) err_msg("Error creating timerlat user-space threads\n"); } + if (params->warmup > 0) { + debug_msg("Warming up for %d seconds\n", params->warmup); + sleep(params->warmup); + if (stop_tracing) + goto out_hist; + } + + /* + * Start the tracers here, after having set all instances. + * + * Let the trace instance start first for the case of hitting a stop + * tracing while enabling other instances. The trace instance is the + * one with most valuable information. 
+ */ + if (params->trace_output) + trace_instance_start(&record->trace); + if (!params->no_aa) + trace_instance_start(&aa->trace); + trace_instance_start(trace); + + tool->start_time = time(NULL); + timerlat_hist_set_signals(params); + while (!stop_tracing) { sleep(params->sleep_time); @@ -1123,6 +1306,7 @@ int timerlat_hist_main(int argc, char *argv[]) } } } + if (params->user_workload && !params_u.stopped_running) { params_u.should_run = 0; sleep(1); diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c index 2665e0bb5f..8c16419fe2 100644 --- a/tools/tracing/rtla/src/timerlat_top.c +++ b/tools/tracing/rtla/src/timerlat_top.c @@ -44,6 +44,10 @@ struct timerlat_top_params { int hk_cpus; int user_top; int user_workload; + int kernel_workload; + int pretty_output; + int warmup; + int buffer_size; cpu_set_t hk_cpu_set; struct sched_attr sched_param; struct trace_events *events; @@ -118,6 +122,37 @@ cleanup: return NULL; } +static void +timerlat_top_reset_sum(struct timerlat_top_cpu *summary) +{ + memset(summary, 0, sizeof(*summary)); + summary->min_irq = ~0; + summary->min_thread = ~0; + summary->min_user = ~0; +} + +static void +timerlat_top_update_sum(struct osnoise_tool *tool, int cpu, struct timerlat_top_cpu *sum) +{ + struct timerlat_top_data *data = tool->data; + struct timerlat_top_cpu *cpu_data = &data->cpu_data[cpu]; + + sum->irq_count += cpu_data->irq_count; + update_min(&sum->min_irq, &cpu_data->min_irq); + update_sum(&sum->sum_irq, &cpu_data->sum_irq); + update_max(&sum->max_irq, &cpu_data->max_irq); + + sum->thread_count += cpu_data->thread_count; + update_min(&sum->min_thread, &cpu_data->min_thread); + update_sum(&sum->sum_thread, &cpu_data->sum_thread); + update_max(&sum->max_thread, &cpu_data->max_thread); + + sum->user_count += cpu_data->user_count; + update_min(&sum->min_user, &cpu_data->min_user); + update_sum(&sum->sum_user, &cpu_data->sum_user); + update_max(&sum->max_user, &cpu_data->max_user); +} + /* * 
timerlat_hist_update - record a new timerlat occurent on cpu, updating data */ @@ -179,19 +214,22 @@ timerlat_top_handler(struct trace_seq *s, struct tep_record *record, /* * timerlat_top_header - print the header of the tool output */ -static void timerlat_top_header(struct osnoise_tool *top) +static void timerlat_top_header(struct timerlat_top_params *params, struct osnoise_tool *top) { - struct timerlat_top_params *params = top->params; struct trace_seq *s = top->trace.seq; char duration[26]; get_duration(top->start_time, duration, sizeof(duration)); - trace_seq_printf(s, "\033[2;37;40m"); + if (params->pretty_output) + trace_seq_printf(s, "\033[2;37;40m"); + trace_seq_printf(s, " Timer Latency "); if (params->user_top) trace_seq_printf(s, " "); - trace_seq_printf(s, "\033[0;0;0m"); + + if (params->pretty_output) + trace_seq_printf(s, "\033[0;0;0m"); trace_seq_printf(s, "\n"); trace_seq_printf(s, "%-6s | IRQ Timer Latency (%s) | Thread Timer Latency (%s)", duration, @@ -204,11 +242,15 @@ static void timerlat_top_header(struct osnoise_tool *top) } trace_seq_printf(s, "\n"); - trace_seq_printf(s, "\033[2;30;47m"); + if (params->pretty_output) + trace_seq_printf(s, "\033[2;30;47m"); + trace_seq_printf(s, "CPU COUNT | cur min avg max | cur min avg max"); if (params->user_top) trace_seq_printf(s, " | cur min avg max"); - trace_seq_printf(s, "\033[0;0;0m"); + + if (params->pretty_output) + trace_seq_printf(s, "\033[0;0;0m"); trace_seq_printf(s, "\n"); } @@ -277,6 +319,77 @@ static void timerlat_top_print(struct osnoise_tool *top, int cpu) } } +/* + * timerlat_top_print_sum - prints the summary output + */ +static void +timerlat_top_print_sum(struct osnoise_tool *top, struct timerlat_top_cpu *summary) +{ + const char *split = "----------------------------------------"; + struct timerlat_top_params *params = top->params; + unsigned long long count = summary->irq_count; + int divisor = params->output_divisor; + struct trace_seq *s = top->trace.seq; + int e = 0; + + if 
(divisor == 0) + return; + + /* + * Skip if no data is available: is this cpu offline? + */ + if (!summary->irq_count && !summary->thread_count) + return; + + while (count > 999999) { + e++; + count /= 10; + } + + trace_seq_printf(s, "%.*s|%.*s|%.*s", 15, split, 40, split, 39, split); + if (params->user_top) + trace_seq_printf(s, "-|%.*s", 39, split); + trace_seq_printf(s, "\n"); + + trace_seq_printf(s, "ALL #%-6llu e%d |", count, e); + + if (!summary->irq_count) { + trace_seq_printf(s, " %s %s %s |", no_value, no_value, no_value); + } else { + trace_seq_printf(s, " "); + trace_seq_printf(s, "%9llu ", summary->min_irq / params->output_divisor); + trace_seq_printf(s, "%9llu ", (summary->sum_irq / summary->irq_count) / divisor); + trace_seq_printf(s, "%9llu |", summary->max_irq / divisor); + } + + if (!summary->thread_count) { + trace_seq_printf(s, "%s %s %s %s", no_value, no_value, no_value, no_value); + } else { + trace_seq_printf(s, " "); + trace_seq_printf(s, "%9llu ", summary->min_thread / divisor); + trace_seq_printf(s, "%9llu ", + (summary->sum_thread / summary->thread_count) / divisor); + trace_seq_printf(s, "%9llu", summary->max_thread / divisor); + } + + if (!params->user_top) { + trace_seq_printf(s, "\n"); + return; + } + + trace_seq_printf(s, " |"); + + if (!summary->user_count) { + trace_seq_printf(s, " %s %s %s |", no_value, no_value, no_value); + } else { + trace_seq_printf(s, " "); + trace_seq_printf(s, "%9llu ", summary->min_user / divisor); + trace_seq_printf(s, "%9llu ", + (summary->sum_user / summary->user_count) / divisor); + trace_seq_printf(s, "%9llu\n", summary->max_user / divisor); + } +} + /* * clear_terminal - clears the output terminal */ @@ -293,6 +406,7 @@ static void timerlat_print_stats(struct timerlat_top_params *params, struct osnoise_tool *top) { struct trace_instance *trace = &top->trace; + struct timerlat_top_cpu summary; static int nr_cpus = -1; int i; @@ -305,14 +419,19 @@ timerlat_print_stats(struct timerlat_top_params *params, 
struct osnoise_tool *to if (!params->quiet) clear_terminal(trace->seq); - timerlat_top_header(top); + timerlat_top_reset_sum(&summary); + + timerlat_top_header(params, top); for (i = 0; i < nr_cpus; i++) { if (params->cpus && !CPU_ISSET(i, &params->monitored_cpus)) continue; timerlat_top_print(top, i); + timerlat_top_update_sum(top, i, &summary); } + timerlat_top_print_sum(top, &summary); + trace_seq_do_printf(trace->seq); trace_seq_reset(trace->seq); } @@ -327,8 +446,8 @@ static void timerlat_top_usage(char *usage) static const char *const msg[] = { "", " usage: rtla timerlat [top] [-h] [-q] [-a us] [-d s] [-D] [-n] [-p us] [-i us] [-T us] [-s us] \\", - " [[-t[=file]] [-e sys[:event]] [--filter ] [--trigger ] [-c cpu-list] [-H cpu-list]\\", - " [-P priority] [--dma-latency us] [--aa-only us] [-C[=cgroup_name]] [-u]", + " [[-t[file]] [-e sys[:event]] [--filter ] [--trigger ] [-c cpu-list] [-H cpu-list]\\", + " [-P priority] [--dma-latency us] [--aa-only us] [-C[=cgroup_name]] [-u|-k] [--warm-up s]", "", " -h/--help: print this menu", " -a/--auto: set automatic trace mode, stopping the session if argument in us latency is hit", @@ -343,7 +462,7 @@ static void timerlat_top_usage(char *usage) " -d/--duration time[m|h|d]: duration of the session in seconds", " -D/--debug: print debug info", " --dump-tasks: prints the task running on all CPUs if stop conditions are met (depends on !--no-aa)", - " -t/--trace[=file]: save the stopped trace to [file|timerlat_trace.txt]", + " -t/--trace[file]: save the stopped trace to [file|timerlat_trace.txt]", " -e/--event : enable the in the trace instance, multiple -e are allowed", " --filter : enable a trace event filter to the previous -e event", " --trigger : enable a trace event trigger to the previous -e event", @@ -357,8 +476,11 @@ static void timerlat_top_usage(char *usage) " f:prio - use SCHED_FIFO with prio", " d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period", " in nanoseconds", - " 
-u/--user-threads: use rtla user-space threads instead of in-kernel timerlat threads", + " -u/--user-threads: use rtla user-space threads instead of kernel-space timerlat threads", + " -k/--kernel-threads: use timerlat kernel-space threads instead of rtla user-space threads", " -U/--user-load: enable timerlat for user-defined user-space workload", + " --warm-up s: let the workload run for s seconds before collecting data", + " --trace-buffer-size kB: set the per-cpu trace buffer size in kB", NULL, }; @@ -418,6 +540,7 @@ static struct timerlat_top_params {"thread", required_argument, 0, 'T'}, {"trace", optional_argument, 0, 't'}, {"user-threads", no_argument, 0, 'u'}, + {"kernel-threads", no_argument, 0, 'k'}, {"user-load", no_argument, 0, 'U'}, {"trigger", required_argument, 0, '0'}, {"filter", required_argument, 0, '1'}, @@ -425,13 +548,15 @@ static struct timerlat_top_params {"no-aa", no_argument, 0, '3'}, {"dump-tasks", no_argument, 0, '4'}, {"aa-only", required_argument, 0, '5'}, + {"warm-up", required_argument, 0, '6'}, + {"trace-buffer-size", required_argument, 0, '7'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; - c = getopt_long(argc, argv, "a:c:C::d:De:hH:i:np:P:qs:t::T:uU0:1:2:345:", + c = getopt_long(argc, argv, "a:c:C::d:De:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:", long_options, &option_index); /* detect the end of the options. 
*/ @@ -516,6 +641,9 @@ static struct timerlat_top_params case 'i': params->stop_us = get_llong_from_str(optarg); break; + case 'k': + params->kernel_workload = true; + break; case 'n': params->output_divisor = 1; break; @@ -540,9 +668,13 @@ static struct timerlat_top_params params->stop_total_us = get_llong_from_str(optarg); break; case 't': - if (optarg) - /* skip = */ - params->trace_output = &optarg[1]; + if (optarg) { + if (optarg[0] == '=') + params->trace_output = &optarg[1]; + else + params->trace_output = &optarg[0]; + } else if (optind < argc && argv[optind][0] != '-') + params->trace_output = argv[optind]; else params->trace_output = "timerlat_trace.txt"; @@ -588,6 +720,12 @@ static struct timerlat_top_params case '4': params->dump_tasks = 1; break; + case '6': + params->warmup = get_llong_from_str(optarg); + break; + case '7': + params->buffer_size = get_llong_from_str(optarg); + break; default: timerlat_top_usage("Invalid option"); } @@ -607,6 +745,9 @@ static struct timerlat_top_params if (params->no_aa && params->aa_only) timerlat_top_usage("--no-aa and --aa-only are mutually exclusive!"); + if (params->kernel_workload && params->user_workload) + timerlat_top_usage("--kernel-threads and --user-threads are mutually exclusive!"); + return params; } @@ -685,6 +826,22 @@ timerlat_top_apply_config(struct osnoise_tool *top, struct timerlat_top_params * auto_house_keeping(&params->monitored_cpus); } + /* + * If the user did not specify a type of thread, try user-threads first. + * Fall back to kernel threads otherwise. 
+ */ + if (!params->kernel_workload && !params->user_workload) { + retval = tracefs_file_exists(NULL, "osnoise/per_cpu/cpu0/timerlat_fd"); + if (retval) { + debug_msg("User-space interface detected, setting user-threads\n"); + params->user_workload = 1; + params->user_top = 1; + } else { + debug_msg("User-space interface not detected, setting kernel-threads\n"); + params->kernel_workload = 1; + } + } + if (params->user_top) { retval = osnoise_set_workload(top->context, 0); if (retval) { @@ -693,6 +850,9 @@ timerlat_top_apply_config(struct osnoise_tool *top, struct timerlat_top_params * } } + if (isatty(1) && !params->quiet) + params->pretty_output = 1; + return 0; out_err: @@ -823,6 +983,12 @@ int timerlat_top_main(int argc, char *argv[]) if (retval) goto out_top; } + + if (params->buffer_size > 0) { + retval = trace_set_buffer_size(&record->trace, params->buffer_size); + if (retval) + goto out_top; + } } if (!params->no_aa) { @@ -852,22 +1018,6 @@ int timerlat_top_main(int argc, char *argv[]) } } - /* - * Start the tracers here, after having set all instances. - * - * Let the trace instance start first for the case of hitting a stop - * tracing while enabling other instances. The trace instance is the - * one with most valuable information. - */ - if (params->trace_output) - trace_instance_start(&record->trace); - if (!params->no_aa && aa != top) - trace_instance_start(&aa->trace); - trace_instance_start(trace); - - top->start_time = time(NULL); - timerlat_top_set_signals(params); - if (params->user_workload) { /* rtla asked to stop */ params_u.should_run = 1; @@ -887,6 +1037,27 @@ int timerlat_top_main(int argc, char *argv[]) err_msg("Error creating timerlat user-space threads\n"); } + if (params->warmup > 0) { + debug_msg("Warming up for %d seconds\n", params->warmup); + sleep(params->warmup); + } + + /* + * Start the tracers here, after having set all instances. 
+ * + * Let the trace instance start first for the case of hitting a stop + * tracing while enabling other instances. The trace instance is the + * one with most valuable information. + */ + if (params->trace_output) + trace_instance_start(&record->trace); + if (!params->no_aa && aa != top) + trace_instance_start(&aa->trace); + trace_instance_start(trace); + + top->start_time = time(NULL); + timerlat_top_set_signals(params); + while (!stop_tracing) { sleep(params->sleep_time); diff --git a/tools/tracing/rtla/src/trace.c b/tools/tracing/rtla/src/trace.c index e1ba6d9f42..170a706248 100644 --- a/tools/tracing/rtla/src/trace.c +++ b/tools/tracing/rtla/src/trace.c @@ -540,3 +540,18 @@ int trace_is_off(struct trace_instance *tool, struct trace_instance *trace) return 0; } + +/* + * trace_set_buffer_size - set the per-cpu tracing buffer size. + */ +int trace_set_buffer_size(struct trace_instance *trace, int size) +{ + int retval; + + debug_msg("Setting trace buffer size to %d Kb\n", size); + retval = tracefs_instance_set_buffer_size(trace->inst, size, -1); + if (retval) + err_msg("Error setting trace buffer size\n"); + + return retval; +} diff --git a/tools/tracing/rtla/src/trace.h b/tools/tracing/rtla/src/trace.h index 2e9a89a256..c7c92dc9a1 100644 --- a/tools/tracing/rtla/src/trace.h +++ b/tools/tracing/rtla/src/trace.h @@ -48,3 +48,4 @@ int trace_events_enable(struct trace_instance *instance, int trace_event_add_filter(struct trace_events *event, char *filter); int trace_event_add_trigger(struct trace_events *event, char *trigger); int trace_is_off(struct trace_instance *tool, struct trace_instance *trace); +int trace_set_buffer_size(struct trace_instance *trace, int size); diff --git a/tools/workqueue/wq_monitor.py b/tools/workqueue/wq_monitor.py index a8856a9c45..9e964c5be4 100644 --- a/tools/workqueue/wq_monitor.py +++ b/tools/workqueue/wq_monitor.py @@ -32,16 +32,13 @@ https://github.com/osandov/drgn. rescued The number of work items executed by the rescuer. 
""" -import sys import signal -import os import re import time import json import drgn -from drgn.helpers.linux.list import list_for_each_entry,list_empty -from drgn.helpers.linux.cpumask import for_each_possible_cpu +from drgn.helpers.linux.list import list_for_each_entry import argparse parser = argparse.ArgumentParser(description=desc, @@ -54,10 +51,6 @@ parser.add_argument('-j', '--json', action='store_true', help='Output in json') args = parser.parse_args() -def err(s): - print(s, file=sys.stderr, flush=True) - sys.exit(1) - workqueues = prog['workqueues'] WQ_UNBOUND = prog['WQ_UNBOUND'] diff --git a/tools/writeback/wb_monitor.py b/tools/writeback/wb_monitor.py new file mode 100644 index 0000000000..5e3591f1f9 --- /dev/null +++ b/tools/writeback/wb_monitor.py @@ -0,0 +1,172 @@ +#!/usr/bin/env drgn +# +# Copyright (C) 2024 Kemeng Shi +# Copyright (C) 2024 Huawei Inc + +desc = """ +This is a drgn script based on wq_monitor.py to monitor writeback info on +backing dev. For more info on drgn, visit https://github.com/osandov/drgn. + + writeback(kB) Amount of dirty pages are currently being written back to + disk. + + reclaimable(kB) Amount of pages are currently reclaimable. + + dirtied(kB) Amount of pages have been dirtied. + + wrttien(kB) Amount of dirty pages have been written back to disk. + + avg_wb(kBps) Smoothly estimated write bandwidth of writing dirty pages + back to disk. 
+""" + +import signal +import re +import time +import json + +import drgn +from drgn.helpers.linux.list import list_for_each_entry + +import argparse +parser = argparse.ArgumentParser(description=desc, + formatter_class=argparse.RawTextHelpFormatter) +parser.add_argument('bdi', metavar='REGEX', nargs='*', + help='Target backing device name patterns (all if empty)') +parser.add_argument('-i', '--interval', metavar='SECS', type=float, default=1, + help='Monitoring interval (0 to print once and exit)') +parser.add_argument('-j', '--json', action='store_true', + help='Output in json') +parser.add_argument('-c', '--cgroup', action='store_true', + help='show writeback of bdi in cgroup') +args = parser.parse_args() + +bdi_list = prog['bdi_list'] + +WB_RECLAIMABLE = prog['WB_RECLAIMABLE'] +WB_WRITEBACK = prog['WB_WRITEBACK'] +WB_DIRTIED = prog['WB_DIRTIED'] +WB_WRITTEN = prog['WB_WRITTEN'] +NR_WB_STAT_ITEMS = prog['NR_WB_STAT_ITEMS'] + +PAGE_SHIFT = prog['PAGE_SHIFT'] + +def K(x): + return x << (PAGE_SHIFT - 10) + +class Stats: + def dict(self, now): + return { 'timestamp' : now, + 'name' : self.name, + 'writeback' : self.stats[WB_WRITEBACK], + 'reclaimable' : self.stats[WB_RECLAIMABLE], + 'dirtied' : self.stats[WB_DIRTIED], + 'written' : self.stats[WB_WRITTEN], + 'avg_wb' : self.avg_bw, } + + def table_header_str(): + return f'{"":>16} {"writeback":>10} {"reclaimable":>12} ' \ + f'{"dirtied":>9} {"written":>9} {"avg_bw":>9}' + + def table_row_str(self): + out = f'{self.name[-16:]:16} ' \ + f'{self.stats[WB_WRITEBACK]:10} ' \ + f'{self.stats[WB_RECLAIMABLE]:12} ' \ + f'{self.stats[WB_DIRTIED]:9} ' \ + f'{self.stats[WB_WRITTEN]:9} ' \ + f'{self.avg_bw:9} ' + return out + + def show_header(): + if Stats.table_fmt: + print() + print(Stats.table_header_str()) + + def show_stats(self): + if Stats.table_fmt: + print(self.table_row_str()) + else: + print(self.dict(Stats.now)) + +class WbStats(Stats): + def __init__(self, wb): + bdi_name = wb.bdi.dev_name.string_().decode() + # 
avoid to use bdi.wb.memcg_css which is only defined when + # CONFIG_CGROUP_WRITEBACK is enabled + if wb == wb.bdi.wb.address_of_(): + ino = "1" + else: + ino = str(wb.memcg_css.cgroup.kn.id.value_()) + self.name = bdi_name + '_' + ino + + self.stats = [0] * NR_WB_STAT_ITEMS + for i in range(NR_WB_STAT_ITEMS): + if wb.stat[i].count >= 0: + self.stats[i] = int(K(wb.stat[i].count)) + else: + self.stats[i] = 0 + + self.avg_bw = int(K(wb.avg_write_bandwidth)) + +class BdiStats(Stats): + def __init__(self, bdi): + self.name = bdi.dev_name.string_().decode() + self.stats = [0] * NR_WB_STAT_ITEMS + self.avg_bw = 0 + + def collectStats(self, wb_stats): + for i in range(NR_WB_STAT_ITEMS): + self.stats[i] += wb_stats.stats[i] + + self.avg_bw += wb_stats.avg_bw + +exit_req = False + +def sigint_handler(signr, frame): + global exit_req + exit_req = True + +def main(): + # handle args + Stats.table_fmt = not args.json + interval = args.interval + cgroup = args.cgroup + + re_str = None + if args.bdi: + for r in args.bdi: + if re_str is None: + re_str = r + else: + re_str += '|' + r + + filter_re = re.compile(re_str) if re_str else None + + # monitoring loop + signal.signal(signal.SIGINT, sigint_handler) + + while not exit_req: + Stats.now = time.time() + + Stats.show_header() + for bdi in list_for_each_entry('struct backing_dev_info', bdi_list.address_of_(), 'bdi_list'): + bdi_stats = BdiStats(bdi) + if filter_re and not filter_re.search(bdi_stats.name): + continue + + for wb in list_for_each_entry('struct bdi_writeback', bdi.wb_list.address_of_(), 'bdi_node'): + wb_stats = WbStats(wb) + bdi_stats.collectStats(wb_stats) + if cgroup: + wb_stats.show_stats() + + bdi_stats.show_stats() + if cgroup and Stats.table_fmt: + print() + + if interval == 0: + break + time.sleep(interval) + +if __name__ == "__main__": + main() -- cgit v1.2.3